31120001 / trainer_state.json
rica40325's picture
Upload folder using huggingface_hub
6df19d8 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.7629947544110635,
"eval_steps": 500,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0019074868860276585,
"grad_norm": 5.96875,
"learning_rate": 1.9987282207808727e-05,
"loss": 1.8153,
"mean_token_accuracy": 0.5708044067025184,
"step": 5
},
{
"epoch": 0.003814973772055317,
"grad_norm": 5.40625,
"learning_rate": 1.997456441561745e-05,
"loss": 1.5088,
"mean_token_accuracy": 0.6002451926469803,
"step": 10
},
{
"epoch": 0.005722460658082976,
"grad_norm": 5.5625,
"learning_rate": 1.9961846623426175e-05,
"loss": 1.5696,
"mean_token_accuracy": 0.5999565117061139,
"step": 15
},
{
"epoch": 0.007629947544110634,
"grad_norm": 6.0,
"learning_rate": 1.9949128831234897e-05,
"loss": 1.4962,
"mean_token_accuracy": 0.6145946934819222,
"step": 20
},
{
"epoch": 0.009537434430138292,
"grad_norm": 5.0625,
"learning_rate": 1.9936411039043622e-05,
"loss": 1.4145,
"mean_token_accuracy": 0.6282299846410752,
"step": 25
},
{
"epoch": 0.011444921316165951,
"grad_norm": 5.75,
"learning_rate": 1.9923693246852348e-05,
"loss": 1.4615,
"mean_token_accuracy": 0.6196133770048619,
"step": 30
},
{
"epoch": 0.01335240820219361,
"grad_norm": 4.84375,
"learning_rate": 1.9910975454661073e-05,
"loss": 1.4118,
"mean_token_accuracy": 0.6272433631122112,
"step": 35
},
{
"epoch": 0.015259895088221268,
"grad_norm": 5.0625,
"learning_rate": 1.98982576624698e-05,
"loss": 1.3874,
"mean_token_accuracy": 0.6295698702335357,
"step": 40
},
{
"epoch": 0.017167381974248927,
"grad_norm": 5.46875,
"learning_rate": 1.988553987027852e-05,
"loss": 1.4997,
"mean_token_accuracy": 0.6131108298897743,
"step": 45
},
{
"epoch": 0.019074868860276584,
"grad_norm": 4.59375,
"learning_rate": 1.9872822078087246e-05,
"loss": 1.3386,
"mean_token_accuracy": 0.6355413243174552,
"step": 50
},
{
"epoch": 0.020982355746304245,
"grad_norm": 4.46875,
"learning_rate": 1.986010428589597e-05,
"loss": 1.3453,
"mean_token_accuracy": 0.6322750248014927,
"step": 55
},
{
"epoch": 0.022889842632331903,
"grad_norm": 5.0625,
"learning_rate": 1.9847386493704694e-05,
"loss": 1.4213,
"mean_token_accuracy": 0.6266717866063118,
"step": 60
},
{
"epoch": 0.02479732951835956,
"grad_norm": 5.5625,
"learning_rate": 1.983466870151342e-05,
"loss": 1.3913,
"mean_token_accuracy": 0.6290012784302235,
"step": 65
},
{
"epoch": 0.02670481640438722,
"grad_norm": 5.0625,
"learning_rate": 1.982195090932214e-05,
"loss": 1.3871,
"mean_token_accuracy": 0.627622963488102,
"step": 70
},
{
"epoch": 0.02861230329041488,
"grad_norm": 4.84375,
"learning_rate": 1.9809233117130867e-05,
"loss": 1.3232,
"mean_token_accuracy": 0.6432097807526589,
"step": 75
},
{
"epoch": 0.030519790176442536,
"grad_norm": 5.96875,
"learning_rate": 1.9796515324939593e-05,
"loss": 1.3287,
"mean_token_accuracy": 0.6405125185847282,
"step": 80
},
{
"epoch": 0.03242727706247019,
"grad_norm": 4.78125,
"learning_rate": 1.9783797532748318e-05,
"loss": 1.4044,
"mean_token_accuracy": 0.6292843967676163,
"step": 85
},
{
"epoch": 0.034334763948497854,
"grad_norm": 4.6875,
"learning_rate": 1.977107974055704e-05,
"loss": 1.3368,
"mean_token_accuracy": 0.6363929770886898,
"step": 90
},
{
"epoch": 0.036242250834525515,
"grad_norm": 4.6875,
"learning_rate": 1.9758361948365766e-05,
"loss": 1.3156,
"mean_token_accuracy": 0.6389718689024448,
"step": 95
},
{
"epoch": 0.03814973772055317,
"grad_norm": 4.84375,
"learning_rate": 1.974564415617449e-05,
"loss": 1.3213,
"mean_token_accuracy": 0.6437985837459564,
"step": 100
},
{
"epoch": 0.04005722460658083,
"grad_norm": 5.34375,
"learning_rate": 1.9732926363983213e-05,
"loss": 1.3403,
"mean_token_accuracy": 0.6304401338100434,
"step": 105
},
{
"epoch": 0.04196471149260849,
"grad_norm": 5.21875,
"learning_rate": 1.972020857179194e-05,
"loss": 1.4318,
"mean_token_accuracy": 0.6314706668257714,
"step": 110
},
{
"epoch": 0.043872198378636144,
"grad_norm": 5.625,
"learning_rate": 1.970749077960066e-05,
"loss": 1.3123,
"mean_token_accuracy": 0.6482092589139938,
"step": 115
},
{
"epoch": 0.045779685264663805,
"grad_norm": 5.03125,
"learning_rate": 1.9694772987409387e-05,
"loss": 1.2722,
"mean_token_accuracy": 0.6475286811590195,
"step": 120
},
{
"epoch": 0.047687172150691466,
"grad_norm": 4.78125,
"learning_rate": 1.9682055195218112e-05,
"loss": 1.3184,
"mean_token_accuracy": 0.636443517357111,
"step": 125
},
{
"epoch": 0.04959465903671912,
"grad_norm": 4.75,
"learning_rate": 1.9669337403026834e-05,
"loss": 1.3604,
"mean_token_accuracy": 0.6383991658687591,
"step": 130
},
{
"epoch": 0.05150214592274678,
"grad_norm": 6.3125,
"learning_rate": 1.965661961083556e-05,
"loss": 1.3624,
"mean_token_accuracy": 0.6359823271632195,
"step": 135
},
{
"epoch": 0.05340963280877444,
"grad_norm": 4.9375,
"learning_rate": 1.9643901818644285e-05,
"loss": 1.3622,
"mean_token_accuracy": 0.6360629022121429,
"step": 140
},
{
"epoch": 0.055317119694802096,
"grad_norm": 5.25,
"learning_rate": 1.963118402645301e-05,
"loss": 1.3007,
"mean_token_accuracy": 0.6486773908138275,
"step": 145
},
{
"epoch": 0.05722460658082976,
"grad_norm": 4.625,
"learning_rate": 1.9618466234261733e-05,
"loss": 1.259,
"mean_token_accuracy": 0.6483664289116859,
"step": 150
},
{
"epoch": 0.05913209346685742,
"grad_norm": 4.4375,
"learning_rate": 1.9605748442070458e-05,
"loss": 1.2701,
"mean_token_accuracy": 0.6497010916471482,
"step": 155
},
{
"epoch": 0.06103958035288507,
"grad_norm": 5.0625,
"learning_rate": 1.9593030649879184e-05,
"loss": 1.2925,
"mean_token_accuracy": 0.645444954931736,
"step": 160
},
{
"epoch": 0.06294706723891273,
"grad_norm": 4.46875,
"learning_rate": 1.9580312857687906e-05,
"loss": 1.3194,
"mean_token_accuracy": 0.6443428501486779,
"step": 165
},
{
"epoch": 0.06485455412494039,
"grad_norm": 5.15625,
"learning_rate": 1.956759506549663e-05,
"loss": 1.2786,
"mean_token_accuracy": 0.6503037214279175,
"step": 170
},
{
"epoch": 0.06676204101096805,
"grad_norm": 5.65625,
"learning_rate": 1.9554877273305353e-05,
"loss": 1.3482,
"mean_token_accuracy": 0.6470891699194908,
"step": 175
},
{
"epoch": 0.06866952789699571,
"grad_norm": 4.21875,
"learning_rate": 1.954215948111408e-05,
"loss": 1.3022,
"mean_token_accuracy": 0.6400970220565796,
"step": 180
},
{
"epoch": 0.07057701478302336,
"grad_norm": 5.03125,
"learning_rate": 1.9529441688922804e-05,
"loss": 1.2821,
"mean_token_accuracy": 0.6461471430957317,
"step": 185
},
{
"epoch": 0.07248450166905103,
"grad_norm": 5.0,
"learning_rate": 1.951672389673153e-05,
"loss": 1.2824,
"mean_token_accuracy": 0.649085208773613,
"step": 190
},
{
"epoch": 0.07439198855507868,
"grad_norm": 4.5,
"learning_rate": 1.9504006104540255e-05,
"loss": 1.2508,
"mean_token_accuracy": 0.6536262959241868,
"step": 195
},
{
"epoch": 0.07629947544110634,
"grad_norm": 4.9375,
"learning_rate": 1.9491288312348978e-05,
"loss": 1.3044,
"mean_token_accuracy": 0.6513546489179134,
"step": 200
},
{
"epoch": 0.078206962327134,
"grad_norm": 5.125,
"learning_rate": 1.9478570520157703e-05,
"loss": 1.2265,
"mean_token_accuracy": 0.6645126178860664,
"step": 205
},
{
"epoch": 0.08011444921316166,
"grad_norm": 5.875,
"learning_rate": 1.9465852727966425e-05,
"loss": 1.3156,
"mean_token_accuracy": 0.6494258716702461,
"step": 210
},
{
"epoch": 0.08202193609918931,
"grad_norm": 5.0,
"learning_rate": 1.945313493577515e-05,
"loss": 1.1655,
"mean_token_accuracy": 0.6783517330884934,
"step": 215
},
{
"epoch": 0.08392942298521698,
"grad_norm": 5.4375,
"learning_rate": 1.9440417143583876e-05,
"loss": 1.2306,
"mean_token_accuracy": 0.6659620314836502,
"step": 220
},
{
"epoch": 0.08583690987124463,
"grad_norm": 4.40625,
"learning_rate": 1.9427699351392598e-05,
"loss": 1.3798,
"mean_token_accuracy": 0.6320724219083786,
"step": 225
},
{
"epoch": 0.08774439675727229,
"grad_norm": 4.75,
"learning_rate": 1.9414981559201324e-05,
"loss": 1.2437,
"mean_token_accuracy": 0.6454930439591408,
"step": 230
},
{
"epoch": 0.08965188364329996,
"grad_norm": 5.90625,
"learning_rate": 1.940226376701005e-05,
"loss": 1.2523,
"mean_token_accuracy": 0.6506562553346157,
"step": 235
},
{
"epoch": 0.09155937052932761,
"grad_norm": 4.75,
"learning_rate": 1.938954597481877e-05,
"loss": 1.2268,
"mean_token_accuracy": 0.6620514318346977,
"step": 240
},
{
"epoch": 0.09346685741535526,
"grad_norm": 4.53125,
"learning_rate": 1.9376828182627497e-05,
"loss": 1.2215,
"mean_token_accuracy": 0.6610720351338386,
"step": 245
},
{
"epoch": 0.09537434430138293,
"grad_norm": 5.15625,
"learning_rate": 1.9364110390436222e-05,
"loss": 1.2494,
"mean_token_accuracy": 0.653127409517765,
"step": 250
},
{
"epoch": 0.09728183118741059,
"grad_norm": 5.1875,
"learning_rate": 1.9351392598244948e-05,
"loss": 1.3126,
"mean_token_accuracy": 0.6540979892015457,
"step": 255
},
{
"epoch": 0.09918931807343824,
"grad_norm": 5.15625,
"learning_rate": 1.933867480605367e-05,
"loss": 1.2498,
"mean_token_accuracy": 0.6507880866527558,
"step": 260
},
{
"epoch": 0.10109680495946591,
"grad_norm": 5.8125,
"learning_rate": 1.9325957013862396e-05,
"loss": 1.3294,
"mean_token_accuracy": 0.6403125211596489,
"step": 265
},
{
"epoch": 0.10300429184549356,
"grad_norm": 6.09375,
"learning_rate": 1.9313239221671118e-05,
"loss": 1.3099,
"mean_token_accuracy": 0.6520273745059967,
"step": 270
},
{
"epoch": 0.10491177873152122,
"grad_norm": 5.40625,
"learning_rate": 1.9300521429479843e-05,
"loss": 1.2857,
"mean_token_accuracy": 0.6506179749965668,
"step": 275
},
{
"epoch": 0.10681926561754888,
"grad_norm": 5.71875,
"learning_rate": 1.928780363728857e-05,
"loss": 1.3496,
"mean_token_accuracy": 0.6408744707703591,
"step": 280
},
{
"epoch": 0.10872675250357654,
"grad_norm": 5.0625,
"learning_rate": 1.927508584509729e-05,
"loss": 1.2012,
"mean_token_accuracy": 0.6639982044696808,
"step": 285
},
{
"epoch": 0.11063423938960419,
"grad_norm": 5.5,
"learning_rate": 1.9262368052906016e-05,
"loss": 1.2449,
"mean_token_accuracy": 0.6594835132360458,
"step": 290
},
{
"epoch": 0.11254172627563186,
"grad_norm": 4.125,
"learning_rate": 1.9249650260714742e-05,
"loss": 1.1314,
"mean_token_accuracy": 0.6807294517755509,
"step": 295
},
{
"epoch": 0.11444921316165951,
"grad_norm": 5.25,
"learning_rate": 1.9236932468523467e-05,
"loss": 1.2528,
"mean_token_accuracy": 0.6543513402342797,
"step": 300
},
{
"epoch": 0.11635670004768717,
"grad_norm": 4.5,
"learning_rate": 1.9224214676332193e-05,
"loss": 1.201,
"mean_token_accuracy": 0.6674724757671356,
"step": 305
},
{
"epoch": 0.11826418693371483,
"grad_norm": 5.09375,
"learning_rate": 1.9211496884140915e-05,
"loss": 1.2232,
"mean_token_accuracy": 0.6661024749279022,
"step": 310
},
{
"epoch": 0.12017167381974249,
"grad_norm": 4.65625,
"learning_rate": 1.919877909194964e-05,
"loss": 1.2015,
"mean_token_accuracy": 0.6703806266188621,
"step": 315
},
{
"epoch": 0.12207916070577014,
"grad_norm": 4.90625,
"learning_rate": 1.9186061299758362e-05,
"loss": 1.265,
"mean_token_accuracy": 0.6571035169064998,
"step": 320
},
{
"epoch": 0.12398664759179781,
"grad_norm": 4.5625,
"learning_rate": 1.9173343507567088e-05,
"loss": 1.1817,
"mean_token_accuracy": 0.6720203042030335,
"step": 325
},
{
"epoch": 0.12589413447782546,
"grad_norm": 5.40625,
"learning_rate": 1.916062571537581e-05,
"loss": 1.3305,
"mean_token_accuracy": 0.6445886738598346,
"step": 330
},
{
"epoch": 0.12780162136385312,
"grad_norm": 5.34375,
"learning_rate": 1.9147907923184536e-05,
"loss": 1.2302,
"mean_token_accuracy": 0.6632393077015877,
"step": 335
},
{
"epoch": 0.12970910824988077,
"grad_norm": 4.40625,
"learning_rate": 1.913519013099326e-05,
"loss": 1.2463,
"mean_token_accuracy": 0.6618235319852829,
"step": 340
},
{
"epoch": 0.13161659513590845,
"grad_norm": 4.84375,
"learning_rate": 1.9122472338801987e-05,
"loss": 1.1995,
"mean_token_accuracy": 0.6734424993395806,
"step": 345
},
{
"epoch": 0.1335240820219361,
"grad_norm": 6.09375,
"learning_rate": 1.9109754546610712e-05,
"loss": 1.1695,
"mean_token_accuracy": 0.6744470730423927,
"step": 350
},
{
"epoch": 0.13543156890796376,
"grad_norm": 4.4375,
"learning_rate": 1.9097036754419434e-05,
"loss": 1.235,
"mean_token_accuracy": 0.6613016352057457,
"step": 355
},
{
"epoch": 0.13733905579399142,
"grad_norm": 5.15625,
"learning_rate": 1.908431896222816e-05,
"loss": 1.1908,
"mean_token_accuracy": 0.6631178431212902,
"step": 360
},
{
"epoch": 0.13924654268001907,
"grad_norm": 6.03125,
"learning_rate": 1.9071601170036885e-05,
"loss": 1.2538,
"mean_token_accuracy": 0.6583275809884072,
"step": 365
},
{
"epoch": 0.14115402956604672,
"grad_norm": 4.875,
"learning_rate": 1.9058883377845607e-05,
"loss": 1.1481,
"mean_token_accuracy": 0.6743377096951008,
"step": 370
},
{
"epoch": 0.1430615164520744,
"grad_norm": 4.65625,
"learning_rate": 1.9046165585654333e-05,
"loss": 1.1758,
"mean_token_accuracy": 0.6755423441529274,
"step": 375
},
{
"epoch": 0.14496900333810206,
"grad_norm": 4.21875,
"learning_rate": 1.9033447793463055e-05,
"loss": 1.1526,
"mean_token_accuracy": 0.6709615409374237,
"step": 380
},
{
"epoch": 0.1468764902241297,
"grad_norm": 5.34375,
"learning_rate": 1.902073000127178e-05,
"loss": 1.1614,
"mean_token_accuracy": 0.6803930580615998,
"step": 385
},
{
"epoch": 0.14878397711015737,
"grad_norm": 5.15625,
"learning_rate": 1.9008012209080503e-05,
"loss": 1.1651,
"mean_token_accuracy": 0.6757953256368637,
"step": 390
},
{
"epoch": 0.15069146399618502,
"grad_norm": 5.65625,
"learning_rate": 1.8995294416889228e-05,
"loss": 1.2219,
"mean_token_accuracy": 0.6725533396005631,
"step": 395
},
{
"epoch": 0.15259895088221268,
"grad_norm": 4.90625,
"learning_rate": 1.8982576624697954e-05,
"loss": 1.2598,
"mean_token_accuracy": 0.6562705941498279,
"step": 400
},
{
"epoch": 0.15450643776824036,
"grad_norm": 4.59375,
"learning_rate": 1.896985883250668e-05,
"loss": 1.1586,
"mean_token_accuracy": 0.6731373474001885,
"step": 405
},
{
"epoch": 0.156413924654268,
"grad_norm": 5.65625,
"learning_rate": 1.8957141040315405e-05,
"loss": 1.2544,
"mean_token_accuracy": 0.6664155155420304,
"step": 410
},
{
"epoch": 0.15832141154029566,
"grad_norm": 5.09375,
"learning_rate": 1.8944423248124127e-05,
"loss": 1.2045,
"mean_token_accuracy": 0.6721395581960679,
"step": 415
},
{
"epoch": 0.16022889842632332,
"grad_norm": 4.65625,
"learning_rate": 1.8931705455932852e-05,
"loss": 1.1623,
"mean_token_accuracy": 0.6825975701212883,
"step": 420
},
{
"epoch": 0.16213638531235097,
"grad_norm": 5.25,
"learning_rate": 1.8918987663741578e-05,
"loss": 1.2345,
"mean_token_accuracy": 0.655138723552227,
"step": 425
},
{
"epoch": 0.16404387219837863,
"grad_norm": 4.46875,
"learning_rate": 1.89062698715503e-05,
"loss": 1.1583,
"mean_token_accuracy": 0.6771410465240478,
"step": 430
},
{
"epoch": 0.1659513590844063,
"grad_norm": 4.34375,
"learning_rate": 1.8893552079359025e-05,
"loss": 1.2226,
"mean_token_accuracy": 0.6566869288682937,
"step": 435
},
{
"epoch": 0.16785884597043396,
"grad_norm": 4.8125,
"learning_rate": 1.8880834287167747e-05,
"loss": 1.2531,
"mean_token_accuracy": 0.6537335075438022,
"step": 440
},
{
"epoch": 0.16976633285646162,
"grad_norm": 5.1875,
"learning_rate": 1.8868116494976473e-05,
"loss": 1.1736,
"mean_token_accuracy": 0.6741863384842872,
"step": 445
},
{
"epoch": 0.17167381974248927,
"grad_norm": 5.0,
"learning_rate": 1.88553987027852e-05,
"loss": 1.1912,
"mean_token_accuracy": 0.6718283355236053,
"step": 450
},
{
"epoch": 0.17358130662851692,
"grad_norm": 5.03125,
"learning_rate": 1.8842680910593924e-05,
"loss": 1.2016,
"mean_token_accuracy": 0.670920492708683,
"step": 455
},
{
"epoch": 0.17548879351454458,
"grad_norm": 5.375,
"learning_rate": 1.882996311840265e-05,
"loss": 1.1818,
"mean_token_accuracy": 0.6705762408673763,
"step": 460
},
{
"epoch": 0.17739628040057226,
"grad_norm": 5.9375,
"learning_rate": 1.881724532621137e-05,
"loss": 1.3159,
"mean_token_accuracy": 0.6546284504234791,
"step": 465
},
{
"epoch": 0.1793037672865999,
"grad_norm": 4.8125,
"learning_rate": 1.8804527534020097e-05,
"loss": 1.2004,
"mean_token_accuracy": 0.6597321718931198,
"step": 470
},
{
"epoch": 0.18121125417262757,
"grad_norm": 6.9375,
"learning_rate": 1.879180974182882e-05,
"loss": 1.1672,
"mean_token_accuracy": 0.6835288152098655,
"step": 475
},
{
"epoch": 0.18311874105865522,
"grad_norm": 5.0625,
"learning_rate": 1.8779091949637545e-05,
"loss": 1.2113,
"mean_token_accuracy": 0.6654989182949066,
"step": 480
},
{
"epoch": 0.18502622794468288,
"grad_norm": 5.09375,
"learning_rate": 1.876637415744627e-05,
"loss": 1.076,
"mean_token_accuracy": 0.6919501051306725,
"step": 485
},
{
"epoch": 0.18693371483071053,
"grad_norm": 4.96875,
"learning_rate": 1.8753656365254992e-05,
"loss": 1.1946,
"mean_token_accuracy": 0.671660166978836,
"step": 490
},
{
"epoch": 0.1888412017167382,
"grad_norm": 5.875,
"learning_rate": 1.8740938573063718e-05,
"loss": 1.2334,
"mean_token_accuracy": 0.6634502306580543,
"step": 495
},
{
"epoch": 0.19074868860276586,
"grad_norm": 5.34375,
"learning_rate": 1.872822078087244e-05,
"loss": 1.1412,
"mean_token_accuracy": 0.6843819186091423,
"step": 500
},
{
"epoch": 0.19265617548879352,
"grad_norm": 4.875,
"learning_rate": 1.8715502988681165e-05,
"loss": 1.1017,
"mean_token_accuracy": 0.6859075799584389,
"step": 505
},
{
"epoch": 0.19456366237482117,
"grad_norm": 5.0,
"learning_rate": 1.870278519648989e-05,
"loss": 1.099,
"mean_token_accuracy": 0.6794175133109093,
"step": 510
},
{
"epoch": 0.19647114926084883,
"grad_norm": 5.0,
"learning_rate": 1.8690067404298616e-05,
"loss": 1.162,
"mean_token_accuracy": 0.6769187614321709,
"step": 515
},
{
"epoch": 0.19837863614687648,
"grad_norm": 4.21875,
"learning_rate": 1.8677349612107342e-05,
"loss": 1.1241,
"mean_token_accuracy": 0.6816845044493676,
"step": 520
},
{
"epoch": 0.20028612303290416,
"grad_norm": 7.09375,
"learning_rate": 1.8664631819916064e-05,
"loss": 1.1331,
"mean_token_accuracy": 0.6768893599510193,
"step": 525
},
{
"epoch": 0.20219360991893182,
"grad_norm": 4.25,
"learning_rate": 1.865191402772479e-05,
"loss": 1.2657,
"mean_token_accuracy": 0.6479664385318756,
"step": 530
},
{
"epoch": 0.20410109680495947,
"grad_norm": 4.5,
"learning_rate": 1.863919623553351e-05,
"loss": 1.2343,
"mean_token_accuracy": 0.6734806634485722,
"step": 535
},
{
"epoch": 0.20600858369098712,
"grad_norm": 12.3125,
"learning_rate": 1.8626478443342237e-05,
"loss": 1.1263,
"mean_token_accuracy": 0.6808793410658837,
"step": 540
},
{
"epoch": 0.20791607057701478,
"grad_norm": 5.9375,
"learning_rate": 1.8613760651150963e-05,
"loss": 1.0617,
"mean_token_accuracy": 0.6977316424250603,
"step": 545
},
{
"epoch": 0.20982355746304243,
"grad_norm": 5.96875,
"learning_rate": 1.8601042858959685e-05,
"loss": 1.1229,
"mean_token_accuracy": 0.6831489652395248,
"step": 550
},
{
"epoch": 0.2117310443490701,
"grad_norm": 4.84375,
"learning_rate": 1.858832506676841e-05,
"loss": 1.1216,
"mean_token_accuracy": 0.6838791735470295,
"step": 555
},
{
"epoch": 0.21363853123509777,
"grad_norm": 4.78125,
"learning_rate": 1.8575607274577136e-05,
"loss": 1.2059,
"mean_token_accuracy": 0.6669023260474205,
"step": 560
},
{
"epoch": 0.21554601812112542,
"grad_norm": 4.875,
"learning_rate": 1.856288948238586e-05,
"loss": 1.1515,
"mean_token_accuracy": 0.6748893111944199,
"step": 565
},
{
"epoch": 0.21745350500715308,
"grad_norm": 4.21875,
"learning_rate": 1.8550171690194583e-05,
"loss": 1.063,
"mean_token_accuracy": 0.7043292924761773,
"step": 570
},
{
"epoch": 0.21936099189318073,
"grad_norm": 5.34375,
"learning_rate": 1.853745389800331e-05,
"loss": 1.1071,
"mean_token_accuracy": 0.6816813468933105,
"step": 575
},
{
"epoch": 0.22126847877920838,
"grad_norm": 7.25,
"learning_rate": 1.8524736105812034e-05,
"loss": 1.1493,
"mean_token_accuracy": 0.6842595711350441,
"step": 580
},
{
"epoch": 0.22317596566523606,
"grad_norm": 6.3125,
"learning_rate": 1.8512018313620756e-05,
"loss": 1.2036,
"mean_token_accuracy": 0.6606533020734787,
"step": 585
},
{
"epoch": 0.22508345255126372,
"grad_norm": 5.1875,
"learning_rate": 1.8499300521429482e-05,
"loss": 1.0329,
"mean_token_accuracy": 0.7044132232666016,
"step": 590
},
{
"epoch": 0.22699093943729137,
"grad_norm": 4.90625,
"learning_rate": 1.8486582729238204e-05,
"loss": 1.183,
"mean_token_accuracy": 0.6742020189762116,
"step": 595
},
{
"epoch": 0.22889842632331903,
"grad_norm": 5.375,
"learning_rate": 1.847386493704693e-05,
"loss": 1.1212,
"mean_token_accuracy": 0.6900173485279083,
"step": 600
},
{
"epoch": 0.23080591320934668,
"grad_norm": 5.5,
"learning_rate": 1.8461147144855655e-05,
"loss": 1.1654,
"mean_token_accuracy": 0.6756381630897522,
"step": 605
},
{
"epoch": 0.23271340009537433,
"grad_norm": 4.8125,
"learning_rate": 1.8448429352664377e-05,
"loss": 1.1592,
"mean_token_accuracy": 0.6697646602988243,
"step": 610
},
{
"epoch": 0.23462088698140202,
"grad_norm": 4.84375,
"learning_rate": 1.8435711560473103e-05,
"loss": 1.0309,
"mean_token_accuracy": 0.6977804109454155,
"step": 615
},
{
"epoch": 0.23652837386742967,
"grad_norm": 5.78125,
"learning_rate": 1.8422993768281828e-05,
"loss": 1.1705,
"mean_token_accuracy": 0.6789061531424523,
"step": 620
},
{
"epoch": 0.23843586075345732,
"grad_norm": 4.84375,
"learning_rate": 1.8410275976090554e-05,
"loss": 1.1715,
"mean_token_accuracy": 0.6736443802714348,
"step": 625
},
{
"epoch": 0.24034334763948498,
"grad_norm": 4.6875,
"learning_rate": 1.8397558183899276e-05,
"loss": 1.1535,
"mean_token_accuracy": 0.6837850168347359,
"step": 630
},
{
"epoch": 0.24225083452551263,
"grad_norm": 5.625,
"learning_rate": 1.8384840391708e-05,
"loss": 1.1403,
"mean_token_accuracy": 0.6860886000096797,
"step": 635
},
{
"epoch": 0.24415832141154029,
"grad_norm": 5.78125,
"learning_rate": 1.8372122599516727e-05,
"loss": 1.1777,
"mean_token_accuracy": 0.6730990558862686,
"step": 640
},
{
"epoch": 0.24606580829756797,
"grad_norm": 4.96875,
"learning_rate": 1.835940480732545e-05,
"loss": 1.2174,
"mean_token_accuracy": 0.6607669338583946,
"step": 645
},
{
"epoch": 0.24797329518359562,
"grad_norm": 5.03125,
"learning_rate": 1.8346687015134174e-05,
"loss": 1.1306,
"mean_token_accuracy": 0.6789732642471791,
"step": 650
},
{
"epoch": 0.24988078206962328,
"grad_norm": 4.5,
"learning_rate": 1.8333969222942896e-05,
"loss": 1.0958,
"mean_token_accuracy": 0.6940956100821495,
"step": 655
},
{
"epoch": 0.25178826895565093,
"grad_norm": 4.53125,
"learning_rate": 1.8321251430751622e-05,
"loss": 1.1623,
"mean_token_accuracy": 0.6806924149394036,
"step": 660
},
{
"epoch": 0.2536957558416786,
"grad_norm": 5.59375,
"learning_rate": 1.8308533638560347e-05,
"loss": 1.1438,
"mean_token_accuracy": 0.6787467435002327,
"step": 665
},
{
"epoch": 0.25560324272770624,
"grad_norm": 4.65625,
"learning_rate": 1.8295815846369073e-05,
"loss": 1.2902,
"mean_token_accuracy": 0.655436672270298,
"step": 670
},
{
"epoch": 0.2575107296137339,
"grad_norm": 4.46875,
"learning_rate": 1.82830980541778e-05,
"loss": 1.1499,
"mean_token_accuracy": 0.6697306737303734,
"step": 675
},
{
"epoch": 0.25941821649976154,
"grad_norm": 5.78125,
"learning_rate": 1.827038026198652e-05,
"loss": 1.1432,
"mean_token_accuracy": 0.6788436755537987,
"step": 680
},
{
"epoch": 0.2613257033857892,
"grad_norm": 6.40625,
"learning_rate": 1.8257662469795246e-05,
"loss": 1.1457,
"mean_token_accuracy": 0.6794642567634582,
"step": 685
},
{
"epoch": 0.2632331902718169,
"grad_norm": 4.5625,
"learning_rate": 1.8244944677603968e-05,
"loss": 1.1842,
"mean_token_accuracy": 0.6739339649677276,
"step": 690
},
{
"epoch": 0.26514067715784456,
"grad_norm": 5.375,
"learning_rate": 1.8232226885412694e-05,
"loss": 1.1979,
"mean_token_accuracy": 0.671060286462307,
"step": 695
},
{
"epoch": 0.2670481640438722,
"grad_norm": 5.9375,
"learning_rate": 1.821950909322142e-05,
"loss": 1.1926,
"mean_token_accuracy": 0.68006531894207,
"step": 700
},
{
"epoch": 0.26895565092989987,
"grad_norm": 4.5,
"learning_rate": 1.820679130103014e-05,
"loss": 1.1425,
"mean_token_accuracy": 0.686011116206646,
"step": 705
},
{
"epoch": 0.2708631378159275,
"grad_norm": 4.125,
"learning_rate": 1.8194073508838867e-05,
"loss": 1.1241,
"mean_token_accuracy": 0.6813527546823025,
"step": 710
},
{
"epoch": 0.2727706247019552,
"grad_norm": 4.59375,
"learning_rate": 1.8181355716647592e-05,
"loss": 1.0763,
"mean_token_accuracy": 0.6978838533163071,
"step": 715
},
{
"epoch": 0.27467811158798283,
"grad_norm": 4.71875,
"learning_rate": 1.8168637924456314e-05,
"loss": 1.1214,
"mean_token_accuracy": 0.6879947543144226,
"step": 720
},
{
"epoch": 0.2765855984740105,
"grad_norm": 4.65625,
"learning_rate": 1.815592013226504e-05,
"loss": 1.145,
"mean_token_accuracy": 0.6740229934453964,
"step": 725
},
{
"epoch": 0.27849308536003814,
"grad_norm": 5.0,
"learning_rate": 1.8143202340073765e-05,
"loss": 1.0915,
"mean_token_accuracy": 0.6966636836528778,
"step": 730
},
{
"epoch": 0.2804005722460658,
"grad_norm": 5.46875,
"learning_rate": 1.813048454788249e-05,
"loss": 1.1534,
"mean_token_accuracy": 0.6881409972906113,
"step": 735
},
{
"epoch": 0.28230805913209345,
"grad_norm": 5.21875,
"learning_rate": 1.8117766755691213e-05,
"loss": 1.1306,
"mean_token_accuracy": 0.6856201700866222,
"step": 740
},
{
"epoch": 0.2842155460181211,
"grad_norm": 4.3125,
"learning_rate": 1.810504896349994e-05,
"loss": 1.1514,
"mean_token_accuracy": 0.6833734557032585,
"step": 745
},
{
"epoch": 0.2861230329041488,
"grad_norm": 5.1875,
"learning_rate": 1.809233117130866e-05,
"loss": 1.1379,
"mean_token_accuracy": 0.6824650421738625,
"step": 750
},
{
"epoch": 0.28803051979017646,
"grad_norm": 4.8125,
"learning_rate": 1.8079613379117386e-05,
"loss": 1.1685,
"mean_token_accuracy": 0.6773202955722809,
"step": 755
},
{
"epoch": 0.2899380066762041,
"grad_norm": 5.09375,
"learning_rate": 1.806689558692611e-05,
"loss": 1.1683,
"mean_token_accuracy": 0.6770651459693908,
"step": 760
},
{
"epoch": 0.2918454935622318,
"grad_norm": 4.875,
"learning_rate": 1.8054177794734834e-05,
"loss": 1.0853,
"mean_token_accuracy": 0.6914259925484657,
"step": 765
},
{
"epoch": 0.2937529804482594,
"grad_norm": 5.0,
"learning_rate": 1.804146000254356e-05,
"loss": 1.1336,
"mean_token_accuracy": 0.6820079162716866,
"step": 770
},
{
"epoch": 0.2956604673342871,
"grad_norm": 4.09375,
"learning_rate": 1.8028742210352285e-05,
"loss": 1.1233,
"mean_token_accuracy": 0.6821878552436829,
"step": 775
},
{
"epoch": 0.29756795422031473,
"grad_norm": 4.71875,
"learning_rate": 1.801602441816101e-05,
"loss": 1.1204,
"mean_token_accuracy": 0.692288200557232,
"step": 780
},
{
"epoch": 0.2994754411063424,
"grad_norm": 4.9375,
"learning_rate": 1.8003306625969736e-05,
"loss": 1.2294,
"mean_token_accuracy": 0.673324004560709,
"step": 785
},
{
"epoch": 0.30138292799237004,
"grad_norm": 4.9375,
"learning_rate": 1.7990588833778458e-05,
"loss": 1.2059,
"mean_token_accuracy": 0.6697646111249924,
"step": 790
},
{
"epoch": 0.3032904148783977,
"grad_norm": 4.5625,
"learning_rate": 1.7977871041587183e-05,
"loss": 1.1745,
"mean_token_accuracy": 0.6823094062507152,
"step": 795
},
{
"epoch": 0.30519790176442535,
"grad_norm": 4.625,
"learning_rate": 1.7965153249395905e-05,
"loss": 1.1519,
"mean_token_accuracy": 0.6778446674346924,
"step": 800
},
{
"epoch": 0.307105388650453,
"grad_norm": 4.53125,
"learning_rate": 1.795243545720463e-05,
"loss": 1.0918,
"mean_token_accuracy": 0.6978467896580696,
"step": 805
},
{
"epoch": 0.3090128755364807,
"grad_norm": 3.921875,
"learning_rate": 1.7939717665013353e-05,
"loss": 1.1631,
"mean_token_accuracy": 0.6788024313747882,
"step": 810
},
{
"epoch": 0.31092036242250837,
"grad_norm": 4.78125,
"learning_rate": 1.792699987282208e-05,
"loss": 1.1259,
"mean_token_accuracy": 0.6834620237350464,
"step": 815
},
{
"epoch": 0.312827849308536,
"grad_norm": 5.21875,
"learning_rate": 1.7914282080630804e-05,
"loss": 1.0979,
"mean_token_accuracy": 0.6804828964173794,
"step": 820
},
{
"epoch": 0.3147353361945637,
"grad_norm": 4.875,
"learning_rate": 1.790156428843953e-05,
"loss": 1.204,
"mean_token_accuracy": 0.6713850289583206,
"step": 825
},
{
"epoch": 0.31664282308059133,
"grad_norm": 4.6875,
"learning_rate": 1.788884649624825e-05,
"loss": 1.14,
"mean_token_accuracy": 0.6814699381589889,
"step": 830
},
{
"epoch": 0.318550309966619,
"grad_norm": 4.3125,
"learning_rate": 1.7876128704056977e-05,
"loss": 1.1383,
"mean_token_accuracy": 0.672491405904293,
"step": 835
},
{
"epoch": 0.32045779685264664,
"grad_norm": 4.875,
"learning_rate": 1.7863410911865703e-05,
"loss": 1.1993,
"mean_token_accuracy": 0.662742418050766,
"step": 840
},
{
"epoch": 0.3223652837386743,
"grad_norm": 4.53125,
"learning_rate": 1.7850693119674428e-05,
"loss": 1.0142,
"mean_token_accuracy": 0.7142936125397682,
"step": 845
},
{
"epoch": 0.32427277062470194,
"grad_norm": 4.6875,
"learning_rate": 1.783797532748315e-05,
"loss": 1.1083,
"mean_token_accuracy": 0.6907312035560608,
"step": 850
},
{
"epoch": 0.3261802575107296,
"grad_norm": 4.40625,
"learning_rate": 1.7825257535291876e-05,
"loss": 1.0248,
"mean_token_accuracy": 0.7127422258257866,
"step": 855
},
{
"epoch": 0.32808774439675725,
"grad_norm": 4.375,
"learning_rate": 1.7812539743100598e-05,
"loss": 1.1234,
"mean_token_accuracy": 0.6854491457343102,
"step": 860
},
{
"epoch": 0.3299952312827849,
"grad_norm": 5.0625,
"learning_rate": 1.7799821950909323e-05,
"loss": 1.1691,
"mean_token_accuracy": 0.6794403240084648,
"step": 865
},
{
"epoch": 0.3319027181688126,
"grad_norm": 5.5625,
"learning_rate": 1.7787104158718045e-05,
"loss": 1.1286,
"mean_token_accuracy": 0.6781167238950729,
"step": 870
},
{
"epoch": 0.33381020505484027,
"grad_norm": 4.8125,
"learning_rate": 1.777438636652677e-05,
"loss": 1.1314,
"mean_token_accuracy": 0.6972567990422249,
"step": 875
},
{
"epoch": 0.3357176919408679,
"grad_norm": 5.34375,
"learning_rate": 1.7761668574335496e-05,
"loss": 1.0309,
"mean_token_accuracy": 0.7109489843249321,
"step": 880
},
{
"epoch": 0.3376251788268956,
"grad_norm": 5.53125,
"learning_rate": 1.7748950782144222e-05,
"loss": 1.122,
"mean_token_accuracy": 0.6877056941390037,
"step": 885
},
{
"epoch": 0.33953266571292323,
"grad_norm": 4.53125,
"learning_rate": 1.7736232989952947e-05,
"loss": 1.0609,
"mean_token_accuracy": 0.7033521652221679,
"step": 890
},
{
"epoch": 0.3414401525989509,
"grad_norm": 4.125,
"learning_rate": 1.772351519776167e-05,
"loss": 1.0418,
"mean_token_accuracy": 0.7021679773926734,
"step": 895
},
{
"epoch": 0.34334763948497854,
"grad_norm": 4.4375,
"learning_rate": 1.7710797405570395e-05,
"loss": 1.1233,
"mean_token_accuracy": 0.696443286538124,
"step": 900
},
{
"epoch": 0.3452551263710062,
"grad_norm": 4.53125,
"learning_rate": 1.7698079613379117e-05,
"loss": 1.043,
"mean_token_accuracy": 0.7000239789485931,
"step": 905
},
{
"epoch": 0.34716261325703385,
"grad_norm": 4.75,
"learning_rate": 1.7685361821187843e-05,
"loss": 1.0404,
"mean_token_accuracy": 0.7076486960053444,
"step": 910
},
{
"epoch": 0.3490701001430615,
"grad_norm": 4.78125,
"learning_rate": 1.7672644028996568e-05,
"loss": 1.2165,
"mean_token_accuracy": 0.6694424465298653,
"step": 915
},
{
"epoch": 0.35097758702908916,
"grad_norm": 5.15625,
"learning_rate": 1.765992623680529e-05,
"loss": 1.1061,
"mean_token_accuracy": 0.689212466776371,
"step": 920
},
{
"epoch": 0.3528850739151168,
"grad_norm": 4.9375,
"learning_rate": 1.7647208444614016e-05,
"loss": 1.1573,
"mean_token_accuracy": 0.6761819615960121,
"step": 925
},
{
"epoch": 0.3547925608011445,
"grad_norm": 4.96875,
"learning_rate": 1.763449065242274e-05,
"loss": 1.0511,
"mean_token_accuracy": 0.7017502933740616,
"step": 930
},
{
"epoch": 0.3567000476871722,
"grad_norm": 5.0625,
"learning_rate": 1.7621772860231467e-05,
"loss": 1.176,
"mean_token_accuracy": 0.6732675984501839,
"step": 935
},
{
"epoch": 0.3586075345731998,
"grad_norm": 4.1875,
"learning_rate": 1.760905506804019e-05,
"loss": 1.0912,
"mean_token_accuracy": 0.6839179575443268,
"step": 940
},
{
"epoch": 0.3605150214592275,
"grad_norm": 5.53125,
"learning_rate": 1.7596337275848914e-05,
"loss": 1.1048,
"mean_token_accuracy": 0.6849195197224617,
"step": 945
},
{
"epoch": 0.36242250834525513,
"grad_norm": 4.5625,
"learning_rate": 1.758361948365764e-05,
"loss": 1.0162,
"mean_token_accuracy": 0.6860754758119583,
"step": 950
},
{
"epoch": 0.3643299952312828,
"grad_norm": 4.59375,
"learning_rate": 1.7570901691466362e-05,
"loss": 1.0933,
"mean_token_accuracy": 0.6905333071947097,
"step": 955
},
{
"epoch": 0.36623748211731044,
"grad_norm": 4.9375,
"learning_rate": 1.7558183899275088e-05,
"loss": 1.0952,
"mean_token_accuracy": 0.6910875916481019,
"step": 960
},
{
"epoch": 0.3681449690033381,
"grad_norm": 5.25,
"learning_rate": 1.754546610708381e-05,
"loss": 1.2019,
"mean_token_accuracy": 0.6728987120091915,
"step": 965
},
{
"epoch": 0.37005245588936575,
"grad_norm": 3.890625,
"learning_rate": 1.7532748314892535e-05,
"loss": 1.0999,
"mean_token_accuracy": 0.6894315019249916,
"step": 970
},
{
"epoch": 0.3719599427753934,
"grad_norm": 4.28125,
"learning_rate": 1.752003052270126e-05,
"loss": 1.0943,
"mean_token_accuracy": 0.6812393218278885,
"step": 975
},
{
"epoch": 0.37386742966142106,
"grad_norm": 4.5,
"learning_rate": 1.7507312730509983e-05,
"loss": 1.1383,
"mean_token_accuracy": 0.6867758512496949,
"step": 980
},
{
"epoch": 0.3757749165474487,
"grad_norm": 4.46875,
"learning_rate": 1.7494594938318708e-05,
"loss": 1.0795,
"mean_token_accuracy": 0.6977563664317131,
"step": 985
},
{
"epoch": 0.3776824034334764,
"grad_norm": 5.46875,
"learning_rate": 1.7481877146127434e-05,
"loss": 1.0531,
"mean_token_accuracy": 0.7000179141759872,
"step": 990
},
{
"epoch": 0.3795898903195041,
"grad_norm": 5.09375,
"learning_rate": 1.746915935393616e-05,
"loss": 1.1515,
"mean_token_accuracy": 0.679880291223526,
"step": 995
},
{
"epoch": 0.38149737720553173,
"grad_norm": 4.96875,
"learning_rate": 1.7456441561744885e-05,
"loss": 1.139,
"mean_token_accuracy": 0.6736231818795204,
"step": 1000
},
{
"epoch": 0.3834048640915594,
"grad_norm": 4.5625,
"learning_rate": 1.7443723769553607e-05,
"loss": 1.161,
"mean_token_accuracy": 0.6775983899831772,
"step": 1005
},
{
"epoch": 0.38531235097758704,
"grad_norm": 5.65625,
"learning_rate": 1.7431005977362332e-05,
"loss": 1.1337,
"mean_token_accuracy": 0.6791507929563523,
"step": 1010
},
{
"epoch": 0.3872198378636147,
"grad_norm": 4.71875,
"learning_rate": 1.7418288185171054e-05,
"loss": 1.0665,
"mean_token_accuracy": 0.6990373253822326,
"step": 1015
},
{
"epoch": 0.38912732474964234,
"grad_norm": 5.34375,
"learning_rate": 1.740557039297978e-05,
"loss": 0.9875,
"mean_token_accuracy": 0.7201899453997612,
"step": 1020
},
{
"epoch": 0.39103481163567,
"grad_norm": 5.15625,
"learning_rate": 1.7392852600788502e-05,
"loss": 1.1103,
"mean_token_accuracy": 0.6919103190302849,
"step": 1025
},
{
"epoch": 0.39294229852169765,
"grad_norm": 4.90625,
"learning_rate": 1.7380134808597228e-05,
"loss": 1.1621,
"mean_token_accuracy": 0.6763632833957672,
"step": 1030
},
{
"epoch": 0.3948497854077253,
"grad_norm": 5.375,
"learning_rate": 1.7367417016405953e-05,
"loss": 1.0891,
"mean_token_accuracy": 0.6873229309916496,
"step": 1035
},
{
"epoch": 0.39675727229375296,
"grad_norm": 4.9375,
"learning_rate": 1.735469922421468e-05,
"loss": 1.0736,
"mean_token_accuracy": 0.6943288549780846,
"step": 1040
},
{
"epoch": 0.3986647591797806,
"grad_norm": 4.84375,
"learning_rate": 1.7341981432023404e-05,
"loss": 1.1739,
"mean_token_accuracy": 0.6772709146142006,
"step": 1045
},
{
"epoch": 0.4005722460658083,
"grad_norm": 4.9375,
"learning_rate": 1.7329263639832126e-05,
"loss": 1.132,
"mean_token_accuracy": 0.6861761540174485,
"step": 1050
},
{
"epoch": 0.402479732951836,
"grad_norm": 5.0,
"learning_rate": 1.731654584764085e-05,
"loss": 1.0849,
"mean_token_accuracy": 0.6876397714018821,
"step": 1055
},
{
"epoch": 0.40438721983786363,
"grad_norm": 5.28125,
"learning_rate": 1.7303828055449577e-05,
"loss": 1.1119,
"mean_token_accuracy": 0.6828581809997558,
"step": 1060
},
{
"epoch": 0.4062947067238913,
"grad_norm": 4.78125,
"learning_rate": 1.72911102632583e-05,
"loss": 1.1542,
"mean_token_accuracy": 0.6845688432455063,
"step": 1065
},
{
"epoch": 0.40820219360991894,
"grad_norm": 4.78125,
"learning_rate": 1.7278392471067025e-05,
"loss": 1.245,
"mean_token_accuracy": 0.6623896270990371,
"step": 1070
},
{
"epoch": 0.4101096804959466,
"grad_norm": 4.40625,
"learning_rate": 1.7265674678875747e-05,
"loss": 0.9855,
"mean_token_accuracy": 0.7169230833649636,
"step": 1075
},
{
"epoch": 0.41201716738197425,
"grad_norm": 5.28125,
"learning_rate": 1.7252956886684472e-05,
"loss": 1.0003,
"mean_token_accuracy": 0.7067764922976494,
"step": 1080
},
{
"epoch": 0.4139246542680019,
"grad_norm": 4.71875,
"learning_rate": 1.7240239094493198e-05,
"loss": 1.0566,
"mean_token_accuracy": 0.7076253116130828,
"step": 1085
},
{
"epoch": 0.41583214115402956,
"grad_norm": 4.8125,
"learning_rate": 1.722752130230192e-05,
"loss": 1.0692,
"mean_token_accuracy": 0.6991090714931488,
"step": 1090
},
{
"epoch": 0.4177396280400572,
"grad_norm": 4.84375,
"learning_rate": 1.7214803510110646e-05,
"loss": 1.1702,
"mean_token_accuracy": 0.6843620404601097,
"step": 1095
},
{
"epoch": 0.41964711492608486,
"grad_norm": 5.34375,
"learning_rate": 1.720208571791937e-05,
"loss": 1.1131,
"mean_token_accuracy": 0.6780217066407204,
"step": 1100
},
{
"epoch": 0.4215546018121125,
"grad_norm": 5.03125,
"learning_rate": 1.7189367925728097e-05,
"loss": 1.1733,
"mean_token_accuracy": 0.6738237209618092,
"step": 1105
},
{
"epoch": 0.4234620886981402,
"grad_norm": 4.15625,
"learning_rate": 1.717665013353682e-05,
"loss": 1.0971,
"mean_token_accuracy": 0.6967151150107384,
"step": 1110
},
{
"epoch": 0.4253695755841679,
"grad_norm": 6.125,
"learning_rate": 1.7163932341345544e-05,
"loss": 1.0593,
"mean_token_accuracy": 0.6975896939635277,
"step": 1115
},
{
"epoch": 0.42727706247019553,
"grad_norm": 4.875,
"learning_rate": 1.715121454915427e-05,
"loss": 1.045,
"mean_token_accuracy": 0.6995080903172493,
"step": 1120
},
{
"epoch": 0.4291845493562232,
"grad_norm": 4.6875,
"learning_rate": 1.7138496756962992e-05,
"loss": 1.1514,
"mean_token_accuracy": 0.6870008051395416,
"step": 1125
},
{
"epoch": 0.43109203624225084,
"grad_norm": 4.9375,
"learning_rate": 1.7125778964771717e-05,
"loss": 1.147,
"mean_token_accuracy": 0.6756279736757278,
"step": 1130
},
{
"epoch": 0.4329995231282785,
"grad_norm": 3.953125,
"learning_rate": 1.711306117258044e-05,
"loss": 1.0626,
"mean_token_accuracy": 0.6975025564432145,
"step": 1135
},
{
"epoch": 0.43490701001430615,
"grad_norm": 4.78125,
"learning_rate": 1.7100343380389165e-05,
"loss": 1.0843,
"mean_token_accuracy": 0.6906091332435608,
"step": 1140
},
{
"epoch": 0.4368144969003338,
"grad_norm": 4.84375,
"learning_rate": 1.708762558819789e-05,
"loss": 1.1134,
"mean_token_accuracy": 0.6875185921788216,
"step": 1145
},
{
"epoch": 0.43872198378636146,
"grad_norm": 5.28125,
"learning_rate": 1.7074907796006616e-05,
"loss": 1.1802,
"mean_token_accuracy": 0.6715085253119468,
"step": 1150
},
{
"epoch": 0.4406294706723891,
"grad_norm": 5.1875,
"learning_rate": 1.706219000381534e-05,
"loss": 1.0848,
"mean_token_accuracy": 0.6938497066497803,
"step": 1155
},
{
"epoch": 0.44253695755841677,
"grad_norm": 4.34375,
"learning_rate": 1.7049472211624063e-05,
"loss": 0.99,
"mean_token_accuracy": 0.7224840387701988,
"step": 1160
},
{
"epoch": 0.4444444444444444,
"grad_norm": 5.0625,
"learning_rate": 1.703675441943279e-05,
"loss": 1.0878,
"mean_token_accuracy": 0.6980762526392936,
"step": 1165
},
{
"epoch": 0.44635193133047213,
"grad_norm": 4.84375,
"learning_rate": 1.702403662724151e-05,
"loss": 1.0144,
"mean_token_accuracy": 0.7007738411426544,
"step": 1170
},
{
"epoch": 0.4482594182164998,
"grad_norm": 4.71875,
"learning_rate": 1.7011318835050237e-05,
"loss": 1.0875,
"mean_token_accuracy": 0.6976178154349327,
"step": 1175
},
{
"epoch": 0.45016690510252744,
"grad_norm": 4.25,
"learning_rate": 1.6998601042858962e-05,
"loss": 1.1086,
"mean_token_accuracy": 0.6936508253216743,
"step": 1180
},
{
"epoch": 0.4520743919885551,
"grad_norm": 4.625,
"learning_rate": 1.6985883250667684e-05,
"loss": 1.0357,
"mean_token_accuracy": 0.7059789746999741,
"step": 1185
},
{
"epoch": 0.45398187887458274,
"grad_norm": 5.84375,
"learning_rate": 1.697316545847641e-05,
"loss": 1.1516,
"mean_token_accuracy": 0.6797626093029976,
"step": 1190
},
{
"epoch": 0.4558893657606104,
"grad_norm": 4.59375,
"learning_rate": 1.6960447666285135e-05,
"loss": 1.0598,
"mean_token_accuracy": 0.6971547856926918,
"step": 1195
},
{
"epoch": 0.45779685264663805,
"grad_norm": 4.34375,
"learning_rate": 1.6947729874093857e-05,
"loss": 0.9843,
"mean_token_accuracy": 0.7089567899703979,
"step": 1200
},
{
"epoch": 0.4597043395326657,
"grad_norm": 5.6875,
"learning_rate": 1.6935012081902583e-05,
"loss": 1.1058,
"mean_token_accuracy": 0.6865063227713109,
"step": 1205
},
{
"epoch": 0.46161182641869336,
"grad_norm": 4.65625,
"learning_rate": 1.6922294289711308e-05,
"loss": 1.1277,
"mean_token_accuracy": 0.6827705770730972,
"step": 1210
},
{
"epoch": 0.463519313304721,
"grad_norm": 4.875,
"learning_rate": 1.6909576497520034e-05,
"loss": 1.1664,
"mean_token_accuracy": 0.6819883540272713,
"step": 1215
},
{
"epoch": 0.46542680019074867,
"grad_norm": 5.09375,
"learning_rate": 1.6896858705328756e-05,
"loss": 1.0562,
"mean_token_accuracy": 0.6979849010705947,
"step": 1220
},
{
"epoch": 0.4673342870767763,
"grad_norm": 4.875,
"learning_rate": 1.688414091313748e-05,
"loss": 1.1879,
"mean_token_accuracy": 0.6789781466126442,
"step": 1225
},
{
"epoch": 0.46924177396280403,
"grad_norm": 4.6875,
"learning_rate": 1.6871423120946204e-05,
"loss": 1.1305,
"mean_token_accuracy": 0.6878492012619972,
"step": 1230
},
{
"epoch": 0.4711492608488317,
"grad_norm": 4.90625,
"learning_rate": 1.685870532875493e-05,
"loss": 1.1696,
"mean_token_accuracy": 0.6815823867917061,
"step": 1235
},
{
"epoch": 0.47305674773485934,
"grad_norm": 4.4375,
"learning_rate": 1.6845987536563655e-05,
"loss": 0.9887,
"mean_token_accuracy": 0.7134695425629616,
"step": 1240
},
{
"epoch": 0.474964234620887,
"grad_norm": 4.625,
"learning_rate": 1.6833269744372377e-05,
"loss": 1.0254,
"mean_token_accuracy": 0.7122278586030006,
"step": 1245
},
{
"epoch": 0.47687172150691465,
"grad_norm": 5.25,
"learning_rate": 1.6820551952181102e-05,
"loss": 1.123,
"mean_token_accuracy": 0.6886913120746613,
"step": 1250
},
{
"epoch": 0.4787792083929423,
"grad_norm": 4.5625,
"learning_rate": 1.6807834159989828e-05,
"loss": 1.1054,
"mean_token_accuracy": 0.6899633683264256,
"step": 1255
},
{
"epoch": 0.48068669527896996,
"grad_norm": 4.5,
"learning_rate": 1.6795116367798553e-05,
"loss": 0.9764,
"mean_token_accuracy": 0.7191405698657036,
"step": 1260
},
{
"epoch": 0.4825941821649976,
"grad_norm": 4.9375,
"learning_rate": 1.6782398575607275e-05,
"loss": 1.0562,
"mean_token_accuracy": 0.7035323694348335,
"step": 1265
},
{
"epoch": 0.48450166905102526,
"grad_norm": 4.25,
"learning_rate": 1.6769680783416e-05,
"loss": 1.0389,
"mean_token_accuracy": 0.708684840798378,
"step": 1270
},
{
"epoch": 0.4864091559370529,
"grad_norm": 4.375,
"learning_rate": 1.6756962991224726e-05,
"loss": 1.0037,
"mean_token_accuracy": 0.7036843597888947,
"step": 1275
},
{
"epoch": 0.48831664282308057,
"grad_norm": 4.46875,
"learning_rate": 1.674424519903345e-05,
"loss": 0.9991,
"mean_token_accuracy": 0.708541002869606,
"step": 1280
},
{
"epoch": 0.4902241297091082,
"grad_norm": 4.53125,
"learning_rate": 1.6731527406842174e-05,
"loss": 1.0307,
"mean_token_accuracy": 0.7065932080149651,
"step": 1285
},
{
"epoch": 0.49213161659513593,
"grad_norm": 5.3125,
"learning_rate": 1.6718809614650896e-05,
"loss": 1.1893,
"mean_token_accuracy": 0.6674706935882568,
"step": 1290
},
{
"epoch": 0.4940391034811636,
"grad_norm": 4.75,
"learning_rate": 1.670609182245962e-05,
"loss": 1.0691,
"mean_token_accuracy": 0.6954927012324333,
"step": 1295
},
{
"epoch": 0.49594659036719124,
"grad_norm": 5.0,
"learning_rate": 1.6693374030268347e-05,
"loss": 1.1,
"mean_token_accuracy": 0.687789686024189,
"step": 1300
},
{
"epoch": 0.4978540772532189,
"grad_norm": 5.4375,
"learning_rate": 1.6680656238077072e-05,
"loss": 0.995,
"mean_token_accuracy": 0.7104071035981179,
"step": 1305
},
{
"epoch": 0.49976156413924655,
"grad_norm": 8.9375,
"learning_rate": 1.6667938445885795e-05,
"loss": 1.1605,
"mean_token_accuracy": 0.6653927579522133,
"step": 1310
},
{
"epoch": 0.5016690510252741,
"grad_norm": 5.28125,
"learning_rate": 1.665522065369452e-05,
"loss": 1.0855,
"mean_token_accuracy": 0.6951776430010795,
"step": 1315
},
{
"epoch": 0.5035765379113019,
"grad_norm": 5.40625,
"learning_rate": 1.6642502861503246e-05,
"loss": 1.0628,
"mean_token_accuracy": 0.6940785989165306,
"step": 1320
},
{
"epoch": 0.5054840247973296,
"grad_norm": 5.34375,
"learning_rate": 1.6629785069311968e-05,
"loss": 0.988,
"mean_token_accuracy": 0.7169103771448135,
"step": 1325
},
{
"epoch": 0.5073915116833572,
"grad_norm": 6.0,
"learning_rate": 1.6617067277120693e-05,
"loss": 1.1389,
"mean_token_accuracy": 0.6772550821304322,
"step": 1330
},
{
"epoch": 0.5092989985693849,
"grad_norm": 5.21875,
"learning_rate": 1.660434948492942e-05,
"loss": 1.0482,
"mean_token_accuracy": 0.7066885620355606,
"step": 1335
},
{
"epoch": 0.5112064854554125,
"grad_norm": 4.46875,
"learning_rate": 1.659163169273814e-05,
"loss": 1.0581,
"mean_token_accuracy": 0.6999363213777542,
"step": 1340
},
{
"epoch": 0.5131139723414402,
"grad_norm": 4.625,
"learning_rate": 1.6578913900546866e-05,
"loss": 1.0167,
"mean_token_accuracy": 0.7108445912599564,
"step": 1345
},
{
"epoch": 0.5150214592274678,
"grad_norm": 4.34375,
"learning_rate": 1.656619610835559e-05,
"loss": 1.0204,
"mean_token_accuracy": 0.7141309767961502,
"step": 1350
},
{
"epoch": 0.5169289461134955,
"grad_norm": 5.0,
"learning_rate": 1.6553478316164314e-05,
"loss": 1.0762,
"mean_token_accuracy": 0.681866991519928,
"step": 1355
},
{
"epoch": 0.5188364329995231,
"grad_norm": 4.75,
"learning_rate": 1.654076052397304e-05,
"loss": 1.11,
"mean_token_accuracy": 0.6910865843296051,
"step": 1360
},
{
"epoch": 0.5207439198855508,
"grad_norm": 4.9375,
"learning_rate": 1.6528042731781765e-05,
"loss": 1.1043,
"mean_token_accuracy": 0.6882475554943085,
"step": 1365
},
{
"epoch": 0.5226514067715784,
"grad_norm": 4.25,
"learning_rate": 1.651532493959049e-05,
"loss": 1.0554,
"mean_token_accuracy": 0.7098303481936454,
"step": 1370
},
{
"epoch": 0.5245588936576061,
"grad_norm": 4.875,
"learning_rate": 1.6502607147399213e-05,
"loss": 1.0832,
"mean_token_accuracy": 0.7008169665932655,
"step": 1375
},
{
"epoch": 0.5264663805436338,
"grad_norm": 4.75,
"learning_rate": 1.6489889355207938e-05,
"loss": 1.0496,
"mean_token_accuracy": 0.7001980841159821,
"step": 1380
},
{
"epoch": 0.5283738674296614,
"grad_norm": 4.96875,
"learning_rate": 1.647717156301666e-05,
"loss": 1.058,
"mean_token_accuracy": 0.6955149456858635,
"step": 1385
},
{
"epoch": 0.5302813543156891,
"grad_norm": 4.5,
"learning_rate": 1.6464453770825386e-05,
"loss": 1.0637,
"mean_token_accuracy": 0.6973527297377586,
"step": 1390
},
{
"epoch": 0.5321888412017167,
"grad_norm": 4.65625,
"learning_rate": 1.645173597863411e-05,
"loss": 1.0224,
"mean_token_accuracy": 0.6985811904072762,
"step": 1395
},
{
"epoch": 0.5340963280877444,
"grad_norm": 5.90625,
"learning_rate": 1.6439018186442833e-05,
"loss": 0.9889,
"mean_token_accuracy": 0.7180039718747139,
"step": 1400
},
{
"epoch": 0.536003814973772,
"grad_norm": 5.65625,
"learning_rate": 1.642630039425156e-05,
"loss": 1.1035,
"mean_token_accuracy": 0.6911762669682503,
"step": 1405
},
{
"epoch": 0.5379113018597997,
"grad_norm": 5.3125,
"learning_rate": 1.6413582602060284e-05,
"loss": 1.121,
"mean_token_accuracy": 0.6914720147848129,
"step": 1410
},
{
"epoch": 0.5398187887458273,
"grad_norm": 4.6875,
"learning_rate": 1.640086480986901e-05,
"loss": 1.0923,
"mean_token_accuracy": 0.7057502642273903,
"step": 1415
},
{
"epoch": 0.541726275631855,
"grad_norm": 4.03125,
"learning_rate": 1.6388147017677732e-05,
"loss": 1.0645,
"mean_token_accuracy": 0.6903545215725899,
"step": 1420
},
{
"epoch": 0.5436337625178826,
"grad_norm": 4.46875,
"learning_rate": 1.6375429225486457e-05,
"loss": 1.067,
"mean_token_accuracy": 0.6956262946128845,
"step": 1425
},
{
"epoch": 0.5455412494039104,
"grad_norm": 5.40625,
"learning_rate": 1.6362711433295183e-05,
"loss": 1.051,
"mean_token_accuracy": 0.7086583986878395,
"step": 1430
},
{
"epoch": 0.547448736289938,
"grad_norm": 5.15625,
"learning_rate": 1.6349993641103905e-05,
"loss": 1.083,
"mean_token_accuracy": 0.6992680087685585,
"step": 1435
},
{
"epoch": 0.5493562231759657,
"grad_norm": 4.875,
"learning_rate": 1.633727584891263e-05,
"loss": 1.048,
"mean_token_accuracy": 0.7062048301100731,
"step": 1440
},
{
"epoch": 0.5512637100619934,
"grad_norm": 4.375,
"learning_rate": 1.6324558056721353e-05,
"loss": 1.0279,
"mean_token_accuracy": 0.6988534897565841,
"step": 1445
},
{
"epoch": 0.553171196948021,
"grad_norm": 5.59375,
"learning_rate": 1.6311840264530078e-05,
"loss": 1.1266,
"mean_token_accuracy": 0.6839306525886059,
"step": 1450
},
{
"epoch": 0.5550786838340487,
"grad_norm": 5.25,
"learning_rate": 1.6299122472338804e-05,
"loss": 1.0217,
"mean_token_accuracy": 0.711203609406948,
"step": 1455
},
{
"epoch": 0.5569861707200763,
"grad_norm": 4.59375,
"learning_rate": 1.6286404680147526e-05,
"loss": 1.0203,
"mean_token_accuracy": 0.7061103895306587,
"step": 1460
},
{
"epoch": 0.558893657606104,
"grad_norm": 4.5,
"learning_rate": 1.627368688795625e-05,
"loss": 1.1219,
"mean_token_accuracy": 0.6869349181652069,
"step": 1465
},
{
"epoch": 0.5608011444921316,
"grad_norm": 5.78125,
"learning_rate": 1.6260969095764977e-05,
"loss": 1.0224,
"mean_token_accuracy": 0.7102344155311584,
"step": 1470
},
{
"epoch": 0.5627086313781593,
"grad_norm": 4.71875,
"learning_rate": 1.6248251303573702e-05,
"loss": 1.0301,
"mean_token_accuracy": 0.7014199420809746,
"step": 1475
},
{
"epoch": 0.5646161182641869,
"grad_norm": 4.84375,
"learning_rate": 1.6235533511382428e-05,
"loss": 1.0919,
"mean_token_accuracy": 0.6958119504153728,
"step": 1480
},
{
"epoch": 0.5665236051502146,
"grad_norm": 3.796875,
"learning_rate": 1.622281571919115e-05,
"loss": 1.0172,
"mean_token_accuracy": 0.7171666666865348,
"step": 1485
},
{
"epoch": 0.5684310920362422,
"grad_norm": 4.9375,
"learning_rate": 1.6210097926999875e-05,
"loss": 1.0293,
"mean_token_accuracy": 0.7110596433281898,
"step": 1490
},
{
"epoch": 0.5703385789222699,
"grad_norm": 5.0,
"learning_rate": 1.6197380134808597e-05,
"loss": 1.0437,
"mean_token_accuracy": 0.7055989898741245,
"step": 1495
},
{
"epoch": 0.5722460658082976,
"grad_norm": 4.5625,
"learning_rate": 1.6184662342617323e-05,
"loss": 1.1109,
"mean_token_accuracy": 0.6983527675271034,
"step": 1500
},
{
"epoch": 0.5741535526943252,
"grad_norm": 5.21875,
"learning_rate": 1.6171944550426045e-05,
"loss": 1.0585,
"mean_token_accuracy": 0.7039476573467255,
"step": 1505
},
{
"epoch": 0.5760610395803529,
"grad_norm": 4.71875,
"learning_rate": 1.615922675823477e-05,
"loss": 1.0795,
"mean_token_accuracy": 0.6893104076385498,
"step": 1510
},
{
"epoch": 0.5779685264663805,
"grad_norm": 4.71875,
"learning_rate": 1.6146508966043496e-05,
"loss": 0.9975,
"mean_token_accuracy": 0.7137760400772095,
"step": 1515
},
{
"epoch": 0.5798760133524082,
"grad_norm": 4.5625,
"learning_rate": 1.613379117385222e-05,
"loss": 1.0283,
"mean_token_accuracy": 0.7071486204862595,
"step": 1520
},
{
"epoch": 0.5817835002384358,
"grad_norm": 4.125,
"learning_rate": 1.6121073381660947e-05,
"loss": 0.9634,
"mean_token_accuracy": 0.7267276033759117,
"step": 1525
},
{
"epoch": 0.5836909871244635,
"grad_norm": 5.28125,
"learning_rate": 1.610835558946967e-05,
"loss": 1.141,
"mean_token_accuracy": 0.6760995179414749,
"step": 1530
},
{
"epoch": 0.5855984740104911,
"grad_norm": 4.3125,
"learning_rate": 1.6095637797278395e-05,
"loss": 1.0584,
"mean_token_accuracy": 0.7001825541257858,
"step": 1535
},
{
"epoch": 0.5875059608965189,
"grad_norm": 4.71875,
"learning_rate": 1.608292000508712e-05,
"loss": 0.9554,
"mean_token_accuracy": 0.7191137507557869,
"step": 1540
},
{
"epoch": 0.5894134477825465,
"grad_norm": 4.59375,
"learning_rate": 1.6070202212895842e-05,
"loss": 1.0338,
"mean_token_accuracy": 0.6991492182016372,
"step": 1545
},
{
"epoch": 0.5913209346685742,
"grad_norm": 4.375,
"learning_rate": 1.6057484420704568e-05,
"loss": 1.1116,
"mean_token_accuracy": 0.6837974414229393,
"step": 1550
},
{
"epoch": 0.5932284215546018,
"grad_norm": 5.0,
"learning_rate": 1.604476662851329e-05,
"loss": 1.1166,
"mean_token_accuracy": 0.6914651602506637,
"step": 1555
},
{
"epoch": 0.5951359084406295,
"grad_norm": 4.53125,
"learning_rate": 1.6032048836322015e-05,
"loss": 1.092,
"mean_token_accuracy": 0.6894845418632031,
"step": 1560
},
{
"epoch": 0.5970433953266572,
"grad_norm": 3.96875,
"learning_rate": 1.601933104413074e-05,
"loss": 1.0382,
"mean_token_accuracy": 0.7044162392616272,
"step": 1565
},
{
"epoch": 0.5989508822126848,
"grad_norm": 5.28125,
"learning_rate": 1.6006613251939463e-05,
"loss": 1.1431,
"mean_token_accuracy": 0.6869931921362877,
"step": 1570
},
{
"epoch": 0.6008583690987125,
"grad_norm": 5.03125,
"learning_rate": 1.599389545974819e-05,
"loss": 0.9842,
"mean_token_accuracy": 0.7128385215997696,
"step": 1575
},
{
"epoch": 0.6027658559847401,
"grad_norm": 4.09375,
"learning_rate": 1.5981177667556914e-05,
"loss": 0.9261,
"mean_token_accuracy": 0.7247092142701149,
"step": 1580
},
{
"epoch": 0.6046733428707678,
"grad_norm": 6.1875,
"learning_rate": 1.596845987536564e-05,
"loss": 1.1425,
"mean_token_accuracy": 0.690905112028122,
"step": 1585
},
{
"epoch": 0.6065808297567954,
"grad_norm": 4.15625,
"learning_rate": 1.595574208317436e-05,
"loss": 1.0104,
"mean_token_accuracy": 0.7160186618566513,
"step": 1590
},
{
"epoch": 0.6084883166428231,
"grad_norm": 4.25,
"learning_rate": 1.5943024290983087e-05,
"loss": 1.0177,
"mean_token_accuracy": 0.7115990072488785,
"step": 1595
},
{
"epoch": 0.6103958035288507,
"grad_norm": 4.125,
"learning_rate": 1.5930306498791813e-05,
"loss": 0.9649,
"mean_token_accuracy": 0.7167477622628212,
"step": 1600
},
{
"epoch": 0.6123032904148784,
"grad_norm": 4.78125,
"learning_rate": 1.5917588706600535e-05,
"loss": 1.0788,
"mean_token_accuracy": 0.6938311874866485,
"step": 1605
},
{
"epoch": 0.614210777300906,
"grad_norm": 4.8125,
"learning_rate": 1.590487091440926e-05,
"loss": 1.0125,
"mean_token_accuracy": 0.7016476511955261,
"step": 1610
},
{
"epoch": 0.6161182641869337,
"grad_norm": 4.875,
"learning_rate": 1.5892153122217982e-05,
"loss": 1.0339,
"mean_token_accuracy": 0.7032722011208534,
"step": 1615
},
{
"epoch": 0.6180257510729614,
"grad_norm": 5.0625,
"learning_rate": 1.5879435330026708e-05,
"loss": 1.0328,
"mean_token_accuracy": 0.7095901571214199,
"step": 1620
},
{
"epoch": 0.619933237958989,
"grad_norm": 5.25,
"learning_rate": 1.5866717537835433e-05,
"loss": 1.0373,
"mean_token_accuracy": 0.7035743817687035,
"step": 1625
},
{
"epoch": 0.6218407248450167,
"grad_norm": 4.53125,
"learning_rate": 1.585399974564416e-05,
"loss": 0.9541,
"mean_token_accuracy": 0.7259443908929825,
"step": 1630
},
{
"epoch": 0.6237482117310443,
"grad_norm": 5.40625,
"learning_rate": 1.5841281953452884e-05,
"loss": 1.163,
"mean_token_accuracy": 0.6807891383767128,
"step": 1635
},
{
"epoch": 0.625655698617072,
"grad_norm": 5.46875,
"learning_rate": 1.5828564161261606e-05,
"loss": 0.9723,
"mean_token_accuracy": 0.7241554304957389,
"step": 1640
},
{
"epoch": 0.6275631855030996,
"grad_norm": 4.5625,
"learning_rate": 1.5815846369070332e-05,
"loss": 1.0736,
"mean_token_accuracy": 0.7066580310463906,
"step": 1645
},
{
"epoch": 0.6294706723891274,
"grad_norm": 5.125,
"learning_rate": 1.5803128576879054e-05,
"loss": 1.0553,
"mean_token_accuracy": 0.7062164053320885,
"step": 1650
},
{
"epoch": 0.631378159275155,
"grad_norm": 4.1875,
"learning_rate": 1.579041078468778e-05,
"loss": 1.0238,
"mean_token_accuracy": 0.7111857526004315,
"step": 1655
},
{
"epoch": 0.6332856461611827,
"grad_norm": 4.59375,
"learning_rate": 1.5777692992496505e-05,
"loss": 0.9547,
"mean_token_accuracy": 0.7140131160616875,
"step": 1660
},
{
"epoch": 0.6351931330472103,
"grad_norm": 4.75,
"learning_rate": 1.5764975200305227e-05,
"loss": 1.0719,
"mean_token_accuracy": 0.6966261744499207,
"step": 1665
},
{
"epoch": 0.637100619933238,
"grad_norm": 4.5625,
"learning_rate": 1.5752257408113953e-05,
"loss": 1.0385,
"mean_token_accuracy": 0.7105199143290519,
"step": 1670
},
{
"epoch": 0.6390081068192656,
"grad_norm": 5.40625,
"learning_rate": 1.5739539615922678e-05,
"loss": 1.0362,
"mean_token_accuracy": 0.7015564523637294,
"step": 1675
},
{
"epoch": 0.6409155937052933,
"grad_norm": 5.3125,
"learning_rate": 1.57268218237314e-05,
"loss": 0.9366,
"mean_token_accuracy": 0.7337832853198052,
"step": 1680
},
{
"epoch": 0.642823080591321,
"grad_norm": 5.4375,
"learning_rate": 1.5714104031540126e-05,
"loss": 1.1166,
"mean_token_accuracy": 0.6753425896167755,
"step": 1685
},
{
"epoch": 0.6447305674773486,
"grad_norm": 4.5625,
"learning_rate": 1.570138623934885e-05,
"loss": 1.0364,
"mean_token_accuracy": 0.7042675077915191,
"step": 1690
},
{
"epoch": 0.6466380543633763,
"grad_norm": 5.1875,
"learning_rate": 1.5688668447157577e-05,
"loss": 0.9948,
"mean_token_accuracy": 0.7185570999979973,
"step": 1695
},
{
"epoch": 0.6485455412494039,
"grad_norm": 5.25,
"learning_rate": 1.56759506549663e-05,
"loss": 1.0878,
"mean_token_accuracy": 0.6928838163614273,
"step": 1700
},
{
"epoch": 0.6504530281354316,
"grad_norm": 5.90625,
"learning_rate": 1.5663232862775024e-05,
"loss": 1.0671,
"mean_token_accuracy": 0.7043869346380234,
"step": 1705
},
{
"epoch": 0.6523605150214592,
"grad_norm": 4.5,
"learning_rate": 1.5650515070583746e-05,
"loss": 1.0721,
"mean_token_accuracy": 0.7013254553079605,
"step": 1710
},
{
"epoch": 0.6542680019074869,
"grad_norm": 4.90625,
"learning_rate": 1.5637797278392472e-05,
"loss": 1.0065,
"mean_token_accuracy": 0.7042224168777466,
"step": 1715
},
{
"epoch": 0.6561754887935145,
"grad_norm": 4.15625,
"learning_rate": 1.5625079486201197e-05,
"loss": 1.0739,
"mean_token_accuracy": 0.6945244207978248,
"step": 1720
},
{
"epoch": 0.6580829756795422,
"grad_norm": 5.5625,
"learning_rate": 1.561236169400992e-05,
"loss": 1.0859,
"mean_token_accuracy": 0.6950926452875137,
"step": 1725
},
{
"epoch": 0.6599904625655698,
"grad_norm": 4.8125,
"learning_rate": 1.5599643901818645e-05,
"loss": 0.9103,
"mean_token_accuracy": 0.7312288954854012,
"step": 1730
},
{
"epoch": 0.6618979494515975,
"grad_norm": 4.375,
"learning_rate": 1.558692610962737e-05,
"loss": 0.9913,
"mean_token_accuracy": 0.7258316233754158,
"step": 1735
},
{
"epoch": 0.6638054363376252,
"grad_norm": 5.84375,
"learning_rate": 1.5574208317436096e-05,
"loss": 1.1365,
"mean_token_accuracy": 0.6847386986017228,
"step": 1740
},
{
"epoch": 0.6657129232236528,
"grad_norm": 4.34375,
"learning_rate": 1.5561490525244818e-05,
"loss": 0.9208,
"mean_token_accuracy": 0.7318013399839401,
"step": 1745
},
{
"epoch": 0.6676204101096805,
"grad_norm": 4.59375,
"learning_rate": 1.5548772733053544e-05,
"loss": 1.0381,
"mean_token_accuracy": 0.7117675840854645,
"step": 1750
},
{
"epoch": 0.6695278969957081,
"grad_norm": 4.0,
"learning_rate": 1.553605494086227e-05,
"loss": 0.9807,
"mean_token_accuracy": 0.7145498290657997,
"step": 1755
},
{
"epoch": 0.6714353838817358,
"grad_norm": 4.4375,
"learning_rate": 1.552333714867099e-05,
"loss": 0.9746,
"mean_token_accuracy": 0.7150619328022003,
"step": 1760
},
{
"epoch": 0.6733428707677634,
"grad_norm": 5.0625,
"learning_rate": 1.5510619356479717e-05,
"loss": 1.0595,
"mean_token_accuracy": 0.7089547023177147,
"step": 1765
},
{
"epoch": 0.6752503576537912,
"grad_norm": 4.59375,
"learning_rate": 1.549790156428844e-05,
"loss": 1.0631,
"mean_token_accuracy": 0.7015356197953224,
"step": 1770
},
{
"epoch": 0.6771578445398188,
"grad_norm": 6.28125,
"learning_rate": 1.5485183772097164e-05,
"loss": 1.0188,
"mean_token_accuracy": 0.7043671816587448,
"step": 1775
},
{
"epoch": 0.6790653314258465,
"grad_norm": 4.34375,
"learning_rate": 1.547246597990589e-05,
"loss": 1.017,
"mean_token_accuracy": 0.707620695233345,
"step": 1780
},
{
"epoch": 0.6809728183118741,
"grad_norm": 5.25,
"learning_rate": 1.5459748187714615e-05,
"loss": 1.0261,
"mean_token_accuracy": 0.7107647344470024,
"step": 1785
},
{
"epoch": 0.6828803051979018,
"grad_norm": 5.1875,
"learning_rate": 1.5447030395523338e-05,
"loss": 0.976,
"mean_token_accuracy": 0.7236540615558624,
"step": 1790
},
{
"epoch": 0.6847877920839294,
"grad_norm": 4.40625,
"learning_rate": 1.5434312603332063e-05,
"loss": 1.0489,
"mean_token_accuracy": 0.7052806288003921,
"step": 1795
},
{
"epoch": 0.6866952789699571,
"grad_norm": 4.53125,
"learning_rate": 1.542159481114079e-05,
"loss": 1.0648,
"mean_token_accuracy": 0.6937704533338547,
"step": 1800
},
{
"epoch": 0.6886027658559848,
"grad_norm": 3.921875,
"learning_rate": 1.540887701894951e-05,
"loss": 1.0472,
"mean_token_accuracy": 0.7000033929944038,
"step": 1805
},
{
"epoch": 0.6905102527420124,
"grad_norm": 4.0625,
"learning_rate": 1.5396159226758236e-05,
"loss": 0.9398,
"mean_token_accuracy": 0.7206048682332039,
"step": 1810
},
{
"epoch": 0.6924177396280401,
"grad_norm": 4.71875,
"learning_rate": 1.538344143456696e-05,
"loss": 1.078,
"mean_token_accuracy": 0.6949393272399902,
"step": 1815
},
{
"epoch": 0.6943252265140677,
"grad_norm": 4.78125,
"learning_rate": 1.5370723642375684e-05,
"loss": 0.9936,
"mean_token_accuracy": 0.716967236995697,
"step": 1820
},
{
"epoch": 0.6962327134000954,
"grad_norm": 5.46875,
"learning_rate": 1.535800585018441e-05,
"loss": 1.0597,
"mean_token_accuracy": 0.7033153355121613,
"step": 1825
},
{
"epoch": 0.698140200286123,
"grad_norm": 5.59375,
"learning_rate": 1.534528805799313e-05,
"loss": 1.0689,
"mean_token_accuracy": 0.6991181001067162,
"step": 1830
},
{
"epoch": 0.7000476871721507,
"grad_norm": 5.59375,
"learning_rate": 1.5332570265801857e-05,
"loss": 1.0807,
"mean_token_accuracy": 0.6999934658408165,
"step": 1835
},
{
"epoch": 0.7019551740581783,
"grad_norm": 5.0625,
"learning_rate": 1.5319852473610582e-05,
"loss": 0.9682,
"mean_token_accuracy": 0.723272667825222,
"step": 1840
},
{
"epoch": 0.703862660944206,
"grad_norm": 4.15625,
"learning_rate": 1.5307134681419308e-05,
"loss": 1.031,
"mean_token_accuracy": 0.6966592162847519,
"step": 1845
},
{
"epoch": 0.7057701478302336,
"grad_norm": 5.15625,
"learning_rate": 1.5294416889228033e-05,
"loss": 1.0431,
"mean_token_accuracy": 0.714342576265335,
"step": 1850
},
{
"epoch": 0.7076776347162613,
"grad_norm": 5.78125,
"learning_rate": 1.5281699097036755e-05,
"loss": 1.0259,
"mean_token_accuracy": 0.7083830907940865,
"step": 1855
},
{
"epoch": 0.709585121602289,
"grad_norm": 4.5625,
"learning_rate": 1.526898130484548e-05,
"loss": 0.9847,
"mean_token_accuracy": 0.7163553655147552,
"step": 1860
},
{
"epoch": 0.7114926084883166,
"grad_norm": 6.28125,
"learning_rate": 1.5256263512654203e-05,
"loss": 0.9951,
"mean_token_accuracy": 0.7172233253717423,
"step": 1865
},
{
"epoch": 0.7134000953743443,
"grad_norm": 5.25,
"learning_rate": 1.5243545720462929e-05,
"loss": 1.0708,
"mean_token_accuracy": 0.6974482744932174,
"step": 1870
},
{
"epoch": 0.7153075822603719,
"grad_norm": 4.625,
"learning_rate": 1.5230827928271654e-05,
"loss": 1.0573,
"mean_token_accuracy": 0.7091765016317367,
"step": 1875
},
{
"epoch": 0.7172150691463997,
"grad_norm": 4.96875,
"learning_rate": 1.5218110136080378e-05,
"loss": 0.969,
"mean_token_accuracy": 0.7144028797745705,
"step": 1880
},
{
"epoch": 0.7191225560324273,
"grad_norm": 4.71875,
"learning_rate": 1.5205392343889103e-05,
"loss": 1.0841,
"mean_token_accuracy": 0.696842522919178,
"step": 1885
},
{
"epoch": 0.721030042918455,
"grad_norm": 4.21875,
"learning_rate": 1.5192674551697825e-05,
"loss": 0.9193,
"mean_token_accuracy": 0.7247643247246742,
"step": 1890
},
{
"epoch": 0.7229375298044826,
"grad_norm": 5.3125,
"learning_rate": 1.5179956759506551e-05,
"loss": 0.9693,
"mean_token_accuracy": 0.7202573090791702,
"step": 1895
},
{
"epoch": 0.7248450166905103,
"grad_norm": 4.59375,
"learning_rate": 1.5167238967315275e-05,
"loss": 0.9176,
"mean_token_accuracy": 0.7282809093594551,
"step": 1900
},
{
"epoch": 0.7267525035765379,
"grad_norm": 4.53125,
"learning_rate": 1.5154521175124e-05,
"loss": 0.9122,
"mean_token_accuracy": 0.7338912770152092,
"step": 1905
},
{
"epoch": 0.7286599904625656,
"grad_norm": 4.96875,
"learning_rate": 1.5141803382932724e-05,
"loss": 1.0756,
"mean_token_accuracy": 0.6949716225266457,
"step": 1910
},
{
"epoch": 0.7305674773485932,
"grad_norm": 4.96875,
"learning_rate": 1.5129085590741448e-05,
"loss": 1.0789,
"mean_token_accuracy": 0.6967040143907071,
"step": 1915
},
{
"epoch": 0.7324749642346209,
"grad_norm": 4.71875,
"learning_rate": 1.5116367798550173e-05,
"loss": 0.9752,
"mean_token_accuracy": 0.7204620942473412,
"step": 1920
},
{
"epoch": 0.7343824511206486,
"grad_norm": 3.8125,
"learning_rate": 1.5103650006358897e-05,
"loss": 1.0254,
"mean_token_accuracy": 0.7050608977675438,
"step": 1925
},
{
"epoch": 0.7362899380066762,
"grad_norm": 4.5625,
"learning_rate": 1.5090932214167621e-05,
"loss": 0.9845,
"mean_token_accuracy": 0.718043963611126,
"step": 1930
},
{
"epoch": 0.7381974248927039,
"grad_norm": 4.75,
"learning_rate": 1.5078214421976347e-05,
"loss": 0.9798,
"mean_token_accuracy": 0.7155537083745003,
"step": 1935
},
{
"epoch": 0.7401049117787315,
"grad_norm": 4.125,
"learning_rate": 1.506549662978507e-05,
"loss": 0.9766,
"mean_token_accuracy": 0.7140335828065872,
"step": 1940
},
{
"epoch": 0.7420123986647592,
"grad_norm": 4.125,
"learning_rate": 1.5052778837593796e-05,
"loss": 1.048,
"mean_token_accuracy": 0.6943504139780998,
"step": 1945
},
{
"epoch": 0.7439198855507868,
"grad_norm": 4.46875,
"learning_rate": 1.5040061045402518e-05,
"loss": 0.9615,
"mean_token_accuracy": 0.7220544546842576,
"step": 1950
},
{
"epoch": 0.7458273724368145,
"grad_norm": 5.0,
"learning_rate": 1.5027343253211243e-05,
"loss": 0.9978,
"mean_token_accuracy": 0.7115991845726967,
"step": 1955
},
{
"epoch": 0.7477348593228421,
"grad_norm": 4.25,
"learning_rate": 1.5014625461019967e-05,
"loss": 0.9384,
"mean_token_accuracy": 0.7263565197587013,
"step": 1960
},
{
"epoch": 0.7496423462088698,
"grad_norm": 4.90625,
"learning_rate": 1.5001907668828693e-05,
"loss": 1.0709,
"mean_token_accuracy": 0.6875549122691155,
"step": 1965
},
{
"epoch": 0.7515498330948974,
"grad_norm": 5.4375,
"learning_rate": 1.4989189876637418e-05,
"loss": 1.0374,
"mean_token_accuracy": 0.7037866428494454,
"step": 1970
},
{
"epoch": 0.7534573199809251,
"grad_norm": 4.4375,
"learning_rate": 1.497647208444614e-05,
"loss": 0.9627,
"mean_token_accuracy": 0.7117268234491348,
"step": 1975
},
{
"epoch": 0.7553648068669528,
"grad_norm": 4.34375,
"learning_rate": 1.4963754292254866e-05,
"loss": 0.9921,
"mean_token_accuracy": 0.7058862507343292,
"step": 1980
},
{
"epoch": 0.7572722937529804,
"grad_norm": 5.09375,
"learning_rate": 1.495103650006359e-05,
"loss": 0.961,
"mean_token_accuracy": 0.7215966627001762,
"step": 1985
},
{
"epoch": 0.7591797806390082,
"grad_norm": 4.375,
"learning_rate": 1.4938318707872315e-05,
"loss": 0.8637,
"mean_token_accuracy": 0.7477660223841667,
"step": 1990
},
{
"epoch": 0.7610872675250357,
"grad_norm": 4.71875,
"learning_rate": 1.492560091568104e-05,
"loss": 0.9912,
"mean_token_accuracy": 0.7125525683164596,
"step": 1995
},
{
"epoch": 0.7629947544110635,
"grad_norm": 4.09375,
"learning_rate": 1.4912883123489763e-05,
"loss": 0.9836,
"mean_token_accuracy": 0.7163909748196602,
"step": 2000
}
],
"logging_steps": 5,
"max_steps": 7863,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.321196176694559e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}