{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9995138551288284,
"eval_steps": 500,
"global_step": 1028,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0009722897423432182,
"grad_norm": 22.865641182810197,
"learning_rate": 9.70873786407767e-08,
"loss": 1.3413,
"step": 1
},
{
"epoch": 0.004861448711716091,
"grad_norm": 21.908798159393672,
"learning_rate": 4.854368932038835e-07,
"loss": 1.3405,
"step": 5
},
{
"epoch": 0.009722897423432183,
"grad_norm": 8.67999928561487,
"learning_rate": 9.70873786407767e-07,
"loss": 1.227,
"step": 10
},
{
"epoch": 0.014584346135148274,
"grad_norm": 9.222051962167232,
"learning_rate": 1.4563106796116506e-06,
"loss": 1.0774,
"step": 15
},
{
"epoch": 0.019445794846864366,
"grad_norm": 3.108065986341407,
"learning_rate": 1.941747572815534e-06,
"loss": 0.9546,
"step": 20
},
{
"epoch": 0.024307243558580455,
"grad_norm": 2.3305737963317545,
"learning_rate": 2.427184466019418e-06,
"loss": 0.8883,
"step": 25
},
{
"epoch": 0.02916869227029655,
"grad_norm": 2.166786564817553,
"learning_rate": 2.912621359223301e-06,
"loss": 0.8596,
"step": 30
},
{
"epoch": 0.03403014098201264,
"grad_norm": 2.2200337024713366,
"learning_rate": 3.398058252427185e-06,
"loss": 0.8302,
"step": 35
},
{
"epoch": 0.03889158969372873,
"grad_norm": 2.266283635100358,
"learning_rate": 3.883495145631068e-06,
"loss": 0.82,
"step": 40
},
{
"epoch": 0.043753038405444825,
"grad_norm": 2.4391536905162696,
"learning_rate": 4.368932038834952e-06,
"loss": 0.8,
"step": 45
},
{
"epoch": 0.04861448711716091,
"grad_norm": 2.3258135076743423,
"learning_rate": 4.854368932038836e-06,
"loss": 0.792,
"step": 50
},
{
"epoch": 0.053475935828877004,
"grad_norm": 2.4765334900301914,
"learning_rate": 5.3398058252427185e-06,
"loss": 0.7804,
"step": 55
},
{
"epoch": 0.0583373845405931,
"grad_norm": 2.7215966994164704,
"learning_rate": 5.825242718446602e-06,
"loss": 0.764,
"step": 60
},
{
"epoch": 0.06319883325230918,
"grad_norm": 2.4553563315970854,
"learning_rate": 6.310679611650487e-06,
"loss": 0.7521,
"step": 65
},
{
"epoch": 0.06806028196402528,
"grad_norm": 2.4367790074569675,
"learning_rate": 6.79611650485437e-06,
"loss": 0.7394,
"step": 70
},
{
"epoch": 0.07292173067574137,
"grad_norm": 2.335256384006913,
"learning_rate": 7.2815533980582534e-06,
"loss": 0.7134,
"step": 75
},
{
"epoch": 0.07778317938745746,
"grad_norm": 2.1323333889527953,
"learning_rate": 7.766990291262136e-06,
"loss": 0.7293,
"step": 80
},
{
"epoch": 0.08264462809917356,
"grad_norm": 2.1774984480511095,
"learning_rate": 8.25242718446602e-06,
"loss": 0.7164,
"step": 85
},
{
"epoch": 0.08750607681088965,
"grad_norm": 2.2022981209541608,
"learning_rate": 8.737864077669904e-06,
"loss": 0.7011,
"step": 90
},
{
"epoch": 0.09236752552260574,
"grad_norm": 2.312357582283152,
"learning_rate": 9.223300970873788e-06,
"loss": 0.7079,
"step": 95
},
{
"epoch": 0.09722897423432182,
"grad_norm": 2.1629606540297406,
"learning_rate": 9.708737864077671e-06,
"loss": 0.7021,
"step": 100
},
{
"epoch": 0.10209042294603791,
"grad_norm": 2.5455373567567054,
"learning_rate": 9.999884650793835e-06,
"loss": 0.6992,
"step": 105
},
{
"epoch": 0.10695187165775401,
"grad_norm": 2.3692741540462237,
"learning_rate": 9.998587033345546e-06,
"loss": 0.694,
"step": 110
},
{
"epoch": 0.1118133203694701,
"grad_norm": 2.1411842110512964,
"learning_rate": 9.995847987378953e-06,
"loss": 0.6975,
"step": 115
},
{
"epoch": 0.1166747690811862,
"grad_norm": 2.089784612791783,
"learning_rate": 9.991668302745053e-06,
"loss": 0.6838,
"step": 120
},
{
"epoch": 0.12153621779290229,
"grad_norm": 2.323386252470974,
"learning_rate": 9.98604918472778e-06,
"loss": 0.6777,
"step": 125
},
{
"epoch": 0.12639766650461837,
"grad_norm": 2.109074650826608,
"learning_rate": 9.97899225369643e-06,
"loss": 0.6804,
"step": 130
},
{
"epoch": 0.13125911521633446,
"grad_norm": 1.977824454327765,
"learning_rate": 9.970499544638405e-06,
"loss": 0.6828,
"step": 135
},
{
"epoch": 0.13612056392805055,
"grad_norm": 2.3675044930177314,
"learning_rate": 9.960573506572391e-06,
"loss": 0.6685,
"step": 140
},
{
"epoch": 0.14098201263976665,
"grad_norm": 2.135635558376172,
"learning_rate": 9.949217001842128e-06,
"loss": 0.6799,
"step": 145
},
{
"epoch": 0.14584346135148274,
"grad_norm": 2.1238487943590982,
"learning_rate": 9.93643330529103e-06,
"loss": 0.662,
"step": 150
},
{
"epoch": 0.15070491006319883,
"grad_norm": 2.0658607430202953,
"learning_rate": 9.922226103317802e-06,
"loss": 0.6682,
"step": 155
},
{
"epoch": 0.15556635877491493,
"grad_norm": 2.3142186772351936,
"learning_rate": 9.906599492813413e-06,
"loss": 0.6465,
"step": 160
},
{
"epoch": 0.16042780748663102,
"grad_norm": 2.078145593908007,
"learning_rate": 9.889557979979695e-06,
"loss": 0.6493,
"step": 165
},
{
"epoch": 0.1652892561983471,
"grad_norm": 2.174446805793003,
"learning_rate": 9.871106479029889e-06,
"loss": 0.6642,
"step": 170
},
{
"epoch": 0.1701507049100632,
"grad_norm": 2.1609913986701184,
"learning_rate": 9.851250310771552e-06,
"loss": 0.6563,
"step": 175
},
{
"epoch": 0.1750121536217793,
"grad_norm": 2.0544241967243533,
"learning_rate": 9.829995201072217e-06,
"loss": 0.6421,
"step": 180
},
{
"epoch": 0.1798736023334954,
"grad_norm": 2.1208283810533115,
"learning_rate": 9.807347279208233e-06,
"loss": 0.6427,
"step": 185
},
{
"epoch": 0.18473505104521148,
"grad_norm": 2.3054596970818593,
"learning_rate": 9.783313076097285e-06,
"loss": 0.6441,
"step": 190
},
{
"epoch": 0.18959649975692758,
"grad_norm": 2.002111307715066,
"learning_rate": 9.75789952241509e-06,
"loss": 0.6211,
"step": 195
},
{
"epoch": 0.19445794846864364,
"grad_norm": 2.0437914538851896,
"learning_rate": 9.73111394659682e-06,
"loss": 0.6499,
"step": 200
},
{
"epoch": 0.19931939718035974,
"grad_norm": 2.0833682723245106,
"learning_rate": 9.702964072723825e-06,
"loss": 0.6286,
"step": 205
},
{
"epoch": 0.20418084589207583,
"grad_norm": 1.8608505397920612,
"learning_rate": 9.673458018296249e-06,
"loss": 0.6214,
"step": 210
},
{
"epoch": 0.20904229460379192,
"grad_norm": 1.9565897157549554,
"learning_rate": 9.642604291892227e-06,
"loss": 0.6313,
"step": 215
},
{
"epoch": 0.21390374331550802,
"grad_norm": 2.0254219704696546,
"learning_rate": 9.610411790714274e-06,
"loss": 0.6294,
"step": 220
},
{
"epoch": 0.2187651920272241,
"grad_norm": 2.042861634862239,
"learning_rate": 9.576889798023632e-06,
"loss": 0.6062,
"step": 225
},
{
"epoch": 0.2236266407389402,
"grad_norm": 1.9416084530570248,
"learning_rate": 9.54204798046328e-06,
"loss": 0.6163,
"step": 230
},
{
"epoch": 0.2284880894506563,
"grad_norm": 2.1101068845725446,
"learning_rate": 9.505896385270397e-06,
"loss": 0.6086,
"step": 235
},
{
"epoch": 0.2333495381623724,
"grad_norm": 1.9316514600214862,
"learning_rate": 9.468445437379054e-06,
"loss": 0.6163,
"step": 240
},
{
"epoch": 0.23821098687408848,
"grad_norm": 2.31121577708352,
"learning_rate": 9.42970593641402e-06,
"loss": 0.6083,
"step": 245
},
{
"epoch": 0.24307243558580457,
"grad_norm": 2.084199435888248,
"learning_rate": 9.389689053576497e-06,
"loss": 0.5875,
"step": 250
},
{
"epoch": 0.24793388429752067,
"grad_norm": 2.0679175188925742,
"learning_rate": 9.348406328422714e-06,
"loss": 0.5889,
"step": 255
},
{
"epoch": 0.25279533300923673,
"grad_norm": 1.935193185903998,
"learning_rate": 9.305869665536296e-06,
"loss": 0.5978,
"step": 260
},
{
"epoch": 0.25765678172095285,
"grad_norm": 1.9881581060186517,
"learning_rate": 9.262091331095375e-06,
"loss": 0.5963,
"step": 265
},
{
"epoch": 0.2625182304326689,
"grad_norm": 1.9171404527587066,
"learning_rate": 9.217083949335429e-06,
"loss": 0.5826,
"step": 270
},
{
"epoch": 0.26737967914438504,
"grad_norm": 2.066905479586149,
"learning_rate": 9.170860498908851e-06,
"loss": 0.5868,
"step": 275
},
{
"epoch": 0.2722411278561011,
"grad_norm": 1.9405147945598347,
"learning_rate": 9.12343430914236e-06,
"loss": 0.5938,
"step": 280
},
{
"epoch": 0.2771025765678172,
"grad_norm": 2.042477110270263,
"learning_rate": 9.07481905619323e-06,
"loss": 0.5879,
"step": 285
},
{
"epoch": 0.2819640252795333,
"grad_norm": 2.316892218301555,
"learning_rate": 9.025028759105558e-06,
"loss": 0.5862,
"step": 290
},
{
"epoch": 0.2868254739912494,
"grad_norm": 1.9826669776669494,
"learning_rate": 8.974077775767618e-06,
"loss": 0.5795,
"step": 295
},
{
"epoch": 0.2916869227029655,
"grad_norm": 1.9928663954865473,
"learning_rate": 8.921980798771521e-06,
"loss": 0.5711,
"step": 300
},
{
"epoch": 0.2965483714146816,
"grad_norm": 1.9834504354777311,
"learning_rate": 8.868752851176357e-06,
"loss": 0.5797,
"step": 305
},
{
"epoch": 0.30140982012639767,
"grad_norm": 1.8922576118910497,
"learning_rate": 8.814409282176029e-06,
"loss": 0.5816,
"step": 310
},
{
"epoch": 0.30627126883811373,
"grad_norm": 2.054072819946173,
"learning_rate": 8.758965762673065e-06,
"loss": 0.5671,
"step": 315
},
{
"epoch": 0.31113271754982985,
"grad_norm": 1.9718238348510062,
"learning_rate": 8.70243828075962e-06,
"loss": 0.5582,
"step": 320
},
{
"epoch": 0.3159941662615459,
"grad_norm": 1.8711561533553072,
"learning_rate": 8.644843137107058e-06,
"loss": 0.5616,
"step": 325
},
{
"epoch": 0.32085561497326204,
"grad_norm": 2.0374989076214676,
"learning_rate": 8.58619694026536e-06,
"loss": 0.5593,
"step": 330
},
{
"epoch": 0.3257170636849781,
"grad_norm": 2.505113100765879,
"learning_rate": 8.526516601873764e-06,
"loss": 0.5492,
"step": 335
},
{
"epoch": 0.3305785123966942,
"grad_norm": 2.210634577584573,
"learning_rate": 8.46581933178401e-06,
"loss": 0.5564,
"step": 340
},
{
"epoch": 0.3354399611084103,
"grad_norm": 1.9498504768203495,
"learning_rate": 8.404122633097573e-06,
"loss": 0.5446,
"step": 345
},
{
"epoch": 0.3403014098201264,
"grad_norm": 2.1908777181373567,
"learning_rate": 8.341444297118353e-06,
"loss": 0.5435,
"step": 350
},
{
"epoch": 0.3451628585318425,
"grad_norm": 1.9450484392786167,
"learning_rate": 8.27780239822224e-06,
"loss": 0.5566,
"step": 355
},
{
"epoch": 0.3500243072435586,
"grad_norm": 1.9837258801331765,
"learning_rate": 8.213215288645058e-06,
"loss": 0.5267,
"step": 360
},
{
"epoch": 0.35488575595527466,
"grad_norm": 1.9536552831602378,
"learning_rate": 8.147701593190384e-06,
"loss": 0.5387,
"step": 365
},
{
"epoch": 0.3597472046669908,
"grad_norm": 1.9999100970286463,
"learning_rate": 8.081280203858767e-06,
"loss": 0.5331,
"step": 370
},
{
"epoch": 0.36460865337870685,
"grad_norm": 1.9212833886247676,
"learning_rate": 8.01397027439989e-06,
"loss": 0.5349,
"step": 375
},
{
"epoch": 0.36947010209042297,
"grad_norm": 1.9162389845327823,
"learning_rate": 7.945791214789261e-06,
"loss": 0.5221,
"step": 380
},
{
"epoch": 0.37433155080213903,
"grad_norm": 1.909711155316506,
"learning_rate": 7.876762685631005e-06,
"loss": 0.5235,
"step": 385
},
{
"epoch": 0.37919299951385516,
"grad_norm": 2.357077558485306,
"learning_rate": 7.806904592488409e-06,
"loss": 0.5222,
"step": 390
},
{
"epoch": 0.3840544482255712,
"grad_norm": 1.9488437106922831,
"learning_rate": 7.736237080143788e-06,
"loss": 0.5198,
"step": 395
},
{
"epoch": 0.3889158969372873,
"grad_norm": 1.9458627487121898,
"learning_rate": 7.664780526789409e-06,
"loss": 0.5145,
"step": 400
},
{
"epoch": 0.3937773456490034,
"grad_norm": 1.944072611413161,
"learning_rate": 7.592555538151073e-06,
"loss": 0.5136,
"step": 405
},
{
"epoch": 0.3986387943607195,
"grad_norm": 1.965724726537108,
"learning_rate": 7.519582941546117e-06,
"loss": 0.5235,
"step": 410
},
{
"epoch": 0.4035002430724356,
"grad_norm": 2.100470738223745,
"learning_rate": 7.445883779877483e-06,
"loss": 0.5094,
"step": 415
},
{
"epoch": 0.40836169178415166,
"grad_norm": 1.9100636260210968,
"learning_rate": 7.371479305565644e-06,
"loss": 0.5103,
"step": 420
},
{
"epoch": 0.4132231404958678,
"grad_norm": 1.9333255998991457,
"learning_rate": 7.296390974420102e-06,
"loss": 0.5054,
"step": 425
},
{
"epoch": 0.41808458920758385,
"grad_norm": 2.080473533066085,
"learning_rate": 7.220640439452236e-06,
"loss": 0.4959,
"step": 430
},
{
"epoch": 0.42294603791929997,
"grad_norm": 1.981039750107447,
"learning_rate": 7.144249544631279e-06,
"loss": 0.4991,
"step": 435
},
{
"epoch": 0.42780748663101603,
"grad_norm": 1.9695849449401175,
"learning_rate": 7.067240318585242e-06,
"loss": 0.499,
"step": 440
},
{
"epoch": 0.43266893534273215,
"grad_norm": 1.9718354318180367,
"learning_rate": 6.989634968248578e-06,
"loss": 0.4999,
"step": 445
},
{
"epoch": 0.4375303840544482,
"grad_norm": 1.849425307215673,
"learning_rate": 6.911455872458423e-06,
"loss": 0.5024,
"step": 450
},
{
"epoch": 0.44239183276616434,
"grad_norm": 1.968118907489162,
"learning_rate": 6.832725575501287e-06,
"loss": 0.4928,
"step": 455
},
{
"epoch": 0.4472532814778804,
"grad_norm": 2.052190888318457,
"learning_rate": 6.753466780612008e-06,
"loss": 0.4883,
"step": 460
},
{
"epoch": 0.4521147301895965,
"grad_norm": 2.023182275453982,
"learning_rate": 6.673702343426894e-06,
"loss": 0.4917,
"step": 465
},
{
"epoch": 0.4569761789013126,
"grad_norm": 1.9163489656538493,
"learning_rate": 6.593455265392901e-06,
"loss": 0.4805,
"step": 470
},
{
"epoch": 0.46183762761302866,
"grad_norm": 2.01148328208047,
"learning_rate": 6.512748687134771e-06,
"loss": 0.4848,
"step": 475
},
{
"epoch": 0.4666990763247448,
"grad_norm": 1.9062562183206269,
"learning_rate": 6.431605881782043e-06,
"loss": 0.4846,
"step": 480
},
{
"epoch": 0.47156052503646084,
"grad_norm": 2.1740484017488173,
"learning_rate": 6.3500502482578296e-06,
"loss": 0.4697,
"step": 485
},
{
"epoch": 0.47642197374817696,
"grad_norm": 1.9616349762922658,
"learning_rate": 6.268105304531353e-06,
"loss": 0.4781,
"step": 490
},
{
"epoch": 0.48128342245989303,
"grad_norm": 2.0065445987317467,
"learning_rate": 6.185794680836124e-06,
"loss": 0.4682,
"step": 495
},
{
"epoch": 0.48614487117160915,
"grad_norm": 2.1759803858020716,
"learning_rate": 6.103142112855758e-06,
"loss": 0.4669,
"step": 500
},
{
"epoch": 0.4910063198833252,
"grad_norm": 1.928661549886414,
"learning_rate": 6.020171434879385e-06,
"loss": 0.4635,
"step": 505
},
{
"epoch": 0.49586776859504134,
"grad_norm": 2.0042980585449324,
"learning_rate": 5.936906572928625e-06,
"loss": 0.4657,
"step": 510
},
{
"epoch": 0.5007292173067575,
"grad_norm": 1.9953252819560416,
"learning_rate": 5.8533715378581e-06,
"loss": 0.4642,
"step": 515
},
{
"epoch": 0.5055906660184735,
"grad_norm": 2.055778442556058,
"learning_rate": 5.769590418431502e-06,
"loss": 0.4586,
"step": 520
},
{
"epoch": 0.5104521147301896,
"grad_norm": 1.9430537005076507,
"learning_rate": 5.685587374375176e-06,
"loss": 0.4513,
"step": 525
},
{
"epoch": 0.5153135634419057,
"grad_norm": 1.850811801991838,
"learning_rate": 5.601386629411247e-06,
"loss": 0.4545,
"step": 530
},
{
"epoch": 0.5201750121536218,
"grad_norm": 1.8848477276717563,
"learning_rate": 5.5170124642723035e-06,
"loss": 0.4459,
"step": 535
},
{
"epoch": 0.5250364608653378,
"grad_norm": 2.0655655892354985,
"learning_rate": 5.432489209699614e-06,
"loss": 0.4485,
"step": 540
},
{
"epoch": 0.529897909577054,
"grad_norm": 1.8863958006941897,
"learning_rate": 5.347841239426956e-06,
"loss": 0.4353,
"step": 545
},
{
"epoch": 0.5347593582887701,
"grad_norm": 1.9547833187632682,
"learning_rate": 5.263092963152021e-06,
"loss": 0.4469,
"step": 550
},
{
"epoch": 0.5396208070004861,
"grad_norm": 1.9174269856279669,
"learning_rate": 5.178268819497459e-06,
"loss": 0.45,
"step": 555
},
{
"epoch": 0.5444822557122022,
"grad_norm": 2.0224149703614436,
"learning_rate": 5.0933932689635855e-06,
"loss": 0.4389,
"step": 560
},
{
"epoch": 0.5493437044239183,
"grad_norm": 1.9813759308607048,
"learning_rate": 5.008490786874775e-06,
"loss": 0.4344,
"step": 565
},
{
"epoch": 0.5542051531356345,
"grad_norm": 1.8864523302122398,
"learning_rate": 4.923585856321577e-06,
"loss": 0.4385,
"step": 570
},
{
"epoch": 0.5590666018473505,
"grad_norm": 1.915948818115916,
"learning_rate": 4.8387029611005945e-06,
"loss": 0.4402,
"step": 575
},
{
"epoch": 0.5639280505590666,
"grad_norm": 1.8637742571775495,
"learning_rate": 4.753866578654171e-06,
"loss": 0.4315,
"step": 580
},
{
"epoch": 0.5687894992707827,
"grad_norm": 1.9492458299998499,
"learning_rate": 4.669101173011885e-06,
"loss": 0.4262,
"step": 585
},
{
"epoch": 0.5736509479824988,
"grad_norm": 1.919366061689666,
"learning_rate": 4.584431187735939e-06,
"loss": 0.4329,
"step": 590
},
{
"epoch": 0.5785123966942148,
"grad_norm": 1.9206859765403341,
"learning_rate": 4.499881038872424e-06,
"loss": 0.4333,
"step": 595
},
{
"epoch": 0.583373845405931,
"grad_norm": 2.0207639364785663,
"learning_rate": 4.415475107910553e-06,
"loss": 0.4247,
"step": 600
},
{
"epoch": 0.5882352941176471,
"grad_norm": 1.9003063950335572,
"learning_rate": 4.331237734751813e-06,
"loss": 0.419,
"step": 605
},
{
"epoch": 0.5930967428293632,
"grad_norm": 1.9589720672960007,
"learning_rate": 4.247193210691164e-06,
"loss": 0.4135,
"step": 610
},
{
"epoch": 0.5979581915410792,
"grad_norm": 1.8725922589668766,
"learning_rate": 4.1633657714122e-06,
"loss": 0.4203,
"step": 615
},
{
"epoch": 0.6028196402527953,
"grad_norm": 1.8638498804130523,
"learning_rate": 4.0797795899983984e-06,
"loss": 0.4199,
"step": 620
},
{
"epoch": 0.6076810889645115,
"grad_norm": 1.896661777937066,
"learning_rate": 3.9964587699623705e-06,
"loss": 0.4083,
"step": 625
},
{
"epoch": 0.6125425376762275,
"grad_norm": 1.850040585103086,
"learning_rate": 3.913427338295222e-06,
"loss": 0.4132,
"step": 630
},
{
"epoch": 0.6174039863879436,
"grad_norm": 1.8523931506545597,
"learning_rate": 3.830709238537938e-06,
"loss": 0.4136,
"step": 635
},
{
"epoch": 0.6222654350996597,
"grad_norm": 1.8618206136881559,
"learning_rate": 3.7483283238768685e-06,
"loss": 0.4089,
"step": 640
},
{
"epoch": 0.6271268838113758,
"grad_norm": 1.975112166712253,
"learning_rate": 3.6663083502652335e-06,
"loss": 0.4063,
"step": 645
},
{
"epoch": 0.6319883325230918,
"grad_norm": 1.8034124348015945,
"learning_rate": 3.5846729695727055e-06,
"loss": 0.4022,
"step": 650
},
{
"epoch": 0.636849781234808,
"grad_norm": 1.8638316469191787,
"learning_rate": 3.503445722764967e-06,
"loss": 0.4087,
"step": 655
},
{
"epoch": 0.6417112299465241,
"grad_norm": 1.843833767339686,
"learning_rate": 3.4226500331152843e-06,
"loss": 0.4009,
"step": 660
},
{
"epoch": 0.6465726786582402,
"grad_norm": 1.9595878869963705,
"learning_rate": 3.342309199449991e-06,
"loss": 0.4068,
"step": 665
},
{
"epoch": 0.6514341273699562,
"grad_norm": 1.8378464923421844,
"learning_rate": 3.262446389429883e-06,
"loss": 0.3986,
"step": 670
},
{
"epoch": 0.6562955760816723,
"grad_norm": 1.8906212458541776,
"learning_rate": 3.183084632869411e-06,
"loss": 0.3993,
"step": 675
},
{
"epoch": 0.6611570247933884,
"grad_norm": 1.8741986531721049,
"learning_rate": 3.104246815095653e-06,
"loss": 0.3957,
"step": 680
},
{
"epoch": 0.6660184735051046,
"grad_norm": 1.9025410789293065,
"learning_rate": 3.0259556703489245e-06,
"loss": 0.3961,
"step": 685
},
{
"epoch": 0.6708799222168206,
"grad_norm": 1.8317452262969969,
"learning_rate": 2.948233775226975e-06,
"loss": 0.3936,
"step": 690
},
{
"epoch": 0.6757413709285367,
"grad_norm": 1.791486581405094,
"learning_rate": 2.871103542174637e-06,
"loss": 0.3954,
"step": 695
},
{
"epoch": 0.6806028196402528,
"grad_norm": 1.8421048675047038,
"learning_rate": 2.794587213020813e-06,
"loss": 0.392,
"step": 700
},
{
"epoch": 0.6854642683519689,
"grad_norm": 1.8053899992220983,
"learning_rate": 2.7187068525646578e-06,
"loss": 0.3884,
"step": 705
},
{
"epoch": 0.690325717063685,
"grad_norm": 1.8164816090358933,
"learning_rate": 2.6434843422128225e-06,
"loss": 0.3833,
"step": 710
},
{
"epoch": 0.6951871657754011,
"grad_norm": 1.8692336513214356,
"learning_rate": 2.5689413736695623e-06,
"loss": 0.3925,
"step": 715
},
{
"epoch": 0.7000486144871172,
"grad_norm": 1.8131226789865298,
"learning_rate": 2.495099442681574e-06,
"loss": 0.3772,
"step": 720
},
{
"epoch": 0.7049100631988332,
"grad_norm": 1.7456402646995377,
"learning_rate": 2.4219798428393167e-06,
"loss": 0.3836,
"step": 725
},
{
"epoch": 0.7097715119105493,
"grad_norm": 1.8417483163838206,
"learning_rate": 2.3496036594366478e-06,
"loss": 0.3767,
"step": 730
},
{
"epoch": 0.7146329606222654,
"grad_norm": 1.8419144501834137,
"learning_rate": 2.2779917633905075e-06,
"loss": 0.3798,
"step": 735
},
{
"epoch": 0.7194944093339816,
"grad_norm": 1.697011917829243,
"learning_rate": 2.207164805222441e-06,
"loss": 0.3731,
"step": 740
},
{
"epoch": 0.7243558580456976,
"grad_norm": 1.7913218626855516,
"learning_rate": 2.1371432091036525e-06,
"loss": 0.3695,
"step": 745
},
{
"epoch": 0.7292173067574137,
"grad_norm": 1.805255097420597,
"learning_rate": 2.0679471669653596e-06,
"loss": 0.3758,
"step": 750
},
{
"epoch": 0.7340787554691298,
"grad_norm": 1.67858386866423,
"learning_rate": 1.999596632676087e-06,
"loss": 0.3723,
"step": 755
},
{
"epoch": 0.7389402041808459,
"grad_norm": 1.6993602534801597,
"learning_rate": 1.93211131628764e-06,
"loss": 0.3684,
"step": 760
},
{
"epoch": 0.743801652892562,
"grad_norm": 1.6768650553126156,
"learning_rate": 1.865510678351361e-06,
"loss": 0.3664,
"step": 765
},
{
"epoch": 0.7486631016042781,
"grad_norm": 1.8063986711573161,
"learning_rate": 1.7998139243063523e-06,
"loss": 0.3664,
"step": 770
},
{
"epoch": 0.7535245503159942,
"grad_norm": 1.8433399163627073,
"learning_rate": 1.7350399989412503e-06,
"loss": 0.3792,
"step": 775
},
{
"epoch": 0.7583859990277103,
"grad_norm": 1.789706761318329,
"learning_rate": 1.6712075809311801e-06,
"loss": 0.3665,
"step": 780
},
{
"epoch": 0.7632474477394263,
"grad_norm": 1.874352972700248,
"learning_rate": 1.6083350774514256e-06,
"loss": 0.3588,
"step": 785
},
{
"epoch": 0.7681088964511424,
"grad_norm": 1.8708699807411355,
"learning_rate": 1.5464406188694176e-06,
"loss": 0.3596,
"step": 790
},
{
"epoch": 0.7729703451628586,
"grad_norm": 1.7622927599849374,
"learning_rate": 1.4855420535165177e-06,
"loss": 0.3706,
"step": 795
},
{
"epoch": 0.7778317938745746,
"grad_norm": 1.8033628264638553,
"learning_rate": 1.4256569425411565e-06,
"loss": 0.3627,
"step": 800
},
{
"epoch": 0.7826932425862907,
"grad_norm": 1.6982941151170463,
"learning_rate": 1.3668025548447645e-06,
"loss": 0.3654,
"step": 805
},
{
"epoch": 0.7875546912980068,
"grad_norm": 1.822085860012342,
"learning_rate": 1.3089958621019966e-06,
"loss": 0.3566,
"step": 810
},
{
"epoch": 0.7924161400097229,
"grad_norm": 1.6803332236549615,
"learning_rate": 1.2522535338666487e-06,
"loss": 0.3539,
"step": 815
},
{
"epoch": 0.797277588721439,
"grad_norm": 1.8018723375209953,
"learning_rate": 1.1965919327647152e-06,
"loss": 0.3624,
"step": 820
},
{
"epoch": 0.8021390374331551,
"grad_norm": 1.681272890467866,
"learning_rate": 1.1420271097759339e-06,
"loss": 0.3527,
"step": 825
},
{
"epoch": 0.8070004861448712,
"grad_norm": 1.7619365658437645,
"learning_rate": 1.0885747996052203e-06,
"loss": 0.353,
"step": 830
},
{
"epoch": 0.8118619348565873,
"grad_norm": 1.7697650095429134,
"learning_rate": 1.0362504161452857e-06,
"loss": 0.3571,
"step": 835
},
{
"epoch": 0.8167233835683033,
"grad_norm": 1.7834693090793727,
"learning_rate": 9.850690480317837e-07,
"loss": 0.3588,
"step": 840
},
{
"epoch": 0.8215848322800194,
"grad_norm": 1.6879014168829525,
"learning_rate": 9.350454542922366e-07,
"loss": 0.3485,
"step": 845
},
{
"epoch": 0.8264462809917356,
"grad_norm": 1.6631076557983835,
"learning_rate": 8.861940600900215e-07,
"loss": 0.3569,
"step": 850
},
{
"epoch": 0.8313077297034517,
"grad_norm": 1.7470295248017005,
"learning_rate": 8.385289525646211e-07,
"loss": 0.3545,
"step": 855
},
{
"epoch": 0.8361691784151677,
"grad_norm": 1.6694583459061383,
"learning_rate": 7.920638767693606e-07,
"loss": 0.3459,
"step": 860
},
{
"epoch": 0.8410306271268838,
"grad_norm": 1.812544085868088,
"learning_rate": 7.468122317077786e-07,
"loss": 0.3497,
"step": 865
},
{
"epoch": 0.8458920758385999,
"grad_norm": 1.804488776919829,
"learning_rate": 7.027870664698011e-07,
"loss": 0.3447,
"step": 870
},
{
"epoch": 0.8507535245503159,
"grad_norm": 1.6565427674580868,
"learning_rate": 6.600010764688042e-07,
"loss": 0.3488,
"step": 875
},
{
"epoch": 0.8556149732620321,
"grad_norm": 1.5660595611029826,
"learning_rate": 6.184665997806832e-07,
"loss": 0.3449,
"step": 880
},
{
"epoch": 0.8604764219737482,
"grad_norm": 1.628170781974643,
"learning_rate": 5.781956135859446e-07,
"loss": 0.3554,
"step": 885
},
{
"epoch": 0.8653378706854643,
"grad_norm": 1.6261129189211507,
"learning_rate": 5.39199730715892e-07,
"loss": 0.345,
"step": 890
},
{
"epoch": 0.8701993193971803,
"grad_norm": 1.6307410900682924,
"learning_rate": 5.01490196303856e-07,
"loss": 0.3456,
"step": 895
},
{
"epoch": 0.8750607681088964,
"grad_norm": 1.6901295083588348,
"learning_rate": 4.650778845424758e-07,
"loss": 0.3476,
"step": 900
},
{
"epoch": 0.8799222168206126,
"grad_norm": 1.730644298411666,
"learning_rate": 4.2997329554792965e-07,
"loss": 0.3532,
"step": 905
},
{
"epoch": 0.8847836655323287,
"grad_norm": 1.5909360839199094,
"learning_rate": 3.961865523320557e-07,
"loss": 0.3406,
"step": 910
},
{
"epoch": 0.8896451142440447,
"grad_norm": 1.7660527301033595,
"learning_rate": 3.637273978831984e-07,
"loss": 0.3434,
"step": 915
},
{
"epoch": 0.8945065629557608,
"grad_norm": 1.5431742144466063,
"learning_rate": 3.326051923566559e-07,
"loss": 0.3389,
"step": 920
},
{
"epoch": 0.8993680116674769,
"grad_norm": 1.5777885568012344,
"learning_rate": 3.028289103755172e-07,
"loss": 0.3438,
"step": 925
},
{
"epoch": 0.904229460379193,
"grad_norm": 1.6736588557089753,
"learning_rate": 2.744071384426733e-07,
"loss": 0.3456,
"step": 930
},
{
"epoch": 0.9090909090909091,
"grad_norm": 1.584446915524184,
"learning_rate": 2.473480724647548e-07,
"loss": 0.3519,
"step": 935
},
{
"epoch": 0.9139523578026252,
"grad_norm": 1.544706705307836,
"learning_rate": 2.216595153886969e-07,
"loss": 0.3504,
"step": 940
},
{
"epoch": 0.9188138065143413,
"grad_norm": 1.6069910572578268,
"learning_rate": 1.9734887495163114e-07,
"loss": 0.3396,
"step": 945
},
{
"epoch": 0.9236752552260573,
"grad_norm": 1.5262336921306658,
"learning_rate": 1.7442316154473004e-07,
"loss": 0.3391,
"step": 950
},
{
"epoch": 0.9285367039377734,
"grad_norm": 1.6151419038210972,
"learning_rate": 1.528889861916477e-07,
"loss": 0.3455,
"step": 955
},
{
"epoch": 0.9333981526494896,
"grad_norm": 1.7463253293156233,
"learning_rate": 1.3275255864211245e-07,
"loss": 0.342,
"step": 960
},
{
"epoch": 0.9382596013612057,
"grad_norm": 2.15764657474219,
"learning_rate": 1.1401968558123977e-07,
"loss": 0.3432,
"step": 965
},
{
"epoch": 0.9431210500729217,
"grad_norm": 1.5836218621495903,
"learning_rate": 9.669576895507515e-08,
"loss": 0.3434,
"step": 970
},
{
"epoch": 0.9479824987846378,
"grad_norm": 1.6606392678607504,
"learning_rate": 8.078580441285067e-08,
"loss": 0.3488,
"step": 975
},
{
"epoch": 0.9528439474963539,
"grad_norm": 1.634410010175144,
"learning_rate": 6.629437986640397e-08,
"loss": 0.348,
"step": 980
},
{
"epoch": 0.95770539620807,
"grad_norm": 1.6947942875406026,
"learning_rate": 5.322567416717106e-08,
"loss": 0.3356,
"step": 985
},
{
"epoch": 0.9625668449197861,
"grad_norm": 1.5244193912137922,
"learning_rate": 4.158345590114965e-08,
"loss": 0.3461,
"step": 990
},
{
"epoch": 0.9674282936315022,
"grad_norm": 1.6425083834973517,
"learning_rate": 3.137108230215513e-08,
"loss": 0.3489,
"step": 995
},
{
"epoch": 0.9722897423432183,
"grad_norm": 1.6739305879871746,
"learning_rate": 2.259149828370999e-08,
"loss": 0.3466,
"step": 1000
},
{
"epoch": 0.9771511910549344,
"grad_norm": 1.6190798474108024,
"learning_rate": 1.5247235589824772e-08,
"loss": 0.3391,
"step": 1005
},
{
"epoch": 0.9820126397666504,
"grad_norm": 1.72920925073275,
"learning_rate": 9.340412064927084e-09,
"loss": 0.3438,
"step": 1010
},
{
"epoch": 0.9868740884783666,
"grad_norm": 1.6092917702604965,
"learning_rate": 4.872731043143453e-09,
"loss": 0.3454,
"step": 1015
},
{
"epoch": 0.9917355371900827,
"grad_norm": 1.5519329352931979,
"learning_rate": 1.845480857116111e-09,
"loss": 0.3391,
"step": 1020
},
{
"epoch": 0.9965969859017987,
"grad_norm": 1.6136231766922748,
"learning_rate": 2.595344664868549e-10,
"loss": 0.3413,
"step": 1025
},
{
"epoch": 0.9995138551288284,
"eval_loss": 0.33695414662361145,
"eval_runtime": 96.5912,
"eval_samples_per_second": 3.127,
"eval_steps_per_second": 0.787,
"step": 1028
},
{
"epoch": 0.9995138551288284,
"step": 1028,
"total_flos": 215189941125120.0,
"train_loss": 0.5025305894098393,
"train_runtime": 23224.052,
"train_samples_per_second": 1.417,
"train_steps_per_second": 0.044
}
],
"logging_steps": 5,
"max_steps": 1028,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 215189941125120.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}