zephyr-1b-olmo-sft-qlora / trainer_state.json
Ritvik19's picture
Upload 12 files
b21cdb8 verified
raw
history blame
34.6 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1001,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.000999000999000999,
"grad_norm": 1.0703125,
"learning_rate": 1.9801980198019803e-06,
"loss": 1.9138,
"step": 1
},
{
"epoch": 0.004995004995004995,
"grad_norm": 1.0546875,
"learning_rate": 9.900990099009901e-06,
"loss": 1.95,
"step": 5
},
{
"epoch": 0.00999000999000999,
"grad_norm": 0.80078125,
"learning_rate": 1.9801980198019803e-05,
"loss": 1.9142,
"step": 10
},
{
"epoch": 0.014985014985014986,
"grad_norm": 0.671875,
"learning_rate": 2.9702970297029702e-05,
"loss": 1.8437,
"step": 15
},
{
"epoch": 0.01998001998001998,
"grad_norm": 0.83203125,
"learning_rate": 3.9603960396039605e-05,
"loss": 1.7806,
"step": 20
},
{
"epoch": 0.024975024975024976,
"grad_norm": 0.439453125,
"learning_rate": 4.950495049504951e-05,
"loss": 1.645,
"step": 25
},
{
"epoch": 0.029970029970029972,
"grad_norm": 0.33203125,
"learning_rate": 5.9405940594059404e-05,
"loss": 1.5936,
"step": 30
},
{
"epoch": 0.03496503496503497,
"grad_norm": 0.345703125,
"learning_rate": 6.93069306930693e-05,
"loss": 1.5451,
"step": 35
},
{
"epoch": 0.03996003996003996,
"grad_norm": 0.2138671875,
"learning_rate": 7.920792079207921e-05,
"loss": 1.5002,
"step": 40
},
{
"epoch": 0.04495504495504495,
"grad_norm": 0.15625,
"learning_rate": 8.910891089108912e-05,
"loss": 1.4846,
"step": 45
},
{
"epoch": 0.04995004995004995,
"grad_norm": 0.16015625,
"learning_rate": 9.900990099009902e-05,
"loss": 1.4525,
"step": 50
},
{
"epoch": 0.054945054945054944,
"grad_norm": 0.1328125,
"learning_rate": 0.00010891089108910893,
"loss": 1.4746,
"step": 55
},
{
"epoch": 0.059940059940059943,
"grad_norm": 0.177734375,
"learning_rate": 0.00011881188118811881,
"loss": 1.439,
"step": 60
},
{
"epoch": 0.06493506493506493,
"grad_norm": 0.1044921875,
"learning_rate": 0.00012871287128712872,
"loss": 1.4421,
"step": 65
},
{
"epoch": 0.06993006993006994,
"grad_norm": 0.09326171875,
"learning_rate": 0.0001386138613861386,
"loss": 1.4223,
"step": 70
},
{
"epoch": 0.07492507492507493,
"grad_norm": 0.091796875,
"learning_rate": 0.0001485148514851485,
"loss": 1.4025,
"step": 75
},
{
"epoch": 0.07992007992007992,
"grad_norm": 0.1083984375,
"learning_rate": 0.00015841584158415842,
"loss": 1.4102,
"step": 80
},
{
"epoch": 0.08491508491508491,
"grad_norm": 0.0869140625,
"learning_rate": 0.00016831683168316833,
"loss": 1.3908,
"step": 85
},
{
"epoch": 0.0899100899100899,
"grad_norm": 0.09814453125,
"learning_rate": 0.00017821782178217824,
"loss": 1.4007,
"step": 90
},
{
"epoch": 0.09490509490509491,
"grad_norm": 0.0859375,
"learning_rate": 0.00018811881188118812,
"loss": 1.3824,
"step": 95
},
{
"epoch": 0.0999000999000999,
"grad_norm": 0.099609375,
"learning_rate": 0.00019801980198019803,
"loss": 1.3897,
"step": 100
},
{
"epoch": 0.1048951048951049,
"grad_norm": 0.08935546875,
"learning_rate": 0.00019999025240093044,
"loss": 1.3614,
"step": 105
},
{
"epoch": 0.10989010989010989,
"grad_norm": 0.087890625,
"learning_rate": 0.00019995065603657316,
"loss": 1.3792,
"step": 110
},
{
"epoch": 0.11488511488511488,
"grad_norm": 0.09423828125,
"learning_rate": 0.0001998806137341434,
"loss": 1.37,
"step": 115
},
{
"epoch": 0.11988011988011989,
"grad_norm": 0.09619140625,
"learning_rate": 0.000199780146829205,
"loss": 1.3638,
"step": 120
},
{
"epoch": 0.12487512487512488,
"grad_norm": 0.10107421875,
"learning_rate": 0.00019964928592495045,
"loss": 1.3911,
"step": 125
},
{
"epoch": 0.12987012987012986,
"grad_norm": 0.10107421875,
"learning_rate": 0.00019948807088287883,
"loss": 1.3534,
"step": 130
},
{
"epoch": 0.13486513486513488,
"grad_norm": 0.095703125,
"learning_rate": 0.0001992965508106537,
"loss": 1.3619,
"step": 135
},
{
"epoch": 0.13986013986013987,
"grad_norm": 0.1025390625,
"learning_rate": 0.00019907478404714436,
"loss": 1.3612,
"step": 140
},
{
"epoch": 0.14485514485514486,
"grad_norm": 0.10302734375,
"learning_rate": 0.0001988228381446553,
"loss": 1.3531,
"step": 145
},
{
"epoch": 0.14985014985014986,
"grad_norm": 0.09130859375,
"learning_rate": 0.00019854078984834903,
"loss": 1.3589,
"step": 150
},
{
"epoch": 0.15484515484515485,
"grad_norm": 0.1044921875,
"learning_rate": 0.0001982287250728689,
"loss": 1.3393,
"step": 155
},
{
"epoch": 0.15984015984015984,
"grad_norm": 0.09765625,
"learning_rate": 0.0001978867388761685,
"loss": 1.3378,
"step": 160
},
{
"epoch": 0.16483516483516483,
"grad_norm": 0.09521484375,
"learning_rate": 0.00019751493543055632,
"loss": 1.3435,
"step": 165
},
{
"epoch": 0.16983016983016982,
"grad_norm": 0.10595703125,
"learning_rate": 0.00019711342799096361,
"loss": 1.3241,
"step": 170
},
{
"epoch": 0.17482517482517482,
"grad_norm": 0.11376953125,
"learning_rate": 0.00019668233886044597,
"loss": 1.3394,
"step": 175
},
{
"epoch": 0.1798201798201798,
"grad_norm": 0.09765625,
"learning_rate": 0.00019622179935292855,
"loss": 1.3357,
"step": 180
},
{
"epoch": 0.1848151848151848,
"grad_norm": 0.09814453125,
"learning_rate": 0.00019573194975320673,
"loss": 1.3377,
"step": 185
},
{
"epoch": 0.18981018981018982,
"grad_norm": 0.103515625,
"learning_rate": 0.00019521293927421388,
"loss": 1.3461,
"step": 190
},
{
"epoch": 0.19480519480519481,
"grad_norm": 0.09765625,
"learning_rate": 0.00019466492601156966,
"loss": 1.3366,
"step": 195
},
{
"epoch": 0.1998001998001998,
"grad_norm": 0.11083984375,
"learning_rate": 0.00019408807689542257,
"loss": 1.3307,
"step": 200
},
{
"epoch": 0.2047952047952048,
"grad_norm": 0.1005859375,
"learning_rate": 0.00019348256763960145,
"loss": 1.336,
"step": 205
},
{
"epoch": 0.2097902097902098,
"grad_norm": 0.10205078125,
"learning_rate": 0.00019284858268809137,
"loss": 1.3395,
"step": 210
},
{
"epoch": 0.21478521478521478,
"grad_norm": 0.09814453125,
"learning_rate": 0.00019218631515885006,
"loss": 1.3457,
"step": 215
},
{
"epoch": 0.21978021978021978,
"grad_norm": 0.09912109375,
"learning_rate": 0.0001914959667849825,
"loss": 1.3472,
"step": 220
},
{
"epoch": 0.22477522477522477,
"grad_norm": 0.10791015625,
"learning_rate": 0.00019077774785329087,
"loss": 1.3418,
"step": 225
},
{
"epoch": 0.22977022977022976,
"grad_norm": 0.1044921875,
"learning_rate": 0.00019003187714021938,
"loss": 1.3217,
"step": 230
},
{
"epoch": 0.23476523476523475,
"grad_norm": 0.09716796875,
"learning_rate": 0.00018925858184521256,
"loss": 1.3256,
"step": 235
},
{
"epoch": 0.23976023976023977,
"grad_norm": 0.099609375,
"learning_rate": 0.0001884580975215084,
"loss": 1.3417,
"step": 240
},
{
"epoch": 0.24475524475524477,
"grad_norm": 0.09619140625,
"learning_rate": 0.00018763066800438636,
"loss": 1.3261,
"step": 245
},
{
"epoch": 0.24975024975024976,
"grad_norm": 0.09912109375,
"learning_rate": 0.00018677654533689287,
"loss": 1.3252,
"step": 250
},
{
"epoch": 0.2547452547452547,
"grad_norm": 0.1025390625,
"learning_rate": 0.00018589598969306645,
"loss": 1.3387,
"step": 255
},
{
"epoch": 0.2597402597402597,
"grad_norm": 0.1025390625,
"learning_rate": 0.00018498926929868642,
"loss": 1.3577,
"step": 260
},
{
"epoch": 0.2647352647352647,
"grad_norm": 0.10302734375,
"learning_rate": 0.00018405666034956844,
"loss": 1.3321,
"step": 265
},
{
"epoch": 0.26973026973026976,
"grad_norm": 0.10400390625,
"learning_rate": 0.00018309844692743283,
"loss": 1.3067,
"step": 270
},
{
"epoch": 0.27472527472527475,
"grad_norm": 0.1103515625,
"learning_rate": 0.00018211492091337042,
"loss": 1.3489,
"step": 275
},
{
"epoch": 0.27972027972027974,
"grad_norm": 0.10400390625,
"learning_rate": 0.00018110638189893267,
"loss": 1.3276,
"step": 280
},
{
"epoch": 0.28471528471528473,
"grad_norm": 0.09814453125,
"learning_rate": 0.00018007313709487334,
"loss": 1.3258,
"step": 285
},
{
"epoch": 0.2897102897102897,
"grad_norm": 0.09521484375,
"learning_rate": 0.00017901550123756906,
"loss": 1.332,
"step": 290
},
{
"epoch": 0.2947052947052947,
"grad_norm": 0.10009765625,
"learning_rate": 0.00017793379649314744,
"loss": 1.335,
"step": 295
},
{
"epoch": 0.2997002997002997,
"grad_norm": 0.09912109375,
"learning_rate": 0.00017682835235935236,
"loss": 1.3266,
"step": 300
},
{
"epoch": 0.3046953046953047,
"grad_norm": 0.1083984375,
"learning_rate": 0.00017569950556517566,
"loss": 1.3174,
"step": 305
},
{
"epoch": 0.3096903096903097,
"grad_norm": 0.09326171875,
"learning_rate": 0.00017454759996828623,
"loss": 1.3212,
"step": 310
},
{
"epoch": 0.3146853146853147,
"grad_norm": 0.10107421875,
"learning_rate": 0.00017337298645028764,
"loss": 1.326,
"step": 315
},
{
"epoch": 0.3196803196803197,
"grad_norm": 0.10400390625,
"learning_rate": 0.00017217602280983623,
"loss": 1.3364,
"step": 320
},
{
"epoch": 0.3246753246753247,
"grad_norm": 0.09814453125,
"learning_rate": 0.0001709570736536521,
"loss": 1.3168,
"step": 325
},
{
"epoch": 0.32967032967032966,
"grad_norm": 0.10498046875,
"learning_rate": 0.00016971651028545648,
"loss": 1.3166,
"step": 330
},
{
"epoch": 0.33466533466533466,
"grad_norm": 0.1005859375,
"learning_rate": 0.00016845471059286887,
"loss": 1.3228,
"step": 335
},
{
"epoch": 0.33966033966033965,
"grad_norm": 0.1015625,
"learning_rate": 0.00016717205893229903,
"loss": 1.3277,
"step": 340
},
{
"epoch": 0.34465534465534464,
"grad_norm": 0.0986328125,
"learning_rate": 0.00016586894601186805,
"loss": 1.3197,
"step": 345
},
{
"epoch": 0.34965034965034963,
"grad_norm": 0.09765625,
"learning_rate": 0.00016454576877239507,
"loss": 1.3276,
"step": 350
},
{
"epoch": 0.3546453546453546,
"grad_norm": 0.09912109375,
"learning_rate": 0.0001632029302664851,
"loss": 1.309,
"step": 355
},
{
"epoch": 0.3596403596403596,
"grad_norm": 0.095703125,
"learning_rate": 0.0001618408395357554,
"loss": 1.334,
"step": 360
},
{
"epoch": 0.3646353646353646,
"grad_norm": 0.10400390625,
"learning_rate": 0.0001604599114862375,
"loss": 1.3287,
"step": 365
},
{
"epoch": 0.3696303696303696,
"grad_norm": 0.09765625,
"learning_rate": 0.00015906056676199255,
"loss": 1.3165,
"step": 370
},
{
"epoch": 0.37462537462537465,
"grad_norm": 0.10107421875,
"learning_rate": 0.00015764323161697935,
"loss": 1.3127,
"step": 375
},
{
"epoch": 0.37962037962037964,
"grad_norm": 0.09814453125,
"learning_rate": 0.00015620833778521307,
"loss": 1.3297,
"step": 380
},
{
"epoch": 0.38461538461538464,
"grad_norm": 0.10400390625,
"learning_rate": 0.00015475632234925504,
"loss": 1.3184,
"step": 385
},
{
"epoch": 0.38961038961038963,
"grad_norm": 0.0986328125,
"learning_rate": 0.000153287627607073,
"loss": 1.3004,
"step": 390
},
{
"epoch": 0.3946053946053946,
"grad_norm": 0.09912109375,
"learning_rate": 0.00015180270093731303,
"loss": 1.3179,
"step": 395
},
{
"epoch": 0.3996003996003996,
"grad_norm": 0.09619140625,
"learning_rate": 0.00015030199466302353,
"loss": 1.3294,
"step": 400
},
{
"epoch": 0.4045954045954046,
"grad_norm": 0.1044921875,
"learning_rate": 0.0001487859659138733,
"loss": 1.3469,
"step": 405
},
{
"epoch": 0.4095904095904096,
"grad_norm": 0.10009765625,
"learning_rate": 0.00014725507648690543,
"loss": 1.3166,
"step": 410
},
{
"epoch": 0.4145854145854146,
"grad_norm": 0.10205078125,
"learning_rate": 0.00014570979270586945,
"loss": 1.3149,
"step": 415
},
{
"epoch": 0.4195804195804196,
"grad_norm": 0.09716796875,
"learning_rate": 0.00014415058527917452,
"loss": 1.3365,
"step": 420
},
{
"epoch": 0.4245754245754246,
"grad_norm": 0.10693359375,
"learning_rate": 0.00014257792915650728,
"loss": 1.3289,
"step": 425
},
{
"epoch": 0.42957042957042957,
"grad_norm": 0.103515625,
"learning_rate": 0.00014099230338415728,
"loss": 1.2932,
"step": 430
},
{
"epoch": 0.43456543456543456,
"grad_norm": 0.11865234375,
"learning_rate": 0.00013939419095909512,
"loss": 1.318,
"step": 435
},
{
"epoch": 0.43956043956043955,
"grad_norm": 0.10400390625,
"learning_rate": 0.00013778407868184672,
"loss": 1.2892,
"step": 440
},
{
"epoch": 0.44455544455544455,
"grad_norm": 0.09619140625,
"learning_rate": 0.00013616245700820922,
"loss": 1.3114,
"step": 445
},
{
"epoch": 0.44955044955044954,
"grad_norm": 0.1015625,
"learning_rate": 0.00013452981989985348,
"loss": 1.3143,
"step": 450
},
{
"epoch": 0.45454545454545453,
"grad_norm": 0.10498046875,
"learning_rate": 0.00013288666467385833,
"loss": 1.3231,
"step": 455
},
{
"epoch": 0.4595404595404595,
"grad_norm": 0.095703125,
"learning_rate": 0.00013123349185122327,
"loss": 1.3077,
"step": 460
},
{
"epoch": 0.4645354645354645,
"grad_norm": 0.0966796875,
"learning_rate": 0.00012957080500440468,
"loss": 1.3247,
"step": 465
},
{
"epoch": 0.4695304695304695,
"grad_norm": 0.0947265625,
"learning_rate": 0.00012789911060392294,
"loss": 1.3097,
"step": 470
},
{
"epoch": 0.4745254745254745,
"grad_norm": 0.09765625,
"learning_rate": 0.00012621891786408648,
"loss": 1.3099,
"step": 475
},
{
"epoch": 0.47952047952047955,
"grad_norm": 0.09912109375,
"learning_rate": 0.00012453073858788026,
"loss": 1.3267,
"step": 480
},
{
"epoch": 0.48451548451548454,
"grad_norm": 0.0986328125,
"learning_rate": 0.00012283508701106557,
"loss": 1.3114,
"step": 485
},
{
"epoch": 0.48951048951048953,
"grad_norm": 0.09521484375,
"learning_rate": 0.00012113247964553888,
"loss": 1.2904,
"step": 490
},
{
"epoch": 0.4945054945054945,
"grad_norm": 0.103515625,
"learning_rate": 0.0001194234351219972,
"loss": 1.3151,
"step": 495
},
{
"epoch": 0.4995004995004995,
"grad_norm": 0.10693359375,
"learning_rate": 0.00011770847403195834,
"loss": 1.3126,
"step": 500
},
{
"epoch": 0.5044955044955045,
"grad_norm": 0.099609375,
"learning_rate": 0.0001159881187691835,
"loss": 1.3108,
"step": 505
},
{
"epoch": 0.5094905094905094,
"grad_norm": 0.10302734375,
"learning_rate": 0.00011426289337055119,
"loss": 1.3032,
"step": 510
},
{
"epoch": 0.5144855144855145,
"grad_norm": 0.09912109375,
"learning_rate": 0.00011253332335643043,
"loss": 1.3043,
"step": 515
},
{
"epoch": 0.5194805194805194,
"grad_norm": 0.099609375,
"learning_rate": 0.0001107999355706023,
"loss": 1.3188,
"step": 520
},
{
"epoch": 0.5244755244755245,
"grad_norm": 0.09619140625,
"learning_rate": 0.00010906325801977804,
"loss": 1.3266,
"step": 525
},
{
"epoch": 0.5294705294705294,
"grad_norm": 0.10498046875,
"learning_rate": 0.00010732381971276318,
"loss": 1.3156,
"step": 530
},
{
"epoch": 0.5344655344655345,
"grad_norm": 0.10400390625,
"learning_rate": 0.00010558215049931638,
"loss": 1.304,
"step": 535
},
{
"epoch": 0.5394605394605395,
"grad_norm": 0.095703125,
"learning_rate": 0.00010383878090875201,
"loss": 1.3089,
"step": 540
},
{
"epoch": 0.5444555444555444,
"grad_norm": 0.09765625,
"learning_rate": 0.0001020942419883357,
"loss": 1.3298,
"step": 545
},
{
"epoch": 0.5494505494505495,
"grad_norm": 0.09521484375,
"learning_rate": 0.00010034906514152238,
"loss": 1.3127,
"step": 550
},
{
"epoch": 0.5544455544455544,
"grad_norm": 0.10107421875,
"learning_rate": 9.860378196608549e-05,
"loss": 1.3248,
"step": 555
},
{
"epoch": 0.5594405594405595,
"grad_norm": 0.10107421875,
"learning_rate": 9.685892409218717e-05,
"loss": 1.3116,
"step": 560
},
{
"epoch": 0.5644355644355644,
"grad_norm": 0.09619140625,
"learning_rate": 9.511502302043868e-05,
"loss": 1.3196,
"step": 565
},
{
"epoch": 0.5694305694305695,
"grad_norm": 0.1005859375,
"learning_rate": 9.337260996000002e-05,
"loss": 1.3183,
"step": 570
},
{
"epoch": 0.5744255744255744,
"grad_norm": 0.10107421875,
"learning_rate": 9.163221566676847e-05,
"loss": 1.3165,
"step": 575
},
{
"epoch": 0.5794205794205795,
"grad_norm": 0.09716796875,
"learning_rate": 8.989437028170537e-05,
"loss": 1.3125,
"step": 580
},
{
"epoch": 0.5844155844155844,
"grad_norm": 0.0986328125,
"learning_rate": 8.81596031693499e-05,
"loss": 1.325,
"step": 585
},
{
"epoch": 0.5894105894105894,
"grad_norm": 0.09716796875,
"learning_rate": 8.642844275656957e-05,
"loss": 1.3308,
"step": 590
},
{
"epoch": 0.5944055944055944,
"grad_norm": 0.10107421875,
"learning_rate": 8.47014163715962e-05,
"loss": 1.3099,
"step": 595
},
{
"epoch": 0.5994005994005994,
"grad_norm": 0.10009765625,
"learning_rate": 8.297905008339677e-05,
"loss": 1.2791,
"step": 600
},
{
"epoch": 0.6043956043956044,
"grad_norm": 0.09619140625,
"learning_rate": 8.126186854142752e-05,
"loss": 1.3162,
"step": 605
},
{
"epoch": 0.6093906093906094,
"grad_norm": 0.095703125,
"learning_rate": 7.955039481582097e-05,
"loss": 1.3093,
"step": 610
},
{
"epoch": 0.6143856143856143,
"grad_norm": 0.09716796875,
"learning_rate": 7.784515023805328e-05,
"loss": 1.309,
"step": 615
},
{
"epoch": 0.6193806193806194,
"grad_norm": 0.095703125,
"learning_rate": 7.614665424214193e-05,
"loss": 1.285,
"step": 620
},
{
"epoch": 0.6243756243756243,
"grad_norm": 0.095703125,
"learning_rate": 7.445542420642097e-05,
"loss": 1.3155,
"step": 625
},
{
"epoch": 0.6293706293706294,
"grad_norm": 0.1005859375,
"learning_rate": 7.277197529594257e-05,
"loss": 1.301,
"step": 630
},
{
"epoch": 0.6343656343656343,
"grad_norm": 0.09912109375,
"learning_rate": 7.109682030555283e-05,
"loss": 1.295,
"step": 635
},
{
"epoch": 0.6393606393606394,
"grad_norm": 0.0966796875,
"learning_rate": 6.943046950368944e-05,
"loss": 1.315,
"step": 640
},
{
"epoch": 0.6443556443556444,
"grad_norm": 0.09765625,
"learning_rate": 6.77734304769489e-05,
"loss": 1.2878,
"step": 645
},
{
"epoch": 0.6493506493506493,
"grad_norm": 0.1005859375,
"learning_rate": 6.612620797547087e-05,
"loss": 1.3135,
"step": 650
},
{
"epoch": 0.6543456543456544,
"grad_norm": 0.09912109375,
"learning_rate": 6.448930375918631e-05,
"loss": 1.2958,
"step": 655
},
{
"epoch": 0.6593406593406593,
"grad_norm": 0.10107421875,
"learning_rate": 6.286321644497655e-05,
"loss": 1.3286,
"step": 660
},
{
"epoch": 0.6643356643356644,
"grad_norm": 0.09765625,
"learning_rate": 6.12484413547897e-05,
"loss": 1.2988,
"step": 665
},
{
"epoch": 0.6693306693306693,
"grad_norm": 0.0966796875,
"learning_rate": 5.964547036476099e-05,
"loss": 1.3086,
"step": 670
},
{
"epoch": 0.6743256743256744,
"grad_norm": 0.09765625,
"learning_rate": 5.805479175538229e-05,
"loss": 1.3056,
"step": 675
},
{
"epoch": 0.6793206793206793,
"grad_norm": 0.0966796875,
"learning_rate": 5.647689006276726e-05,
"loss": 1.3259,
"step": 680
},
{
"epoch": 0.6843156843156843,
"grad_norm": 0.09619140625,
"learning_rate": 5.491224593105695e-05,
"loss": 1.3105,
"step": 685
},
{
"epoch": 0.6893106893106893,
"grad_norm": 0.095703125,
"learning_rate": 5.33613359660109e-05,
"loss": 1.3113,
"step": 690
},
{
"epoch": 0.6943056943056943,
"grad_norm": 0.095703125,
"learning_rate": 5.182463258982846e-05,
"loss": 1.3144,
"step": 695
},
{
"epoch": 0.6993006993006993,
"grad_norm": 0.09912109375,
"learning_rate": 5.0302603897244474e-05,
"loss": 1.3059,
"step": 700
},
{
"epoch": 0.7042957042957043,
"grad_norm": 0.0966796875,
"learning_rate": 4.8795713512942865e-05,
"loss": 1.2977,
"step": 705
},
{
"epoch": 0.7092907092907093,
"grad_norm": 0.10009765625,
"learning_rate": 4.7304420450332244e-05,
"loss": 1.3051,
"step": 710
},
{
"epoch": 0.7142857142857143,
"grad_norm": 0.09912109375,
"learning_rate": 4.582917897172603e-05,
"loss": 1.3229,
"step": 715
},
{
"epoch": 0.7192807192807192,
"grad_norm": 0.09423828125,
"learning_rate": 4.437043844996952e-05,
"loss": 1.3063,
"step": 720
},
{
"epoch": 0.7242757242757243,
"grad_norm": 0.099609375,
"learning_rate": 4.2928643231556844e-05,
"loss": 1.2956,
"step": 725
},
{
"epoch": 0.7292707292707292,
"grad_norm": 0.0947265625,
"learning_rate": 4.150423250127845e-05,
"loss": 1.3036,
"step": 730
},
{
"epoch": 0.7342657342657343,
"grad_norm": 0.0947265625,
"learning_rate": 4.009764014844143e-05,
"loss": 1.3252,
"step": 735
},
{
"epoch": 0.7392607392607392,
"grad_norm": 0.09375,
"learning_rate": 3.8709294634702376e-05,
"loss": 1.3207,
"step": 740
},
{
"epoch": 0.7442557442557443,
"grad_norm": 0.09423828125,
"learning_rate": 3.733961886355398e-05,
"loss": 1.2915,
"step": 745
},
{
"epoch": 0.7492507492507493,
"grad_norm": 0.0986328125,
"learning_rate": 3.5989030051504434e-05,
"loss": 1.3117,
"step": 750
},
{
"epoch": 0.7542457542457542,
"grad_norm": 0.09814453125,
"learning_rate": 3.465793960098945e-05,
"loss": 1.3022,
"step": 755
},
{
"epoch": 0.7592407592407593,
"grad_norm": 0.09716796875,
"learning_rate": 3.334675297505476e-05,
"loss": 1.3298,
"step": 760
},
{
"epoch": 0.7642357642357642,
"grad_norm": 0.09521484375,
"learning_rate": 3.205586957384838e-05,
"loss": 1.3085,
"step": 765
},
{
"epoch": 0.7692307692307693,
"grad_norm": 0.09326171875,
"learning_rate": 3.078568261295933e-05,
"loss": 1.2917,
"step": 770
},
{
"epoch": 0.7742257742257742,
"grad_norm": 0.09814453125,
"learning_rate": 2.953657900364053e-05,
"loss": 1.3111,
"step": 775
},
{
"epoch": 0.7792207792207793,
"grad_norm": 0.09326171875,
"learning_rate": 2.8308939234951726e-05,
"loss": 1.3108,
"step": 780
},
{
"epoch": 0.7842157842157842,
"grad_norm": 0.09326171875,
"learning_rate": 2.7103137257858868e-05,
"loss": 1.3106,
"step": 785
},
{
"epoch": 0.7892107892107892,
"grad_norm": 0.0947265625,
"learning_rate": 2.5919540371325e-05,
"loss": 1.292,
"step": 790
},
{
"epoch": 0.7942057942057942,
"grad_norm": 0.0947265625,
"learning_rate": 2.4758509110427575e-05,
"loss": 1.3089,
"step": 795
},
{
"epoch": 0.7992007992007992,
"grad_norm": 0.09375,
"learning_rate": 2.362039713653581e-05,
"loss": 1.3201,
"step": 800
},
{
"epoch": 0.8041958041958042,
"grad_norm": 0.09326171875,
"learning_rate": 2.2505551129582047e-05,
"loss": 1.2985,
"step": 805
},
{
"epoch": 0.8091908091908092,
"grad_norm": 0.095703125,
"learning_rate": 2.1414310682459802e-05,
"loss": 1.309,
"step": 810
},
{
"epoch": 0.8141858141858141,
"grad_norm": 0.0927734375,
"learning_rate": 2.0347008197580374e-05,
"loss": 1.3097,
"step": 815
},
{
"epoch": 0.8191808191808192,
"grad_norm": 0.0927734375,
"learning_rate": 1.930396878561983e-05,
"loss": 1.3215,
"step": 820
},
{
"epoch": 0.8241758241758241,
"grad_norm": 0.09326171875,
"learning_rate": 1.8285510166487152e-05,
"loss": 1.3002,
"step": 825
},
{
"epoch": 0.8291708291708292,
"grad_norm": 0.0927734375,
"learning_rate": 1.7291942572543807e-05,
"loss": 1.3233,
"step": 830
},
{
"epoch": 0.8341658341658341,
"grad_norm": 0.09521484375,
"learning_rate": 1.632356865410384e-05,
"loss": 1.3163,
"step": 835
},
{
"epoch": 0.8391608391608392,
"grad_norm": 0.0947265625,
"learning_rate": 1.538068338724361e-05,
"loss": 1.2986,
"step": 840
},
{
"epoch": 0.8441558441558441,
"grad_norm": 0.09375,
"learning_rate": 1.4463573983949341e-05,
"loss": 1.319,
"step": 845
},
{
"epoch": 0.8491508491508492,
"grad_norm": 0.09326171875,
"learning_rate": 1.3572519804629536e-05,
"loss": 1.3119,
"step": 850
},
{
"epoch": 0.8541458541458542,
"grad_norm": 0.09326171875,
"learning_rate": 1.2707792273019048e-05,
"loss": 1.3047,
"step": 855
},
{
"epoch": 0.8591408591408591,
"grad_norm": 0.09375,
"learning_rate": 1.1869654793500784e-05,
"loss": 1.3001,
"step": 860
},
{
"epoch": 0.8641358641358642,
"grad_norm": 0.09228515625,
"learning_rate": 1.1058362670870249e-05,
"loss": 1.3192,
"step": 865
},
{
"epoch": 0.8691308691308691,
"grad_norm": 0.0947265625,
"learning_rate": 1.0274163032567163e-05,
"loss": 1.3049,
"step": 870
},
{
"epoch": 0.8741258741258742,
"grad_norm": 0.09130859375,
"learning_rate": 9.517294753398064e-06,
"loss": 1.3049,
"step": 875
},
{
"epoch": 0.8791208791208791,
"grad_norm": 0.091796875,
"learning_rate": 8.787988382772705e-06,
"loss": 1.3111,
"step": 880
},
{
"epoch": 0.8841158841158842,
"grad_norm": 0.09375,
"learning_rate": 8.086466074476563e-06,
"loss": 1.2984,
"step": 885
},
{
"epoch": 0.8891108891108891,
"grad_norm": 0.09228515625,
"learning_rate": 7.412941519000527e-06,
"loss": 1.3023,
"step": 890
},
{
"epoch": 0.8941058941058941,
"grad_norm": 0.09326171875,
"learning_rate": 6.767619878448783e-06,
"loss": 1.3173,
"step": 895
},
{
"epoch": 0.8991008991008991,
"grad_norm": 0.0927734375,
"learning_rate": 6.1506977240444074e-06,
"loss": 1.3053,
"step": 900
},
{
"epoch": 0.9040959040959041,
"grad_norm": 0.09375,
"learning_rate": 5.562362976251901e-06,
"loss": 1.312,
"step": 905
},
{
"epoch": 0.9090909090909091,
"grad_norm": 0.09326171875,
"learning_rate": 5.002794847534764e-06,
"loss": 1.2995,
"step": 910
},
{
"epoch": 0.9140859140859141,
"grad_norm": 0.0947265625,
"learning_rate": 4.4721637877656375e-06,
"loss": 1.332,
"step": 915
},
{
"epoch": 0.919080919080919,
"grad_norm": 0.091796875,
"learning_rate": 3.970631432305694e-06,
"loss": 1.3225,
"step": 920
},
{
"epoch": 0.9240759240759241,
"grad_norm": 0.0927734375,
"learning_rate": 3.4983505527688586e-06,
"loss": 1.2955,
"step": 925
},
{
"epoch": 0.929070929070929,
"grad_norm": 0.095703125,
"learning_rate": 3.0554650104861136e-06,
"loss": 1.3007,
"step": 930
},
{
"epoch": 0.9340659340659341,
"grad_norm": 0.09326171875,
"learning_rate": 2.6421097126839712e-06,
"loss": 1.3259,
"step": 935
},
{
"epoch": 0.939060939060939,
"grad_norm": 0.095703125,
"learning_rate": 2.2584105713904125e-06,
"loss": 1.3091,
"step": 940
},
{
"epoch": 0.9440559440559441,
"grad_norm": 0.09423828125,
"learning_rate": 1.904484465080847e-06,
"loss": 1.3195,
"step": 945
},
{
"epoch": 0.949050949050949,
"grad_norm": 0.09423828125,
"learning_rate": 1.580439203075812e-06,
"loss": 1.3043,
"step": 950
},
{
"epoch": 0.954045954045954,
"grad_norm": 0.09375,
"learning_rate": 1.2863734927012095e-06,
"loss": 1.3167,
"step": 955
},
{
"epoch": 0.9590409590409591,
"grad_norm": 0.09326171875,
"learning_rate": 1.0223769092211012e-06,
"loss": 1.3064,
"step": 960
},
{
"epoch": 0.964035964035964,
"grad_norm": 0.091796875,
"learning_rate": 7.885298685522235e-07,
"loss": 1.3197,
"step": 965
},
{
"epoch": 0.9690309690309691,
"grad_norm": 0.09228515625,
"learning_rate": 5.849036027684606e-07,
"loss": 1.3074,
"step": 970
},
{
"epoch": 0.974025974025974,
"grad_norm": 0.09912109375,
"learning_rate": 4.115601384029666e-07,
"loss": 1.3183,
"step": 975
},
{
"epoch": 0.9790209790209791,
"grad_norm": 0.09326171875,
"learning_rate": 2.685522775541904e-07,
"loss": 1.3165,
"step": 980
},
{
"epoch": 0.984015984015984,
"grad_norm": 0.09130859375,
"learning_rate": 1.5592358180189782e-07,
"loss": 1.3114,
"step": 985
},
{
"epoch": 0.989010989010989,
"grad_norm": 0.09326171875,
"learning_rate": 7.370835893788508e-08,
"loss": 1.2917,
"step": 990
},
{
"epoch": 0.994005994005994,
"grad_norm": 0.095703125,
"learning_rate": 2.193165251545004e-08,
"loss": 1.3122,
"step": 995
},
{
"epoch": 0.999000999000999,
"grad_norm": 0.09326171875,
"learning_rate": 6.092342209607083e-10,
"loss": 1.3184,
"step": 1000
},
{
"epoch": 1.0,
"eval_loss": 1.3126407861709595,
"eval_runtime": 1177.4856,
"eval_samples_per_second": 12.041,
"eval_steps_per_second": 12.041,
"step": 1001
},
{
"epoch": 1.0,
"step": 1001,
"total_flos": 1.709524252278915e+18,
"train_loss": 0.07992646839473393,
"train_runtime": 3675.5245,
"train_samples_per_second": 34.86,
"train_steps_per_second": 0.272
}
],
"logging_steps": 5,
"max_steps": 1001,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 20,
"total_flos": 1.709524252278915e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}