{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 675,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 73.53218812891343,
"learning_rate": 2.9411764705882355e-06,
"loss": 17.1887,
"step": 1
},
{
"epoch": 0.01,
"grad_norm": 72.20908013900717,
"learning_rate": 1.4705882352941177e-05,
"loss": 16.7463,
"step": 5
},
{
"epoch": 0.01,
"grad_norm": 62.778264757967754,
"learning_rate": 2.9411764705882354e-05,
"loss": 15.7811,
"step": 10
},
{
"epoch": 0.02,
"grad_norm": 49.113837778155805,
"learning_rate": 4.411764705882353e-05,
"loss": 12.4438,
"step": 15
},
{
"epoch": 0.03,
"grad_norm": 32.240597800281805,
"learning_rate": 5.882352941176471e-05,
"loss": 7.8515,
"step": 20
},
{
"epoch": 0.04,
"grad_norm": 15.86664975490431,
"learning_rate": 7.352941176470589e-05,
"loss": 5.0038,
"step": 25
},
{
"epoch": 0.04,
"grad_norm": 10.302430667844247,
"learning_rate": 8.823529411764706e-05,
"loss": 3.5358,
"step": 30
},
{
"epoch": 0.05,
"grad_norm": 7.201438595032495,
"learning_rate": 0.00010294117647058823,
"loss": 2.3707,
"step": 35
},
{
"epoch": 0.06,
"grad_norm": 4.623788563021874,
"learning_rate": 0.00011764705882352942,
"loss": 1.7962,
"step": 40
},
{
"epoch": 0.07,
"grad_norm": 2.827534440198122,
"learning_rate": 0.0001323529411764706,
"loss": 1.5507,
"step": 45
},
{
"epoch": 0.07,
"grad_norm": 2.4371715524744273,
"learning_rate": 0.00014705882352941178,
"loss": 1.4152,
"step": 50
},
{
"epoch": 0.08,
"grad_norm": 4.6308985646582865,
"learning_rate": 0.00016176470588235295,
"loss": 1.3047,
"step": 55
},
{
"epoch": 0.09,
"grad_norm": 2.6277992850244916,
"learning_rate": 0.00017647058823529413,
"loss": 1.1884,
"step": 60
},
{
"epoch": 0.1,
"grad_norm": 1.810606688485043,
"learning_rate": 0.0001911764705882353,
"loss": 1.1846,
"step": 65
},
{
"epoch": 0.1,
"grad_norm": 1.6607629181186598,
"learning_rate": 0.00019999464266898484,
"loss": 1.0985,
"step": 70
},
{
"epoch": 0.11,
"grad_norm": 1.4899860561533123,
"learning_rate": 0.00019993437928712978,
"loss": 1.0778,
"step": 75
},
{
"epoch": 0.12,
"grad_norm": 1.5873695514869177,
"learning_rate": 0.0001998071963486563,
"loss": 1.1472,
"step": 80
},
{
"epoch": 0.13,
"grad_norm": 1.5725818281652244,
"learning_rate": 0.00019961317901970953,
"loss": 1.0508,
"step": 85
},
{
"epoch": 0.13,
"grad_norm": 1.7033192337312772,
"learning_rate": 0.0001993524572210807,
"loss": 1.1184,
"step": 90
},
{
"epoch": 0.14,
"grad_norm": 1.864811477430765,
"learning_rate": 0.00019902520554120772,
"loss": 1.0191,
"step": 95
},
{
"epoch": 0.15,
"grad_norm": 1.9842754161579026,
"learning_rate": 0.00019863164311926433,
"loss": 1.0969,
"step": 100
},
{
"epoch": 0.16,
"grad_norm": 1.8657659018311334,
"learning_rate": 0.00019817203349841738,
"loss": 1.0578,
"step": 105
},
{
"epoch": 0.16,
"grad_norm": 1.4402254237747114,
"learning_rate": 0.00019764668444934854,
"loss": 1.0136,
"step": 110
},
{
"epoch": 0.17,
"grad_norm": 1.985678949607785,
"learning_rate": 0.0001970559477641606,
"loss": 1.0014,
"step": 115
},
{
"epoch": 0.18,
"grad_norm": 1.4715740807243745,
"learning_rate": 0.0001964002190208052,
"loss": 1.0444,
"step": 120
},
{
"epoch": 0.19,
"grad_norm": 2.351331444939703,
"learning_rate": 0.00019567993731818984,
"loss": 1.0044,
"step": 125
},
{
"epoch": 0.19,
"grad_norm": 1.7332542810251639,
"learning_rate": 0.00019489558498214196,
"loss": 0.9762,
"step": 130
},
{
"epoch": 0.2,
"grad_norm": 1.3569747435542954,
"learning_rate": 0.00019404768724242666,
"loss": 1.0202,
"step": 135
},
{
"epoch": 0.21,
"grad_norm": 1.6992316193002084,
"learning_rate": 0.00019313681188103457,
"loss": 1.0261,
"step": 140
},
{
"epoch": 0.21,
"grad_norm": 2.269643225680648,
"learning_rate": 0.000192163568851975,
"loss": 1.0437,
"step": 145
},
{
"epoch": 0.22,
"grad_norm": 1.7489937385172625,
"learning_rate": 0.00019112860987282958,
"loss": 0.9816,
"step": 150
},
{
"epoch": 0.23,
"grad_norm": 1.4197466438048754,
"learning_rate": 0.0001900326279883392,
"loss": 0.9973,
"step": 155
},
{
"epoch": 0.24,
"grad_norm": 1.5625301281965893,
"learning_rate": 0.00018887635710631716,
"loss": 1.0078,
"step": 160
},
{
"epoch": 0.24,
"grad_norm": 1.341641051289465,
"learning_rate": 0.00018766057150619865,
"loss": 0.9759,
"step": 165
},
{
"epoch": 0.25,
"grad_norm": 1.9218279347652023,
"learning_rate": 0.00018638608532055634,
"loss": 0.9497,
"step": 170
},
{
"epoch": 0.26,
"grad_norm": 1.5814298979333805,
"learning_rate": 0.00018505375198992857,
"loss": 0.9593,
"step": 175
},
{
"epoch": 0.27,
"grad_norm": 1.6133434657400032,
"learning_rate": 0.00018366446369132578,
"loss": 0.9657,
"step": 180
},
{
"epoch": 0.27,
"grad_norm": 1.8656214764246037,
"learning_rate": 0.00018221915074079762,
"loss": 0.931,
"step": 185
},
{
"epoch": 0.28,
"grad_norm": 1.475284079334899,
"learning_rate": 0.00018071878097046065,
"loss": 1.0032,
"step": 190
},
{
"epoch": 0.29,
"grad_norm": 1.3417878211289882,
"learning_rate": 0.00017916435908040413,
"loss": 0.9502,
"step": 195
},
{
"epoch": 0.3,
"grad_norm": 1.5671700546108844,
"learning_rate": 0.00017755692596590778,
"loss": 0.9655,
"step": 200
},
{
"epoch": 0.3,
"grad_norm": 1.5637519254714574,
"learning_rate": 0.00017589755802042186,
"loss": 1.0083,
"step": 205
},
{
"epoch": 0.31,
"grad_norm": 1.3335973670360752,
"learning_rate": 0.00017418736641477636,
"loss": 0.9257,
"step": 210
},
{
"epoch": 0.32,
"grad_norm": 1.851933280343888,
"learning_rate": 0.0001724274963531022,
"loss": 0.9555,
"step": 215
},
{
"epoch": 0.33,
"grad_norm": 1.597267903033207,
"learning_rate": 0.00017061912630596252,
"loss": 0.961,
"step": 220
},
{
"epoch": 0.33,
"grad_norm": 1.2586072933244326,
"learning_rate": 0.00016876346722120747,
"loss": 0.9545,
"step": 225
},
{
"epoch": 0.34,
"grad_norm": 1.91126837231012,
"learning_rate": 0.00016686176171308126,
"loss": 1.0021,
"step": 230
},
{
"epoch": 0.35,
"grad_norm": 1.221506251787188,
"learning_rate": 0.0001649152832301241,
"loss": 0.9536,
"step": 235
},
{
"epoch": 0.36,
"grad_norm": 1.6421046726120057,
"learning_rate": 0.00016292533520242662,
"loss": 0.989,
"step": 240
},
{
"epoch": 0.36,
"grad_norm": 1.65537527957207,
"learning_rate": 0.00016089325016880736,
"loss": 0.9306,
"step": 245
},
{
"epoch": 0.37,
"grad_norm": 1.551249778846203,
"learning_rate": 0.0001588203888844982,
"loss": 0.933,
"step": 250
},
{
"epoch": 0.38,
"grad_norm": 1.4718905144882328,
"learning_rate": 0.00015670813940993502,
"loss": 0.9966,
"step": 255
},
{
"epoch": 0.39,
"grad_norm": 1.6457075355627533,
"learning_rate": 0.00015455791618126404,
"loss": 0.9326,
"step": 260
},
{
"epoch": 0.39,
"grad_norm": 1.4689616265296752,
"learning_rate": 0.00015237115906318563,
"loss": 0.9327,
"step": 265
},
{
"epoch": 0.4,
"grad_norm": 1.5768108727267562,
"learning_rate": 0.0001501493323847707,
"loss": 0.8785,
"step": 270
},
{
"epoch": 0.41,
"grad_norm": 1.5154414103112674,
"learning_rate": 0.00014789392395889468,
"loss": 0.9677,
"step": 275
},
{
"epoch": 0.41,
"grad_norm": 1.7991573095908828,
"learning_rate": 0.00014560644408594602,
"loss": 0.9732,
"step": 280
},
{
"epoch": 0.42,
"grad_norm": 1.331331813541862,
"learning_rate": 0.0001432884245424761,
"loss": 0.895,
"step": 285
},
{
"epoch": 0.43,
"grad_norm": 1.3265879307262,
"learning_rate": 0.00014094141755546815,
"loss": 0.9495,
"step": 290
},
{
"epoch": 0.44,
"grad_norm": 1.373983753071535,
"learning_rate": 0.00013856699476291176,
"loss": 0.9167,
"step": 295
},
{
"epoch": 0.44,
"grad_norm": 1.3848598834980737,
"learning_rate": 0.000136166746161379,
"loss": 0.9327,
"step": 300
},
{
"epoch": 0.45,
"grad_norm": 1.349182069364577,
"learning_rate": 0.00013374227904130724,
"loss": 0.9156,
"step": 305
},
{
"epoch": 0.46,
"grad_norm": 1.231890373583718,
"learning_rate": 0.00013129521691070107,
"loss": 0.9024,
"step": 310
},
{
"epoch": 0.47,
"grad_norm": 1.3184286986849199,
"learning_rate": 0.00012882719840797473,
"loss": 0.946,
"step": 315
},
{
"epoch": 0.47,
"grad_norm": 1.2850837837862192,
"learning_rate": 0.0001263398762046623,
"loss": 0.9647,
"step": 320
},
{
"epoch": 0.48,
"grad_norm": 1.0818663632355159,
"learning_rate": 0.00012383491589873123,
"loss": 0.8986,
"step": 325
},
{
"epoch": 0.49,
"grad_norm": 1.31127052136108,
"learning_rate": 0.0001213139948992394,
"loss": 0.9347,
"step": 330
},
{
"epoch": 0.5,
"grad_norm": 1.3354281558595502,
"learning_rate": 0.0001187788013030837,
"loss": 0.912,
"step": 335
},
{
"epoch": 0.5,
"grad_norm": 2.3230181317972467,
"learning_rate": 0.00011623103276459086,
"loss": 0.9542,
"step": 340
},
{
"epoch": 0.51,
"grad_norm": 1.3558452166577937,
"learning_rate": 0.00011367239535870913,
"loss": 0.9307,
"step": 345
},
{
"epoch": 0.52,
"grad_norm": 1.2776477730191607,
"learning_rate": 0.00011110460243856052,
"loss": 0.842,
"step": 350
},
{
"epoch": 0.53,
"grad_norm": 1.1931456337138724,
"learning_rate": 0.0001085293734881197,
"loss": 1.023,
"step": 355
},
{
"epoch": 0.53,
"grad_norm": 1.4101263363441188,
"learning_rate": 0.00010594843297078737,
"loss": 0.8469,
"step": 360
},
{
"epoch": 0.54,
"grad_norm": 1.4759810995782983,
"learning_rate": 0.00010336350917462925,
"loss": 0.9429,
"step": 365
},
{
"epoch": 0.55,
"grad_norm": 1.3195800017049626,
"learning_rate": 0.00010077633305505403,
"loss": 0.9467,
"step": 370
},
{
"epoch": 0.56,
"grad_norm": 1.4340773951366725,
"learning_rate": 9.818863707570475e-05,
"loss": 0.9234,
"step": 375
},
{
"epoch": 0.56,
"grad_norm": 1.3906713677707887,
"learning_rate": 9.560215404834095e-05,
"loss": 0.8627,
"step": 380
},
{
"epoch": 0.57,
"grad_norm": 1.2312829341126241,
"learning_rate": 9.30186159724869e-05,
"loss": 0.9707,
"step": 385
},
{
"epoch": 0.58,
"grad_norm": 1.2296503762823234,
"learning_rate": 9.043975287562441e-05,
"loss": 0.8975,
"step": 390
},
{
"epoch": 0.59,
"grad_norm": 1.428266488089383,
"learning_rate": 8.786729165470584e-05,
"loss": 0.9242,
"step": 395
},
{
"epoch": 0.59,
"grad_norm": 1.4351708822396818,
"learning_rate": 8.530295491976337e-05,
"loss": 0.9613,
"step": 400
},
{
"epoch": 0.6,
"grad_norm": 1.4023979907327273,
"learning_rate": 8.274845984038916e-05,
"loss": 0.9386,
"step": 405
},
{
"epoch": 0.61,
"grad_norm": 1.5289715737644887,
"learning_rate": 8.020551699585842e-05,
"loss": 0.8882,
"step": 410
},
{
"epoch": 0.61,
"grad_norm": 1.3427576459400747,
"learning_rate": 7.76758292296659e-05,
"loss": 0.9386,
"step": 415
},
{
"epoch": 0.62,
"grad_norm": 1.6234983552618958,
"learning_rate": 7.516109050924201e-05,
"loss": 0.9497,
"step": 420
},
{
"epoch": 0.63,
"grad_norm": 1.3757787363056968,
"learning_rate": 7.266298479161318e-05,
"loss": 0.9353,
"step": 425
},
{
"epoch": 0.64,
"grad_norm": 1.0207515861639591,
"learning_rate": 7.01831848957653e-05,
"loss": 0.8773,
"step": 430
},
{
"epoch": 0.64,
"grad_norm": 1.066825211996411,
"learning_rate": 6.772335138246548e-05,
"loss": 0.8815,
"step": 435
},
{
"epoch": 0.65,
"grad_norm": 1.585875694236883,
"learning_rate": 6.528513144229255e-05,
"loss": 0.8624,
"step": 440
},
{
"epoch": 0.66,
"grad_norm": 3.1374098679372833,
"learning_rate": 6.287015779262064e-05,
"loss": 0.8769,
"step": 445
},
{
"epoch": 0.67,
"grad_norm": 1.4323569006609465,
"learning_rate": 6.048004758429451e-05,
"loss": 0.9578,
"step": 450
},
{
"epoch": 0.67,
"grad_norm": 1.1745446431201967,
"learning_rate": 5.8116401318728667e-05,
"loss": 0.9778,
"step": 455
},
{
"epoch": 0.68,
"grad_norm": 1.3489994553201705,
"learning_rate": 5.578080177615575e-05,
"loss": 0.9453,
"step": 460
},
{
"epoch": 0.69,
"grad_norm": 1.4726017758093604,
"learning_rate": 5.3474812955741404e-05,
"loss": 0.9388,
"step": 465
},
{
"epoch": 0.7,
"grad_norm": 1.4222376466782578,
"learning_rate": 5.119997902827584e-05,
"loss": 0.9389,
"step": 470
},
{
"epoch": 0.7,
"grad_norm": 1.312463261970253,
"learning_rate": 4.895782330214291e-05,
"loss": 0.9978,
"step": 475
},
{
"epoch": 0.71,
"grad_norm": 1.4811777334942215,
"learning_rate": 4.674984720325961e-05,
"loss": 0.9229,
"step": 480
},
{
"epoch": 0.72,
"grad_norm": 1.187387116264144,
"learning_rate": 4.4577529269668874e-05,
"loss": 0.9319,
"step": 485
},
{
"epoch": 0.73,
"grad_norm": 1.573886175317443,
"learning_rate": 4.244232416145839e-05,
"loss": 0.8582,
"step": 490
},
{
"epoch": 0.73,
"grad_norm": 1.8423570100062419,
"learning_rate": 4.0345661686669745e-05,
"loss": 0.9875,
"step": 495
},
{
"epoch": 0.74,
"grad_norm": 1.3959739056560523,
"learning_rate": 3.828894584384867e-05,
"loss": 0.9499,
"step": 500
},
{
"epoch": 0.75,
"grad_norm": 1.2522644394201856,
"learning_rate": 3.62735538818787e-05,
"loss": 0.9465,
"step": 505
},
{
"epoch": 0.76,
"grad_norm": 1.1092455291419923,
"learning_rate": 3.43008353777269e-05,
"loss": 0.8705,
"step": 510
},
{
"epoch": 0.76,
"grad_norm": 1.3494618399232021,
"learning_rate": 3.237211133272004e-05,
"loss": 0.8925,
"step": 515
},
{
"epoch": 0.77,
"grad_norm": 1.327141307012543,
"learning_rate": 3.0488673287955882e-05,
"loss": 0.8864,
"step": 520
},
{
"epoch": 0.78,
"grad_norm": 1.4712560536545136,
"learning_rate": 2.8651782459442176e-05,
"loss": 0.9095,
"step": 525
},
{
"epoch": 0.79,
"grad_norm": 1.0753618251829293,
"learning_rate": 2.686266889354211e-05,
"loss": 0.9094,
"step": 530
},
{
"epoch": 0.79,
"grad_norm": 1.1938128239663364,
"learning_rate": 2.5122530643292275e-05,
"loss": 0.895,
"step": 535
},
{
"epoch": 0.8,
"grad_norm": 1.2604734873430494,
"learning_rate": 2.3432532966144527e-05,
"loss": 0.9122,
"step": 540
},
{
"epoch": 0.81,
"grad_norm": 1.1630833266614449,
"learning_rate": 2.1793807543668853e-05,
"loss": 0.8707,
"step": 545
},
{
"epoch": 0.81,
"grad_norm": 1.4007959537230523,
"learning_rate": 2.0207451723739633e-05,
"loss": 0.9303,
"step": 550
},
{
"epoch": 0.82,
"grad_norm": 1.1568429331812096,
"learning_rate": 1.8674527785713247e-05,
"loss": 0.918,
"step": 555
},
{
"epoch": 0.83,
"grad_norm": 1.7514698026073916,
"learning_rate": 1.7196062229088604e-05,
"loss": 0.9194,
"step": 560
},
{
"epoch": 0.84,
"grad_norm": 3.820001651436939,
"learning_rate": 1.577304508612717e-05,
"loss": 0.9079,
"step": 565
},
{
"epoch": 0.84,
"grad_norm": 1.189776848938503,
"learning_rate": 1.4406429258892762e-05,
"loss": 0.8622,
"step": 570
},
{
"epoch": 0.85,
"grad_norm": 1.2555397613395767,
"learning_rate": 1.3097129881154934e-05,
"loss": 0.963,
"step": 575
},
{
"epoch": 0.86,
"grad_norm": 1.434313272835203,
"learning_rate": 1.1846023705583442e-05,
"loss": 0.903,
"step": 580
},
{
"epoch": 0.87,
"grad_norm": 1.1102917022071244,
"learning_rate": 1.065394851664394e-05,
"loss": 0.8873,
"step": 585
},
{
"epoch": 0.87,
"grad_norm": 1.346033677968864,
"learning_rate": 9.521702569588198e-06,
"loss": 0.9241,
"step": 590
},
{
"epoch": 0.88,
"grad_norm": 1.352314708606863,
"learning_rate": 8.450044055914497e-06,
"loss": 0.844,
"step": 595
},
{
"epoch": 0.89,
"grad_norm": 1.4077107378660827,
"learning_rate": 7.439690595656013e-06,
"loss": 0.9185,
"step": 600
},
{
"epoch": 0.9,
"grad_norm": 1.2576697014152542,
"learning_rate": 6.4913187568374164e-06,
"loss": 0.9464,
"step": 605
},
{
"epoch": 0.9,
"grad_norm": 1.185889063940016,
"learning_rate": 5.605563602421149e-06,
"loss": 0.9113,
"step": 610
},
{
"epoch": 0.91,
"grad_norm": 1.289407769067274,
"learning_rate": 4.783018265047179e-06,
"loss": 0.9136,
"step": 615
},
{
"epoch": 0.92,
"grad_norm": 1.1909907385752865,
"learning_rate": 4.024233549850509e-06,
"loss": 0.9004,
"step": 620
},
{
"epoch": 0.93,
"grad_norm": 1.3163426883308564,
"learning_rate": 3.329717565622825e-06,
"loss": 0.9107,
"step": 625
},
{
"epoch": 0.93,
"grad_norm": 1.0164303782764494,
"learning_rate": 2.699935384565111e-06,
"loss": 0.863,
"step": 630
},
{
"epoch": 0.94,
"grad_norm": 1.2480074548201365,
"learning_rate": 2.1353087308590314e-06,
"loss": 0.9415,
"step": 635
},
{
"epoch": 0.95,
"grad_norm": 1.2887306090020527,
"learning_rate": 1.6362156982656084e-06,
"loss": 0.9614,
"step": 640
},
{
"epoch": 0.96,
"grad_norm": 1.4090113723084001,
"learning_rate": 1.2029904969404482e-06,
"loss": 0.9415,
"step": 645
},
{
"epoch": 0.96,
"grad_norm": 1.2326893640043235,
"learning_rate": 8.359232296349162e-07,
"loss": 0.9132,
"step": 650
},
{
"epoch": 0.97,
"grad_norm": 1.2349784200342029,
"learning_rate": 5.352596974332436e-07,
"loss": 0.9187,
"step": 655
},
{
"epoch": 0.98,
"grad_norm": 1.3101498596239862,
"learning_rate": 3.0120123515540164e-07,
"loss": 0.9452,
"step": 660
},
{
"epoch": 0.99,
"grad_norm": 1.281188961147188,
"learning_rate": 1.3390457653639222e-07,
"loss": 0.9203,
"step": 665
},
{
"epoch": 0.99,
"grad_norm": 1.4762914419165216,
"learning_rate": 3.3481749271768726e-08,
"loss": 0.8818,
"step": 670
},
{
"epoch": 1.0,
"grad_norm": 1.479271940054715,
"learning_rate": 0.0,
"loss": 0.9395,
"step": 675
},
{
"epoch": 1.0,
"eval_loss": 1.2425206899642944,
"eval_runtime": 248.4678,
"eval_samples_per_second": 9.297,
"eval_steps_per_second": 0.584,
"step": 675
},
{
"epoch": 1.0,
"step": 675,
"total_flos": 1369955766894592.0,
"train_loss": 1.3901304527565286,
"train_runtime": 7332.1213,
"train_samples_per_second": 2.945,
"train_steps_per_second": 0.092
}
],
"logging_steps": 5,
"max_steps": 675,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"total_flos": 1369955766894592.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}