luxxx / trainer_state.json
forbiddensoul90's picture
Upload folder using huggingface_hub
786f2d3 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9998077107970387,
"eval_steps": 500,
"global_step": 2600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007691568118450149,
"grad_norm": 0.8310319185256958,
"learning_rate": 1.923076923076923e-05,
"loss": 2.3717,
"step": 10
},
{
"epoch": 0.015383136236900298,
"grad_norm": 0.8349233269691467,
"learning_rate": 3.846153846153846e-05,
"loss": 2.2925,
"step": 20
},
{
"epoch": 0.02307470435535045,
"grad_norm": 0.7424416542053223,
"learning_rate": 4.992229992229993e-05,
"loss": 2.0978,
"step": 30
},
{
"epoch": 0.030766272473800597,
"grad_norm": 0.7559643387794495,
"learning_rate": 4.972804972804973e-05,
"loss": 1.9267,
"step": 40
},
{
"epoch": 0.03845784059225075,
"grad_norm": 0.6887438297271729,
"learning_rate": 4.9533799533799534e-05,
"loss": 1.7711,
"step": 50
},
{
"epoch": 0.0461494087107009,
"grad_norm": 0.8163133263587952,
"learning_rate": 4.9339549339549344e-05,
"loss": 1.7121,
"step": 60
},
{
"epoch": 0.053840976829151045,
"grad_norm": 0.979810357093811,
"learning_rate": 4.9145299145299147e-05,
"loss": 1.6597,
"step": 70
},
{
"epoch": 0.061532544947601193,
"grad_norm": 0.9440365433692932,
"learning_rate": 4.8951048951048956e-05,
"loss": 1.6236,
"step": 80
},
{
"epoch": 0.06922411306605133,
"grad_norm": 0.8523435592651367,
"learning_rate": 4.875679875679876e-05,
"loss": 1.616,
"step": 90
},
{
"epoch": 0.0769156811845015,
"grad_norm": 1.1356542110443115,
"learning_rate": 4.856254856254856e-05,
"loss": 1.58,
"step": 100
},
{
"epoch": 0.08460724930295165,
"grad_norm": 1.1632951498031616,
"learning_rate": 4.836829836829837e-05,
"loss": 1.5241,
"step": 110
},
{
"epoch": 0.0922988174214018,
"grad_norm": 1.1739879846572876,
"learning_rate": 4.8174048174048175e-05,
"loss": 1.4737,
"step": 120
},
{
"epoch": 0.09999038553985194,
"grad_norm": 1.1852024793624878,
"learning_rate": 4.797979797979798e-05,
"loss": 1.5002,
"step": 130
},
{
"epoch": 0.10768195365830209,
"grad_norm": 1.2705507278442383,
"learning_rate": 4.778554778554779e-05,
"loss": 1.4457,
"step": 140
},
{
"epoch": 0.11537352177675224,
"grad_norm": 1.3838472366333008,
"learning_rate": 4.75912975912976e-05,
"loss": 1.4372,
"step": 150
},
{
"epoch": 0.12306508989520239,
"grad_norm": 1.463762640953064,
"learning_rate": 4.73970473970474e-05,
"loss": 1.4695,
"step": 160
},
{
"epoch": 0.13075665801365252,
"grad_norm": 1.4812041521072388,
"learning_rate": 4.7202797202797204e-05,
"loss": 1.465,
"step": 170
},
{
"epoch": 0.13844822613210267,
"grad_norm": 1.3840124607086182,
"learning_rate": 4.700854700854701e-05,
"loss": 1.4121,
"step": 180
},
{
"epoch": 0.14613979425055282,
"grad_norm": 1.5955114364624023,
"learning_rate": 4.681429681429682e-05,
"loss": 1.4064,
"step": 190
},
{
"epoch": 0.153831362369003,
"grad_norm": 1.4825975894927979,
"learning_rate": 4.662004662004662e-05,
"loss": 1.3869,
"step": 200
},
{
"epoch": 0.16152293048745314,
"grad_norm": 1.5520519018173218,
"learning_rate": 4.642579642579643e-05,
"loss": 1.3914,
"step": 210
},
{
"epoch": 0.1692144986059033,
"grad_norm": 1.3405165672302246,
"learning_rate": 4.623154623154623e-05,
"loss": 1.378,
"step": 220
},
{
"epoch": 0.17690606672435344,
"grad_norm": 1.4172375202178955,
"learning_rate": 4.603729603729604e-05,
"loss": 1.3843,
"step": 230
},
{
"epoch": 0.1845976348428036,
"grad_norm": 1.447364330291748,
"learning_rate": 4.5843045843045846e-05,
"loss": 1.3959,
"step": 240
},
{
"epoch": 0.19228920296125374,
"grad_norm": 1.5292952060699463,
"learning_rate": 4.564879564879565e-05,
"loss": 1.3926,
"step": 250
},
{
"epoch": 0.19998077107970388,
"grad_norm": 1.5983524322509766,
"learning_rate": 4.545454545454546e-05,
"loss": 1.3721,
"step": 260
},
{
"epoch": 0.20767233919815403,
"grad_norm": 1.520230770111084,
"learning_rate": 4.526029526029526e-05,
"loss": 1.362,
"step": 270
},
{
"epoch": 0.21536390731660418,
"grad_norm": 1.5406700372695923,
"learning_rate": 4.506604506604507e-05,
"loss": 1.36,
"step": 280
},
{
"epoch": 0.22305547543505433,
"grad_norm": 1.5974327325820923,
"learning_rate": 4.4871794871794874e-05,
"loss": 1.3326,
"step": 290
},
{
"epoch": 0.23074704355350448,
"grad_norm": 1.5478413105010986,
"learning_rate": 4.467754467754468e-05,
"loss": 1.3277,
"step": 300
},
{
"epoch": 0.23843861167195463,
"grad_norm": 1.537761926651001,
"learning_rate": 4.448329448329449e-05,
"loss": 1.3329,
"step": 310
},
{
"epoch": 0.24613017979040477,
"grad_norm": 1.5494816303253174,
"learning_rate": 4.428904428904429e-05,
"loss": 1.2966,
"step": 320
},
{
"epoch": 0.2538217479088549,
"grad_norm": 1.6187357902526855,
"learning_rate": 4.4094794094794093e-05,
"loss": 1.3152,
"step": 330
},
{
"epoch": 0.26151331602730504,
"grad_norm": 1.9419796466827393,
"learning_rate": 4.39005439005439e-05,
"loss": 1.3223,
"step": 340
},
{
"epoch": 0.2692048841457552,
"grad_norm": 1.6638212203979492,
"learning_rate": 4.370629370629371e-05,
"loss": 1.3366,
"step": 350
},
{
"epoch": 0.27689645226420534,
"grad_norm": 1.8047728538513184,
"learning_rate": 4.3512043512043516e-05,
"loss": 1.3156,
"step": 360
},
{
"epoch": 0.2845880203826555,
"grad_norm": 1.6696109771728516,
"learning_rate": 4.331779331779332e-05,
"loss": 1.3229,
"step": 370
},
{
"epoch": 0.29227958850110564,
"grad_norm": 1.6470463275909424,
"learning_rate": 4.312354312354312e-05,
"loss": 1.2842,
"step": 380
},
{
"epoch": 0.2999711566195558,
"grad_norm": 1.6966880559921265,
"learning_rate": 4.292929292929293e-05,
"loss": 1.2962,
"step": 390
},
{
"epoch": 0.307662724738006,
"grad_norm": 2.1080636978149414,
"learning_rate": 4.2735042735042735e-05,
"loss": 1.2865,
"step": 400
},
{
"epoch": 0.3153542928564561,
"grad_norm": 1.9488563537597656,
"learning_rate": 4.254079254079254e-05,
"loss": 1.2932,
"step": 410
},
{
"epoch": 0.3230458609749063,
"grad_norm": 1.8715245723724365,
"learning_rate": 4.234654234654235e-05,
"loss": 1.2751,
"step": 420
},
{
"epoch": 0.3307374290933564,
"grad_norm": 1.6135368347167969,
"learning_rate": 4.215229215229216e-05,
"loss": 1.2923,
"step": 430
},
{
"epoch": 0.3384289972118066,
"grad_norm": 2.0071396827697754,
"learning_rate": 4.195804195804196e-05,
"loss": 1.3004,
"step": 440
},
{
"epoch": 0.3461205653302567,
"grad_norm": 1.954253077507019,
"learning_rate": 4.1763791763791764e-05,
"loss": 1.2898,
"step": 450
},
{
"epoch": 0.3538121334487069,
"grad_norm": 2.44484806060791,
"learning_rate": 4.1569541569541574e-05,
"loss": 1.2873,
"step": 460
},
{
"epoch": 0.361503701567157,
"grad_norm": 1.7962288856506348,
"learning_rate": 4.1375291375291377e-05,
"loss": 1.2697,
"step": 470
},
{
"epoch": 0.3691952696856072,
"grad_norm": 1.6677371263504028,
"learning_rate": 4.1181041181041186e-05,
"loss": 1.2602,
"step": 480
},
{
"epoch": 0.3768868378040573,
"grad_norm": 1.6148051023483276,
"learning_rate": 4.098679098679099e-05,
"loss": 1.2889,
"step": 490
},
{
"epoch": 0.38457840592250747,
"grad_norm": 2.0414624214172363,
"learning_rate": 4.079254079254079e-05,
"loss": 1.2861,
"step": 500
},
{
"epoch": 0.3922699740409576,
"grad_norm": 1.5257327556610107,
"learning_rate": 4.05982905982906e-05,
"loss": 1.2399,
"step": 510
},
{
"epoch": 0.39996154215940777,
"grad_norm": 1.9141441583633423,
"learning_rate": 4.0404040404040405e-05,
"loss": 1.2775,
"step": 520
},
{
"epoch": 0.4076531102778579,
"grad_norm": 1.686346411705017,
"learning_rate": 4.020979020979021e-05,
"loss": 1.2471,
"step": 530
},
{
"epoch": 0.41534467839630806,
"grad_norm": 1.7622222900390625,
"learning_rate": 4.001554001554002e-05,
"loss": 1.2707,
"step": 540
},
{
"epoch": 0.4230362465147582,
"grad_norm": 1.8795199394226074,
"learning_rate": 3.982128982128983e-05,
"loss": 1.2862,
"step": 550
},
{
"epoch": 0.43072781463320836,
"grad_norm": 1.8214608430862427,
"learning_rate": 3.962703962703963e-05,
"loss": 1.2553,
"step": 560
},
{
"epoch": 0.4384193827516585,
"grad_norm": 1.7057331800460815,
"learning_rate": 3.9432789432789434e-05,
"loss": 1.2684,
"step": 570
},
{
"epoch": 0.44611095087010866,
"grad_norm": 1.9549400806427002,
"learning_rate": 3.923853923853924e-05,
"loss": 1.2512,
"step": 580
},
{
"epoch": 0.4538025189885588,
"grad_norm": 1.906495451927185,
"learning_rate": 3.904428904428905e-05,
"loss": 1.2524,
"step": 590
},
{
"epoch": 0.46149408710700895,
"grad_norm": 1.76618230342865,
"learning_rate": 3.885003885003885e-05,
"loss": 1.2613,
"step": 600
},
{
"epoch": 0.4691856552254591,
"grad_norm": 2.038341522216797,
"learning_rate": 3.865578865578865e-05,
"loss": 1.2266,
"step": 610
},
{
"epoch": 0.47687722334390925,
"grad_norm": 1.7204527854919434,
"learning_rate": 3.846153846153846e-05,
"loss": 1.2478,
"step": 620
},
{
"epoch": 0.48456879146235937,
"grad_norm": 1.7129229307174683,
"learning_rate": 3.826728826728827e-05,
"loss": 1.2207,
"step": 630
},
{
"epoch": 0.49226035958080955,
"grad_norm": 1.8858224153518677,
"learning_rate": 3.8073038073038076e-05,
"loss": 1.2219,
"step": 640
},
{
"epoch": 0.49995192769925967,
"grad_norm": 1.8016833066940308,
"learning_rate": 3.787878787878788e-05,
"loss": 1.2586,
"step": 650
},
{
"epoch": 0.5076434958177098,
"grad_norm": 1.7311338186264038,
"learning_rate": 3.768453768453769e-05,
"loss": 1.2072,
"step": 660
},
{
"epoch": 0.51533506393616,
"grad_norm": 1.7362291812896729,
"learning_rate": 3.749028749028749e-05,
"loss": 1.229,
"step": 670
},
{
"epoch": 0.5230266320546101,
"grad_norm": 2.358015537261963,
"learning_rate": 3.72960372960373e-05,
"loss": 1.196,
"step": 680
},
{
"epoch": 0.5307182001730603,
"grad_norm": 1.9326188564300537,
"learning_rate": 3.7101787101787104e-05,
"loss": 1.2491,
"step": 690
},
{
"epoch": 0.5384097682915104,
"grad_norm": 1.9867770671844482,
"learning_rate": 3.690753690753691e-05,
"loss": 1.2383,
"step": 700
},
{
"epoch": 0.5461013364099606,
"grad_norm": 2.026604175567627,
"learning_rate": 3.671328671328672e-05,
"loss": 1.2172,
"step": 710
},
{
"epoch": 0.5537929045284107,
"grad_norm": 2.0612032413482666,
"learning_rate": 3.651903651903652e-05,
"loss": 1.2607,
"step": 720
},
{
"epoch": 0.5614844726468609,
"grad_norm": 1.8911861181259155,
"learning_rate": 3.6324786324786323e-05,
"loss": 1.2352,
"step": 730
},
{
"epoch": 0.569176040765311,
"grad_norm": 1.8968251943588257,
"learning_rate": 3.613053613053613e-05,
"loss": 1.2274,
"step": 740
},
{
"epoch": 0.5768676088837612,
"grad_norm": 1.8784927129745483,
"learning_rate": 3.593628593628594e-05,
"loss": 1.2138,
"step": 750
},
{
"epoch": 0.5845591770022113,
"grad_norm": 2.070425033569336,
"learning_rate": 3.5742035742035746e-05,
"loss": 1.2343,
"step": 760
},
{
"epoch": 0.5922507451206614,
"grad_norm": 2.00534987449646,
"learning_rate": 3.554778554778555e-05,
"loss": 1.2484,
"step": 770
},
{
"epoch": 0.5999423132391116,
"grad_norm": 2.018569231033325,
"learning_rate": 3.535353535353535e-05,
"loss": 1.1964,
"step": 780
},
{
"epoch": 0.6076338813575618,
"grad_norm": 2.2748711109161377,
"learning_rate": 3.515928515928516e-05,
"loss": 1.2415,
"step": 790
},
{
"epoch": 0.615325449476012,
"grad_norm": 1.8931610584259033,
"learning_rate": 3.4965034965034965e-05,
"loss": 1.2515,
"step": 800
},
{
"epoch": 0.623017017594462,
"grad_norm": 1.8468350172042847,
"learning_rate": 3.477078477078477e-05,
"loss": 1.2381,
"step": 810
},
{
"epoch": 0.6307085857129122,
"grad_norm": 2.2593376636505127,
"learning_rate": 3.457653457653458e-05,
"loss": 1.2556,
"step": 820
},
{
"epoch": 0.6384001538313624,
"grad_norm": 1.9469683170318604,
"learning_rate": 3.438228438228439e-05,
"loss": 1.2071,
"step": 830
},
{
"epoch": 0.6460917219498126,
"grad_norm": 1.769471526145935,
"learning_rate": 3.418803418803419e-05,
"loss": 1.2112,
"step": 840
},
{
"epoch": 0.6537832900682626,
"grad_norm": 2.112982988357544,
"learning_rate": 3.3993783993783994e-05,
"loss": 1.2471,
"step": 850
},
{
"epoch": 0.6614748581867128,
"grad_norm": 1.8016294240951538,
"learning_rate": 3.37995337995338e-05,
"loss": 1.2145,
"step": 860
},
{
"epoch": 0.669166426305163,
"grad_norm": 1.9988363981246948,
"learning_rate": 3.3605283605283607e-05,
"loss": 1.191,
"step": 870
},
{
"epoch": 0.6768579944236132,
"grad_norm": 1.8531577587127686,
"learning_rate": 3.341103341103341e-05,
"loss": 1.1722,
"step": 880
},
{
"epoch": 0.6845495625420632,
"grad_norm": 1.9101706743240356,
"learning_rate": 3.321678321678322e-05,
"loss": 1.1738,
"step": 890
},
{
"epoch": 0.6922411306605134,
"grad_norm": 2.064260482788086,
"learning_rate": 3.302253302253302e-05,
"loss": 1.2045,
"step": 900
},
{
"epoch": 0.6999326987789636,
"grad_norm": 1.895975947380066,
"learning_rate": 3.282828282828283e-05,
"loss": 1.1815,
"step": 910
},
{
"epoch": 0.7076242668974138,
"grad_norm": 1.6610194444656372,
"learning_rate": 3.2634032634032635e-05,
"loss": 1.2059,
"step": 920
},
{
"epoch": 0.7153158350158638,
"grad_norm": 1.8297659158706665,
"learning_rate": 3.243978243978244e-05,
"loss": 1.2034,
"step": 930
},
{
"epoch": 0.723007403134314,
"grad_norm": 2.0258054733276367,
"learning_rate": 3.224553224553225e-05,
"loss": 1.2051,
"step": 940
},
{
"epoch": 0.7306989712527642,
"grad_norm": 1.9012644290924072,
"learning_rate": 3.205128205128206e-05,
"loss": 1.2072,
"step": 950
},
{
"epoch": 0.7383905393712143,
"grad_norm": 1.8051044940948486,
"learning_rate": 3.185703185703186e-05,
"loss": 1.2149,
"step": 960
},
{
"epoch": 0.7460821074896644,
"grad_norm": 2.0245988368988037,
"learning_rate": 3.1662781662781664e-05,
"loss": 1.1886,
"step": 970
},
{
"epoch": 0.7537736756081146,
"grad_norm": 1.9541068077087402,
"learning_rate": 3.146853146853147e-05,
"loss": 1.2094,
"step": 980
},
{
"epoch": 0.7614652437265648,
"grad_norm": 1.8421745300292969,
"learning_rate": 3.127428127428128e-05,
"loss": 1.2041,
"step": 990
},
{
"epoch": 0.7691568118450149,
"grad_norm": 1.866947889328003,
"learning_rate": 3.108003108003108e-05,
"loss": 1.2103,
"step": 1000
},
{
"epoch": 0.776848379963465,
"grad_norm": 2.1929476261138916,
"learning_rate": 3.088578088578088e-05,
"loss": 1.2058,
"step": 1010
},
{
"epoch": 0.7845399480819152,
"grad_norm": 2.0555646419525146,
"learning_rate": 3.069153069153069e-05,
"loss": 1.1894,
"step": 1020
},
{
"epoch": 0.7922315162003654,
"grad_norm": 2.0418717861175537,
"learning_rate": 3.04972804972805e-05,
"loss": 1.1942,
"step": 1030
},
{
"epoch": 0.7999230843188155,
"grad_norm": 2.218815326690674,
"learning_rate": 3.0303030303030306e-05,
"loss": 1.1676,
"step": 1040
},
{
"epoch": 0.8076146524372656,
"grad_norm": 1.9924787282943726,
"learning_rate": 3.010878010878011e-05,
"loss": 1.1938,
"step": 1050
},
{
"epoch": 0.8153062205557158,
"grad_norm": 1.7555562257766724,
"learning_rate": 2.9914529914529915e-05,
"loss": 1.1737,
"step": 1060
},
{
"epoch": 0.822997788674166,
"grad_norm": 1.8218282461166382,
"learning_rate": 2.972027972027972e-05,
"loss": 1.1635,
"step": 1070
},
{
"epoch": 0.8306893567926161,
"grad_norm": 1.9514037370681763,
"learning_rate": 2.9526029526029525e-05,
"loss": 1.1751,
"step": 1080
},
{
"epoch": 0.8383809249110662,
"grad_norm": 1.995100498199463,
"learning_rate": 2.9331779331779334e-05,
"loss": 1.1705,
"step": 1090
},
{
"epoch": 0.8460724930295164,
"grad_norm": 1.9724220037460327,
"learning_rate": 2.913752913752914e-05,
"loss": 1.2067,
"step": 1100
},
{
"epoch": 0.8537640611479665,
"grad_norm": 2.083712577819824,
"learning_rate": 2.8943278943278944e-05,
"loss": 1.1644,
"step": 1110
},
{
"epoch": 0.8614556292664167,
"grad_norm": 2.1601104736328125,
"learning_rate": 2.874902874902875e-05,
"loss": 1.1772,
"step": 1120
},
{
"epoch": 0.8691471973848668,
"grad_norm": 2.0567638874053955,
"learning_rate": 2.8554778554778557e-05,
"loss": 1.1846,
"step": 1130
},
{
"epoch": 0.876838765503317,
"grad_norm": 2.4114835262298584,
"learning_rate": 2.836052836052836e-05,
"loss": 1.1658,
"step": 1140
},
{
"epoch": 0.8845303336217671,
"grad_norm": 1.685823917388916,
"learning_rate": 2.8166278166278166e-05,
"loss": 1.1858,
"step": 1150
},
{
"epoch": 0.8922219017402173,
"grad_norm": 2.006232738494873,
"learning_rate": 2.7972027972027976e-05,
"loss": 1.1781,
"step": 1160
},
{
"epoch": 0.8999134698586674,
"grad_norm": 2.089742660522461,
"learning_rate": 2.777777777777778e-05,
"loss": 1.1569,
"step": 1170
},
{
"epoch": 0.9076050379771176,
"grad_norm": 1.8699461221694946,
"learning_rate": 2.7583527583527586e-05,
"loss": 1.1653,
"step": 1180
},
{
"epoch": 0.9152966060955677,
"grad_norm": 1.997068166732788,
"learning_rate": 2.738927738927739e-05,
"loss": 1.2116,
"step": 1190
},
{
"epoch": 0.9229881742140179,
"grad_norm": 2.135075807571411,
"learning_rate": 2.7195027195027195e-05,
"loss": 1.2062,
"step": 1200
},
{
"epoch": 0.9306797423324681,
"grad_norm": 2.3898837566375732,
"learning_rate": 2.7000777000777e-05,
"loss": 1.1802,
"step": 1210
},
{
"epoch": 0.9383713104509182,
"grad_norm": 1.8470486402511597,
"learning_rate": 2.680652680652681e-05,
"loss": 1.1388,
"step": 1220
},
{
"epoch": 0.9460628785693683,
"grad_norm": 1.909771203994751,
"learning_rate": 2.6612276612276614e-05,
"loss": 1.1456,
"step": 1230
},
{
"epoch": 0.9537544466878185,
"grad_norm": 1.75010085105896,
"learning_rate": 2.641802641802642e-05,
"loss": 1.1792,
"step": 1240
},
{
"epoch": 0.9614460148062687,
"grad_norm": 1.835392951965332,
"learning_rate": 2.6223776223776224e-05,
"loss": 1.1666,
"step": 1250
},
{
"epoch": 0.9691375829247187,
"grad_norm": 1.74484384059906,
"learning_rate": 2.602952602952603e-05,
"loss": 1.184,
"step": 1260
},
{
"epoch": 0.9768291510431689,
"grad_norm": 1.9739990234375,
"learning_rate": 2.5835275835275837e-05,
"loss": 1.1342,
"step": 1270
},
{
"epoch": 0.9845207191616191,
"grad_norm": 1.9165340662002563,
"learning_rate": 2.564102564102564e-05,
"loss": 1.1852,
"step": 1280
},
{
"epoch": 0.9922122872800693,
"grad_norm": 2.0717194080352783,
"learning_rate": 2.544677544677545e-05,
"loss": 1.1611,
"step": 1290
},
{
"epoch": 0.9999038553985193,
"grad_norm": 1.7079836130142212,
"learning_rate": 2.5252525252525256e-05,
"loss": 1.1498,
"step": 1300
},
{
"epoch": 1.0075954235169695,
"grad_norm": 1.904863953590393,
"learning_rate": 2.505827505827506e-05,
"loss": 1.252,
"step": 1310
},
{
"epoch": 1.0152869916354197,
"grad_norm": 1.7407102584838867,
"learning_rate": 2.4864024864024865e-05,
"loss": 1.1177,
"step": 1320
},
{
"epoch": 1.0229785597538699,
"grad_norm": 2.13275146484375,
"learning_rate": 2.4669774669774672e-05,
"loss": 1.1409,
"step": 1330
},
{
"epoch": 1.03067012787232,
"grad_norm": 1.8755226135253906,
"learning_rate": 2.4475524475524478e-05,
"loss": 1.1262,
"step": 1340
},
{
"epoch": 1.0383616959907702,
"grad_norm": 2.0891833305358887,
"learning_rate": 2.428127428127428e-05,
"loss": 1.1126,
"step": 1350
},
{
"epoch": 1.0460532641092202,
"grad_norm": 2.072007894515991,
"learning_rate": 2.4087024087024088e-05,
"loss": 1.1098,
"step": 1360
},
{
"epoch": 1.0537448322276703,
"grad_norm": 1.8499518632888794,
"learning_rate": 2.3892773892773894e-05,
"loss": 1.1199,
"step": 1370
},
{
"epoch": 1.0614364003461205,
"grad_norm": 1.9085497856140137,
"learning_rate": 2.36985236985237e-05,
"loss": 1.1178,
"step": 1380
},
{
"epoch": 1.0691279684645707,
"grad_norm": 1.980193018913269,
"learning_rate": 2.3504273504273504e-05,
"loss": 1.1405,
"step": 1390
},
{
"epoch": 1.0768195365830209,
"grad_norm": 1.961827039718628,
"learning_rate": 2.331002331002331e-05,
"loss": 1.1294,
"step": 1400
},
{
"epoch": 1.084511104701471,
"grad_norm": 2.014617443084717,
"learning_rate": 2.3115773115773116e-05,
"loss": 1.1186,
"step": 1410
},
{
"epoch": 1.0922026728199212,
"grad_norm": 2.0065386295318604,
"learning_rate": 2.2921522921522923e-05,
"loss": 1.0977,
"step": 1420
},
{
"epoch": 1.0998942409383714,
"grad_norm": 1.9613438844680786,
"learning_rate": 2.272727272727273e-05,
"loss": 1.1649,
"step": 1430
},
{
"epoch": 1.1075858090568214,
"grad_norm": 2.0321905612945557,
"learning_rate": 2.2533022533022536e-05,
"loss": 1.1437,
"step": 1440
},
{
"epoch": 1.1152773771752715,
"grad_norm": 1.954882264137268,
"learning_rate": 2.233877233877234e-05,
"loss": 1.1166,
"step": 1450
},
{
"epoch": 1.1229689452937217,
"grad_norm": 2.1001336574554443,
"learning_rate": 2.2144522144522145e-05,
"loss": 1.1222,
"step": 1460
},
{
"epoch": 1.1306605134121719,
"grad_norm": 1.9967658519744873,
"learning_rate": 2.195027195027195e-05,
"loss": 1.113,
"step": 1470
},
{
"epoch": 1.138352081530622,
"grad_norm": 2.000359296798706,
"learning_rate": 2.1756021756021758e-05,
"loss": 1.1019,
"step": 1480
},
{
"epoch": 1.1460436496490722,
"grad_norm": 1.7127162218093872,
"learning_rate": 2.156177156177156e-05,
"loss": 1.1077,
"step": 1490
},
{
"epoch": 1.1537352177675224,
"grad_norm": 2.044915199279785,
"learning_rate": 2.1367521367521368e-05,
"loss": 1.1493,
"step": 1500
},
{
"epoch": 1.1614267858859726,
"grad_norm": 1.906977653503418,
"learning_rate": 2.1173271173271174e-05,
"loss": 1.0935,
"step": 1510
},
{
"epoch": 1.1691183540044228,
"grad_norm": 2.004258871078491,
"learning_rate": 2.097902097902098e-05,
"loss": 1.1548,
"step": 1520
},
{
"epoch": 1.1768099221228727,
"grad_norm": 2.028057813644409,
"learning_rate": 2.0784770784770787e-05,
"loss": 1.12,
"step": 1530
},
{
"epoch": 1.184501490241323,
"grad_norm": 2.0909903049468994,
"learning_rate": 2.0590520590520593e-05,
"loss": 1.1347,
"step": 1540
},
{
"epoch": 1.192193058359773,
"grad_norm": 2.309083938598633,
"learning_rate": 2.0396270396270396e-05,
"loss": 1.1467,
"step": 1550
},
{
"epoch": 1.1998846264782232,
"grad_norm": 2.195953845977783,
"learning_rate": 2.0202020202020203e-05,
"loss": 1.1166,
"step": 1560
},
{
"epoch": 1.2075761945966734,
"grad_norm": 1.8063645362854004,
"learning_rate": 2.000777000777001e-05,
"loss": 1.0918,
"step": 1570
},
{
"epoch": 1.2152677627151236,
"grad_norm": 2.0312583446502686,
"learning_rate": 1.9813519813519816e-05,
"loss": 1.1409,
"step": 1580
},
{
"epoch": 1.2229593308335738,
"grad_norm": 2.091669797897339,
"learning_rate": 1.961926961926962e-05,
"loss": 1.0938,
"step": 1590
},
{
"epoch": 1.2306508989520237,
"grad_norm": 1.7335410118103027,
"learning_rate": 1.9425019425019425e-05,
"loss": 1.1085,
"step": 1600
},
{
"epoch": 1.238342467070474,
"grad_norm": 2.0697057247161865,
"learning_rate": 1.923076923076923e-05,
"loss": 1.1355,
"step": 1610
},
{
"epoch": 1.246034035188924,
"grad_norm": 2.063481569290161,
"learning_rate": 1.9036519036519038e-05,
"loss": 1.1069,
"step": 1620
},
{
"epoch": 1.2537256033073743,
"grad_norm": 2.1378798484802246,
"learning_rate": 1.8842268842268844e-05,
"loss": 1.1273,
"step": 1630
},
{
"epoch": 1.2614171714258244,
"grad_norm": 2.2633299827575684,
"learning_rate": 1.864801864801865e-05,
"loss": 1.1173,
"step": 1640
},
{
"epoch": 1.2691087395442746,
"grad_norm": 1.8550769090652466,
"learning_rate": 1.8453768453768454e-05,
"loss": 1.0902,
"step": 1650
},
{
"epoch": 1.2768003076627248,
"grad_norm": 1.6822105646133423,
"learning_rate": 1.825951825951826e-05,
"loss": 1.0986,
"step": 1660
},
{
"epoch": 1.284491875781175,
"grad_norm": 1.8236192464828491,
"learning_rate": 1.8065268065268067e-05,
"loss": 1.1102,
"step": 1670
},
{
"epoch": 1.2921834438996251,
"grad_norm": 2.2287065982818604,
"learning_rate": 1.7871017871017873e-05,
"loss": 1.1302,
"step": 1680
},
{
"epoch": 1.299875012018075,
"grad_norm": 2.2197864055633545,
"learning_rate": 1.7676767676767676e-05,
"loss": 1.0902,
"step": 1690
},
{
"epoch": 1.3075665801365253,
"grad_norm": 2.1204230785369873,
"learning_rate": 1.7482517482517483e-05,
"loss": 1.1228,
"step": 1700
},
{
"epoch": 1.3152581482549754,
"grad_norm": 1.8001035451889038,
"learning_rate": 1.728826728826729e-05,
"loss": 1.1183,
"step": 1710
},
{
"epoch": 1.3229497163734256,
"grad_norm": 1.9405324459075928,
"learning_rate": 1.7094017094017095e-05,
"loss": 1.1377,
"step": 1720
},
{
"epoch": 1.3306412844918758,
"grad_norm": 1.9750436544418335,
"learning_rate": 1.68997668997669e-05,
"loss": 1.1051,
"step": 1730
},
{
"epoch": 1.338332852610326,
"grad_norm": 1.7894169092178345,
"learning_rate": 1.6705516705516705e-05,
"loss": 1.0972,
"step": 1740
},
{
"epoch": 1.3460244207287761,
"grad_norm": 1.7701419591903687,
"learning_rate": 1.651126651126651e-05,
"loss": 1.0815,
"step": 1750
},
{
"epoch": 1.353715988847226,
"grad_norm": 2.0538313388824463,
"learning_rate": 1.6317016317016318e-05,
"loss": 1.1072,
"step": 1760
},
{
"epoch": 1.3614075569656765,
"grad_norm": 2.0709080696105957,
"learning_rate": 1.6122766122766124e-05,
"loss": 1.0918,
"step": 1770
},
{
"epoch": 1.3690991250841265,
"grad_norm": 2.1939444541931152,
"learning_rate": 1.592851592851593e-05,
"loss": 1.0957,
"step": 1780
},
{
"epoch": 1.3767906932025766,
"grad_norm": 1.5446499586105347,
"learning_rate": 1.5734265734265734e-05,
"loss": 1.1032,
"step": 1790
},
{
"epoch": 1.3844822613210268,
"grad_norm": 1.971351981163025,
"learning_rate": 1.554001554001554e-05,
"loss": 1.1319,
"step": 1800
},
{
"epoch": 1.392173829439477,
"grad_norm": 1.9268254041671753,
"learning_rate": 1.5345765345765346e-05,
"loss": 1.1242,
"step": 1810
},
{
"epoch": 1.3998653975579272,
"grad_norm": 2.0808980464935303,
"learning_rate": 1.5151515151515153e-05,
"loss": 1.1173,
"step": 1820
},
{
"epoch": 1.4075569656763773,
"grad_norm": 2.044785976409912,
"learning_rate": 1.4957264957264958e-05,
"loss": 1.1481,
"step": 1830
},
{
"epoch": 1.4152485337948275,
"grad_norm": 1.690045714378357,
"learning_rate": 1.4763014763014762e-05,
"loss": 1.1239,
"step": 1840
},
{
"epoch": 1.4229401019132775,
"grad_norm": 1.950405478477478,
"learning_rate": 1.456876456876457e-05,
"loss": 1.1306,
"step": 1850
},
{
"epoch": 1.4306316700317276,
"grad_norm": 2.236177921295166,
"learning_rate": 1.4374514374514375e-05,
"loss": 1.0963,
"step": 1860
},
{
"epoch": 1.4383232381501778,
"grad_norm": 2.0903525352478027,
"learning_rate": 1.418026418026418e-05,
"loss": 1.1024,
"step": 1870
},
{
"epoch": 1.446014806268628,
"grad_norm": 2.220693349838257,
"learning_rate": 1.3986013986013988e-05,
"loss": 1.1132,
"step": 1880
},
{
"epoch": 1.4537063743870782,
"grad_norm": 1.9859932661056519,
"learning_rate": 1.3791763791763793e-05,
"loss": 1.1163,
"step": 1890
},
{
"epoch": 1.4613979425055283,
"grad_norm": 2.0174367427825928,
"learning_rate": 1.3597513597513598e-05,
"loss": 1.0975,
"step": 1900
},
{
"epoch": 1.4690895106239785,
"grad_norm": 2.1779918670654297,
"learning_rate": 1.3403263403263406e-05,
"loss": 1.1076,
"step": 1910
},
{
"epoch": 1.4767810787424285,
"grad_norm": 2.0282936096191406,
"learning_rate": 1.320901320901321e-05,
"loss": 1.1014,
"step": 1920
},
{
"epoch": 1.4844726468608789,
"grad_norm": 2.0938243865966797,
"learning_rate": 1.3014763014763015e-05,
"loss": 1.1028,
"step": 1930
},
{
"epoch": 1.4921642149793288,
"grad_norm": 2.0279526710510254,
"learning_rate": 1.282051282051282e-05,
"loss": 1.0922,
"step": 1940
},
{
"epoch": 1.499855783097779,
"grad_norm": 1.9357686042785645,
"learning_rate": 1.2626262626262628e-05,
"loss": 1.0896,
"step": 1950
},
{
"epoch": 1.5075473512162292,
"grad_norm": 2.1684436798095703,
"learning_rate": 1.2432012432012433e-05,
"loss": 1.1063,
"step": 1960
},
{
"epoch": 1.5152389193346794,
"grad_norm": 1.9890198707580566,
"learning_rate": 1.2237762237762239e-05,
"loss": 1.0927,
"step": 1970
},
{
"epoch": 1.5229304874531295,
"grad_norm": 1.9705474376678467,
"learning_rate": 1.2043512043512044e-05,
"loss": 1.1135,
"step": 1980
},
{
"epoch": 1.5306220555715797,
"grad_norm": 2.2323312759399414,
"learning_rate": 1.184926184926185e-05,
"loss": 1.1035,
"step": 1990
},
{
"epoch": 1.5383136236900299,
"grad_norm": 2.031677007675171,
"learning_rate": 1.1655011655011655e-05,
"loss": 1.0881,
"step": 2000
},
{
"epoch": 1.5460051918084798,
"grad_norm": 1.9229755401611328,
"learning_rate": 1.1460761460761461e-05,
"loss": 1.1678,
"step": 2010
},
{
"epoch": 1.5536967599269302,
"grad_norm": 2.2385263442993164,
"learning_rate": 1.1266511266511268e-05,
"loss": 1.0922,
"step": 2020
},
{
"epoch": 1.5613883280453802,
"grad_norm": 2.14032244682312,
"learning_rate": 1.1072261072261073e-05,
"loss": 1.0948,
"step": 2030
},
{
"epoch": 1.5690798961638304,
"grad_norm": 2.2739627361297607,
"learning_rate": 1.0878010878010879e-05,
"loss": 1.1033,
"step": 2040
},
{
"epoch": 1.5767714642822805,
"grad_norm": 2.117405891418457,
"learning_rate": 1.0683760683760684e-05,
"loss": 1.1144,
"step": 2050
},
{
"epoch": 1.5844630324007307,
"grad_norm": 1.917315125465393,
"learning_rate": 1.048951048951049e-05,
"loss": 1.1462,
"step": 2060
},
{
"epoch": 1.592154600519181,
"grad_norm": 1.9057689905166626,
"learning_rate": 1.0295260295260297e-05,
"loss": 1.1243,
"step": 2070
},
{
"epoch": 1.5998461686376308,
"grad_norm": 2.0241281986236572,
"learning_rate": 1.0101010101010101e-05,
"loss": 1.1061,
"step": 2080
},
{
"epoch": 1.6075377367560812,
"grad_norm": 2.33536958694458,
"learning_rate": 9.906759906759908e-06,
"loss": 1.0989,
"step": 2090
},
{
"epoch": 1.6152293048745312,
"grad_norm": 2.1172101497650146,
"learning_rate": 9.712509712509713e-06,
"loss": 1.0936,
"step": 2100
},
{
"epoch": 1.6229208729929816,
"grad_norm": 1.8319240808486938,
"learning_rate": 9.518259518259519e-06,
"loss": 1.1099,
"step": 2110
},
{
"epoch": 1.6306124411114316,
"grad_norm": 1.9867093563079834,
"learning_rate": 9.324009324009325e-06,
"loss": 1.0983,
"step": 2120
},
{
"epoch": 1.6383040092298817,
"grad_norm": 1.9787898063659668,
"learning_rate": 9.12975912975913e-06,
"loss": 1.1042,
"step": 2130
},
{
"epoch": 1.645995577348332,
"grad_norm": 1.9842088222503662,
"learning_rate": 8.935508935508937e-06,
"loss": 1.1094,
"step": 2140
},
{
"epoch": 1.653687145466782,
"grad_norm": 1.9378868341445923,
"learning_rate": 8.741258741258741e-06,
"loss": 1.0995,
"step": 2150
},
{
"epoch": 1.6613787135852323,
"grad_norm": 1.8845783472061157,
"learning_rate": 8.547008547008548e-06,
"loss": 1.0884,
"step": 2160
},
{
"epoch": 1.6690702817036822,
"grad_norm": 1.9667729139328003,
"learning_rate": 8.352758352758352e-06,
"loss": 1.0916,
"step": 2170
},
{
"epoch": 1.6767618498221326,
"grad_norm": 2.1440649032592773,
"learning_rate": 8.158508158508159e-06,
"loss": 1.1166,
"step": 2180
},
{
"epoch": 1.6844534179405826,
"grad_norm": 2.1377923488616943,
"learning_rate": 7.964257964257965e-06,
"loss": 1.0733,
"step": 2190
},
{
"epoch": 1.6921449860590327,
"grad_norm": 1.9222036600112915,
"learning_rate": 7.77000777000777e-06,
"loss": 1.1064,
"step": 2200
},
{
"epoch": 1.699836554177483,
"grad_norm": 1.9658797979354858,
"learning_rate": 7.5757575757575764e-06,
"loss": 1.1212,
"step": 2210
},
{
"epoch": 1.707528122295933,
"grad_norm": 1.9604525566101074,
"learning_rate": 7.381507381507381e-06,
"loss": 1.094,
"step": 2220
},
{
"epoch": 1.7152196904143833,
"grad_norm": 2.1394431591033936,
"learning_rate": 7.187257187257188e-06,
"loss": 1.0974,
"step": 2230
},
{
"epoch": 1.7229112585328332,
"grad_norm": 1.819686770439148,
"learning_rate": 6.993006993006994e-06,
"loss": 1.0815,
"step": 2240
},
{
"epoch": 1.7306028266512836,
"grad_norm": 1.8422613143920898,
"learning_rate": 6.798756798756799e-06,
"loss": 1.1034,
"step": 2250
},
{
"epoch": 1.7382943947697336,
"grad_norm": 1.9406566619873047,
"learning_rate": 6.604506604506605e-06,
"loss": 1.0835,
"step": 2260
},
{
"epoch": 1.745985962888184,
"grad_norm": 1.831289291381836,
"learning_rate": 6.41025641025641e-06,
"loss": 1.0857,
"step": 2270
},
{
"epoch": 1.753677531006634,
"grad_norm": 1.942730188369751,
"learning_rate": 6.216006216006216e-06,
"loss": 1.1112,
"step": 2280
},
{
"epoch": 1.761369099125084,
"grad_norm": 1.8144134283065796,
"learning_rate": 6.021756021756022e-06,
"loss": 1.0873,
"step": 2290
},
{
"epoch": 1.7690606672435343,
"grad_norm": 1.7852503061294556,
"learning_rate": 5.8275058275058275e-06,
"loss": 1.0756,
"step": 2300
},
{
"epoch": 1.7767522353619845,
"grad_norm": 2.232545852661133,
"learning_rate": 5.633255633255634e-06,
"loss": 1.114,
"step": 2310
},
{
"epoch": 1.7844438034804346,
"grad_norm": 2.143216133117676,
"learning_rate": 5.4390054390054395e-06,
"loss": 1.1015,
"step": 2320
},
{
"epoch": 1.7921353715988846,
"grad_norm": 1.8443506956100464,
"learning_rate": 5.244755244755245e-06,
"loss": 1.1006,
"step": 2330
},
{
"epoch": 1.799826939717335,
"grad_norm": 1.7747691869735718,
"learning_rate": 5.050505050505051e-06,
"loss": 1.099,
"step": 2340
},
{
"epoch": 1.807518507835785,
"grad_norm": 1.8745768070220947,
"learning_rate": 4.856254856254856e-06,
"loss": 1.0979,
"step": 2350
},
{
"epoch": 1.8152100759542353,
"grad_norm": 1.9972470998764038,
"learning_rate": 4.662004662004663e-06,
"loss": 1.1083,
"step": 2360
},
{
"epoch": 1.8229016440726853,
"grad_norm": 1.8246738910675049,
"learning_rate": 4.467754467754468e-06,
"loss": 1.1061,
"step": 2370
},
{
"epoch": 1.8305932121911355,
"grad_norm": 1.9446369409561157,
"learning_rate": 4.273504273504274e-06,
"loss": 1.0843,
"step": 2380
},
{
"epoch": 1.8382847803095856,
"grad_norm": 1.9822577238082886,
"learning_rate": 4.079254079254079e-06,
"loss": 1.0722,
"step": 2390
},
{
"epoch": 1.8459763484280358,
"grad_norm": 2.3684942722320557,
"learning_rate": 3.885003885003885e-06,
"loss": 1.1354,
"step": 2400
},
{
"epoch": 1.853667916546486,
"grad_norm": 1.8989230394363403,
"learning_rate": 3.6907536907536906e-06,
"loss": 1.0931,
"step": 2410
},
{
"epoch": 1.861359484664936,
"grad_norm": 1.952711582183838,
"learning_rate": 3.496503496503497e-06,
"loss": 1.0823,
"step": 2420
},
{
"epoch": 1.8690510527833863,
"grad_norm": 2.154554605484009,
"learning_rate": 3.3022533022533026e-06,
"loss": 1.0846,
"step": 2430
},
{
"epoch": 1.8767426209018363,
"grad_norm": 1.8646643161773682,
"learning_rate": 3.108003108003108e-06,
"loss": 1.0961,
"step": 2440
},
{
"epoch": 1.8844341890202865,
"grad_norm": 1.9604412317276,
"learning_rate": 2.9137529137529138e-06,
"loss": 1.0823,
"step": 2450
},
{
"epoch": 1.8921257571387367,
"grad_norm": 1.7040070295333862,
"learning_rate": 2.7195027195027198e-06,
"loss": 1.0831,
"step": 2460
},
{
"epoch": 1.8998173252571868,
"grad_norm": 2.3262603282928467,
"learning_rate": 2.5252525252525253e-06,
"loss": 1.0668,
"step": 2470
},
{
"epoch": 1.907508893375637,
"grad_norm": 2.0159196853637695,
"learning_rate": 2.3310023310023313e-06,
"loss": 1.094,
"step": 2480
},
{
"epoch": 1.915200461494087,
"grad_norm": 1.9653328657150269,
"learning_rate": 2.136752136752137e-06,
"loss": 1.0677,
"step": 2490
},
{
"epoch": 1.9228920296125374,
"grad_norm": 1.7489328384399414,
"learning_rate": 1.9425019425019425e-06,
"loss": 1.0947,
"step": 2500
},
{
"epoch": 1.9305835977309873,
"grad_norm": 1.9971283674240112,
"learning_rate": 1.7482517482517485e-06,
"loss": 1.135,
"step": 2510
},
{
"epoch": 1.9382751658494377,
"grad_norm": 1.9700243473052979,
"learning_rate": 1.554001554001554e-06,
"loss": 1.0783,
"step": 2520
},
{
"epoch": 1.9459667339678877,
"grad_norm": 1.8461562395095825,
"learning_rate": 1.3597513597513599e-06,
"loss": 1.113,
"step": 2530
},
{
"epoch": 1.9536583020863378,
"grad_norm": 1.6744381189346313,
"learning_rate": 1.1655011655011657e-06,
"loss": 1.0829,
"step": 2540
},
{
"epoch": 1.961349870204788,
"grad_norm": 1.9034210443496704,
"learning_rate": 9.712509712509713e-07,
"loss": 1.0781,
"step": 2550
},
{
"epoch": 1.9690414383232382,
"grad_norm": 1.98823881149292,
"learning_rate": 7.77000777000777e-07,
"loss": 1.1031,
"step": 2560
},
{
"epoch": 1.9767330064416884,
"grad_norm": 1.905478596687317,
"learning_rate": 5.827505827505828e-07,
"loss": 1.081,
"step": 2570
},
{
"epoch": 1.9844245745601383,
"grad_norm": 1.8001110553741455,
"learning_rate": 3.885003885003885e-07,
"loss": 1.0793,
"step": 2580
},
{
"epoch": 1.9921161426785887,
"grad_norm": 2.089235305786133,
"learning_rate": 1.9425019425019426e-07,
"loss": 1.1,
"step": 2590
},
{
"epoch": 1.9998077107970387,
"grad_norm": 2.018970489501953,
"learning_rate": 0.0,
"loss": 1.0801,
"step": 2600
}
],
"logging_steps": 10,
"max_steps": 2600,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.1771710700767724e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}