llama-3.2-1b-coral.org-expert / trainer_state.json
sanjay920's picture
Upload folder using huggingface_hub
de034be verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.999336957963135,
"eval_steps": 500,
"global_step": 9425,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005304336294921098,
"grad_norm": 3.020922899246216,
"learning_rate": 2.1208907741251328e-07,
"loss": 0.3096,
"step": 10
},
{
"epoch": 0.010608672589842195,
"grad_norm": 2.9058778285980225,
"learning_rate": 4.2417815482502656e-07,
"loss": 0.2989,
"step": 20
},
{
"epoch": 0.015913008884763293,
"grad_norm": 3.3067986965179443,
"learning_rate": 6.362672322375398e-07,
"loss": 0.2989,
"step": 30
},
{
"epoch": 0.02121734517968439,
"grad_norm": 3.3193933963775635,
"learning_rate": 8.483563096500531e-07,
"loss": 0.3077,
"step": 40
},
{
"epoch": 0.02652168147460549,
"grad_norm": 3.428807497024536,
"learning_rate": 1.0604453870625663e-06,
"loss": 0.3117,
"step": 50
},
{
"epoch": 0.031826017769526586,
"grad_norm": 3.1738169193267822,
"learning_rate": 1.2725344644750796e-06,
"loss": 0.307,
"step": 60
},
{
"epoch": 0.03713035406444769,
"grad_norm": 2.9309284687042236,
"learning_rate": 1.4846235418875928e-06,
"loss": 0.3017,
"step": 70
},
{
"epoch": 0.04243469035936878,
"grad_norm": 3.575754165649414,
"learning_rate": 1.6967126193001062e-06,
"loss": 0.2868,
"step": 80
},
{
"epoch": 0.04773902665428988,
"grad_norm": 2.9849185943603516,
"learning_rate": 1.9088016967126195e-06,
"loss": 0.3072,
"step": 90
},
{
"epoch": 0.05304336294921098,
"grad_norm": 3.3160886764526367,
"learning_rate": 2.1208907741251327e-06,
"loss": 0.3127,
"step": 100
},
{
"epoch": 0.05834769924413208,
"grad_norm": 3.4454386234283447,
"learning_rate": 2.332979851537646e-06,
"loss": 0.3105,
"step": 110
},
{
"epoch": 0.06365203553905317,
"grad_norm": 3.1443614959716797,
"learning_rate": 2.545068928950159e-06,
"loss": 0.3011,
"step": 120
},
{
"epoch": 0.06895637183397427,
"grad_norm": 3.260246515274048,
"learning_rate": 2.7571580063626724e-06,
"loss": 0.2922,
"step": 130
},
{
"epoch": 0.07426070812889538,
"grad_norm": 3.3770997524261475,
"learning_rate": 2.9692470837751856e-06,
"loss": 0.2962,
"step": 140
},
{
"epoch": 0.07956504442381647,
"grad_norm": 3.2429087162017822,
"learning_rate": 3.1813361611876992e-06,
"loss": 0.3195,
"step": 150
},
{
"epoch": 0.08486938071873756,
"grad_norm": 2.878188371658325,
"learning_rate": 3.3934252386002125e-06,
"loss": 0.3045,
"step": 160
},
{
"epoch": 0.09017371701365867,
"grad_norm": 3.4501426219940186,
"learning_rate": 3.6055143160127253e-06,
"loss": 0.2964,
"step": 170
},
{
"epoch": 0.09547805330857977,
"grad_norm": 3.384909152984619,
"learning_rate": 3.817603393425239e-06,
"loss": 0.2967,
"step": 180
},
{
"epoch": 0.10078238960350086,
"grad_norm": 3.2439138889312744,
"learning_rate": 4.029692470837753e-06,
"loss": 0.3126,
"step": 190
},
{
"epoch": 0.10608672589842195,
"grad_norm": 3.6719398498535156,
"learning_rate": 4.241781548250265e-06,
"loss": 0.3142,
"step": 200
},
{
"epoch": 0.11139106219334306,
"grad_norm": 3.7998712062835693,
"learning_rate": 4.453870625662779e-06,
"loss": 0.3003,
"step": 210
},
{
"epoch": 0.11669539848826416,
"grad_norm": 3.800631523132324,
"learning_rate": 4.665959703075292e-06,
"loss": 0.3009,
"step": 220
},
{
"epoch": 0.12199973478318525,
"grad_norm": 3.281419038772583,
"learning_rate": 4.8780487804878055e-06,
"loss": 0.3126,
"step": 230
},
{
"epoch": 0.12730407107810635,
"grad_norm": 3.1733639240264893,
"learning_rate": 5.090137857900318e-06,
"loss": 0.313,
"step": 240
},
{
"epoch": 0.13260840737302745,
"grad_norm": 3.4202017784118652,
"learning_rate": 5.302226935312832e-06,
"loss": 0.317,
"step": 250
},
{
"epoch": 0.13791274366794853,
"grad_norm": 3.541494369506836,
"learning_rate": 5.514316012725345e-06,
"loss": 0.3093,
"step": 260
},
{
"epoch": 0.14321707996286964,
"grad_norm": 3.3184351921081543,
"learning_rate": 5.726405090137858e-06,
"loss": 0.3197,
"step": 270
},
{
"epoch": 0.14852141625779075,
"grad_norm": 3.494412660598755,
"learning_rate": 5.938494167550371e-06,
"loss": 0.3238,
"step": 280
},
{
"epoch": 0.15382575255271183,
"grad_norm": 3.590353488922119,
"learning_rate": 6.150583244962884e-06,
"loss": 0.3147,
"step": 290
},
{
"epoch": 0.15913008884763294,
"grad_norm": 4.020939350128174,
"learning_rate": 6.3626723223753985e-06,
"loss": 0.3161,
"step": 300
},
{
"epoch": 0.16443442514255405,
"grad_norm": 3.1689860820770264,
"learning_rate": 6.574761399787911e-06,
"loss": 0.3161,
"step": 310
},
{
"epoch": 0.16973876143747513,
"grad_norm": 3.292384386062622,
"learning_rate": 6.786850477200425e-06,
"loss": 0.3076,
"step": 320
},
{
"epoch": 0.17504309773239624,
"grad_norm": 3.379220962524414,
"learning_rate": 6.998939554612938e-06,
"loss": 0.3119,
"step": 330
},
{
"epoch": 0.18034743402731734,
"grad_norm": 3.823171377182007,
"learning_rate": 7.2110286320254506e-06,
"loss": 0.3157,
"step": 340
},
{
"epoch": 0.18565177032223842,
"grad_norm": 3.715949058532715,
"learning_rate": 7.423117709437965e-06,
"loss": 0.3354,
"step": 350
},
{
"epoch": 0.19095610661715953,
"grad_norm": 3.638728141784668,
"learning_rate": 7.635206786850478e-06,
"loss": 0.3333,
"step": 360
},
{
"epoch": 0.1962604429120806,
"grad_norm": 3.572036027908325,
"learning_rate": 7.847295864262992e-06,
"loss": 0.336,
"step": 370
},
{
"epoch": 0.20156477920700172,
"grad_norm": 3.472360610961914,
"learning_rate": 8.059384941675505e-06,
"loss": 0.333,
"step": 380
},
{
"epoch": 0.20686911550192283,
"grad_norm": 3.5541131496429443,
"learning_rate": 8.271474019088017e-06,
"loss": 0.3476,
"step": 390
},
{
"epoch": 0.2121734517968439,
"grad_norm": 3.7231061458587646,
"learning_rate": 8.48356309650053e-06,
"loss": 0.3391,
"step": 400
},
{
"epoch": 0.21747778809176502,
"grad_norm": 3.7742016315460205,
"learning_rate": 8.695652173913044e-06,
"loss": 0.3374,
"step": 410
},
{
"epoch": 0.22278212438668613,
"grad_norm": 4.2606892585754395,
"learning_rate": 8.907741251325558e-06,
"loss": 0.3653,
"step": 420
},
{
"epoch": 0.2280864606816072,
"grad_norm": 4.846815586090088,
"learning_rate": 9.11983032873807e-06,
"loss": 0.3342,
"step": 430
},
{
"epoch": 0.23339079697652831,
"grad_norm": 3.6985626220703125,
"learning_rate": 9.331919406150584e-06,
"loss": 0.3438,
"step": 440
},
{
"epoch": 0.23869513327144942,
"grad_norm": 3.6264142990112305,
"learning_rate": 9.544008483563097e-06,
"loss": 0.3692,
"step": 450
},
{
"epoch": 0.2439994695663705,
"grad_norm": 3.990809440612793,
"learning_rate": 9.756097560975611e-06,
"loss": 0.3545,
"step": 460
},
{
"epoch": 0.2493038058612916,
"grad_norm": 3.816340208053589,
"learning_rate": 9.968186638388125e-06,
"loss": 0.3347,
"step": 470
},
{
"epoch": 0.2546081421562127,
"grad_norm": 4.031066417694092,
"learning_rate": 1.0180275715800637e-05,
"loss": 0.3554,
"step": 480
},
{
"epoch": 0.2599124784511338,
"grad_norm": 3.9148292541503906,
"learning_rate": 1.039236479321315e-05,
"loss": 0.3644,
"step": 490
},
{
"epoch": 0.2652168147460549,
"grad_norm": 3.6338350772857666,
"learning_rate": 1.0604453870625664e-05,
"loss": 0.3755,
"step": 500
},
{
"epoch": 0.270521151040976,
"grad_norm": 3.52591872215271,
"learning_rate": 1.0816542948038178e-05,
"loss": 0.3513,
"step": 510
},
{
"epoch": 0.27582548733589707,
"grad_norm": 4.284359455108643,
"learning_rate": 1.102863202545069e-05,
"loss": 0.3726,
"step": 520
},
{
"epoch": 0.2811298236308182,
"grad_norm": 3.469064712524414,
"learning_rate": 1.1240721102863203e-05,
"loss": 0.3705,
"step": 530
},
{
"epoch": 0.2864341599257393,
"grad_norm": 3.983943223953247,
"learning_rate": 1.1452810180275717e-05,
"loss": 0.3772,
"step": 540
},
{
"epoch": 0.2917384962206604,
"grad_norm": 4.600942134857178,
"learning_rate": 1.1664899257688229e-05,
"loss": 0.3685,
"step": 550
},
{
"epoch": 0.2970428325155815,
"grad_norm": 3.9568793773651123,
"learning_rate": 1.1876988335100742e-05,
"loss": 0.3545,
"step": 560
},
{
"epoch": 0.3023471688105026,
"grad_norm": 4.284022808074951,
"learning_rate": 1.2089077412513258e-05,
"loss": 0.3806,
"step": 570
},
{
"epoch": 0.30765150510542366,
"grad_norm": 3.7012698650360107,
"learning_rate": 1.2301166489925768e-05,
"loss": 0.3717,
"step": 580
},
{
"epoch": 0.31295584140034477,
"grad_norm": 3.5977935791015625,
"learning_rate": 1.2513255567338283e-05,
"loss": 0.3741,
"step": 590
},
{
"epoch": 0.3182601776952659,
"grad_norm": 4.306045055389404,
"learning_rate": 1.2725344644750797e-05,
"loss": 0.3774,
"step": 600
},
{
"epoch": 0.323564513990187,
"grad_norm": 4.045034408569336,
"learning_rate": 1.293743372216331e-05,
"loss": 0.3828,
"step": 610
},
{
"epoch": 0.3288688502851081,
"grad_norm": 3.6002280712127686,
"learning_rate": 1.3149522799575823e-05,
"loss": 0.3613,
"step": 620
},
{
"epoch": 0.33417318658002915,
"grad_norm": 3.792759418487549,
"learning_rate": 1.3361611876988336e-05,
"loss": 0.3916,
"step": 630
},
{
"epoch": 0.33947752287495025,
"grad_norm": 4.016223907470703,
"learning_rate": 1.357370095440085e-05,
"loss": 0.3832,
"step": 640
},
{
"epoch": 0.34478185916987136,
"grad_norm": 3.6611733436584473,
"learning_rate": 1.3785790031813362e-05,
"loss": 0.3726,
"step": 650
},
{
"epoch": 0.35008619546479247,
"grad_norm": 4.31847620010376,
"learning_rate": 1.3997879109225876e-05,
"loss": 0.4027,
"step": 660
},
{
"epoch": 0.3553905317597136,
"grad_norm": 3.5145909786224365,
"learning_rate": 1.4209968186638389e-05,
"loss": 0.4077,
"step": 670
},
{
"epoch": 0.3606948680546347,
"grad_norm": 4.157687664031982,
"learning_rate": 1.4422057264050901e-05,
"loss": 0.3887,
"step": 680
},
{
"epoch": 0.36599920434955574,
"grad_norm": 3.945263624191284,
"learning_rate": 1.4634146341463415e-05,
"loss": 0.4042,
"step": 690
},
{
"epoch": 0.37130354064447685,
"grad_norm": 4.426787853240967,
"learning_rate": 1.484623541887593e-05,
"loss": 0.4188,
"step": 700
},
{
"epoch": 0.37660787693939796,
"grad_norm": 3.5950045585632324,
"learning_rate": 1.5058324496288444e-05,
"loss": 0.4235,
"step": 710
},
{
"epoch": 0.38191221323431906,
"grad_norm": 3.7992334365844727,
"learning_rate": 1.5270413573700956e-05,
"loss": 0.3891,
"step": 720
},
{
"epoch": 0.3872165495292402,
"grad_norm": 4.29195499420166,
"learning_rate": 1.548250265111347e-05,
"loss": 0.4039,
"step": 730
},
{
"epoch": 0.3925208858241612,
"grad_norm": 4.078744411468506,
"learning_rate": 1.5694591728525983e-05,
"loss": 0.4045,
"step": 740
},
{
"epoch": 0.39782522211908233,
"grad_norm": 4.213825702667236,
"learning_rate": 1.5906680805938493e-05,
"loss": 0.428,
"step": 750
},
{
"epoch": 0.40312955841400344,
"grad_norm": 3.6432559490203857,
"learning_rate": 1.611876988335101e-05,
"loss": 0.3985,
"step": 760
},
{
"epoch": 0.40843389470892455,
"grad_norm": 3.458439826965332,
"learning_rate": 1.6330858960763524e-05,
"loss": 0.4257,
"step": 770
},
{
"epoch": 0.41373823100384566,
"grad_norm": 3.8959126472473145,
"learning_rate": 1.6542948038176034e-05,
"loss": 0.4293,
"step": 780
},
{
"epoch": 0.41904256729876677,
"grad_norm": 4.051570415496826,
"learning_rate": 1.6755037115588548e-05,
"loss": 0.4293,
"step": 790
},
{
"epoch": 0.4243469035936878,
"grad_norm": 3.807042121887207,
"learning_rate": 1.696712619300106e-05,
"loss": 0.4263,
"step": 800
},
{
"epoch": 0.4296512398886089,
"grad_norm": 3.8195431232452393,
"learning_rate": 1.7179215270413575e-05,
"loss": 0.4341,
"step": 810
},
{
"epoch": 0.43495557618353003,
"grad_norm": 3.7309257984161377,
"learning_rate": 1.739130434782609e-05,
"loss": 0.4238,
"step": 820
},
{
"epoch": 0.44025991247845114,
"grad_norm": 3.7905941009521484,
"learning_rate": 1.7603393425238602e-05,
"loss": 0.423,
"step": 830
},
{
"epoch": 0.44556424877337225,
"grad_norm": 3.9321465492248535,
"learning_rate": 1.7815482502651116e-05,
"loss": 0.4333,
"step": 840
},
{
"epoch": 0.4508685850682933,
"grad_norm": 4.208277702331543,
"learning_rate": 1.8027571580063626e-05,
"loss": 0.4355,
"step": 850
},
{
"epoch": 0.4561729213632144,
"grad_norm": 4.083523273468018,
"learning_rate": 1.823966065747614e-05,
"loss": 0.4345,
"step": 860
},
{
"epoch": 0.4614772576581355,
"grad_norm": 4.052921772003174,
"learning_rate": 1.8451749734888657e-05,
"loss": 0.4345,
"step": 870
},
{
"epoch": 0.46678159395305663,
"grad_norm": 3.9257407188415527,
"learning_rate": 1.8663838812301167e-05,
"loss": 0.438,
"step": 880
},
{
"epoch": 0.47208593024797774,
"grad_norm": 3.8625378608703613,
"learning_rate": 1.887592788971368e-05,
"loss": 0.4479,
"step": 890
},
{
"epoch": 0.47739026654289884,
"grad_norm": 3.6812710762023926,
"learning_rate": 1.9088016967126195e-05,
"loss": 0.4728,
"step": 900
},
{
"epoch": 0.4826946028378199,
"grad_norm": 3.700044631958008,
"learning_rate": 1.9300106044538708e-05,
"loss": 0.4685,
"step": 910
},
{
"epoch": 0.487998939132741,
"grad_norm": 3.957547187805176,
"learning_rate": 1.9512195121951222e-05,
"loss": 0.4518,
"step": 920
},
{
"epoch": 0.4933032754276621,
"grad_norm": 4.383725643157959,
"learning_rate": 1.9724284199363736e-05,
"loss": 0.4822,
"step": 930
},
{
"epoch": 0.4986076117225832,
"grad_norm": 3.6373391151428223,
"learning_rate": 1.993637327677625e-05,
"loss": 0.4518,
"step": 940
},
{
"epoch": 0.5039119480175043,
"grad_norm": 3.9900004863739014,
"learning_rate": 1.9999966389958385e-05,
"loss": 0.4512,
"step": 950
},
{
"epoch": 0.5092162843124254,
"grad_norm": 3.9511730670928955,
"learning_rate": 1.9999801769890262e-05,
"loss": 0.4611,
"step": 960
},
{
"epoch": 0.5145206206073465,
"grad_norm": 4.0564751625061035,
"learning_rate": 1.9999499968778183e-05,
"loss": 0.4532,
"step": 970
},
{
"epoch": 0.5198249569022676,
"grad_norm": 4.041463375091553,
"learning_rate": 1.999906099076237e-05,
"loss": 0.4699,
"step": 980
},
{
"epoch": 0.5251292931971887,
"grad_norm": 4.434980869293213,
"learning_rate": 1.9998484841864885e-05,
"loss": 0.4849,
"step": 990
},
{
"epoch": 0.5304336294921098,
"grad_norm": 3.809227228164673,
"learning_rate": 1.999777152998956e-05,
"loss": 0.4654,
"step": 1000
},
{
"epoch": 0.5357379657870309,
"grad_norm": 3.6580355167388916,
"learning_rate": 1.999692106492187e-05,
"loss": 0.4694,
"step": 1010
},
{
"epoch": 0.541042302081952,
"grad_norm": 4.149374485015869,
"learning_rate": 1.9995933458328816e-05,
"loss": 0.4854,
"step": 1020
},
{
"epoch": 0.5463466383768731,
"grad_norm": 4.361349105834961,
"learning_rate": 1.999480872375876e-05,
"loss": 0.4966,
"step": 1030
},
{
"epoch": 0.5516509746717941,
"grad_norm": 4.03767728805542,
"learning_rate": 1.999354687664123e-05,
"loss": 0.4779,
"step": 1040
},
{
"epoch": 0.5569553109667152,
"grad_norm": 3.866338014602661,
"learning_rate": 1.9992147934286726e-05,
"loss": 0.4843,
"step": 1050
},
{
"epoch": 0.5622596472616364,
"grad_norm": 3.858006238937378,
"learning_rate": 1.999061191588646e-05,
"loss": 0.4805,
"step": 1060
},
{
"epoch": 0.5675639835565575,
"grad_norm": 4.386377811431885,
"learning_rate": 1.9988938842512117e-05,
"loss": 0.4837,
"step": 1070
},
{
"epoch": 0.5728683198514786,
"grad_norm": 3.8602404594421387,
"learning_rate": 1.998712873711554e-05,
"loss": 0.4861,
"step": 1080
},
{
"epoch": 0.5781726561463997,
"grad_norm": 3.921699047088623,
"learning_rate": 1.9985181624528435e-05,
"loss": 0.4724,
"step": 1090
},
{
"epoch": 0.5834769924413208,
"grad_norm": 4.1873884201049805,
"learning_rate": 1.998309753146203e-05,
"loss": 0.4928,
"step": 1100
},
{
"epoch": 0.5887813287362419,
"grad_norm": 4.134608745574951,
"learning_rate": 1.99808764865067e-05,
"loss": 0.4961,
"step": 1110
},
{
"epoch": 0.594085665031163,
"grad_norm": 4.108495712280273,
"learning_rate": 1.9978518520131574e-05,
"loss": 0.5054,
"step": 1120
},
{
"epoch": 0.5993900013260841,
"grad_norm": 4.025395393371582,
"learning_rate": 1.9976023664684114e-05,
"loss": 0.4999,
"step": 1130
},
{
"epoch": 0.6046943376210052,
"grad_norm": 4.111999034881592,
"learning_rate": 1.9973391954389697e-05,
"loss": 0.4848,
"step": 1140
},
{
"epoch": 0.6099986739159262,
"grad_norm": 3.806658983230591,
"learning_rate": 1.997062342535111e-05,
"loss": 0.4896,
"step": 1150
},
{
"epoch": 0.6153030102108473,
"grad_norm": 3.9219367504119873,
"learning_rate": 1.996771811554808e-05,
"loss": 0.4868,
"step": 1160
},
{
"epoch": 0.6206073465057684,
"grad_norm": 4.265909194946289,
"learning_rate": 1.9964676064836733e-05,
"loss": 0.4984,
"step": 1170
},
{
"epoch": 0.6259116828006895,
"grad_norm": 3.8776512145996094,
"learning_rate": 1.996149731494907e-05,
"loss": 0.4891,
"step": 1180
},
{
"epoch": 0.6312160190956106,
"grad_norm": 4.08100700378418,
"learning_rate": 1.995818190949238e-05,
"loss": 0.4922,
"step": 1190
},
{
"epoch": 0.6365203553905318,
"grad_norm": 3.563994884490967,
"learning_rate": 1.995472989394864e-05,
"loss": 0.4939,
"step": 1200
},
{
"epoch": 0.6418246916854529,
"grad_norm": 4.029845714569092,
"learning_rate": 1.9951141315673897e-05,
"loss": 0.4856,
"step": 1210
},
{
"epoch": 0.647129027980374,
"grad_norm": 3.762803554534912,
"learning_rate": 1.9947416223897624e-05,
"loss": 0.4912,
"step": 1220
},
{
"epoch": 0.6524333642752951,
"grad_norm": 4.067233562469482,
"learning_rate": 1.9943554669722027e-05,
"loss": 0.4884,
"step": 1230
},
{
"epoch": 0.6577377005702162,
"grad_norm": 3.9958083629608154,
"learning_rate": 1.993955670612136e-05,
"loss": 0.4926,
"step": 1240
},
{
"epoch": 0.6630420368651373,
"grad_norm": 4.066989421844482,
"learning_rate": 1.9935422387941194e-05,
"loss": 0.5063,
"step": 1250
},
{
"epoch": 0.6683463731600583,
"grad_norm": 4.203117370605469,
"learning_rate": 1.9931151771897658e-05,
"loss": 0.4957,
"step": 1260
},
{
"epoch": 0.6736507094549794,
"grad_norm": 3.6308250427246094,
"learning_rate": 1.9926744916576674e-05,
"loss": 0.4939,
"step": 1270
},
{
"epoch": 0.6789550457499005,
"grad_norm": 4.011183738708496,
"learning_rate": 1.992220188243314e-05,
"loss": 0.5002,
"step": 1280
},
{
"epoch": 0.6842593820448216,
"grad_norm": 4.1181840896606445,
"learning_rate": 1.991752273179011e-05,
"loss": 0.5035,
"step": 1290
},
{
"epoch": 0.6895637183397427,
"grad_norm": 3.892821788787842,
"learning_rate": 1.9912707528837935e-05,
"loss": 0.5061,
"step": 1300
},
{
"epoch": 0.6948680546346638,
"grad_norm": 3.6360907554626465,
"learning_rate": 1.990775633963337e-05,
"loss": 0.5024,
"step": 1310
},
{
"epoch": 0.7001723909295849,
"grad_norm": 3.6487557888031006,
"learning_rate": 1.9902669232098707e-05,
"loss": 0.511,
"step": 1320
},
{
"epoch": 0.705476727224506,
"grad_norm": 3.6783857345581055,
"learning_rate": 1.989744627602079e-05,
"loss": 0.5203,
"step": 1330
},
{
"epoch": 0.7107810635194272,
"grad_norm": 4.257678508758545,
"learning_rate": 1.9892087543050102e-05,
"loss": 0.4958,
"step": 1340
},
{
"epoch": 0.7160853998143483,
"grad_norm": 4.080876350402832,
"learning_rate": 1.988659310669976e-05,
"loss": 0.5152,
"step": 1350
},
{
"epoch": 0.7213897361092694,
"grad_norm": 3.8536195755004883,
"learning_rate": 1.9880963042344502e-05,
"loss": 0.4987,
"step": 1360
},
{
"epoch": 0.7266940724041904,
"grad_norm": 3.5171945095062256,
"learning_rate": 1.987519742721968e-05,
"loss": 0.5109,
"step": 1370
},
{
"epoch": 0.7319984086991115,
"grad_norm": 3.6282453536987305,
"learning_rate": 1.9869296340420162e-05,
"loss": 0.5077,
"step": 1380
},
{
"epoch": 0.7373027449940326,
"grad_norm": 3.358875274658203,
"learning_rate": 1.9863259862899285e-05,
"loss": 0.5084,
"step": 1390
},
{
"epoch": 0.7426070812889537,
"grad_norm": 3.6957879066467285,
"learning_rate": 1.9857088077467713e-05,
"loss": 0.5272,
"step": 1400
},
{
"epoch": 0.7479114175838748,
"grad_norm": 3.5185537338256836,
"learning_rate": 1.9850781068792327e-05,
"loss": 0.5016,
"step": 1410
},
{
"epoch": 0.7532157538787959,
"grad_norm": 3.4131875038146973,
"learning_rate": 1.9844338923395044e-05,
"loss": 0.491,
"step": 1420
},
{
"epoch": 0.758520090173717,
"grad_norm": 4.015808582305908,
"learning_rate": 1.9837761729651635e-05,
"loss": 0.5011,
"step": 1430
},
{
"epoch": 0.7638244264686381,
"grad_norm": 4.074829578399658,
"learning_rate": 1.9831049577790526e-05,
"loss": 0.5272,
"step": 1440
},
{
"epoch": 0.7691287627635592,
"grad_norm": 3.5524649620056152,
"learning_rate": 1.9824202559891534e-05,
"loss": 0.5363,
"step": 1450
},
{
"epoch": 0.7744330990584803,
"grad_norm": 3.5700480937957764,
"learning_rate": 1.9817220769884636e-05,
"loss": 0.5078,
"step": 1460
},
{
"epoch": 0.7797374353534015,
"grad_norm": 3.697791337966919,
"learning_rate": 1.981010430354865e-05,
"loss": 0.5136,
"step": 1470
},
{
"epoch": 0.7850417716483225,
"grad_norm": 3.734912633895874,
"learning_rate": 1.9802853258509937e-05,
"loss": 0.5108,
"step": 1480
},
{
"epoch": 0.7903461079432436,
"grad_norm": 3.951476812362671,
"learning_rate": 1.9795467734241068e-05,
"loss": 0.5172,
"step": 1490
},
{
"epoch": 0.7956504442381647,
"grad_norm": 3.73142671585083,
"learning_rate": 1.9787947832059437e-05,
"loss": 0.5052,
"step": 1500
},
{
"epoch": 0.8009547805330858,
"grad_norm": 3.8046658039093018,
"learning_rate": 1.97802936551259e-05,
"loss": 0.4869,
"step": 1510
},
{
"epoch": 0.8062591168280069,
"grad_norm": 3.8006317615509033,
"learning_rate": 1.9772505308443332e-05,
"loss": 0.5227,
"step": 1520
},
{
"epoch": 0.811563453122928,
"grad_norm": 4.094967842102051,
"learning_rate": 1.9764582898855203e-05,
"loss": 0.5027,
"step": 1530
},
{
"epoch": 0.8168677894178491,
"grad_norm": 3.843932628631592,
"learning_rate": 1.975652653504411e-05,
"loss": 0.5099,
"step": 1540
},
{
"epoch": 0.8221721257127702,
"grad_norm": 3.7942137718200684,
"learning_rate": 1.9748336327530287e-05,
"loss": 0.5271,
"step": 1550
},
{
"epoch": 0.8274764620076913,
"grad_norm": 3.671407699584961,
"learning_rate": 1.9740012388670077e-05,
"loss": 0.515,
"step": 1560
},
{
"epoch": 0.8327807983026124,
"grad_norm": 3.8499674797058105,
"learning_rate": 1.9731554832654414e-05,
"loss": 0.5197,
"step": 1570
},
{
"epoch": 0.8380851345975335,
"grad_norm": 3.9028103351593018,
"learning_rate": 1.9722963775507225e-05,
"loss": 0.5098,
"step": 1580
},
{
"epoch": 0.8433894708924545,
"grad_norm": 3.6346793174743652,
"learning_rate": 1.971423933508387e-05,
"loss": 0.5,
"step": 1590
},
{
"epoch": 0.8486938071873756,
"grad_norm": 3.3686752319335938,
"learning_rate": 1.9705381631069508e-05,
"loss": 0.5078,
"step": 1600
},
{
"epoch": 0.8539981434822967,
"grad_norm": 3.9609599113464355,
"learning_rate": 1.9696390784977453e-05,
"loss": 0.4996,
"step": 1610
},
{
"epoch": 0.8593024797772179,
"grad_norm": 3.7242376804351807,
"learning_rate": 1.9687266920147517e-05,
"loss": 0.5064,
"step": 1620
},
{
"epoch": 0.864606816072139,
"grad_norm": 3.655386209487915,
"learning_rate": 1.967801016174431e-05,
"loss": 0.5239,
"step": 1630
},
{
"epoch": 0.8699111523670601,
"grad_norm": 3.86031436920166,
"learning_rate": 1.9668620636755525e-05,
"loss": 0.5372,
"step": 1640
},
{
"epoch": 0.8752154886619812,
"grad_norm": 3.772238254547119,
"learning_rate": 1.965909847399021e-05,
"loss": 0.5121,
"step": 1650
},
{
"epoch": 0.8805198249569023,
"grad_norm": 4.217292785644531,
"learning_rate": 1.9649443804076962e-05,
"loss": 0.5204,
"step": 1660
},
{
"epoch": 0.8858241612518234,
"grad_norm": 3.409078359603882,
"learning_rate": 1.9639656759462186e-05,
"loss": 0.5083,
"step": 1670
},
{
"epoch": 0.8911284975467445,
"grad_norm": 3.6432178020477295,
"learning_rate": 1.962973747440824e-05,
"loss": 0.5179,
"step": 1680
},
{
"epoch": 0.8964328338416656,
"grad_norm": 4.103431701660156,
"learning_rate": 1.961968608499161e-05,
"loss": 0.5162,
"step": 1690
},
{
"epoch": 0.9017371701365866,
"grad_norm": 4.007205009460449,
"learning_rate": 1.9609502729101043e-05,
"loss": 0.5192,
"step": 1700
},
{
"epoch": 0.9070415064315077,
"grad_norm": 3.750261068344116,
"learning_rate": 1.959918754643564e-05,
"loss": 0.5089,
"step": 1710
},
{
"epoch": 0.9123458427264288,
"grad_norm": 3.946521043777466,
"learning_rate": 1.9588740678502963e-05,
"loss": 0.5104,
"step": 1720
},
{
"epoch": 0.9176501790213499,
"grad_norm": 3.7120089530944824,
"learning_rate": 1.957816226861708e-05,
"loss": 0.5117,
"step": 1730
},
{
"epoch": 0.922954515316271,
"grad_norm": 3.9578349590301514,
"learning_rate": 1.956745246189659e-05,
"loss": 0.5057,
"step": 1740
},
{
"epoch": 0.9282588516111921,
"grad_norm": 3.6276957988739014,
"learning_rate": 1.9556611405262665e-05,
"loss": 0.5168,
"step": 1750
},
{
"epoch": 0.9335631879061133,
"grad_norm": 3.7194759845733643,
"learning_rate": 1.954563924743699e-05,
"loss": 0.527,
"step": 1760
},
{
"epoch": 0.9388675242010344,
"grad_norm": 3.7220964431762695,
"learning_rate": 1.953453613893976e-05,
"loss": 0.516,
"step": 1770
},
{
"epoch": 0.9441718604959555,
"grad_norm": 3.6736011505126953,
"learning_rate": 1.9523302232087592e-05,
"loss": 0.5347,
"step": 1780
},
{
"epoch": 0.9494761967908766,
"grad_norm": 3.3162753582000732,
"learning_rate": 1.951193768099145e-05,
"loss": 0.5142,
"step": 1790
},
{
"epoch": 0.9547805330857977,
"grad_norm": 4.169299602508545,
"learning_rate": 1.9500442641554523e-05,
"loss": 0.5339,
"step": 1800
},
{
"epoch": 0.9600848693807187,
"grad_norm": 3.8769702911376953,
"learning_rate": 1.9488817271470087e-05,
"loss": 0.5086,
"step": 1810
},
{
"epoch": 0.9653892056756398,
"grad_norm": 3.482774496078491,
"learning_rate": 1.9477061730219345e-05,
"loss": 0.5216,
"step": 1820
},
{
"epoch": 0.9706935419705609,
"grad_norm": 4.0699028968811035,
"learning_rate": 1.9465176179069235e-05,
"loss": 0.5136,
"step": 1830
},
{
"epoch": 0.975997878265482,
"grad_norm": 3.416879177093506,
"learning_rate": 1.9453160781070222e-05,
"loss": 0.5161,
"step": 1840
},
{
"epoch": 0.9813022145604031,
"grad_norm": 3.698963165283203,
"learning_rate": 1.9441015701054056e-05,
"loss": 0.5084,
"step": 1850
},
{
"epoch": 0.9866065508553242,
"grad_norm": 3.572234630584717,
"learning_rate": 1.9428741105631515e-05,
"loss": 0.5304,
"step": 1860
},
{
"epoch": 0.9919108871502453,
"grad_norm": 3.557875394821167,
"learning_rate": 1.9416337163190123e-05,
"loss": 0.5258,
"step": 1870
},
{
"epoch": 0.9972152234451664,
"grad_norm": 3.5830304622650146,
"learning_rate": 1.9403804043891824e-05,
"loss": 0.5101,
"step": 1880
},
{
"epoch": 1.0025195597400876,
"grad_norm": 3.1445114612579346,
"learning_rate": 1.9391141919670668e-05,
"loss": 0.4397,
"step": 1890
},
{
"epoch": 1.0078238960350085,
"grad_norm": 4.033489227294922,
"learning_rate": 1.9378350964230442e-05,
"loss": 0.2934,
"step": 1900
},
{
"epoch": 1.0131282323299298,
"grad_norm": 3.4734363555908203,
"learning_rate": 1.9365431353042283e-05,
"loss": 0.299,
"step": 1910
},
{
"epoch": 1.0184325686248508,
"grad_norm": 3.221336841583252,
"learning_rate": 1.9352383263342284e-05,
"loss": 0.285,
"step": 1920
},
{
"epoch": 1.023736904919772,
"grad_norm": 3.4902710914611816,
"learning_rate": 1.9339206874129043e-05,
"loss": 0.304,
"step": 1930
},
{
"epoch": 1.029041241214693,
"grad_norm": 3.831965684890747,
"learning_rate": 1.932590236616123e-05,
"loss": 0.2964,
"step": 1940
},
{
"epoch": 1.0343455775096142,
"grad_norm": 3.539476156234741,
"learning_rate": 1.9312469921955092e-05,
"loss": 0.3061,
"step": 1950
},
{
"epoch": 1.0396499138045352,
"grad_norm": 3.1916446685791016,
"learning_rate": 1.9298909725781957e-05,
"loss": 0.2949,
"step": 1960
},
{
"epoch": 1.0449542500994564,
"grad_norm": 3.2898848056793213,
"learning_rate": 1.9285221963665695e-05,
"loss": 0.2918,
"step": 1970
},
{
"epoch": 1.0502585863943774,
"grad_norm": 3.646435260772705,
"learning_rate": 1.927140682338018e-05,
"loss": 0.3051,
"step": 1980
},
{
"epoch": 1.0555629226892984,
"grad_norm": 3.4552927017211914,
"learning_rate": 1.9257464494446702e-05,
"loss": 0.31,
"step": 1990
},
{
"epoch": 1.0608672589842196,
"grad_norm": 3.9061553478240967,
"learning_rate": 1.924339516813138e-05,
"loss": 0.299,
"step": 2000
},
{
"epoch": 1.0661715952791406,
"grad_norm": 3.764522075653076,
"learning_rate": 1.922919903744253e-05,
"loss": 0.304,
"step": 2010
},
{
"epoch": 1.0714759315740618,
"grad_norm": 3.577147960662842,
"learning_rate": 1.9214876297128007e-05,
"loss": 0.3022,
"step": 2020
},
{
"epoch": 1.0767802678689828,
"grad_norm": 3.5394883155822754,
"learning_rate": 1.9200427143672557e-05,
"loss": 0.3077,
"step": 2030
},
{
"epoch": 1.082084604163904,
"grad_norm": 3.72452712059021,
"learning_rate": 1.918585177529511e-05,
"loss": 0.3006,
"step": 2040
},
{
"epoch": 1.087388940458825,
"grad_norm": 3.8295013904571533,
"learning_rate": 1.9171150391946045e-05,
"loss": 0.3118,
"step": 2050
},
{
"epoch": 1.0926932767537463,
"grad_norm": 3.4648563861846924,
"learning_rate": 1.9156323195304477e-05,
"loss": 0.3059,
"step": 2060
},
{
"epoch": 1.0979976130486673,
"grad_norm": 3.20479416847229,
"learning_rate": 1.914137038877547e-05,
"loss": 0.2892,
"step": 2070
},
{
"epoch": 1.1033019493435883,
"grad_norm": 3.4433279037475586,
"learning_rate": 1.9126292177487248e-05,
"loss": 0.287,
"step": 2080
},
{
"epoch": 1.1086062856385095,
"grad_norm": 3.7677085399627686,
"learning_rate": 1.911108876828839e-05,
"loss": 0.3007,
"step": 2090
},
{
"epoch": 1.1139106219334305,
"grad_norm": 3.7692863941192627,
"learning_rate": 1.9095760369744987e-05,
"loss": 0.2983,
"step": 2100
},
{
"epoch": 1.1192149582283517,
"grad_norm": 3.512312650680542,
"learning_rate": 1.9080307192137776e-05,
"loss": 0.3138,
"step": 2110
},
{
"epoch": 1.1245192945232727,
"grad_norm": 3.7949514389038086,
"learning_rate": 1.906472944745926e-05,
"loss": 0.3279,
"step": 2120
},
{
"epoch": 1.129823630818194,
"grad_norm": 3.4511559009552,
"learning_rate": 1.9049027349410812e-05,
"loss": 0.2923,
"step": 2130
},
{
"epoch": 1.135127967113115,
"grad_norm": 3.5923140048980713,
"learning_rate": 1.9033201113399713e-05,
"loss": 0.3115,
"step": 2140
},
{
"epoch": 1.1404323034080361,
"grad_norm": 3.7241883277893066,
"learning_rate": 1.901725095653623e-05,
"loss": 0.3077,
"step": 2150
},
{
"epoch": 1.1457366397029571,
"grad_norm": 3.86002779006958,
"learning_rate": 1.9001177097630617e-05,
"loss": 0.3038,
"step": 2160
},
{
"epoch": 1.1510409759978784,
"grad_norm": 3.63974666595459,
"learning_rate": 1.8984979757190115e-05,
"loss": 0.2945,
"step": 2170
},
{
"epoch": 1.1563453122927994,
"grad_norm": 3.8336093425750732,
"learning_rate": 1.896865915741594e-05,
"loss": 0.3103,
"step": 2180
},
{
"epoch": 1.1616496485877206,
"grad_norm": 3.468128204345703,
"learning_rate": 1.8952215522200226e-05,
"loss": 0.3029,
"step": 2190
},
{
"epoch": 1.1669539848826416,
"grad_norm": 3.8509416580200195,
"learning_rate": 1.893564907712294e-05,
"loss": 0.3047,
"step": 2200
},
{
"epoch": 1.1722583211775626,
"grad_norm": 3.3073980808258057,
"learning_rate": 1.8918960049448815e-05,
"loss": 0.3121,
"step": 2210
},
{
"epoch": 1.1775626574724838,
"grad_norm": 3.4550652503967285,
"learning_rate": 1.890214866812421e-05,
"loss": 0.3049,
"step": 2220
},
{
"epoch": 1.1828669937674048,
"grad_norm": 3.4764418601989746,
"learning_rate": 1.8885215163773987e-05,
"loss": 0.3146,
"step": 2230
},
{
"epoch": 1.188171330062326,
"grad_norm": 3.708789110183716,
"learning_rate": 1.8868159768698325e-05,
"loss": 0.3119,
"step": 2240
},
{
"epoch": 1.193475666357247,
"grad_norm": 3.81087064743042,
"learning_rate": 1.885098271686956e-05,
"loss": 0.311,
"step": 2250
},
{
"epoch": 1.1987800026521682,
"grad_norm": 3.6135127544403076,
"learning_rate": 1.8833684243928943e-05,
"loss": 0.3046,
"step": 2260
},
{
"epoch": 1.2040843389470892,
"grad_norm": 3.372121810913086,
"learning_rate": 1.8816264587183442e-05,
"loss": 0.306,
"step": 2270
},
{
"epoch": 1.2093886752420104,
"grad_norm": 3.7073960304260254,
"learning_rate": 1.8798723985602465e-05,
"loss": 0.3059,
"step": 2280
},
{
"epoch": 1.2146930115369314,
"grad_norm": 3.6234822273254395,
"learning_rate": 1.878106267981458e-05,
"loss": 0.3151,
"step": 2290
},
{
"epoch": 1.2199973478318524,
"grad_norm": 3.8061039447784424,
"learning_rate": 1.8763280912104233e-05,
"loss": 0.3116,
"step": 2300
},
{
"epoch": 1.2253016841267736,
"grad_norm": 3.3875911235809326,
"learning_rate": 1.8745378926408403e-05,
"loss": 0.3252,
"step": 2310
},
{
"epoch": 1.2306060204216946,
"grad_norm": 3.5641937255859375,
"learning_rate": 1.8727356968313265e-05,
"loss": 0.3094,
"step": 2320
},
{
"epoch": 1.2359103567166159,
"grad_norm": 3.8727028369903564,
"learning_rate": 1.870921528505082e-05,
"loss": 0.3199,
"step": 2330
},
{
"epoch": 1.2412146930115369,
"grad_norm": 3.7284903526306152,
"learning_rate": 1.8690954125495516e-05,
"loss": 0.3129,
"step": 2340
},
{
"epoch": 1.246519029306458,
"grad_norm": 3.500553607940674,
"learning_rate": 1.8672573740160802e-05,
"loss": 0.3133,
"step": 2350
},
{
"epoch": 1.251823365601379,
"grad_norm": 3.956127643585205,
"learning_rate": 1.8654074381195726e-05,
"loss": 0.3166,
"step": 2360
},
{
"epoch": 1.2571277018963003,
"grad_norm": 3.6364858150482178,
"learning_rate": 1.8635456302381456e-05,
"loss": 0.3075,
"step": 2370
},
{
"epoch": 1.2624320381912213,
"grad_norm": 3.55739164352417,
"learning_rate": 1.8616719759127803e-05,
"loss": 0.3061,
"step": 2380
},
{
"epoch": 1.2677363744861423,
"grad_norm": 3.479860305786133,
"learning_rate": 1.859786500846972e-05,
"loss": 0.3171,
"step": 2390
},
{
"epoch": 1.2730407107810635,
"grad_norm": 3.628953218460083,
"learning_rate": 1.857889230906377e-05,
"loss": 0.305,
"step": 2400
},
{
"epoch": 1.2783450470759847,
"grad_norm": 3.855865478515625,
"learning_rate": 1.8559801921184587e-05,
"loss": 0.296,
"step": 2410
},
{
"epoch": 1.2836493833709057,
"grad_norm": 3.509706497192383,
"learning_rate": 1.8540594106721293e-05,
"loss": 0.3224,
"step": 2420
},
{
"epoch": 1.2889537196658267,
"grad_norm": 3.4830563068389893,
"learning_rate": 1.8521269129173914e-05,
"loss": 0.3124,
"step": 2430
},
{
"epoch": 1.294258055960748,
"grad_norm": 3.4599504470825195,
"learning_rate": 1.850182725364977e-05,
"loss": 0.3223,
"step": 2440
},
{
"epoch": 1.299562392255669,
"grad_norm": 3.6792361736297607,
"learning_rate": 1.848226874685982e-05,
"loss": 0.325,
"step": 2450
},
{
"epoch": 1.3048667285505902,
"grad_norm": 3.6066126823425293,
"learning_rate": 1.8462593877115027e-05,
"loss": 0.3135,
"step": 2460
},
{
"epoch": 1.3101710648455112,
"grad_norm": 4.144036769866943,
"learning_rate": 1.8442802914322655e-05,
"loss": 0.3293,
"step": 2470
},
{
"epoch": 1.3154754011404324,
"grad_norm": 4.001558303833008,
"learning_rate": 1.8422896129982578e-05,
"loss": 0.3391,
"step": 2480
},
{
"epoch": 1.3207797374353534,
"grad_norm": 3.577042818069458,
"learning_rate": 1.840287379718356e-05,
"loss": 0.3207,
"step": 2490
},
{
"epoch": 1.3260840737302746,
"grad_norm": 3.525238037109375,
"learning_rate": 1.8382736190599494e-05,
"loss": 0.3254,
"step": 2500
},
{
"epoch": 1.3313884100251956,
"grad_norm": 3.2063822746276855,
"learning_rate": 1.8362483586485642e-05,
"loss": 0.314,
"step": 2510
},
{
"epoch": 1.3366927463201166,
"grad_norm": 3.878560781478882,
"learning_rate": 1.834211626267486e-05,
"loss": 0.3223,
"step": 2520
},
{
"epoch": 1.3419970826150378,
"grad_norm": 3.4493205547332764,
"learning_rate": 1.8321634498573748e-05,
"loss": 0.3296,
"step": 2530
},
{
"epoch": 1.347301418909959,
"grad_norm": 3.4929230213165283,
"learning_rate": 1.830103857515886e-05,
"loss": 0.3135,
"step": 2540
},
{
"epoch": 1.35260575520488,
"grad_norm": 3.288297653198242,
"learning_rate": 1.828032877497283e-05,
"loss": 0.317,
"step": 2550
},
{
"epoch": 1.357910091499801,
"grad_norm": 4.501832485198975,
"learning_rate": 1.8259505382120483e-05,
"loss": 0.3067,
"step": 2560
},
{
"epoch": 1.3632144277947222,
"grad_norm": 3.346526622772217,
"learning_rate": 1.8238568682264962e-05,
"loss": 0.3127,
"step": 2570
},
{
"epoch": 1.3685187640896432,
"grad_norm": 3.7969963550567627,
"learning_rate": 1.8217518962623792e-05,
"loss": 0.3217,
"step": 2580
},
{
"epoch": 1.3738231003845645,
"grad_norm": 3.661844253540039,
"learning_rate": 1.8196356511964955e-05,
"loss": 0.321,
"step": 2590
},
{
"epoch": 1.3791274366794855,
"grad_norm": 3.460169553756714,
"learning_rate": 1.8175081620602903e-05,
"loss": 0.3245,
"step": 2600
},
{
"epoch": 1.3844317729744064,
"grad_norm": 3.679044008255005,
"learning_rate": 1.815369458039461e-05,
"loss": 0.3167,
"step": 2610
},
{
"epoch": 1.3897361092693277,
"grad_norm": 3.7198102474212646,
"learning_rate": 1.8132195684735545e-05,
"loss": 0.3249,
"step": 2620
},
{
"epoch": 1.3950404455642489,
"grad_norm": 3.785710334777832,
"learning_rate": 1.811058522855564e-05,
"loss": 0.3279,
"step": 2630
},
{
"epoch": 1.4003447818591699,
"grad_norm": 3.7823047637939453,
"learning_rate": 1.808886350831527e-05,
"loss": 0.3297,
"step": 2640
},
{
"epoch": 1.4056491181540909,
"grad_norm": 3.903444766998291,
"learning_rate": 1.806703082200117e-05,
"loss": 0.3324,
"step": 2650
},
{
"epoch": 1.410953454449012,
"grad_norm": 3.5661463737487793,
"learning_rate": 1.8045087469122346e-05,
"loss": 0.3207,
"step": 2660
},
{
"epoch": 1.416257790743933,
"grad_norm": 4.140559673309326,
"learning_rate": 1.8023033750705972e-05,
"loss": 0.3151,
"step": 2670
},
{
"epoch": 1.4215621270388543,
"grad_norm": 3.873009443283081,
"learning_rate": 1.8000869969293254e-05,
"loss": 0.3327,
"step": 2680
},
{
"epoch": 1.4268664633337753,
"grad_norm": 3.7390902042388916,
"learning_rate": 1.7978596428935286e-05,
"loss": 0.3073,
"step": 2690
},
{
"epoch": 1.4321707996286965,
"grad_norm": 3.8170132637023926,
"learning_rate": 1.7956213435188884e-05,
"loss": 0.3235,
"step": 2700
},
{
"epoch": 1.4374751359236175,
"grad_norm": 4.089386940002441,
"learning_rate": 1.793372129511237e-05,
"loss": 0.325,
"step": 2710
},
{
"epoch": 1.4427794722185387,
"grad_norm": 3.624798059463501,
"learning_rate": 1.791112031726139e-05,
"loss": 0.3221,
"step": 2720
},
{
"epoch": 1.4480838085134597,
"grad_norm": 3.935511350631714,
"learning_rate": 1.788841081168467e-05,
"loss": 0.3301,
"step": 2730
},
{
"epoch": 1.4533881448083807,
"grad_norm": 3.4397850036621094,
"learning_rate": 1.7865593089919745e-05,
"loss": 0.3137,
"step": 2740
},
{
"epoch": 1.458692481103302,
"grad_norm": 3.8265187740325928,
"learning_rate": 1.784266746498871e-05,
"loss": 0.3299,
"step": 2750
},
{
"epoch": 1.4639968173982232,
"grad_norm": 3.81622052192688,
"learning_rate": 1.781963425139392e-05,
"loss": 0.3155,
"step": 2760
},
{
"epoch": 1.4693011536931442,
"grad_norm": 3.5892412662506104,
"learning_rate": 1.7796493765113666e-05,
"loss": 0.3248,
"step": 2770
},
{
"epoch": 1.4746054899880652,
"grad_norm": 3.772759437561035,
"learning_rate": 1.7773246323597845e-05,
"loss": 0.313,
"step": 2780
},
{
"epoch": 1.4799098262829864,
"grad_norm": 3.4618637561798096,
"learning_rate": 1.7749892245763614e-05,
"loss": 0.3127,
"step": 2790
},
{
"epoch": 1.4852141625779074,
"grad_norm": 3.3471107482910156,
"learning_rate": 1.7726431851990992e-05,
"loss": 0.3208,
"step": 2800
},
{
"epoch": 1.4905184988728286,
"grad_norm": 3.3492703437805176,
"learning_rate": 1.77028654641185e-05,
"loss": 0.3252,
"step": 2810
},
{
"epoch": 1.4958228351677496,
"grad_norm": 3.785266160964966,
"learning_rate": 1.7679193405438713e-05,
"loss": 0.3229,
"step": 2820
},
{
"epoch": 1.5011271714626706,
"grad_norm": 3.609468460083008,
"learning_rate": 1.7655416000693836e-05,
"loss": 0.3193,
"step": 2830
},
{
"epoch": 1.5064315077575918,
"grad_norm": 3.516772508621216,
"learning_rate": 1.763153357607126e-05,
"loss": 0.3318,
"step": 2840
},
{
"epoch": 1.511735844052513,
"grad_norm": 3.5849831104278564,
"learning_rate": 1.760754645919907e-05,
"loss": 0.3284,
"step": 2850
},
{
"epoch": 1.517040180347434,
"grad_norm": 4.03626012802124,
"learning_rate": 1.758345497914157e-05,
"loss": 0.32,
"step": 2860
},
{
"epoch": 1.522344516642355,
"grad_norm": 4.129549980163574,
"learning_rate": 1.755925946639474e-05,
"loss": 0.3312,
"step": 2870
},
{
"epoch": 1.5276488529372763,
"grad_norm": 3.846771717071533,
"learning_rate": 1.7534960252881735e-05,
"loss": 0.3263,
"step": 2880
},
{
"epoch": 1.5329531892321975,
"grad_norm": 3.8618805408477783,
"learning_rate": 1.7510557671948314e-05,
"loss": 0.3203,
"step": 2890
},
{
"epoch": 1.5382575255271185,
"grad_norm": 3.563652992248535,
"learning_rate": 1.748605205835826e-05,
"loss": 0.3393,
"step": 2900
},
{
"epoch": 1.5435618618220395,
"grad_norm": 3.45072865486145,
"learning_rate": 1.7461443748288797e-05,
"loss": 0.3234,
"step": 2910
},
{
"epoch": 1.5488661981169605,
"grad_norm": 3.2662858963012695,
"learning_rate": 1.7436733079326e-05,
"loss": 0.3229,
"step": 2920
},
{
"epoch": 1.5541705344118817,
"grad_norm": 3.587158679962158,
"learning_rate": 1.741192039046011e-05,
"loss": 0.3394,
"step": 2930
},
{
"epoch": 1.559474870706803,
"grad_norm": 4.137202262878418,
"learning_rate": 1.738700602208094e-05,
"loss": 0.3364,
"step": 2940
},
{
"epoch": 1.564779207001724,
"grad_norm": 3.8183786869049072,
"learning_rate": 1.7361990315973166e-05,
"loss": 0.3245,
"step": 2950
},
{
"epoch": 1.570083543296645,
"grad_norm": 3.716970205307007,
"learning_rate": 1.733687361531166e-05,
"loss": 0.3379,
"step": 2960
},
{
"epoch": 1.5753878795915661,
"grad_norm": 3.4474103450775146,
"learning_rate": 1.731165626465678e-05,
"loss": 0.3321,
"step": 2970
},
{
"epoch": 1.5806922158864873,
"grad_norm": 3.9459433555603027,
"learning_rate": 1.7286338609949623e-05,
"loss": 0.3319,
"step": 2980
},
{
"epoch": 1.5859965521814083,
"grad_norm": 3.303617477416992,
"learning_rate": 1.7260920998507315e-05,
"loss": 0.3383,
"step": 2990
},
{
"epoch": 1.5913008884763293,
"grad_norm": 3.5322844982147217,
"learning_rate": 1.72354037790182e-05,
"loss": 0.3154,
"step": 3000
},
{
"epoch": 1.5966052247712503,
"grad_norm": 3.635408401489258,
"learning_rate": 1.7209787301537116e-05,
"loss": 0.3277,
"step": 3010
},
{
"epoch": 1.6019095610661716,
"grad_norm": 3.167227268218994,
"learning_rate": 1.7184071917480526e-05,
"loss": 0.3179,
"step": 3020
},
{
"epoch": 1.6072138973610928,
"grad_norm": 3.5568039417266846,
"learning_rate": 1.7158257979621756e-05,
"loss": 0.3293,
"step": 3030
},
{
"epoch": 1.6125182336560138,
"grad_norm": 3.3058979511260986,
"learning_rate": 1.7132345842086114e-05,
"loss": 0.3355,
"step": 3040
},
{
"epoch": 1.6178225699509348,
"grad_norm": 3.9339213371276855,
"learning_rate": 1.710633586034606e-05,
"loss": 0.3188,
"step": 3050
},
{
"epoch": 1.623126906245856,
"grad_norm": 3.5868442058563232,
"learning_rate": 1.7080228391216305e-05,
"loss": 0.3212,
"step": 3060
},
{
"epoch": 1.6284312425407772,
"grad_norm": 3.4200713634490967,
"learning_rate": 1.705402379284894e-05,
"loss": 0.3173,
"step": 3070
},
{
"epoch": 1.6337355788356982,
"grad_norm": 3.5358498096466064,
"learning_rate": 1.7027722424728513e-05,
"loss": 0.3298,
"step": 3080
},
{
"epoch": 1.6390399151306192,
"grad_norm": 3.7147319316864014,
"learning_rate": 1.700132464766708e-05,
"loss": 0.3235,
"step": 3090
},
{
"epoch": 1.6443442514255404,
"grad_norm": 3.8844540119171143,
"learning_rate": 1.6974830823799285e-05,
"loss": 0.3201,
"step": 3100
},
{
"epoch": 1.6496485877204616,
"grad_norm": 3.6052422523498535,
"learning_rate": 1.6948241316577375e-05,
"loss": 0.3228,
"step": 3110
},
{
"epoch": 1.6549529240153826,
"grad_norm": 3.4337236881256104,
"learning_rate": 1.692155649076621e-05,
"loss": 0.3332,
"step": 3120
},
{
"epoch": 1.6602572603103036,
"grad_norm": 3.9126977920532227,
"learning_rate": 1.6894776712438288e-05,
"loss": 0.3291,
"step": 3130
},
{
"epoch": 1.6655615966052246,
"grad_norm": 3.7995638847351074,
"learning_rate": 1.686790234896867e-05,
"loss": 0.3222,
"step": 3140
},
{
"epoch": 1.6708659329001458,
"grad_norm": 3.664808988571167,
"learning_rate": 1.6840933769030002e-05,
"loss": 0.3301,
"step": 3150
},
{
"epoch": 1.676170269195067,
"grad_norm": 3.8624629974365234,
"learning_rate": 1.6813871342587404e-05,
"loss": 0.3265,
"step": 3160
},
{
"epoch": 1.681474605489988,
"grad_norm": 3.776545763015747,
"learning_rate": 1.678671544089343e-05,
"loss": 0.3266,
"step": 3170
},
{
"epoch": 1.686778941784909,
"grad_norm": 4.29400634765625,
"learning_rate": 1.6759466436482954e-05,
"loss": 0.3385,
"step": 3180
},
{
"epoch": 1.6920832780798303,
"grad_norm": 4.078743934631348,
"learning_rate": 1.6732124703168075e-05,
"loss": 0.3357,
"step": 3190
},
{
"epoch": 1.6973876143747515,
"grad_norm": 3.473541498184204,
"learning_rate": 1.6704690616032987e-05,
"loss": 0.3294,
"step": 3200
},
{
"epoch": 1.7026919506696725,
"grad_norm": 3.7348885536193848,
"learning_rate": 1.667716455142881e-05,
"loss": 0.3351,
"step": 3210
},
{
"epoch": 1.7079962869645935,
"grad_norm": 3.998124361038208,
"learning_rate": 1.6649546886968473e-05,
"loss": 0.3324,
"step": 3220
},
{
"epoch": 1.7133006232595145,
"grad_norm": 3.9629366397857666,
"learning_rate": 1.662183800152148e-05,
"loss": 0.335,
"step": 3230
},
{
"epoch": 1.7186049595544357,
"grad_norm": 3.7607057094573975,
"learning_rate": 1.6594038275208748e-05,
"loss": 0.3442,
"step": 3240
},
{
"epoch": 1.723909295849357,
"grad_norm": 4.3320112228393555,
"learning_rate": 1.6566148089397387e-05,
"loss": 0.3319,
"step": 3250
},
{
"epoch": 1.729213632144278,
"grad_norm": 3.361454963684082,
"learning_rate": 1.6538167826695466e-05,
"loss": 0.3291,
"step": 3260
},
{
"epoch": 1.734517968439199,
"grad_norm": 3.9114990234375,
"learning_rate": 1.6510097870946752e-05,
"loss": 0.3157,
"step": 3270
},
{
"epoch": 1.7398223047341201,
"grad_norm": 4.4820237159729,
"learning_rate": 1.6481938607225468e-05,
"loss": 0.3245,
"step": 3280
},
{
"epoch": 1.7451266410290414,
"grad_norm": 3.609239101409912,
"learning_rate": 1.6453690421830987e-05,
"loss": 0.3129,
"step": 3290
},
{
"epoch": 1.7504309773239624,
"grad_norm": 3.86490797996521,
"learning_rate": 1.6425353702282543e-05,
"loss": 0.3103,
"step": 3300
},
{
"epoch": 1.7557353136188834,
"grad_norm": 3.5796539783477783,
"learning_rate": 1.639692883731393e-05,
"loss": 0.3202,
"step": 3310
},
{
"epoch": 1.7610396499138046,
"grad_norm": 3.493624687194824,
"learning_rate": 1.6368416216868137e-05,
"loss": 0.3331,
"step": 3320
},
{
"epoch": 1.7663439862087258,
"grad_norm": 3.596022129058838,
"learning_rate": 1.633981623209202e-05,
"loss": 0.3175,
"step": 3330
},
{
"epoch": 1.7716483225036468,
"grad_norm": 3.6291396617889404,
"learning_rate": 1.6311129275330936e-05,
"loss": 0.3256,
"step": 3340
},
{
"epoch": 1.7769526587985678,
"grad_norm": 3.8184401988983154,
"learning_rate": 1.628235574012335e-05,
"loss": 0.3112,
"step": 3350
},
{
"epoch": 1.7822569950934888,
"grad_norm": 3.4983391761779785,
"learning_rate": 1.6253496021195453e-05,
"loss": 0.3198,
"step": 3360
},
{
"epoch": 1.78756133138841,
"grad_norm": 3.6440587043762207,
"learning_rate": 1.6224550514455724e-05,
"loss": 0.3341,
"step": 3370
},
{
"epoch": 1.7928656676833312,
"grad_norm": 3.429333448410034,
"learning_rate": 1.619551961698952e-05,
"loss": 0.3255,
"step": 3380
},
{
"epoch": 1.7981700039782522,
"grad_norm": 3.5707192420959473,
"learning_rate": 1.6166403727053617e-05,
"loss": 0.3268,
"step": 3390
},
{
"epoch": 1.8034743402731732,
"grad_norm": 3.5965964794158936,
"learning_rate": 1.6137203244070755e-05,
"loss": 0.3194,
"step": 3400
},
{
"epoch": 1.8087786765680944,
"grad_norm": 3.7592883110046387,
"learning_rate": 1.610791856862414e-05,
"loss": 0.3398,
"step": 3410
},
{
"epoch": 1.8140830128630157,
"grad_norm": 4.1979265213012695,
"learning_rate": 1.6078550102451974e-05,
"loss": 0.3345,
"step": 3420
},
{
"epoch": 1.8193873491579367,
"grad_norm": 3.106452465057373,
"learning_rate": 1.6049098248441936e-05,
"loss": 0.321,
"step": 3430
},
{
"epoch": 1.8246916854528576,
"grad_norm": 3.2761764526367188,
"learning_rate": 1.6019563410625635e-05,
"loss": 0.327,
"step": 3440
},
{
"epoch": 1.8299960217477786,
"grad_norm": 3.375182867050171,
"learning_rate": 1.5989945994173094e-05,
"loss": 0.3178,
"step": 3450
},
{
"epoch": 1.8353003580426999,
"grad_norm": 3.563046932220459,
"learning_rate": 1.5960246405387173e-05,
"loss": 0.3322,
"step": 3460
},
{
"epoch": 1.840604694337621,
"grad_norm": 4.122814178466797,
"learning_rate": 1.5930465051698016e-05,
"loss": 0.3335,
"step": 3470
},
{
"epoch": 1.845909030632542,
"grad_norm": 3.5397937297821045,
"learning_rate": 1.5900602341657435e-05,
"loss": 0.3326,
"step": 3480
},
{
"epoch": 1.851213366927463,
"grad_norm": 3.227036952972412,
"learning_rate": 1.5870658684933327e-05,
"loss": 0.3265,
"step": 3490
},
{
"epoch": 1.8565177032223843,
"grad_norm": 3.8446407318115234,
"learning_rate": 1.5840634492304045e-05,
"loss": 0.334,
"step": 3500
},
{
"epoch": 1.8618220395173055,
"grad_norm": 3.7455456256866455,
"learning_rate": 1.581053017565276e-05,
"loss": 0.3173,
"step": 3510
},
{
"epoch": 1.8671263758122265,
"grad_norm": 3.5950396060943604,
"learning_rate": 1.5780346147961814e-05,
"loss": 0.3386,
"step": 3520
},
{
"epoch": 1.8724307121071475,
"grad_norm": 3.6555497646331787,
"learning_rate": 1.5750082823307067e-05,
"loss": 0.3404,
"step": 3530
},
{
"epoch": 1.8777350484020687,
"grad_norm": 3.5170364379882812,
"learning_rate": 1.5719740616852192e-05,
"loss": 0.3267,
"step": 3540
},
{
"epoch": 1.88303938469699,
"grad_norm": 3.70723295211792,
"learning_rate": 1.568931994484299e-05,
"loss": 0.3214,
"step": 3550
},
{
"epoch": 1.888343720991911,
"grad_norm": 3.543966770172119,
"learning_rate": 1.5658821224601693e-05,
"loss": 0.3206,
"step": 3560
},
{
"epoch": 1.893648057286832,
"grad_norm": 3.4180748462677,
"learning_rate": 1.562824487452123e-05,
"loss": 0.3221,
"step": 3570
},
{
"epoch": 1.898952393581753,
"grad_norm": 4.066466808319092,
"learning_rate": 1.5597591314059464e-05,
"loss": 0.3219,
"step": 3580
},
{
"epoch": 1.9042567298766742,
"grad_norm": 3.359677791595459,
"learning_rate": 1.5566860963733486e-05,
"loss": 0.3372,
"step": 3590
},
{
"epoch": 1.9095610661715954,
"grad_norm": 4.057319164276123,
"learning_rate": 1.55360542451138e-05,
"loss": 0.3306,
"step": 3600
},
{
"epoch": 1.9148654024665164,
"grad_norm": 3.5447351932525635,
"learning_rate": 1.550517158081857e-05,
"loss": 0.3054,
"step": 3610
},
{
"epoch": 1.9201697387614374,
"grad_norm": 3.775535821914673,
"learning_rate": 1.5474213394507798e-05,
"loss": 0.3222,
"step": 3620
},
{
"epoch": 1.9254740750563586,
"grad_norm": 3.208907127380371,
"learning_rate": 1.544318011087754e-05,
"loss": 0.3168,
"step": 3630
},
{
"epoch": 1.9307784113512798,
"grad_norm": 3.393601179122925,
"learning_rate": 1.541207215565407e-05,
"loss": 0.3234,
"step": 3640
},
{
"epoch": 1.9360827476462008,
"grad_norm": 3.797776222229004,
"learning_rate": 1.5380889955588006e-05,
"loss": 0.3022,
"step": 3650
},
{
"epoch": 1.9413870839411218,
"grad_norm": 3.817760705947876,
"learning_rate": 1.5349633938448517e-05,
"loss": 0.3118,
"step": 3660
},
{
"epoch": 1.9466914202360428,
"grad_norm": 3.5809881687164307,
"learning_rate": 1.5318304533017403e-05,
"loss": 0.327,
"step": 3670
},
{
"epoch": 1.951995756530964,
"grad_norm": 3.3459973335266113,
"learning_rate": 1.528690216908324e-05,
"loss": 0.3222,
"step": 3680
},
{
"epoch": 1.9573000928258852,
"grad_norm": 3.520768165588379,
"learning_rate": 1.5255427277435474e-05,
"loss": 0.3303,
"step": 3690
},
{
"epoch": 1.9626044291208062,
"grad_norm": 3.473872423171997,
"learning_rate": 1.5223880289858515e-05,
"loss": 0.3337,
"step": 3700
},
{
"epoch": 1.9679087654157272,
"grad_norm": 4.005605220794678,
"learning_rate": 1.5192261639125807e-05,
"loss": 0.3206,
"step": 3710
},
{
"epoch": 1.9732131017106485,
"grad_norm": 3.648740530014038,
"learning_rate": 1.5160571758993902e-05,
"loss": 0.3143,
"step": 3720
},
{
"epoch": 1.9785174380055697,
"grad_norm": 4.329815864562988,
"learning_rate": 1.5128811084196505e-05,
"loss": 0.3402,
"step": 3730
},
{
"epoch": 1.9838217743004907,
"grad_norm": 3.976907253265381,
"learning_rate": 1.5096980050438501e-05,
"loss": 0.3137,
"step": 3740
},
{
"epoch": 1.9891261105954117,
"grad_norm": 3.991213321685791,
"learning_rate": 1.5065079094389994e-05,
"loss": 0.3039,
"step": 3750
},
{
"epoch": 1.9944304468903329,
"grad_norm": 3.3972666263580322,
"learning_rate": 1.5033108653680298e-05,
"loss": 0.3288,
"step": 3760
},
{
"epoch": 1.999734783185254,
"grad_norm": 3.2909555435180664,
"learning_rate": 1.5001069166891957e-05,
"loss": 0.3225,
"step": 3770
},
{
"epoch": 2.005039119480175,
"grad_norm": 2.6935155391693115,
"learning_rate": 1.4968961073554708e-05,
"loss": 0.1467,
"step": 3780
},
{
"epoch": 2.010343455775096,
"grad_norm": 2.5926148891448975,
"learning_rate": 1.4936784814139453e-05,
"loss": 0.1284,
"step": 3790
},
{
"epoch": 2.015647792070017,
"grad_norm": 2.8939812183380127,
"learning_rate": 1.4904540830052234e-05,
"loss": 0.1273,
"step": 3800
},
{
"epoch": 2.0209521283649385,
"grad_norm": 3.236732244491577,
"learning_rate": 1.4872229563628158e-05,
"loss": 0.1306,
"step": 3810
},
{
"epoch": 2.0262564646598595,
"grad_norm": 2.838834524154663,
"learning_rate": 1.4839851458125331e-05,
"loss": 0.134,
"step": 3820
},
{
"epoch": 2.0315608009547805,
"grad_norm": 2.6801376342773438,
"learning_rate": 1.48074069577188e-05,
"loss": 0.1286,
"step": 3830
},
{
"epoch": 2.0368651372497015,
"grad_norm": 3.151033878326416,
"learning_rate": 1.4774896507494426e-05,
"loss": 0.1371,
"step": 3840
},
{
"epoch": 2.0421694735446225,
"grad_norm": 2.7879486083984375,
"learning_rate": 1.4742320553442797e-05,
"loss": 0.1313,
"step": 3850
},
{
"epoch": 2.047473809839544,
"grad_norm": 2.801165819168091,
"learning_rate": 1.4709679542453115e-05,
"loss": 0.1343,
"step": 3860
},
{
"epoch": 2.052778146134465,
"grad_norm": 3.195171594619751,
"learning_rate": 1.4676973922307052e-05,
"loss": 0.1304,
"step": 3870
},
{
"epoch": 2.058082482429386,
"grad_norm": 3.2752623558044434,
"learning_rate": 1.4644204141672614e-05,
"loss": 0.1325,
"step": 3880
},
{
"epoch": 2.063386818724307,
"grad_norm": 2.8927152156829834,
"learning_rate": 1.461137065009798e-05,
"loss": 0.1298,
"step": 3890
},
{
"epoch": 2.0686911550192284,
"grad_norm": 2.9264607429504395,
"learning_rate": 1.4578473898005346e-05,
"loss": 0.1326,
"step": 3900
},
{
"epoch": 2.0739954913141494,
"grad_norm": 2.765760660171509,
"learning_rate": 1.454551433668474e-05,
"loss": 0.136,
"step": 3910
},
{
"epoch": 2.0792998276090704,
"grad_norm": 2.7184066772460938,
"learning_rate": 1.4512492418287828e-05,
"loss": 0.1267,
"step": 3920
},
{
"epoch": 2.0846041639039914,
"grad_norm": 3.297851800918579,
"learning_rate": 1.4479408595821707e-05,
"loss": 0.138,
"step": 3930
},
{
"epoch": 2.089908500198913,
"grad_norm": 2.9943556785583496,
"learning_rate": 1.4446263323142713e-05,
"loss": 0.1289,
"step": 3940
},
{
"epoch": 2.095212836493834,
"grad_norm": 2.7158663272857666,
"learning_rate": 1.4413057054950166e-05,
"loss": 0.1328,
"step": 3950
},
{
"epoch": 2.100517172788755,
"grad_norm": 2.943380832672119,
"learning_rate": 1.4379790246780152e-05,
"loss": 0.1399,
"step": 3960
},
{
"epoch": 2.105821509083676,
"grad_norm": 2.653359889984131,
"learning_rate": 1.434646335499926e-05,
"loss": 0.1339,
"step": 3970
},
{
"epoch": 2.111125845378597,
"grad_norm": 2.7849793434143066,
"learning_rate": 1.431307683679834e-05,
"loss": 0.1315,
"step": 3980
},
{
"epoch": 2.1164301816735183,
"grad_norm": 2.9013118743896484,
"learning_rate": 1.4279631150186207e-05,
"loss": 0.132,
"step": 3990
},
{
"epoch": 2.1217345179684393,
"grad_norm": 3.1449782848358154,
"learning_rate": 1.4246126753983378e-05,
"loss": 0.1368,
"step": 4000
},
{
"epoch": 2.1270388542633603,
"grad_norm": 2.9267892837524414,
"learning_rate": 1.4212564107815774e-05,
"loss": 0.1361,
"step": 4010
},
{
"epoch": 2.1323431905582813,
"grad_norm": 3.1600418090820312,
"learning_rate": 1.4178943672108402e-05,
"loss": 0.1302,
"step": 4020
},
{
"epoch": 2.1376475268532027,
"grad_norm": 2.9146716594696045,
"learning_rate": 1.4145265908079051e-05,
"loss": 0.1346,
"step": 4030
},
{
"epoch": 2.1429518631481237,
"grad_norm": 2.721554756164551,
"learning_rate": 1.4111531277731965e-05,
"loss": 0.141,
"step": 4040
},
{
"epoch": 2.1482561994430447,
"grad_norm": 2.9390203952789307,
"learning_rate": 1.4077740243851497e-05,
"loss": 0.1367,
"step": 4050
},
{
"epoch": 2.1535605357379657,
"grad_norm": 2.7774572372436523,
"learning_rate": 1.4043893269995766e-05,
"loss": 0.1393,
"step": 4060
},
{
"epoch": 2.1588648720328867,
"grad_norm": 3.0352425575256348,
"learning_rate": 1.4009990820490296e-05,
"loss": 0.1333,
"step": 4070
},
{
"epoch": 2.164169208327808,
"grad_norm": 2.9310920238494873,
"learning_rate": 1.3976033360421652e-05,
"loss": 0.1385,
"step": 4080
},
{
"epoch": 2.169473544622729,
"grad_norm": 2.895400047302246,
"learning_rate": 1.3942021355631047e-05,
"loss": 0.1364,
"step": 4090
},
{
"epoch": 2.17477788091765,
"grad_norm": 2.785146474838257,
"learning_rate": 1.3907955272707963e-05,
"loss": 0.1345,
"step": 4100
},
{
"epoch": 2.180082217212571,
"grad_norm": 2.920459032058716,
"learning_rate": 1.3873835578983747e-05,
"loss": 0.1379,
"step": 4110
},
{
"epoch": 2.1853865535074926,
"grad_norm": 2.7607204914093018,
"learning_rate": 1.3839662742525199e-05,
"loss": 0.1348,
"step": 4120
},
{
"epoch": 2.1906908898024136,
"grad_norm": 2.9964351654052734,
"learning_rate": 1.3805437232128149e-05,
"loss": 0.1308,
"step": 4130
},
{
"epoch": 2.1959952260973346,
"grad_norm": 2.8842570781707764,
"learning_rate": 1.3771159517311026e-05,
"loss": 0.1329,
"step": 4140
},
{
"epoch": 2.2012995623922555,
"grad_norm": 2.794616222381592,
"learning_rate": 1.3736830068308429e-05,
"loss": 0.1359,
"step": 4150
},
{
"epoch": 2.2066038986871765,
"grad_norm": 3.242060661315918,
"learning_rate": 1.3702449356064648e-05,
"loss": 0.1376,
"step": 4160
},
{
"epoch": 2.211908234982098,
"grad_norm": 2.9713857173919678,
"learning_rate": 1.366801785222724e-05,
"loss": 0.1388,
"step": 4170
},
{
"epoch": 2.217212571277019,
"grad_norm": 2.71294903755188,
"learning_rate": 1.3633536029140535e-05,
"loss": 0.1348,
"step": 4180
},
{
"epoch": 2.22251690757194,
"grad_norm": 3.0826351642608643,
"learning_rate": 1.359900435983915e-05,
"loss": 0.1376,
"step": 4190
},
{
"epoch": 2.227821243866861,
"grad_norm": 2.7096450328826904,
"learning_rate": 1.3564423318041527e-05,
"loss": 0.1364,
"step": 4200
},
{
"epoch": 2.2331255801617824,
"grad_norm": 2.883322238922119,
"learning_rate": 1.3529793378143407e-05,
"loss": 0.1364,
"step": 4210
},
{
"epoch": 2.2384299164567034,
"grad_norm": 3.0577938556671143,
"learning_rate": 1.3495115015211341e-05,
"loss": 0.1353,
"step": 4220
},
{
"epoch": 2.2437342527516244,
"grad_norm": 2.8565526008605957,
"learning_rate": 1.3460388704976162e-05,
"loss": 0.1355,
"step": 4230
},
{
"epoch": 2.2490385890465454,
"grad_norm": 2.8537418842315674,
"learning_rate": 1.3425614923826463e-05,
"loss": 0.1387,
"step": 4240
},
{
"epoch": 2.254342925341467,
"grad_norm": 2.9709837436676025,
"learning_rate": 1.3390794148802055e-05,
"loss": 0.1392,
"step": 4250
},
{
"epoch": 2.259647261636388,
"grad_norm": 3.2056241035461426,
"learning_rate": 1.3355926857587442e-05,
"loss": 0.132,
"step": 4260
},
{
"epoch": 2.264951597931309,
"grad_norm": 2.8401198387145996,
"learning_rate": 1.3321013528505242e-05,
"loss": 0.139,
"step": 4270
},
{
"epoch": 2.27025593422623,
"grad_norm": 3.095860481262207,
"learning_rate": 1.3286054640509642e-05,
"loss": 0.1421,
"step": 4280
},
{
"epoch": 2.275560270521151,
"grad_norm": 2.8098833560943604,
"learning_rate": 1.325105067317983e-05,
"loss": 0.1374,
"step": 4290
},
{
"epoch": 2.2808646068160723,
"grad_norm": 2.9357638359069824,
"learning_rate": 1.3216002106713394e-05,
"loss": 0.1357,
"step": 4300
},
{
"epoch": 2.2861689431109933,
"grad_norm": 2.8763411045074463,
"learning_rate": 1.3180909421919763e-05,
"loss": 0.134,
"step": 4310
},
{
"epoch": 2.2914732794059143,
"grad_norm": 3.224968194961548,
"learning_rate": 1.3145773100213596e-05,
"loss": 0.1379,
"step": 4320
},
{
"epoch": 2.2967776157008353,
"grad_norm": 2.9192330837249756,
"learning_rate": 1.3110593623608174e-05,
"loss": 0.1336,
"step": 4330
},
{
"epoch": 2.3020819519957567,
"grad_norm": 3.039752721786499,
"learning_rate": 1.3075371474708798e-05,
"loss": 0.1378,
"step": 4340
},
{
"epoch": 2.3073862882906777,
"grad_norm": 3.0629944801330566,
"learning_rate": 1.3040107136706162e-05,
"loss": 0.1362,
"step": 4350
},
{
"epoch": 2.3126906245855987,
"grad_norm": 2.8435914516448975,
"learning_rate": 1.3004801093369723e-05,
"loss": 0.1313,
"step": 4360
},
{
"epoch": 2.3179949608805197,
"grad_norm": 2.789707660675049,
"learning_rate": 1.2969453829041073e-05,
"loss": 0.1389,
"step": 4370
},
{
"epoch": 2.323299297175441,
"grad_norm": 3.2722177505493164,
"learning_rate": 1.2934065828627285e-05,
"loss": 0.1369,
"step": 4380
},
{
"epoch": 2.328603633470362,
"grad_norm": 2.8057737350463867,
"learning_rate": 1.289863757759427e-05,
"loss": 0.1396,
"step": 4390
},
{
"epoch": 2.333907969765283,
"grad_norm": 2.924314022064209,
"learning_rate": 1.2863169561960105e-05,
"loss": 0.1414,
"step": 4400
},
{
"epoch": 2.339212306060204,
"grad_norm": 2.850343942642212,
"learning_rate": 1.2827662268288377e-05,
"loss": 0.1359,
"step": 4410
},
{
"epoch": 2.344516642355125,
"grad_norm": 2.6075124740600586,
"learning_rate": 1.2792116183681506e-05,
"loss": 0.1347,
"step": 4420
},
{
"epoch": 2.3498209786500466,
"grad_norm": 3.160735845565796,
"learning_rate": 1.2756531795774053e-05,
"loss": 0.1304,
"step": 4430
},
{
"epoch": 2.3551253149449676,
"grad_norm": 3.01257586479187,
"learning_rate": 1.2720909592726045e-05,
"loss": 0.1412,
"step": 4440
},
{
"epoch": 2.3604296512398886,
"grad_norm": 2.9084932804107666,
"learning_rate": 1.268525006321627e-05,
"loss": 0.1402,
"step": 4450
},
{
"epoch": 2.3657339875348096,
"grad_norm": 2.940924644470215,
"learning_rate": 1.2649553696435576e-05,
"loss": 0.1357,
"step": 4460
},
{
"epoch": 2.3710383238297306,
"grad_norm": 3.0558974742889404,
"learning_rate": 1.261382098208015e-05,
"loss": 0.1414,
"step": 4470
},
{
"epoch": 2.376342660124652,
"grad_norm": 2.9939308166503906,
"learning_rate": 1.2578052410344823e-05,
"loss": 0.1388,
"step": 4480
},
{
"epoch": 2.381646996419573,
"grad_norm": 3.261167287826538,
"learning_rate": 1.2542248471916319e-05,
"loss": 0.1338,
"step": 4490
},
{
"epoch": 2.386951332714494,
"grad_norm": 2.484503746032715,
"learning_rate": 1.2506409657966536e-05,
"loss": 0.1409,
"step": 4500
},
{
"epoch": 2.3922556690094154,
"grad_norm": 2.55649733543396,
"learning_rate": 1.2470536460145818e-05,
"loss": 0.1354,
"step": 4510
},
{
"epoch": 2.3975600053043364,
"grad_norm": 2.5738770961761475,
"learning_rate": 1.2434629370576188e-05,
"loss": 0.1365,
"step": 4520
},
{
"epoch": 2.4028643415992574,
"grad_norm": 2.8223392963409424,
"learning_rate": 1.2398688881844613e-05,
"loss": 0.1331,
"step": 4530
},
{
"epoch": 2.4081686778941784,
"grad_norm": 2.891040325164795,
"learning_rate": 1.236271548699625e-05,
"loss": 0.1336,
"step": 4540
},
{
"epoch": 2.4134730141890994,
"grad_norm": 2.738555431365967,
"learning_rate": 1.2326709679527662e-05,
"loss": 0.1382,
"step": 4550
},
{
"epoch": 2.418777350484021,
"grad_norm": 2.6195318698883057,
"learning_rate": 1.229067195338007e-05,
"loss": 0.1376,
"step": 4560
},
{
"epoch": 2.424081686778942,
"grad_norm": 3.151268243789673,
"learning_rate": 1.2254602802932556e-05,
"loss": 0.1474,
"step": 4570
},
{
"epoch": 2.429386023073863,
"grad_norm": 2.687887668609619,
"learning_rate": 1.2218502722995306e-05,
"loss": 0.1337,
"step": 4580
},
{
"epoch": 2.434690359368784,
"grad_norm": 2.977978229522705,
"learning_rate": 1.2182372208802804e-05,
"loss": 0.1308,
"step": 4590
},
{
"epoch": 2.439994695663705,
"grad_norm": 2.9313466548919678,
"learning_rate": 1.2146211756007035e-05,
"loss": 0.1352,
"step": 4600
},
{
"epoch": 2.4452990319586263,
"grad_norm": 3.2692997455596924,
"learning_rate": 1.2110021860670703e-05,
"loss": 0.1345,
"step": 4610
},
{
"epoch": 2.4506033682535473,
"grad_norm": 2.7192065715789795,
"learning_rate": 1.207380301926041e-05,
"loss": 0.1386,
"step": 4620
},
{
"epoch": 2.4559077045484683,
"grad_norm": 3.0179250240325928,
"learning_rate": 1.2037555728639856e-05,
"loss": 0.1375,
"step": 4630
},
{
"epoch": 2.4612120408433893,
"grad_norm": 3.132338523864746,
"learning_rate": 1.200128048606301e-05,
"loss": 0.139,
"step": 4640
},
{
"epoch": 2.4665163771383107,
"grad_norm": 2.6613612174987793,
"learning_rate": 1.1964977789167304e-05,
"loss": 0.1374,
"step": 4650
},
{
"epoch": 2.4718207134332317,
"grad_norm": 2.995262384414673,
"learning_rate": 1.1928648135966799e-05,
"loss": 0.1402,
"step": 4660
},
{
"epoch": 2.4771250497281527,
"grad_norm": 2.70011305809021,
"learning_rate": 1.1892292024845343e-05,
"loss": 0.1353,
"step": 4670
},
{
"epoch": 2.4824293860230737,
"grad_norm": 2.9024317264556885,
"learning_rate": 1.1855909954549754e-05,
"loss": 0.1415,
"step": 4680
},
{
"epoch": 2.487733722317995,
"grad_norm": 3.167525291442871,
"learning_rate": 1.1819502424182965e-05,
"loss": 0.1358,
"step": 4690
},
{
"epoch": 2.493038058612916,
"grad_norm": 2.754595994949341,
"learning_rate": 1.178306993319718e-05,
"loss": 0.1383,
"step": 4700
},
{
"epoch": 2.498342394907837,
"grad_norm": 3.2000560760498047,
"learning_rate": 1.1746612981387016e-05,
"loss": 0.1368,
"step": 4710
},
{
"epoch": 2.503646731202758,
"grad_norm": 2.9707882404327393,
"learning_rate": 1.1710132068882663e-05,
"loss": 0.1298,
"step": 4720
},
{
"epoch": 2.508951067497679,
"grad_norm": 3.044996500015259,
"learning_rate": 1.1673627696143006e-05,
"loss": 0.134,
"step": 4730
},
{
"epoch": 2.5142554037926006,
"grad_norm": 2.9082412719726562,
"learning_rate": 1.1637100363948767e-05,
"loss": 0.139,
"step": 4740
},
{
"epoch": 2.5195597400875216,
"grad_norm": 3.294870138168335,
"learning_rate": 1.1600550573395639e-05,
"loss": 0.1407,
"step": 4750
},
{
"epoch": 2.5248640763824426,
"grad_norm": 2.9033164978027344,
"learning_rate": 1.1563978825887403e-05,
"loss": 0.1336,
"step": 4760
},
{
"epoch": 2.5301684126773636,
"grad_norm": 3.089531183242798,
"learning_rate": 1.152738562312905e-05,
"loss": 0.1344,
"step": 4770
},
{
"epoch": 2.5354727489722846,
"grad_norm": 3.015316963195801,
"learning_rate": 1.149077146711991e-05,
"loss": 0.1361,
"step": 4780
},
{
"epoch": 2.540777085267206,
"grad_norm": 2.8561995029449463,
"learning_rate": 1.1454136860146757e-05,
"loss": 0.1305,
"step": 4790
},
{
"epoch": 2.546081421562127,
"grad_norm": 3.1055350303649902,
"learning_rate": 1.141748230477691e-05,
"loss": 0.1332,
"step": 4800
},
{
"epoch": 2.551385757857048,
"grad_norm": 3.5380849838256836,
"learning_rate": 1.138080830385136e-05,
"loss": 0.1342,
"step": 4810
},
{
"epoch": 2.5566900941519695,
"grad_norm": 2.7498412132263184,
"learning_rate": 1.134411536047785e-05,
"loss": 0.1383,
"step": 4820
},
{
"epoch": 2.5619944304468905,
"grad_norm": 3.1483683586120605,
"learning_rate": 1.1307403978023985e-05,
"loss": 0.1364,
"step": 4830
},
{
"epoch": 2.5672987667418115,
"grad_norm": 2.886155128479004,
"learning_rate": 1.127067466011033e-05,
"loss": 0.1355,
"step": 4840
},
{
"epoch": 2.5726031030367325,
"grad_norm": 3.173999786376953,
"learning_rate": 1.1233927910603486e-05,
"loss": 0.132,
"step": 4850
},
{
"epoch": 2.5779074393316534,
"grad_norm": 3.0697178840637207,
"learning_rate": 1.1197164233609195e-05,
"loss": 0.1379,
"step": 4860
},
{
"epoch": 2.583211775626575,
"grad_norm": 2.831016778945923,
"learning_rate": 1.1160384133465413e-05,
"loss": 0.1311,
"step": 4870
},
{
"epoch": 2.588516111921496,
"grad_norm": 3.2016053199768066,
"learning_rate": 1.1123588114735394e-05,
"loss": 0.1378,
"step": 4880
},
{
"epoch": 2.593820448216417,
"grad_norm": 3.0537898540496826,
"learning_rate": 1.108677668220077e-05,
"loss": 0.1367,
"step": 4890
},
{
"epoch": 2.599124784511338,
"grad_norm": 2.7495782375335693,
"learning_rate": 1.1049950340854629e-05,
"loss": 0.1347,
"step": 4900
},
{
"epoch": 2.604429120806259,
"grad_norm": 3.102475881576538,
"learning_rate": 1.1013109595894578e-05,
"loss": 0.1307,
"step": 4910
},
{
"epoch": 2.6097334571011803,
"grad_norm": 2.821542978286743,
"learning_rate": 1.0976254952715821e-05,
"loss": 0.1356,
"step": 4920
},
{
"epoch": 2.6150377933961013,
"grad_norm": 2.982942819595337,
"learning_rate": 1.0939386916904227e-05,
"loss": 0.138,
"step": 4930
},
{
"epoch": 2.6203421296910223,
"grad_norm": 2.680748701095581,
"learning_rate": 1.0902505994229377e-05,
"loss": 0.1361,
"step": 4940
},
{
"epoch": 2.6256464659859438,
"grad_norm": 2.7981534004211426,
"learning_rate": 1.0865612690637657e-05,
"loss": 0.1361,
"step": 4950
},
{
"epoch": 2.6309508022808648,
"grad_norm": 2.9631705284118652,
"learning_rate": 1.0828707512245285e-05,
"loss": 0.1325,
"step": 4960
},
{
"epoch": 2.6362551385757858,
"grad_norm": 3.211549997329712,
"learning_rate": 1.0791790965331388e-05,
"loss": 0.1308,
"step": 4970
},
{
"epoch": 2.6415594748707067,
"grad_norm": 3.033447027206421,
"learning_rate": 1.075486355633105e-05,
"loss": 0.1334,
"step": 4980
},
{
"epoch": 2.6468638111656277,
"grad_norm": 3.050119638442993,
"learning_rate": 1.0717925791828362e-05,
"loss": 0.1373,
"step": 4990
},
{
"epoch": 2.652168147460549,
"grad_norm": 2.9277257919311523,
"learning_rate": 1.0680978178549488e-05,
"loss": 0.1349,
"step": 5000
},
{
"epoch": 2.65747248375547,
"grad_norm": 3.1963348388671875,
"learning_rate": 1.0644021223355679e-05,
"loss": 0.134,
"step": 5010
},
{
"epoch": 2.662776820050391,
"grad_norm": 2.882453441619873,
"learning_rate": 1.060705543323636e-05,
"loss": 0.1337,
"step": 5020
},
{
"epoch": 2.668081156345312,
"grad_norm": 2.7132582664489746,
"learning_rate": 1.0570081315302152e-05,
"loss": 0.1386,
"step": 5030
},
{
"epoch": 2.673385492640233,
"grad_norm": 2.729783535003662,
"learning_rate": 1.0533099376777922e-05,
"loss": 0.1344,
"step": 5040
},
{
"epoch": 2.6786898289351546,
"grad_norm": 3.0825912952423096,
"learning_rate": 1.0496110124995814e-05,
"loss": 0.1321,
"step": 5050
},
{
"epoch": 2.6839941652300756,
"grad_norm": 2.804955005645752,
"learning_rate": 1.0459114067388308e-05,
"loss": 0.1308,
"step": 5060
},
{
"epoch": 2.6892985015249966,
"grad_norm": 2.9028618335723877,
"learning_rate": 1.0422111711481246e-05,
"loss": 0.1356,
"step": 5070
},
{
"epoch": 2.694602837819918,
"grad_norm": 3.0055623054504395,
"learning_rate": 1.0385103564886869e-05,
"loss": 0.1338,
"step": 5080
},
{
"epoch": 2.6999071741148386,
"grad_norm": 2.8479676246643066,
"learning_rate": 1.0348090135296865e-05,
"loss": 0.1324,
"step": 5090
},
{
"epoch": 2.70521151040976,
"grad_norm": 3.0957729816436768,
"learning_rate": 1.0311071930475382e-05,
"loss": 0.1368,
"step": 5100
},
{
"epoch": 2.710515846704681,
"grad_norm": 2.8696446418762207,
"learning_rate": 1.0274049458252091e-05,
"loss": 0.1292,
"step": 5110
},
{
"epoch": 2.715820182999602,
"grad_norm": 2.482884168624878,
"learning_rate": 1.0237023226515197e-05,
"loss": 0.1345,
"step": 5120
},
{
"epoch": 2.7211245192945235,
"grad_norm": 2.8822643756866455,
"learning_rate": 1.019999374320448e-05,
"loss": 0.1282,
"step": 5130
},
{
"epoch": 2.7264288555894445,
"grad_norm": 2.7656660079956055,
"learning_rate": 1.0162961516304333e-05,
"loss": 0.1256,
"step": 5140
},
{
"epoch": 2.7317331918843655,
"grad_norm": 2.6746714115142822,
"learning_rate": 1.0125927053836773e-05,
"loss": 0.1309,
"step": 5150
},
{
"epoch": 2.7370375281792865,
"grad_norm": 2.917820930480957,
"learning_rate": 1.0088890863854497e-05,
"loss": 0.134,
"step": 5160
},
{
"epoch": 2.7423418644742075,
"grad_norm": 2.9841034412384033,
"learning_rate": 1.0051853454433902e-05,
"loss": 0.1326,
"step": 5170
},
{
"epoch": 2.747646200769129,
"grad_norm": 3.027301549911499,
"learning_rate": 1.0014815333668101e-05,
"loss": 0.1286,
"step": 5180
},
{
"epoch": 2.75295053706405,
"grad_norm": 3.0368714332580566,
"learning_rate": 9.97777700965998e-06,
"loss": 0.1368,
"step": 5190
},
{
"epoch": 2.758254873358971,
"grad_norm": 2.925471782684326,
"learning_rate": 9.940738990515202e-06,
"loss": 0.1326,
"step": 5200
},
{
"epoch": 2.763559209653892,
"grad_norm": 2.8439114093780518,
"learning_rate": 9.903701784335256e-06,
"loss": 0.1254,
"step": 5210
},
{
"epoch": 2.768863545948813,
"grad_norm": 2.9670259952545166,
"learning_rate": 9.866665899210472e-06,
"loss": 0.1309,
"step": 5220
},
{
"epoch": 2.7741678822437343,
"grad_norm": 3.0596694946289062,
"learning_rate": 9.829631843213059e-06,
"loss": 0.1317,
"step": 5230
},
{
"epoch": 2.7794722185386553,
"grad_norm": 2.936415910720825,
"learning_rate": 9.79260012439014e-06,
"loss": 0.128,
"step": 5240
},
{
"epoch": 2.7847765548335763,
"grad_norm": 2.7913734912872314,
"learning_rate": 9.755571250756761e-06,
"loss": 0.1293,
"step": 5250
},
{
"epoch": 2.7900808911284978,
"grad_norm": 2.662478446960449,
"learning_rate": 9.718545730288956e-06,
"loss": 0.1309,
"step": 5260
},
{
"epoch": 2.7953852274234188,
"grad_norm": 2.7307651042938232,
"learning_rate": 9.681524070916745e-06,
"loss": 0.1293,
"step": 5270
},
{
"epoch": 2.8006895637183398,
"grad_norm": 2.8188600540161133,
"learning_rate": 9.644506780517178e-06,
"loss": 0.1251,
"step": 5280
},
{
"epoch": 2.8059939000132608,
"grad_norm": 2.798912286758423,
"learning_rate": 9.607494366907384e-06,
"loss": 0.1342,
"step": 5290
},
{
"epoch": 2.8112982363081818,
"grad_norm": 2.9201605319976807,
"learning_rate": 9.57048733783758e-06,
"loss": 0.129,
"step": 5300
},
{
"epoch": 2.816602572603103,
"grad_norm": 2.914685010910034,
"learning_rate": 9.53348620098411e-06,
"loss": 0.1334,
"step": 5310
},
{
"epoch": 2.821906908898024,
"grad_norm": 2.8835930824279785,
"learning_rate": 9.496491463942507e-06,
"loss": 0.1331,
"step": 5320
},
{
"epoch": 2.827211245192945,
"grad_norm": 3.123211145401001,
"learning_rate": 9.459503634220488e-06,
"loss": 0.1247,
"step": 5330
},
{
"epoch": 2.832515581487866,
"grad_norm": 3.094163656234741,
"learning_rate": 9.422523219231019e-06,
"loss": 0.1308,
"step": 5340
},
{
"epoch": 2.837819917782787,
"grad_norm": 3.100656747817993,
"learning_rate": 9.385550726285357e-06,
"loss": 0.1288,
"step": 5350
},
{
"epoch": 2.8431242540777086,
"grad_norm": 2.8668949604034424,
"learning_rate": 9.348586662586072e-06,
"loss": 0.1294,
"step": 5360
},
{
"epoch": 2.8484285903726296,
"grad_norm": 2.8769168853759766,
"learning_rate": 9.311631535220096e-06,
"loss": 0.1197,
"step": 5370
},
{
"epoch": 2.8537329266675506,
"grad_norm": 3.1265363693237305,
"learning_rate": 9.274685851151777e-06,
"loss": 0.1264,
"step": 5380
},
{
"epoch": 2.859037262962472,
"grad_norm": 2.7183499336242676,
"learning_rate": 9.237750117215917e-06,
"loss": 0.1277,
"step": 5390
},
{
"epoch": 2.864341599257393,
"grad_norm": 2.801440954208374,
"learning_rate": 9.200824840110808e-06,
"loss": 0.1289,
"step": 5400
},
{
"epoch": 2.869645935552314,
"grad_norm": 2.798792600631714,
"learning_rate": 9.163910526391301e-06,
"loss": 0.1305,
"step": 5410
},
{
"epoch": 2.874950271847235,
"grad_norm": 2.9811506271362305,
"learning_rate": 9.12700768246184e-06,
"loss": 0.1265,
"step": 5420
},
{
"epoch": 2.880254608142156,
"grad_norm": 2.8695929050445557,
"learning_rate": 9.090116814569532e-06,
"loss": 0.1311,
"step": 5430
},
{
"epoch": 2.8855589444370775,
"grad_norm": 3.0875537395477295,
"learning_rate": 9.053238428797184e-06,
"loss": 0.1239,
"step": 5440
},
{
"epoch": 2.8908632807319985,
"grad_norm": 3.061387062072754,
"learning_rate": 9.016373031056365e-06,
"loss": 0.1288,
"step": 5450
},
{
"epoch": 2.8961676170269195,
"grad_norm": 2.614323139190674,
"learning_rate": 8.979521127080482e-06,
"loss": 0.1315,
"step": 5460
},
{
"epoch": 2.9014719533218405,
"grad_norm": 2.8680355548858643,
"learning_rate": 8.942683222417823e-06,
"loss": 0.125,
"step": 5470
},
{
"epoch": 2.9067762896167615,
"grad_norm": 2.898186206817627,
"learning_rate": 8.905859822424617e-06,
"loss": 0.1294,
"step": 5480
},
{
"epoch": 2.912080625911683,
"grad_norm": 2.7704014778137207,
"learning_rate": 8.869051432258137e-06,
"loss": 0.121,
"step": 5490
},
{
"epoch": 2.917384962206604,
"grad_norm": 3.1199190616607666,
"learning_rate": 8.832258556869724e-06,
"loss": 0.1312,
"step": 5500
},
{
"epoch": 2.922689298501525,
"grad_norm": 3.096050500869751,
"learning_rate": 8.795481700997886e-06,
"loss": 0.1288,
"step": 5510
},
{
"epoch": 2.9279936347964464,
"grad_norm": 2.9244625568389893,
"learning_rate": 8.75872136916138e-06,
"loss": 0.1241,
"step": 5520
},
{
"epoch": 2.933297971091367,
"grad_norm": 2.6919119358062744,
"learning_rate": 8.72197806565227e-06,
"loss": 0.1238,
"step": 5530
},
{
"epoch": 2.9386023073862884,
"grad_norm": 3.3755581378936768,
"learning_rate": 8.685252294529016e-06,
"loss": 0.1244,
"step": 5540
},
{
"epoch": 2.9439066436812094,
"grad_norm": 2.898566246032715,
"learning_rate": 8.648544559609575e-06,
"loss": 0.1223,
"step": 5550
},
{
"epoch": 2.9492109799761304,
"grad_norm": 2.690538167953491,
"learning_rate": 8.611855364464465e-06,
"loss": 0.124,
"step": 5560
},
{
"epoch": 2.954515316271052,
"grad_norm": 2.957213878631592,
"learning_rate": 8.57518521240987e-06,
"loss": 0.1265,
"step": 5570
},
{
"epoch": 2.959819652565973,
"grad_norm": 3.0998001098632812,
"learning_rate": 8.538534606500743e-06,
"loss": 0.1282,
"step": 5580
},
{
"epoch": 2.965123988860894,
"grad_norm": 2.7344188690185547,
"learning_rate": 8.50190404952388e-06,
"loss": 0.1244,
"step": 5590
},
{
"epoch": 2.970428325155815,
"grad_norm": 2.824951171875,
"learning_rate": 8.465294043991056e-06,
"loss": 0.1251,
"step": 5600
},
{
"epoch": 2.975732661450736,
"grad_norm": 2.772991418838501,
"learning_rate": 8.428705092132102e-06,
"loss": 0.1219,
"step": 5610
},
{
"epoch": 2.9810369977456572,
"grad_norm": 2.6752243041992188,
"learning_rate": 8.392137695888022e-06,
"loss": 0.1251,
"step": 5620
},
{
"epoch": 2.986341334040578,
"grad_norm": 3.0803351402282715,
"learning_rate": 8.355592356904132e-06,
"loss": 0.1318,
"step": 5630
},
{
"epoch": 2.991645670335499,
"grad_norm": 2.781517744064331,
"learning_rate": 8.319069576523136e-06,
"loss": 0.1195,
"step": 5640
},
{
"epoch": 2.99695000663042,
"grad_norm": 2.9005682468414307,
"learning_rate": 8.282569855778282e-06,
"loss": 0.1278,
"step": 5650
},
{
"epoch": 3.0022543429253417,
"grad_norm": 1.316965103149414,
"learning_rate": 8.246093695386475e-06,
"loss": 0.0921,
"step": 5660
},
{
"epoch": 3.0075586792202627,
"grad_norm": 1.6865043640136719,
"learning_rate": 8.209641595741413e-06,
"loss": 0.0469,
"step": 5670
},
{
"epoch": 3.0128630155151837,
"grad_norm": 1.917323350906372,
"learning_rate": 8.173214056906716e-06,
"loss": 0.0447,
"step": 5680
},
{
"epoch": 3.0181673518101046,
"grad_norm": 1.8996626138687134,
"learning_rate": 8.13681157860907e-06,
"loss": 0.0455,
"step": 5690
},
{
"epoch": 3.0234716881050256,
"grad_norm": 1.5856680870056152,
"learning_rate": 8.10043466023137e-06,
"loss": 0.0436,
"step": 5700
},
{
"epoch": 3.028776024399947,
"grad_norm": 1.8763635158538818,
"learning_rate": 8.064083800805875e-06,
"loss": 0.044,
"step": 5710
},
{
"epoch": 3.034080360694868,
"grad_norm": 1.6315194368362427,
"learning_rate": 8.027759499007356e-06,
"loss": 0.0485,
"step": 5720
},
{
"epoch": 3.039384696989789,
"grad_norm": 1.9602937698364258,
"learning_rate": 7.991462253146251e-06,
"loss": 0.0432,
"step": 5730
},
{
"epoch": 3.04468903328471,
"grad_norm": 1.6057361364364624,
"learning_rate": 7.955192561161841e-06,
"loss": 0.0407,
"step": 5740
},
{
"epoch": 3.0499933695796315,
"grad_norm": 1.7018084526062012,
"learning_rate": 7.918950920615412e-06,
"loss": 0.043,
"step": 5750
},
{
"epoch": 3.0552977058745525,
"grad_norm": 2.058243989944458,
"learning_rate": 7.882737828683423e-06,
"loss": 0.0433,
"step": 5760
},
{
"epoch": 3.0606020421694735,
"grad_norm": 1.7645084857940674,
"learning_rate": 7.846553782150703e-06,
"loss": 0.04,
"step": 5770
},
{
"epoch": 3.0659063784643945,
"grad_norm": 2.0199408531188965,
"learning_rate": 7.810399277403618e-06,
"loss": 0.0428,
"step": 5780
},
{
"epoch": 3.071210714759316,
"grad_norm": 2.021491050720215,
"learning_rate": 7.774274810423265e-06,
"loss": 0.0441,
"step": 5790
},
{
"epoch": 3.076515051054237,
"grad_norm": 1.8167482614517212,
"learning_rate": 7.738180876778686e-06,
"loss": 0.0421,
"step": 5800
},
{
"epoch": 3.081819387349158,
"grad_norm": 2.0647082328796387,
"learning_rate": 7.702117971620042e-06,
"loss": 0.0437,
"step": 5810
},
{
"epoch": 3.087123723644079,
"grad_norm": 1.6139365434646606,
"learning_rate": 7.666086589671833e-06,
"loss": 0.0427,
"step": 5820
},
{
"epoch": 3.092428059939,
"grad_norm": 1.4240131378173828,
"learning_rate": 7.630087225226126e-06,
"loss": 0.0432,
"step": 5830
},
{
"epoch": 3.0977323962339214,
"grad_norm": 1.8977570533752441,
"learning_rate": 7.594120372135743e-06,
"loss": 0.0402,
"step": 5840
},
{
"epoch": 3.1030367325288424,
"grad_norm": 1.6448214054107666,
"learning_rate": 7.558186523807509e-06,
"loss": 0.041,
"step": 5850
},
{
"epoch": 3.1083410688237634,
"grad_norm": 1.8806428909301758,
"learning_rate": 7.5222861731954856e-06,
"loss": 0.0411,
"step": 5860
},
{
"epoch": 3.1136454051186844,
"grad_norm": 1.971258521080017,
"learning_rate": 7.48641981279419e-06,
"loss": 0.042,
"step": 5870
},
{
"epoch": 3.118949741413606,
"grad_norm": 1.4597933292388916,
"learning_rate": 7.4505879346318475e-06,
"loss": 0.0396,
"step": 5880
},
{
"epoch": 3.124254077708527,
"grad_norm": 1.6617002487182617,
"learning_rate": 7.414791030263655e-06,
"loss": 0.0391,
"step": 5890
},
{
"epoch": 3.129558414003448,
"grad_norm": 1.851900577545166,
"learning_rate": 7.379029590765015e-06,
"loss": 0.0411,
"step": 5900
},
{
"epoch": 3.134862750298369,
"grad_norm": 1.9996873140335083,
"learning_rate": 7.343304106724807e-06,
"loss": 0.0398,
"step": 5910
},
{
"epoch": 3.14016708659329,
"grad_norm": 1.655671238899231,
"learning_rate": 7.307615068238676e-06,
"loss": 0.0405,
"step": 5920
},
{
"epoch": 3.1454714228882112,
"grad_norm": 1.929533839225769,
"learning_rate": 7.271962964902277e-06,
"loss": 0.041,
"step": 5930
},
{
"epoch": 3.1507757591831322,
"grad_norm": 1.8329622745513916,
"learning_rate": 7.236348285804581e-06,
"loss": 0.0408,
"step": 5940
},
{
"epoch": 3.1560800954780532,
"grad_norm": 1.4492971897125244,
"learning_rate": 7.200771519521161e-06,
"loss": 0.0417,
"step": 5950
},
{
"epoch": 3.1613844317729742,
"grad_norm": 1.6154602766036987,
"learning_rate": 7.1652331541074845e-06,
"loss": 0.0428,
"step": 5960
},
{
"epoch": 3.1666887680678957,
"grad_norm": 1.6101605892181396,
"learning_rate": 7.129733677092225e-06,
"loss": 0.0409,
"step": 5970
},
{
"epoch": 3.1719931043628167,
"grad_norm": 1.7562830448150635,
"learning_rate": 7.094273575470562e-06,
"loss": 0.0406,
"step": 5980
},
{
"epoch": 3.1772974406577377,
"grad_norm": 1.5611249208450317,
"learning_rate": 7.058853335697517e-06,
"loss": 0.0429,
"step": 5990
},
{
"epoch": 3.1826017769526587,
"grad_norm": 1.839369535446167,
"learning_rate": 7.023473443681275e-06,
"loss": 0.0396,
"step": 6000
},
{
"epoch": 3.18790611324758,
"grad_norm": 1.6211901903152466,
"learning_rate": 6.9881343847765025e-06,
"loss": 0.037,
"step": 6010
},
{
"epoch": 3.193210449542501,
"grad_norm": 1.5419272184371948,
"learning_rate": 6.952836643777707e-06,
"loss": 0.0405,
"step": 6020
},
{
"epoch": 3.198514785837422,
"grad_norm": 1.750074863433838,
"learning_rate": 6.917580704912592e-06,
"loss": 0.0412,
"step": 6030
},
{
"epoch": 3.203819122132343,
"grad_norm": 1.8624082803726196,
"learning_rate": 6.882367051835389e-06,
"loss": 0.0429,
"step": 6040
},
{
"epoch": 3.209123458427264,
"grad_norm": 1.7120383977890015,
"learning_rate": 6.84719616762024e-06,
"loss": 0.0408,
"step": 6050
},
{
"epoch": 3.2144277947221855,
"grad_norm": 1.9718371629714966,
"learning_rate": 6.812068534754579e-06,
"loss": 0.0404,
"step": 6060
},
{
"epoch": 3.2197321310171065,
"grad_norm": 1.9837217330932617,
"learning_rate": 6.776984635132491e-06,
"loss": 0.0425,
"step": 6070
},
{
"epoch": 3.2250364673120275,
"grad_norm": 1.57326078414917,
"learning_rate": 6.741944950048106e-06,
"loss": 0.0413,
"step": 6080
},
{
"epoch": 3.2303408036069485,
"grad_norm": 1.6847975254058838,
"learning_rate": 6.706949960189022e-06,
"loss": 0.0408,
"step": 6090
},
{
"epoch": 3.23564513990187,
"grad_norm": 1.8581093549728394,
"learning_rate": 6.672000145629671e-06,
"loss": 0.0427,
"step": 6100
},
{
"epoch": 3.240949476196791,
"grad_norm": 1.6762524843215942,
"learning_rate": 6.637095985824771e-06,
"loss": 0.0389,
"step": 6110
},
{
"epoch": 3.246253812491712,
"grad_norm": 1.8649601936340332,
"learning_rate": 6.602237959602715e-06,
"loss": 0.0404,
"step": 6120
},
{
"epoch": 3.251558148786633,
"grad_norm": 1.706531047821045,
"learning_rate": 6.567426545159024e-06,
"loss": 0.0414,
"step": 6130
},
{
"epoch": 3.2568624850815544,
"grad_norm": 1.5406407117843628,
"learning_rate": 6.532662220049788e-06,
"loss": 0.039,
"step": 6140
},
{
"epoch": 3.2621668213764754,
"grad_norm": 1.4194878339767456,
"learning_rate": 6.4979454611851e-06,
"loss": 0.0385,
"step": 6150
},
{
"epoch": 3.2674711576713964,
"grad_norm": 1.7470570802688599,
"learning_rate": 6.463276744822517e-06,
"loss": 0.0409,
"step": 6160
},
{
"epoch": 3.2727754939663174,
"grad_norm": 1.7166268825531006,
"learning_rate": 6.428656546560547e-06,
"loss": 0.0402,
"step": 6170
},
{
"epoch": 3.2780798302612384,
"grad_norm": 1.7307934761047363,
"learning_rate": 6.394085341332092e-06,
"loss": 0.0411,
"step": 6180
},
{
"epoch": 3.28338416655616,
"grad_norm": 1.5930451154708862,
"learning_rate": 6.359563603397956e-06,
"loss": 0.0368,
"step": 6190
},
{
"epoch": 3.288688502851081,
"grad_norm": 1.637946605682373,
"learning_rate": 6.325091806340335e-06,
"loss": 0.0409,
"step": 6200
},
{
"epoch": 3.293992839146002,
"grad_norm": 1.4354368448257446,
"learning_rate": 6.290670423056313e-06,
"loss": 0.0382,
"step": 6210
},
{
"epoch": 3.299297175440923,
"grad_norm": 1.6184732913970947,
"learning_rate": 6.256299925751374e-06,
"loss": 0.0394,
"step": 6220
},
{
"epoch": 3.304601511735844,
"grad_norm": 1.510335087776184,
"learning_rate": 6.221980785932945e-06,
"loss": 0.037,
"step": 6230
},
{
"epoch": 3.3099058480307653,
"grad_norm": 1.6335468292236328,
"learning_rate": 6.187713474403895e-06,
"loss": 0.0391,
"step": 6240
},
{
"epoch": 3.3152101843256863,
"grad_norm": 1.9890940189361572,
"learning_rate": 6.1534984612561e-06,
"loss": 0.0424,
"step": 6250
},
{
"epoch": 3.3205145206206073,
"grad_norm": 1.5295640230178833,
"learning_rate": 6.119336215863988e-06,
"loss": 0.038,
"step": 6260
},
{
"epoch": 3.3258188569155283,
"grad_norm": 1.9646100997924805,
"learning_rate": 6.0852272068780975e-06,
"loss": 0.0402,
"step": 6270
},
{
"epoch": 3.3311231932104497,
"grad_norm": 1.5410913228988647,
"learning_rate": 6.051171902218651e-06,
"loss": 0.0411,
"step": 6280
},
{
"epoch": 3.3364275295053707,
"grad_norm": 1.6982203722000122,
"learning_rate": 6.017170769069134e-06,
"loss": 0.04,
"step": 6290
},
{
"epoch": 3.3417318658002917,
"grad_norm": 1.836172103881836,
"learning_rate": 5.983224273869881e-06,
"loss": 0.0376,
"step": 6300
},
{
"epoch": 3.3470362020952127,
"grad_norm": 1.5369129180908203,
"learning_rate": 5.949332882311697e-06,
"loss": 0.0408,
"step": 6310
},
{
"epoch": 3.352340538390134,
"grad_norm": 1.420976996421814,
"learning_rate": 5.915497059329442e-06,
"loss": 0.0422,
"step": 6320
},
{
"epoch": 3.357644874685055,
"grad_norm": 1.4260170459747314,
"learning_rate": 5.881717269095668e-06,
"loss": 0.0359,
"step": 6330
},
{
"epoch": 3.362949210979976,
"grad_norm": 1.5417782068252563,
"learning_rate": 5.8479939750142535e-06,
"loss": 0.0379,
"step": 6340
},
{
"epoch": 3.368253547274897,
"grad_norm": 2.017428159713745,
"learning_rate": 5.814327639714037e-06,
"loss": 0.0378,
"step": 6350
},
{
"epoch": 3.373557883569818,
"grad_norm": 1.7676209211349487,
"learning_rate": 5.7807187250424665e-06,
"loss": 0.0403,
"step": 6360
},
{
"epoch": 3.3788622198647396,
"grad_norm": 1.666673183441162,
"learning_rate": 5.7471676920593015e-06,
"loss": 0.04,
"step": 6370
},
{
"epoch": 3.3841665561596606,
"grad_norm": 1.7691739797592163,
"learning_rate": 5.713675001030221e-06,
"loss": 0.0368,
"step": 6380
},
{
"epoch": 3.3894708924545816,
"grad_norm": 1.5981837511062622,
"learning_rate": 5.680241111420572e-06,
"loss": 0.0366,
"step": 6390
},
{
"epoch": 3.3947752287495025,
"grad_norm": 1.4606654644012451,
"learning_rate": 5.646866481889035e-06,
"loss": 0.0399,
"step": 6400
},
{
"epoch": 3.400079565044424,
"grad_norm": 1.8489834070205688,
"learning_rate": 5.613551570281337e-06,
"loss": 0.0363,
"step": 6410
},
{
"epoch": 3.405383901339345,
"grad_norm": 1.9334096908569336,
"learning_rate": 5.580296833623977e-06,
"loss": 0.0405,
"step": 6420
},
{
"epoch": 3.410688237634266,
"grad_norm": 1.5512571334838867,
"learning_rate": 5.547102728117939e-06,
"loss": 0.0389,
"step": 6430
},
{
"epoch": 3.415992573929187,
"grad_norm": 1.4512279033660889,
"learning_rate": 5.513969709132458e-06,
"loss": 0.0383,
"step": 6440
},
{
"epoch": 3.4212969102241084,
"grad_norm": 1.7268502712249756,
"learning_rate": 5.48089823119876e-06,
"loss": 0.0382,
"step": 6450
},
{
"epoch": 3.4266012465190294,
"grad_norm": 2.3311150074005127,
"learning_rate": 5.447888748003827e-06,
"loss": 0.0398,
"step": 6460
},
{
"epoch": 3.4319055828139504,
"grad_norm": 1.7974883317947388,
"learning_rate": 5.414941712384161e-06,
"loss": 0.0373,
"step": 6470
},
{
"epoch": 3.4372099191088714,
"grad_norm": 1.693633794784546,
"learning_rate": 5.382057576319613e-06,
"loss": 0.04,
"step": 6480
},
{
"epoch": 3.4425142554037924,
"grad_norm": 2.0825114250183105,
"learning_rate": 5.349236790927122e-06,
"loss": 0.0394,
"step": 6490
},
{
"epoch": 3.447818591698714,
"grad_norm": 1.58704674243927,
"learning_rate": 5.316479806454578e-06,
"loss": 0.0375,
"step": 6500
},
{
"epoch": 3.453122927993635,
"grad_norm": 1.7522614002227783,
"learning_rate": 5.283787072274624e-06,
"loss": 0.039,
"step": 6510
},
{
"epoch": 3.458427264288556,
"grad_norm": 1.915915846824646,
"learning_rate": 5.251159036878493e-06,
"loss": 0.0416,
"step": 6520
},
{
"epoch": 3.463731600583477,
"grad_norm": 1.9502066373825073,
"learning_rate": 5.2185961478698435e-06,
"loss": 0.0356,
"step": 6530
},
{
"epoch": 3.469035936878398,
"grad_norm": 1.562122106552124,
"learning_rate": 5.186098851958656e-06,
"loss": 0.0394,
"step": 6540
},
{
"epoch": 3.4743402731733193,
"grad_norm": 1.9908523559570312,
"learning_rate": 5.1536675949550545e-06,
"loss": 0.0404,
"step": 6550
},
{
"epoch": 3.4796446094682403,
"grad_norm": 1.6014606952667236,
"learning_rate": 5.12130282176323e-06,
"loss": 0.039,
"step": 6560
},
{
"epoch": 3.4849489457631613,
"grad_norm": 1.838715672492981,
"learning_rate": 5.089004976375322e-06,
"loss": 0.0374,
"step": 6570
},
{
"epoch": 3.4902532820580827,
"grad_norm": 1.7421026229858398,
"learning_rate": 5.056774501865329e-06,
"loss": 0.0371,
"step": 6580
},
{
"epoch": 3.4955576183530037,
"grad_norm": 1.673134684562683,
"learning_rate": 5.02461184038303e-06,
"loss": 0.0383,
"step": 6590
},
{
"epoch": 3.5008619546479247,
"grad_norm": 1.484175443649292,
"learning_rate": 4.992517433147922e-06,
"loss": 0.0357,
"step": 6600
},
{
"epoch": 3.5061662909428457,
"grad_norm": 1.5539370775222778,
"learning_rate": 4.960491720443151e-06,
"loss": 0.0395,
"step": 6610
},
{
"epoch": 3.5114706272377667,
"grad_norm": 1.7498281002044678,
"learning_rate": 4.92853514160951e-06,
"loss": 0.0403,
"step": 6620
},
{
"epoch": 3.516774963532688,
"grad_norm": 1.8837767839431763,
"learning_rate": 4.8966481350393655e-06,
"loss": 0.0409,
"step": 6630
},
{
"epoch": 3.522079299827609,
"grad_norm": 1.9168699979782104,
"learning_rate": 4.864831138170675e-06,
"loss": 0.0405,
"step": 6640
},
{
"epoch": 3.52738363612253,
"grad_norm": 1.812839388847351,
"learning_rate": 4.833084587480975e-06,
"loss": 0.0371,
"step": 6650
},
{
"epoch": 3.532687972417451,
"grad_norm": 1.71259605884552,
"learning_rate": 4.801408918481402e-06,
"loss": 0.0388,
"step": 6660
},
{
"epoch": 3.537992308712372,
"grad_norm": 1.6360466480255127,
"learning_rate": 4.769804565710693e-06,
"loss": 0.0371,
"step": 6670
},
{
"epoch": 3.5432966450072936,
"grad_norm": 1.8788199424743652,
"learning_rate": 4.7382719627292595e-06,
"loss": 0.0388,
"step": 6680
},
{
"epoch": 3.5486009813022146,
"grad_norm": 1.5442296266555786,
"learning_rate": 4.7068115421132146e-06,
"loss": 0.0336,
"step": 6690
},
{
"epoch": 3.5539053175971356,
"grad_norm": 1.5917887687683105,
"learning_rate": 4.675423735448448e-06,
"loss": 0.0393,
"step": 6700
},
{
"epoch": 3.559209653892057,
"grad_norm": 2.0295755863189697,
"learning_rate": 4.644108973324708e-06,
"loss": 0.036,
"step": 6710
},
{
"epoch": 3.564513990186978,
"grad_norm": 2.050459861755371,
"learning_rate": 4.612867685329679e-06,
"loss": 0.0406,
"step": 6720
},
{
"epoch": 3.569818326481899,
"grad_norm": 1.806357502937317,
"learning_rate": 4.58170030004311e-06,
"loss": 0.036,
"step": 6730
},
{
"epoch": 3.57512266277682,
"grad_norm": 1.5718046426773071,
"learning_rate": 4.550607245030923e-06,
"loss": 0.0336,
"step": 6740
},
{
"epoch": 3.580426999071741,
"grad_norm": 1.753868818283081,
"learning_rate": 4.519588946839346e-06,
"loss": 0.0396,
"step": 6750
},
{
"epoch": 3.5857313353666624,
"grad_norm": 1.49803626537323,
"learning_rate": 4.488645830989069e-06,
"loss": 0.0371,
"step": 6760
},
{
"epoch": 3.5910356716615834,
"grad_norm": 1.698613166809082,
"learning_rate": 4.457778321969404e-06,
"loss": 0.0356,
"step": 6770
},
{
"epoch": 3.5963400079565044,
"grad_norm": 1.8640644550323486,
"learning_rate": 4.426986843232443e-06,
"loss": 0.0393,
"step": 6780
},
{
"epoch": 3.6016443442514254,
"grad_norm": 1.7568309307098389,
"learning_rate": 4.3962718171872945e-06,
"loss": 0.0375,
"step": 6790
},
{
"epoch": 3.6069486805463464,
"grad_norm": 1.7229827642440796,
"learning_rate": 4.3656336651942355e-06,
"loss": 0.0417,
"step": 6800
},
{
"epoch": 3.612253016841268,
"grad_norm": 1.8499397039413452,
"learning_rate": 4.3350728075589676e-06,
"loss": 0.0408,
"step": 6810
},
{
"epoch": 3.617557353136189,
"grad_norm": 1.4821689128875732,
"learning_rate": 4.304589663526838e-06,
"loss": 0.0382,
"step": 6820
},
{
"epoch": 3.62286168943111,
"grad_norm": 1.4930757284164429,
"learning_rate": 4.274184651277092e-06,
"loss": 0.0378,
"step": 6830
},
{
"epoch": 3.6281660257260313,
"grad_norm": 1.834444522857666,
"learning_rate": 4.243858187917117e-06,
"loss": 0.0357,
"step": 6840
},
{
"epoch": 3.633470362020952,
"grad_norm": 1.4575175046920776,
"learning_rate": 4.213610689476767e-06,
"loss": 0.0392,
"step": 6850
},
{
"epoch": 3.6387746983158733,
"grad_norm": 1.5352399349212646,
"learning_rate": 4.183442570902597e-06,
"loss": 0.0358,
"step": 6860
},
{
"epoch": 3.6440790346107943,
"grad_norm": 1.5042182207107544,
"learning_rate": 4.1533542460522155e-06,
"loss": 0.0366,
"step": 6870
},
{
"epoch": 3.6493833709057153,
"grad_norm": 1.5749527215957642,
"learning_rate": 4.123346127688587e-06,
"loss": 0.0385,
"step": 6880
},
{
"epoch": 3.6546877072006367,
"grad_norm": 1.647090196609497,
"learning_rate": 4.093418627474373e-06,
"loss": 0.037,
"step": 6890
},
{
"epoch": 3.6599920434955577,
"grad_norm": 1.7862918376922607,
"learning_rate": 4.063572155966274e-06,
"loss": 0.0366,
"step": 6900
},
{
"epoch": 3.6652963797904787,
"grad_norm": 1.605738639831543,
"learning_rate": 4.033807122609435e-06,
"loss": 0.0373,
"step": 6910
},
{
"epoch": 3.6706007160853997,
"grad_norm": 1.787914514541626,
"learning_rate": 4.0041239357317725e-06,
"loss": 0.0348,
"step": 6920
},
{
"epoch": 3.6759050523803207,
"grad_norm": 1.8057564496994019,
"learning_rate": 3.97452300253842e-06,
"loss": 0.0349,
"step": 6930
},
{
"epoch": 3.681209388675242,
"grad_norm": 1.519795536994934,
"learning_rate": 3.945004729106116e-06,
"loss": 0.0354,
"step": 6940
},
{
"epoch": 3.686513724970163,
"grad_norm": 1.7589900493621826,
"learning_rate": 3.915569520377648e-06,
"loss": 0.0385,
"step": 6950
},
{
"epoch": 3.691818061265084,
"grad_norm": 1.6296403408050537,
"learning_rate": 3.886217780156285e-06,
"loss": 0.0359,
"step": 6960
},
{
"epoch": 3.697122397560005,
"grad_norm": 1.6236915588378906,
"learning_rate": 3.856949911100249e-06,
"loss": 0.0365,
"step": 6970
},
{
"epoch": 3.702426733854926,
"grad_norm": 1.5271315574645996,
"learning_rate": 3.827766314717175e-06,
"loss": 0.0355,
"step": 6980
},
{
"epoch": 3.7077310701498476,
"grad_norm": 1.6985539197921753,
"learning_rate": 3.7986673913586246e-06,
"loss": 0.0376,
"step": 6990
},
{
"epoch": 3.7130354064447686,
"grad_norm": 1.4878424406051636,
"learning_rate": 3.7696535402145807e-06,
"loss": 0.0363,
"step": 7000
},
{
"epoch": 3.7183397427396896,
"grad_norm": 1.5377414226531982,
"learning_rate": 3.7407251593079697e-06,
"loss": 0.0355,
"step": 7010
},
{
"epoch": 3.723644079034611,
"grad_norm": 1.6395467519760132,
"learning_rate": 3.7118826454892132e-06,
"loss": 0.0353,
"step": 7020
},
{
"epoch": 3.728948415329532,
"grad_norm": 1.8777011632919312,
"learning_rate": 3.6831263944307626e-06,
"loss": 0.0339,
"step": 7030
},
{
"epoch": 3.734252751624453,
"grad_norm": 1.419209361076355,
"learning_rate": 3.654456800621695e-06,
"loss": 0.036,
"step": 7040
},
{
"epoch": 3.739557087919374,
"grad_norm": 1.9455891847610474,
"learning_rate": 3.6258742573622887e-06,
"loss": 0.0374,
"step": 7050
},
{
"epoch": 3.744861424214295,
"grad_norm": 1.8757051229476929,
"learning_rate": 3.5973791567586313e-06,
"loss": 0.0369,
"step": 7060
},
{
"epoch": 3.7501657605092165,
"grad_norm": 1.5636701583862305,
"learning_rate": 3.5689718897172265e-06,
"loss": 0.0362,
"step": 7070
},
{
"epoch": 3.7554700968041375,
"grad_norm": 1.814666986465454,
"learning_rate": 3.540652845939667e-06,
"loss": 0.0348,
"step": 7080
},
{
"epoch": 3.7607744330990585,
"grad_norm": 1.4877405166625977,
"learning_rate": 3.5124224139172413e-06,
"loss": 0.0364,
"step": 7090
},
{
"epoch": 3.7660787693939795,
"grad_norm": 1.3752930164337158,
"learning_rate": 3.4842809809256527e-06,
"loss": 0.036,
"step": 7100
},
{
"epoch": 3.7713831056889005,
"grad_norm": 1.8896254301071167,
"learning_rate": 3.4562289330196586e-06,
"loss": 0.036,
"step": 7110
},
{
"epoch": 3.776687441983822,
"grad_norm": 1.7569184303283691,
"learning_rate": 3.428266655027812e-06,
"loss": 0.0383,
"step": 7120
},
{
"epoch": 3.781991778278743,
"grad_norm": 1.7225362062454224,
"learning_rate": 3.4003945305471676e-06,
"loss": 0.039,
"step": 7130
},
{
"epoch": 3.787296114573664,
"grad_norm": 1.4813677072525024,
"learning_rate": 3.3726129419380203e-06,
"loss": 0.0354,
"step": 7140
},
{
"epoch": 3.7926004508685853,
"grad_norm": 1.5851385593414307,
"learning_rate": 3.344922270318649e-06,
"loss": 0.0368,
"step": 7150
},
{
"epoch": 3.7979047871635063,
"grad_norm": 1.8603835105895996,
"learning_rate": 3.31732289556012e-06,
"loss": 0.0375,
"step": 7160
},
{
"epoch": 3.8032091234584273,
"grad_norm": 1.8360573053359985,
"learning_rate": 3.289815196281033e-06,
"loss": 0.0382,
"step": 7170
},
{
"epoch": 3.8085134597533483,
"grad_norm": 1.8814833164215088,
"learning_rate": 3.2623995498423622e-06,
"loss": 0.0367,
"step": 7180
},
{
"epoch": 3.8138177960482693,
"grad_norm": 1.66216242313385,
"learning_rate": 3.235076332342264e-06,
"loss": 0.0353,
"step": 7190
},
{
"epoch": 3.8191221323431908,
"grad_norm": 1.5455678701400757,
"learning_rate": 3.207845918610921e-06,
"loss": 0.0376,
"step": 7200
},
{
"epoch": 3.8244264686381118,
"grad_norm": 1.4232027530670166,
"learning_rate": 3.1807086822053867e-06,
"loss": 0.034,
"step": 7210
},
{
"epoch": 3.8297308049330328,
"grad_norm": 1.4929195642471313,
"learning_rate": 3.153664995404496e-06,
"loss": 0.0328,
"step": 7220
},
{
"epoch": 3.8350351412279537,
"grad_norm": 1.6196081638336182,
"learning_rate": 3.126715229203713e-06,
"loss": 0.0346,
"step": 7230
},
{
"epoch": 3.8403394775228747,
"grad_norm": 1.8542481660842896,
"learning_rate": 3.099859753310075e-06,
"loss": 0.0329,
"step": 7240
},
{
"epoch": 3.845643813817796,
"grad_norm": 1.6762479543685913,
"learning_rate": 3.0730989361371056e-06,
"loss": 0.0335,
"step": 7250
},
{
"epoch": 3.850948150112717,
"grad_norm": 1.3557182550430298,
"learning_rate": 3.0464331447997686e-06,
"loss": 0.0345,
"step": 7260
},
{
"epoch": 3.856252486407638,
"grad_norm": 1.3587796688079834,
"learning_rate": 3.0198627451094264e-06,
"loss": 0.0341,
"step": 7270
},
{
"epoch": 3.861556822702559,
"grad_norm": 1.8085464239120483,
"learning_rate": 2.993388101568816e-06,
"loss": 0.0339,
"step": 7280
},
{
"epoch": 3.86686115899748,
"grad_norm": 1.6431082487106323,
"learning_rate": 2.9670095773670626e-06,
"loss": 0.0328,
"step": 7290
},
{
"epoch": 3.8721654952924016,
"grad_norm": 1.445833683013916,
"learning_rate": 2.9407275343746886e-06,
"loss": 0.0364,
"step": 7300
},
{
"epoch": 3.8774698315873226,
"grad_norm": 1.570131778717041,
"learning_rate": 2.9145423331386546e-06,
"loss": 0.0384,
"step": 7310
},
{
"epoch": 3.8827741678822436,
"grad_norm": 1.5940959453582764,
"learning_rate": 2.888454332877396e-06,
"loss": 0.0324,
"step": 7320
},
{
"epoch": 3.888078504177165,
"grad_norm": 1.5815328359603882,
"learning_rate": 2.8624638914759306e-06,
"loss": 0.0375,
"step": 7330
},
{
"epoch": 3.893382840472086,
"grad_norm": 2.0120179653167725,
"learning_rate": 2.8365713654809058e-06,
"loss": 0.0356,
"step": 7340
},
{
"epoch": 3.898687176767007,
"grad_norm": 1.5655955076217651,
"learning_rate": 2.8107771100957393e-06,
"loss": 0.0339,
"step": 7350
},
{
"epoch": 3.903991513061928,
"grad_norm": 1.696489930152893,
"learning_rate": 2.785081479175734e-06,
"loss": 0.037,
"step": 7360
},
{
"epoch": 3.909295849356849,
"grad_norm": 1.6997973918914795,
"learning_rate": 2.75948482522323e-06,
"loss": 0.0362,
"step": 7370
},
{
"epoch": 3.9146001856517705,
"grad_norm": 1.550915241241455,
"learning_rate": 2.7339874993827476e-06,
"loss": 0.0323,
"step": 7380
},
{
"epoch": 3.9199045219466915,
"grad_norm": 1.8627867698669434,
"learning_rate": 2.708589851436211e-06,
"loss": 0.0336,
"step": 7390
},
{
"epoch": 3.9252088582416125,
"grad_norm": 1.4691085815429688,
"learning_rate": 2.6832922297981044e-06,
"loss": 0.0336,
"step": 7400
},
{
"epoch": 3.9305131945365335,
"grad_norm": 1.572690725326538,
"learning_rate": 2.6580949815107248e-06,
"loss": 0.036,
"step": 7410
},
{
"epoch": 3.9358175308314545,
"grad_norm": 1.718151569366455,
"learning_rate": 2.6329984522394057e-06,
"loss": 0.0348,
"step": 7420
},
{
"epoch": 3.941121867126376,
"grad_norm": 1.8015989065170288,
"learning_rate": 2.6080029862677813e-06,
"loss": 0.0331,
"step": 7430
},
{
"epoch": 3.946426203421297,
"grad_norm": 2.0196962356567383,
"learning_rate": 2.5831089264930607e-06,
"loss": 0.0365,
"step": 7440
},
{
"epoch": 3.951730539716218,
"grad_norm": 1.51595938205719,
"learning_rate": 2.5583166144213265e-06,
"loss": 0.0342,
"step": 7450
},
{
"epoch": 3.9570348760111393,
"grad_norm": 1.6583274602890015,
"learning_rate": 2.5336263901628387e-06,
"loss": 0.0345,
"step": 7460
},
{
"epoch": 3.9623392123060603,
"grad_norm": 1.7668447494506836,
"learning_rate": 2.5090385924273953e-06,
"loss": 0.0332,
"step": 7470
},
{
"epoch": 3.9676435486009813,
"grad_norm": 1.3587608337402344,
"learning_rate": 2.4845535585196503e-06,
"loss": 0.0314,
"step": 7480
},
{
"epoch": 3.9729478848959023,
"grad_norm": 1.6988006830215454,
"learning_rate": 2.4601716243345176e-06,
"loss": 0.0343,
"step": 7490
},
{
"epoch": 3.9782522211908233,
"grad_norm": 1.744411826133728,
"learning_rate": 2.435893124352545e-06,
"loss": 0.0347,
"step": 7500
},
{
"epoch": 3.9835565574857448,
"grad_norm": 1.4949431419372559,
"learning_rate": 2.4117183916353357e-06,
"loss": 0.0341,
"step": 7510
},
{
"epoch": 3.9888608937806658,
"grad_norm": 1.8944114446640015,
"learning_rate": 2.3876477578209657e-06,
"loss": 0.0351,
"step": 7520
},
{
"epoch": 3.9941652300755868,
"grad_norm": 1.938253402709961,
"learning_rate": 2.363681553119449e-06,
"loss": 0.034,
"step": 7530
},
{
"epoch": 3.9994695663705078,
"grad_norm": 1.3915281295776367,
"learning_rate": 2.339820106308204e-06,
"loss": 0.0325,
"step": 7540
},
{
"epoch": 4.004773902665429,
"grad_norm": 0.8860509395599365,
"learning_rate": 2.3160637447275347e-06,
"loss": 0.0159,
"step": 7550
},
{
"epoch": 4.01007823896035,
"grad_norm": 0.8337376117706299,
"learning_rate": 2.292412794276152e-06,
"loss": 0.0137,
"step": 7560
},
{
"epoch": 4.015382575255271,
"grad_norm": 0.7365363836288452,
"learning_rate": 2.268867579406697e-06,
"loss": 0.0127,
"step": 7570
},
{
"epoch": 4.020686911550192,
"grad_norm": 0.7730606198310852,
"learning_rate": 2.245428423121282e-06,
"loss": 0.013,
"step": 7580
},
{
"epoch": 4.025991247845114,
"grad_norm": 0.6678166389465332,
"learning_rate": 2.2220956469670774e-06,
"loss": 0.0131,
"step": 7590
},
{
"epoch": 4.031295584140034,
"grad_norm": 0.9078973531723022,
"learning_rate": 2.1988695710318875e-06,
"loss": 0.0125,
"step": 7600
},
{
"epoch": 4.036599920434956,
"grad_norm": 0.8652137517929077,
"learning_rate": 2.1757505139397627e-06,
"loss": 0.0129,
"step": 7610
},
{
"epoch": 4.041904256729877,
"grad_norm": 0.6685816645622253,
"learning_rate": 2.1527387928466313e-06,
"loss": 0.0111,
"step": 7620
},
{
"epoch": 4.047208593024798,
"grad_norm": 0.7991968393325806,
"learning_rate": 2.129834723435935e-06,
"loss": 0.0122,
"step": 7630
},
{
"epoch": 4.052512929319719,
"grad_norm": 0.8681981563568115,
"learning_rate": 2.1070386199143288e-06,
"loss": 0.0126,
"step": 7640
},
{
"epoch": 4.05781726561464,
"grad_norm": 0.6502617001533508,
"learning_rate": 2.0843507950073317e-06,
"loss": 0.0122,
"step": 7650
},
{
"epoch": 4.063121601909561,
"grad_norm": 0.7759711742401123,
"learning_rate": 2.061771559955066e-06,
"loss": 0.0113,
"step": 7660
},
{
"epoch": 4.0684259382044825,
"grad_norm": 0.9321727752685547,
"learning_rate": 2.0393012245079757e-06,
"loss": 0.0124,
"step": 7670
},
{
"epoch": 4.073730274499403,
"grad_norm": 0.715567946434021,
"learning_rate": 2.016940096922582e-06,
"loss": 0.0133,
"step": 7680
},
{
"epoch": 4.0790346107943245,
"grad_norm": 0.8862237930297852,
"learning_rate": 1.99468848395724e-06,
"loss": 0.0132,
"step": 7690
},
{
"epoch": 4.084338947089245,
"grad_norm": 0.6084906458854675,
"learning_rate": 1.9725466908679626e-06,
"loss": 0.0129,
"step": 7700
},
{
"epoch": 4.0896432833841665,
"grad_norm": 0.9061374068260193,
"learning_rate": 1.950515021404189e-06,
"loss": 0.0122,
"step": 7710
},
{
"epoch": 4.094947619679088,
"grad_norm": 0.8219320178031921,
"learning_rate": 1.9285937778046582e-06,
"loss": 0.0124,
"step": 7720
},
{
"epoch": 4.1002519559740085,
"grad_norm": 0.9768364429473877,
"learning_rate": 1.906783260793238e-06,
"loss": 0.0118,
"step": 7730
},
{
"epoch": 4.10555629226893,
"grad_norm": 0.9576259851455688,
"learning_rate": 1.8850837695748104e-06,
"loss": 0.0127,
"step": 7740
},
{
"epoch": 4.110860628563851,
"grad_norm": 0.5971377491950989,
"learning_rate": 1.8634956018311566e-06,
"loss": 0.0113,
"step": 7750
},
{
"epoch": 4.116164964858772,
"grad_norm": 0.8423818349838257,
"learning_rate": 1.8420190537168947e-06,
"loss": 0.0125,
"step": 7760
},
{
"epoch": 4.121469301153693,
"grad_norm": 0.8644461631774902,
"learning_rate": 1.8206544198553855e-06,
"loss": 0.0114,
"step": 7770
},
{
"epoch": 4.126773637448614,
"grad_norm": 0.6005859375,
"learning_rate": 1.7994019933347252e-06,
"loss": 0.0121,
"step": 7780
},
{
"epoch": 4.132077973743535,
"grad_norm": 0.7392746806144714,
"learning_rate": 1.778262065703692e-06,
"loss": 0.0112,
"step": 7790
},
{
"epoch": 4.137382310038457,
"grad_norm": 0.646115779876709,
"learning_rate": 1.7572349269677713e-06,
"loss": 0.0114,
"step": 7800
},
{
"epoch": 4.142686646333377,
"grad_norm": 0.8041878342628479,
"learning_rate": 1.7363208655851649e-06,
"loss": 0.0121,
"step": 7810
},
{
"epoch": 4.147990982628299,
"grad_norm": 0.6283585429191589,
"learning_rate": 1.715520168462842e-06,
"loss": 0.0118,
"step": 7820
},
{
"epoch": 4.153295318923219,
"grad_norm": 0.8467313051223755,
"learning_rate": 1.6948331209525859e-06,
"loss": 0.011,
"step": 7830
},
{
"epoch": 4.158599655218141,
"grad_norm": 0.7284769415855408,
"learning_rate": 1.674260006847105e-06,
"loss": 0.0125,
"step": 7840
},
{
"epoch": 4.163903991513062,
"grad_norm": 0.808991551399231,
"learning_rate": 1.6538011083761186e-06,
"loss": 0.012,
"step": 7850
},
{
"epoch": 4.169208327807983,
"grad_norm": 0.5860360264778137,
"learning_rate": 1.6334567062024963e-06,
"loss": 0.0112,
"step": 7860
},
{
"epoch": 4.174512664102904,
"grad_norm": 0.8675763010978699,
"learning_rate": 1.613227079418407e-06,
"loss": 0.0116,
"step": 7870
},
{
"epoch": 4.179817000397826,
"grad_norm": 0.9475387930870056,
"learning_rate": 1.5931125055414764e-06,
"loss": 0.0108,
"step": 7880
},
{
"epoch": 4.185121336692746,
"grad_norm": 0.7731668949127197,
"learning_rate": 1.5731132605110034e-06,
"loss": 0.0108,
"step": 7890
},
{
"epoch": 4.190425672987668,
"grad_norm": 0.8824770450592041,
"learning_rate": 1.5532296186841577e-06,
"loss": 0.0118,
"step": 7900
},
{
"epoch": 4.195730009282588,
"grad_norm": 0.7245500087738037,
"learning_rate": 1.5334618528322231e-06,
"loss": 0.0115,
"step": 7910
},
{
"epoch": 4.20103434557751,
"grad_norm": 0.8868821263313293,
"learning_rate": 1.513810234136842e-06,
"loss": 0.0118,
"step": 7920
},
{
"epoch": 4.206338681872431,
"grad_norm": 0.7213976979255676,
"learning_rate": 1.4942750321863274e-06,
"loss": 0.0108,
"step": 7930
},
{
"epoch": 4.211643018167352,
"grad_norm": 0.6786092519760132,
"learning_rate": 1.4748565149719196e-06,
"loss": 0.012,
"step": 7940
},
{
"epoch": 4.216947354462273,
"grad_norm": 0.9421653747558594,
"learning_rate": 1.4555549488841568e-06,
"loss": 0.0108,
"step": 7950
},
{
"epoch": 4.222251690757194,
"grad_norm": 0.8586103320121765,
"learning_rate": 1.4363705987091781e-06,
"loss": 0.012,
"step": 7960
},
{
"epoch": 4.227556027052115,
"grad_norm": 0.8113248944282532,
"learning_rate": 1.4173037276251222e-06,
"loss": 0.0113,
"step": 7970
},
{
"epoch": 4.2328603633470365,
"grad_norm": 0.7525209188461304,
"learning_rate": 1.3983545971985024e-06,
"loss": 0.0112,
"step": 7980
},
{
"epoch": 4.238164699641957,
"grad_norm": 0.773442804813385,
"learning_rate": 1.3795234673806223e-06,
"loss": 0.0109,
"step": 7990
},
{
"epoch": 4.2434690359368785,
"grad_norm": 0.9594974517822266,
"learning_rate": 1.3608105965040008e-06,
"loss": 0.0111,
"step": 8000
},
{
"epoch": 4.248773372231799,
"grad_norm": 1.0915862321853638,
"learning_rate": 1.3422162412788532e-06,
"loss": 0.0123,
"step": 8010
},
{
"epoch": 4.2540777085267205,
"grad_norm": 0.7315853238105774,
"learning_rate": 1.323740656789535e-06,
"loss": 0.0122,
"step": 8020
},
{
"epoch": 4.259382044821642,
"grad_norm": 0.7983627319335938,
"learning_rate": 1.305384096491068e-06,
"loss": 0.0113,
"step": 8030
},
{
"epoch": 4.2646863811165625,
"grad_norm": 0.9778603911399841,
"learning_rate": 1.2871468122056574e-06,
"loss": 0.0121,
"step": 8040
},
{
"epoch": 4.269990717411484,
"grad_norm": 0.9073354005813599,
"learning_rate": 1.2690290541192317e-06,
"loss": 0.0118,
"step": 8050
},
{
"epoch": 4.275295053706405,
"grad_norm": 0.9715719819068909,
"learning_rate": 1.2510310707780093e-06,
"loss": 0.011,
"step": 8060
},
{
"epoch": 4.280599390001326,
"grad_norm": 1.0042502880096436,
"learning_rate": 1.233153109085108e-06,
"loss": 0.0122,
"step": 8070
},
{
"epoch": 4.285903726296247,
"grad_norm": 0.7477414608001709,
"learning_rate": 1.215395414297127e-06,
"loss": 0.0109,
"step": 8080
},
{
"epoch": 4.291208062591168,
"grad_norm": 0.7065203785896301,
"learning_rate": 1.1977582300208102e-06,
"loss": 0.0112,
"step": 8090
},
{
"epoch": 4.296512398886089,
"grad_norm": 0.9468834400177002,
"learning_rate": 1.180241798209687e-06,
"loss": 0.0107,
"step": 8100
},
{
"epoch": 4.301816735181011,
"grad_norm": 0.8056593537330627,
"learning_rate": 1.162846359160762e-06,
"loss": 0.0115,
"step": 8110
},
{
"epoch": 4.307121071475931,
"grad_norm": 0.6542430520057678,
"learning_rate": 1.1455721515112161e-06,
"loss": 0.0107,
"step": 8120
},
{
"epoch": 4.312425407770853,
"grad_norm": 0.6646844744682312,
"learning_rate": 1.1284194122351276e-06,
"loss": 0.0133,
"step": 8130
},
{
"epoch": 4.317729744065773,
"grad_norm": 0.9997266530990601,
"learning_rate": 1.11138837664023e-06,
"loss": 0.0108,
"step": 8140
},
{
"epoch": 4.323034080360695,
"grad_norm": 0.8001927137374878,
"learning_rate": 1.0944792783646808e-06,
"loss": 0.0122,
"step": 8150
},
{
"epoch": 4.328338416655616,
"grad_norm": 0.635581374168396,
"learning_rate": 1.077692349373851e-06,
"loss": 0.0119,
"step": 8160
},
{
"epoch": 4.333642752950537,
"grad_norm": 0.7629982233047485,
"learning_rate": 1.0610278199571522e-06,
"loss": 0.012,
"step": 8170
},
{
"epoch": 4.338947089245458,
"grad_norm": 0.9843791127204895,
"learning_rate": 1.0444859187248701e-06,
"loss": 0.0116,
"step": 8180
},
{
"epoch": 4.34425142554038,
"grad_norm": 0.7684705853462219,
"learning_rate": 1.0280668726050302e-06,
"loss": 0.0128,
"step": 8190
},
{
"epoch": 4.3495557618353,
"grad_norm": 1.196272373199463,
"learning_rate": 1.0117709068402858e-06,
"loss": 0.0132,
"step": 8200
},
{
"epoch": 4.354860098130222,
"grad_norm": 0.6995633244514465,
"learning_rate": 9.95598244984829e-07,
"loss": 0.0113,
"step": 8210
},
{
"epoch": 4.360164434425142,
"grad_norm": 0.9181510806083679,
"learning_rate": 9.795491089013233e-07,
"loss": 0.012,
"step": 8220
},
{
"epoch": 4.365468770720064,
"grad_norm": 0.9330178499221802,
"learning_rate": 9.636237187578502e-07,
"loss": 0.0119,
"step": 8230
},
{
"epoch": 4.370773107014985,
"grad_norm": 0.7912993431091309,
"learning_rate": 9.478222930249148e-07,
"loss": 0.0108,
"step": 8240
},
{
"epoch": 4.376077443309906,
"grad_norm": 0.8073052167892456,
"learning_rate": 9.32145048472416e-07,
"loss": 0.0123,
"step": 8250
},
{
"epoch": 4.381381779604827,
"grad_norm": 0.778070330619812,
"learning_rate": 9.165922001666949e-07,
"loss": 0.0117,
"step": 8260
},
{
"epoch": 4.386686115899748,
"grad_norm": 0.8094714283943176,
"learning_rate": 9.011639614675783e-07,
"loss": 0.0111,
"step": 8270
},
{
"epoch": 4.391990452194669,
"grad_norm": 0.6484878659248352,
"learning_rate": 8.858605440254519e-07,
"loss": 0.0112,
"step": 8280
},
{
"epoch": 4.3972947884895905,
"grad_norm": 0.6638253927230835,
"learning_rate": 8.706821577783542e-07,
"loss": 0.0113,
"step": 8290
},
{
"epoch": 4.402599124784511,
"grad_norm": 0.948729395866394,
"learning_rate": 8.556290109491017e-07,
"loss": 0.0118,
"step": 8300
},
{
"epoch": 4.4079034610794325,
"grad_norm": 1.0264664888381958,
"learning_rate": 8.407013100424222e-07,
"loss": 0.0116,
"step": 8310
},
{
"epoch": 4.413207797374353,
"grad_norm": 1.1023824214935303,
"learning_rate": 8.258992598421422e-07,
"loss": 0.0116,
"step": 8320
},
{
"epoch": 4.4185121336692745,
"grad_norm": 0.6815558671951294,
"learning_rate": 8.112230634083518e-07,
"loss": 0.0111,
"step": 8330
},
{
"epoch": 4.423816469964196,
"grad_norm": 0.6698101758956909,
"learning_rate": 7.966729220746372e-07,
"loss": 0.0118,
"step": 8340
},
{
"epoch": 4.4291208062591165,
"grad_norm": 0.6504749059677124,
"learning_rate": 7.82249035445315e-07,
"loss": 0.0104,
"step": 8350
},
{
"epoch": 4.434425142554038,
"grad_norm": 0.7233061194419861,
"learning_rate": 7.679516013926902e-07,
"loss": 0.0113,
"step": 8360
},
{
"epoch": 4.439729478848959,
"grad_norm": 0.8585641384124756,
"learning_rate": 7.537808160543403e-07,
"loss": 0.0115,
"step": 8370
},
{
"epoch": 4.44503381514388,
"grad_norm": 0.7206557989120483,
"learning_rate": 7.397368738304367e-07,
"loss": 0.0106,
"step": 8380
},
{
"epoch": 4.450338151438801,
"grad_norm": 0.8581918478012085,
"learning_rate": 7.258199673810595e-07,
"loss": 0.0113,
"step": 8390
},
{
"epoch": 4.455642487733722,
"grad_norm": 0.7734334468841553,
"learning_rate": 7.120302876235707e-07,
"loss": 0.011,
"step": 8400
},
{
"epoch": 4.460946824028643,
"grad_norm": 0.7396286725997925,
"learning_rate": 6.983680237299861e-07,
"loss": 0.0106,
"step": 8410
},
{
"epoch": 4.466251160323565,
"grad_norm": 1.0284420251846313,
"learning_rate": 6.848333631243853e-07,
"loss": 0.0114,
"step": 8420
},
{
"epoch": 4.471555496618485,
"grad_norm": 0.8151918053627014,
"learning_rate": 6.714264914803348e-07,
"loss": 0.0113,
"step": 8430
},
{
"epoch": 4.476859832913407,
"grad_norm": 0.8683885335922241,
"learning_rate": 6.581475927183444e-07,
"loss": 0.0108,
"step": 8440
},
{
"epoch": 4.482164169208328,
"grad_norm": 1.0174139738082886,
"learning_rate": 6.449968490033453e-07,
"loss": 0.0122,
"step": 8450
},
{
"epoch": 4.487468505503249,
"grad_norm": 0.7874058485031128,
"learning_rate": 6.319744407421891e-07,
"loss": 0.0117,
"step": 8460
},
{
"epoch": 4.49277284179817,
"grad_norm": 0.9520915746688843,
"learning_rate": 6.190805465811745e-07,
"loss": 0.0103,
"step": 8470
},
{
"epoch": 4.498077178093091,
"grad_norm": 0.8549929261207581,
"learning_rate": 6.063153434035896e-07,
"loss": 0.0121,
"step": 8480
},
{
"epoch": 4.503381514388012,
"grad_norm": 0.8030751943588257,
"learning_rate": 5.936790063273013e-07,
"loss": 0.0104,
"step": 8490
},
{
"epoch": 4.508685850682934,
"grad_norm": 0.6169995069503784,
"learning_rate": 5.811717087023327e-07,
"loss": 0.0104,
"step": 8500
},
{
"epoch": 4.513990186977854,
"grad_norm": 0.9810777306556702,
"learning_rate": 5.687936221085022e-07,
"loss": 0.0125,
"step": 8510
},
{
"epoch": 4.519294523272776,
"grad_norm": 0.7712530493736267,
"learning_rate": 5.565449163530578e-07,
"loss": 0.0103,
"step": 8520
},
{
"epoch": 4.524598859567696,
"grad_norm": 0.6955736875534058,
"learning_rate": 5.444257594683577e-07,
"loss": 0.0115,
"step": 8530
},
{
"epoch": 4.529903195862618,
"grad_norm": 1.2472916841506958,
"learning_rate": 5.324363177095526e-07,
"loss": 0.0111,
"step": 8540
},
{
"epoch": 4.535207532157539,
"grad_norm": 0.8412001729011536,
"learning_rate": 5.205767555523211e-07,
"loss": 0.0113,
"step": 8550
},
{
"epoch": 4.54051186845246,
"grad_norm": 0.7206942439079285,
"learning_rate": 5.088472356905971e-07,
"loss": 0.0109,
"step": 8560
},
{
"epoch": 4.545816204747381,
"grad_norm": 0.8209089040756226,
"learning_rate": 4.972479190343494e-07,
"loss": 0.0107,
"step": 8570
},
{
"epoch": 4.551120541042302,
"grad_norm": 1.1593587398529053,
"learning_rate": 4.857789647073685e-07,
"loss": 0.0115,
"step": 8580
},
{
"epoch": 4.556424877337223,
"grad_norm": 0.6742283701896667,
"learning_rate": 4.7444053004508716e-07,
"loss": 0.0113,
"step": 8590
},
{
"epoch": 4.561729213632145,
"grad_norm": 1.103501558303833,
"learning_rate": 4.632327705924178e-07,
"loss": 0.0121,
"step": 8600
},
{
"epoch": 4.567033549927065,
"grad_norm": 0.7072399258613586,
"learning_rate": 4.521558401016246e-07,
"loss": 0.0132,
"step": 8610
},
{
"epoch": 4.572337886221987,
"grad_norm": 0.7149022221565247,
"learning_rate": 4.4120989053020423e-07,
"loss": 0.01,
"step": 8620
},
{
"epoch": 4.577642222516907,
"grad_norm": 0.7452453970909119,
"learning_rate": 4.3039507203881836e-07,
"loss": 0.0122,
"step": 8630
},
{
"epoch": 4.5829465588118286,
"grad_norm": 0.5006600022315979,
"learning_rate": 4.197115329892121e-07,
"loss": 0.0105,
"step": 8640
},
{
"epoch": 4.58825089510675,
"grad_norm": 0.8474171757698059,
"learning_rate": 4.091594199421967e-07,
"loss": 0.0102,
"step": 8650
},
{
"epoch": 4.5935552314016705,
"grad_norm": 1.0133711099624634,
"learning_rate": 3.9873887765563e-07,
"loss": 0.0111,
"step": 8660
},
{
"epoch": 4.598859567696592,
"grad_norm": 0.6478588581085205,
"learning_rate": 3.884500490824339e-07,
"loss": 0.0104,
"step": 8670
},
{
"epoch": 4.604163903991513,
"grad_norm": 0.7794708609580994,
"learning_rate": 3.782930753686287e-07,
"loss": 0.0109,
"step": 8680
},
{
"epoch": 4.609468240286434,
"grad_norm": 0.9100542068481445,
"learning_rate": 3.6826809585140287e-07,
"loss": 0.0121,
"step": 8690
},
{
"epoch": 4.614772576581355,
"grad_norm": 0.7963326573371887,
"learning_rate": 3.5837524805719784e-07,
"loss": 0.0106,
"step": 8700
},
{
"epoch": 4.620076912876276,
"grad_norm": 0.7512088418006897,
"learning_rate": 3.4861466769982364e-07,
"loss": 0.0105,
"step": 8710
},
{
"epoch": 4.625381249171197,
"grad_norm": 0.9739687442779541,
"learning_rate": 3.389864886785943e-07,
"loss": 0.012,
"step": 8720
},
{
"epoch": 4.630685585466119,
"grad_norm": 0.7893660068511963,
"learning_rate": 3.2949084307649317e-07,
"loss": 0.0117,
"step": 8730
},
{
"epoch": 4.635989921761039,
"grad_norm": 0.738175630569458,
"learning_rate": 3.2012786115836024e-07,
"loss": 0.0115,
"step": 8740
},
{
"epoch": 4.641294258055961,
"grad_norm": 0.6872140765190125,
"learning_rate": 3.1089767136910475e-07,
"loss": 0.0102,
"step": 8750
},
{
"epoch": 4.646598594350882,
"grad_norm": 0.6499819755554199,
"learning_rate": 3.0180040033194415e-07,
"loss": 0.0102,
"step": 8760
},
{
"epoch": 4.651902930645803,
"grad_norm": 0.8301778435707092,
"learning_rate": 2.9283617284666666e-07,
"loss": 0.0125,
"step": 8770
},
{
"epoch": 4.657207266940724,
"grad_norm": 0.8570474982261658,
"learning_rate": 2.8400511188791834e-07,
"loss": 0.0109,
"step": 8780
},
{
"epoch": 4.662511603235645,
"grad_norm": 0.6350365877151489,
"learning_rate": 2.7530733860351434e-07,
"loss": 0.0103,
"step": 8790
},
{
"epoch": 4.667815939530566,
"grad_norm": 0.9536558985710144,
"learning_rate": 2.6674297231278677e-07,
"loss": 0.0118,
"step": 8800
},
{
"epoch": 4.673120275825488,
"grad_norm": 0.6549102663993835,
"learning_rate": 2.583121305049308e-07,
"loss": 0.0102,
"step": 8810
},
{
"epoch": 4.678424612120408,
"grad_norm": 0.7842022776603699,
"learning_rate": 2.5001492883740984e-07,
"loss": 0.0116,
"step": 8820
},
{
"epoch": 4.68372894841533,
"grad_norm": 0.8783378005027771,
"learning_rate": 2.418514811343575e-07,
"loss": 0.0115,
"step": 8830
},
{
"epoch": 4.68903328471025,
"grad_norm": 0.8600866198539734,
"learning_rate": 2.3382189938502387e-07,
"loss": 0.012,
"step": 8840
},
{
"epoch": 4.694337621005172,
"grad_norm": 1.0350086688995361,
"learning_rate": 2.2592629374222907e-07,
"loss": 0.0105,
"step": 8850
},
{
"epoch": 4.699641957300093,
"grad_norm": 0.8720340132713318,
"learning_rate": 2.1816477252086689e-07,
"loss": 0.0115,
"step": 8860
},
{
"epoch": 4.704946293595014,
"grad_norm": 0.8609745502471924,
"learning_rate": 2.105374421964046e-07,
"loss": 0.0124,
"step": 8870
},
{
"epoch": 4.710250629889935,
"grad_norm": 0.7636396288871765,
"learning_rate": 2.030444074034288e-07,
"loss": 0.0095,
"step": 8880
},
{
"epoch": 4.715554966184856,
"grad_norm": 0.6747872233390808,
"learning_rate": 1.9568577093421303e-07,
"loss": 0.0098,
"step": 8890
},
{
"epoch": 4.720859302479777,
"grad_norm": 0.6326771974563599,
"learning_rate": 1.884616337373002e-07,
"loss": 0.0118,
"step": 8900
},
{
"epoch": 4.726163638774699,
"grad_norm": 0.7632216215133667,
"learning_rate": 1.813720949161235e-07,
"loss": 0.0105,
"step": 8910
},
{
"epoch": 4.731467975069619,
"grad_norm": 0.9641050100326538,
"learning_rate": 1.7441725172764434e-07,
"loss": 0.0111,
"step": 8920
},
{
"epoch": 4.736772311364541,
"grad_norm": 0.7419525384902954,
"learning_rate": 1.6759719958101883e-07,
"loss": 0.0125,
"step": 8930
},
{
"epoch": 4.742076647659461,
"grad_norm": 0.8092560768127441,
"learning_rate": 1.6091203203629003e-07,
"loss": 0.0111,
"step": 8940
},
{
"epoch": 4.747380983954383,
"grad_norm": 0.710726797580719,
"learning_rate": 1.5436184080310112e-07,
"loss": 0.0107,
"step": 8950
},
{
"epoch": 4.752685320249304,
"grad_norm": 1.1181225776672363,
"learning_rate": 1.4794671573944096e-07,
"loss": 0.0114,
"step": 8960
},
{
"epoch": 4.757989656544225,
"grad_norm": 0.8513716459274292,
"learning_rate": 1.416667448504083e-07,
"loss": 0.0116,
"step": 8970
},
{
"epoch": 4.763293992839146,
"grad_norm": 0.6909376382827759,
"learning_rate": 1.355220142870095e-07,
"loss": 0.0109,
"step": 8980
},
{
"epoch": 4.7685983291340674,
"grad_norm": 0.6143341660499573,
"learning_rate": 1.2951260834496826e-07,
"loss": 0.0105,
"step": 8990
},
{
"epoch": 4.773902665428988,
"grad_norm": 0.8020744323730469,
"learning_rate": 1.2363860946357885e-07,
"loss": 0.0109,
"step": 9000
},
{
"epoch": 4.779207001723909,
"grad_norm": 0.9505652785301208,
"learning_rate": 1.1790009822456704e-07,
"loss": 0.0106,
"step": 9010
},
{
"epoch": 4.784511338018831,
"grad_norm": 0.5784342885017395,
"learning_rate": 1.1229715335098978e-07,
"loss": 0.0106,
"step": 9020
},
{
"epoch": 4.789815674313751,
"grad_norm": 0.9723225831985474,
"learning_rate": 1.0682985170615612e-07,
"loss": 0.0115,
"step": 9030
},
{
"epoch": 4.795120010608673,
"grad_norm": 0.7735673189163208,
"learning_rate": 1.014982682925636e-07,
"loss": 0.0116,
"step": 9040
},
{
"epoch": 4.800424346903593,
"grad_norm": 0.7800854444503784,
"learning_rate": 9.630247625088129e-08,
"loss": 0.0114,
"step": 9050
},
{
"epoch": 4.805728683198515,
"grad_norm": 0.8556525111198425,
"learning_rate": 9.124254685894174e-08,
"loss": 0.0107,
"step": 9060
},
{
"epoch": 4.811033019493436,
"grad_norm": 0.8377553820610046,
"learning_rate": 8.631854953075836e-08,
"loss": 0.0104,
"step": 9070
},
{
"epoch": 4.816337355788357,
"grad_norm": 0.8369781374931335,
"learning_rate": 8.153055181557956e-08,
"loss": 0.0119,
"step": 9080
},
{
"epoch": 4.821641692083278,
"grad_norm": 0.49618351459503174,
"learning_rate": 7.687861939696173e-08,
"loss": 0.0103,
"step": 9090
},
{
"epoch": 4.826946028378199,
"grad_norm": 0.8221543431282043,
"learning_rate": 7.236281609186213e-08,
"loss": 0.0113,
"step": 9100
},
{
"epoch": 4.83225036467312,
"grad_norm": 0.7929049134254456,
"learning_rate": 6.798320384977297e-08,
"loss": 0.0109,
"step": 9110
},
{
"epoch": 4.837554700968042,
"grad_norm": 0.9620218873023987,
"learning_rate": 6.373984275185985e-08,
"loss": 0.0122,
"step": 9120
},
{
"epoch": 4.842859037262962,
"grad_norm": 0.7376503348350525,
"learning_rate": 5.963279101014907e-08,
"loss": 0.0104,
"step": 9130
},
{
"epoch": 4.848163373557884,
"grad_norm": 0.6144804358482361,
"learning_rate": 5.566210496672164e-08,
"loss": 0.0114,
"step": 9140
},
{
"epoch": 4.853467709852804,
"grad_norm": 0.5833884477615356,
"learning_rate": 5.1827839092943864e-08,
"loss": 0.0104,
"step": 9150
},
{
"epoch": 4.858772046147726,
"grad_norm": 1.0123804807662964,
"learning_rate": 4.813004598871684e-08,
"loss": 0.0118,
"step": 9160
},
{
"epoch": 4.864076382442647,
"grad_norm": 0.6741945743560791,
"learning_rate": 4.456877638175927e-08,
"loss": 0.0105,
"step": 9170
},
{
"epoch": 4.869380718737568,
"grad_norm": 1.017996907234192,
"learning_rate": 4.114407912690577e-08,
"loss": 0.0116,
"step": 9180
},
{
"epoch": 4.874685055032489,
"grad_norm": 0.6289529800415039,
"learning_rate": 3.785600120544297e-08,
"loss": 0.0111,
"step": 9190
},
{
"epoch": 4.87998939132741,
"grad_norm": 0.6166532635688782,
"learning_rate": 3.470458772446228e-08,
"loss": 0.0103,
"step": 9200
},
{
"epoch": 4.885293727622331,
"grad_norm": 0.7388268113136292,
"learning_rate": 3.168988191623923e-08,
"loss": 0.012,
"step": 9210
},
{
"epoch": 4.890598063917253,
"grad_norm": 0.883564293384552,
"learning_rate": 2.8811925137641748e-08,
"loss": 0.0116,
"step": 9220
},
{
"epoch": 4.895902400212173,
"grad_norm": 0.649075448513031,
"learning_rate": 2.607075686956617e-08,
"loss": 0.0106,
"step": 9230
},
{
"epoch": 4.901206736507095,
"grad_norm": 0.5050731897354126,
"learning_rate": 2.3466414716387664e-08,
"loss": 0.011,
"step": 9240
},
{
"epoch": 4.906511072802015,
"grad_norm": 0.7908238172531128,
"learning_rate": 2.0998934405453973e-08,
"loss": 0.0113,
"step": 9250
},
{
"epoch": 4.911815409096937,
"grad_norm": 0.8571054339408875,
"learning_rate": 1.866834978658805e-08,
"loss": 0.0106,
"step": 9260
},
{
"epoch": 4.917119745391858,
"grad_norm": 1.0134270191192627,
"learning_rate": 1.647469283162617e-08,
"loss": 0.0102,
"step": 9270
},
{
"epoch": 4.922424081686779,
"grad_norm": 0.7379550933837891,
"learning_rate": 1.4417993633980553e-08,
"loss": 0.011,
"step": 9280
},
{
"epoch": 4.9277284179817,
"grad_norm": 0.6573840379714966,
"learning_rate": 1.2498280408225205e-08,
"loss": 0.0109,
"step": 9290
},
{
"epoch": 4.9330327542766215,
"grad_norm": 1.1111972332000732,
"learning_rate": 1.0715579489707362e-08,
"loss": 0.0117,
"step": 9300
},
{
"epoch": 4.938337090571542,
"grad_norm": 1.0421433448791504,
"learning_rate": 9.069915334189994e-09,
"loss": 0.0115,
"step": 9310
},
{
"epoch": 4.9436414268664635,
"grad_norm": 0.7943109273910522,
"learning_rate": 7.561310517514298e-09,
"loss": 0.0121,
"step": 9320
},
{
"epoch": 4.948945763161385,
"grad_norm": 1.0451573133468628,
"learning_rate": 6.189785735286613e-09,
"loss": 0.0117,
"step": 9330
},
{
"epoch": 4.9542500994563055,
"grad_norm": 0.661553144454956,
"learning_rate": 4.955359802601978e-09,
"loss": 0.011,
"step": 9340
},
{
"epoch": 4.959554435751227,
"grad_norm": 0.6280125975608826,
"learning_rate": 3.858049653778783e-09,
"loss": 0.0116,
"step": 9350
},
{
"epoch": 4.9648587720461475,
"grad_norm": 0.9882097244262695,
"learning_rate": 2.8978703421311815e-09,
"loss": 0.0118,
"step": 9360
},
{
"epoch": 4.970163108341069,
"grad_norm": 0.9897807240486145,
"learning_rate": 2.0748350397592487e-09,
"loss": 0.0108,
"step": 9370
},
{
"epoch": 4.97546744463599,
"grad_norm": 0.885931670665741,
"learning_rate": 1.388955037373574e-09,
"loss": 0.0112,
"step": 9380
},
{
"epoch": 4.980771780930911,
"grad_norm": 1.1094691753387451,
"learning_rate": 8.40239744130944e-10,
"loss": 0.0111,
"step": 9390
},
{
"epoch": 4.986076117225832,
"grad_norm": 0.7109507322311401,
"learning_rate": 4.286966875166609e-10,
"loss": 0.0119,
"step": 9400
},
{
"epoch": 4.991380453520753,
"grad_norm": 0.9953659772872925,
"learning_rate": 1.5433151323129835e-10,
"loss": 0.0097,
"step": 9410
},
{
"epoch": 4.996684789815674,
"grad_norm": 0.77718186378479,
"learning_rate": 1.7147985121868106e-11,
"loss": 0.011,
"step": 9420
},
{
"epoch": 4.999336957963135,
"step": 9425,
"total_flos": 9.928428136927396e+17,
"train_loss": 0.1870748889659697,
"train_runtime": 13802.6902,
"train_samples_per_second": 21.853,
"train_steps_per_second": 0.683
}
],
"logging_steps": 10,
"max_steps": 9425,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 40000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.928428136927396e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}