llama-3-8b-instruct-laws / trainer_state.json
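The JSON below is the Trainer's log history: one entry per logging step recording the epoch, gradient norm, learning rate, and training loss. A minimal sketch of how such a log could be loaded and plotted follows, assuming the file has been downloaded locally as trainer_state.json and that matplotlib is installed (this script is illustrative and not part of the original file):

# Minimal sketch: load this trainer_state.json and plot the training loss curve.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Each log_history entry holds epoch, grad_norm, learning_rate, loss, and step.
entries = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in entries]
losses = [e["loss"] for e in entries]

plt.plot(steps, losses)
plt.xlabel("step")
plt.ylabel("training loss")
plt.title("llama-3-8b-instruct-laws fine-tuning loss")
plt.show()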
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.99876492383697,
"eval_steps": 500,
"global_step": 5463,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005489227391244683,
"grad_norm": 1.3054485321044922,
"learning_rate": 9.140767824497258e-07,
"loss": 1.7028,
"step": 10
},
{
"epoch": 0.010978454782489365,
"grad_norm": 1.6326860189437866,
"learning_rate": 1.8281535648994516e-06,
"loss": 1.6575,
"step": 20
},
{
"epoch": 0.016467682173734045,
"grad_norm": 0.9034647345542908,
"learning_rate": 2.7422303473491773e-06,
"loss": 1.3904,
"step": 30
},
{
"epoch": 0.02195690956497873,
"grad_norm": 0.8317720293998718,
"learning_rate": 3.6563071297989032e-06,
"loss": 1.2568,
"step": 40
},
{
"epoch": 0.027446136956223412,
"grad_norm": 0.8646258115768433,
"learning_rate": 4.570383912248629e-06,
"loss": 1.2404,
"step": 50
},
{
"epoch": 0.03293536434746809,
"grad_norm": 0.7493156790733337,
"learning_rate": 5.484460694698355e-06,
"loss": 1.1597,
"step": 60
},
{
"epoch": 0.03842459173871278,
"grad_norm": 0.7537096738815308,
"learning_rate": 6.398537477148081e-06,
"loss": 1.1168,
"step": 70
},
{
"epoch": 0.04391381912995746,
"grad_norm": 0.6768060922622681,
"learning_rate": 7.3126142595978065e-06,
"loss": 1.0531,
"step": 80
},
{
"epoch": 0.04940304652120214,
"grad_norm": 0.8540539145469666,
"learning_rate": 8.226691042047533e-06,
"loss": 1.0974,
"step": 90
},
{
"epoch": 0.054892273912446825,
"grad_norm": 0.7654123306274414,
"learning_rate": 9.140767824497258e-06,
"loss": 1.0468,
"step": 100
},
{
"epoch": 0.06038150130369151,
"grad_norm": 0.838114857673645,
"learning_rate": 1.0054844606946984e-05,
"loss": 1.0481,
"step": 110
},
{
"epoch": 0.06587072869493618,
"grad_norm": 0.7839793562889099,
"learning_rate": 1.096892138939671e-05,
"loss": 1.0444,
"step": 120
},
{
"epoch": 0.07135995608618087,
"grad_norm": 1.0483232736587524,
"learning_rate": 1.1882998171846435e-05,
"loss": 1.005,
"step": 130
},
{
"epoch": 0.07684918347742556,
"grad_norm": 0.9476339221000671,
"learning_rate": 1.2797074954296162e-05,
"loss": 1.0538,
"step": 140
},
{
"epoch": 0.08233841086867023,
"grad_norm": 0.8280003070831299,
"learning_rate": 1.3711151736745886e-05,
"loss": 1.0122,
"step": 150
},
{
"epoch": 0.08782763825991492,
"grad_norm": 0.8112940788269043,
"learning_rate": 1.4625228519195613e-05,
"loss": 0.9613,
"step": 160
},
{
"epoch": 0.0933168656511596,
"grad_norm": 0.9424939155578613,
"learning_rate": 1.553930530164534e-05,
"loss": 0.9637,
"step": 170
},
{
"epoch": 0.09880609304240429,
"grad_norm": 0.781250536441803,
"learning_rate": 1.6453382084095066e-05,
"loss": 0.9984,
"step": 180
},
{
"epoch": 0.10429532043364896,
"grad_norm": 0.9252836108207703,
"learning_rate": 1.7367458866544793e-05,
"loss": 0.9984,
"step": 190
},
{
"epoch": 0.10978454782489365,
"grad_norm": 0.9257864356040955,
"learning_rate": 1.8281535648994517e-05,
"loss": 0.949,
"step": 200
},
{
"epoch": 0.11527377521613832,
"grad_norm": 1.042043924331665,
"learning_rate": 1.9195612431444244e-05,
"loss": 1.0031,
"step": 210
},
{
"epoch": 0.12076300260738301,
"grad_norm": 1.0521234273910522,
"learning_rate": 2.0109689213893968e-05,
"loss": 0.9751,
"step": 220
},
{
"epoch": 0.1262522299986277,
"grad_norm": 0.865064263343811,
"learning_rate": 2.1023765996343695e-05,
"loss": 0.9564,
"step": 230
},
{
"epoch": 0.13174145738987236,
"grad_norm": 0.8879236578941345,
"learning_rate": 2.193784277879342e-05,
"loss": 0.9182,
"step": 240
},
{
"epoch": 0.13723068478111705,
"grad_norm": 0.9224317669868469,
"learning_rate": 2.2851919561243146e-05,
"loss": 0.9037,
"step": 250
},
{
"epoch": 0.14271991217236174,
"grad_norm": 0.8295108675956726,
"learning_rate": 2.376599634369287e-05,
"loss": 0.9708,
"step": 260
},
{
"epoch": 0.14820913956360643,
"grad_norm": 0.7987868785858154,
"learning_rate": 2.4680073126142597e-05,
"loss": 0.9611,
"step": 270
},
{
"epoch": 0.15369836695485112,
"grad_norm": 0.774760901927948,
"learning_rate": 2.5594149908592324e-05,
"loss": 0.9872,
"step": 280
},
{
"epoch": 0.15918759434609578,
"grad_norm": 0.7601301670074463,
"learning_rate": 2.6508226691042048e-05,
"loss": 0.9,
"step": 290
},
{
"epoch": 0.16467682173734047,
"grad_norm": 0.9270791411399841,
"learning_rate": 2.742230347349177e-05,
"loss": 0.8798,
"step": 300
},
{
"epoch": 0.17016604912858516,
"grad_norm": 0.873102605342865,
"learning_rate": 2.8336380255941502e-05,
"loss": 0.8962,
"step": 310
},
{
"epoch": 0.17565527651982984,
"grad_norm": 0.9427269101142883,
"learning_rate": 2.9250457038391226e-05,
"loss": 0.886,
"step": 320
},
{
"epoch": 0.1811445039110745,
"grad_norm": 0.8019095659255981,
"learning_rate": 3.016453382084095e-05,
"loss": 0.8335,
"step": 330
},
{
"epoch": 0.1866337313023192,
"grad_norm": 0.9028713703155518,
"learning_rate": 3.107861060329068e-05,
"loss": 0.8946,
"step": 340
},
{
"epoch": 0.19212295869356388,
"grad_norm": 1.0009723901748657,
"learning_rate": 3.1992687385740404e-05,
"loss": 0.9274,
"step": 350
},
{
"epoch": 0.19761218608480857,
"grad_norm": 0.7785693407058716,
"learning_rate": 3.290676416819013e-05,
"loss": 0.8001,
"step": 360
},
{
"epoch": 0.20310141347605323,
"grad_norm": 0.9450286030769348,
"learning_rate": 3.382084095063985e-05,
"loss": 0.9036,
"step": 370
},
{
"epoch": 0.20859064086729792,
"grad_norm": 0.899732232093811,
"learning_rate": 3.4734917733089586e-05,
"loss": 0.8944,
"step": 380
},
{
"epoch": 0.2140798682585426,
"grad_norm": 1.35003662109375,
"learning_rate": 3.5648994515539306e-05,
"loss": 0.8581,
"step": 390
},
{
"epoch": 0.2195690956497873,
"grad_norm": 1.1555213928222656,
"learning_rate": 3.656307129798903e-05,
"loss": 0.8412,
"step": 400
},
{
"epoch": 0.225058323041032,
"grad_norm": 0.8920039534568787,
"learning_rate": 3.7477148080438754e-05,
"loss": 0.9105,
"step": 410
},
{
"epoch": 0.23054755043227665,
"grad_norm": 0.8022063970565796,
"learning_rate": 3.839122486288849e-05,
"loss": 0.9217,
"step": 420
},
{
"epoch": 0.23603677782352134,
"grad_norm": 1.1498247385025024,
"learning_rate": 3.930530164533821e-05,
"loss": 0.9259,
"step": 430
},
{
"epoch": 0.24152600521476603,
"grad_norm": 1.0198287963867188,
"learning_rate": 4.0219378427787935e-05,
"loss": 0.8857,
"step": 440
},
{
"epoch": 0.24701523260601072,
"grad_norm": 0.9331903457641602,
"learning_rate": 4.113345521023766e-05,
"loss": 0.8754,
"step": 450
},
{
"epoch": 0.2525044599972554,
"grad_norm": 0.9897291660308838,
"learning_rate": 4.204753199268739e-05,
"loss": 0.8769,
"step": 460
},
{
"epoch": 0.2579936873885001,
"grad_norm": 1.6721230745315552,
"learning_rate": 4.296160877513711e-05,
"loss": 0.8992,
"step": 470
},
{
"epoch": 0.2634829147797447,
"grad_norm": 1.1787182092666626,
"learning_rate": 4.387568555758684e-05,
"loss": 0.8918,
"step": 480
},
{
"epoch": 0.2689721421709894,
"grad_norm": 1.0543595552444458,
"learning_rate": 4.4789762340036564e-05,
"loss": 0.8167,
"step": 490
},
{
"epoch": 0.2744613695622341,
"grad_norm": 0.9777544140815735,
"learning_rate": 4.570383912248629e-05,
"loss": 0.8888,
"step": 500
},
{
"epoch": 0.2799505969534788,
"grad_norm": 0.9173258543014526,
"learning_rate": 4.661791590493602e-05,
"loss": 0.8296,
"step": 510
},
{
"epoch": 0.2854398243447235,
"grad_norm": 1.0830740928649902,
"learning_rate": 4.753199268738574e-05,
"loss": 0.8703,
"step": 520
},
{
"epoch": 0.29092905173596817,
"grad_norm": 1.115646243095398,
"learning_rate": 4.844606946983547e-05,
"loss": 0.9081,
"step": 530
},
{
"epoch": 0.29641827912721286,
"grad_norm": 1.219681739807129,
"learning_rate": 4.936014625228519e-05,
"loss": 0.9165,
"step": 540
},
{
"epoch": 0.30190750651845755,
"grad_norm": 1.178253173828125,
"learning_rate": 4.999995405604411e-05,
"loss": 0.8977,
"step": 550
},
{
"epoch": 0.30739673390970224,
"grad_norm": 1.3987079858779907,
"learning_rate": 4.999913727930364e-05,
"loss": 0.8527,
"step": 560
},
{
"epoch": 0.31288596130094687,
"grad_norm": 1.095534324645996,
"learning_rate": 4.999729956415998e-05,
"loss": 0.8716,
"step": 570
},
{
"epoch": 0.31837518869219156,
"grad_norm": 1.1343433856964111,
"learning_rate": 4.9994440985663475e-05,
"loss": 0.8402,
"step": 580
},
{
"epoch": 0.32386441608343625,
"grad_norm": 1.177049994468689,
"learning_rate": 4.9990561660555454e-05,
"loss": 0.8629,
"step": 590
},
{
"epoch": 0.32935364347468093,
"grad_norm": 1.091205358505249,
"learning_rate": 4.998566174726347e-05,
"loss": 0.7973,
"step": 600
},
{
"epoch": 0.3348428708659256,
"grad_norm": 1.3899606466293335,
"learning_rate": 4.997974144589481e-05,
"loss": 0.8956,
"step": 610
},
{
"epoch": 0.3403320982571703,
"grad_norm": 1.2220797538757324,
"learning_rate": 4.997280099822833e-05,
"loss": 0.794,
"step": 620
},
{
"epoch": 0.345821325648415,
"grad_norm": 1.0186364650726318,
"learning_rate": 4.996484068770461e-05,
"loss": 0.7641,
"step": 630
},
{
"epoch": 0.3513105530396597,
"grad_norm": 1.3602491617202759,
"learning_rate": 4.9955860839414324e-05,
"loss": 0.8582,
"step": 640
},
{
"epoch": 0.3567997804309043,
"grad_norm": 1.2544053792953491,
"learning_rate": 4.994586182008501e-05,
"loss": 0.8087,
"step": 650
},
{
"epoch": 0.362289007822149,
"grad_norm": 1.25338876247406,
"learning_rate": 4.993484403806609e-05,
"loss": 0.8814,
"step": 660
},
{
"epoch": 0.3677782352133937,
"grad_norm": 1.6701184511184692,
"learning_rate": 4.9922807943312135e-05,
"loss": 0.8039,
"step": 670
},
{
"epoch": 0.3732674626046384,
"grad_norm": 1.2474104166030884,
"learning_rate": 4.990975402736457e-05,
"loss": 0.8411,
"step": 680
},
{
"epoch": 0.3787566899958831,
"grad_norm": 1.0203585624694824,
"learning_rate": 4.9895682823331564e-05,
"loss": 0.7838,
"step": 690
},
{
"epoch": 0.38424591738712777,
"grad_norm": 1.2643638849258423,
"learning_rate": 4.988059490586624e-05,
"loss": 0.7802,
"step": 700
},
{
"epoch": 0.38973514477837246,
"grad_norm": 1.2018098831176758,
"learning_rate": 4.986449089114325e-05,
"loss": 0.8049,
"step": 710
},
{
"epoch": 0.39522437216961714,
"grad_norm": 1.5600682497024536,
"learning_rate": 4.984737143683356e-05,
"loss": 0.864,
"step": 720
},
{
"epoch": 0.40071359956086183,
"grad_norm": 1.1996121406555176,
"learning_rate": 4.982923724207764e-05,
"loss": 0.8222,
"step": 730
},
{
"epoch": 0.40620282695210647,
"grad_norm": 1.2239071130752563,
"learning_rate": 4.9810089047456873e-05,
"loss": 0.7757,
"step": 740
},
{
"epoch": 0.41169205434335115,
"grad_norm": 1.278192162513733,
"learning_rate": 4.978992763496334e-05,
"loss": 0.7693,
"step": 750
},
{
"epoch": 0.41718128173459584,
"grad_norm": 1.3768647909164429,
"learning_rate": 4.976875382796786e-05,
"loss": 0.7927,
"step": 760
},
{
"epoch": 0.42267050912584053,
"grad_norm": 1.3257420063018799,
"learning_rate": 4.974656849118638e-05,
"loss": 0.7997,
"step": 770
},
{
"epoch": 0.4281597365170852,
"grad_norm": 1.4355076551437378,
"learning_rate": 4.972337253064466e-05,
"loss": 0.7719,
"step": 780
},
{
"epoch": 0.4336489639083299,
"grad_norm": 1.0469034910202026,
"learning_rate": 4.969916689364128e-05,
"loss": 0.8203,
"step": 790
},
{
"epoch": 0.4391381912995746,
"grad_norm": 1.4641021490097046,
"learning_rate": 4.9673952568708906e-05,
"loss": 0.8303,
"step": 800
},
{
"epoch": 0.4446274186908193,
"grad_norm": 1.6394554376602173,
"learning_rate": 4.964773058557399e-05,
"loss": 0.8693,
"step": 810
},
{
"epoch": 0.450116646082064,
"grad_norm": 1.777869462966919,
"learning_rate": 4.9620502015114675e-05,
"loss": 0.7929,
"step": 820
},
{
"epoch": 0.4556058734733086,
"grad_norm": 1.161238670349121,
"learning_rate": 4.959226796931706e-05,
"loss": 0.8393,
"step": 830
},
{
"epoch": 0.4610951008645533,
"grad_norm": 1.5231930017471313,
"learning_rate": 4.95630296012298e-05,
"loss": 0.8195,
"step": 840
},
{
"epoch": 0.466584328255798,
"grad_norm": 1.446094274520874,
"learning_rate": 4.953278810491701e-05,
"loss": 0.8157,
"step": 850
},
{
"epoch": 0.4720735556470427,
"grad_norm": 1.702967882156372,
"learning_rate": 4.950154471540951e-05,
"loss": 0.7932,
"step": 860
},
{
"epoch": 0.47756278303828736,
"grad_norm": 1.3679907321929932,
"learning_rate": 4.9469300708654385e-05,
"loss": 0.7741,
"step": 870
},
{
"epoch": 0.48305201042953205,
"grad_norm": 1.1557847261428833,
"learning_rate": 4.943605740146286e-05,
"loss": 0.8406,
"step": 880
},
{
"epoch": 0.48854123782077674,
"grad_norm": 1.4791802167892456,
"learning_rate": 4.940181615145655e-05,
"loss": 0.7731,
"step": 890
},
{
"epoch": 0.49403046521202143,
"grad_norm": 1.3994717597961426,
"learning_rate": 4.936657835701198e-05,
"loss": 0.7903,
"step": 900
},
{
"epoch": 0.49951969260326606,
"grad_norm": 1.2580246925354004,
"learning_rate": 4.933034545720354e-05,
"loss": 0.7601,
"step": 910
},
{
"epoch": 0.5050089199945108,
"grad_norm": 1.4461493492126465,
"learning_rate": 4.9293118931744624e-05,
"loss": 0.8246,
"step": 920
},
{
"epoch": 0.5104981473857555,
"grad_norm": 1.9255192279815674,
"learning_rate": 4.925490030092729e-05,
"loss": 0.7729,
"step": 930
},
{
"epoch": 0.5159873747770002,
"grad_norm": 1.2568154335021973,
"learning_rate": 4.9215691125560104e-05,
"loss": 0.7711,
"step": 940
},
{
"epoch": 0.5214766021682449,
"grad_norm": 1.2998193502426147,
"learning_rate": 4.917549300690445e-05,
"loss": 0.7897,
"step": 950
},
{
"epoch": 0.5269658295594895,
"grad_norm": 1.712433099746704,
"learning_rate": 4.9134307586609104e-05,
"loss": 0.7356,
"step": 960
},
{
"epoch": 0.5324550569507341,
"grad_norm": 1.4403119087219238,
"learning_rate": 4.9092136546643184e-05,
"loss": 0.7599,
"step": 970
},
{
"epoch": 0.5379442843419788,
"grad_norm": 1.2811603546142578,
"learning_rate": 4.9048981609227504e-05,
"loss": 0.7572,
"step": 980
},
{
"epoch": 0.5434335117332235,
"grad_norm": 1.6650887727737427,
"learning_rate": 4.9004844536764185e-05,
"loss": 0.7726,
"step": 990
},
{
"epoch": 0.5489227391244682,
"grad_norm": 1.4498590230941772,
"learning_rate": 4.8959727131764735e-05,
"loss": 0.7772,
"step": 1000
},
{
"epoch": 0.5544119665157129,
"grad_norm": 1.38353431224823,
"learning_rate": 4.891363123677638e-05,
"loss": 0.7954,
"step": 1010
},
{
"epoch": 0.5599011939069576,
"grad_norm": 1.5972951650619507,
"learning_rate": 4.886655873430687e-05,
"loss": 0.759,
"step": 1020
},
{
"epoch": 0.5653904212982023,
"grad_norm": 1.409515380859375,
"learning_rate": 4.881851154674757e-05,
"loss": 0.675,
"step": 1030
},
{
"epoch": 0.570879648689447,
"grad_norm": 1.2562367916107178,
"learning_rate": 4.876949163629494e-05,
"loss": 0.8194,
"step": 1040
},
{
"epoch": 0.5763688760806917,
"grad_norm": 1.520317554473877,
"learning_rate": 4.871950100487043e-05,
"loss": 0.7587,
"step": 1050
},
{
"epoch": 0.5818581034719363,
"grad_norm": 1.5166853666305542,
"learning_rate": 4.866854169403871e-05,
"loss": 0.6909,
"step": 1060
},
{
"epoch": 0.587347330863181,
"grad_norm": 1.4219826459884644,
"learning_rate": 4.861661578492429e-05,
"loss": 0.7907,
"step": 1070
},
{
"epoch": 0.5928365582544257,
"grad_norm": 1.449629545211792,
"learning_rate": 4.856372539812655e-05,
"loss": 0.7512,
"step": 1080
},
{
"epoch": 0.5983257856456704,
"grad_norm": 1.715462565422058,
"learning_rate": 4.850987269363311e-05,
"loss": 0.7171,
"step": 1090
},
{
"epoch": 0.6038150130369151,
"grad_norm": 1.6240124702453613,
"learning_rate": 4.845505987073161e-05,
"loss": 0.763,
"step": 1100
},
{
"epoch": 0.6093042404281598,
"grad_norm": 1.3949427604675293,
"learning_rate": 4.839928916791996e-05,
"loss": 0.7513,
"step": 1110
},
{
"epoch": 0.6147934678194045,
"grad_norm": 1.491368293762207,
"learning_rate": 4.834256286281482e-05,
"loss": 0.6982,
"step": 1120
},
{
"epoch": 0.620282695210649,
"grad_norm": 1.2943052053451538,
"learning_rate": 4.82848832720587e-05,
"loss": 0.8051,
"step": 1130
},
{
"epoch": 0.6257719226018937,
"grad_norm": 1.7091878652572632,
"learning_rate": 4.8226252751225245e-05,
"loss": 0.7914,
"step": 1140
},
{
"epoch": 0.6312611499931384,
"grad_norm": 1.2987576723098755,
"learning_rate": 4.816667369472309e-05,
"loss": 0.7705,
"step": 1150
},
{
"epoch": 0.6367503773843831,
"grad_norm": 1.4213101863861084,
"learning_rate": 4.810614853569807e-05,
"loss": 0.7916,
"step": 1160
},
{
"epoch": 0.6422396047756278,
"grad_norm": 1.4974167346954346,
"learning_rate": 4.804467974593387e-05,
"loss": 0.7628,
"step": 1170
},
{
"epoch": 0.6477288321668725,
"grad_norm": 1.729684591293335,
"learning_rate": 4.798226983575103e-05,
"loss": 0.7393,
"step": 1180
},
{
"epoch": 0.6532180595581172,
"grad_norm": 1.765308141708374,
"learning_rate": 4.7918921353904464e-05,
"loss": 0.7251,
"step": 1190
},
{
"epoch": 0.6587072869493619,
"grad_norm": 1.7703893184661865,
"learning_rate": 4.785463688747937e-05,
"loss": 0.7329,
"step": 1200
},
{
"epoch": 0.6641965143406066,
"grad_norm": 2.700155258178711,
"learning_rate": 4.778941906178556e-05,
"loss": 0.6967,
"step": 1210
},
{
"epoch": 0.6696857417318512,
"grad_norm": 1.3553398847579956,
"learning_rate": 4.772327054025027e-05,
"loss": 0.7221,
"step": 1220
},
{
"epoch": 0.6751749691230959,
"grad_norm": 1.2455166578292847,
"learning_rate": 4.765619402430934e-05,
"loss": 0.6925,
"step": 1230
},
{
"epoch": 0.6806641965143406,
"grad_norm": 1.7047752141952515,
"learning_rate": 4.758819225329696e-05,
"loss": 0.7373,
"step": 1240
},
{
"epoch": 0.6861534239055853,
"grad_norm": 1.5384269952774048,
"learning_rate": 4.751926800433374e-05,
"loss": 0.7348,
"step": 1250
},
{
"epoch": 0.69164265129683,
"grad_norm": 1.491666316986084,
"learning_rate": 4.744942409221333e-05,
"loss": 0.7121,
"step": 1260
},
{
"epoch": 0.6971318786880747,
"grad_norm": 1.4360090494155884,
"learning_rate": 4.7378663369287445e-05,
"loss": 0.6728,
"step": 1270
},
{
"epoch": 0.7026211060793194,
"grad_norm": 1.3977197408676147,
"learning_rate": 4.730698872534938e-05,
"loss": 0.7617,
"step": 1280
},
{
"epoch": 0.7081103334705641,
"grad_norm": 1.7353872060775757,
"learning_rate": 4.723440308751601e-05,
"loss": 0.6887,
"step": 1290
},
{
"epoch": 0.7135995608618086,
"grad_norm": 1.3200151920318604,
"learning_rate": 4.716090942010823e-05,
"loss": 0.752,
"step": 1300
},
{
"epoch": 0.7190887882530533,
"grad_norm": 1.333355188369751,
"learning_rate": 4.708651072452993e-05,
"loss": 0.7336,
"step": 1310
},
{
"epoch": 0.724578015644298,
"grad_norm": 1.6440070867538452,
"learning_rate": 4.701121003914537e-05,
"loss": 0.7333,
"step": 1320
},
{
"epoch": 0.7300672430355427,
"grad_norm": 1.848791480064392,
"learning_rate": 4.693501043915514e-05,
"loss": 0.7648,
"step": 1330
},
{
"epoch": 0.7355564704267874,
"grad_norm": 1.593891978263855,
"learning_rate": 4.685791503647052e-05,
"loss": 0.787,
"step": 1340
},
{
"epoch": 0.7410456978180321,
"grad_norm": 1.6957751512527466,
"learning_rate": 4.6779926979586475e-05,
"loss": 0.7212,
"step": 1350
},
{
"epoch": 0.7465349252092768,
"grad_norm": 1.3588330745697021,
"learning_rate": 4.6701049453453e-05,
"loss": 0.7175,
"step": 1360
},
{
"epoch": 0.7520241526005215,
"grad_norm": 1.462112307548523,
"learning_rate": 4.662128567934509e-05,
"loss": 0.7133,
"step": 1370
},
{
"epoch": 0.7575133799917662,
"grad_norm": 1.5633749961853027,
"learning_rate": 4.654063891473115e-05,
"loss": 0.6978,
"step": 1380
},
{
"epoch": 0.7630026073830108,
"grad_norm": 1.7605217695236206,
"learning_rate": 4.645911245314e-05,
"loss": 0.7019,
"step": 1390
},
{
"epoch": 0.7684918347742555,
"grad_norm": 1.6843842267990112,
"learning_rate": 4.637670962402636e-05,
"loss": 0.7483,
"step": 1400
},
{
"epoch": 0.7739810621655002,
"grad_norm": 2.015845537185669,
"learning_rate": 4.629343379263487e-05,
"loss": 0.7208,
"step": 1410
},
{
"epoch": 0.7794702895567449,
"grad_norm": 1.8158447742462158,
"learning_rate": 4.620928835986267e-05,
"loss": 0.7733,
"step": 1420
},
{
"epoch": 0.7849595169479896,
"grad_norm": 1.7793387174606323,
"learning_rate": 4.6124276762120485e-05,
"loss": 0.7111,
"step": 1430
},
{
"epoch": 0.7904487443392343,
"grad_norm": 1.6674373149871826,
"learning_rate": 4.603840247119233e-05,
"loss": 0.6663,
"step": 1440
},
{
"epoch": 0.795937971730479,
"grad_norm": 1.4028520584106445,
"learning_rate": 4.595166899409368e-05,
"loss": 0.7692,
"step": 1450
},
{
"epoch": 0.8014271991217237,
"grad_norm": 1.6022142171859741,
"learning_rate": 4.5864079872928265e-05,
"loss": 0.7305,
"step": 1460
},
{
"epoch": 0.8069164265129684,
"grad_norm": 1.4971508979797363,
"learning_rate": 4.577563868474344e-05,
"loss": 0.6875,
"step": 1470
},
{
"epoch": 0.8124056539042129,
"grad_norm": 1.8490726947784424,
"learning_rate": 4.5686349041384055e-05,
"loss": 0.6849,
"step": 1480
},
{
"epoch": 0.8178948812954576,
"grad_norm": 1.9100017547607422,
"learning_rate": 4.559621458934498e-05,
"loss": 0.6506,
"step": 1490
},
{
"epoch": 0.8233841086867023,
"grad_norm": 1.6782461404800415,
"learning_rate": 4.550523900962219e-05,
"loss": 0.704,
"step": 1500
},
{
"epoch": 0.828873336077947,
"grad_norm": 2.1226425170898438,
"learning_rate": 4.541342601756242e-05,
"loss": 0.6988,
"step": 1510
},
{
"epoch": 0.8343625634691917,
"grad_norm": 1.658097267150879,
"learning_rate": 4.532077936271144e-05,
"loss": 0.705,
"step": 1520
},
{
"epoch": 0.8398517908604364,
"grad_norm": 1.8850988149642944,
"learning_rate": 4.522730282866093e-05,
"loss": 0.6801,
"step": 1530
},
{
"epoch": 0.8453410182516811,
"grad_norm": 1.5480940341949463,
"learning_rate": 4.513300023289397e-05,
"loss": 0.6308,
"step": 1540
},
{
"epoch": 0.8508302456429258,
"grad_norm": 1.7652947902679443,
"learning_rate": 4.503787542662912e-05,
"loss": 0.6731,
"step": 1550
},
{
"epoch": 0.8563194730341704,
"grad_norm": 1.902155876159668,
"learning_rate": 4.494193229466314e-05,
"loss": 0.7404,
"step": 1560
},
{
"epoch": 0.8618087004254151,
"grad_norm": 2.1436920166015625,
"learning_rate": 4.4845174755212385e-05,
"loss": 0.6884,
"step": 1570
},
{
"epoch": 0.8672979278166598,
"grad_norm": 1.6139538288116455,
"learning_rate": 4.47476067597527e-05,
"loss": 0.6947,
"step": 1580
},
{
"epoch": 0.8727871552079045,
"grad_norm": 1.5919870138168335,
"learning_rate": 4.464923229285816e-05,
"loss": 0.6982,
"step": 1590
},
{
"epoch": 0.8782763825991492,
"grad_norm": 1.6209038496017456,
"learning_rate": 4.4550055372038225e-05,
"loss": 0.7124,
"step": 1600
},
{
"epoch": 0.8837656099903939,
"grad_norm": 1.631515383720398,
"learning_rate": 4.445008004757376e-05,
"loss": 0.6771,
"step": 1610
},
{
"epoch": 0.8892548373816386,
"grad_norm": 1.4836645126342773,
"learning_rate": 4.434931040235159e-05,
"loss": 0.6272,
"step": 1620
},
{
"epoch": 0.8947440647728833,
"grad_norm": 1.3640625476837158,
"learning_rate": 4.4247750551697756e-05,
"loss": 0.6477,
"step": 1630
},
{
"epoch": 0.900233292164128,
"grad_norm": 1.5562537908554077,
"learning_rate": 4.414540464320945e-05,
"loss": 0.7128,
"step": 1640
},
{
"epoch": 0.9057225195553725,
"grad_norm": 1.548048973083496,
"learning_rate": 4.404227685658565e-05,
"loss": 0.7098,
"step": 1650
},
{
"epoch": 0.9112117469466172,
"grad_norm": 1.613368034362793,
"learning_rate": 4.39383714034564e-05,
"loss": 0.6926,
"step": 1660
},
{
"epoch": 0.9167009743378619,
"grad_norm": 1.789654016494751,
"learning_rate": 4.383369252721084e-05,
"loss": 0.6398,
"step": 1670
},
{
"epoch": 0.9221902017291066,
"grad_norm": 1.625928282737732,
"learning_rate": 4.372824450282388e-05,
"loss": 0.7087,
"step": 1680
},
{
"epoch": 0.9276794291203513,
"grad_norm": 1.686936855316162,
"learning_rate": 4.362203163668164e-05,
"loss": 0.6764,
"step": 1690
},
{
"epoch": 0.933168656511596,
"grad_norm": 1.6460559368133545,
"learning_rate": 4.351505826640555e-05,
"loss": 0.6969,
"step": 1700
},
{
"epoch": 0.9386578839028407,
"grad_norm": 1.6267837285995483,
"learning_rate": 4.3407328760675245e-05,
"loss": 0.672,
"step": 1710
},
{
"epoch": 0.9441471112940854,
"grad_norm": 1.5070548057556152,
"learning_rate": 4.329884751905014e-05,
"loss": 0.6586,
"step": 1720
},
{
"epoch": 0.94963633868533,
"grad_norm": 1.8759193420410156,
"learning_rate": 4.3189618971789747e-05,
"loss": 0.6601,
"step": 1730
},
{
"epoch": 0.9551255660765747,
"grad_norm": 1.6111549139022827,
"learning_rate": 4.307964757967273e-05,
"loss": 0.7042,
"step": 1740
},
{
"epoch": 0.9606147934678194,
"grad_norm": 1.3748118877410889,
"learning_rate": 4.2968937833814784e-05,
"loss": 0.6573,
"step": 1750
},
{
"epoch": 0.9661040208590641,
"grad_norm": 1.7284533977508545,
"learning_rate": 4.285749425548518e-05,
"loss": 0.619,
"step": 1760
},
{
"epoch": 0.9715932482503088,
"grad_norm": 1.5528743267059326,
"learning_rate": 4.274532139592211e-05,
"loss": 0.6601,
"step": 1770
},
{
"epoch": 0.9770824756415535,
"grad_norm": 1.6220190525054932,
"learning_rate": 4.2632423836146885e-05,
"loss": 0.6449,
"step": 1780
},
{
"epoch": 0.9825717030327982,
"grad_norm": 2.00435471534729,
"learning_rate": 4.251880618677678e-05,
"loss": 0.6404,
"step": 1790
},
{
"epoch": 0.9880609304240429,
"grad_norm": 1.8456660509109497,
"learning_rate": 4.240447308783679e-05,
"loss": 0.7124,
"step": 1800
},
{
"epoch": 0.9935501578152875,
"grad_norm": 1.8724040985107422,
"learning_rate": 4.2289429208570094e-05,
"loss": 0.7138,
"step": 1810
},
{
"epoch": 0.9990393852065321,
"grad_norm": 1.441105842590332,
"learning_rate": 4.217367924724741e-05,
"loss": 0.7439,
"step": 1820
},
{
"epoch": 1.0045286125977768,
"grad_norm": 1.392276406288147,
"learning_rate": 4.2057227930975066e-05,
"loss": 0.4876,
"step": 1830
},
{
"epoch": 1.0100178399890216,
"grad_norm": 1.4682689905166626,
"learning_rate": 4.194008001550204e-05,
"loss": 0.4949,
"step": 1840
},
{
"epoch": 1.0155070673802662,
"grad_norm": 1.7317707538604736,
"learning_rate": 4.1822240285025635e-05,
"loss": 0.5329,
"step": 1850
},
{
"epoch": 1.020996294771511,
"grad_norm": 1.9328278303146362,
"learning_rate": 4.170371355199621e-05,
"loss": 0.5068,
"step": 1860
},
{
"epoch": 1.0264855221627556,
"grad_norm": 1.7879178524017334,
"learning_rate": 4.158450465692051e-05,
"loss": 0.5112,
"step": 1870
},
{
"epoch": 1.0319747495540004,
"grad_norm": 1.6801658868789673,
"learning_rate": 4.146461846816411e-05,
"loss": 0.4826,
"step": 1880
},
{
"epoch": 1.037463976945245,
"grad_norm": 1.6541537046432495,
"learning_rate": 4.1344059881752534e-05,
"loss": 0.4522,
"step": 1890
},
{
"epoch": 1.0429532043364897,
"grad_norm": 2.27681303024292,
"learning_rate": 4.1222833821171315e-05,
"loss": 0.4726,
"step": 1900
},
{
"epoch": 1.0484424317277343,
"grad_norm": 1.6335279941558838,
"learning_rate": 4.110094523716492e-05,
"loss": 0.469,
"step": 1910
},
{
"epoch": 1.053931659118979,
"grad_norm": 1.730760931968689,
"learning_rate": 4.0978399107534584e-05,
"loss": 0.4554,
"step": 1920
},
{
"epoch": 1.0594208865102237,
"grad_norm": 1.636106014251709,
"learning_rate": 4.0855200436935e-05,
"loss": 0.4914,
"step": 1930
},
{
"epoch": 1.0649101139014683,
"grad_norm": 1.855231523513794,
"learning_rate": 4.073135425666997e-05,
"loss": 0.4609,
"step": 1940
},
{
"epoch": 1.070399341292713,
"grad_norm": 2.0908730030059814,
"learning_rate": 4.0606865624486875e-05,
"loss": 0.472,
"step": 1950
},
{
"epoch": 1.0758885686839577,
"grad_norm": 1.7960741519927979,
"learning_rate": 4.048173962437019e-05,
"loss": 0.5072,
"step": 1960
},
{
"epoch": 1.0813777960752025,
"grad_norm": 1.6274662017822266,
"learning_rate": 4.035598136633378e-05,
"loss": 0.455,
"step": 1970
},
{
"epoch": 1.086867023466447,
"grad_norm": 1.898768663406372,
"learning_rate": 4.0229595986212304e-05,
"loss": 0.5023,
"step": 1980
},
{
"epoch": 1.0923562508576918,
"grad_norm": 1.6245406866073608,
"learning_rate": 4.0102588645451396e-05,
"loss": 0.4863,
"step": 1990
},
{
"epoch": 1.0978454782489364,
"grad_norm": 1.440356731414795,
"learning_rate": 3.997496453089692e-05,
"loss": 0.4912,
"step": 2000
},
{
"epoch": 1.1033347056401812,
"grad_norm": 1.9108120203018188,
"learning_rate": 3.984672885458312e-05,
"loss": 0.4691,
"step": 2010
},
{
"epoch": 1.1088239330314258,
"grad_norm": 1.7355122566223145,
"learning_rate": 3.971788685351978e-05,
"loss": 0.4965,
"step": 2020
},
{
"epoch": 1.1143131604226706,
"grad_norm": 1.7125989198684692,
"learning_rate": 3.9588443789478366e-05,
"loss": 0.468,
"step": 2030
},
{
"epoch": 1.1198023878139152,
"grad_norm": 1.8434703350067139,
"learning_rate": 3.945840494877709e-05,
"loss": 0.4886,
"step": 2040
},
{
"epoch": 1.12529161520516,
"grad_norm": 2.302004337310791,
"learning_rate": 3.934086499185402e-05,
"loss": 0.4932,
"step": 2050
},
{
"epoch": 1.1307808425964045,
"grad_norm": 1.931429147720337,
"learning_rate": 3.9209708826272075e-05,
"loss": 0.5121,
"step": 2060
},
{
"epoch": 1.1362700699876491,
"grad_norm": 1.889414668083191,
"learning_rate": 3.907797235116677e-05,
"loss": 0.5094,
"step": 2070
},
{
"epoch": 1.141759297378894,
"grad_norm": 2.243352174758911,
"learning_rate": 3.894566094651682e-05,
"loss": 0.488,
"step": 2080
},
{
"epoch": 1.1472485247701387,
"grad_norm": 1.6855474710464478,
"learning_rate": 3.881278001578046e-05,
"loss": 0.531,
"step": 2090
},
{
"epoch": 1.1527377521613833,
"grad_norm": 2.328468084335327,
"learning_rate": 3.8679334985674786e-05,
"loss": 0.5397,
"step": 2100
},
{
"epoch": 1.1582269795526279,
"grad_norm": 1.8057246208190918,
"learning_rate": 3.854533130595408e-05,
"loss": 0.4964,
"step": 2110
},
{
"epoch": 1.1637162069438727,
"grad_norm": 1.6702812910079956,
"learning_rate": 3.8410774449187315e-05,
"loss": 0.5011,
"step": 2120
},
{
"epoch": 1.1692054343351173,
"grad_norm": 1.4972355365753174,
"learning_rate": 3.827566991053461e-05,
"loss": 0.4922,
"step": 2130
},
{
"epoch": 1.174694661726362,
"grad_norm": 1.739022970199585,
"learning_rate": 3.814002320752287e-05,
"loss": 0.4309,
"step": 2140
},
{
"epoch": 1.1801838891176066,
"grad_norm": 1.8909087181091309,
"learning_rate": 3.8003839879820377e-05,
"loss": 0.4761,
"step": 2150
},
{
"epoch": 1.1856731165088514,
"grad_norm": 1.9765682220458984,
"learning_rate": 3.786712548901064e-05,
"loss": 0.4895,
"step": 2160
},
{
"epoch": 1.191162343900096,
"grad_norm": 2.1266307830810547,
"learning_rate": 3.772988561836517e-05,
"loss": 0.4894,
"step": 2170
},
{
"epoch": 1.1966515712913408,
"grad_norm": 1.7856028079986572,
"learning_rate": 3.759212587261559e-05,
"loss": 0.4812,
"step": 2180
},
{
"epoch": 1.2021407986825854,
"grad_norm": 1.8546531200408936,
"learning_rate": 3.745385187772463e-05,
"loss": 0.4928,
"step": 2190
},
{
"epoch": 1.2076300260738302,
"grad_norm": 1.8596118688583374,
"learning_rate": 3.731506928065641e-05,
"loss": 0.512,
"step": 2200
},
{
"epoch": 1.2131192534650748,
"grad_norm": 2.024635076522827,
"learning_rate": 3.717578374914585e-05,
"loss": 0.4715,
"step": 2210
},
{
"epoch": 1.2186084808563196,
"grad_norm": 2.1620028018951416,
"learning_rate": 3.703600097146718e-05,
"loss": 0.4754,
"step": 2220
},
{
"epoch": 1.2240977082475641,
"grad_norm": 1.9437251091003418,
"learning_rate": 3.68957266562016e-05,
"loss": 0.475,
"step": 2230
},
{
"epoch": 1.229586935638809,
"grad_norm": 2.7284131050109863,
"learning_rate": 3.675496653200425e-05,
"loss": 0.4901,
"step": 2240
},
{
"epoch": 1.2350761630300535,
"grad_norm": 2.2666921615600586,
"learning_rate": 3.661372634737013e-05,
"loss": 0.4694,
"step": 2250
},
{
"epoch": 1.240565390421298,
"grad_norm": 1.5657079219818115,
"learning_rate": 3.647201187039946e-05,
"loss": 0.4809,
"step": 2260
},
{
"epoch": 1.246054617812543,
"grad_norm": 2.3592708110809326,
"learning_rate": 3.632982888856202e-05,
"loss": 0.4539,
"step": 2270
},
{
"epoch": 1.2515438452037877,
"grad_norm": 1.7647560834884644,
"learning_rate": 3.6187183208460844e-05,
"loss": 0.4945,
"step": 2280
},
{
"epoch": 1.2570330725950323,
"grad_norm": 2.0273566246032715,
"learning_rate": 3.604408065559508e-05,
"loss": 0.4853,
"step": 2290
},
{
"epoch": 1.2625222999862769,
"grad_norm": 2.0692555904388428,
"learning_rate": 3.590052707412208e-05,
"loss": 0.498,
"step": 2300
},
{
"epoch": 1.2680115273775217,
"grad_norm": 2.35859751701355,
"learning_rate": 3.575652832661872e-05,
"loss": 0.5287,
"step": 2310
},
{
"epoch": 1.2735007547687662,
"grad_norm": 1.8455514907836914,
"learning_rate": 3.5612090293841994e-05,
"loss": 0.5035,
"step": 2320
},
{
"epoch": 1.278989982160011,
"grad_norm": 2.233416795730591,
"learning_rate": 3.5467218874488837e-05,
"loss": 0.5078,
"step": 2330
},
{
"epoch": 1.2844792095512556,
"grad_norm": 1.7934064865112305,
"learning_rate": 3.5321919984955244e-05,
"loss": 0.5015,
"step": 2340
},
{
"epoch": 1.2899684369425004,
"grad_norm": 1.753578543663025,
"learning_rate": 3.517619955909463e-05,
"loss": 0.4556,
"step": 2350
},
{
"epoch": 1.295457664333745,
"grad_norm": 1.9207135438919067,
"learning_rate": 3.5030063547975525e-05,
"loss": 0.4417,
"step": 2360
},
{
"epoch": 1.3009468917249898,
"grad_norm": 1.77664315700531,
"learning_rate": 3.488351791963849e-05,
"loss": 0.435,
"step": 2370
},
{
"epoch": 1.3064361191162344,
"grad_norm": 1.5567264556884766,
"learning_rate": 3.473656865885248e-05,
"loss": 0.4872,
"step": 2380
},
{
"epoch": 1.3119253465074792,
"grad_norm": 1.9232813119888306,
"learning_rate": 3.4589221766870306e-05,
"loss": 0.479,
"step": 2390
},
{
"epoch": 1.3174145738987237,
"grad_norm": 1.6090134382247925,
"learning_rate": 3.444148326118366e-05,
"loss": 0.577,
"step": 2400
},
{
"epoch": 1.3229038012899683,
"grad_norm": 1.7962336540222168,
"learning_rate": 3.4293359175277314e-05,
"loss": 0.4801,
"step": 2410
},
{
"epoch": 1.3283930286812131,
"grad_norm": 2.1019630432128906,
"learning_rate": 3.414485555838273e-05,
"loss": 0.4884,
"step": 2420
},
{
"epoch": 1.333882256072458,
"grad_norm": 1.8056087493896484,
"learning_rate": 3.3995978475231024e-05,
"loss": 0.4527,
"step": 2430
},
{
"epoch": 1.3393714834637025,
"grad_norm": 1.7557107210159302,
"learning_rate": 3.3846734005805254e-05,
"loss": 0.4831,
"step": 2440
},
{
"epoch": 1.344860710854947,
"grad_norm": 1.7773773670196533,
"learning_rate": 3.369712824509217e-05,
"loss": 0.4994,
"step": 2450
},
{
"epoch": 1.3503499382461919,
"grad_norm": 1.7856857776641846,
"learning_rate": 3.354716730283327e-05,
"loss": 0.4761,
"step": 2460
},
{
"epoch": 1.3558391656374364,
"grad_norm": 2.119858980178833,
"learning_rate": 3.3396857303275296e-05,
"loss": 0.4891,
"step": 2470
},
{
"epoch": 1.3613283930286812,
"grad_norm": 1.9912039041519165,
"learning_rate": 3.324620438492011e-05,
"loss": 0.4415,
"step": 2480
},
{
"epoch": 1.3668176204199258,
"grad_norm": 2.347066879272461,
"learning_rate": 3.309521470027403e-05,
"loss": 0.4733,
"step": 2490
},
{
"epoch": 1.3723068478111706,
"grad_norm": 1.963139533996582,
"learning_rate": 3.294389441559655e-05,
"loss": 0.4626,
"step": 2500
},
{
"epoch": 1.3777960752024152,
"grad_norm": 2.103672742843628,
"learning_rate": 3.279224971064851e-05,
"loss": 0.5168,
"step": 2510
},
{
"epoch": 1.38328530259366,
"grad_norm": 1.741493821144104,
"learning_rate": 3.2640286778439746e-05,
"loss": 0.4687,
"step": 2520
},
{
"epoch": 1.3887745299849046,
"grad_norm": 1.9451817274093628,
"learning_rate": 3.248801182497615e-05,
"loss": 0.454,
"step": 2530
},
{
"epoch": 1.3942637573761494,
"grad_norm": 2.4190995693206787,
"learning_rate": 3.233543106900624e-05,
"loss": 0.4594,
"step": 2540
},
{
"epoch": 1.399752984767394,
"grad_norm": 1.543632984161377,
"learning_rate": 3.21825507417672e-05,
"loss": 0.4408,
"step": 2550
},
{
"epoch": 1.4052422121586385,
"grad_norm": 2.006373882293701,
"learning_rate": 3.202937708673033e-05,
"loss": 0.4802,
"step": 2560
},
{
"epoch": 1.4107314395498833,
"grad_norm": 1.912208914756775,
"learning_rate": 3.1875916359346214e-05,
"loss": 0.4731,
"step": 2570
},
{
"epoch": 1.4162206669411281,
"grad_norm": 1.6737933158874512,
"learning_rate": 3.17221748267891e-05,
"loss": 0.454,
"step": 2580
},
{
"epoch": 1.4217098943323727,
"grad_norm": 1.9672836065292358,
"learning_rate": 3.156815876770105e-05,
"loss": 0.4229,
"step": 2590
},
{
"epoch": 1.4271991217236173,
"grad_norm": 1.512810230255127,
"learning_rate": 3.1413874471935496e-05,
"loss": 0.4896,
"step": 2600
},
{
"epoch": 1.432688349114862,
"grad_norm": 1.8552961349487305,
"learning_rate": 3.125932824030037e-05,
"loss": 0.4808,
"step": 2610
},
{
"epoch": 1.438177576506107,
"grad_norm": 1.7483348846435547,
"learning_rate": 3.110452638430081e-05,
"loss": 0.4271,
"step": 2620
},
{
"epoch": 1.4436668038973515,
"grad_norm": 1.7746537923812866,
"learning_rate": 3.094947522588135e-05,
"loss": 0.4618,
"step": 2630
},
{
"epoch": 1.449156031288596,
"grad_norm": 2.1067216396331787,
"learning_rate": 3.079418109716778e-05,
"loss": 0.4765,
"step": 2640
},
{
"epoch": 1.4546452586798408,
"grad_norm": 1.6052168607711792,
"learning_rate": 3.063865034020857e-05,
"loss": 0.4596,
"step": 2650
},
{
"epoch": 1.4601344860710854,
"grad_norm": 1.6968189477920532,
"learning_rate": 3.0482889306715813e-05,
"loss": 0.4384,
"step": 2660
},
{
"epoch": 1.4656237134623302,
"grad_norm": 1.869379997253418,
"learning_rate": 3.032690435780584e-05,
"loss": 0.4872,
"step": 2670
},
{
"epoch": 1.4711129408535748,
"grad_norm": 1.8812456130981445,
"learning_rate": 3.017070186373949e-05,
"loss": 0.4581,
"step": 2680
},
{
"epoch": 1.4766021682448196,
"grad_norm": 1.99275803565979,
"learning_rate": 3.001428820366187e-05,
"loss": 0.49,
"step": 2690
},
{
"epoch": 1.4820913956360642,
"grad_norm": 1.6111352443695068,
"learning_rate": 2.9857669765341928e-05,
"loss": 0.4262,
"step": 2700
},
{
"epoch": 1.487580623027309,
"grad_norm": 2.6115357875823975,
"learning_rate": 2.9700852944911512e-05,
"loss": 0.4774,
"step": 2710
},
{
"epoch": 1.4930698504185536,
"grad_norm": 1.8144830465316772,
"learning_rate": 2.9543844146604195e-05,
"loss": 0.4618,
"step": 2720
},
{
"epoch": 1.4985590778097984,
"grad_norm": 1.7375366687774658,
"learning_rate": 2.938664978249372e-05,
"loss": 0.4278,
"step": 2730
},
{
"epoch": 1.504048305201043,
"grad_norm": 1.914023756980896,
"learning_rate": 2.9229276272232146e-05,
"loss": 0.4706,
"step": 2740
},
{
"epoch": 1.5095375325922875,
"grad_norm": 1.7386229038238525,
"learning_rate": 2.907173004278768e-05,
"loss": 0.4308,
"step": 2750
},
{
"epoch": 1.5150267599835323,
"grad_norm": 1.5574982166290283,
"learning_rate": 2.8914017528182185e-05,
"loss": 0.4487,
"step": 2760
},
{
"epoch": 1.5205159873747771,
"grad_norm": 2.144409418106079,
"learning_rate": 2.8756145169228432e-05,
"loss": 0.4232,
"step": 2770
},
{
"epoch": 1.5260052147660217,
"grad_norm": 2.5904343128204346,
"learning_rate": 2.859811941326709e-05,
"loss": 0.4603,
"step": 2780
},
{
"epoch": 1.5314944421572663,
"grad_norm": 2.3824493885040283,
"learning_rate": 2.8439946713903354e-05,
"loss": 0.4649,
"step": 2790
},
{
"epoch": 1.536983669548511,
"grad_norm": 2.1540448665618896,
"learning_rate": 2.8281633530743497e-05,
"loss": 0.4988,
"step": 2800
},
{
"epoch": 1.5424728969397559,
"grad_norm": 2.121973752975464,
"learning_rate": 2.8123186329130942e-05,
"loss": 0.4795,
"step": 2810
},
{
"epoch": 1.5479621243310004,
"grad_norm": 1.8560881614685059,
"learning_rate": 2.7964611579882317e-05,
"loss": 0.427,
"step": 2820
},
{
"epoch": 1.553451351722245,
"grad_norm": 2.625507354736328,
"learning_rate": 2.7805915759023153e-05,
"loss": 0.4982,
"step": 2830
},
{
"epoch": 1.5589405791134898,
"grad_norm": 1.8594845533370972,
"learning_rate": 2.764710534752342e-05,
"loss": 0.4489,
"step": 2840
},
{
"epoch": 1.5644298065047346,
"grad_norm": 1.9746872186660767,
"learning_rate": 2.748818683103285e-05,
"loss": 0.4177,
"step": 2850
},
{
"epoch": 1.569919033895979,
"grad_norm": 1.9741085767745972,
"learning_rate": 2.7329166699616064e-05,
"loss": 0.4816,
"step": 2860
},
{
"epoch": 1.5754082612872238,
"grad_norm": 1.9904859066009521,
"learning_rate": 2.7170051447487532e-05,
"loss": 0.4392,
"step": 2870
},
{
"epoch": 1.5808974886784686,
"grad_norm": 1.9376888275146484,
"learning_rate": 2.7010847572746356e-05,
"loss": 0.5002,
"step": 2880
},
{
"epoch": 1.5863867160697132,
"grad_norm": 1.6673862934112549,
"learning_rate": 2.6851561577110874e-05,
"loss": 0.437,
"step": 2890
},
{
"epoch": 1.5918759434609577,
"grad_norm": 1.8041437864303589,
"learning_rate": 2.6692199965653185e-05,
"loss": 0.4565,
"step": 2900
},
{
"epoch": 1.5973651708522025,
"grad_norm": 1.6648041009902954,
"learning_rate": 2.6532769246533435e-05,
"loss": 0.4755,
"step": 2910
},
{
"epoch": 1.6028543982434473,
"grad_norm": 2.290234088897705,
"learning_rate": 2.6373275930734075e-05,
"loss": 0.4603,
"step": 2920
},
{
"epoch": 1.608343625634692,
"grad_norm": 2.1882123947143555,
"learning_rate": 2.621372653179391e-05,
"loss": 0.4551,
"step": 2930
},
{
"epoch": 1.6138328530259365,
"grad_norm": 1.7908653020858765,
"learning_rate": 2.6054127565542146e-05,
"loss": 0.5062,
"step": 2940
},
{
"epoch": 1.6193220804171813,
"grad_norm": 2.1407206058502197,
"learning_rate": 2.5894485549832254e-05,
"loss": 0.5046,
"step": 2950
},
{
"epoch": 1.624811307808426,
"grad_norm": 1.8676074743270874,
"learning_rate": 2.57348070042758e-05,
"loss": 0.4685,
"step": 2960
},
{
"epoch": 1.6303005351996707,
"grad_norm": 2.0238535404205322,
"learning_rate": 2.5575098449976204e-05,
"loss": 0.4836,
"step": 2970
},
{
"epoch": 1.6357897625909152,
"grad_norm": 2.1416938304901123,
"learning_rate": 2.541536640926238e-05,
"loss": 0.4146,
"step": 2980
},
{
"epoch": 1.64127898998216,
"grad_norm": 2.006524085998535,
"learning_rate": 2.5255617405422443e-05,
"loss": 0.441,
"step": 2990
},
{
"epoch": 1.6467682173734048,
"grad_norm": 2.2434608936309814,
"learning_rate": 2.5095857962437226e-05,
"loss": 0.4932,
"step": 3000
},
{
"epoch": 1.6522574447646494,
"grad_norm": 2.113938093185425,
"learning_rate": 2.4936094604713918e-05,
"loss": 0.4324,
"step": 3010
},
{
"epoch": 1.657746672155894,
"grad_norm": 1.8772289752960205,
"learning_rate": 2.4776333856819565e-05,
"loss": 0.4655,
"step": 3020
},
{
"epoch": 1.6632358995471388,
"grad_norm": 2.1245956420898438,
"learning_rate": 2.4616582243214623e-05,
"loss": 0.4631,
"step": 3030
},
{
"epoch": 1.6687251269383834,
"grad_norm": 2.2539162635803223,
"learning_rate": 2.4456846287986525e-05,
"loss": 0.4492,
"step": 3040
},
{
"epoch": 1.674214354329628,
"grad_norm": 2.4101765155792236,
"learning_rate": 2.429713251458323e-05,
"loss": 0.4326,
"step": 3050
},
{
"epoch": 1.6797035817208728,
"grad_norm": 2.1554176807403564,
"learning_rate": 2.4137447445546837e-05,
"loss": 0.4527,
"step": 3060
},
{
"epoch": 1.6851928091121176,
"grad_norm": 2.0779566764831543,
"learning_rate": 2.397779760224713e-05,
"loss": 0.4331,
"step": 3070
},
{
"epoch": 1.6906820365033621,
"grad_norm": 2.3145909309387207,
"learning_rate": 2.3818189504615367e-05,
"loss": 0.4159,
"step": 3080
},
{
"epoch": 1.6961712638946067,
"grad_norm": 2.1414687633514404,
"learning_rate": 2.3658629670877938e-05,
"loss": 0.4996,
"step": 3090
},
{
"epoch": 1.7016604912858515,
"grad_norm": 1.6377606391906738,
"learning_rate": 2.3499124617290187e-05,
"loss": 0.4827,
"step": 3100
},
{
"epoch": 1.7071497186770963,
"grad_norm": 2.27193546295166,
"learning_rate": 2.3339680857870288e-05,
"loss": 0.5358,
"step": 3110
},
{
"epoch": 1.7126389460683409,
"grad_norm": 1.6943323612213135,
"learning_rate": 2.318030490413323e-05,
"loss": 0.4584,
"step": 3120
},
{
"epoch": 1.7181281734595855,
"grad_norm": 2.1574575901031494,
"learning_rate": 2.30210032648249e-05,
"loss": 0.4366,
"step": 3130
},
{
"epoch": 1.7236174008508303,
"grad_norm": 1.805962085723877,
"learning_rate": 2.286178244565625e-05,
"loss": 0.4633,
"step": 3140
},
{
"epoch": 1.729106628242075,
"grad_norm": 2.4552011489868164,
"learning_rate": 2.2702648949037618e-05,
"loss": 0.4861,
"step": 3150
},
{
"epoch": 1.7345958556333196,
"grad_norm": 1.9455459117889404,
"learning_rate": 2.2543609273813195e-05,
"loss": 0.4881,
"step": 3160
},
{
"epoch": 1.7400850830245642,
"grad_norm": 1.8050341606140137,
"learning_rate": 2.2384669914995592e-05,
"loss": 0.418,
"step": 3170
},
{
"epoch": 1.745574310415809,
"grad_norm": 2.0198020935058594,
"learning_rate": 2.2225837363500636e-05,
"loss": 0.472,
"step": 3180
},
{
"epoch": 1.7510635378070538,
"grad_norm": 2.245699167251587,
"learning_rate": 2.2067118105882195e-05,
"loss": 0.4718,
"step": 3190
},
{
"epoch": 1.7565527651982984,
"grad_norm": 2.3782804012298584,
"learning_rate": 2.190851862406739e-05,
"loss": 0.4318,
"step": 3200
},
{
"epoch": 1.762041992589543,
"grad_norm": 1.766295075416565,
"learning_rate": 2.17500453950918e-05,
"loss": 0.4728,
"step": 3210
},
{
"epoch": 1.7675312199807878,
"grad_norm": 1.883118987083435,
"learning_rate": 2.159170489083498e-05,
"loss": 0.4229,
"step": 3220
},
{
"epoch": 1.7730204473720323,
"grad_norm": 2.2457189559936523,
"learning_rate": 2.1433503577756137e-05,
"loss": 0.3906,
"step": 3230
},
{
"epoch": 1.778509674763277,
"grad_norm": 1.66023850440979,
"learning_rate": 2.1275447916630055e-05,
"loss": 0.379,
"step": 3240
},
{
"epoch": 1.7839989021545217,
"grad_norm": 2.2401814460754395,
"learning_rate": 2.1117544362283286e-05,
"loss": 0.4173,
"step": 3250
},
{
"epoch": 1.7894881295457665,
"grad_norm": 2.2202141284942627,
"learning_rate": 2.0959799363330425e-05,
"loss": 0.426,
"step": 3260
},
{
"epoch": 1.794977356937011,
"grad_norm": 2.292778253555298,
"learning_rate": 2.0802219361910908e-05,
"loss": 0.4165,
"step": 3270
},
{
"epoch": 1.8004665843282557,
"grad_norm": 2.025392770767212,
"learning_rate": 2.0644810793425807e-05,
"loss": 0.4216,
"step": 3280
},
{
"epoch": 1.8059558117195005,
"grad_norm": 1.669911503791809,
"learning_rate": 2.048758008627506e-05,
"loss": 0.4745,
"step": 3290
},
{
"epoch": 1.8114450391107453,
"grad_norm": 2.2425777912139893,
"learning_rate": 2.033053366159493e-05,
"loss": 0.4314,
"step": 3300
},
{
"epoch": 1.8169342665019899,
"grad_norm": 2.065985679626465,
"learning_rate": 2.0173677932995787e-05,
"loss": 0.4882,
"step": 3310
},
{
"epoch": 1.8224234938932344,
"grad_norm": 1.8231384754180908,
"learning_rate": 2.0017019306300182e-05,
"loss": 0.4346,
"step": 3320
},
{
"epoch": 1.8279127212844792,
"grad_norm": 2.203216075897217,
"learning_rate": 1.9860564179281217e-05,
"loss": 0.515,
"step": 3330
},
{
"epoch": 1.833401948675724,
"grad_norm": 1.8703504800796509,
"learning_rate": 1.970431894140128e-05,
"loss": 0.4268,
"step": 3340
},
{
"epoch": 1.8388911760669686,
"grad_norm": 2.13779616355896,
"learning_rate": 1.954828997355112e-05,
"loss": 0.4324,
"step": 3350
},
{
"epoch": 1.8443804034582132,
"grad_norm": 2.4620044231414795,
"learning_rate": 1.939248364778924e-05,
"loss": 0.4542,
"step": 3360
},
{
"epoch": 1.849869630849458,
"grad_norm": 2.0297598838806152,
"learning_rate": 1.923690632708169e-05,
"loss": 0.4695,
"step": 3370
},
{
"epoch": 1.8553588582407026,
"grad_norm": 2.2314670085906982,
"learning_rate": 1.908156436504215e-05,
"loss": 0.4433,
"step": 3380
},
{
"epoch": 1.8608480856319471,
"grad_norm": 1.7243778705596924,
"learning_rate": 1.892646410567255e-05,
"loss": 0.4257,
"step": 3390
},
{
"epoch": 1.866337313023192,
"grad_norm": 1.6004356145858765,
"learning_rate": 1.877161188310392e-05,
"loss": 0.3585,
"step": 3400
},
{
"epoch": 1.8718265404144367,
"grad_norm": 2.1382360458374023,
"learning_rate": 1.8617014021337732e-05,
"loss": 0.4234,
"step": 3410
},
{
"epoch": 1.8773157678056813,
"grad_norm": 1.9200503826141357,
"learning_rate": 1.846267683398761e-05,
"loss": 0.4546,
"step": 3420
},
{
"epoch": 1.882804995196926,
"grad_norm": 1.9421885013580322,
"learning_rate": 1.830860662402153e-05,
"loss": 0.4505,
"step": 3430
},
{
"epoch": 1.8882942225881707,
"grad_norm": 2.25044584274292,
"learning_rate": 1.8154809683504403e-05,
"loss": 0.4684,
"step": 3440
},
{
"epoch": 1.8937834499794155,
"grad_norm": 1.7311737537384033,
"learning_rate": 1.8001292293341087e-05,
"loss": 0.4478,
"step": 3450
},
{
"epoch": 1.89927267737066,
"grad_norm": 1.8768479824066162,
"learning_rate": 1.7848060723019894e-05,
"loss": 0.4323,
"step": 3460
},
{
"epoch": 1.9047619047619047,
"grad_norm": 1.617492914199829,
"learning_rate": 1.7695121230356566e-05,
"loss": 0.4429,
"step": 3470
},
{
"epoch": 1.9102511321531495,
"grad_norm": 2.0293169021606445,
"learning_rate": 1.7542480061238685e-05,
"loss": 0.4399,
"step": 3480
},
{
"epoch": 1.9157403595443943,
"grad_norm": 1.7568955421447754,
"learning_rate": 1.7390143449370663e-05,
"loss": 0.4029,
"step": 3490
},
{
"epoch": 1.9212295869356388,
"grad_norm": 1.6997624635696411,
"learning_rate": 1.723811761601904e-05,
"loss": 0.4072,
"step": 3500
},
{
"epoch": 1.9267188143268834,
"grad_norm": 2.185622453689575,
"learning_rate": 1.708640876975855e-05,
"loss": 0.4502,
"step": 3510
},
{
"epoch": 1.9322080417181282,
"grad_norm": 2.803870439529419,
"learning_rate": 1.693502310621848e-05,
"loss": 0.4397,
"step": 3520
},
{
"epoch": 1.937697269109373,
"grad_norm": 2.163422107696533,
"learning_rate": 1.6783966807829692e-05,
"loss": 0.4562,
"step": 3530
},
{
"epoch": 1.9431864965006176,
"grad_norm": 1.7700269222259521,
"learning_rate": 1.66332460435721e-05,
"loss": 0.4332,
"step": 3540
},
{
"epoch": 1.9486757238918622,
"grad_norm": 1.9938660860061646,
"learning_rate": 1.648286696872277e-05,
"loss": 0.4392,
"step": 3550
},
{
"epoch": 1.954164951283107,
"grad_norm": 1.796387791633606,
"learning_rate": 1.6332835724604556e-05,
"loss": 0.4079,
"step": 3560
},
{
"epoch": 1.9596541786743515,
"grad_norm": 1.6725765466690063,
"learning_rate": 1.6183158438335223e-05,
"loss": 0.4156,
"step": 3570
},
{
"epoch": 1.9651434060655961,
"grad_norm": 2.1465470790863037,
"learning_rate": 1.6033841222577312e-05,
"loss": 0.4514,
"step": 3580
},
{
"epoch": 1.970632633456841,
"grad_norm": 2.0636017322540283,
"learning_rate": 1.588489017528844e-05,
"loss": 0.4107,
"step": 3590
},
{
"epoch": 1.9761218608480857,
"grad_norm": 2.281461000442505,
"learning_rate": 1.573631137947232e-05,
"loss": 0.4247,
"step": 3600
},
{
"epoch": 1.9816110882393303,
"grad_norm": 1.8369041681289673,
"learning_rate": 1.5588110902930252e-05,
"loss": 0.3993,
"step": 3610
},
{
"epoch": 1.9871003156305749,
"grad_norm": 1.538087010383606,
"learning_rate": 1.5440294798013445e-05,
"loss": 0.4032,
"step": 3620
},
{
"epoch": 1.9925895430218197,
"grad_norm": 1.7788771390914917,
"learning_rate": 1.5292869101375718e-05,
"loss": 0.4191,
"step": 3630
},
{
"epoch": 1.9980787704130645,
"grad_norm": 1.663266897201538,
"learning_rate": 1.514583983372707e-05,
"loss": 0.4065,
"step": 3640
},
{
"epoch": 2.003567997804309,
"grad_norm": 1.6794120073318481,
"learning_rate": 1.4999212999587723e-05,
"loss": 0.3012,
"step": 3650
},
{
"epoch": 2.0090572251955536,
"grad_norm": 2.0947213172912598,
"learning_rate": 1.4852994587042957e-05,
"loss": 0.2699,
"step": 3660
},
{
"epoch": 2.0145464525867984,
"grad_norm": 1.647148609161377,
"learning_rate": 1.4707190567498552e-05,
"loss": 0.256,
"step": 3670
},
{
"epoch": 2.0200356799780432,
"grad_norm": 1.6485075950622559,
"learning_rate": 1.4561806895436907e-05,
"loss": 0.2306,
"step": 3680
},
{
"epoch": 2.0255249073692876,
"grad_norm": 1.5765630006790161,
"learning_rate": 1.4416849508173864e-05,
"loss": 0.2363,
"step": 3690
},
{
"epoch": 2.0310141347605324,
"grad_norm": 2.0813677310943604,
"learning_rate": 1.4272324325616251e-05,
"loss": 0.2407,
"step": 3700
},
{
"epoch": 2.036503362151777,
"grad_norm": 2.0815207958221436,
"learning_rate": 1.4128237250020115e-05,
"loss": 0.2525,
"step": 3710
},
{
"epoch": 2.041992589543022,
"grad_norm": 2.1880548000335693,
"learning_rate": 1.3984594165749676e-05,
"loss": 0.2371,
"step": 3720
},
{
"epoch": 2.0474818169342663,
"grad_norm": 1.7031059265136719,
"learning_rate": 1.3841400939037013e-05,
"loss": 0.241,
"step": 3730
},
{
"epoch": 2.052971044325511,
"grad_norm": 1.9227020740509033,
"learning_rate": 1.3698663417742496e-05,
"loss": 0.2509,
"step": 3740
},
{
"epoch": 2.058460271716756,
"grad_norm": 1.8670146465301514,
"learning_rate": 1.3556387431115969e-05,
"loss": 0.265,
"step": 3750
},
{
"epoch": 2.0639494991080007,
"grad_norm": 2.06124210357666,
"learning_rate": 1.3414578789558696e-05,
"loss": 0.2182,
"step": 3760
},
{
"epoch": 2.069438726499245,
"grad_norm": 2.2382616996765137,
"learning_rate": 1.3273243284386023e-05,
"loss": 0.2477,
"step": 3770
},
{
"epoch": 2.07492795389049,
"grad_norm": 2.178103446960449,
"learning_rate": 1.3132386687590958e-05,
"loss": 0.2379,
"step": 3780
},
{
"epoch": 2.0804171812817347,
"grad_norm": 1.947129249572754,
"learning_rate": 1.2992014751608372e-05,
"loss": 0.2319,
"step": 3790
},
{
"epoch": 2.0859064086729795,
"grad_norm": 2.0307064056396484,
"learning_rate": 1.2852133209080097e-05,
"loss": 0.2416,
"step": 3800
},
{
"epoch": 2.091395636064224,
"grad_norm": 1.988174319267273,
"learning_rate": 1.2712747772620801e-05,
"loss": 0.2629,
"step": 3810
},
{
"epoch": 2.0968848634554687,
"grad_norm": 3.22770094871521,
"learning_rate": 1.2573864134584718e-05,
"loss": 0.2069,
"step": 3820
},
{
"epoch": 2.1023740908467135,
"grad_norm": 2.0717577934265137,
"learning_rate": 1.243548796683319e-05,
"loss": 0.237,
"step": 3830
},
{
"epoch": 2.107863318237958,
"grad_norm": 2.118257522583008,
"learning_rate": 1.2297624920502953e-05,
"loss": 0.2531,
"step": 3840
},
{
"epoch": 2.1133525456292026,
"grad_norm": 2.584071636199951,
"learning_rate": 1.2160280625775447e-05,
"loss": 0.2464,
"step": 3850
},
{
"epoch": 2.1188417730204474,
"grad_norm": 2.8391823768615723,
"learning_rate": 1.2023460691646821e-05,
"loss": 0.2344,
"step": 3860
},
{
"epoch": 2.124331000411692,
"grad_norm": 1.8388172388076782,
"learning_rate": 1.1887170705698905e-05,
"loss": 0.2191,
"step": 3870
},
{
"epoch": 2.1298202278029366,
"grad_norm": 2.278942346572876,
"learning_rate": 1.1751416233870999e-05,
"loss": 0.2303,
"step": 3880
},
{
"epoch": 2.1353094551941814,
"grad_norm": 2.4428744316101074,
"learning_rate": 1.1616202820232567e-05,
"loss": 0.2493,
"step": 3890
},
{
"epoch": 2.140798682585426,
"grad_norm": 2.272839069366455,
"learning_rate": 1.1481535986756828e-05,
"loss": 0.2527,
"step": 3900
},
{
"epoch": 2.146287909976671,
"grad_norm": 2.1877167224884033,
"learning_rate": 1.134742123309525e-05,
"loss": 0.2599,
"step": 3910
},
{
"epoch": 2.1517771373679153,
"grad_norm": 1.6496747732162476,
"learning_rate": 1.1213864036352939e-05,
"loss": 0.2457,
"step": 3920
},
{
"epoch": 2.15726636475916,
"grad_norm": 2.1241507530212402,
"learning_rate": 1.1080869850864964e-05,
"loss": 0.2532,
"step": 3930
},
{
"epoch": 2.162755592150405,
"grad_norm": 1.5271326303482056,
"learning_rate": 1.094844410797361e-05,
"loss": 0.2651,
"step": 3940
},
{
"epoch": 2.1682448195416497,
"grad_norm": 1.9697147607803345,
"learning_rate": 1.0816592215806562e-05,
"loss": 0.2171,
"step": 3950
},
{
"epoch": 2.173734046932894,
"grad_norm": 1.604737401008606,
"learning_rate": 1.0685319559056051e-05,
"loss": 0.2579,
"step": 3960
},
{
"epoch": 2.179223274324139,
"grad_norm": 2.053114414215088,
"learning_rate": 1.0554631498758943e-05,
"loss": 0.2541,
"step": 3970
},
{
"epoch": 2.1847125017153837,
"grad_norm": 2.424609422683716,
"learning_rate": 1.0424533372077803e-05,
"loss": 0.2053,
"step": 3980
},
{
"epoch": 2.1902017291066285,
"grad_norm": 2.34372615814209,
"learning_rate": 1.029503049208293e-05,
"loss": 0.214,
"step": 3990
},
{
"epoch": 2.195690956497873,
"grad_norm": 2.1274008750915527,
"learning_rate": 1.0166128147535352e-05,
"loss": 0.2133,
"step": 4000
},
{
"epoch": 2.2011801838891176,
"grad_norm": 1.7201030254364014,
"learning_rate": 1.003783160267091e-05,
"loss": 0.2233,
"step": 4010
},
{
"epoch": 2.2066694112803624,
"grad_norm": 2.844679832458496,
"learning_rate": 9.91014609698519e-06,
"loss": 0.2389,
"step": 4020
},
{
"epoch": 2.212158638671607,
"grad_norm": 1.7438113689422607,
"learning_rate": 9.783076845019598e-06,
"loss": 0.2297,
"step": 4030
},
{
"epoch": 2.2176478660628516,
"grad_norm": 2.076685905456543,
"learning_rate": 9.656629036148365e-06,
"loss": 0.2519,
"step": 4040
},
{
"epoch": 2.2231370934540964,
"grad_norm": 2.197861671447754,
"learning_rate": 9.530807834366658e-06,
"loss": 0.2416,
"step": 4050
},
{
"epoch": 2.228626320845341,
"grad_norm": 1.6589149236679077,
"learning_rate": 9.405618378079686e-06,
"loss": 0.238,
"step": 4060
},
{
"epoch": 2.2341155482365855,
"grad_norm": 2.1636011600494385,
"learning_rate": 9.281065779892826e-06,
"loss": 0.2165,
"step": 4070
},
{
"epoch": 2.2396047756278303,
"grad_norm": 2.11350679397583,
"learning_rate": 9.15715512640282e-06,
"loss": 0.2539,
"step": 4080
},
{
"epoch": 2.245094003019075,
"grad_norm": 2.1883132457733154,
"learning_rate": 9.033891477990091e-06,
"loss": 0.2392,
"step": 4090
},
{
"epoch": 2.25058323041032,
"grad_norm": 1.9660489559173584,
"learning_rate": 8.923511544874787e-06,
"loss": 0.2507,
"step": 4100
},
{
"epoch": 2.2560724578015643,
"grad_norm": 1.596596360206604,
"learning_rate": 8.801491052657259e-06,
"loss": 0.2205,
"step": 4110
},
{
"epoch": 2.261561685192809,
"grad_norm": 1.9153178930282593,
"learning_rate": 8.680132090462712e-06,
"loss": 0.2189,
"step": 4120
},
{
"epoch": 2.267050912584054,
"grad_norm": 2.2162649631500244,
"learning_rate": 8.559439614463177e-06,
"loss": 0.2573,
"step": 4130
},
{
"epoch": 2.2725401399752982,
"grad_norm": 2.1483819484710693,
"learning_rate": 8.439418553612105e-06,
"loss": 0.2464,
"step": 4140
},
{
"epoch": 2.278029367366543,
"grad_norm": 2.043180465698242,
"learning_rate": 8.320073809443024e-06,
"loss": 0.236,
"step": 4150
},
{
"epoch": 2.283518594757788,
"grad_norm": 1.6839346885681152,
"learning_rate": 8.201410255869458e-06,
"loss": 0.2596,
"step": 4160
},
{
"epoch": 2.2890078221490326,
"grad_norm": 2.334520101547241,
"learning_rate": 8.083432738985782e-06,
"loss": 0.2233,
"step": 4170
},
{
"epoch": 2.2944970495402774,
"grad_norm": 2.2806715965270996,
"learning_rate": 7.966146076869386e-06,
"loss": 0.223,
"step": 4180
},
{
"epoch": 2.299986276931522,
"grad_norm": 2.726118803024292,
"learning_rate": 7.849555059383839e-06,
"loss": 0.2236,
"step": 4190
},
{
"epoch": 2.3054755043227666,
"grad_norm": 1.6786760091781616,
"learning_rate": 7.733664447983349e-06,
"loss": 0.2509,
"step": 4200
},
{
"epoch": 2.3109647317140114,
"grad_norm": 1.767065405845642,
"learning_rate": 7.618478975518292e-06,
"loss": 0.2373,
"step": 4210
},
{
"epoch": 2.3164539591052558,
"grad_norm": 1.5344187021255493,
"learning_rate": 7.504003346041871e-06,
"loss": 0.2404,
"step": 4220
},
{
"epoch": 2.3219431864965006,
"grad_norm": 2.3337080478668213,
"learning_rate": 7.390242234618075e-06,
"loss": 0.1858,
"step": 4230
},
{
"epoch": 2.3274324138877454,
"grad_norm": 2.3712496757507324,
"learning_rate": 7.277200287130728e-06,
"loss": 0.2595,
"step": 4240
},
{
"epoch": 2.33292164127899,
"grad_norm": 2.5574772357940674,
"learning_rate": 7.164882120093757e-06,
"loss": 0.2419,
"step": 4250
},
{
"epoch": 2.3384108686702345,
"grad_norm": 2.367032051086426,
"learning_rate": 7.053292320462654e-06,
"loss": 0.256,
"step": 4260
},
{
"epoch": 2.3439000960614793,
"grad_norm": 2.3279731273651123,
"learning_rate": 6.942435445447159e-06,
"loss": 0.2319,
"step": 4270
},
{
"epoch": 2.349389323452724,
"grad_norm": 2.5082924365997314,
"learning_rate": 6.832316022325138e-06,
"loss": 0.212,
"step": 4280
},
{
"epoch": 2.354878550843969,
"grad_norm": 3.303452730178833,
"learning_rate": 6.7229385482577065e-06,
"loss": 0.2434,
"step": 4290
},
{
"epoch": 2.3603677782352133,
"grad_norm": 2.3842883110046387,
"learning_rate": 6.614307490105557e-06,
"loss": 0.2644,
"step": 4300
},
{
"epoch": 2.365857005626458,
"grad_norm": 2.0827999114990234,
"learning_rate": 6.506427284246547e-06,
"loss": 0.2327,
"step": 4310
},
{
"epoch": 2.371346233017703,
"grad_norm": 1.9834315776824951,
"learning_rate": 6.3993023363945165e-06,
"loss": 0.2206,
"step": 4320
},
{
"epoch": 2.376835460408947,
"grad_norm": 1.9981067180633545,
"learning_rate": 6.2929370214193735e-06,
"loss": 0.2736,
"step": 4330
},
{
"epoch": 2.382324687800192,
"grad_norm": 2.4763762950897217,
"learning_rate": 6.1873356831683884e-06,
"loss": 0.2344,
"step": 4340
},
{
"epoch": 2.387813915191437,
"grad_norm": 1.4734795093536377,
"learning_rate": 6.082502634288873e-06,
"loss": 0.2019,
"step": 4350
},
{
"epoch": 2.3933031425826816,
"grad_norm": 2.4421563148498535,
"learning_rate": 5.978442156051986e-06,
"loss": 0.2289,
"step": 4360
},
{
"epoch": 2.3987923699739264,
"grad_norm": 2.192746162414551,
"learning_rate": 5.875158498177921e-06,
"loss": 0.2396,
"step": 4370
},
{
"epoch": 2.4042815973651708,
"grad_norm": 1.9783297777175903,
"learning_rate": 5.772655878662339e-06,
"loss": 0.2531,
"step": 4380
},
{
"epoch": 2.4097708247564156,
"grad_norm": 2.5523509979248047,
"learning_rate": 5.6709384836041184e-06,
"loss": 0.2405,
"step": 4390
},
{
"epoch": 2.4152600521476604,
"grad_norm": 1.6953893899917603,
"learning_rate": 5.570010467034425e-06,
"loss": 0.2279,
"step": 4400
},
{
"epoch": 2.4207492795389047,
"grad_norm": 1.7373918294906616,
"learning_rate": 5.469875950747016e-06,
"loss": 0.2081,
"step": 4410
},
{
"epoch": 2.4262385069301495,
"grad_norm": 2.789266347885132,
"learning_rate": 5.370539024129928e-06,
"loss": 0.2455,
"step": 4420
},
{
"epoch": 2.4317277343213943,
"grad_norm": 1.8112378120422363,
"learning_rate": 5.272003743998489e-06,
"loss": 0.256,
"step": 4430
},
{
"epoch": 2.437216961712639,
"grad_norm": 1.8407344818115234,
"learning_rate": 5.1742741344296246e-06,
"loss": 0.2481,
"step": 4440
},
{
"epoch": 2.4427061891038835,
"grad_norm": 2.0232059955596924,
"learning_rate": 5.077354186597541e-06,
"loss": 0.2213,
"step": 4450
},
{
"epoch": 2.4481954164951283,
"grad_norm": 2.0662572383880615,
"learning_rate": 4.981247858610688e-06,
"loss": 0.2064,
"step": 4460
},
{
"epoch": 2.453684643886373,
"grad_norm": 1.9686827659606934,
"learning_rate": 4.885959075350149e-06,
"loss": 0.2344,
"step": 4470
},
{
"epoch": 2.459173871277618,
"grad_norm": 2.095710515975952,
"learning_rate": 4.791491728309347e-06,
"loss": 0.2727,
"step": 4480
},
{
"epoch": 2.4646630986688622,
"grad_norm": 2.7409679889678955,
"learning_rate": 4.697849675435112e-06,
"loss": 0.2449,
"step": 4490
},
{
"epoch": 2.470152326060107,
"grad_norm": 1.6528655290603638,
"learning_rate": 4.605036740970134e-06,
"loss": 0.2228,
"step": 4500
},
{
"epoch": 2.475641553451352,
"grad_norm": 2.210045337677002,
"learning_rate": 4.513056715296773e-06,
"loss": 0.2399,
"step": 4510
},
{
"epoch": 2.481130780842596,
"grad_norm": 1.4573155641555786,
"learning_rate": 4.4219133547822865e-06,
"loss": 0.2133,
"step": 4520
},
{
"epoch": 2.486620008233841,
"grad_norm": 2.013803482055664,
"learning_rate": 4.331610381625395e-06,
"loss": 0.2318,
"step": 4530
},
{
"epoch": 2.492109235625086,
"grad_norm": 2.090888261795044,
"learning_rate": 4.242151483704293e-06,
"loss": 0.2393,
"step": 4540
},
{
"epoch": 2.4975984630163306,
"grad_norm": 2.1223104000091553,
"learning_rate": 4.153540314426033e-06,
"loss": 0.2343,
"step": 4550
},
{
"epoch": 2.5030876904075754,
"grad_norm": 2.2379000186920166,
"learning_rate": 4.065780492577326e-06,
"loss": 0.2608,
"step": 4560
},
{
"epoch": 2.5085769177988197,
"grad_norm": 1.780354380607605,
"learning_rate": 3.978875602176726e-06,
"loss": 0.2401,
"step": 4570
},
{
"epoch": 2.5140661451900645,
"grad_norm": 2.5559253692626953,
"learning_rate": 3.892829192328337e-06,
"loss": 0.2381,
"step": 4580
},
{
"epoch": 2.5195553725813093,
"grad_norm": 1.7930986881256104,
"learning_rate": 3.8076447770767796e-06,
"loss": 0.2712,
"step": 4590
},
{
"epoch": 2.5250445999725537,
"grad_norm": 2.0700008869171143,
"learning_rate": 3.7233258352637553e-06,
"loss": 0.2162,
"step": 4600
},
{
"epoch": 2.5305338273637985,
"grad_norm": 2.3782896995544434,
"learning_rate": 3.6398758103859067e-06,
"loss": 0.208,
"step": 4610
},
{
"epoch": 2.5360230547550433,
"grad_norm": 2.345578670501709,
"learning_rate": 3.557298110454252e-06,
"loss": 0.2231,
"step": 4620
},
{
"epoch": 2.5415122821462877,
"grad_norm": 1.9046144485473633,
"learning_rate": 3.475596107854981e-06,
"loss": 0.2359,
"step": 4630
},
{
"epoch": 2.5470015095375325,
"grad_norm": 2.2235357761383057,
"learning_rate": 3.3947731392117237e-06,
"loss": 0.2268,
"step": 4640
},
{
"epoch": 2.5524907369287773,
"grad_norm": 2.3540749549865723,
"learning_rate": 3.3148325052492713e-06,
"loss": 0.2382,
"step": 4650
},
{
"epoch": 2.557979964320022,
"grad_norm": 2.550370931625366,
"learning_rate": 3.2357774706588157e-06,
"loss": 0.2364,
"step": 4660
},
{
"epoch": 2.563469191711267,
"grad_norm": 2.06571626663208,
"learning_rate": 3.1576112639646023e-06,
"loss": 0.2379,
"step": 4670
},
{
"epoch": 2.568958419102511,
"grad_norm": 1.3834680318832397,
"learning_rate": 3.08033707739209e-06,
"loss": 0.2012,
"step": 4680
},
{
"epoch": 2.574447646493756,
"grad_norm": 1.7131993770599365,
"learning_rate": 3.0039580667375557e-06,
"loss": 0.2019,
"step": 4690
},
{
"epoch": 2.579936873885001,
"grad_norm": 2.2222018241882324,
"learning_rate": 2.9284773512392475e-06,
"loss": 0.2565,
"step": 4700
},
{
"epoch": 2.585426101276245,
"grad_norm": 1.8102763891220093,
"learning_rate": 2.8538980134499958e-06,
"loss": 0.2254,
"step": 4710
},
{
"epoch": 2.59091532866749,
"grad_norm": 2.4614617824554443,
"learning_rate": 2.780223099111298e-06,
"loss": 0.2505,
"step": 4720
},
{
"epoch": 2.5964045560587348,
"grad_norm": 2.120408535003662,
"learning_rate": 2.7074556170289674e-06,
"loss": 0.1887,
"step": 4730
},
{
"epoch": 2.6018937834499796,
"grad_norm": 2.1147496700286865,
"learning_rate": 2.6355985389502293e-06,
"loss": 0.1995,
"step": 4740
},
{
"epoch": 2.6073830108412244,
"grad_norm": 1.818030595779419,
"learning_rate": 2.5646547994423784e-06,
"loss": 0.2527,
"step": 4750
},
{
"epoch": 2.6128722382324687,
"grad_norm": 1.935293197631836,
"learning_rate": 2.4946272957729165e-06,
"loss": 0.2007,
"step": 4760
},
{
"epoch": 2.6183614656237135,
"grad_norm": 1.837039589881897,
"learning_rate": 2.4255188877912477e-06,
"loss": 0.2044,
"step": 4770
},
{
"epoch": 2.6238506930149583,
"grad_norm": 2.487926721572876,
"learning_rate": 2.3573323978118705e-06,
"loss": 0.2825,
"step": 4780
},
{
"epoch": 2.6293399204062027,
"grad_norm": 1.880677342414856,
"learning_rate": 2.29007061049914e-06,
"loss": 0.209,
"step": 4790
},
{
"epoch": 2.6348291477974475,
"grad_norm": 2.060526132583618,
"learning_rate": 2.2237362727535043e-06,
"loss": 0.2069,
"step": 4800
},
{
"epoch": 2.6403183751886923,
"grad_norm": 1.8091875314712524,
"learning_rate": 2.1583320935993605e-06,
"loss": 0.2606,
"step": 4810
},
{
"epoch": 2.6458076025799366,
"grad_norm": 2.821810483932495,
"learning_rate": 2.0938607440744274e-06,
"loss": 0.2235,
"step": 4820
},
{
"epoch": 2.6512968299711814,
"grad_norm": 2.605480194091797,
"learning_rate": 2.0303248571206244e-06,
"loss": 0.2454,
"step": 4830
},
{
"epoch": 2.6567860573624262,
"grad_norm": 2.1553428173065186,
"learning_rate": 1.967727027476568e-06,
"loss": 0.1998,
"step": 4840
},
{
"epoch": 2.662275284753671,
"grad_norm": 2.485527753829956,
"learning_rate": 1.9060698115716063e-06,
"loss": 0.2377,
"step": 4850
},
{
"epoch": 2.667764512144916,
"grad_norm": 1.8272162675857544,
"learning_rate": 1.8453557274214162e-06,
"loss": 0.2288,
"step": 4860
},
{
"epoch": 2.67325373953616,
"grad_norm": 2.5798721313476562,
"learning_rate": 1.7855872545251757e-06,
"loss": 0.2419,
"step": 4870
},
{
"epoch": 2.678742966927405,
"grad_norm": 2.7085471153259277,
"learning_rate": 1.7267668337642761e-06,
"loss": 0.222,
"step": 4880
},
{
"epoch": 2.68423219431865,
"grad_norm": 2.2424449920654297,
"learning_rate": 1.6688968673026773e-06,
"loss": 0.1913,
"step": 4890
},
{
"epoch": 2.689721421709894,
"grad_norm": 2.0495047569274902,
"learning_rate": 1.6119797184887792e-06,
"loss": 0.1905,
"step": 4900
},
{
"epoch": 2.695210649101139,
"grad_norm": 2.048985242843628,
"learning_rate": 1.5560177117589197e-06,
"loss": 0.1978,
"step": 4910
},
{
"epoch": 2.7006998764923837,
"grad_norm": 2.4100003242492676,
"learning_rate": 1.5010131325424337e-06,
"loss": 0.2575,
"step": 4920
},
{
"epoch": 2.7061891038836285,
"grad_norm": 2.420525074005127,
"learning_rate": 1.4469682271683327e-06,
"loss": 0.215,
"step": 4930
},
{
"epoch": 2.711678331274873,
"grad_norm": 1.7320780754089355,
"learning_rate": 1.3938852027735594e-06,
"loss": 0.2259,
"step": 4940
},
{
"epoch": 2.7171675586661177,
"grad_norm": 2.3320376873016357,
"learning_rate": 1.3417662272128484e-06,
"loss": 0.2514,
"step": 4950
},
{
"epoch": 2.7226567860573625,
"grad_norm": 2.5615234375,
"learning_rate": 1.2906134289701998e-06,
"loss": 0.2342,
"step": 4960
},
{
"epoch": 2.7281460134486073,
"grad_norm": 2.3866004943847656,
"learning_rate": 1.240428897071949e-06,
"loss": 0.2273,
"step": 4970
},
{
"epoch": 2.7336352408398517,
"grad_norm": 2.7888448238372803,
"learning_rate": 1.191214681001454e-06,
"loss": 0.216,
"step": 4980
},
{
"epoch": 2.7391244682310965,
"grad_norm": 2.318481922149658,
"learning_rate": 1.142972790615407e-06,
"loss": 0.2053,
"step": 4990
},
{
"epoch": 2.7446136956223413,
"grad_norm": 1.82982337474823,
"learning_rate": 1.095705196061722e-06,
"loss": 0.2265,
"step": 5000
},
{
"epoch": 2.7501029230135856,
"grad_norm": 1.4905016422271729,
"learning_rate": 1.0494138276991278e-06,
"loss": 0.1784,
"step": 5010
},
{
"epoch": 2.7555921504048304,
"grad_norm": 1.6236484050750732,
"learning_rate": 1.0041005760182853e-06,
"loss": 0.1953,
"step": 5020
},
{
"epoch": 2.761081377796075,
"grad_norm": 1.9028195142745972,
"learning_rate": 9.597672915646116e-07,
"loss": 0.2076,
"step": 5030
},
{
"epoch": 2.76657060518732,
"grad_norm": 2.1804606914520264,
"learning_rate": 9.164157848626842e-07,
"loss": 0.2155,
"step": 5040
},
{
"epoch": 2.772059832578565,
"grad_norm": 1.961748480796814,
"learning_rate": 8.740478263423197e-07,
"loss": 0.2072,
"step": 5050
},
{
"epoch": 2.777549059969809,
"grad_norm": 2.2191579341888428,
"learning_rate": 8.32665146266276e-07,
"loss": 0.2314,
"step": 5060
},
{
"epoch": 2.783038287361054,
"grad_norm": 2.2354133129119873,
"learning_rate": 7.922694346595511e-07,
"loss": 0.2297,
"step": 5070
},
{
"epoch": 2.7885275147522988,
"grad_norm": 1.8483319282531738,
"learning_rate": 7.528623412404179e-07,
"loss": 0.214,
"step": 5080
},
{
"epoch": 2.794016742143543,
"grad_norm": 1.9461801052093506,
"learning_rate": 7.144454753530067e-07,
"loss": 0.2173,
"step": 5090
},
{
"epoch": 2.799505969534788,
"grad_norm": 1.8403065204620361,
"learning_rate": 6.770204059016127e-07,
"loss": 0.2012,
"step": 5100
},
{
"epoch": 2.8049951969260327,
"grad_norm": 2.654057264328003,
"learning_rate": 6.405886612866036e-07,
"loss": 0.224,
"step": 5110
},
{
"epoch": 2.810484424317277,
"grad_norm": 2.0720152854919434,
"learning_rate": 6.051517293420101e-07,
"loss": 0.1893,
"step": 5120
},
{
"epoch": 2.815973651708522,
"grad_norm": 1.8411768674850464,
"learning_rate": 5.707110572747587e-07,
"loss": 0.2351,
"step": 5130
},
{
"epoch": 2.8214628790997667,
"grad_norm": 1.893263578414917,
"learning_rate": 5.3726805160558e-07,
"loss": 0.2541,
"step": 5140
},
{
"epoch": 2.8269521064910115,
"grad_norm": 2.347729206085205,
"learning_rate": 5.048240781115571e-07,
"loss": 0.2351,
"step": 5150
},
{
"epoch": 2.8324413338822563,
"grad_norm": 2.2554593086242676,
"learning_rate": 4.7338046177035354e-07,
"loss": 0.245,
"step": 5160
},
{
"epoch": 2.8379305612735006,
"grad_norm": 2.649017095565796,
"learning_rate": 4.429384867061015e-07,
"loss": 0.2444,
"step": 5170
},
{
"epoch": 2.8434197886647454,
"grad_norm": 2.740800142288208,
"learning_rate": 4.1349939613695434e-07,
"loss": 0.2354,
"step": 5180
},
{
"epoch": 2.8489090160559902,
"grad_norm": 2.5310161113739014,
"learning_rate": 3.85064392324333e-07,
"loss": 0.2086,
"step": 5190
},
{
"epoch": 2.8543982434472346,
"grad_norm": 2.4969282150268555,
"learning_rate": 3.5763463652380146e-07,
"loss": 0.2329,
"step": 5200
},
{
"epoch": 2.8598874708384794,
"grad_norm": 2.0695693492889404,
"learning_rate": 3.3121124893766287e-07,
"loss": 0.1665,
"step": 5210
},
{
"epoch": 2.865376698229724,
"grad_norm": 2.0718321800231934,
"learning_rate": 3.057953086692017e-07,
"loss": 0.2444,
"step": 5220
},
{
"epoch": 2.870865925620969,
"grad_norm": 2.0257515907287598,
"learning_rate": 2.8138785367860796e-07,
"loss": 0.2303,
"step": 5230
},
{
"epoch": 2.876355153012214,
"grad_norm": 2.0960233211517334,
"learning_rate": 2.5798988074061394e-07,
"loss": 0.2274,
"step": 5240
},
{
"epoch": 2.881844380403458,
"grad_norm": 2.4282174110412598,
"learning_rate": 2.3560234540375424e-07,
"loss": 0.1995,
"step": 5250
},
{
"epoch": 2.887333607794703,
"grad_norm": 1.898910641670227,
"learning_rate": 2.1422616195136692e-07,
"loss": 0.2002,
"step": 5260
},
{
"epoch": 2.8928228351859477,
"grad_norm": 1.6507282257080078,
"learning_rate": 1.9386220336423678e-07,
"loss": 0.1811,
"step": 5270
},
{
"epoch": 2.898312062577192,
"grad_norm": 2.447411060333252,
"learning_rate": 1.7451130128495753e-07,
"loss": 0.2376,
"step": 5280
},
{
"epoch": 2.903801289968437,
"grad_norm": 1.9209644794464111,
"learning_rate": 1.5617424598396712e-07,
"loss": 0.236,
"step": 5290
},
{
"epoch": 2.9092905173596817,
"grad_norm": 2.0094797611236572,
"learning_rate": 1.3885178632726536e-07,
"loss": 0.208,
"step": 5300
},
{
"epoch": 2.914779744750926,
"grad_norm": 2.3810887336730957,
"learning_rate": 1.225446297458327e-07,
"loss": 0.2124,
"step": 5310
},
{
"epoch": 2.920268972142171,
"grad_norm": 2.039491891860962,
"learning_rate": 1.0725344220675337e-07,
"loss": 0.1983,
"step": 5320
},
{
"epoch": 2.9257581995334156,
"grad_norm": 2.2363178730010986,
"learning_rate": 9.297884818599556e-08,
"loss": 0.2173,
"step": 5330
},
{
"epoch": 2.9312474269246604,
"grad_norm": 2.1318199634552,
"learning_rate": 7.972143064292892e-08,
"loss": 0.182,
"step": 5340
},
{
"epoch": 2.9367366543159052,
"grad_norm": 1.9546360969543457,
"learning_rate": 6.748173099650202e-08,
"loss": 0.1871,
"step": 5350
},
{
"epoch": 2.9422258817071496,
"grad_norm": 1.8476676940917969,
"learning_rate": 5.626024910314609e-08,
"loss": 0.1989,
"step": 5360
},
{
"epoch": 2.9477151090983944,
"grad_norm": 2.1973392963409424,
"learning_rate": 4.605744323634142e-08,
"loss": 0.2186,
"step": 5370
},
{
"epoch": 2.953204336489639,
"grad_norm": 1.935354232788086,
"learning_rate": 3.687373006792394e-08,
"loss": 0.2124,
"step": 5380
},
{
"epoch": 2.9586935638808836,
"grad_norm": 2.1322507858276367,
"learning_rate": 2.870948465105161e-08,
"loss": 0.2145,
"step": 5390
},
{
"epoch": 2.9641827912721284,
"grad_norm": 2.501298189163208,
"learning_rate": 2.1565040404902813e-08,
"loss": 0.22,
"step": 5400
},
{
"epoch": 2.969672018663373,
"grad_norm": 1.6250516176223755,
"learning_rate": 1.544068910104002e-08,
"loss": 0.218,
"step": 5410
},
{
"epoch": 2.975161246054618,
"grad_norm": 2.794093370437622,
"learning_rate": 1.0336680851516512e-08,
"loss": 0.257,
"step": 5420
},
{
"epoch": 2.9806504734458628,
"grad_norm": 2.0987253189086914,
"learning_rate": 6.2532240986457044e-09,
"loss": 0.2193,
"step": 5430
},
{
"epoch": 2.986139700837107,
"grad_norm": 2.217175006866455,
"learning_rate": 3.1904856064940424e-09,
"loss": 0.2392,
"step": 5440
},
{
"epoch": 2.991628928228352,
"grad_norm": 2.0461556911468506,
"learning_rate": 1.1485904540697867e-09,
"loss": 0.2137,
"step": 5450
},
{
"epoch": 2.9971181556195967,
"grad_norm": 1.9249966144561768,
"learning_rate": 1.276220302215414e-10,
"loss": 0.2132,
"step": 5460
},
{
"epoch": 2.99876492383697,
"step": 5463,
"total_flos": 3.233486247100416e+17,
"train_loss": 0.5040911292388242,
"train_runtime": 9149.3841,
"train_samples_per_second": 4.779,
"train_steps_per_second": 0.597
}
],
"logging_steps": 10,
"max_steps": 5463,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.233486247100416e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}