llama-2-7b-fourierft-alpaca-loca / trainer_state.json
vantaa32's picture
Upload folder using huggingface_hub
39c6661 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 9705,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0030911901081916537,
"grad_norm": 0.0021730465814471245,
"learning_rate": 0.0009989696032972695,
"loss": 1.1462,
"step": 10
},
{
"epoch": 0.0061823802163833074,
"grad_norm": 0.0018489831127226353,
"learning_rate": 0.000997939206594539,
"loss": 1.0204,
"step": 20
},
{
"epoch": 0.00927357032457496,
"grad_norm": 0.0019520210335031152,
"learning_rate": 0.0009969088098918083,
"loss": 1.0607,
"step": 30
},
{
"epoch": 0.012364760432766615,
"grad_norm": 0.002070036716759205,
"learning_rate": 0.0009958784131890777,
"loss": 1.0284,
"step": 40
},
{
"epoch": 0.015455950540958269,
"grad_norm": 0.0020703268237411976,
"learning_rate": 0.0009948480164863472,
"loss": 1.0654,
"step": 50
},
{
"epoch": 0.01854714064914992,
"grad_norm": 0.0019948079716414213,
"learning_rate": 0.0009938176197836167,
"loss": 1.0034,
"step": 60
},
{
"epoch": 0.021638330757341576,
"grad_norm": 0.002126147970557213,
"learning_rate": 0.0009927872230808862,
"loss": 1.0173,
"step": 70
},
{
"epoch": 0.02472952086553323,
"grad_norm": 0.0017046661814674735,
"learning_rate": 0.0009917568263781555,
"loss": 1.0153,
"step": 80
},
{
"epoch": 0.027820710973724884,
"grad_norm": 0.0018810734618455172,
"learning_rate": 0.000990726429675425,
"loss": 0.9208,
"step": 90
},
{
"epoch": 0.030911901081916538,
"grad_norm": 0.0019539243075996637,
"learning_rate": 0.0009896960329726944,
"loss": 1.0072,
"step": 100
},
{
"epoch": 0.03400309119010819,
"grad_norm": 0.0015656334580853581,
"learning_rate": 0.000988665636269964,
"loss": 0.9477,
"step": 110
},
{
"epoch": 0.03709428129829984,
"grad_norm": 0.0018790484173223376,
"learning_rate": 0.0009876352395672334,
"loss": 0.9543,
"step": 120
},
{
"epoch": 0.0401854714064915,
"grad_norm": 0.0014801452634856105,
"learning_rate": 0.0009866048428645029,
"loss": 0.9507,
"step": 130
},
{
"epoch": 0.04327666151468315,
"grad_norm": 0.0023686159402132034,
"learning_rate": 0.0009855744461617724,
"loss": 0.9471,
"step": 140
},
{
"epoch": 0.04636785162287481,
"grad_norm": 0.0018689304124563932,
"learning_rate": 0.0009845440494590419,
"loss": 0.9483,
"step": 150
},
{
"epoch": 0.04945904173106646,
"grad_norm": 0.0017488012090325356,
"learning_rate": 0.0009835136527563111,
"loss": 0.954,
"step": 160
},
{
"epoch": 0.05255023183925812,
"grad_norm": 0.001690636039711535,
"learning_rate": 0.0009824832560535806,
"loss": 0.9277,
"step": 170
},
{
"epoch": 0.05564142194744977,
"grad_norm": 0.0017418304923921824,
"learning_rate": 0.00098145285935085,
"loss": 0.9258,
"step": 180
},
{
"epoch": 0.05873261205564142,
"grad_norm": 0.0014526835875585675,
"learning_rate": 0.0009804224626481196,
"loss": 0.9577,
"step": 190
},
{
"epoch": 0.061823802163833076,
"grad_norm": 0.002275065751746297,
"learning_rate": 0.000979392065945389,
"loss": 0.9241,
"step": 200
},
{
"epoch": 0.06491499227202473,
"grad_norm": 0.0012924526818096638,
"learning_rate": 0.0009783616692426585,
"loss": 0.9271,
"step": 210
},
{
"epoch": 0.06800618238021638,
"grad_norm": 0.001761179999448359,
"learning_rate": 0.000977331272539928,
"loss": 0.9516,
"step": 220
},
{
"epoch": 0.07109737248840804,
"grad_norm": 0.001756934798322618,
"learning_rate": 0.0009763008758371973,
"loss": 0.9176,
"step": 230
},
{
"epoch": 0.07418856259659969,
"grad_norm": 0.0015818781685084105,
"learning_rate": 0.0009752704791344668,
"loss": 0.9549,
"step": 240
},
{
"epoch": 0.07727975270479134,
"grad_norm": 0.0015478008426725864,
"learning_rate": 0.0009742400824317363,
"loss": 0.9418,
"step": 250
},
{
"epoch": 0.080370942812983,
"grad_norm": 0.0014897435903549194,
"learning_rate": 0.0009732096857290057,
"loss": 0.9052,
"step": 260
},
{
"epoch": 0.08346213292117466,
"grad_norm": 0.0013134022010490298,
"learning_rate": 0.0009721792890262751,
"loss": 0.9479,
"step": 270
},
{
"epoch": 0.0865533230293663,
"grad_norm": 0.0018637892790138721,
"learning_rate": 0.0009711488923235445,
"loss": 0.8966,
"step": 280
},
{
"epoch": 0.08964451313755796,
"grad_norm": 0.0032756649889051914,
"learning_rate": 0.000970118495620814,
"loss": 0.9641,
"step": 290
},
{
"epoch": 0.09273570324574962,
"grad_norm": 0.001614488777704537,
"learning_rate": 0.0009690880989180835,
"loss": 0.9373,
"step": 300
},
{
"epoch": 0.09582689335394126,
"grad_norm": 0.0015908046625554562,
"learning_rate": 0.000968057702215353,
"loss": 0.953,
"step": 310
},
{
"epoch": 0.09891808346213292,
"grad_norm": 0.0017299671890214086,
"learning_rate": 0.0009670273055126224,
"loss": 0.9651,
"step": 320
},
{
"epoch": 0.10200927357032458,
"grad_norm": 0.0015082499012351036,
"learning_rate": 0.0009659969088098919,
"loss": 0.952,
"step": 330
},
{
"epoch": 0.10510046367851623,
"grad_norm": 0.00181696773506701,
"learning_rate": 0.0009649665121071612,
"loss": 0.9673,
"step": 340
},
{
"epoch": 0.10819165378670788,
"grad_norm": 0.001661508227698505,
"learning_rate": 0.0009639361154044307,
"loss": 0.9167,
"step": 350
},
{
"epoch": 0.11128284389489954,
"grad_norm": 0.0016009090468287468,
"learning_rate": 0.0009629057187017002,
"loss": 0.9363,
"step": 360
},
{
"epoch": 0.1143740340030912,
"grad_norm": 0.0020189261995255947,
"learning_rate": 0.0009618753219989696,
"loss": 0.9025,
"step": 370
},
{
"epoch": 0.11746522411128284,
"grad_norm": 0.0024259143974632025,
"learning_rate": 0.0009608449252962391,
"loss": 0.9646,
"step": 380
},
{
"epoch": 0.1205564142194745,
"grad_norm": 0.0015644305385649204,
"learning_rate": 0.0009598145285935085,
"loss": 0.9226,
"step": 390
},
{
"epoch": 0.12364760432766615,
"grad_norm": 0.0017478523077443242,
"learning_rate": 0.000958784131890778,
"loss": 0.9232,
"step": 400
},
{
"epoch": 0.1267387944358578,
"grad_norm": 0.002042934997007251,
"learning_rate": 0.0009577537351880475,
"loss": 0.9464,
"step": 410
},
{
"epoch": 0.12982998454404945,
"grad_norm": 0.0017407669220119715,
"learning_rate": 0.0009567233384853169,
"loss": 0.9258,
"step": 420
},
{
"epoch": 0.13292117465224113,
"grad_norm": 0.001527661457657814,
"learning_rate": 0.0009556929417825863,
"loss": 0.9104,
"step": 430
},
{
"epoch": 0.13601236476043277,
"grad_norm": 0.0014940258115530014,
"learning_rate": 0.0009546625450798557,
"loss": 0.9208,
"step": 440
},
{
"epoch": 0.1391035548686244,
"grad_norm": 0.0018555278657004237,
"learning_rate": 0.0009536321483771252,
"loss": 0.9441,
"step": 450
},
{
"epoch": 0.14219474497681608,
"grad_norm": 0.002033649478107691,
"learning_rate": 0.0009526017516743947,
"loss": 0.9155,
"step": 460
},
{
"epoch": 0.14528593508500773,
"grad_norm": 0.0015595933655276895,
"learning_rate": 0.0009515713549716642,
"loss": 0.9164,
"step": 470
},
{
"epoch": 0.14837712519319937,
"grad_norm": 0.0016203809063881636,
"learning_rate": 0.0009505409582689336,
"loss": 0.9267,
"step": 480
},
{
"epoch": 0.15146831530139104,
"grad_norm": 0.0014497883385047317,
"learning_rate": 0.0009495105615662029,
"loss": 0.9328,
"step": 490
},
{
"epoch": 0.1545595054095827,
"grad_norm": 0.0019730357453227043,
"learning_rate": 0.0009484801648634724,
"loss": 0.9057,
"step": 500
},
{
"epoch": 0.15765069551777433,
"grad_norm": 0.0020096744410693645,
"learning_rate": 0.0009474497681607419,
"loss": 0.9217,
"step": 510
},
{
"epoch": 0.160741885625966,
"grad_norm": 0.0019119798671454191,
"learning_rate": 0.0009464193714580114,
"loss": 0.9324,
"step": 520
},
{
"epoch": 0.16383307573415765,
"grad_norm": 0.0014629390789195895,
"learning_rate": 0.0009453889747552809,
"loss": 0.9424,
"step": 530
},
{
"epoch": 0.16692426584234932,
"grad_norm": 0.0016167230205610394,
"learning_rate": 0.0009443585780525502,
"loss": 0.8622,
"step": 540
},
{
"epoch": 0.17001545595054096,
"grad_norm": 0.0016633981140330434,
"learning_rate": 0.0009433281813498197,
"loss": 0.9366,
"step": 550
},
{
"epoch": 0.1731066460587326,
"grad_norm": 0.0015964192571118474,
"learning_rate": 0.0009422977846470892,
"loss": 0.9552,
"step": 560
},
{
"epoch": 0.17619783616692428,
"grad_norm": 0.0013956124894320965,
"learning_rate": 0.0009412673879443586,
"loss": 0.9995,
"step": 570
},
{
"epoch": 0.17928902627511592,
"grad_norm": 0.0014371597208082676,
"learning_rate": 0.0009402369912416281,
"loss": 0.8622,
"step": 580
},
{
"epoch": 0.18238021638330756,
"grad_norm": 0.0021855118684470654,
"learning_rate": 0.0009392065945388974,
"loss": 0.9862,
"step": 590
},
{
"epoch": 0.18547140649149924,
"grad_norm": 0.001544400816783309,
"learning_rate": 0.0009381761978361669,
"loss": 0.919,
"step": 600
},
{
"epoch": 0.18856259659969088,
"grad_norm": 0.0016864053905010223,
"learning_rate": 0.0009371458011334364,
"loss": 0.9405,
"step": 610
},
{
"epoch": 0.19165378670788252,
"grad_norm": 0.0021651415154337883,
"learning_rate": 0.0009361154044307059,
"loss": 0.9939,
"step": 620
},
{
"epoch": 0.1947449768160742,
"grad_norm": 0.001466871122829616,
"learning_rate": 0.0009350850077279754,
"loss": 0.8776,
"step": 630
},
{
"epoch": 0.19783616692426584,
"grad_norm": 0.0013067360268905759,
"learning_rate": 0.0009340546110252446,
"loss": 0.9684,
"step": 640
},
{
"epoch": 0.2009273570324575,
"grad_norm": 0.001740931300446391,
"learning_rate": 0.0009330242143225141,
"loss": 0.9786,
"step": 650
},
{
"epoch": 0.20401854714064915,
"grad_norm": 0.0017273235134780407,
"learning_rate": 0.0009319938176197836,
"loss": 0.9089,
"step": 660
},
{
"epoch": 0.2071097372488408,
"grad_norm": 0.001577013055793941,
"learning_rate": 0.0009309634209170531,
"loss": 0.9154,
"step": 670
},
{
"epoch": 0.21020092735703247,
"grad_norm": 0.001800213591195643,
"learning_rate": 0.0009299330242143226,
"loss": 0.9726,
"step": 680
},
{
"epoch": 0.2132921174652241,
"grad_norm": 0.0015475867548957467,
"learning_rate": 0.000928902627511592,
"loss": 0.8864,
"step": 690
},
{
"epoch": 0.21638330757341576,
"grad_norm": 0.002068218309432268,
"learning_rate": 0.0009278722308088614,
"loss": 0.8666,
"step": 700
},
{
"epoch": 0.21947449768160743,
"grad_norm": 0.0014894594205543399,
"learning_rate": 0.0009268418341061309,
"loss": 0.9178,
"step": 710
},
{
"epoch": 0.22256568778979907,
"grad_norm": 0.0018193743890151381,
"learning_rate": 0.0009258114374034003,
"loss": 0.913,
"step": 720
},
{
"epoch": 0.22565687789799072,
"grad_norm": 0.0019854323472827673,
"learning_rate": 0.0009247810407006698,
"loss": 0.9015,
"step": 730
},
{
"epoch": 0.2287480680061824,
"grad_norm": 0.0015401191776618361,
"learning_rate": 0.0009237506439979392,
"loss": 0.942,
"step": 740
},
{
"epoch": 0.23183925811437403,
"grad_norm": 0.0016189438756555319,
"learning_rate": 0.0009227202472952086,
"loss": 0.9647,
"step": 750
},
{
"epoch": 0.23493044822256567,
"grad_norm": 0.0018937138374894857,
"learning_rate": 0.0009216898505924781,
"loss": 0.9455,
"step": 760
},
{
"epoch": 0.23802163833075735,
"grad_norm": 0.0015417198883369565,
"learning_rate": 0.0009206594538897476,
"loss": 0.926,
"step": 770
},
{
"epoch": 0.241112828438949,
"grad_norm": 0.0014824847457930446,
"learning_rate": 0.0009196290571870171,
"loss": 0.8998,
"step": 780
},
{
"epoch": 0.24420401854714066,
"grad_norm": 0.0015080615412443876,
"learning_rate": 0.0009185986604842864,
"loss": 0.905,
"step": 790
},
{
"epoch": 0.2472952086553323,
"grad_norm": 0.0017820092616602778,
"learning_rate": 0.0009175682637815559,
"loss": 0.9504,
"step": 800
},
{
"epoch": 0.250386398763524,
"grad_norm": 0.001539805787615478,
"learning_rate": 0.0009165378670788253,
"loss": 0.9369,
"step": 810
},
{
"epoch": 0.2534775888717156,
"grad_norm": 0.001816835138015449,
"learning_rate": 0.0009155074703760948,
"loss": 0.8814,
"step": 820
},
{
"epoch": 0.25656877897990726,
"grad_norm": 0.0017855115002021194,
"learning_rate": 0.0009144770736733643,
"loss": 0.9066,
"step": 830
},
{
"epoch": 0.2596599690880989,
"grad_norm": 0.0015401588752865791,
"learning_rate": 0.0009134466769706337,
"loss": 0.9257,
"step": 840
},
{
"epoch": 0.26275115919629055,
"grad_norm": 0.0023962745908647776,
"learning_rate": 0.0009124162802679032,
"loss": 0.9428,
"step": 850
},
{
"epoch": 0.26584234930448225,
"grad_norm": 0.0014296959852799773,
"learning_rate": 0.0009113858835651726,
"loss": 0.9263,
"step": 860
},
{
"epoch": 0.2689335394126739,
"grad_norm": 0.0018063073512166739,
"learning_rate": 0.000910355486862442,
"loss": 0.9235,
"step": 870
},
{
"epoch": 0.27202472952086554,
"grad_norm": 0.0016391921089962125,
"learning_rate": 0.0009093250901597115,
"loss": 0.8952,
"step": 880
},
{
"epoch": 0.2751159196290572,
"grad_norm": 0.0023141205310821533,
"learning_rate": 0.0009082946934569809,
"loss": 0.9494,
"step": 890
},
{
"epoch": 0.2782071097372488,
"grad_norm": 0.0018004857702180743,
"learning_rate": 0.0009072642967542504,
"loss": 0.9346,
"step": 900
},
{
"epoch": 0.28129829984544047,
"grad_norm": 0.002469270955771208,
"learning_rate": 0.0009062339000515199,
"loss": 0.9245,
"step": 910
},
{
"epoch": 0.28438948995363217,
"grad_norm": 0.001746363122947514,
"learning_rate": 0.0009052035033487893,
"loss": 0.8899,
"step": 920
},
{
"epoch": 0.2874806800618238,
"grad_norm": 0.0017825034447014332,
"learning_rate": 0.0009041731066460588,
"loss": 0.9097,
"step": 930
},
{
"epoch": 0.29057187017001546,
"grad_norm": 0.0015837879618629813,
"learning_rate": 0.0009031427099433283,
"loss": 0.9406,
"step": 940
},
{
"epoch": 0.2936630602782071,
"grad_norm": 0.00214924244210124,
"learning_rate": 0.0009021123132405976,
"loss": 0.8884,
"step": 950
},
{
"epoch": 0.29675425038639874,
"grad_norm": 0.001666102441959083,
"learning_rate": 0.0009010819165378671,
"loss": 0.9393,
"step": 960
},
{
"epoch": 0.29984544049459044,
"grad_norm": 0.0013613239862024784,
"learning_rate": 0.0009000515198351365,
"loss": 0.9177,
"step": 970
},
{
"epoch": 0.3029366306027821,
"grad_norm": 0.0017140130512416363,
"learning_rate": 0.000899021123132406,
"loss": 0.9315,
"step": 980
},
{
"epoch": 0.30602782071097373,
"grad_norm": 0.001512790797278285,
"learning_rate": 0.0008979907264296755,
"loss": 0.9364,
"step": 990
},
{
"epoch": 0.3091190108191654,
"grad_norm": 0.0017727742670103908,
"learning_rate": 0.0008969603297269449,
"loss": 0.9175,
"step": 1000
},
{
"epoch": 0.312210200927357,
"grad_norm": 0.0019065055530518293,
"learning_rate": 0.0008959299330242144,
"loss": 0.9585,
"step": 1010
},
{
"epoch": 0.31530139103554866,
"grad_norm": 0.002561497036367655,
"learning_rate": 0.0008948995363214837,
"loss": 0.936,
"step": 1020
},
{
"epoch": 0.31839258114374036,
"grad_norm": 0.0015061356825754046,
"learning_rate": 0.0008938691396187532,
"loss": 0.8856,
"step": 1030
},
{
"epoch": 0.321483771251932,
"grad_norm": 0.0022356980480253696,
"learning_rate": 0.0008928387429160227,
"loss": 0.9152,
"step": 1040
},
{
"epoch": 0.32457496136012365,
"grad_norm": 0.0017119398107752204,
"learning_rate": 0.0008918083462132921,
"loss": 0.8799,
"step": 1050
},
{
"epoch": 0.3276661514683153,
"grad_norm": 0.0021370877511799335,
"learning_rate": 0.0008907779495105616,
"loss": 0.9308,
"step": 1060
},
{
"epoch": 0.33075734157650694,
"grad_norm": 0.0019707437604665756,
"learning_rate": 0.0008897475528078311,
"loss": 0.9182,
"step": 1070
},
{
"epoch": 0.33384853168469864,
"grad_norm": 0.001411254983395338,
"learning_rate": 0.0008887171561051005,
"loss": 0.9057,
"step": 1080
},
{
"epoch": 0.3369397217928903,
"grad_norm": 0.0021633992437273264,
"learning_rate": 0.00088768675940237,
"loss": 0.9089,
"step": 1090
},
{
"epoch": 0.3400309119010819,
"grad_norm": 0.001840689335949719,
"learning_rate": 0.0008866563626996393,
"loss": 0.8939,
"step": 1100
},
{
"epoch": 0.34312210200927357,
"grad_norm": 0.0018973862752318382,
"learning_rate": 0.0008856259659969088,
"loss": 0.9206,
"step": 1110
},
{
"epoch": 0.3462132921174652,
"grad_norm": 0.0015567062655463815,
"learning_rate": 0.0008845955692941783,
"loss": 0.9424,
"step": 1120
},
{
"epoch": 0.34930448222565685,
"grad_norm": 0.0021733948960900307,
"learning_rate": 0.0008835651725914478,
"loss": 0.9965,
"step": 1130
},
{
"epoch": 0.35239567233384855,
"grad_norm": 0.0025423939805477858,
"learning_rate": 0.0008825347758887172,
"loss": 0.8603,
"step": 1140
},
{
"epoch": 0.3554868624420402,
"grad_norm": 0.0014392200391739607,
"learning_rate": 0.0008815043791859866,
"loss": 0.9483,
"step": 1150
},
{
"epoch": 0.35857805255023184,
"grad_norm": 0.0015780443791300058,
"learning_rate": 0.0008804739824832561,
"loss": 0.8963,
"step": 1160
},
{
"epoch": 0.3616692426584235,
"grad_norm": 0.002231495687738061,
"learning_rate": 0.0008794435857805255,
"loss": 0.9123,
"step": 1170
},
{
"epoch": 0.36476043276661513,
"grad_norm": 0.0014009552542120218,
"learning_rate": 0.000878413189077795,
"loss": 0.9337,
"step": 1180
},
{
"epoch": 0.3678516228748068,
"grad_norm": 0.0020310496911406517,
"learning_rate": 0.0008773827923750644,
"loss": 0.9137,
"step": 1190
},
{
"epoch": 0.37094281298299847,
"grad_norm": 0.00151388393715024,
"learning_rate": 0.0008763523956723338,
"loss": 0.9091,
"step": 1200
},
{
"epoch": 0.3740340030911901,
"grad_norm": 0.0018349160673096776,
"learning_rate": 0.0008753219989696033,
"loss": 0.9486,
"step": 1210
},
{
"epoch": 0.37712519319938176,
"grad_norm": 0.0018245832761749625,
"learning_rate": 0.0008742916022668728,
"loss": 0.9163,
"step": 1220
},
{
"epoch": 0.3802163833075734,
"grad_norm": 0.002115410752594471,
"learning_rate": 0.0008732612055641423,
"loss": 0.9043,
"step": 1230
},
{
"epoch": 0.38330757341576505,
"grad_norm": 0.0019245495786890388,
"learning_rate": 0.0008722308088614118,
"loss": 0.9212,
"step": 1240
},
{
"epoch": 0.38639876352395675,
"grad_norm": 0.001513317576609552,
"learning_rate": 0.000871200412158681,
"loss": 0.9068,
"step": 1250
},
{
"epoch": 0.3894899536321484,
"grad_norm": 0.0017635183176025748,
"learning_rate": 0.0008701700154559505,
"loss": 0.9261,
"step": 1260
},
{
"epoch": 0.39258114374034003,
"grad_norm": 0.0017686467617750168,
"learning_rate": 0.00086913961875322,
"loss": 0.8763,
"step": 1270
},
{
"epoch": 0.3956723338485317,
"grad_norm": 0.0015009143389761448,
"learning_rate": 0.0008681092220504895,
"loss": 0.8968,
"step": 1280
},
{
"epoch": 0.3987635239567233,
"grad_norm": 0.0014831022126600146,
"learning_rate": 0.000867078825347759,
"loss": 0.8927,
"step": 1290
},
{
"epoch": 0.401854714064915,
"grad_norm": 0.0029206760227680206,
"learning_rate": 0.0008660484286450283,
"loss": 0.9455,
"step": 1300
},
{
"epoch": 0.40494590417310666,
"grad_norm": 0.0014479625970125198,
"learning_rate": 0.0008650180319422978,
"loss": 0.9229,
"step": 1310
},
{
"epoch": 0.4080370942812983,
"grad_norm": 0.0014661536552011967,
"learning_rate": 0.0008639876352395672,
"loss": 0.9088,
"step": 1320
},
{
"epoch": 0.41112828438948995,
"grad_norm": 0.0014888847945258021,
"learning_rate": 0.0008629572385368367,
"loss": 0.9633,
"step": 1330
},
{
"epoch": 0.4142194744976816,
"grad_norm": 0.0017181559232994914,
"learning_rate": 0.0008619268418341062,
"loss": 0.9224,
"step": 1340
},
{
"epoch": 0.41731066460587324,
"grad_norm": 0.0015694062458351254,
"learning_rate": 0.0008608964451313755,
"loss": 0.8789,
"step": 1350
},
{
"epoch": 0.42040185471406494,
"grad_norm": 0.001495172269642353,
"learning_rate": 0.000859866048428645,
"loss": 0.8666,
"step": 1360
},
{
"epoch": 0.4234930448222566,
"grad_norm": 0.002242365386337042,
"learning_rate": 0.0008588356517259145,
"loss": 0.9108,
"step": 1370
},
{
"epoch": 0.4265842349304482,
"grad_norm": 0.0014668918447569013,
"learning_rate": 0.000857805255023184,
"loss": 0.916,
"step": 1380
},
{
"epoch": 0.42967542503863987,
"grad_norm": 0.001498398371040821,
"learning_rate": 0.0008567748583204535,
"loss": 0.9183,
"step": 1390
},
{
"epoch": 0.4327666151468315,
"grad_norm": 0.001534744049422443,
"learning_rate": 0.0008557444616177228,
"loss": 0.9114,
"step": 1400
},
{
"epoch": 0.43585780525502316,
"grad_norm": 0.0014011348830536008,
"learning_rate": 0.0008547140649149922,
"loss": 0.9394,
"step": 1410
},
{
"epoch": 0.43894899536321486,
"grad_norm": 0.0016137160127982497,
"learning_rate": 0.0008536836682122617,
"loss": 0.8852,
"step": 1420
},
{
"epoch": 0.4420401854714065,
"grad_norm": 0.0018244803650304675,
"learning_rate": 0.0008526532715095312,
"loss": 0.9006,
"step": 1430
},
{
"epoch": 0.44513137557959814,
"grad_norm": 0.002001643180847168,
"learning_rate": 0.0008516228748068007,
"loss": 0.8877,
"step": 1440
},
{
"epoch": 0.4482225656877898,
"grad_norm": 0.001545743434689939,
"learning_rate": 0.0008505924781040701,
"loss": 0.9169,
"step": 1450
},
{
"epoch": 0.45131375579598143,
"grad_norm": 0.0015252763405442238,
"learning_rate": 0.0008495620814013395,
"loss": 0.9302,
"step": 1460
},
{
"epoch": 0.45440494590417313,
"grad_norm": 0.0018486313056200743,
"learning_rate": 0.0008485316846986089,
"loss": 0.8845,
"step": 1470
},
{
"epoch": 0.4574961360123648,
"grad_norm": 0.0013468407560139894,
"learning_rate": 0.0008475012879958784,
"loss": 0.9079,
"step": 1480
},
{
"epoch": 0.4605873261205564,
"grad_norm": 0.0011928731109946966,
"learning_rate": 0.0008464708912931479,
"loss": 0.8997,
"step": 1490
},
{
"epoch": 0.46367851622874806,
"grad_norm": 0.0018724995898082852,
"learning_rate": 0.0008454404945904173,
"loss": 0.944,
"step": 1500
},
{
"epoch": 0.4667697063369397,
"grad_norm": 0.0021075448021292686,
"learning_rate": 0.0008444100978876868,
"loss": 0.9155,
"step": 1510
},
{
"epoch": 0.46986089644513135,
"grad_norm": 0.0016975891776382923,
"learning_rate": 0.0008433797011849562,
"loss": 0.9349,
"step": 1520
},
{
"epoch": 0.47295208655332305,
"grad_norm": 0.001552650355733931,
"learning_rate": 0.0008423493044822257,
"loss": 0.9145,
"step": 1530
},
{
"epoch": 0.4760432766615147,
"grad_norm": 0.0016928149852901697,
"learning_rate": 0.0008413189077794952,
"loss": 0.8694,
"step": 1540
},
{
"epoch": 0.47913446676970634,
"grad_norm": 0.0015649759443476796,
"learning_rate": 0.0008402885110767645,
"loss": 0.9505,
"step": 1550
},
{
"epoch": 0.482225656877898,
"grad_norm": 0.0020843998063355684,
"learning_rate": 0.000839258114374034,
"loss": 0.9463,
"step": 1560
},
{
"epoch": 0.4853168469860896,
"grad_norm": 0.0012236249167472124,
"learning_rate": 0.0008382277176713034,
"loss": 0.9033,
"step": 1570
},
{
"epoch": 0.4884080370942813,
"grad_norm": 0.0017901939572766423,
"learning_rate": 0.0008371973209685729,
"loss": 0.911,
"step": 1580
},
{
"epoch": 0.49149922720247297,
"grad_norm": 0.0018610935658216476,
"learning_rate": 0.0008361669242658424,
"loss": 0.8655,
"step": 1590
},
{
"epoch": 0.4945904173106646,
"grad_norm": 0.001789487199857831,
"learning_rate": 0.0008351365275631119,
"loss": 0.9427,
"step": 1600
},
{
"epoch": 0.49768160741885625,
"grad_norm": 0.00190592254512012,
"learning_rate": 0.0008341061308603813,
"loss": 0.9192,
"step": 1610
},
{
"epoch": 0.500772797527048,
"grad_norm": 0.0016090746503323317,
"learning_rate": 0.0008330757341576506,
"loss": 0.9487,
"step": 1620
},
{
"epoch": 0.5038639876352395,
"grad_norm": 0.0016335912514477968,
"learning_rate": 0.0008320453374549201,
"loss": 0.9051,
"step": 1630
},
{
"epoch": 0.5069551777434312,
"grad_norm": 0.0016785924090072513,
"learning_rate": 0.0008310149407521896,
"loss": 0.9104,
"step": 1640
},
{
"epoch": 0.5100463678516228,
"grad_norm": 0.0022380806040018797,
"learning_rate": 0.0008299845440494591,
"loss": 0.9038,
"step": 1650
},
{
"epoch": 0.5131375579598145,
"grad_norm": 0.0016855926951393485,
"learning_rate": 0.0008289541473467285,
"loss": 0.9256,
"step": 1660
},
{
"epoch": 0.5162287480680062,
"grad_norm": 0.0019196901703253388,
"learning_rate": 0.000827923750643998,
"loss": 0.9271,
"step": 1670
},
{
"epoch": 0.5193199381761978,
"grad_norm": 0.001529015600681305,
"learning_rate": 0.0008268933539412674,
"loss": 0.9053,
"step": 1680
},
{
"epoch": 0.5224111282843895,
"grad_norm": 0.001290348474867642,
"learning_rate": 0.0008258629572385369,
"loss": 0.8879,
"step": 1690
},
{
"epoch": 0.5255023183925811,
"grad_norm": 0.0017919385572895408,
"learning_rate": 0.0008248325605358063,
"loss": 0.8537,
"step": 1700
},
{
"epoch": 0.5285935085007728,
"grad_norm": 0.0017021787352859974,
"learning_rate": 0.0008238021638330757,
"loss": 0.9126,
"step": 1710
},
{
"epoch": 0.5316846986089645,
"grad_norm": 0.0017202612943947315,
"learning_rate": 0.0008227717671303452,
"loss": 0.8977,
"step": 1720
},
{
"epoch": 0.5347758887171561,
"grad_norm": 0.0021942094899713993,
"learning_rate": 0.0008217413704276147,
"loss": 0.944,
"step": 1730
},
{
"epoch": 0.5378670788253478,
"grad_norm": 0.001882906537503004,
"learning_rate": 0.0008207109737248841,
"loss": 0.9704,
"step": 1740
},
{
"epoch": 0.5409582689335394,
"grad_norm": 0.0015875013777986169,
"learning_rate": 0.0008196805770221536,
"loss": 0.927,
"step": 1750
},
{
"epoch": 0.5440494590417311,
"grad_norm": 0.0017645510379225016,
"learning_rate": 0.000818650180319423,
"loss": 0.9719,
"step": 1760
},
{
"epoch": 0.5471406491499228,
"grad_norm": 0.0016093801241368055,
"learning_rate": 0.0008176197836166924,
"loss": 0.8753,
"step": 1770
},
{
"epoch": 0.5502318392581144,
"grad_norm": 0.0016733432421460748,
"learning_rate": 0.0008165893869139619,
"loss": 0.9132,
"step": 1780
},
{
"epoch": 0.5533230293663061,
"grad_norm": 0.0014284460339695215,
"learning_rate": 0.0008155589902112313,
"loss": 0.9242,
"step": 1790
},
{
"epoch": 0.5564142194744977,
"grad_norm": 0.0018177167512476444,
"learning_rate": 0.0008145285935085008,
"loss": 0.9469,
"step": 1800
},
{
"epoch": 0.5595054095826894,
"grad_norm": 0.0020286261569708586,
"learning_rate": 0.0008134981968057702,
"loss": 0.8884,
"step": 1810
},
{
"epoch": 0.5625965996908809,
"grad_norm": 0.0014468576991930604,
"learning_rate": 0.0008124678001030397,
"loss": 0.908,
"step": 1820
},
{
"epoch": 0.5656877897990726,
"grad_norm": 0.001559574855491519,
"learning_rate": 0.0008114374034003092,
"loss": 0.9104,
"step": 1830
},
{
"epoch": 0.5687789799072643,
"grad_norm": 0.0017769662663340569,
"learning_rate": 0.0008104070066975787,
"loss": 0.9222,
"step": 1840
},
{
"epoch": 0.5718701700154559,
"grad_norm": 0.001862520119175315,
"learning_rate": 0.000809376609994848,
"loss": 0.8862,
"step": 1850
},
{
"epoch": 0.5749613601236476,
"grad_norm": 0.0021106936037540436,
"learning_rate": 0.0008083462132921174,
"loss": 0.8862,
"step": 1860
},
{
"epoch": 0.5780525502318392,
"grad_norm": 0.0013291973154991865,
"learning_rate": 0.0008073158165893869,
"loss": 0.9381,
"step": 1870
},
{
"epoch": 0.5811437403400309,
"grad_norm": 0.001646311953663826,
"learning_rate": 0.0008062854198866564,
"loss": 0.8862,
"step": 1880
},
{
"epoch": 0.5842349304482226,
"grad_norm": 0.0015801474219188094,
"learning_rate": 0.0008052550231839259,
"loss": 0.9402,
"step": 1890
},
{
"epoch": 0.5873261205564142,
"grad_norm": 0.0015533153200522065,
"learning_rate": 0.0008042246264811953,
"loss": 0.9242,
"step": 1900
},
{
"epoch": 0.5904173106646059,
"grad_norm": 0.0016890340484678745,
"learning_rate": 0.0008031942297784647,
"loss": 0.9234,
"step": 1910
},
{
"epoch": 0.5935085007727975,
"grad_norm": 0.0014929634053260088,
"learning_rate": 0.0008021638330757341,
"loss": 0.9383,
"step": 1920
},
{
"epoch": 0.5965996908809892,
"grad_norm": 0.001471440540626645,
"learning_rate": 0.0008011334363730036,
"loss": 0.908,
"step": 1930
},
{
"epoch": 0.5996908809891809,
"grad_norm": 0.00180807092692703,
"learning_rate": 0.0008001030396702731,
"loss": 0.9211,
"step": 1940
},
{
"epoch": 0.6027820710973725,
"grad_norm": 0.0016187585424631834,
"learning_rate": 0.0007990726429675426,
"loss": 0.9254,
"step": 1950
},
{
"epoch": 0.6058732612055642,
"grad_norm": 0.0016198824159801006,
"learning_rate": 0.0007980422462648119,
"loss": 0.9118,
"step": 1960
},
{
"epoch": 0.6089644513137558,
"grad_norm": 0.0017275193240493536,
"learning_rate": 0.0007970118495620814,
"loss": 0.9444,
"step": 1970
},
{
"epoch": 0.6120556414219475,
"grad_norm": 0.002495982451364398,
"learning_rate": 0.0007959814528593509,
"loss": 0.8879,
"step": 1980
},
{
"epoch": 0.615146831530139,
"grad_norm": 0.0013608982553705573,
"learning_rate": 0.0007949510561566204,
"loss": 0.8695,
"step": 1990
},
{
"epoch": 0.6182380216383307,
"grad_norm": 0.0015398486284539104,
"learning_rate": 0.0007939206594538898,
"loss": 0.916,
"step": 2000
},
{
"epoch": 0.6213292117465224,
"grad_norm": 0.0016108902636915445,
"learning_rate": 0.0007928902627511591,
"loss": 0.9244,
"step": 2010
},
{
"epoch": 0.624420401854714,
"grad_norm": 0.0016804412007331848,
"learning_rate": 0.0007918598660484286,
"loss": 0.9358,
"step": 2020
},
{
"epoch": 0.6275115919629057,
"grad_norm": 0.0018602460622787476,
"learning_rate": 0.0007908294693456981,
"loss": 0.8829,
"step": 2030
},
{
"epoch": 0.6306027820710973,
"grad_norm": 0.0017209933139383793,
"learning_rate": 0.0007897990726429676,
"loss": 0.9365,
"step": 2040
},
{
"epoch": 0.633693972179289,
"grad_norm": 0.002164942678064108,
"learning_rate": 0.0007887686759402371,
"loss": 0.9349,
"step": 2050
},
{
"epoch": 0.6367851622874807,
"grad_norm": 0.0021378265228122473,
"learning_rate": 0.0007877382792375064,
"loss": 0.8884,
"step": 2060
},
{
"epoch": 0.6398763523956723,
"grad_norm": 0.001859784359112382,
"learning_rate": 0.0007867078825347758,
"loss": 0.9461,
"step": 2070
},
{
"epoch": 0.642967542503864,
"grad_norm": 0.001505703548900783,
"learning_rate": 0.0007856774858320453,
"loss": 0.9284,
"step": 2080
},
{
"epoch": 0.6460587326120556,
"grad_norm": 0.0014758047182112932,
"learning_rate": 0.0007846470891293148,
"loss": 0.8943,
"step": 2090
},
{
"epoch": 0.6491499227202473,
"grad_norm": 0.001482132589444518,
"learning_rate": 0.0007836166924265843,
"loss": 0.9309,
"step": 2100
},
{
"epoch": 0.652241112828439,
"grad_norm": 0.0018712684977799654,
"learning_rate": 0.0007825862957238537,
"loss": 0.9705,
"step": 2110
},
{
"epoch": 0.6553323029366306,
"grad_norm": 0.0025388901121914387,
"learning_rate": 0.0007815558990211231,
"loss": 0.9129,
"step": 2120
},
{
"epoch": 0.6584234930448223,
"grad_norm": 0.001495323609560728,
"learning_rate": 0.0007805255023183926,
"loss": 0.9435,
"step": 2130
},
{
"epoch": 0.6615146831530139,
"grad_norm": 0.0016260349657386541,
"learning_rate": 0.0007794951056156621,
"loss": 0.9124,
"step": 2140
},
{
"epoch": 0.6646058732612056,
"grad_norm": 0.0019677565433084965,
"learning_rate": 0.0007784647089129315,
"loss": 0.9317,
"step": 2150
},
{
"epoch": 0.6676970633693973,
"grad_norm": 0.0023265851195901632,
"learning_rate": 0.0007774343122102009,
"loss": 0.9538,
"step": 2160
},
{
"epoch": 0.6707882534775889,
"grad_norm": 0.0014457949437201023,
"learning_rate": 0.0007764039155074703,
"loss": 0.9538,
"step": 2170
},
{
"epoch": 0.6738794435857806,
"grad_norm": 0.0014781310455873609,
"learning_rate": 0.0007753735188047398,
"loss": 0.8816,
"step": 2180
},
{
"epoch": 0.6769706336939721,
"grad_norm": 0.0017087183659896255,
"learning_rate": 0.0007743431221020093,
"loss": 0.9142,
"step": 2190
},
{
"epoch": 0.6800618238021638,
"grad_norm": 0.002620991552248597,
"learning_rate": 0.0007733127253992788,
"loss": 0.9345,
"step": 2200
},
{
"epoch": 0.6831530139103554,
"grad_norm": 0.001568863750435412,
"learning_rate": 0.0007722823286965483,
"loss": 0.9089,
"step": 2210
},
{
"epoch": 0.6862442040185471,
"grad_norm": 0.0017851140582934022,
"learning_rate": 0.0007712519319938175,
"loss": 0.8943,
"step": 2220
},
{
"epoch": 0.6893353941267388,
"grad_norm": 0.0013759072171524167,
"learning_rate": 0.000770221535291087,
"loss": 0.9176,
"step": 2230
},
{
"epoch": 0.6924265842349304,
"grad_norm": 0.0015892439987510443,
"learning_rate": 0.0007691911385883565,
"loss": 0.8854,
"step": 2240
},
{
"epoch": 0.6955177743431221,
"grad_norm": 0.001841311459429562,
"learning_rate": 0.000768160741885626,
"loss": 0.8824,
"step": 2250
},
{
"epoch": 0.6986089644513137,
"grad_norm": 0.0016689961776137352,
"learning_rate": 0.0007671303451828955,
"loss": 0.9442,
"step": 2260
},
{
"epoch": 0.7017001545595054,
"grad_norm": 0.0018330076709389687,
"learning_rate": 0.0007660999484801649,
"loss": 0.9398,
"step": 2270
},
{
"epoch": 0.7047913446676971,
"grad_norm": 0.0021366167347878218,
"learning_rate": 0.0007650695517774343,
"loss": 0.9507,
"step": 2280
},
{
"epoch": 0.7078825347758887,
"grad_norm": 0.0017535964725539088,
"learning_rate": 0.0007640391550747038,
"loss": 0.8902,
"step": 2290
},
{
"epoch": 0.7109737248840804,
"grad_norm": 0.0015847982140257955,
"learning_rate": 0.0007630087583719732,
"loss": 0.9163,
"step": 2300
},
{
"epoch": 0.714064914992272,
"grad_norm": 0.0016199509846046567,
"learning_rate": 0.0007619783616692427,
"loss": 0.8775,
"step": 2310
},
{
"epoch": 0.7171561051004637,
"grad_norm": 0.001773869269527495,
"learning_rate": 0.0007609479649665121,
"loss": 0.9406,
"step": 2320
},
{
"epoch": 0.7202472952086554,
"grad_norm": 0.0015184408985078335,
"learning_rate": 0.0007599175682637816,
"loss": 0.8945,
"step": 2330
},
{
"epoch": 0.723338485316847,
"grad_norm": 0.005015387199819088,
"learning_rate": 0.000758887171561051,
"loss": 0.9407,
"step": 2340
},
{
"epoch": 0.7264296754250387,
"grad_norm": 0.0020386965479701757,
"learning_rate": 0.0007578567748583205,
"loss": 0.9122,
"step": 2350
},
{
"epoch": 0.7295208655332303,
"grad_norm": 0.002024392830207944,
"learning_rate": 0.00075682637815559,
"loss": 0.8902,
"step": 2360
},
{
"epoch": 0.732612055641422,
"grad_norm": 0.0016152510652318597,
"learning_rate": 0.0007557959814528593,
"loss": 0.907,
"step": 2370
},
{
"epoch": 0.7357032457496137,
"grad_norm": 0.001684612943790853,
"learning_rate": 0.0007547655847501288,
"loss": 0.8557,
"step": 2380
},
{
"epoch": 0.7387944358578052,
"grad_norm": 0.0018417539540678263,
"learning_rate": 0.0007537351880473982,
"loss": 0.932,
"step": 2390
},
{
"epoch": 0.7418856259659969,
"grad_norm": 0.0020610857754945755,
"learning_rate": 0.0007527047913446677,
"loss": 0.9125,
"step": 2400
},
{
"epoch": 0.7449768160741885,
"grad_norm": 0.0023381186183542013,
"learning_rate": 0.0007516743946419372,
"loss": 0.9251,
"step": 2410
},
{
"epoch": 0.7480680061823802,
"grad_norm": 0.002318663988262415,
"learning_rate": 0.0007506439979392066,
"loss": 0.9043,
"step": 2420
},
{
"epoch": 0.7511591962905718,
"grad_norm": 0.0017375986790284514,
"learning_rate": 0.0007496136012364761,
"loss": 0.9061,
"step": 2430
},
{
"epoch": 0.7542503863987635,
"grad_norm": 0.0018655112944543362,
"learning_rate": 0.0007485832045337456,
"loss": 1.0007,
"step": 2440
},
{
"epoch": 0.7573415765069552,
"grad_norm": 0.0019265462178736925,
"learning_rate": 0.0007475528078310149,
"loss": 0.9089,
"step": 2450
},
{
"epoch": 0.7604327666151468,
"grad_norm": 0.0018528448417782784,
"learning_rate": 0.0007465224111282844,
"loss": 0.8856,
"step": 2460
},
{
"epoch": 0.7635239567233385,
"grad_norm": 0.0016357959248125553,
"learning_rate": 0.0007454920144255538,
"loss": 0.8712,
"step": 2470
},
{
"epoch": 0.7666151468315301,
"grad_norm": 0.0013976657064631581,
"learning_rate": 0.0007444616177228233,
"loss": 0.9761,
"step": 2480
},
{
"epoch": 0.7697063369397218,
"grad_norm": 0.002004920970648527,
"learning_rate": 0.0007434312210200928,
"loss": 0.9023,
"step": 2490
},
{
"epoch": 0.7727975270479135,
"grad_norm": 0.0015114221023395658,
"learning_rate": 0.0007424008243173622,
"loss": 0.9048,
"step": 2500
},
{
"epoch": 0.7758887171561051,
"grad_norm": 0.0016935282619670033,
"learning_rate": 0.0007413704276146317,
"loss": 0.9617,
"step": 2510
},
{
"epoch": 0.7789799072642968,
"grad_norm": 0.0015191826969385147,
"learning_rate": 0.000740340030911901,
"loss": 0.927,
"step": 2520
},
{
"epoch": 0.7820710973724884,
"grad_norm": 0.0022661250550299883,
"learning_rate": 0.0007393096342091705,
"loss": 0.8871,
"step": 2530
},
{
"epoch": 0.7851622874806801,
"grad_norm": 0.002032969379797578,
"learning_rate": 0.00073827923750644,
"loss": 0.9149,
"step": 2540
},
{
"epoch": 0.7882534775888718,
"grad_norm": 0.0017924593994393945,
"learning_rate": 0.0007372488408037095,
"loss": 0.9564,
"step": 2550
},
{
"epoch": 0.7913446676970634,
"grad_norm": 0.0014548065373674035,
"learning_rate": 0.0007362184441009789,
"loss": 0.9095,
"step": 2560
},
{
"epoch": 0.794435857805255,
"grad_norm": 0.0025248208548873663,
"learning_rate": 0.0007351880473982483,
"loss": 0.9337,
"step": 2570
},
{
"epoch": 0.7975270479134466,
"grad_norm": 0.002107022562995553,
"learning_rate": 0.0007341576506955178,
"loss": 0.9482,
"step": 2580
},
{
"epoch": 0.8006182380216383,
"grad_norm": 0.0016369119985029101,
"learning_rate": 0.0007331272539927873,
"loss": 0.9035,
"step": 2590
},
{
"epoch": 0.80370942812983,
"grad_norm": 0.0018871091306209564,
"learning_rate": 0.0007320968572900567,
"loss": 0.8882,
"step": 2600
},
{
"epoch": 0.8068006182380216,
"grad_norm": 0.0019260449334979057,
"learning_rate": 0.0007310664605873261,
"loss": 0.9074,
"step": 2610
},
{
"epoch": 0.8098918083462133,
"grad_norm": 0.0018819051329046488,
"learning_rate": 0.0007300360638845955,
"loss": 0.9218,
"step": 2620
},
{
"epoch": 0.8129829984544049,
"grad_norm": 0.0015719749499112368,
"learning_rate": 0.000729005667181865,
"loss": 0.9155,
"step": 2630
},
{
"epoch": 0.8160741885625966,
"grad_norm": 0.0015479732537642121,
"learning_rate": 0.0007279752704791345,
"loss": 0.909,
"step": 2640
},
{
"epoch": 0.8191653786707882,
"grad_norm": 0.0013954916503280401,
"learning_rate": 0.000726944873776404,
"loss": 0.9217,
"step": 2650
},
{
"epoch": 0.8222565687789799,
"grad_norm": 0.0013768866192549467,
"learning_rate": 0.0007259144770736735,
"loss": 0.9496,
"step": 2660
},
{
"epoch": 0.8253477588871716,
"grad_norm": 0.001571536879055202,
"learning_rate": 0.0007248840803709427,
"loss": 0.9305,
"step": 2670
},
{
"epoch": 0.8284389489953632,
"grad_norm": 0.002294939709827304,
"learning_rate": 0.0007238536836682122,
"loss": 0.9262,
"step": 2680
},
{
"epoch": 0.8315301391035549,
"grad_norm": 0.0016881937626749277,
"learning_rate": 0.0007228232869654817,
"loss": 0.8624,
"step": 2690
},
{
"epoch": 0.8346213292117465,
"grad_norm": 0.001524370047263801,
"learning_rate": 0.0007217928902627512,
"loss": 0.9194,
"step": 2700
},
{
"epoch": 0.8377125193199382,
"grad_norm": 0.0020258829463273287,
"learning_rate": 0.0007207624935600207,
"loss": 0.9034,
"step": 2710
},
{
"epoch": 0.8408037094281299,
"grad_norm": 0.0019363865721970797,
"learning_rate": 0.00071973209685729,
"loss": 0.929,
"step": 2720
},
{
"epoch": 0.8438948995363215,
"grad_norm": 0.00164938741363585,
"learning_rate": 0.0007187017001545595,
"loss": 0.9569,
"step": 2730
},
{
"epoch": 0.8469860896445132,
"grad_norm": 0.0016705304151400924,
"learning_rate": 0.000717671303451829,
"loss": 0.8835,
"step": 2740
},
{
"epoch": 0.8500772797527048,
"grad_norm": 0.0018051480874419212,
"learning_rate": 0.0007166409067490984,
"loss": 0.8838,
"step": 2750
},
{
"epoch": 0.8531684698608965,
"grad_norm": 0.0015429792692884803,
"learning_rate": 0.0007156105100463679,
"loss": 0.9004,
"step": 2760
},
{
"epoch": 0.8562596599690881,
"grad_norm": 0.002758611924946308,
"learning_rate": 0.0007145801133436372,
"loss": 0.9299,
"step": 2770
},
{
"epoch": 0.8593508500772797,
"grad_norm": 0.002330324612557888,
"learning_rate": 0.0007135497166409067,
"loss": 0.9749,
"step": 2780
},
{
"epoch": 0.8624420401854714,
"grad_norm": 0.0015028082998469472,
"learning_rate": 0.0007125193199381762,
"loss": 0.9128,
"step": 2790
},
{
"epoch": 0.865533230293663,
"grad_norm": 0.0014309794642031193,
"learning_rate": 0.0007114889232354457,
"loss": 0.8633,
"step": 2800
},
{
"epoch": 0.8686244204018547,
"grad_norm": 0.0013788605574518442,
"learning_rate": 0.0007104585265327152,
"loss": 0.921,
"step": 2810
},
{
"epoch": 0.8717156105100463,
"grad_norm": 0.0019114799797534943,
"learning_rate": 0.0007094281298299847,
"loss": 0.9312,
"step": 2820
},
{
"epoch": 0.874806800618238,
"grad_norm": 0.002490201499313116,
"learning_rate": 0.0007083977331272539,
"loss": 0.9027,
"step": 2830
},
{
"epoch": 0.8778979907264297,
"grad_norm": 0.0015281651867553592,
"learning_rate": 0.0007073673364245234,
"loss": 0.9189,
"step": 2840
},
{
"epoch": 0.8809891808346213,
"grad_norm": 0.0015036484692245722,
"learning_rate": 0.0007063369397217929,
"loss": 0.8979,
"step": 2850
},
{
"epoch": 0.884080370942813,
"grad_norm": 0.0015251452568918467,
"learning_rate": 0.0007053065430190624,
"loss": 0.9028,
"step": 2860
},
{
"epoch": 0.8871715610510046,
"grad_norm": 0.001428957679308951,
"learning_rate": 0.0007042761463163319,
"loss": 0.8717,
"step": 2870
},
{
"epoch": 0.8902627511591963,
"grad_norm": 0.0015625401865690947,
"learning_rate": 0.0007032457496136012,
"loss": 0.9324,
"step": 2880
},
{
"epoch": 0.893353941267388,
"grad_norm": 0.001634020241908729,
"learning_rate": 0.0007022153529108707,
"loss": 0.8697,
"step": 2890
},
{
"epoch": 0.8964451313755796,
"grad_norm": 0.0015363607089966536,
"learning_rate": 0.0007011849562081401,
"loss": 0.8956,
"step": 2900
},
{
"epoch": 0.8995363214837713,
"grad_norm": 0.0012279663933441043,
"learning_rate": 0.0007001545595054096,
"loss": 0.9616,
"step": 2910
},
{
"epoch": 0.9026275115919629,
"grad_norm": 0.0015052827075123787,
"learning_rate": 0.0006991241628026791,
"loss": 0.8944,
"step": 2920
},
{
"epoch": 0.9057187017001546,
"grad_norm": 0.001815865165553987,
"learning_rate": 0.0006980937660999485,
"loss": 0.8739,
"step": 2930
},
{
"epoch": 0.9088098918083463,
"grad_norm": 0.0019412669353187084,
"learning_rate": 0.0006970633693972179,
"loss": 0.8708,
"step": 2940
},
{
"epoch": 0.9119010819165378,
"grad_norm": 0.001834962284192443,
"learning_rate": 0.0006960329726944874,
"loss": 0.9456,
"step": 2950
},
{
"epoch": 0.9149922720247295,
"grad_norm": 0.002000207779929042,
"learning_rate": 0.0006950025759917569,
"loss": 0.9188,
"step": 2960
},
{
"epoch": 0.9180834621329211,
"grad_norm": 0.0017901693936437368,
"learning_rate": 0.0006939721792890264,
"loss": 0.9377,
"step": 2970
},
{
"epoch": 0.9211746522411128,
"grad_norm": 0.002317288890480995,
"learning_rate": 0.0006929417825862957,
"loss": 0.9159,
"step": 2980
},
{
"epoch": 0.9242658423493045,
"grad_norm": 0.0019505108939483762,
"learning_rate": 0.0006919113858835651,
"loss": 0.9171,
"step": 2990
},
{
"epoch": 0.9273570324574961,
"grad_norm": 0.0016651200130581856,
"learning_rate": 0.0006908809891808346,
"loss": 0.9331,
"step": 3000
},
{
"epoch": 0.9304482225656878,
"grad_norm": 0.0016768771456554532,
"learning_rate": 0.0006898505924781041,
"loss": 0.9731,
"step": 3010
},
{
"epoch": 0.9335394126738794,
"grad_norm": 0.0020638711284846067,
"learning_rate": 0.0006888201957753736,
"loss": 0.9023,
"step": 3020
},
{
"epoch": 0.9366306027820711,
"grad_norm": 0.001335518783889711,
"learning_rate": 0.000687789799072643,
"loss": 0.917,
"step": 3030
},
{
"epoch": 0.9397217928902627,
"grad_norm": 0.0013942529913038015,
"learning_rate": 0.0006867594023699125,
"loss": 0.9273,
"step": 3040
},
{
"epoch": 0.9428129829984544,
"grad_norm": 0.0013294010423123837,
"learning_rate": 0.0006857290056671818,
"loss": 0.8909,
"step": 3050
},
{
"epoch": 0.9459041731066461,
"grad_norm": 0.0017269228119403124,
"learning_rate": 0.0006846986089644513,
"loss": 0.9405,
"step": 3060
},
{
"epoch": 0.9489953632148377,
"grad_norm": 0.0018391566118225455,
"learning_rate": 0.0006836682122617208,
"loss": 0.9,
"step": 3070
},
{
"epoch": 0.9520865533230294,
"grad_norm": 0.0018784053390845656,
"learning_rate": 0.0006826378155589902,
"loss": 0.9081,
"step": 3080
},
{
"epoch": 0.955177743431221,
"grad_norm": 0.0017941402038559318,
"learning_rate": 0.0006816074188562597,
"loss": 0.9343,
"step": 3090
},
{
"epoch": 0.9582689335394127,
"grad_norm": 0.0019338412676006556,
"learning_rate": 0.0006805770221535291,
"loss": 0.8904,
"step": 3100
},
{
"epoch": 0.9613601236476044,
"grad_norm": 0.0016496024327352643,
"learning_rate": 0.0006795466254507986,
"loss": 0.8983,
"step": 3110
},
{
"epoch": 0.964451313755796,
"grad_norm": 0.0015189488185569644,
"learning_rate": 0.0006785162287480681,
"loss": 0.9118,
"step": 3120
},
{
"epoch": 0.9675425038639877,
"grad_norm": 0.001589839463122189,
"learning_rate": 0.0006774858320453374,
"loss": 0.8996,
"step": 3130
},
{
"epoch": 0.9706336939721792,
"grad_norm": 0.00146203744225204,
"learning_rate": 0.0006764554353426069,
"loss": 0.9122,
"step": 3140
},
{
"epoch": 0.973724884080371,
"grad_norm": 0.0015465939650312066,
"learning_rate": 0.0006754250386398764,
"loss": 0.935,
"step": 3150
},
{
"epoch": 0.9768160741885626,
"grad_norm": 0.002063942141830921,
"learning_rate": 0.0006743946419371458,
"loss": 0.9088,
"step": 3160
},
{
"epoch": 0.9799072642967542,
"grad_norm": 0.0012433248339220881,
"learning_rate": 0.0006733642452344153,
"loss": 0.8963,
"step": 3170
},
{
"epoch": 0.9829984544049459,
"grad_norm": 0.001582261291332543,
"learning_rate": 0.0006723338485316847,
"loss": 0.9304,
"step": 3180
},
{
"epoch": 0.9860896445131375,
"grad_norm": 0.0015674213645979762,
"learning_rate": 0.0006713034518289542,
"loss": 0.9299,
"step": 3190
},
{
"epoch": 0.9891808346213292,
"grad_norm": 0.0017826062394306064,
"learning_rate": 0.0006702730551262236,
"loss": 0.8951,
"step": 3200
},
{
"epoch": 0.9922720247295209,
"grad_norm": 0.002112460555508733,
"learning_rate": 0.000669242658423493,
"loss": 0.9393,
"step": 3210
},
{
"epoch": 0.9953632148377125,
"grad_norm": 0.0014212781097739935,
"learning_rate": 0.0006682122617207625,
"loss": 0.916,
"step": 3220
},
{
"epoch": 0.9984544049459042,
"grad_norm": 0.001774617237970233,
"learning_rate": 0.0006671818650180319,
"loss": 0.9271,
"step": 3230
},
{
"epoch": 1.001545595054096,
"grad_norm": 0.0016346676275134087,
"learning_rate": 0.0006661514683153014,
"loss": 0.8911,
"step": 3240
},
{
"epoch": 1.0046367851622875,
"grad_norm": 0.0014871886232867837,
"learning_rate": 0.0006651210716125709,
"loss": 0.8988,
"step": 3250
},
{
"epoch": 1.007727975270479,
"grad_norm": 0.0014850738225504756,
"learning_rate": 0.0006640906749098404,
"loss": 0.8936,
"step": 3260
},
{
"epoch": 1.010819165378671,
"grad_norm": 0.0015949340304359794,
"learning_rate": 0.0006630602782071098,
"loss": 0.8987,
"step": 3270
},
{
"epoch": 1.0139103554868625,
"grad_norm": 0.0019407504005357623,
"learning_rate": 0.0006620298815043791,
"loss": 0.8736,
"step": 3280
},
{
"epoch": 1.017001545595054,
"grad_norm": 0.0019730369094759226,
"learning_rate": 0.0006609994848016486,
"loss": 0.8818,
"step": 3290
},
{
"epoch": 1.0200927357032457,
"grad_norm": 0.0018432583892717957,
"learning_rate": 0.0006599690880989181,
"loss": 0.9042,
"step": 3300
},
{
"epoch": 1.0231839258114375,
"grad_norm": 0.0017056462820619345,
"learning_rate": 0.0006589386913961876,
"loss": 0.932,
"step": 3310
},
{
"epoch": 1.026275115919629,
"grad_norm": 0.0015121812466531992,
"learning_rate": 0.000657908294693457,
"loss": 0.8639,
"step": 3320
},
{
"epoch": 1.0293663060278206,
"grad_norm": 0.004580393433570862,
"learning_rate": 0.0006568778979907264,
"loss": 0.9044,
"step": 3330
},
{
"epoch": 1.0324574961360125,
"grad_norm": 0.0015995085705071688,
"learning_rate": 0.0006558475012879959,
"loss": 0.9453,
"step": 3340
},
{
"epoch": 1.035548686244204,
"grad_norm": 0.002061218721792102,
"learning_rate": 0.0006548171045852653,
"loss": 0.8802,
"step": 3350
},
{
"epoch": 1.0386398763523956,
"grad_norm": 0.0015771668404340744,
"learning_rate": 0.0006537867078825348,
"loss": 0.8759,
"step": 3360
},
{
"epoch": 1.0417310664605872,
"grad_norm": 0.0015714009059593081,
"learning_rate": 0.0006527563111798043,
"loss": 0.8915,
"step": 3370
},
{
"epoch": 1.044822256568779,
"grad_norm": 0.0019235257059335709,
"learning_rate": 0.0006517259144770736,
"loss": 0.9143,
"step": 3380
},
{
"epoch": 1.0479134466769706,
"grad_norm": 0.0017326029483228922,
"learning_rate": 0.0006506955177743431,
"loss": 0.8883,
"step": 3390
},
{
"epoch": 1.0510046367851622,
"grad_norm": 0.001543081016279757,
"learning_rate": 0.0006496651210716126,
"loss": 0.9235,
"step": 3400
},
{
"epoch": 1.054095826893354,
"grad_norm": 0.0018034736858680844,
"learning_rate": 0.0006486347243688821,
"loss": 0.9445,
"step": 3410
},
{
"epoch": 1.0571870170015456,
"grad_norm": 0.001585203455761075,
"learning_rate": 0.0006476043276661516,
"loss": 0.9192,
"step": 3420
},
{
"epoch": 1.0602782071097372,
"grad_norm": 0.0018698821077123284,
"learning_rate": 0.0006465739309634208,
"loss": 0.9463,
"step": 3430
},
{
"epoch": 1.063369397217929,
"grad_norm": 0.0014568824553862214,
"learning_rate": 0.0006455435342606903,
"loss": 0.8907,
"step": 3440
},
{
"epoch": 1.0664605873261206,
"grad_norm": 0.002080600941553712,
"learning_rate": 0.0006445131375579598,
"loss": 0.9314,
"step": 3450
},
{
"epoch": 1.0695517774343122,
"grad_norm": 0.0016853328561410308,
"learning_rate": 0.0006434827408552293,
"loss": 0.9277,
"step": 3460
},
{
"epoch": 1.0726429675425038,
"grad_norm": 0.0016854364657774568,
"learning_rate": 0.0006424523441524988,
"loss": 0.9045,
"step": 3470
},
{
"epoch": 1.0757341576506956,
"grad_norm": 0.0020330501720309258,
"learning_rate": 0.0006414219474497683,
"loss": 0.9253,
"step": 3480
},
{
"epoch": 1.0788253477588872,
"grad_norm": 0.001624101772904396,
"learning_rate": 0.0006403915507470376,
"loss": 0.9275,
"step": 3490
},
{
"epoch": 1.0819165378670788,
"grad_norm": 0.0021750659216195345,
"learning_rate": 0.000639361154044307,
"loss": 0.9483,
"step": 3500
},
{
"epoch": 1.0850077279752706,
"grad_norm": 0.002389618894085288,
"learning_rate": 0.0006383307573415765,
"loss": 0.9259,
"step": 3510
},
{
"epoch": 1.0880989180834622,
"grad_norm": 0.0017334523145109415,
"learning_rate": 0.000637300360638846,
"loss": 0.9142,
"step": 3520
},
{
"epoch": 1.0911901081916537,
"grad_norm": 0.0015356771182268858,
"learning_rate": 0.0006362699639361155,
"loss": 0.9139,
"step": 3530
},
{
"epoch": 1.0942812982998453,
"grad_norm": 0.001495121861808002,
"learning_rate": 0.0006352395672333848,
"loss": 0.9382,
"step": 3540
},
{
"epoch": 1.0973724884080371,
"grad_norm": 0.0018960656598210335,
"learning_rate": 0.0006342091705306543,
"loss": 0.9208,
"step": 3550
},
{
"epoch": 1.1004636785162287,
"grad_norm": 0.0019199528032913804,
"learning_rate": 0.0006331787738279238,
"loss": 0.8903,
"step": 3560
},
{
"epoch": 1.1035548686244203,
"grad_norm": 0.0016839156160131097,
"learning_rate": 0.0006321483771251933,
"loss": 0.8923,
"step": 3570
},
{
"epoch": 1.1066460587326121,
"grad_norm": 0.0016232216730713844,
"learning_rate": 0.0006311179804224627,
"loss": 0.8847,
"step": 3580
},
{
"epoch": 1.1097372488408037,
"grad_norm": 0.0016339183785021305,
"learning_rate": 0.000630087583719732,
"loss": 0.9283,
"step": 3590
},
{
"epoch": 1.1128284389489953,
"grad_norm": 0.001477651298046112,
"learning_rate": 0.0006290571870170015,
"loss": 0.9211,
"step": 3600
},
{
"epoch": 1.1159196290571871,
"grad_norm": 0.0016854658024385571,
"learning_rate": 0.000628026790314271,
"loss": 0.8857,
"step": 3610
},
{
"epoch": 1.1190108191653787,
"grad_norm": 0.001720211817882955,
"learning_rate": 0.0006269963936115405,
"loss": 0.93,
"step": 3620
},
{
"epoch": 1.1221020092735703,
"grad_norm": 0.0018675002502277493,
"learning_rate": 0.00062596599690881,
"loss": 0.9222,
"step": 3630
},
{
"epoch": 1.125193199381762,
"grad_norm": 0.0014751511625945568,
"learning_rate": 0.0006249356002060794,
"loss": 0.8624,
"step": 3640
},
{
"epoch": 1.1282843894899537,
"grad_norm": 0.001356113119982183,
"learning_rate": 0.0006239052035033487,
"loss": 0.92,
"step": 3650
},
{
"epoch": 1.1313755795981453,
"grad_norm": 0.0012932941317558289,
"learning_rate": 0.0006228748068006182,
"loss": 0.9394,
"step": 3660
},
{
"epoch": 1.1344667697063369,
"grad_norm": 0.0022874162532389164,
"learning_rate": 0.0006218444100978877,
"loss": 0.8998,
"step": 3670
},
{
"epoch": 1.1375579598145287,
"grad_norm": 0.0017121599521487951,
"learning_rate": 0.0006208140133951572,
"loss": 0.9188,
"step": 3680
},
{
"epoch": 1.1406491499227203,
"grad_norm": 0.001489990041591227,
"learning_rate": 0.0006197836166924266,
"loss": 0.8829,
"step": 3690
},
{
"epoch": 1.1437403400309119,
"grad_norm": 0.0015349757159128785,
"learning_rate": 0.000618753219989696,
"loss": 0.9028,
"step": 3700
},
{
"epoch": 1.1468315301391034,
"grad_norm": 0.0022630542516708374,
"learning_rate": 0.0006177228232869655,
"loss": 0.9536,
"step": 3710
},
{
"epoch": 1.1499227202472952,
"grad_norm": 0.0017121895216405392,
"learning_rate": 0.000616692426584235,
"loss": 0.9069,
"step": 3720
},
{
"epoch": 1.1530139103554868,
"grad_norm": 0.0019011534750461578,
"learning_rate": 0.0006156620298815044,
"loss": 0.8873,
"step": 3730
},
{
"epoch": 1.1561051004636784,
"grad_norm": 0.0014921397669240832,
"learning_rate": 0.0006146316331787738,
"loss": 0.9354,
"step": 3740
},
{
"epoch": 1.1591962905718702,
"grad_norm": 0.001888715079985559,
"learning_rate": 0.0006136012364760433,
"loss": 0.9054,
"step": 3750
},
{
"epoch": 1.1622874806800618,
"grad_norm": 0.0026860106736421585,
"learning_rate": 0.0006125708397733127,
"loss": 0.9096,
"step": 3760
},
{
"epoch": 1.1653786707882534,
"grad_norm": 0.0014799319906160235,
"learning_rate": 0.0006115404430705822,
"loss": 0.892,
"step": 3770
},
{
"epoch": 1.1684698608964452,
"grad_norm": 0.001760624349117279,
"learning_rate": 0.0006105100463678517,
"loss": 0.9615,
"step": 3780
},
{
"epoch": 1.1715610510046368,
"grad_norm": 0.0016477296594530344,
"learning_rate": 0.0006094796496651211,
"loss": 0.8632,
"step": 3790
},
{
"epoch": 1.1746522411128284,
"grad_norm": 0.0015910883666947484,
"learning_rate": 0.0006084492529623905,
"loss": 0.9434,
"step": 3800
},
{
"epoch": 1.1777434312210202,
"grad_norm": 0.002248365432024002,
"learning_rate": 0.0006074188562596599,
"loss": 0.9087,
"step": 3810
},
{
"epoch": 1.1808346213292118,
"grad_norm": 0.0020230677910149097,
"learning_rate": 0.0006063884595569294,
"loss": 0.9356,
"step": 3820
},
{
"epoch": 1.1839258114374034,
"grad_norm": 0.0016803268808871508,
"learning_rate": 0.0006053580628541989,
"loss": 0.9394,
"step": 3830
},
{
"epoch": 1.187017001545595,
"grad_norm": 0.0012555584544315934,
"learning_rate": 0.0006043276661514683,
"loss": 0.8883,
"step": 3840
},
{
"epoch": 1.1901081916537868,
"grad_norm": 0.0015084665501490235,
"learning_rate": 0.0006032972694487378,
"loss": 0.8967,
"step": 3850
},
{
"epoch": 1.1931993817619784,
"grad_norm": 0.001311285886913538,
"learning_rate": 0.0006022668727460073,
"loss": 0.8849,
"step": 3860
},
{
"epoch": 1.19629057187017,
"grad_norm": 0.0018462970620021224,
"learning_rate": 0.0006012364760432767,
"loss": 0.8442,
"step": 3870
},
{
"epoch": 1.1993817619783615,
"grad_norm": 0.0016767915803939104,
"learning_rate": 0.0006002060793405461,
"loss": 0.9013,
"step": 3880
},
{
"epoch": 1.2024729520865534,
"grad_norm": 0.0018409952754154801,
"learning_rate": 0.0005991756826378155,
"loss": 0.9189,
"step": 3890
},
{
"epoch": 1.205564142194745,
"grad_norm": 0.0016513046575710177,
"learning_rate": 0.000598145285935085,
"loss": 0.935,
"step": 3900
},
{
"epoch": 1.2086553323029365,
"grad_norm": 0.0017494558123871684,
"learning_rate": 0.0005971148892323545,
"loss": 0.9038,
"step": 3910
},
{
"epoch": 1.2117465224111283,
"grad_norm": 0.0021330672316253185,
"learning_rate": 0.0005960844925296239,
"loss": 0.8762,
"step": 3920
},
{
"epoch": 1.21483771251932,
"grad_norm": 0.00180672702845186,
"learning_rate": 0.0005950540958268934,
"loss": 0.8729,
"step": 3930
},
{
"epoch": 1.2179289026275115,
"grad_norm": 0.001298164832405746,
"learning_rate": 0.0005940236991241628,
"loss": 0.9208,
"step": 3940
},
{
"epoch": 1.2210200927357033,
"grad_norm": 0.0016548632411286235,
"learning_rate": 0.0005929933024214322,
"loss": 0.8658,
"step": 3950
},
{
"epoch": 1.224111282843895,
"grad_norm": 0.0019319544080644846,
"learning_rate": 0.0005919629057187017,
"loss": 0.9357,
"step": 3960
},
{
"epoch": 1.2272024729520865,
"grad_norm": 0.0016805862542241812,
"learning_rate": 0.0005909325090159712,
"loss": 0.8781,
"step": 3970
},
{
"epoch": 1.2302936630602783,
"grad_norm": 0.0023612873628735542,
"learning_rate": 0.0005899021123132406,
"loss": 0.8979,
"step": 3980
},
{
"epoch": 1.23338485316847,
"grad_norm": 0.001474004122428596,
"learning_rate": 0.00058887171561051,
"loss": 0.9272,
"step": 3990
},
{
"epoch": 1.2364760432766615,
"grad_norm": 0.0016212017508223653,
"learning_rate": 0.0005878413189077795,
"loss": 0.9453,
"step": 4000
},
{
"epoch": 1.239567233384853,
"grad_norm": 0.0017642155289649963,
"learning_rate": 0.000586810922205049,
"loss": 0.9517,
"step": 4010
},
{
"epoch": 1.242658423493045,
"grad_norm": 0.0018736496567726135,
"learning_rate": 0.0005857805255023185,
"loss": 0.8862,
"step": 4020
},
{
"epoch": 1.2457496136012365,
"grad_norm": 0.001532053924165666,
"learning_rate": 0.0005847501287995878,
"loss": 0.8866,
"step": 4030
},
{
"epoch": 1.248840803709428,
"grad_norm": 0.0017100380500778556,
"learning_rate": 0.0005837197320968572,
"loss": 0.9678,
"step": 4040
},
{
"epoch": 1.2519319938176197,
"grad_norm": 0.0018053441308438778,
"learning_rate": 0.0005826893353941267,
"loss": 0.9066,
"step": 4050
},
{
"epoch": 1.2550231839258115,
"grad_norm": 0.0015492727980017662,
"learning_rate": 0.0005816589386913962,
"loss": 0.9649,
"step": 4060
},
{
"epoch": 1.258114374034003,
"grad_norm": 0.0014449515147134662,
"learning_rate": 0.0005806285419886657,
"loss": 0.8973,
"step": 4070
},
{
"epoch": 1.2612055641421946,
"grad_norm": 0.001734506106004119,
"learning_rate": 0.0005795981452859352,
"loss": 0.8801,
"step": 4080
},
{
"epoch": 1.2642967542503865,
"grad_norm": 0.0013686501188203692,
"learning_rate": 0.0005785677485832046,
"loss": 0.8936,
"step": 4090
},
{
"epoch": 1.267387944358578,
"grad_norm": 0.0016823920886963606,
"learning_rate": 0.0005775373518804739,
"loss": 0.9334,
"step": 4100
},
{
"epoch": 1.2704791344667696,
"grad_norm": 0.0022253175266087055,
"learning_rate": 0.0005765069551777434,
"loss": 0.9328,
"step": 4110
},
{
"epoch": 1.2735703245749614,
"grad_norm": 0.0017447506543248892,
"learning_rate": 0.0005754765584750129,
"loss": 0.884,
"step": 4120
},
{
"epoch": 1.276661514683153,
"grad_norm": 0.0017223991453647614,
"learning_rate": 0.0005744461617722824,
"loss": 0.8914,
"step": 4130
},
{
"epoch": 1.2797527047913446,
"grad_norm": 0.001338414615020156,
"learning_rate": 0.0005734157650695518,
"loss": 0.8588,
"step": 4140
},
{
"epoch": 1.2828438948995364,
"grad_norm": 0.0020726649090647697,
"learning_rate": 0.0005723853683668212,
"loss": 0.8827,
"step": 4150
},
{
"epoch": 1.285935085007728,
"grad_norm": 0.0016284299781545997,
"learning_rate": 0.0005713549716640907,
"loss": 0.9044,
"step": 4160
},
{
"epoch": 1.2890262751159196,
"grad_norm": 0.0015132429543882608,
"learning_rate": 0.0005703245749613602,
"loss": 0.8997,
"step": 4170
},
{
"epoch": 1.2921174652241114,
"grad_norm": 0.0019543899688869715,
"learning_rate": 0.0005692941782586296,
"loss": 0.8779,
"step": 4180
},
{
"epoch": 1.295208655332303,
"grad_norm": 0.0016743885353207588,
"learning_rate": 0.000568263781555899,
"loss": 0.9378,
"step": 4190
},
{
"epoch": 1.2982998454404946,
"grad_norm": 0.0015272155869752169,
"learning_rate": 0.0005672333848531684,
"loss": 0.9391,
"step": 4200
},
{
"epoch": 1.3013910355486862,
"grad_norm": 0.001885525998659432,
"learning_rate": 0.0005662029881504379,
"loss": 0.9257,
"step": 4210
},
{
"epoch": 1.3044822256568778,
"grad_norm": 0.0016695179510861635,
"learning_rate": 0.0005651725914477074,
"loss": 0.9012,
"step": 4220
},
{
"epoch": 1.3075734157650696,
"grad_norm": 0.0013361867750063539,
"learning_rate": 0.0005641421947449769,
"loss": 0.8968,
"step": 4230
},
{
"epoch": 1.3106646058732612,
"grad_norm": 0.0015216304454952478,
"learning_rate": 0.0005631117980422464,
"loss": 0.9027,
"step": 4240
},
{
"epoch": 1.3137557959814528,
"grad_norm": 0.0013232153141871095,
"learning_rate": 0.0005620814013395156,
"loss": 0.8984,
"step": 4250
},
{
"epoch": 1.3168469860896446,
"grad_norm": 0.0019559410866349936,
"learning_rate": 0.0005610510046367851,
"loss": 0.8915,
"step": 4260
},
{
"epoch": 1.3199381761978362,
"grad_norm": 0.0014317093882709742,
"learning_rate": 0.0005600206079340546,
"loss": 0.9054,
"step": 4270
},
{
"epoch": 1.3230293663060277,
"grad_norm": 0.0013388870283961296,
"learning_rate": 0.0005589902112313241,
"loss": 0.8892,
"step": 4280
},
{
"epoch": 1.3261205564142196,
"grad_norm": 0.0015756129287183285,
"learning_rate": 0.0005579598145285936,
"loss": 0.8773,
"step": 4290
},
{
"epoch": 1.3292117465224111,
"grad_norm": 0.0014559467090293765,
"learning_rate": 0.0005569294178258629,
"loss": 0.9396,
"step": 4300
},
{
"epoch": 1.3323029366306027,
"grad_norm": 0.0015288847498595715,
"learning_rate": 0.0005558990211231324,
"loss": 0.8731,
"step": 4310
},
{
"epoch": 1.3353941267387945,
"grad_norm": 0.0019514070590958,
"learning_rate": 0.0005548686244204019,
"loss": 0.9192,
"step": 4320
},
{
"epoch": 1.3384853168469861,
"grad_norm": 0.001415872247889638,
"learning_rate": 0.0005538382277176713,
"loss": 0.8815,
"step": 4330
},
{
"epoch": 1.3415765069551777,
"grad_norm": 0.0014958428218960762,
"learning_rate": 0.0005528078310149408,
"loss": 0.9032,
"step": 4340
},
{
"epoch": 1.3446676970633695,
"grad_norm": 0.002053620759397745,
"learning_rate": 0.0005517774343122102,
"loss": 0.887,
"step": 4350
},
{
"epoch": 1.3477588871715611,
"grad_norm": 0.0018398250686004758,
"learning_rate": 0.0005507470376094796,
"loss": 0.9225,
"step": 4360
},
{
"epoch": 1.3508500772797527,
"grad_norm": 0.0017333675641566515,
"learning_rate": 0.0005497166409067491,
"loss": 0.8951,
"step": 4370
},
{
"epoch": 1.3539412673879443,
"grad_norm": 0.002342061372473836,
"learning_rate": 0.0005486862442040186,
"loss": 0.8829,
"step": 4380
},
{
"epoch": 1.3570324574961359,
"grad_norm": 0.00188271829392761,
"learning_rate": 0.0005476558475012881,
"loss": 0.8816,
"step": 4390
},
{
"epoch": 1.3601236476043277,
"grad_norm": 0.0013321408769115806,
"learning_rate": 0.0005466254507985575,
"loss": 0.9114,
"step": 4400
},
{
"epoch": 1.3632148377125193,
"grad_norm": 0.00140297575853765,
"learning_rate": 0.0005455950540958268,
"loss": 0.9209,
"step": 4410
},
{
"epoch": 1.3663060278207109,
"grad_norm": 0.002004598267376423,
"learning_rate": 0.0005445646573930963,
"loss": 0.8764,
"step": 4420
},
{
"epoch": 1.3693972179289027,
"grad_norm": 0.0019030956318601966,
"learning_rate": 0.0005435342606903658,
"loss": 0.9532,
"step": 4430
},
{
"epoch": 1.3724884080370943,
"grad_norm": 0.002172063337638974,
"learning_rate": 0.0005425038639876353,
"loss": 0.9243,
"step": 4440
},
{
"epoch": 1.3755795981452859,
"grad_norm": 0.0018728856230154634,
"learning_rate": 0.0005414734672849047,
"loss": 0.893,
"step": 4450
},
{
"epoch": 1.3786707882534777,
"grad_norm": 0.0015217667678371072,
"learning_rate": 0.0005404430705821742,
"loss": 0.9209,
"step": 4460
},
{
"epoch": 1.3817619783616693,
"grad_norm": 0.001939924550242722,
"learning_rate": 0.0005394126738794436,
"loss": 0.9407,
"step": 4470
},
{
"epoch": 1.3848531684698608,
"grad_norm": 0.001418776111677289,
"learning_rate": 0.000538382277176713,
"loss": 0.8918,
"step": 4480
},
{
"epoch": 1.3879443585780527,
"grad_norm": 0.0015887707704678178,
"learning_rate": 0.0005373518804739825,
"loss": 0.8835,
"step": 4490
},
{
"epoch": 1.3910355486862442,
"grad_norm": 0.002404952421784401,
"learning_rate": 0.0005363214837712519,
"loss": 0.9215,
"step": 4500
},
{
"epoch": 1.3941267387944358,
"grad_norm": 0.0013456016313284636,
"learning_rate": 0.0005352910870685214,
"loss": 0.9352,
"step": 4510
},
{
"epoch": 1.3972179289026276,
"grad_norm": 0.0014747907407581806,
"learning_rate": 0.0005342606903657908,
"loss": 0.9033,
"step": 4520
},
{
"epoch": 1.4003091190108192,
"grad_norm": 0.0016936981119215488,
"learning_rate": 0.0005332302936630603,
"loss": 0.8774,
"step": 4530
},
{
"epoch": 1.4034003091190108,
"grad_norm": 0.00203963671810925,
"learning_rate": 0.0005321998969603298,
"loss": 0.941,
"step": 4540
},
{
"epoch": 1.4064914992272024,
"grad_norm": 0.0018227493856102228,
"learning_rate": 0.0005311695002575992,
"loss": 0.8724,
"step": 4550
},
{
"epoch": 1.409582689335394,
"grad_norm": 0.002075436757877469,
"learning_rate": 0.0005301391035548686,
"loss": 0.8892,
"step": 4560
},
{
"epoch": 1.4126738794435858,
"grad_norm": 0.0016266778111457825,
"learning_rate": 0.000529108706852138,
"loss": 0.8583,
"step": 4570
},
{
"epoch": 1.4157650695517774,
"grad_norm": 0.0016663891728967428,
"learning_rate": 0.0005280783101494075,
"loss": 0.9634,
"step": 4580
},
{
"epoch": 1.418856259659969,
"grad_norm": 0.0015336048090830445,
"learning_rate": 0.000527047913446677,
"loss": 0.9069,
"step": 4590
},
{
"epoch": 1.4219474497681608,
"grad_norm": 0.0020592466462403536,
"learning_rate": 0.0005260175167439464,
"loss": 0.8638,
"step": 4600
},
{
"epoch": 1.4250386398763524,
"grad_norm": 0.0019336834084242582,
"learning_rate": 0.0005249871200412159,
"loss": 0.9,
"step": 4610
},
{
"epoch": 1.428129829984544,
"grad_norm": 0.001620001159608364,
"learning_rate": 0.0005239567233384854,
"loss": 0.8783,
"step": 4620
},
{
"epoch": 1.4312210200927358,
"grad_norm": 0.0018929082434624434,
"learning_rate": 0.0005229263266357547,
"loss": 0.8888,
"step": 4630
},
{
"epoch": 1.4343122102009274,
"grad_norm": 0.00144308025483042,
"learning_rate": 0.0005218959299330242,
"loss": 0.9338,
"step": 4640
},
{
"epoch": 1.437403400309119,
"grad_norm": 0.0015054781688377261,
"learning_rate": 0.0005208655332302936,
"loss": 0.8962,
"step": 4650
},
{
"epoch": 1.4404945904173108,
"grad_norm": 0.0017711712280288339,
"learning_rate": 0.0005198351365275631,
"loss": 0.9417,
"step": 4660
},
{
"epoch": 1.4435857805255023,
"grad_norm": 0.0019218528177589178,
"learning_rate": 0.0005188047398248326,
"loss": 0.9703,
"step": 4670
},
{
"epoch": 1.446676970633694,
"grad_norm": 0.0019779358990490437,
"learning_rate": 0.000517774343122102,
"loss": 0.8804,
"step": 4680
},
{
"epoch": 1.4497681607418857,
"grad_norm": 0.001458328333683312,
"learning_rate": 0.0005167439464193715,
"loss": 0.9228,
"step": 4690
},
{
"epoch": 1.4528593508500773,
"grad_norm": 0.0017885727575048804,
"learning_rate": 0.000515713549716641,
"loss": 0.9256,
"step": 4700
},
{
"epoch": 1.455950540958269,
"grad_norm": 0.0015944467158988118,
"learning_rate": 0.0005146831530139103,
"loss": 0.8739,
"step": 4710
},
{
"epoch": 1.4590417310664605,
"grad_norm": 0.001835488947108388,
"learning_rate": 0.0005136527563111798,
"loss": 0.952,
"step": 4720
},
{
"epoch": 1.4621329211746523,
"grad_norm": 0.0025023729540407658,
"learning_rate": 0.0005126223596084493,
"loss": 0.9499,
"step": 4730
},
{
"epoch": 1.465224111282844,
"grad_norm": 0.0017449932638555765,
"learning_rate": 0.0005115919629057187,
"loss": 0.9269,
"step": 4740
},
{
"epoch": 1.4683153013910355,
"grad_norm": 0.0013545970432460308,
"learning_rate": 0.0005105615662029882,
"loss": 0.8909,
"step": 4750
},
{
"epoch": 1.471406491499227,
"grad_norm": 0.001730005955323577,
"learning_rate": 0.0005095311695002576,
"loss": 0.8975,
"step": 4760
},
{
"epoch": 1.474497681607419,
"grad_norm": 0.0017201779410243034,
"learning_rate": 0.0005085007727975271,
"loss": 0.9217,
"step": 4770
},
{
"epoch": 1.4775888717156105,
"grad_norm": 0.0020651696249842644,
"learning_rate": 0.0005074703760947965,
"loss": 0.8986,
"step": 4780
},
{
"epoch": 1.480680061823802,
"grad_norm": 0.0016624495619907975,
"learning_rate": 0.000506439979392066,
"loss": 0.8983,
"step": 4790
},
{
"epoch": 1.4837712519319939,
"grad_norm": 0.0014232158428058028,
"learning_rate": 0.0005054095826893354,
"loss": 0.8733,
"step": 4800
},
{
"epoch": 1.4868624420401855,
"grad_norm": 0.0019593520555645227,
"learning_rate": 0.0005043791859866048,
"loss": 0.9165,
"step": 4810
},
{
"epoch": 1.489953632148377,
"grad_norm": 0.002281294437125325,
"learning_rate": 0.0005033487892838743,
"loss": 0.9311,
"step": 4820
},
{
"epoch": 1.4930448222565689,
"grad_norm": 0.001690705306828022,
"learning_rate": 0.0005023183925811438,
"loss": 0.9152,
"step": 4830
},
{
"epoch": 1.4961360123647605,
"grad_norm": 0.0024157485458999872,
"learning_rate": 0.0005012879958784133,
"loss": 0.9276,
"step": 4840
},
{
"epoch": 1.499227202472952,
"grad_norm": 0.0015235710889101028,
"learning_rate": 0.0005002575991756827,
"loss": 0.8763,
"step": 4850
},
{
"epoch": 1.5023183925811439,
"grad_norm": 0.0018749197479337454,
"learning_rate": 0.0004992272024729521,
"loss": 0.9412,
"step": 4860
},
{
"epoch": 1.5054095826893354,
"grad_norm": 0.0020271523389965296,
"learning_rate": 0.0004981968057702215,
"loss": 0.8985,
"step": 4870
},
{
"epoch": 1.508500772797527,
"grad_norm": 0.0015005801105871797,
"learning_rate": 0.000497166409067491,
"loss": 0.9426,
"step": 4880
},
{
"epoch": 1.5115919629057188,
"grad_norm": 0.002262561582028866,
"learning_rate": 0.0004961360123647605,
"loss": 0.9107,
"step": 4890
},
{
"epoch": 1.5146831530139102,
"grad_norm": 0.001319503178820014,
"learning_rate": 0.0004951056156620298,
"loss": 0.9054,
"step": 4900
},
{
"epoch": 1.517774343122102,
"grad_norm": 0.0014885893324390054,
"learning_rate": 0.0004940752189592993,
"loss": 0.9265,
"step": 4910
},
{
"epoch": 1.5208655332302936,
"grad_norm": 0.0017433296889066696,
"learning_rate": 0.0004930448222565688,
"loss": 0.8997,
"step": 4920
},
{
"epoch": 1.5239567233384852,
"grad_norm": 0.0013538316125050187,
"learning_rate": 0.0004920144255538382,
"loss": 0.8618,
"step": 4930
},
{
"epoch": 1.527047913446677,
"grad_norm": 0.0014708703383803368,
"learning_rate": 0.0004909840288511077,
"loss": 0.9203,
"step": 4940
},
{
"epoch": 1.5301391035548686,
"grad_norm": 0.0017004169058054686,
"learning_rate": 0.0004899536321483772,
"loss": 0.8936,
"step": 4950
},
{
"epoch": 1.5332302936630602,
"grad_norm": 0.0017624979373067617,
"learning_rate": 0.0004889232354456466,
"loss": 0.8778,
"step": 4960
},
{
"epoch": 1.536321483771252,
"grad_norm": 0.0015045328764244914,
"learning_rate": 0.000487892838742916,
"loss": 0.8828,
"step": 4970
},
{
"epoch": 1.5394126738794436,
"grad_norm": 0.0018641521455720067,
"learning_rate": 0.0004868624420401855,
"loss": 0.9158,
"step": 4980
},
{
"epoch": 1.5425038639876352,
"grad_norm": 0.001571865752339363,
"learning_rate": 0.00048583204533745493,
"loss": 0.8962,
"step": 4990
},
{
"epoch": 1.545595054095827,
"grad_norm": 0.0016725645400583744,
"learning_rate": 0.00048480164863472436,
"loss": 0.9001,
"step": 5000
},
{
"epoch": 1.5486862442040186,
"grad_norm": 0.001643617171794176,
"learning_rate": 0.00048377125193199385,
"loss": 0.9103,
"step": 5010
},
{
"epoch": 1.5517774343122102,
"grad_norm": 0.002170222345739603,
"learning_rate": 0.0004827408552292633,
"loss": 0.9045,
"step": 5020
},
{
"epoch": 1.554868624420402,
"grad_norm": 0.0015412492211908102,
"learning_rate": 0.0004817104585265327,
"loss": 0.8995,
"step": 5030
},
{
"epoch": 1.5579598145285936,
"grad_norm": 0.0019475616281852126,
"learning_rate": 0.0004806800618238022,
"loss": 0.8543,
"step": 5040
},
{
"epoch": 1.5610510046367851,
"grad_norm": 0.0014466085704043508,
"learning_rate": 0.0004796496651210716,
"loss": 0.899,
"step": 5050
},
{
"epoch": 1.564142194744977,
"grad_norm": 0.0015615527518093586,
"learning_rate": 0.00047861926841834105,
"loss": 0.9303,
"step": 5060
},
{
"epoch": 1.5672333848531683,
"grad_norm": 0.0016237753443419933,
"learning_rate": 0.0004775888717156105,
"loss": 0.914,
"step": 5070
},
{
"epoch": 1.5703245749613601,
"grad_norm": 0.0015945249469950795,
"learning_rate": 0.00047655847501287997,
"loss": 0.899,
"step": 5080
},
{
"epoch": 1.573415765069552,
"grad_norm": 0.001989311771467328,
"learning_rate": 0.00047552807831014945,
"loss": 0.9609,
"step": 5090
},
{
"epoch": 1.5765069551777433,
"grad_norm": 0.0025777083355933428,
"learning_rate": 0.00047449768160741883,
"loss": 0.9187,
"step": 5100
},
{
"epoch": 1.5795981452859351,
"grad_norm": 0.0016967840492725372,
"learning_rate": 0.0004734672849046883,
"loss": 0.9198,
"step": 5110
},
{
"epoch": 1.5826893353941267,
"grad_norm": 0.0015623560175299644,
"learning_rate": 0.00047243688820195774,
"loss": 0.9066,
"step": 5120
},
{
"epoch": 1.5857805255023183,
"grad_norm": 0.0014336062595248222,
"learning_rate": 0.00047140649149922723,
"loss": 0.8992,
"step": 5130
},
{
"epoch": 1.58887171561051,
"grad_norm": 0.0018111519748345017,
"learning_rate": 0.00047037609479649666,
"loss": 0.9826,
"step": 5140
},
{
"epoch": 1.5919629057187017,
"grad_norm": 0.0016681707929819822,
"learning_rate": 0.0004693456980937661,
"loss": 0.9236,
"step": 5150
},
{
"epoch": 1.5950540958268933,
"grad_norm": 0.0015410635387524962,
"learning_rate": 0.0004683153013910356,
"loss": 0.9151,
"step": 5160
},
{
"epoch": 1.598145285935085,
"grad_norm": 0.0017971232300624251,
"learning_rate": 0.00046728490468830506,
"loss": 0.9007,
"step": 5170
},
{
"epoch": 1.6012364760432767,
"grad_norm": 0.0019288517069071531,
"learning_rate": 0.00046625450798557443,
"loss": 0.9123,
"step": 5180
},
{
"epoch": 1.6043276661514683,
"grad_norm": 0.0013020862825214863,
"learning_rate": 0.0004652241112828439,
"loss": 0.8848,
"step": 5190
},
{
"epoch": 1.60741885625966,
"grad_norm": 0.0015427186153829098,
"learning_rate": 0.00046419371458011335,
"loss": 0.948,
"step": 5200
},
{
"epoch": 1.6105100463678517,
"grad_norm": 0.0016680203843861818,
"learning_rate": 0.0004631633178773828,
"loss": 0.9122,
"step": 5210
},
{
"epoch": 1.6136012364760433,
"grad_norm": 0.0014202597085386515,
"learning_rate": 0.00046213292117465226,
"loss": 0.8974,
"step": 5220
},
{
"epoch": 1.616692426584235,
"grad_norm": 0.0018021473661065102,
"learning_rate": 0.0004611025244719217,
"loss": 0.8518,
"step": 5230
},
{
"epoch": 1.6197836166924264,
"grad_norm": 0.001819357625208795,
"learning_rate": 0.0004600721277691912,
"loss": 0.9143,
"step": 5240
},
{
"epoch": 1.6228748068006182,
"grad_norm": 0.0018893377855420113,
"learning_rate": 0.00045904173106646055,
"loss": 0.8866,
"step": 5250
},
{
"epoch": 1.62596599690881,
"grad_norm": 0.0018815461080521345,
"learning_rate": 0.00045801133436373004,
"loss": 0.9116,
"step": 5260
},
{
"epoch": 1.6290571870170014,
"grad_norm": 0.0018397814128547907,
"learning_rate": 0.0004569809376609995,
"loss": 0.9021,
"step": 5270
},
{
"epoch": 1.6321483771251932,
"grad_norm": 0.002361022401601076,
"learning_rate": 0.00045595054095826895,
"loss": 0.9044,
"step": 5280
},
{
"epoch": 1.6352395672333848,
"grad_norm": 0.0016238827956840396,
"learning_rate": 0.0004549201442555384,
"loss": 0.9038,
"step": 5290
},
{
"epoch": 1.6383307573415764,
"grad_norm": 0.0020596208050847054,
"learning_rate": 0.0004538897475528078,
"loss": 0.8645,
"step": 5300
},
{
"epoch": 1.6414219474497682,
"grad_norm": 0.0016590558225288987,
"learning_rate": 0.0004528593508500773,
"loss": 0.8853,
"step": 5310
},
{
"epoch": 1.6445131375579598,
"grad_norm": 0.0019768117927014828,
"learning_rate": 0.0004518289541473468,
"loss": 0.8622,
"step": 5320
},
{
"epoch": 1.6476043276661514,
"grad_norm": 0.0016761173028498888,
"learning_rate": 0.00045079855744461616,
"loss": 0.9272,
"step": 5330
},
{
"epoch": 1.6506955177743432,
"grad_norm": 0.0013793542748317122,
"learning_rate": 0.00044976816074188564,
"loss": 0.935,
"step": 5340
},
{
"epoch": 1.6537867078825348,
"grad_norm": 0.001541083212941885,
"learning_rate": 0.0004487377640391551,
"loss": 0.8854,
"step": 5350
},
{
"epoch": 1.6568778979907264,
"grad_norm": 0.0018007430480793118,
"learning_rate": 0.00044770736733642456,
"loss": 0.8571,
"step": 5360
},
{
"epoch": 1.6599690880989182,
"grad_norm": 0.0015837026294320822,
"learning_rate": 0.000446676970633694,
"loss": 0.8731,
"step": 5370
},
{
"epoch": 1.6630602782071098,
"grad_norm": 0.0019535787869244814,
"learning_rate": 0.0004456465739309634,
"loss": 0.8952,
"step": 5380
},
{
"epoch": 1.6661514683153014,
"grad_norm": 0.0015085084596648812,
"learning_rate": 0.0004446161772282329,
"loss": 0.9039,
"step": 5390
},
{
"epoch": 1.6692426584234932,
"grad_norm": 0.0016192490002140403,
"learning_rate": 0.0004435857805255023,
"loss": 0.8972,
"step": 5400
},
{
"epoch": 1.6723338485316845,
"grad_norm": 0.0025146189145743847,
"learning_rate": 0.00044255538382277176,
"loss": 0.909,
"step": 5410
},
{
"epoch": 1.6754250386398764,
"grad_norm": 0.001912578009068966,
"learning_rate": 0.00044152498712004125,
"loss": 0.909,
"step": 5420
},
{
"epoch": 1.6785162287480682,
"grad_norm": 0.0015931734815239906,
"learning_rate": 0.0004404945904173107,
"loss": 0.9285,
"step": 5430
},
{
"epoch": 1.6816074188562595,
"grad_norm": 0.0015723604010418057,
"learning_rate": 0.0004394641937145801,
"loss": 0.883,
"step": 5440
},
{
"epoch": 1.6846986089644513,
"grad_norm": 0.0017312741838395596,
"learning_rate": 0.00043843379701184954,
"loss": 0.8687,
"step": 5450
},
{
"epoch": 1.687789799072643,
"grad_norm": 0.0015717818168923259,
"learning_rate": 0.000437403400309119,
"loss": 0.8634,
"step": 5460
},
{
"epoch": 1.6908809891808345,
"grad_norm": 0.0017777059692889452,
"learning_rate": 0.0004363730036063885,
"loss": 0.9342,
"step": 5470
},
{
"epoch": 1.6939721792890263,
"grad_norm": 0.0014749905094504356,
"learning_rate": 0.0004353426069036579,
"loss": 0.8881,
"step": 5480
},
{
"epoch": 1.697063369397218,
"grad_norm": 0.0015921080484986305,
"learning_rate": 0.00043431221020092737,
"loss": 0.8682,
"step": 5490
},
{
"epoch": 1.7001545595054095,
"grad_norm": 0.0017204548930749297,
"learning_rate": 0.00043328181349819685,
"loss": 0.9315,
"step": 5500
},
{
"epoch": 1.7032457496136013,
"grad_norm": 0.0013450038386508822,
"learning_rate": 0.0004322514167954663,
"loss": 0.8749,
"step": 5510
},
{
"epoch": 1.706336939721793,
"grad_norm": 0.0020300759933888912,
"learning_rate": 0.0004312210200927357,
"loss": 0.9331,
"step": 5520
},
{
"epoch": 1.7094281298299845,
"grad_norm": 0.0023906801361590624,
"learning_rate": 0.00043019062339000514,
"loss": 0.9049,
"step": 5530
},
{
"epoch": 1.7125193199381763,
"grad_norm": 0.0016755072865635157,
"learning_rate": 0.00042916022668727463,
"loss": 0.8818,
"step": 5540
},
{
"epoch": 1.7156105100463679,
"grad_norm": 0.001380381640046835,
"learning_rate": 0.00042812982998454406,
"loss": 0.9016,
"step": 5550
},
{
"epoch": 1.7187017001545595,
"grad_norm": 0.0016399535816162825,
"learning_rate": 0.0004270994332818135,
"loss": 0.9137,
"step": 5560
},
{
"epoch": 1.7217928902627513,
"grad_norm": 0.0018158459570258856,
"learning_rate": 0.000426069036579083,
"loss": 0.8935,
"step": 5570
},
{
"epoch": 1.7248840803709427,
"grad_norm": 0.0017615389078855515,
"learning_rate": 0.0004250386398763524,
"loss": 0.8964,
"step": 5580
},
{
"epoch": 1.7279752704791345,
"grad_norm": 0.0018352493643760681,
"learning_rate": 0.00042400824317362183,
"loss": 0.9344,
"step": 5590
},
{
"epoch": 1.7310664605873263,
"grad_norm": 0.0015487250639125705,
"learning_rate": 0.0004229778464708913,
"loss": 0.888,
"step": 5600
},
{
"epoch": 1.7341576506955176,
"grad_norm": 0.00184920453466475,
"learning_rate": 0.00042194744976816075,
"loss": 0.866,
"step": 5610
},
{
"epoch": 1.7372488408037094,
"grad_norm": 0.0018842272693291306,
"learning_rate": 0.00042091705306543023,
"loss": 0.9157,
"step": 5620
},
{
"epoch": 1.740340030911901,
"grad_norm": 0.001251103589311242,
"learning_rate": 0.0004198866563626996,
"loss": 0.8774,
"step": 5630
},
{
"epoch": 1.7434312210200926,
"grad_norm": 0.0017979164840653539,
"learning_rate": 0.0004188562596599691,
"loss": 0.9327,
"step": 5640
},
{
"epoch": 1.7465224111282844,
"grad_norm": 0.003756187856197357,
"learning_rate": 0.0004178258629572386,
"loss": 0.898,
"step": 5650
},
{
"epoch": 1.749613601236476,
"grad_norm": 0.0025360011495649815,
"learning_rate": 0.000416795466254508,
"loss": 0.9021,
"step": 5660
},
{
"epoch": 1.7527047913446676,
"grad_norm": 0.0015577342128381133,
"learning_rate": 0.00041576506955177744,
"loss": 0.8972,
"step": 5670
},
{
"epoch": 1.7557959814528594,
"grad_norm": 0.002707903040573001,
"learning_rate": 0.00041473467284904687,
"loss": 0.8647,
"step": 5680
},
{
"epoch": 1.758887171561051,
"grad_norm": 0.0017573883524164557,
"learning_rate": 0.00041370427614631635,
"loss": 0.9007,
"step": 5690
},
{
"epoch": 1.7619783616692426,
"grad_norm": 0.002097573596984148,
"learning_rate": 0.0004126738794435858,
"loss": 0.865,
"step": 5700
},
{
"epoch": 1.7650695517774344,
"grad_norm": 0.0018730917945504189,
"learning_rate": 0.0004116434827408552,
"loss": 0.9073,
"step": 5710
},
{
"epoch": 1.768160741885626,
"grad_norm": 0.0017573771765455604,
"learning_rate": 0.0004106130860381247,
"loss": 0.9246,
"step": 5720
},
{
"epoch": 1.7712519319938176,
"grad_norm": 0.0017129804473370314,
"learning_rate": 0.00040958268933539413,
"loss": 0.93,
"step": 5730
},
{
"epoch": 1.7743431221020094,
"grad_norm": 0.0012526778737083077,
"learning_rate": 0.00040855229263266356,
"loss": 0.8911,
"step": 5740
},
{
"epoch": 1.7774343122102008,
"grad_norm": 0.002290197880938649,
"learning_rate": 0.00040752189592993304,
"loss": 0.9049,
"step": 5750
},
{
"epoch": 1.7805255023183926,
"grad_norm": 0.001954218838363886,
"learning_rate": 0.0004064914992272025,
"loss": 0.8975,
"step": 5760
},
{
"epoch": 1.7836166924265844,
"grad_norm": 0.002614745870232582,
"learning_rate": 0.00040546110252447196,
"loss": 0.8844,
"step": 5770
},
{
"epoch": 1.7867078825347757,
"grad_norm": 0.0014066528528928757,
"learning_rate": 0.00040443070582174133,
"loss": 0.8726,
"step": 5780
},
{
"epoch": 1.7897990726429676,
"grad_norm": 0.0013754137326031923,
"learning_rate": 0.0004034003091190108,
"loss": 0.9034,
"step": 5790
},
{
"epoch": 1.7928902627511591,
"grad_norm": 0.0014499702956527472,
"learning_rate": 0.0004023699124162803,
"loss": 0.8515,
"step": 5800
},
{
"epoch": 1.7959814528593507,
"grad_norm": 0.0015088847139850259,
"learning_rate": 0.00040133951571354973,
"loss": 0.9027,
"step": 5810
},
{
"epoch": 1.7990726429675425,
"grad_norm": 0.0013973376480862498,
"learning_rate": 0.00040030911901081916,
"loss": 0.8943,
"step": 5820
},
{
"epoch": 1.8021638330757341,
"grad_norm": 0.0014548856997862458,
"learning_rate": 0.00039927872230808865,
"loss": 0.9106,
"step": 5830
},
{
"epoch": 1.8052550231839257,
"grad_norm": 0.0017355557065457106,
"learning_rate": 0.0003982483256053581,
"loss": 0.8727,
"step": 5840
},
{
"epoch": 1.8083462132921175,
"grad_norm": 0.0021262154914438725,
"learning_rate": 0.0003972179289026275,
"loss": 0.8824,
"step": 5850
},
{
"epoch": 1.8114374034003091,
"grad_norm": 0.001427137991413474,
"learning_rate": 0.00039618753219989694,
"loss": 0.9374,
"step": 5860
},
{
"epoch": 1.8145285935085007,
"grad_norm": 0.0016721707070246339,
"learning_rate": 0.0003951571354971664,
"loss": 0.919,
"step": 5870
},
{
"epoch": 1.8176197836166925,
"grad_norm": 0.0017290489049628377,
"learning_rate": 0.0003941267387944359,
"loss": 0.9544,
"step": 5880
},
{
"epoch": 1.820710973724884,
"grad_norm": 0.001801205798983574,
"learning_rate": 0.0003930963420917053,
"loss": 0.8656,
"step": 5890
},
{
"epoch": 1.8238021638330757,
"grad_norm": 0.0016462091589346528,
"learning_rate": 0.00039206594538897477,
"loss": 0.8515,
"step": 5900
},
{
"epoch": 1.8268933539412675,
"grad_norm": 0.001921969000250101,
"learning_rate": 0.0003910355486862442,
"loss": 0.8472,
"step": 5910
},
{
"epoch": 1.829984544049459,
"grad_norm": 0.0016511150170117617,
"learning_rate": 0.0003900051519835137,
"loss": 0.8858,
"step": 5920
},
{
"epoch": 1.8330757341576507,
"grad_norm": 0.0014706592774018645,
"learning_rate": 0.0003889747552807831,
"loss": 0.8983,
"step": 5930
},
{
"epoch": 1.8361669242658425,
"grad_norm": 0.0024984250776469707,
"learning_rate": 0.00038794435857805254,
"loss": 0.9051,
"step": 5940
},
{
"epoch": 1.8392581143740339,
"grad_norm": 0.001705004251562059,
"learning_rate": 0.00038691396187532203,
"loss": 0.902,
"step": 5950
},
{
"epoch": 1.8423493044822257,
"grad_norm": 0.0019023518543690443,
"learning_rate": 0.00038588356517259146,
"loss": 0.8798,
"step": 5960
},
{
"epoch": 1.8454404945904173,
"grad_norm": 0.0018084270413964987,
"learning_rate": 0.0003848531684698609,
"loss": 0.9092,
"step": 5970
},
{
"epoch": 1.8485316846986088,
"grad_norm": 0.002029530005529523,
"learning_rate": 0.00038382277176713037,
"loss": 0.9116,
"step": 5980
},
{
"epoch": 1.8516228748068007,
"grad_norm": 0.0018030694918707013,
"learning_rate": 0.0003827923750643998,
"loss": 0.9126,
"step": 5990
},
{
"epoch": 1.8547140649149922,
"grad_norm": 0.0018464057939127088,
"learning_rate": 0.00038176197836166923,
"loss": 0.8679,
"step": 6000
},
{
"epoch": 1.8578052550231838,
"grad_norm": 0.0018209113040938973,
"learning_rate": 0.00038073158165893866,
"loss": 0.9302,
"step": 6010
},
{
"epoch": 1.8608964451313756,
"grad_norm": 0.001530204783193767,
"learning_rate": 0.00037970118495620815,
"loss": 0.8936,
"step": 6020
},
{
"epoch": 1.8639876352395672,
"grad_norm": 0.0017929489258676767,
"learning_rate": 0.00037867078825347763,
"loss": 0.8799,
"step": 6030
},
{
"epoch": 1.8670788253477588,
"grad_norm": 0.0020685286726802588,
"learning_rate": 0.000377640391550747,
"loss": 0.9022,
"step": 6040
},
{
"epoch": 1.8701700154559506,
"grad_norm": 0.0016937406035140157,
"learning_rate": 0.0003766099948480165,
"loss": 0.922,
"step": 6050
},
{
"epoch": 1.8732612055641422,
"grad_norm": 0.001672919373959303,
"learning_rate": 0.0003755795981452859,
"loss": 0.8984,
"step": 6060
},
{
"epoch": 1.8763523956723338,
"grad_norm": 0.0020905956625938416,
"learning_rate": 0.0003745492014425554,
"loss": 0.9168,
"step": 6070
},
{
"epoch": 1.8794435857805256,
"grad_norm": 0.0016705166781321168,
"learning_rate": 0.00037351880473982484,
"loss": 0.9095,
"step": 6080
},
{
"epoch": 1.8825347758887172,
"grad_norm": 0.002511728322133422,
"learning_rate": 0.00037248840803709427,
"loss": 0.9591,
"step": 6090
},
{
"epoch": 1.8856259659969088,
"grad_norm": 0.0016106483526527882,
"learning_rate": 0.00037145801133436375,
"loss": 0.9133,
"step": 6100
},
{
"epoch": 1.8887171561051006,
"grad_norm": 0.0018942320020869374,
"learning_rate": 0.00037042761463163324,
"loss": 0.9216,
"step": 6110
},
{
"epoch": 1.891808346213292,
"grad_norm": 0.0014648385113105178,
"learning_rate": 0.0003693972179289026,
"loss": 0.854,
"step": 6120
},
{
"epoch": 1.8948995363214838,
"grad_norm": 0.0018195216543972492,
"learning_rate": 0.0003683668212261721,
"loss": 0.9546,
"step": 6130
},
{
"epoch": 1.8979907264296756,
"grad_norm": 0.0016678489046171308,
"learning_rate": 0.00036733642452344153,
"loss": 0.882,
"step": 6140
},
{
"epoch": 1.901081916537867,
"grad_norm": 0.0018015914829447865,
"learning_rate": 0.000366306027820711,
"loss": 0.9164,
"step": 6150
},
{
"epoch": 1.9041731066460588,
"grad_norm": 0.0018244273960590363,
"learning_rate": 0.00036527563111798044,
"loss": 0.9182,
"step": 6160
},
{
"epoch": 1.9072642967542504,
"grad_norm": 0.002539639361202717,
"learning_rate": 0.00036424523441524987,
"loss": 0.8878,
"step": 6170
},
{
"epoch": 1.910355486862442,
"grad_norm": 0.0017704921774566174,
"learning_rate": 0.00036321483771251936,
"loss": 0.8888,
"step": 6180
},
{
"epoch": 1.9134466769706338,
"grad_norm": 0.0023106811568140984,
"learning_rate": 0.00036218444100978873,
"loss": 0.9106,
"step": 6190
},
{
"epoch": 1.9165378670788253,
"grad_norm": 0.0019237243104726076,
"learning_rate": 0.0003611540443070582,
"loss": 0.8905,
"step": 6200
},
{
"epoch": 1.919629057187017,
"grad_norm": 0.0014340688940137625,
"learning_rate": 0.0003601236476043277,
"loss": 0.8845,
"step": 6210
},
{
"epoch": 1.9227202472952087,
"grad_norm": 0.0015864827437326312,
"learning_rate": 0.00035909325090159713,
"loss": 0.8996,
"step": 6220
},
{
"epoch": 1.9258114374034003,
"grad_norm": 0.0015774049097672105,
"learning_rate": 0.00035806285419886656,
"loss": 0.9557,
"step": 6230
},
{
"epoch": 1.928902627511592,
"grad_norm": 0.0022414042614400387,
"learning_rate": 0.000357032457496136,
"loss": 0.8609,
"step": 6240
},
{
"epoch": 1.9319938176197837,
"grad_norm": 0.001590002211742103,
"learning_rate": 0.0003560020607934055,
"loss": 0.9381,
"step": 6250
},
{
"epoch": 1.9350850077279753,
"grad_norm": 0.002488743746653199,
"learning_rate": 0.00035497166409067496,
"loss": 0.9132,
"step": 6260
},
{
"epoch": 1.938176197836167,
"grad_norm": 0.0019566200207918882,
"learning_rate": 0.00035394126738794434,
"loss": 0.9023,
"step": 6270
},
{
"epoch": 1.9412673879443587,
"grad_norm": 0.0015884449239820242,
"learning_rate": 0.0003529108706852138,
"loss": 0.9132,
"step": 6280
},
{
"epoch": 1.94435857805255,
"grad_norm": 0.002089353743940592,
"learning_rate": 0.00035188047398248325,
"loss": 0.8855,
"step": 6290
},
{
"epoch": 1.947449768160742,
"grad_norm": 0.0014843277167528868,
"learning_rate": 0.00035085007727975274,
"loss": 0.898,
"step": 6300
},
{
"epoch": 1.9505409582689337,
"grad_norm": 0.0015669453423470259,
"learning_rate": 0.00034981968057702217,
"loss": 0.9371,
"step": 6310
},
{
"epoch": 1.953632148377125,
"grad_norm": 0.0016016702866181731,
"learning_rate": 0.0003487892838742916,
"loss": 0.9119,
"step": 6320
},
{
"epoch": 1.9567233384853169,
"grad_norm": 0.0017695052083581686,
"learning_rate": 0.0003477588871715611,
"loss": 0.8987,
"step": 6330
},
{
"epoch": 1.9598145285935085,
"grad_norm": 0.00243277451954782,
"learning_rate": 0.00034672849046883046,
"loss": 0.9257,
"step": 6340
},
{
"epoch": 1.9629057187017,
"grad_norm": 0.0014211182715371251,
"learning_rate": 0.00034569809376609994,
"loss": 0.8801,
"step": 6350
},
{
"epoch": 1.9659969088098919,
"grad_norm": 0.0024740241933614016,
"learning_rate": 0.0003446676970633694,
"loss": 0.8899,
"step": 6360
},
{
"epoch": 1.9690880989180835,
"grad_norm": 0.001807063934393227,
"learning_rate": 0.00034363730036063886,
"loss": 0.8971,
"step": 6370
},
{
"epoch": 1.972179289026275,
"grad_norm": 0.0013645980507135391,
"learning_rate": 0.0003426069036579083,
"loss": 0.9323,
"step": 6380
},
{
"epoch": 1.9752704791344669,
"grad_norm": 0.0015155840665102005,
"learning_rate": 0.0003415765069551777,
"loss": 0.8874,
"step": 6390
},
{
"epoch": 1.9783616692426584,
"grad_norm": 0.0016512033762410283,
"learning_rate": 0.0003405461102524472,
"loss": 0.8489,
"step": 6400
},
{
"epoch": 1.98145285935085,
"grad_norm": 0.001505164080299437,
"learning_rate": 0.0003395157135497167,
"loss": 0.9208,
"step": 6410
},
{
"epoch": 1.9845440494590418,
"grad_norm": 0.0018190627451986074,
"learning_rate": 0.00033848531684698606,
"loss": 0.9155,
"step": 6420
},
{
"epoch": 1.9876352395672334,
"grad_norm": 0.0019098619231954217,
"learning_rate": 0.00033745492014425555,
"loss": 0.913,
"step": 6430
},
{
"epoch": 1.990726429675425,
"grad_norm": 0.0015993445413187146,
"learning_rate": 0.00033642452344152503,
"loss": 0.8934,
"step": 6440
},
{
"epoch": 1.9938176197836168,
"grad_norm": 0.0017613953677937388,
"learning_rate": 0.00033539412673879446,
"loss": 0.8946,
"step": 6450
},
{
"epoch": 1.9969088098918082,
"grad_norm": 0.0015573868295177817,
"learning_rate": 0.0003343637300360639,
"loss": 0.9159,
"step": 6460
},
{
"epoch": 2.0,
"grad_norm": 0.0019827033393085003,
"learning_rate": 0.0003333333333333333,
"loss": 0.9071,
"step": 6470
},
{
"epoch": 2.003091190108192,
"grad_norm": 0.001513644470833242,
"learning_rate": 0.0003323029366306028,
"loss": 0.9439,
"step": 6480
},
{
"epoch": 2.006182380216383,
"grad_norm": 0.001435752958059311,
"learning_rate": 0.00033127253992787224,
"loss": 0.9027,
"step": 6490
},
{
"epoch": 2.009273570324575,
"grad_norm": 0.0016444657230749726,
"learning_rate": 0.00033024214322514167,
"loss": 0.8816,
"step": 6500
},
{
"epoch": 2.012364760432767,
"grad_norm": 0.0021295547485351562,
"learning_rate": 0.00032921174652241115,
"loss": 0.9254,
"step": 6510
},
{
"epoch": 2.015455950540958,
"grad_norm": 0.001806983258575201,
"learning_rate": 0.0003281813498196806,
"loss": 0.9273,
"step": 6520
},
{
"epoch": 2.01854714064915,
"grad_norm": 0.0016168680740520358,
"learning_rate": 0.00032715095311695,
"loss": 0.9036,
"step": 6530
},
{
"epoch": 2.021638330757342,
"grad_norm": 0.0014968597097322345,
"learning_rate": 0.0003261205564142195,
"loss": 0.8532,
"step": 6540
},
{
"epoch": 2.024729520865533,
"grad_norm": 0.0019258251413702965,
"learning_rate": 0.0003250901597114889,
"loss": 0.9347,
"step": 6550
},
{
"epoch": 2.027820710973725,
"grad_norm": 0.0017398808849975467,
"learning_rate": 0.0003240597630087584,
"loss": 0.8724,
"step": 6560
},
{
"epoch": 2.0309119010819163,
"grad_norm": 0.001705456175841391,
"learning_rate": 0.0003230293663060278,
"loss": 0.9058,
"step": 6570
},
{
"epoch": 2.034003091190108,
"grad_norm": 0.0018997775623574853,
"learning_rate": 0.00032199896960329727,
"loss": 0.9097,
"step": 6580
},
{
"epoch": 2.0370942812983,
"grad_norm": 0.001412282115779817,
"learning_rate": 0.00032096857290056676,
"loss": 0.8857,
"step": 6590
},
{
"epoch": 2.0401854714064913,
"grad_norm": 0.0014306355733424425,
"learning_rate": 0.0003199381761978362,
"loss": 0.91,
"step": 6600
},
{
"epoch": 2.043276661514683,
"grad_norm": 0.001689639175310731,
"learning_rate": 0.0003189077794951056,
"loss": 0.9038,
"step": 6610
},
{
"epoch": 2.046367851622875,
"grad_norm": 0.0015731025487184525,
"learning_rate": 0.00031787738279237505,
"loss": 0.913,
"step": 6620
},
{
"epoch": 2.0494590417310663,
"grad_norm": 0.0014561581192538142,
"learning_rate": 0.00031684698608964453,
"loss": 0.8767,
"step": 6630
},
{
"epoch": 2.052550231839258,
"grad_norm": 0.0017503297422081232,
"learning_rate": 0.00031581658938691396,
"loss": 0.917,
"step": 6640
},
{
"epoch": 2.05564142194745,
"grad_norm": 0.00222258223220706,
"learning_rate": 0.0003147861926841834,
"loss": 0.8869,
"step": 6650
},
{
"epoch": 2.0587326120556413,
"grad_norm": 0.0023604268208146095,
"learning_rate": 0.0003137557959814529,
"loss": 0.9149,
"step": 6660
},
{
"epoch": 2.061823802163833,
"grad_norm": 0.0023476933129131794,
"learning_rate": 0.0003127253992787223,
"loss": 0.949,
"step": 6670
},
{
"epoch": 2.064914992272025,
"grad_norm": 0.0021014835219830275,
"learning_rate": 0.00031169500257599174,
"loss": 0.9374,
"step": 6680
},
{
"epoch": 2.0680061823802163,
"grad_norm": 0.001549664419144392,
"learning_rate": 0.0003106646058732612,
"loss": 0.9189,
"step": 6690
},
{
"epoch": 2.071097372488408,
"grad_norm": 0.0018286675913259387,
"learning_rate": 0.00030963420917053065,
"loss": 0.9028,
"step": 6700
},
{
"epoch": 2.0741885625966,
"grad_norm": 0.0018017146503552794,
"learning_rate": 0.00030860381246780014,
"loss": 0.8899,
"step": 6710
},
{
"epoch": 2.0772797527047913,
"grad_norm": 0.001875316142104566,
"learning_rate": 0.0003075734157650695,
"loss": 0.9123,
"step": 6720
},
{
"epoch": 2.080370942812983,
"grad_norm": 0.0018925000913441181,
"learning_rate": 0.000306543019062339,
"loss": 0.9139,
"step": 6730
},
{
"epoch": 2.0834621329211744,
"grad_norm": 0.0015832912176847458,
"learning_rate": 0.0003055126223596085,
"loss": 0.9131,
"step": 6740
},
{
"epoch": 2.0865533230293662,
"grad_norm": 0.0018890087958425283,
"learning_rate": 0.0003044822256568779,
"loss": 0.8798,
"step": 6750
},
{
"epoch": 2.089644513137558,
"grad_norm": 0.0020662271417677402,
"learning_rate": 0.00030345182895414734,
"loss": 0.9027,
"step": 6760
},
{
"epoch": 2.0927357032457494,
"grad_norm": 0.001351105165667832,
"learning_rate": 0.0003024214322514168,
"loss": 0.8641,
"step": 6770
},
{
"epoch": 2.0958268933539412,
"grad_norm": 0.001607073936611414,
"learning_rate": 0.00030139103554868626,
"loss": 0.8716,
"step": 6780
},
{
"epoch": 2.098918083462133,
"grad_norm": 0.0022712023928761482,
"learning_rate": 0.0003003606388459557,
"loss": 0.9286,
"step": 6790
},
{
"epoch": 2.1020092735703244,
"grad_norm": 0.0019429827807471156,
"learning_rate": 0.0002993302421432251,
"loss": 0.9157,
"step": 6800
},
{
"epoch": 2.105100463678516,
"grad_norm": 0.0023945835418999195,
"learning_rate": 0.0002982998454404946,
"loss": 0.9332,
"step": 6810
},
{
"epoch": 2.108191653786708,
"grad_norm": 0.0017035908531397581,
"learning_rate": 0.0002972694487377641,
"loss": 0.8984,
"step": 6820
},
{
"epoch": 2.1112828438948994,
"grad_norm": 0.001636037020944059,
"learning_rate": 0.00029623905203503346,
"loss": 0.8535,
"step": 6830
},
{
"epoch": 2.114374034003091,
"grad_norm": 0.0015807118033990264,
"learning_rate": 0.00029520865533230295,
"loss": 0.8537,
"step": 6840
},
{
"epoch": 2.117465224111283,
"grad_norm": 0.0019097881158813834,
"learning_rate": 0.0002941782586295724,
"loss": 0.9224,
"step": 6850
},
{
"epoch": 2.1205564142194744,
"grad_norm": 0.002146846381947398,
"learning_rate": 0.00029314786192684186,
"loss": 0.9206,
"step": 6860
},
{
"epoch": 2.123647604327666,
"grad_norm": 0.0016978083876892924,
"learning_rate": 0.0002921174652241113,
"loss": 0.9441,
"step": 6870
},
{
"epoch": 2.126738794435858,
"grad_norm": 0.0015323269180953503,
"learning_rate": 0.0002910870685213807,
"loss": 0.9336,
"step": 6880
},
{
"epoch": 2.1298299845440494,
"grad_norm": 0.0014333493309095502,
"learning_rate": 0.0002900566718186502,
"loss": 0.9192,
"step": 6890
},
{
"epoch": 2.132921174652241,
"grad_norm": 0.0017516023945063353,
"learning_rate": 0.00028902627511591964,
"loss": 0.8886,
"step": 6900
},
{
"epoch": 2.1360123647604325,
"grad_norm": 0.0013508939882740378,
"learning_rate": 0.00028799587841318907,
"loss": 0.8788,
"step": 6910
},
{
"epoch": 2.1391035548686244,
"grad_norm": 0.002034724224358797,
"learning_rate": 0.00028696548171045855,
"loss": 0.8884,
"step": 6920
},
{
"epoch": 2.142194744976816,
"grad_norm": 0.0014981675194576383,
"learning_rate": 0.000285935085007728,
"loss": 0.9187,
"step": 6930
},
{
"epoch": 2.1452859350850075,
"grad_norm": 0.002125379629433155,
"learning_rate": 0.00028490468830499747,
"loss": 0.8873,
"step": 6940
},
{
"epoch": 2.1483771251931993,
"grad_norm": 0.001538197393529117,
"learning_rate": 0.00028387429160226684,
"loss": 0.9144,
"step": 6950
},
{
"epoch": 2.151468315301391,
"grad_norm": 0.002346182242035866,
"learning_rate": 0.0002828438948995363,
"loss": 0.9295,
"step": 6960
},
{
"epoch": 2.1545595054095825,
"grad_norm": 0.0015058065764606,
"learning_rate": 0.0002818134981968058,
"loss": 0.857,
"step": 6970
},
{
"epoch": 2.1576506955177743,
"grad_norm": 0.0017613341333344579,
"learning_rate": 0.0002807831014940752,
"loss": 0.8723,
"step": 6980
},
{
"epoch": 2.160741885625966,
"grad_norm": 0.0021604488138109446,
"learning_rate": 0.00027975270479134467,
"loss": 0.934,
"step": 6990
},
{
"epoch": 2.1638330757341575,
"grad_norm": 0.0017267893999814987,
"learning_rate": 0.0002787223080886141,
"loss": 0.8985,
"step": 7000
},
{
"epoch": 2.1669242658423493,
"grad_norm": 0.0014519842807203531,
"learning_rate": 0.0002776919113858836,
"loss": 0.9185,
"step": 7010
},
{
"epoch": 2.170015455950541,
"grad_norm": 0.0023860172368586063,
"learning_rate": 0.000276661514683153,
"loss": 0.9081,
"step": 7020
},
{
"epoch": 2.1731066460587325,
"grad_norm": 0.0017903451807796955,
"learning_rate": 0.00027563111798042245,
"loss": 0.9024,
"step": 7030
},
{
"epoch": 2.1761978361669243,
"grad_norm": 0.0015208751428872347,
"learning_rate": 0.00027460072127769193,
"loss": 0.8988,
"step": 7040
},
{
"epoch": 2.179289026275116,
"grad_norm": 0.0019341211300343275,
"learning_rate": 0.0002735703245749614,
"loss": 0.9378,
"step": 7050
},
{
"epoch": 2.1823802163833075,
"grad_norm": 0.0016929497942328453,
"learning_rate": 0.0002725399278722308,
"loss": 0.9231,
"step": 7060
},
{
"epoch": 2.1854714064914993,
"grad_norm": 0.0015913585666567087,
"learning_rate": 0.0002715095311695003,
"loss": 0.8959,
"step": 7070
},
{
"epoch": 2.1885625965996907,
"grad_norm": 0.001789154834114015,
"learning_rate": 0.0002704791344667697,
"loss": 0.9459,
"step": 7080
},
{
"epoch": 2.1916537867078825,
"grad_norm": 0.00176356197334826,
"learning_rate": 0.0002694487377640392,
"loss": 0.915,
"step": 7090
},
{
"epoch": 2.1947449768160743,
"grad_norm": 0.0034130492713302374,
"learning_rate": 0.0002684183410613086,
"loss": 0.8311,
"step": 7100
},
{
"epoch": 2.1978361669242656,
"grad_norm": 0.001789650646969676,
"learning_rate": 0.00026738794435857805,
"loss": 0.9061,
"step": 7110
},
{
"epoch": 2.2009273570324575,
"grad_norm": 0.0015091504901647568,
"learning_rate": 0.00026635754765584754,
"loss": 0.882,
"step": 7120
},
{
"epoch": 2.2040185471406493,
"grad_norm": 0.0016386975767090917,
"learning_rate": 0.0002653271509531169,
"loss": 0.8652,
"step": 7130
},
{
"epoch": 2.2071097372488406,
"grad_norm": 0.0018872515065595508,
"learning_rate": 0.0002642967542503864,
"loss": 0.9064,
"step": 7140
},
{
"epoch": 2.2102009273570324,
"grad_norm": 0.0016174730844795704,
"learning_rate": 0.0002632663575476559,
"loss": 0.8615,
"step": 7150
},
{
"epoch": 2.2132921174652243,
"grad_norm": 0.0023867471609264612,
"learning_rate": 0.0002622359608449253,
"loss": 0.903,
"step": 7160
},
{
"epoch": 2.2163833075734156,
"grad_norm": 0.0018768367590382695,
"learning_rate": 0.00026120556414219474,
"loss": 0.8828,
"step": 7170
},
{
"epoch": 2.2194744976816074,
"grad_norm": 0.0019271258497610688,
"learning_rate": 0.00026017516743946417,
"loss": 0.8861,
"step": 7180
},
{
"epoch": 2.2225656877897992,
"grad_norm": 0.001598935341462493,
"learning_rate": 0.00025914477073673366,
"loss": 0.8821,
"step": 7190
},
{
"epoch": 2.2256568778979906,
"grad_norm": 0.002534502651542425,
"learning_rate": 0.00025811437403400314,
"loss": 0.923,
"step": 7200
},
{
"epoch": 2.2287480680061824,
"grad_norm": 0.001254307571798563,
"learning_rate": 0.0002570839773312725,
"loss": 0.8744,
"step": 7210
},
{
"epoch": 2.2318392581143742,
"grad_norm": 0.0013876528246328235,
"learning_rate": 0.000256053580628542,
"loss": 0.909,
"step": 7220
},
{
"epoch": 2.2349304482225656,
"grad_norm": 0.0030744040850549936,
"learning_rate": 0.00025502318392581143,
"loss": 0.9358,
"step": 7230
},
{
"epoch": 2.2380216383307574,
"grad_norm": 0.001663259114138782,
"learning_rate": 0.0002539927872230809,
"loss": 0.9078,
"step": 7240
},
{
"epoch": 2.2411128284389488,
"grad_norm": 0.00184043834451586,
"learning_rate": 0.00025296239052035035,
"loss": 0.8703,
"step": 7250
},
{
"epoch": 2.2442040185471406,
"grad_norm": 0.002181377960368991,
"learning_rate": 0.0002519319938176198,
"loss": 0.8823,
"step": 7260
},
{
"epoch": 2.2472952086553324,
"grad_norm": 0.0015466611366719007,
"learning_rate": 0.00025090159711488926,
"loss": 0.8792,
"step": 7270
},
{
"epoch": 2.250386398763524,
"grad_norm": 0.002074003452435136,
"learning_rate": 0.0002498712004121587,
"loss": 0.9361,
"step": 7280
},
{
"epoch": 2.2534775888717156,
"grad_norm": 0.0014814219903200865,
"learning_rate": 0.0002488408037094281,
"loss": 0.8684,
"step": 7290
},
{
"epoch": 2.2565687789799074,
"grad_norm": 0.0022694983053952456,
"learning_rate": 0.00024781040700669755,
"loss": 0.8592,
"step": 7300
},
{
"epoch": 2.2596599690880987,
"grad_norm": 0.0019474639557301998,
"learning_rate": 0.00024678001030396704,
"loss": 0.8866,
"step": 7310
},
{
"epoch": 2.2627511591962906,
"grad_norm": 0.0018335517961531878,
"learning_rate": 0.00024574961360123647,
"loss": 0.9352,
"step": 7320
},
{
"epoch": 2.2658423493044824,
"grad_norm": 0.001545943901874125,
"learning_rate": 0.00024471921689850595,
"loss": 0.868,
"step": 7330
},
{
"epoch": 2.2689335394126737,
"grad_norm": 0.0019200016977265477,
"learning_rate": 0.00024368882019577538,
"loss": 0.9414,
"step": 7340
},
{
"epoch": 2.2720247295208655,
"grad_norm": 0.002204937394708395,
"learning_rate": 0.0002426584234930448,
"loss": 0.919,
"step": 7350
},
{
"epoch": 2.2751159196290573,
"grad_norm": 0.002049383707344532,
"learning_rate": 0.0002416280267903143,
"loss": 0.9024,
"step": 7360
},
{
"epoch": 2.2782071097372487,
"grad_norm": 0.0016608345322310925,
"learning_rate": 0.00024059763008758373,
"loss": 0.9112,
"step": 7370
},
{
"epoch": 2.2812982998454405,
"grad_norm": 0.0020049915183335543,
"learning_rate": 0.00023956723338485318,
"loss": 0.9082,
"step": 7380
},
{
"epoch": 2.2843894899536323,
"grad_norm": 0.0017916331999003887,
"learning_rate": 0.0002385368366821226,
"loss": 0.9242,
"step": 7390
},
{
"epoch": 2.2874806800618237,
"grad_norm": 0.0015080280136317015,
"learning_rate": 0.00023750643997939207,
"loss": 0.9115,
"step": 7400
},
{
"epoch": 2.2905718701700155,
"grad_norm": 0.00174785649869591,
"learning_rate": 0.00023647604327666153,
"loss": 0.8643,
"step": 7410
},
{
"epoch": 2.293663060278207,
"grad_norm": 0.002300349297001958,
"learning_rate": 0.00023544564657393096,
"loss": 0.9215,
"step": 7420
},
{
"epoch": 2.2967542503863987,
"grad_norm": 0.001717501669190824,
"learning_rate": 0.00023441524987120042,
"loss": 0.881,
"step": 7430
},
{
"epoch": 2.2998454404945905,
"grad_norm": 0.0015334953786805272,
"learning_rate": 0.00023338485316846985,
"loss": 0.9106,
"step": 7440
},
{
"epoch": 2.3029366306027823,
"grad_norm": 0.00180353585164994,
"learning_rate": 0.00023235445646573933,
"loss": 0.9088,
"step": 7450
},
{
"epoch": 2.3060278207109737,
"grad_norm": 0.0016415161080658436,
"learning_rate": 0.00023132405976300876,
"loss": 0.8925,
"step": 7460
},
{
"epoch": 2.3091190108191655,
"grad_norm": 0.002030453644692898,
"learning_rate": 0.00023029366306027822,
"loss": 0.8819,
"step": 7470
},
{
"epoch": 2.312210200927357,
"grad_norm": 0.0019409249071031809,
"learning_rate": 0.00022926326635754765,
"loss": 0.867,
"step": 7480
},
{
"epoch": 2.3153013910355487,
"grad_norm": 0.0034679800737649202,
"learning_rate": 0.0002282328696548171,
"loss": 0.8954,
"step": 7490
},
{
"epoch": 2.3183925811437405,
"grad_norm": 0.0020659409929066896,
"learning_rate": 0.00022720247295208656,
"loss": 0.9357,
"step": 7500
},
{
"epoch": 2.321483771251932,
"grad_norm": 0.0021641727071255445,
"learning_rate": 0.00022617207624935602,
"loss": 0.8911,
"step": 7510
},
{
"epoch": 2.3245749613601236,
"grad_norm": 0.0016261821147054434,
"learning_rate": 0.00022514167954662545,
"loss": 0.9102,
"step": 7520
},
{
"epoch": 2.3276661514683155,
"grad_norm": 0.0018004965968430042,
"learning_rate": 0.0002241112828438949,
"loss": 0.8924,
"step": 7530
},
{
"epoch": 2.330757341576507,
"grad_norm": 0.0016102648805826902,
"learning_rate": 0.00022308088614116434,
"loss": 0.8812,
"step": 7540
},
{
"epoch": 2.3338485316846986,
"grad_norm": 0.0018460742430761456,
"learning_rate": 0.00022205048943843382,
"loss": 0.8935,
"step": 7550
},
{
"epoch": 2.3369397217928904,
"grad_norm": 0.0018313312903046608,
"learning_rate": 0.00022102009273570325,
"loss": 0.8892,
"step": 7560
},
{
"epoch": 2.340030911901082,
"grad_norm": 0.0018456524703651667,
"learning_rate": 0.0002199896960329727,
"loss": 0.8874,
"step": 7570
},
{
"epoch": 2.3431221020092736,
"grad_norm": 0.0028236303478479385,
"learning_rate": 0.00021895929933024214,
"loss": 0.9291,
"step": 7580
},
{
"epoch": 2.346213292117465,
"grad_norm": 0.0012284901458770037,
"learning_rate": 0.00021792890262751157,
"loss": 0.8771,
"step": 7590
},
{
"epoch": 2.349304482225657,
"grad_norm": 0.0018673281883820891,
"learning_rate": 0.00021689850592478106,
"loss": 0.8804,
"step": 7600
},
{
"epoch": 2.3523956723338486,
"grad_norm": 0.0013668534811586142,
"learning_rate": 0.00021586810922205049,
"loss": 0.9415,
"step": 7610
},
{
"epoch": 2.3554868624420404,
"grad_norm": 0.0014791954308748245,
"learning_rate": 0.00021483771251931994,
"loss": 0.9123,
"step": 7620
},
{
"epoch": 2.358578052550232,
"grad_norm": 0.0016408261144533753,
"learning_rate": 0.00021380731581658937,
"loss": 0.9296,
"step": 7630
},
{
"epoch": 2.3616692426584236,
"grad_norm": 0.0016208128072321415,
"learning_rate": 0.00021277691911385886,
"loss": 0.8988,
"step": 7640
},
{
"epoch": 2.364760432766615,
"grad_norm": 0.0016079987399280071,
"learning_rate": 0.0002117465224111283,
"loss": 0.8697,
"step": 7650
},
{
"epoch": 2.3678516228748068,
"grad_norm": 0.0018998971208930016,
"learning_rate": 0.00021071612570839775,
"loss": 0.8971,
"step": 7660
},
{
"epoch": 2.3709428129829986,
"grad_norm": 0.0013807121431455016,
"learning_rate": 0.00020968572900566718,
"loss": 0.9038,
"step": 7670
},
{
"epoch": 2.37403400309119,
"grad_norm": 0.0016093014273792505,
"learning_rate": 0.00020865533230293663,
"loss": 0.8853,
"step": 7680
},
{
"epoch": 2.3771251931993818,
"grad_norm": 0.001660670735873282,
"learning_rate": 0.0002076249356002061,
"loss": 0.8867,
"step": 7690
},
{
"epoch": 2.3802163833075736,
"grad_norm": 0.001987049588933587,
"learning_rate": 0.00020659453889747555,
"loss": 0.9064,
"step": 7700
},
{
"epoch": 2.383307573415765,
"grad_norm": 0.0014046692522242665,
"learning_rate": 0.00020556414219474498,
"loss": 0.9145,
"step": 7710
},
{
"epoch": 2.3863987635239567,
"grad_norm": 0.0018706049304455519,
"learning_rate": 0.00020453374549201444,
"loss": 0.8975,
"step": 7720
},
{
"epoch": 2.3894899536321486,
"grad_norm": 0.001406969386152923,
"learning_rate": 0.00020350334878928387,
"loss": 0.9143,
"step": 7730
},
{
"epoch": 2.39258114374034,
"grad_norm": 0.001984959002584219,
"learning_rate": 0.00020247295208655332,
"loss": 0.9189,
"step": 7740
},
{
"epoch": 2.3956723338485317,
"grad_norm": 0.0022590451408177614,
"learning_rate": 0.00020144255538382278,
"loss": 0.9174,
"step": 7750
},
{
"epoch": 2.398763523956723,
"grad_norm": 0.0013520545326173306,
"learning_rate": 0.0002004121586810922,
"loss": 0.8876,
"step": 7760
},
{
"epoch": 2.401854714064915,
"grad_norm": 0.001583244651556015,
"learning_rate": 0.00019938176197836167,
"loss": 0.9106,
"step": 7770
},
{
"epoch": 2.4049459041731067,
"grad_norm": 0.0016318537527695298,
"learning_rate": 0.00019835136527563113,
"loss": 0.8566,
"step": 7780
},
{
"epoch": 2.4080370942812985,
"grad_norm": 0.0015274740289896727,
"learning_rate": 0.00019732096857290058,
"loss": 0.9214,
"step": 7790
},
{
"epoch": 2.41112828438949,
"grad_norm": 0.0017980411648750305,
"learning_rate": 0.00019629057187017,
"loss": 0.8636,
"step": 7800
},
{
"epoch": 2.4142194744976817,
"grad_norm": 0.0016120801446959376,
"learning_rate": 0.00019526017516743947,
"loss": 0.8821,
"step": 7810
},
{
"epoch": 2.417310664605873,
"grad_norm": 0.001640370930545032,
"learning_rate": 0.0001942297784647089,
"loss": 0.9068,
"step": 7820
},
{
"epoch": 2.420401854714065,
"grad_norm": 0.0015430683270096779,
"learning_rate": 0.00019319938176197838,
"loss": 0.8889,
"step": 7830
},
{
"epoch": 2.4234930448222567,
"grad_norm": 0.0015471933875232935,
"learning_rate": 0.00019216898505924782,
"loss": 0.8924,
"step": 7840
},
{
"epoch": 2.426584234930448,
"grad_norm": 0.0021910767536610365,
"learning_rate": 0.00019113858835651727,
"loss": 0.8569,
"step": 7850
},
{
"epoch": 2.42967542503864,
"grad_norm": 0.0017051781760528684,
"learning_rate": 0.0001901081916537867,
"loss": 0.937,
"step": 7860
},
{
"epoch": 2.4327666151468317,
"grad_norm": 0.0018304622499272227,
"learning_rate": 0.00018907779495105616,
"loss": 0.9255,
"step": 7870
},
{
"epoch": 2.435857805255023,
"grad_norm": 0.001733385375700891,
"learning_rate": 0.00018804739824832562,
"loss": 0.8877,
"step": 7880
},
{
"epoch": 2.438948995363215,
"grad_norm": 0.0014930800534784794,
"learning_rate": 0.00018701700154559507,
"loss": 0.9079,
"step": 7890
},
{
"epoch": 2.4420401854714067,
"grad_norm": 0.0017946161096915603,
"learning_rate": 0.0001859866048428645,
"loss": 0.8372,
"step": 7900
},
{
"epoch": 2.445131375579598,
"grad_norm": 0.00181410217192024,
"learning_rate": 0.00018495620814013394,
"loss": 0.9301,
"step": 7910
},
{
"epoch": 2.44822256568779,
"grad_norm": 0.0018277463968843222,
"learning_rate": 0.00018392581143740342,
"loss": 0.9406,
"step": 7920
},
{
"epoch": 2.451313755795981,
"grad_norm": 0.0017499460373073816,
"learning_rate": 0.00018289541473467285,
"loss": 0.9327,
"step": 7930
},
{
"epoch": 2.454404945904173,
"grad_norm": 0.0017523594433441758,
"learning_rate": 0.0001818650180319423,
"loss": 0.9541,
"step": 7940
},
{
"epoch": 2.457496136012365,
"grad_norm": 0.001588582992553711,
"learning_rate": 0.00018083462132921174,
"loss": 0.9244,
"step": 7950
},
{
"epoch": 2.4605873261205566,
"grad_norm": 0.0015729885781183839,
"learning_rate": 0.0001798042246264812,
"loss": 0.902,
"step": 7960
},
{
"epoch": 2.463678516228748,
"grad_norm": 0.001764149172231555,
"learning_rate": 0.00017877382792375065,
"loss": 0.9115,
"step": 7970
},
{
"epoch": 2.46676970633694,
"grad_norm": 0.0020394367165863514,
"learning_rate": 0.0001777434312210201,
"loss": 0.8913,
"step": 7980
},
{
"epoch": 2.469860896445131,
"grad_norm": 0.001661314396187663,
"learning_rate": 0.00017671303451828954,
"loss": 0.8924,
"step": 7990
},
{
"epoch": 2.472952086553323,
"grad_norm": 0.0016929521225392818,
"learning_rate": 0.000175682637815559,
"loss": 0.9158,
"step": 8000
},
{
"epoch": 2.476043276661515,
"grad_norm": 0.0014037607470527291,
"learning_rate": 0.00017465224111282843,
"loss": 0.8929,
"step": 8010
},
{
"epoch": 2.479134466769706,
"grad_norm": 0.0012340841349214315,
"learning_rate": 0.0001736218444100979,
"loss": 0.9042,
"step": 8020
},
{
"epoch": 2.482225656877898,
"grad_norm": 0.0016911630518734455,
"learning_rate": 0.00017259144770736734,
"loss": 0.8805,
"step": 8030
},
{
"epoch": 2.48531684698609,
"grad_norm": 0.0015811780467629433,
"learning_rate": 0.0001715610510046368,
"loss": 0.9066,
"step": 8040
},
{
"epoch": 2.488408037094281,
"grad_norm": 0.0022526499815285206,
"learning_rate": 0.00017053065430190623,
"loss": 0.8729,
"step": 8050
},
{
"epoch": 2.491499227202473,
"grad_norm": 0.0014964583097025752,
"learning_rate": 0.00016950025759917566,
"loss": 0.8867,
"step": 8060
},
{
"epoch": 2.4945904173106648,
"grad_norm": 0.001667377888225019,
"learning_rate": 0.00016846986089644514,
"loss": 0.9289,
"step": 8070
},
{
"epoch": 2.497681607418856,
"grad_norm": 0.0015655744355171919,
"learning_rate": 0.00016743946419371457,
"loss": 0.8989,
"step": 8080
},
{
"epoch": 2.500772797527048,
"grad_norm": 0.0018760713282972574,
"learning_rate": 0.00016640906749098403,
"loss": 0.8951,
"step": 8090
},
{
"epoch": 2.5038639876352393,
"grad_norm": 0.0018504380714148283,
"learning_rate": 0.00016537867078825346,
"loss": 0.9548,
"step": 8100
},
{
"epoch": 2.506955177743431,
"grad_norm": 0.0015352640766650438,
"learning_rate": 0.00016434827408552295,
"loss": 0.8789,
"step": 8110
},
{
"epoch": 2.510046367851623,
"grad_norm": 0.0014199953293427825,
"learning_rate": 0.00016331787738279238,
"loss": 0.8956,
"step": 8120
},
{
"epoch": 2.5131375579598147,
"grad_norm": 0.0022967271506786346,
"learning_rate": 0.00016228748068006183,
"loss": 0.8662,
"step": 8130
},
{
"epoch": 2.516228748068006,
"grad_norm": 0.0015619174810126424,
"learning_rate": 0.00016125708397733126,
"loss": 0.9188,
"step": 8140
},
{
"epoch": 2.519319938176198,
"grad_norm": 0.00181775342207402,
"learning_rate": 0.00016022668727460072,
"loss": 0.9115,
"step": 8150
},
{
"epoch": 2.5224111282843893,
"grad_norm": 0.001615070621483028,
"learning_rate": 0.00015919629057187018,
"loss": 0.8719,
"step": 8160
},
{
"epoch": 2.525502318392581,
"grad_norm": 0.002030865289270878,
"learning_rate": 0.00015816589386913964,
"loss": 0.8618,
"step": 8170
},
{
"epoch": 2.528593508500773,
"grad_norm": 0.0018763948464766145,
"learning_rate": 0.00015713549716640907,
"loss": 0.9493,
"step": 8180
},
{
"epoch": 2.5316846986089647,
"grad_norm": 0.0015980995958670974,
"learning_rate": 0.00015610510046367852,
"loss": 0.9139,
"step": 8190
},
{
"epoch": 2.534775888717156,
"grad_norm": 0.0017758564790710807,
"learning_rate": 0.00015507470376094795,
"loss": 0.8785,
"step": 8200
},
{
"epoch": 2.537867078825348,
"grad_norm": 0.0018766775028780103,
"learning_rate": 0.0001540443070582174,
"loss": 0.9099,
"step": 8210
},
{
"epoch": 2.5409582689335393,
"grad_norm": 0.0018965909257531166,
"learning_rate": 0.00015301391035548687,
"loss": 0.9314,
"step": 8220
},
{
"epoch": 2.544049459041731,
"grad_norm": 0.0015854688826948404,
"learning_rate": 0.0001519835136527563,
"loss": 0.9137,
"step": 8230
},
{
"epoch": 2.547140649149923,
"grad_norm": 0.0018873221706598997,
"learning_rate": 0.00015095311695002576,
"loss": 0.8828,
"step": 8240
},
{
"epoch": 2.5502318392581143,
"grad_norm": 0.0014826676342636347,
"learning_rate": 0.00014992272024729521,
"loss": 0.9031,
"step": 8250
},
{
"epoch": 2.553323029366306,
"grad_norm": 0.0014809026615694165,
"learning_rate": 0.00014889232354456467,
"loss": 0.8574,
"step": 8260
},
{
"epoch": 2.5564142194744974,
"grad_norm": 0.0014648685464635491,
"learning_rate": 0.0001478619268418341,
"loss": 0.8706,
"step": 8270
},
{
"epoch": 2.5595054095826892,
"grad_norm": 0.001307973056100309,
"learning_rate": 0.00014683153013910356,
"loss": 0.9271,
"step": 8280
},
{
"epoch": 2.562596599690881,
"grad_norm": 0.0019158597569912672,
"learning_rate": 0.000145801133436373,
"loss": 0.8816,
"step": 8290
},
{
"epoch": 2.565687789799073,
"grad_norm": 0.0016835506539791822,
"learning_rate": 0.00014477073673364247,
"loss": 0.9075,
"step": 8300
},
{
"epoch": 2.5687789799072642,
"grad_norm": 0.0020169655326753855,
"learning_rate": 0.0001437403400309119,
"loss": 0.8927,
"step": 8310
},
{
"epoch": 2.571870170015456,
"grad_norm": 0.001817848184145987,
"learning_rate": 0.00014270994332818136,
"loss": 0.8812,
"step": 8320
},
{
"epoch": 2.5749613601236474,
"grad_norm": 0.001979761989787221,
"learning_rate": 0.0001416795466254508,
"loss": 0.9054,
"step": 8330
},
{
"epoch": 2.578052550231839,
"grad_norm": 0.002088018460199237,
"learning_rate": 0.00014064914992272025,
"loss": 0.8691,
"step": 8340
},
{
"epoch": 2.581143740340031,
"grad_norm": 0.0017918642843142152,
"learning_rate": 0.0001396187532199897,
"loss": 0.9458,
"step": 8350
},
{
"epoch": 2.584234930448223,
"grad_norm": 0.0016459986800327897,
"learning_rate": 0.00013858835651725916,
"loss": 0.9186,
"step": 8360
},
{
"epoch": 2.587326120556414,
"grad_norm": 0.001407464500516653,
"learning_rate": 0.0001375579598145286,
"loss": 0.8839,
"step": 8370
},
{
"epoch": 2.590417310664606,
"grad_norm": 0.001718651968985796,
"learning_rate": 0.00013652756311179802,
"loss": 0.9203,
"step": 8380
},
{
"epoch": 2.5935085007727974,
"grad_norm": 0.0018458360573276877,
"learning_rate": 0.0001354971664090675,
"loss": 0.911,
"step": 8390
},
{
"epoch": 2.596599690880989,
"grad_norm": 0.001538407290354371,
"learning_rate": 0.00013446676970633694,
"loss": 0.9139,
"step": 8400
},
{
"epoch": 2.599690880989181,
"grad_norm": 0.001976667670533061,
"learning_rate": 0.0001334363730036064,
"loss": 0.9075,
"step": 8410
},
{
"epoch": 2.6027820710973724,
"grad_norm": 0.0021034348756074905,
"learning_rate": 0.00013240597630087583,
"loss": 0.9181,
"step": 8420
},
{
"epoch": 2.605873261205564,
"grad_norm": 0.00171546614728868,
"learning_rate": 0.00013137557959814528,
"loss": 0.9361,
"step": 8430
},
{
"epoch": 2.6089644513137555,
"grad_norm": 0.001742625143378973,
"learning_rate": 0.00013034518289541474,
"loss": 0.8895,
"step": 8440
},
{
"epoch": 2.6120556414219473,
"grad_norm": 0.0018996672006323934,
"learning_rate": 0.0001293147861926842,
"loss": 0.8474,
"step": 8450
},
{
"epoch": 2.615146831530139,
"grad_norm": 0.001571224071085453,
"learning_rate": 0.00012828438948995363,
"loss": 0.8976,
"step": 8460
},
{
"epoch": 2.618238021638331,
"grad_norm": 0.0018322218675166368,
"learning_rate": 0.0001272539927872231,
"loss": 0.9207,
"step": 8470
},
{
"epoch": 2.6213292117465223,
"grad_norm": 0.0013319810386747122,
"learning_rate": 0.00012622359608449252,
"loss": 0.8674,
"step": 8480
},
{
"epoch": 2.624420401854714,
"grad_norm": 0.0014434581389650702,
"learning_rate": 0.000125193199381762,
"loss": 0.829,
"step": 8490
},
{
"epoch": 2.6275115919629055,
"grad_norm": 0.0027343255933374166,
"learning_rate": 0.00012416280267903143,
"loss": 0.8882,
"step": 8500
},
{
"epoch": 2.6306027820710973,
"grad_norm": 0.003409834112972021,
"learning_rate": 0.0001231324059763009,
"loss": 0.9302,
"step": 8510
},
{
"epoch": 2.633693972179289,
"grad_norm": 0.0013600644888356328,
"learning_rate": 0.00012210200927357035,
"loss": 0.876,
"step": 8520
},
{
"epoch": 2.636785162287481,
"grad_norm": 0.0016775266267359257,
"learning_rate": 0.00012107161257083978,
"loss": 0.9064,
"step": 8530
},
{
"epoch": 2.6398763523956723,
"grad_norm": 0.0017377499025315046,
"learning_rate": 0.00012004121586810922,
"loss": 0.9224,
"step": 8540
},
{
"epoch": 2.642967542503864,
"grad_norm": 0.001847295556217432,
"learning_rate": 0.00011901081916537868,
"loss": 0.9149,
"step": 8550
},
{
"epoch": 2.6460587326120555,
"grad_norm": 0.0014812115114182234,
"learning_rate": 0.00011798042246264812,
"loss": 0.8929,
"step": 8560
},
{
"epoch": 2.6491499227202473,
"grad_norm": 0.0017952779307961464,
"learning_rate": 0.00011695002575991758,
"loss": 0.9285,
"step": 8570
},
{
"epoch": 2.652241112828439,
"grad_norm": 0.001425527036190033,
"learning_rate": 0.00011591962905718702,
"loss": 0.8703,
"step": 8580
},
{
"epoch": 2.6553323029366305,
"grad_norm": 0.001610907376743853,
"learning_rate": 0.00011488923235445648,
"loss": 0.9103,
"step": 8590
},
{
"epoch": 2.6584234930448223,
"grad_norm": 0.001997585641220212,
"learning_rate": 0.00011385883565172592,
"loss": 0.898,
"step": 8600
},
{
"epoch": 2.6615146831530136,
"grad_norm": 0.0015265914844349027,
"learning_rate": 0.00011282843894899535,
"loss": 0.8225,
"step": 8610
},
{
"epoch": 2.6646058732612055,
"grad_norm": 0.002251180587336421,
"learning_rate": 0.00011179804224626481,
"loss": 0.8886,
"step": 8620
},
{
"epoch": 2.6676970633693973,
"grad_norm": 0.0016794728580862284,
"learning_rate": 0.00011076764554353426,
"loss": 0.8991,
"step": 8630
},
{
"epoch": 2.670788253477589,
"grad_norm": 0.0021927470806986094,
"learning_rate": 0.00010973724884080371,
"loss": 0.872,
"step": 8640
},
{
"epoch": 2.6738794435857804,
"grad_norm": 0.0016734660603106022,
"learning_rate": 0.00010870685213807316,
"loss": 0.9082,
"step": 8650
},
{
"epoch": 2.6769706336939723,
"grad_norm": 0.0023362876381725073,
"learning_rate": 0.00010767645543534261,
"loss": 0.8867,
"step": 8660
},
{
"epoch": 2.6800618238021636,
"grad_norm": 0.0015447117621079087,
"learning_rate": 0.00010664605873261206,
"loss": 0.8706,
"step": 8670
},
{
"epoch": 2.6831530139103554,
"grad_norm": 0.0017007689457386732,
"learning_rate": 0.0001056156620298815,
"loss": 0.9457,
"step": 8680
},
{
"epoch": 2.6862442040185472,
"grad_norm": 0.0018746848218142986,
"learning_rate": 0.00010458526532715096,
"loss": 0.8834,
"step": 8690
},
{
"epoch": 2.689335394126739,
"grad_norm": 0.0016901030903682113,
"learning_rate": 0.0001035548686244204,
"loss": 0.9321,
"step": 8700
},
{
"epoch": 2.6924265842349304,
"grad_norm": 0.0017849161522462964,
"learning_rate": 0.00010252447192168986,
"loss": 0.8886,
"step": 8710
},
{
"epoch": 2.6955177743431222,
"grad_norm": 0.0016776255797594786,
"learning_rate": 0.0001014940752189593,
"loss": 0.8842,
"step": 8720
},
{
"epoch": 2.6986089644513136,
"grad_norm": 0.002052722033113241,
"learning_rate": 0.00010046367851622876,
"loss": 0.8921,
"step": 8730
},
{
"epoch": 2.7017001545595054,
"grad_norm": 0.0020015337504446507,
"learning_rate": 9.94332818134982e-05,
"loss": 0.878,
"step": 8740
},
{
"epoch": 2.704791344667697,
"grad_norm": 0.0018238229677081108,
"learning_rate": 9.840288511076765e-05,
"loss": 0.9286,
"step": 8750
},
{
"epoch": 2.7078825347758886,
"grad_norm": 0.0019556416664272547,
"learning_rate": 9.737248840803709e-05,
"loss": 0.9109,
"step": 8760
},
{
"epoch": 2.7109737248840804,
"grad_norm": 0.001552661880850792,
"learning_rate": 9.634209170530654e-05,
"loss": 0.9073,
"step": 8770
},
{
"epoch": 2.7140649149922718,
"grad_norm": 0.0018825504230335355,
"learning_rate": 9.5311695002576e-05,
"loss": 0.9033,
"step": 8780
},
{
"epoch": 2.7171561051004636,
"grad_norm": 0.0018266792176291347,
"learning_rate": 9.428129829984544e-05,
"loss": 0.946,
"step": 8790
},
{
"epoch": 2.7202472952086554,
"grad_norm": 0.0014049782184883952,
"learning_rate": 9.325090159711488e-05,
"loss": 0.8817,
"step": 8800
},
{
"epoch": 2.723338485316847,
"grad_norm": 0.0015499057481065392,
"learning_rate": 9.222050489438434e-05,
"loss": 0.8569,
"step": 8810
},
{
"epoch": 2.7264296754250386,
"grad_norm": 0.0018245774554088712,
"learning_rate": 9.119010819165378e-05,
"loss": 0.8934,
"step": 8820
},
{
"epoch": 2.7295208655332304,
"grad_norm": 0.001886395737528801,
"learning_rate": 9.015971148892324e-05,
"loss": 0.9212,
"step": 8830
},
{
"epoch": 2.7326120556414217,
"grad_norm": 0.0017441367963328958,
"learning_rate": 8.912931478619268e-05,
"loss": 0.9378,
"step": 8840
},
{
"epoch": 2.7357032457496135,
"grad_norm": 0.0015955539420247078,
"learning_rate": 8.809891808346214e-05,
"loss": 0.8835,
"step": 8850
},
{
"epoch": 2.7387944358578054,
"grad_norm": 0.001902989810332656,
"learning_rate": 8.706852138073158e-05,
"loss": 0.9072,
"step": 8860
},
{
"epoch": 2.741885625965997,
"grad_norm": 0.0014242329634726048,
"learning_rate": 8.603812467800103e-05,
"loss": 0.8712,
"step": 8870
},
{
"epoch": 2.7449768160741885,
"grad_norm": 0.001587534206919372,
"learning_rate": 8.500772797527049e-05,
"loss": 0.8655,
"step": 8880
},
{
"epoch": 2.7480680061823803,
"grad_norm": 0.001545070088468492,
"learning_rate": 8.397733127253993e-05,
"loss": 0.9077,
"step": 8890
},
{
"epoch": 2.7511591962905717,
"grad_norm": 0.0018857244867831469,
"learning_rate": 8.294693456980939e-05,
"loss": 0.896,
"step": 8900
},
{
"epoch": 2.7542503863987635,
"grad_norm": 0.0012212925357744098,
"learning_rate": 8.191653786707883e-05,
"loss": 0.8993,
"step": 8910
},
{
"epoch": 2.7573415765069553,
"grad_norm": 0.0014877498615533113,
"learning_rate": 8.088614116434827e-05,
"loss": 0.9214,
"step": 8920
},
{
"epoch": 2.7604327666151467,
"grad_norm": 0.0014304481446743011,
"learning_rate": 7.985574446161772e-05,
"loss": 0.8926,
"step": 8930
},
{
"epoch": 2.7635239567233385,
"grad_norm": 0.001448179711587727,
"learning_rate": 7.882534775888716e-05,
"loss": 0.873,
"step": 8940
},
{
"epoch": 2.76661514683153,
"grad_norm": 0.0015107804210856557,
"learning_rate": 7.779495105615662e-05,
"loss": 0.8396,
"step": 8950
},
{
"epoch": 2.7697063369397217,
"grad_norm": 0.001311837462708354,
"learning_rate": 7.676455435342606e-05,
"loss": 0.9185,
"step": 8960
},
{
"epoch": 2.7727975270479135,
"grad_norm": 0.0017208693316206336,
"learning_rate": 7.573415765069552e-05,
"loss": 0.8959,
"step": 8970
},
{
"epoch": 2.7758887171561053,
"grad_norm": 0.0016424921341240406,
"learning_rate": 7.470376094796496e-05,
"loss": 0.8892,
"step": 8980
},
{
"epoch": 2.7789799072642967,
"grad_norm": 0.0014334353618323803,
"learning_rate": 7.367336424523442e-05,
"loss": 0.8908,
"step": 8990
},
{
"epoch": 2.7820710973724885,
"grad_norm": 0.0019086402608081698,
"learning_rate": 7.264296754250387e-05,
"loss": 0.8791,
"step": 9000
},
{
"epoch": 2.78516228748068,
"grad_norm": 0.0013547363923862576,
"learning_rate": 7.161257083977331e-05,
"loss": 0.9024,
"step": 9010
},
{
"epoch": 2.7882534775888717,
"grad_norm": 0.0013833148404955864,
"learning_rate": 7.058217413704277e-05,
"loss": 0.9195,
"step": 9020
},
{
"epoch": 2.7913446676970635,
"grad_norm": 0.0014539181720465422,
"learning_rate": 6.955177743431221e-05,
"loss": 0.8835,
"step": 9030
},
{
"epoch": 2.7944358578052553,
"grad_norm": 0.001557295210659504,
"learning_rate": 6.852138073158167e-05,
"loss": 0.8959,
"step": 9040
},
{
"epoch": 2.7975270479134466,
"grad_norm": 0.0020530694164335728,
"learning_rate": 6.749098402885111e-05,
"loss": 0.9193,
"step": 9050
},
{
"epoch": 2.8006182380216385,
"grad_norm": 0.0016817068681120872,
"learning_rate": 6.646058732612057e-05,
"loss": 0.8456,
"step": 9060
},
{
"epoch": 2.80370942812983,
"grad_norm": 0.0019015265861526132,
"learning_rate": 6.543019062339001e-05,
"loss": 0.8723,
"step": 9070
},
{
"epoch": 2.8068006182380216,
"grad_norm": 0.0015511283418163657,
"learning_rate": 6.439979392065944e-05,
"loss": 0.9393,
"step": 9080
},
{
"epoch": 2.8098918083462134,
"grad_norm": 0.00140860746614635,
"learning_rate": 6.33693972179289e-05,
"loss": 0.8914,
"step": 9090
},
{
"epoch": 2.812982998454405,
"grad_norm": 0.00200218940153718,
"learning_rate": 6.233900051519836e-05,
"loss": 0.8723,
"step": 9100
},
{
"epoch": 2.8160741885625966,
"grad_norm": 0.001929080463014543,
"learning_rate": 6.13086038124678e-05,
"loss": 0.9241,
"step": 9110
},
{
"epoch": 2.819165378670788,
"grad_norm": 0.0016536141047254205,
"learning_rate": 6.0278207109737246e-05,
"loss": 0.9228,
"step": 9120
},
{
"epoch": 2.82225656877898,
"grad_norm": 0.0014851295854896307,
"learning_rate": 5.9247810407006696e-05,
"loss": 0.9038,
"step": 9130
},
{
"epoch": 2.8253477588871716,
"grad_norm": 0.0014977608807384968,
"learning_rate": 5.821741370427615e-05,
"loss": 0.8881,
"step": 9140
},
{
"epoch": 2.8284389489953634,
"grad_norm": 0.002265679184347391,
"learning_rate": 5.71870170015456e-05,
"loss": 0.8694,
"step": 9150
},
{
"epoch": 2.8315301391035548,
"grad_norm": 0.0016861397307366133,
"learning_rate": 5.615662029881505e-05,
"loss": 0.849,
"step": 9160
},
{
"epoch": 2.8346213292117466,
"grad_norm": 0.001969564938917756,
"learning_rate": 5.512622359608449e-05,
"loss": 0.8832,
"step": 9170
},
{
"epoch": 2.837712519319938,
"grad_norm": 0.0019365076441317797,
"learning_rate": 5.409582689335394e-05,
"loss": 0.8763,
"step": 9180
},
{
"epoch": 2.8408037094281298,
"grad_norm": 0.0024066604673862457,
"learning_rate": 5.306543019062339e-05,
"loss": 0.9283,
"step": 9190
},
{
"epoch": 2.8438948995363216,
"grad_norm": 0.0017798724584281445,
"learning_rate": 5.203503348789284e-05,
"loss": 0.9098,
"step": 9200
},
{
"epoch": 2.8469860896445134,
"grad_norm": 0.0014776487369090319,
"learning_rate": 5.100463678516229e-05,
"loss": 0.8685,
"step": 9210
},
{
"epoch": 2.8500772797527048,
"grad_norm": 0.0016472855350002646,
"learning_rate": 4.997424008243174e-05,
"loss": 0.9151,
"step": 9220
},
{
"epoch": 2.8531684698608966,
"grad_norm": 0.001969245495274663,
"learning_rate": 4.894384337970119e-05,
"loss": 0.8841,
"step": 9230
},
{
"epoch": 2.856259659969088,
"grad_norm": 0.00151623017154634,
"learning_rate": 4.791344667697063e-05,
"loss": 0.8764,
"step": 9240
},
{
"epoch": 2.8593508500772797,
"grad_norm": 0.0014065582072362304,
"learning_rate": 4.688304997424008e-05,
"loss": 0.8737,
"step": 9250
},
{
"epoch": 2.8624420401854715,
"grad_norm": 0.001735221827402711,
"learning_rate": 4.5852653271509534e-05,
"loss": 0.8494,
"step": 9260
},
{
"epoch": 2.865533230293663,
"grad_norm": 0.001790928072296083,
"learning_rate": 4.4822256568778984e-05,
"loss": 0.9272,
"step": 9270
},
{
"epoch": 2.8686244204018547,
"grad_norm": 0.002074885880574584,
"learning_rate": 4.379185986604843e-05,
"loss": 0.8767,
"step": 9280
},
{
"epoch": 2.871715610510046,
"grad_norm": 0.002597965532913804,
"learning_rate": 4.276146316331788e-05,
"loss": 0.9302,
"step": 9290
},
{
"epoch": 2.874806800618238,
"grad_norm": 0.0019227894954383373,
"learning_rate": 4.173106646058733e-05,
"loss": 0.931,
"step": 9300
},
{
"epoch": 2.8778979907264297,
"grad_norm": 0.0014659016160294414,
"learning_rate": 4.070066975785677e-05,
"loss": 0.944,
"step": 9310
},
{
"epoch": 2.8809891808346215,
"grad_norm": 0.0017797836335375905,
"learning_rate": 3.9670273055126224e-05,
"loss": 0.95,
"step": 9320
},
{
"epoch": 2.884080370942813,
"grad_norm": 0.0014417979400604963,
"learning_rate": 3.8639876352395674e-05,
"loss": 0.8956,
"step": 9330
},
{
"epoch": 2.8871715610510047,
"grad_norm": 0.0013740757713094354,
"learning_rate": 3.7609479649665125e-05,
"loss": 0.9165,
"step": 9340
},
{
"epoch": 2.890262751159196,
"grad_norm": 0.0016021078918129206,
"learning_rate": 3.6579082946934575e-05,
"loss": 0.9061,
"step": 9350
},
{
"epoch": 2.893353941267388,
"grad_norm": 0.0017530877375975251,
"learning_rate": 3.554868624420402e-05,
"loss": 0.926,
"step": 9360
},
{
"epoch": 2.8964451313755797,
"grad_norm": 0.0014526183949783444,
"learning_rate": 3.451828954147346e-05,
"loss": 0.8938,
"step": 9370
},
{
"epoch": 2.8995363214837715,
"grad_norm": 0.001970699056982994,
"learning_rate": 3.3487892838742914e-05,
"loss": 0.8899,
"step": 9380
},
{
"epoch": 2.902627511591963,
"grad_norm": 0.001455257530324161,
"learning_rate": 3.2457496136012364e-05,
"loss": 0.8998,
"step": 9390
},
{
"epoch": 2.9057187017001547,
"grad_norm": 0.0023150176275521517,
"learning_rate": 3.1427099433281815e-05,
"loss": 0.8652,
"step": 9400
},
{
"epoch": 2.908809891808346,
"grad_norm": 0.0021134279668331146,
"learning_rate": 3.0396702730551262e-05,
"loss": 0.8573,
"step": 9410
},
{
"epoch": 2.911901081916538,
"grad_norm": 0.0016436997102573514,
"learning_rate": 2.936630602782071e-05,
"loss": 0.9263,
"step": 9420
},
{
"epoch": 2.9149922720247297,
"grad_norm": 0.001375730847939849,
"learning_rate": 2.833590932509016e-05,
"loss": 0.8869,
"step": 9430
},
{
"epoch": 2.918083462132921,
"grad_norm": 0.001517411321401596,
"learning_rate": 2.730551262235961e-05,
"loss": 0.9652,
"step": 9440
},
{
"epoch": 2.921174652241113,
"grad_norm": 0.0015275340992957354,
"learning_rate": 2.6275115919629058e-05,
"loss": 0.8882,
"step": 9450
},
{
"epoch": 2.9242658423493046,
"grad_norm": 0.0019407563377171755,
"learning_rate": 2.5244719216898505e-05,
"loss": 0.9207,
"step": 9460
},
{
"epoch": 2.927357032457496,
"grad_norm": 0.0013504921225830913,
"learning_rate": 2.4214322514167955e-05,
"loss": 0.8926,
"step": 9470
},
{
"epoch": 2.930448222565688,
"grad_norm": 0.0017280855681747198,
"learning_rate": 2.3183925811437406e-05,
"loss": 0.9399,
"step": 9480
},
{
"epoch": 2.9335394126738796,
"grad_norm": 0.0019845685455948114,
"learning_rate": 2.215352910870685e-05,
"loss": 0.9159,
"step": 9490
},
{
"epoch": 2.936630602782071,
"grad_norm": 0.001844555838033557,
"learning_rate": 2.11231324059763e-05,
"loss": 0.9431,
"step": 9500
},
{
"epoch": 2.939721792890263,
"grad_norm": 0.0013761295704171062,
"learning_rate": 2.009273570324575e-05,
"loss": 0.9008,
"step": 9510
},
{
"epoch": 2.942812982998454,
"grad_norm": 0.001978537067770958,
"learning_rate": 1.90623390005152e-05,
"loss": 0.949,
"step": 9520
},
{
"epoch": 2.945904173106646,
"grad_norm": 0.001545691047795117,
"learning_rate": 1.8031942297784645e-05,
"loss": 0.9337,
"step": 9530
},
{
"epoch": 2.948995363214838,
"grad_norm": 0.0017250186065211892,
"learning_rate": 1.7001545595054096e-05,
"loss": 0.8819,
"step": 9540
},
{
"epoch": 2.9520865533230296,
"grad_norm": 0.0014475194038823247,
"learning_rate": 1.5971148892323546e-05,
"loss": 0.9038,
"step": 9550
},
{
"epoch": 2.955177743431221,
"grad_norm": 0.0015622404171153903,
"learning_rate": 1.4940752189592994e-05,
"loss": 0.8629,
"step": 9560
},
{
"epoch": 2.958268933539413,
"grad_norm": 0.00176430051214993,
"learning_rate": 1.3910355486862443e-05,
"loss": 0.9482,
"step": 9570
},
{
"epoch": 2.961360123647604,
"grad_norm": 0.002005284419283271,
"learning_rate": 1.287995878413189e-05,
"loss": 0.9089,
"step": 9580
},
{
"epoch": 2.964451313755796,
"grad_norm": 0.0017773432191461325,
"learning_rate": 1.184956208140134e-05,
"loss": 0.9071,
"step": 9590
},
{
"epoch": 2.9675425038639878,
"grad_norm": 0.00213231542147696,
"learning_rate": 1.0819165378670788e-05,
"loss": 0.9035,
"step": 9600
},
{
"epoch": 2.970633693972179,
"grad_norm": 0.002046185778453946,
"learning_rate": 9.788768675940238e-06,
"loss": 0.9451,
"step": 9610
},
{
"epoch": 2.973724884080371,
"grad_norm": 0.001680073793977499,
"learning_rate": 8.758371973209685e-06,
"loss": 0.8838,
"step": 9620
},
{
"epoch": 2.9768160741885628,
"grad_norm": 0.0015061446465551853,
"learning_rate": 7.727975270479134e-06,
"loss": 0.8764,
"step": 9630
},
{
"epoch": 2.979907264296754,
"grad_norm": 0.0019097643671557307,
"learning_rate": 6.697578567748584e-06,
"loss": 0.9369,
"step": 9640
},
{
"epoch": 2.982998454404946,
"grad_norm": 0.0017981400014832616,
"learning_rate": 5.667181865018033e-06,
"loss": 0.8498,
"step": 9650
},
{
"epoch": 2.9860896445131377,
"grad_norm": 0.0021447227336466312,
"learning_rate": 4.636785162287481e-06,
"loss": 0.9168,
"step": 9660
},
{
"epoch": 2.989180834621329,
"grad_norm": 0.0015040052821859717,
"learning_rate": 3.6063884595569293e-06,
"loss": 0.9233,
"step": 9670
},
{
"epoch": 2.992272024729521,
"grad_norm": 0.00173336046282202,
"learning_rate": 2.575991756826378e-06,
"loss": 0.9224,
"step": 9680
},
{
"epoch": 2.9953632148377123,
"grad_norm": 0.00168697745539248,
"learning_rate": 1.5455950540958269e-06,
"loss": 0.9131,
"step": 9690
},
{
"epoch": 2.998454404945904,
"grad_norm": 0.0016643669223412871,
"learning_rate": 5.151983513652757e-07,
"loss": 0.945,
"step": 9700
}
],
"logging_steps": 10,
"max_steps": 9705,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.329354151166001e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}