gpt2-xl-lora-multi-512-k5-31-im-2 / trainer_state.json
MHGanainy's picture
MHGanainy/gpt2-xl-lora-multi-512-k5-31-im-2
b7eb840 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 42248,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0023669759515243323,
"grad_norm": 0.36582449078559875,
"learning_rate": 5.858508151611091e-08,
"loss": 2.8441,
"step": 100
},
{
"epoch": 0.004733951903048665,
"grad_norm": 0.36473265290260315,
"learning_rate": 1.1657839453205907e-07,
"loss": 2.8263,
"step": 200
},
{
"epoch": 0.007100927854572997,
"grad_norm": 0.3368377685546875,
"learning_rate": 1.7516347604817e-07,
"loss": 2.8367,
"step": 300
},
{
"epoch": 0.00946790380609733,
"grad_norm": 0.3552389442920685,
"learning_rate": 2.3434032606444363e-07,
"loss": 2.8383,
"step": 400
},
{
"epoch": 0.011834879757621663,
"grad_norm": 0.4023584723472595,
"learning_rate": 2.9351717608071723e-07,
"loss": 2.8136,
"step": 500
},
{
"epoch": 0.014201855709145995,
"grad_norm": 0.27697062492370605,
"learning_rate": 3.526940260969909e-07,
"loss": 2.7922,
"step": 600
},
{
"epoch": 0.016568831660670327,
"grad_norm": 0.35471972823143005,
"learning_rate": 4.1187087611326455e-07,
"loss": 2.7994,
"step": 700
},
{
"epoch": 0.01893580761219466,
"grad_norm": 0.31292667984962463,
"learning_rate": 4.710477261295382e-07,
"loss": 2.7759,
"step": 800
},
{
"epoch": 0.021302783563718994,
"grad_norm": 0.453121542930603,
"learning_rate": 5.302245761458118e-07,
"loss": 2.7526,
"step": 900
},
{
"epoch": 0.023669759515243326,
"grad_norm": 0.33600056171417236,
"learning_rate": 5.894014261620854e-07,
"loss": 2.7212,
"step": 1000
},
{
"epoch": 0.026036735466767658,
"grad_norm": 0.3094422221183777,
"learning_rate": 6.485782761783591e-07,
"loss": 2.6944,
"step": 1100
},
{
"epoch": 0.02840371141829199,
"grad_norm": 0.3267682194709778,
"learning_rate": 7.077551261946328e-07,
"loss": 2.6648,
"step": 1200
},
{
"epoch": 0.03077068736981632,
"grad_norm": 0.5763485431671143,
"learning_rate": 7.669319762109063e-07,
"loss": 2.6594,
"step": 1300
},
{
"epoch": 0.03313766332134065,
"grad_norm": 0.2788572609424591,
"learning_rate": 8.255170577270173e-07,
"loss": 2.6552,
"step": 1400
},
{
"epoch": 0.035504639272864985,
"grad_norm": 0.38050368428230286,
"learning_rate": 8.846939077432909e-07,
"loss": 2.6319,
"step": 1500
},
{
"epoch": 0.03787161522438932,
"grad_norm": 0.29289504885673523,
"learning_rate": 9.438707577595646e-07,
"loss": 2.6371,
"step": 1600
},
{
"epoch": 0.040238591175913656,
"grad_norm": 0.20580381155014038,
"learning_rate": 1.0030476077758381e-06,
"loss": 2.6054,
"step": 1700
},
{
"epoch": 0.04260556712743799,
"grad_norm": 0.2935289442539215,
"learning_rate": 1.0622244577921118e-06,
"loss": 2.5963,
"step": 1800
},
{
"epoch": 0.04497254307896232,
"grad_norm": 0.2953510582447052,
"learning_rate": 1.1214013078083855e-06,
"loss": 2.6056,
"step": 1900
},
{
"epoch": 0.04733951903048665,
"grad_norm": 0.3077057898044586,
"learning_rate": 1.180578157824659e-06,
"loss": 2.5864,
"step": 2000
},
{
"epoch": 0.04970649498201098,
"grad_norm": 0.25115400552749634,
"learning_rate": 1.2397550078409327e-06,
"loss": 2.5911,
"step": 2100
},
{
"epoch": 0.052073470933535315,
"grad_norm": 0.2623751759529114,
"learning_rate": 1.2989318578572062e-06,
"loss": 2.5591,
"step": 2200
},
{
"epoch": 0.05444044688505965,
"grad_norm": 0.30447134375572205,
"learning_rate": 1.35810870787348e-06,
"loss": 2.5668,
"step": 2300
},
{
"epoch": 0.05680742283658398,
"grad_norm": 0.2120353877544403,
"learning_rate": 1.4172855578897537e-06,
"loss": 2.5521,
"step": 2400
},
{
"epoch": 0.05917439878810831,
"grad_norm": 0.23940175771713257,
"learning_rate": 1.4764624079060272e-06,
"loss": 2.5594,
"step": 2500
},
{
"epoch": 0.06154137473963264,
"grad_norm": 0.2214510440826416,
"learning_rate": 1.5356392579223009e-06,
"loss": 2.5458,
"step": 2600
},
{
"epoch": 0.06390835069115698,
"grad_norm": 0.22601068019866943,
"learning_rate": 1.5948161079385746e-06,
"loss": 2.538,
"step": 2700
},
{
"epoch": 0.0662753266426813,
"grad_norm": 0.23850201070308685,
"learning_rate": 1.6539929579548483e-06,
"loss": 2.532,
"step": 2800
},
{
"epoch": 0.06864230259420565,
"grad_norm": 0.20834830403327942,
"learning_rate": 1.7131698079711218e-06,
"loss": 2.5285,
"step": 2900
},
{
"epoch": 0.07100927854572997,
"grad_norm": 0.21344949305057526,
"learning_rate": 1.7723466579873955e-06,
"loss": 2.5185,
"step": 3000
},
{
"epoch": 0.07337625449725431,
"grad_norm": 0.21799206733703613,
"learning_rate": 1.8315235080036692e-06,
"loss": 2.5192,
"step": 3100
},
{
"epoch": 0.07574323044877863,
"grad_norm": 0.21602454781532288,
"learning_rate": 1.8907003580199425e-06,
"loss": 2.5086,
"step": 3200
},
{
"epoch": 0.07811020640030297,
"grad_norm": 0.2055075764656067,
"learning_rate": 1.9498772080362162e-06,
"loss": 2.5068,
"step": 3300
},
{
"epoch": 0.08047718235182731,
"grad_norm": 0.21900290250778198,
"learning_rate": 2.00905405805249e-06,
"loss": 2.5144,
"step": 3400
},
{
"epoch": 0.08284415830335164,
"grad_norm": 0.2083442062139511,
"learning_rate": 2.0682309080687637e-06,
"loss": 2.5137,
"step": 3500
},
{
"epoch": 0.08521113425487598,
"grad_norm": 0.21810264885425568,
"learning_rate": 2.127407758085037e-06,
"loss": 2.5075,
"step": 3600
},
{
"epoch": 0.0875781102064003,
"grad_norm": 0.2033359259366989,
"learning_rate": 2.186584608101311e-06,
"loss": 2.4976,
"step": 3700
},
{
"epoch": 0.08994508615792464,
"grad_norm": 0.20291608572006226,
"learning_rate": 2.2457614581175846e-06,
"loss": 2.5054,
"step": 3800
},
{
"epoch": 0.09231206210944896,
"grad_norm": 0.21681128442287445,
"learning_rate": 2.3049383081338585e-06,
"loss": 2.5165,
"step": 3900
},
{
"epoch": 0.0946790380609733,
"grad_norm": 0.23095227777957916,
"learning_rate": 2.3641151581501316e-06,
"loss": 2.4983,
"step": 4000
},
{
"epoch": 0.09704601401249763,
"grad_norm": 0.23442834615707397,
"learning_rate": 2.4232920081664055e-06,
"loss": 2.4923,
"step": 4100
},
{
"epoch": 0.09941298996402197,
"grad_norm": 0.22967080771923065,
"learning_rate": 2.482468858182679e-06,
"loss": 2.4899,
"step": 4200
},
{
"epoch": 0.10177996591554629,
"grad_norm": 0.22393766045570374,
"learning_rate": 2.5416457081989525e-06,
"loss": 2.4939,
"step": 4300
},
{
"epoch": 0.10414694186707063,
"grad_norm": 0.23877893388271332,
"learning_rate": 2.6008225582152264e-06,
"loss": 2.4974,
"step": 4400
},
{
"epoch": 0.10651391781859497,
"grad_norm": 0.26197168231010437,
"learning_rate": 2.6599994082315e-06,
"loss": 2.4773,
"step": 4500
},
{
"epoch": 0.1088808937701193,
"grad_norm": 0.2509444057941437,
"learning_rate": 2.719176258247774e-06,
"loss": 2.4774,
"step": 4600
},
{
"epoch": 0.11124786972164363,
"grad_norm": 0.22168482840061188,
"learning_rate": 2.7783531082640474e-06,
"loss": 2.49,
"step": 4700
},
{
"epoch": 0.11361484567316796,
"grad_norm": 0.24707303941249847,
"learning_rate": 2.8375299582803213e-06,
"loss": 2.468,
"step": 4800
},
{
"epoch": 0.1159818216246923,
"grad_norm": 0.2593018412590027,
"learning_rate": 2.8967068082965944e-06,
"loss": 2.4749,
"step": 4900
},
{
"epoch": 0.11834879757621662,
"grad_norm": 0.22931291162967682,
"learning_rate": 2.955883658312868e-06,
"loss": 2.4787,
"step": 5000
},
{
"epoch": 0.12071577352774096,
"grad_norm": 0.2900484502315521,
"learning_rate": 3.015060508329142e-06,
"loss": 2.4705,
"step": 5100
},
{
"epoch": 0.12308274947926529,
"grad_norm": 0.2222159057855606,
"learning_rate": 3.0742373583454153e-06,
"loss": 2.4629,
"step": 5200
},
{
"epoch": 0.1254497254307896,
"grad_norm": 0.2193671613931656,
"learning_rate": 3.1334142083616892e-06,
"loss": 2.4742,
"step": 5300
},
{
"epoch": 0.12781670138231396,
"grad_norm": 0.22836729884147644,
"learning_rate": 3.1925910583779627e-06,
"loss": 2.4879,
"step": 5400
},
{
"epoch": 0.1301836773338383,
"grad_norm": 0.2218533158302307,
"learning_rate": 3.2517679083942367e-06,
"loss": 2.4636,
"step": 5500
},
{
"epoch": 0.1325506532853626,
"grad_norm": 0.252085417509079,
"learning_rate": 3.31094475841051e-06,
"loss": 2.4727,
"step": 5600
},
{
"epoch": 0.13491762923688697,
"grad_norm": 0.26298022270202637,
"learning_rate": 3.3701216084267837e-06,
"loss": 2.4638,
"step": 5700
},
{
"epoch": 0.1372846051884113,
"grad_norm": 0.23198895156383514,
"learning_rate": 3.429298458443057e-06,
"loss": 2.4597,
"step": 5800
},
{
"epoch": 0.13965158113993562,
"grad_norm": 0.2724401354789734,
"learning_rate": 3.488475308459331e-06,
"loss": 2.4671,
"step": 5900
},
{
"epoch": 0.14201855709145994,
"grad_norm": 0.22617186605930328,
"learning_rate": 3.5476521584756046e-06,
"loss": 2.4665,
"step": 6000
},
{
"epoch": 0.1443855330429843,
"grad_norm": 0.24222290515899658,
"learning_rate": 3.6068290084918785e-06,
"loss": 2.4729,
"step": 6100
},
{
"epoch": 0.14675250899450862,
"grad_norm": 0.23433572053909302,
"learning_rate": 3.666005858508152e-06,
"loss": 2.4512,
"step": 6200
},
{
"epoch": 0.14911948494603294,
"grad_norm": 0.23977671563625336,
"learning_rate": 3.725182708524426e-06,
"loss": 2.464,
"step": 6300
},
{
"epoch": 0.15148646089755727,
"grad_norm": 0.23321278393268585,
"learning_rate": 3.784359558540699e-06,
"loss": 2.4798,
"step": 6400
},
{
"epoch": 0.15385343684908162,
"grad_norm": 0.27208179235458374,
"learning_rate": 3.843536408556973e-06,
"loss": 2.4705,
"step": 6500
},
{
"epoch": 0.15622041280060595,
"grad_norm": 0.23790614306926727,
"learning_rate": 3.902713258573246e-06,
"loss": 2.4555,
"step": 6600
},
{
"epoch": 0.15858738875213027,
"grad_norm": 0.2843892276287079,
"learning_rate": 3.96189010858952e-06,
"loss": 2.4725,
"step": 6700
},
{
"epoch": 0.16095436470365462,
"grad_norm": 0.2643658220767975,
"learning_rate": 4.021066958605794e-06,
"loss": 2.4687,
"step": 6800
},
{
"epoch": 0.16332134065517895,
"grad_norm": 0.29611462354660034,
"learning_rate": 4.080243808622068e-06,
"loss": 2.4594,
"step": 6900
},
{
"epoch": 0.16568831660670327,
"grad_norm": 0.2879164218902588,
"learning_rate": 4.139420658638341e-06,
"loss": 2.4642,
"step": 7000
},
{
"epoch": 0.1680552925582276,
"grad_norm": 0.27046066522598267,
"learning_rate": 4.198597508654615e-06,
"loss": 2.4689,
"step": 7100
},
{
"epoch": 0.17042226850975195,
"grad_norm": 0.24744383990764618,
"learning_rate": 4.257774358670888e-06,
"loss": 2.4526,
"step": 7200
},
{
"epoch": 0.17278924446127628,
"grad_norm": 0.2348434180021286,
"learning_rate": 4.316951208687162e-06,
"loss": 2.4539,
"step": 7300
},
{
"epoch": 0.1751562204128006,
"grad_norm": 0.295792818069458,
"learning_rate": 4.376128058703436e-06,
"loss": 2.4491,
"step": 7400
},
{
"epoch": 0.17752319636432493,
"grad_norm": 0.2649165093898773,
"learning_rate": 4.435304908719709e-06,
"loss": 2.449,
"step": 7500
},
{
"epoch": 0.17989017231584928,
"grad_norm": 0.23758557438850403,
"learning_rate": 4.494481758735983e-06,
"loss": 2.455,
"step": 7600
},
{
"epoch": 0.1822571482673736,
"grad_norm": 0.27746689319610596,
"learning_rate": 4.553658608752257e-06,
"loss": 2.4493,
"step": 7700
},
{
"epoch": 0.18462412421889793,
"grad_norm": 0.2592689096927643,
"learning_rate": 4.6128354587685306e-06,
"loss": 2.4571,
"step": 7800
},
{
"epoch": 0.18699110017042228,
"grad_norm": 0.2735172510147095,
"learning_rate": 4.672012308784804e-06,
"loss": 2.4539,
"step": 7900
},
{
"epoch": 0.1893580761219466,
"grad_norm": 0.2739349603652954,
"learning_rate": 4.7311891588010776e-06,
"loss": 2.4604,
"step": 8000
},
{
"epoch": 0.19172505207347093,
"grad_norm": 0.271176815032959,
"learning_rate": 4.790366008817351e-06,
"loss": 2.448,
"step": 8100
},
{
"epoch": 0.19409202802499526,
"grad_norm": 0.2696959674358368,
"learning_rate": 4.8495428588336246e-06,
"loss": 2.4563,
"step": 8200
},
{
"epoch": 0.1964590039765196,
"grad_norm": 0.30911239981651306,
"learning_rate": 4.9087197088498985e-06,
"loss": 2.4614,
"step": 8300
},
{
"epoch": 0.19882597992804393,
"grad_norm": 0.2745211720466614,
"learning_rate": 4.967896558866172e-06,
"loss": 2.4462,
"step": 8400
},
{
"epoch": 0.20119295587956826,
"grad_norm": 0.29566124081611633,
"learning_rate": 5.0270734088824455e-06,
"loss": 2.451,
"step": 8500
},
{
"epoch": 0.20355993183109258,
"grad_norm": 0.28213486075401306,
"learning_rate": 5.086250258898719e-06,
"loss": 2.4407,
"step": 8600
},
{
"epoch": 0.20592690778261694,
"grad_norm": 0.2758745849132538,
"learning_rate": 5.145427108914993e-06,
"loss": 2.4438,
"step": 8700
},
{
"epoch": 0.20829388373414126,
"grad_norm": 0.2921348810195923,
"learning_rate": 5.204603958931267e-06,
"loss": 2.449,
"step": 8800
},
{
"epoch": 0.21066085968566559,
"grad_norm": 0.26501932740211487,
"learning_rate": 5.26378080894754e-06,
"loss": 2.4486,
"step": 8900
},
{
"epoch": 0.21302783563718994,
"grad_norm": 0.2748875617980957,
"learning_rate": 5.322957658963814e-06,
"loss": 2.4424,
"step": 9000
},
{
"epoch": 0.21539481158871426,
"grad_norm": 0.28109443187713623,
"learning_rate": 5.382134508980087e-06,
"loss": 2.4513,
"step": 9100
},
{
"epoch": 0.2177617875402386,
"grad_norm": 0.27431926131248474,
"learning_rate": 5.44131135899636e-06,
"loss": 2.4437,
"step": 9200
},
{
"epoch": 0.2201287634917629,
"grad_norm": 0.2729012668132782,
"learning_rate": 5.500488209012634e-06,
"loss": 2.4538,
"step": 9300
},
{
"epoch": 0.22249573944328727,
"grad_norm": 0.2898072600364685,
"learning_rate": 5.559665059028908e-06,
"loss": 2.4546,
"step": 9400
},
{
"epoch": 0.2248627153948116,
"grad_norm": 0.3519386649131775,
"learning_rate": 5.618841909045182e-06,
"loss": 2.4462,
"step": 9500
},
{
"epoch": 0.22722969134633592,
"grad_norm": 0.2779889404773712,
"learning_rate": 5.678018759061455e-06,
"loss": 2.443,
"step": 9600
},
{
"epoch": 0.22959666729786024,
"grad_norm": 0.2758658826351166,
"learning_rate": 5.737195609077729e-06,
"loss": 2.4392,
"step": 9700
},
{
"epoch": 0.2319636432493846,
"grad_norm": 0.3754834532737732,
"learning_rate": 5.796372459094003e-06,
"loss": 2.4352,
"step": 9800
},
{
"epoch": 0.23433061920090892,
"grad_norm": 0.27345120906829834,
"learning_rate": 5.855549309110277e-06,
"loss": 2.4523,
"step": 9900
},
{
"epoch": 0.23669759515243324,
"grad_norm": 0.32833969593048096,
"learning_rate": 5.91472615912655e-06,
"loss": 2.4497,
"step": 10000
},
{
"epoch": 0.2390645711039576,
"grad_norm": 0.2878655791282654,
"learning_rate": 5.973903009142824e-06,
"loss": 2.451,
"step": 10100
},
{
"epoch": 0.24143154705548192,
"grad_norm": 0.31419286131858826,
"learning_rate": 6.033079859159098e-06,
"loss": 2.4429,
"step": 10200
},
{
"epoch": 0.24379852300700625,
"grad_norm": 0.2996383607387543,
"learning_rate": 6.092256709175372e-06,
"loss": 2.4349,
"step": 10300
},
{
"epoch": 0.24616549895853057,
"grad_norm": 0.308442085981369,
"learning_rate": 6.151433559191645e-06,
"loss": 2.4495,
"step": 10400
},
{
"epoch": 0.24853247491005492,
"grad_norm": 0.2972429394721985,
"learning_rate": 6.210610409207919e-06,
"loss": 2.433,
"step": 10500
},
{
"epoch": 0.2508994508615792,
"grad_norm": 0.30551430583000183,
"learning_rate": 6.269787259224191e-06,
"loss": 2.447,
"step": 10600
},
{
"epoch": 0.2532664268131036,
"grad_norm": 0.3082588016986847,
"learning_rate": 6.328964109240465e-06,
"loss": 2.4458,
"step": 10700
},
{
"epoch": 0.2556334027646279,
"grad_norm": 0.29121455550193787,
"learning_rate": 6.388140959256739e-06,
"loss": 2.4208,
"step": 10800
},
{
"epoch": 0.2580003787161522,
"grad_norm": 0.32775169610977173,
"learning_rate": 6.447317809273013e-06,
"loss": 2.4263,
"step": 10900
},
{
"epoch": 0.2603673546676766,
"grad_norm": 0.32109200954437256,
"learning_rate": 6.506494659289286e-06,
"loss": 2.4385,
"step": 11000
},
{
"epoch": 0.26273433061920093,
"grad_norm": 0.4912450313568115,
"learning_rate": 6.56567150930556e-06,
"loss": 2.4331,
"step": 11100
},
{
"epoch": 0.2651013065707252,
"grad_norm": 0.30363771319389343,
"learning_rate": 6.624848359321834e-06,
"loss": 2.4339,
"step": 11200
},
{
"epoch": 0.2674682825222496,
"grad_norm": 0.30812105536460876,
"learning_rate": 6.684025209338108e-06,
"loss": 2.4373,
"step": 11300
},
{
"epoch": 0.26983525847377393,
"grad_norm": 0.3601232171058655,
"learning_rate": 6.743202059354381e-06,
"loss": 2.4292,
"step": 11400
},
{
"epoch": 0.27220223442529823,
"grad_norm": 0.3195793926715851,
"learning_rate": 6.802378909370655e-06,
"loss": 2.438,
"step": 11500
},
{
"epoch": 0.2745692103768226,
"grad_norm": 0.31187400221824646,
"learning_rate": 6.861555759386929e-06,
"loss": 2.4413,
"step": 11600
},
{
"epoch": 0.2769361863283469,
"grad_norm": 0.3234810531139374,
"learning_rate": 6.920732609403203e-06,
"loss": 2.4502,
"step": 11700
},
{
"epoch": 0.27930316227987123,
"grad_norm": 0.3229145109653473,
"learning_rate": 6.979909459419476e-06,
"loss": 2.4369,
"step": 11800
},
{
"epoch": 0.2816701382313956,
"grad_norm": 0.30176299810409546,
"learning_rate": 7.03908630943575e-06,
"loss": 2.439,
"step": 11900
},
{
"epoch": 0.2840371141829199,
"grad_norm": 0.3238876461982727,
"learning_rate": 7.0982631594520235e-06,
"loss": 2.4441,
"step": 12000
},
{
"epoch": 0.28640409013444423,
"grad_norm": 0.3230147063732147,
"learning_rate": 7.157440009468296e-06,
"loss": 2.4395,
"step": 12100
},
{
"epoch": 0.2887710660859686,
"grad_norm": 0.33063408732414246,
"learning_rate": 7.21661685948457e-06,
"loss": 2.4332,
"step": 12200
},
{
"epoch": 0.2911380420374929,
"grad_norm": 0.32114726305007935,
"learning_rate": 7.275793709500844e-06,
"loss": 2.4301,
"step": 12300
},
{
"epoch": 0.29350501798901724,
"grad_norm": 0.4075353741645813,
"learning_rate": 7.3349705595171175e-06,
"loss": 2.4333,
"step": 12400
},
{
"epoch": 0.2958719939405416,
"grad_norm": 0.3239745497703552,
"learning_rate": 7.394147409533391e-06,
"loss": 2.4323,
"step": 12500
},
{
"epoch": 0.2982389698920659,
"grad_norm": 0.4447726011276245,
"learning_rate": 7.4533242595496645e-06,
"loss": 2.4321,
"step": 12600
},
{
"epoch": 0.30060594584359024,
"grad_norm": 0.3478521406650543,
"learning_rate": 7.5125011095659385e-06,
"loss": 2.4246,
"step": 12700
},
{
"epoch": 0.30297292179511454,
"grad_norm": 0.35203248262405396,
"learning_rate": 7.571677959582212e-06,
"loss": 2.425,
"step": 12800
},
{
"epoch": 0.3053398977466389,
"grad_norm": 0.328659325838089,
"learning_rate": 7.630854809598486e-06,
"loss": 2.4367,
"step": 12900
},
{
"epoch": 0.30770687369816324,
"grad_norm": 0.3298031985759735,
"learning_rate": 7.69003165961476e-06,
"loss": 2.4273,
"step": 13000
},
{
"epoch": 0.31007384964968754,
"grad_norm": 0.3143956661224365,
"learning_rate": 7.749208509631032e-06,
"loss": 2.4292,
"step": 13100
},
{
"epoch": 0.3124408256012119,
"grad_norm": 0.33441880345344543,
"learning_rate": 7.808385359647306e-06,
"loss": 2.437,
"step": 13200
},
{
"epoch": 0.31480780155273624,
"grad_norm": 0.335602730512619,
"learning_rate": 7.86756220966358e-06,
"loss": 2.438,
"step": 13300
},
{
"epoch": 0.31717477750426054,
"grad_norm": 0.3256273865699768,
"learning_rate": 7.926739059679854e-06,
"loss": 2.4324,
"step": 13400
},
{
"epoch": 0.3195417534557849,
"grad_norm": 0.3533662259578705,
"learning_rate": 7.985915909696128e-06,
"loss": 2.4312,
"step": 13500
},
{
"epoch": 0.32190872940730925,
"grad_norm": 0.34541791677474976,
"learning_rate": 8.0450927597124e-06,
"loss": 2.4294,
"step": 13600
},
{
"epoch": 0.32427570535883354,
"grad_norm": 0.33559226989746094,
"learning_rate": 8.104269609728674e-06,
"loss": 2.4206,
"step": 13700
},
{
"epoch": 0.3266426813103579,
"grad_norm": 0.34667766094207764,
"learning_rate": 8.163446459744948e-06,
"loss": 2.4289,
"step": 13800
},
{
"epoch": 0.3290096572618822,
"grad_norm": 0.3094275891780853,
"learning_rate": 8.222623309761222e-06,
"loss": 2.4335,
"step": 13900
},
{
"epoch": 0.33137663321340655,
"grad_norm": 0.32228076457977295,
"learning_rate": 8.281800159777496e-06,
"loss": 2.4348,
"step": 14000
},
{
"epoch": 0.3337436091649309,
"grad_norm": 0.3154647946357727,
"learning_rate": 8.34097700979377e-06,
"loss": 2.4195,
"step": 14100
},
{
"epoch": 0.3361105851164552,
"grad_norm": 0.380206823348999,
"learning_rate": 8.400153859810042e-06,
"loss": 2.4257,
"step": 14200
},
{
"epoch": 0.33847756106797955,
"grad_norm": 0.32707059383392334,
"learning_rate": 8.459330709826316e-06,
"loss": 2.4279,
"step": 14300
},
{
"epoch": 0.3408445370195039,
"grad_norm": 0.3562242090702057,
"learning_rate": 8.51850755984259e-06,
"loss": 2.4433,
"step": 14400
},
{
"epoch": 0.3432115129710282,
"grad_norm": 0.3338697552680969,
"learning_rate": 8.577684409858864e-06,
"loss": 2.4378,
"step": 14500
},
{
"epoch": 0.34557848892255255,
"grad_norm": 0.3395216166973114,
"learning_rate": 8.636861259875138e-06,
"loss": 2.4274,
"step": 14600
},
{
"epoch": 0.3479454648740769,
"grad_norm": 0.32426705956459045,
"learning_rate": 8.696038109891412e-06,
"loss": 2.4268,
"step": 14700
},
{
"epoch": 0.3503124408256012,
"grad_norm": 0.3478586673736572,
"learning_rate": 8.755214959907686e-06,
"loss": 2.4247,
"step": 14800
},
{
"epoch": 0.35267941677712555,
"grad_norm": 0.3790106475353241,
"learning_rate": 8.81439180992396e-06,
"loss": 2.4372,
"step": 14900
},
{
"epoch": 0.35504639272864985,
"grad_norm": 0.3437531888484955,
"learning_rate": 8.873568659940232e-06,
"loss": 2.4193,
"step": 15000
},
{
"epoch": 0.3574133686801742,
"grad_norm": 0.3627135753631592,
"learning_rate": 8.932745509956506e-06,
"loss": 2.4343,
"step": 15100
},
{
"epoch": 0.35978034463169856,
"grad_norm": 0.3435176610946655,
"learning_rate": 8.99192235997278e-06,
"loss": 2.4231,
"step": 15200
},
{
"epoch": 0.36214732058322285,
"grad_norm": 0.3540484309196472,
"learning_rate": 9.051099209989052e-06,
"loss": 2.426,
"step": 15300
},
{
"epoch": 0.3645142965347472,
"grad_norm": 0.3281879723072052,
"learning_rate": 9.110276060005326e-06,
"loss": 2.4262,
"step": 15400
},
{
"epoch": 0.36688127248627156,
"grad_norm": 0.419574499130249,
"learning_rate": 9.1694529100216e-06,
"loss": 2.4103,
"step": 15500
},
{
"epoch": 0.36924824843779586,
"grad_norm": 0.38810306787490845,
"learning_rate": 9.228629760037874e-06,
"loss": 2.4288,
"step": 15600
},
{
"epoch": 0.3716152243893202,
"grad_norm": 0.3265315592288971,
"learning_rate": 9.287214841553986e-06,
"loss": 2.431,
"step": 15700
},
{
"epoch": 0.37398220034084456,
"grad_norm": 0.3964623510837555,
"learning_rate": 9.346391691570258e-06,
"loss": 2.4272,
"step": 15800
},
{
"epoch": 0.37634917629236886,
"grad_norm": 0.3374871611595154,
"learning_rate": 9.405568541586532e-06,
"loss": 2.4326,
"step": 15900
},
{
"epoch": 0.3787161522438932,
"grad_norm": 0.34002941846847534,
"learning_rate": 9.464745391602806e-06,
"loss": 2.4256,
"step": 16000
},
{
"epoch": 0.3810831281954175,
"grad_norm": 0.3714279234409332,
"learning_rate": 9.52392224161908e-06,
"loss": 2.4202,
"step": 16100
},
{
"epoch": 0.38345010414694186,
"grad_norm": 0.343189537525177,
"learning_rate": 9.583099091635353e-06,
"loss": 2.4168,
"step": 16200
},
{
"epoch": 0.3858170800984662,
"grad_norm": 0.33741703629493713,
"learning_rate": 9.642275941651626e-06,
"loss": 2.4185,
"step": 16300
},
{
"epoch": 0.3881840560499905,
"grad_norm": 0.3652304708957672,
"learning_rate": 9.7014527916679e-06,
"loss": 2.4272,
"step": 16400
},
{
"epoch": 0.39055103200151486,
"grad_norm": 0.3449861407279968,
"learning_rate": 9.760629641684174e-06,
"loss": 2.4048,
"step": 16500
},
{
"epoch": 0.3929180079530392,
"grad_norm": 0.344180703163147,
"learning_rate": 9.819806491700447e-06,
"loss": 2.4201,
"step": 16600
},
{
"epoch": 0.3952849839045635,
"grad_norm": 0.328961044549942,
"learning_rate": 9.878983341716721e-06,
"loss": 2.4252,
"step": 16700
},
{
"epoch": 0.39765195985608787,
"grad_norm": 0.3466714918613434,
"learning_rate": 9.938160191732995e-06,
"loss": 2.4082,
"step": 16800
},
{
"epoch": 0.4000189358076122,
"grad_norm": 0.3624398112297058,
"learning_rate": 9.99733704174927e-06,
"loss": 2.4275,
"step": 16900
},
{
"epoch": 0.4023859117591365,
"grad_norm": 0.35927194356918335,
"learning_rate": 1.0056513891765543e-05,
"loss": 2.4183,
"step": 17000
},
{
"epoch": 0.40475288771066087,
"grad_norm": 0.3643719255924225,
"learning_rate": 1.0115690741781815e-05,
"loss": 2.4299,
"step": 17100
},
{
"epoch": 0.40711986366218517,
"grad_norm": 0.3489636182785034,
"learning_rate": 1.017486759179809e-05,
"loss": 2.4105,
"step": 17200
},
{
"epoch": 0.4094868396137095,
"grad_norm": 0.3617055118083954,
"learning_rate": 1.0234044441814363e-05,
"loss": 2.4262,
"step": 17300
},
{
"epoch": 0.41185381556523387,
"grad_norm": 0.3670959174633026,
"learning_rate": 1.0293221291830637e-05,
"loss": 2.4253,
"step": 17400
},
{
"epoch": 0.41422079151675817,
"grad_norm": 0.4054628610610962,
"learning_rate": 1.0352398141846911e-05,
"loss": 2.4165,
"step": 17500
},
{
"epoch": 0.4165877674682825,
"grad_norm": 0.32820406556129456,
"learning_rate": 1.0411574991863185e-05,
"loss": 2.4156,
"step": 17600
},
{
"epoch": 0.4189547434198069,
"grad_norm": 0.3387589752674103,
"learning_rate": 1.0470751841879459e-05,
"loss": 2.4273,
"step": 17700
},
{
"epoch": 0.42132171937133117,
"grad_norm": 0.3759928047657013,
"learning_rate": 1.0529928691895733e-05,
"loss": 2.4311,
"step": 17800
},
{
"epoch": 0.4236886953228555,
"grad_norm": 0.38023602962493896,
"learning_rate": 1.0589105541912005e-05,
"loss": 2.4243,
"step": 17900
},
{
"epoch": 0.4260556712743799,
"grad_norm": 0.34721675515174866,
"learning_rate": 1.0648282391928279e-05,
"loss": 2.4188,
"step": 18000
},
{
"epoch": 0.4284226472259042,
"grad_norm": 0.34966644644737244,
"learning_rate": 1.0707459241944551e-05,
"loss": 2.4086,
"step": 18100
},
{
"epoch": 0.4307896231774285,
"grad_norm": 0.38616931438446045,
"learning_rate": 1.0766636091960825e-05,
"loss": 2.412,
"step": 18200
},
{
"epoch": 0.4331565991289528,
"grad_norm": 0.3381541967391968,
"learning_rate": 1.0825812941977099e-05,
"loss": 2.414,
"step": 18300
},
{
"epoch": 0.4355235750804772,
"grad_norm": 0.4827527105808258,
"learning_rate": 1.0884989791993373e-05,
"loss": 2.4125,
"step": 18400
},
{
"epoch": 0.43789055103200153,
"grad_norm": 0.3514668941497803,
"learning_rate": 1.0944166642009645e-05,
"loss": 2.4137,
"step": 18500
},
{
"epoch": 0.4402575269835258,
"grad_norm": 0.3542225956916809,
"learning_rate": 1.100334349202592e-05,
"loss": 2.4087,
"step": 18600
},
{
"epoch": 0.4426245029350502,
"grad_norm": 0.40214431285858154,
"learning_rate": 1.1062520342042193e-05,
"loss": 2.4242,
"step": 18700
},
{
"epoch": 0.44499147888657453,
"grad_norm": 0.34530532360076904,
"learning_rate": 1.1121697192058467e-05,
"loss": 2.4115,
"step": 18800
},
{
"epoch": 0.44735845483809883,
"grad_norm": 0.3892427384853363,
"learning_rate": 1.1180874042074741e-05,
"loss": 2.4158,
"step": 18900
},
{
"epoch": 0.4497254307896232,
"grad_norm": 0.3698406219482422,
"learning_rate": 1.1240050892091015e-05,
"loss": 2.4136,
"step": 19000
},
{
"epoch": 0.45209240674114753,
"grad_norm": 0.3435867726802826,
"learning_rate": 1.1299227742107289e-05,
"loss": 2.4181,
"step": 19100
},
{
"epoch": 0.45445938269267183,
"grad_norm": 0.3343878388404846,
"learning_rate": 1.1358404592123563e-05,
"loss": 2.4123,
"step": 19200
},
{
"epoch": 0.4568263586441962,
"grad_norm": 0.3319224417209625,
"learning_rate": 1.1417581442139835e-05,
"loss": 2.4179,
"step": 19300
},
{
"epoch": 0.4591933345957205,
"grad_norm": 0.36949145793914795,
"learning_rate": 1.1476758292156109e-05,
"loss": 2.4288,
"step": 19400
},
{
"epoch": 0.46156031054724483,
"grad_norm": 0.33672720193862915,
"learning_rate": 1.1535935142172383e-05,
"loss": 2.4283,
"step": 19500
},
{
"epoch": 0.4639272864987692,
"grad_norm": 0.36359962821006775,
"learning_rate": 1.1595111992188657e-05,
"loss": 2.4104,
"step": 19600
},
{
"epoch": 0.4662942624502935,
"grad_norm": 0.357768714427948,
"learning_rate": 1.165428884220493e-05,
"loss": 2.4005,
"step": 19700
},
{
"epoch": 0.46866123840181784,
"grad_norm": 0.35632389783859253,
"learning_rate": 1.1713465692221205e-05,
"loss": 2.4156,
"step": 19800
},
{
"epoch": 0.4710282143533422,
"grad_norm": 0.35454291105270386,
"learning_rate": 1.1772642542237479e-05,
"loss": 2.4075,
"step": 19900
},
{
"epoch": 0.4733951903048665,
"grad_norm": 0.337933212518692,
"learning_rate": 1.1831819392253752e-05,
"loss": 2.4119,
"step": 20000
},
{
"epoch": 0.47576216625639084,
"grad_norm": 0.36804336309432983,
"learning_rate": 1.1890996242270025e-05,
"loss": 2.4112,
"step": 20100
},
{
"epoch": 0.4781291422079152,
"grad_norm": 0.3589170575141907,
"learning_rate": 1.1950173092286299e-05,
"loss": 2.4111,
"step": 20200
},
{
"epoch": 0.4804961181594395,
"grad_norm": 0.4138932228088379,
"learning_rate": 1.2009349942302573e-05,
"loss": 2.4147,
"step": 20300
},
{
"epoch": 0.48286309411096384,
"grad_norm": 0.37294042110443115,
"learning_rate": 1.2068526792318846e-05,
"loss": 2.4199,
"step": 20400
},
{
"epoch": 0.48523007006248814,
"grad_norm": 0.34787285327911377,
"learning_rate": 1.212770364233512e-05,
"loss": 2.4125,
"step": 20500
},
{
"epoch": 0.4875970460140125,
"grad_norm": 0.33219948410987854,
"learning_rate": 1.2186880492351394e-05,
"loss": 2.4046,
"step": 20600
},
{
"epoch": 0.48996402196553684,
"grad_norm": 0.3547484278678894,
"learning_rate": 1.2246057342367668e-05,
"loss": 2.4178,
"step": 20700
},
{
"epoch": 0.49233099791706114,
"grad_norm": 0.33837926387786865,
"learning_rate": 1.2305234192383942e-05,
"loss": 2.403,
"step": 20800
},
{
"epoch": 0.4946979738685855,
"grad_norm": 0.35077232122421265,
"learning_rate": 1.2364411042400214e-05,
"loss": 2.4139,
"step": 20900
},
{
"epoch": 0.49706494982010985,
"grad_norm": 0.3571261167526245,
"learning_rate": 1.2422996123916324e-05,
"loss": 2.4001,
"step": 21000
},
{
"epoch": 0.49943192577163414,
"grad_norm": 0.36656296253204346,
"learning_rate": 1.2482172973932598e-05,
"loss": 2.406,
"step": 21100
},
{
"epoch": 0.5017989017231584,
"grad_norm": 0.3557038903236389,
"learning_rate": 1.2541349823948872e-05,
"loss": 2.41,
"step": 21200
},
{
"epoch": 0.5041658776746828,
"grad_norm": 0.361907035112381,
"learning_rate": 1.2600526673965146e-05,
"loss": 2.4106,
"step": 21300
},
{
"epoch": 0.5065328536262071,
"grad_norm": 0.34070518612861633,
"learning_rate": 1.2659703523981418e-05,
"loss": 2.4121,
"step": 21400
},
{
"epoch": 0.5088998295777315,
"grad_norm": 0.35266879200935364,
"learning_rate": 1.2718880373997692e-05,
"loss": 2.4051,
"step": 21500
},
{
"epoch": 0.5112668055292559,
"grad_norm": 0.39729219675064087,
"learning_rate": 1.2778057224013966e-05,
"loss": 2.4004,
"step": 21600
},
{
"epoch": 0.5136337814807802,
"grad_norm": 0.34886813163757324,
"learning_rate": 1.283723407403024e-05,
"loss": 2.4171,
"step": 21700
},
{
"epoch": 0.5160007574323044,
"grad_norm": 0.33244648575782776,
"learning_rate": 1.2896410924046514e-05,
"loss": 2.3979,
"step": 21800
},
{
"epoch": 0.5183677333838288,
"grad_norm": 0.3533230423927307,
"learning_rate": 1.2955587774062788e-05,
"loss": 2.4039,
"step": 21900
},
{
"epoch": 0.5207347093353532,
"grad_norm": 0.3643980920314789,
"learning_rate": 1.3014764624079062e-05,
"loss": 2.417,
"step": 22000
},
{
"epoch": 0.5231016852868775,
"grad_norm": 0.3681216835975647,
"learning_rate": 1.3073941474095336e-05,
"loss": 2.4028,
"step": 22100
},
{
"epoch": 0.5254686612384019,
"grad_norm": 0.3376631438732147,
"learning_rate": 1.3133118324111608e-05,
"loss": 2.4044,
"step": 22200
},
{
"epoch": 0.5278356371899261,
"grad_norm": 0.3588080108165741,
"learning_rate": 1.3192295174127882e-05,
"loss": 2.4152,
"step": 22300
},
{
"epoch": 0.5302026131414505,
"grad_norm": 0.35474061965942383,
"learning_rate": 1.3251472024144156e-05,
"loss": 2.3962,
"step": 22400
},
{
"epoch": 0.5325695890929748,
"grad_norm": 0.36065080761909485,
"learning_rate": 1.331064887416043e-05,
"loss": 2.4035,
"step": 22500
},
{
"epoch": 0.5349365650444992,
"grad_norm": 0.34817591309547424,
"learning_rate": 1.3369825724176704e-05,
"loss": 2.4108,
"step": 22600
},
{
"epoch": 0.5373035409960235,
"grad_norm": 0.33565661311149597,
"learning_rate": 1.3429002574192978e-05,
"loss": 2.403,
"step": 22700
},
{
"epoch": 0.5396705169475479,
"grad_norm": 0.34676095843315125,
"learning_rate": 1.3488179424209252e-05,
"loss": 2.4056,
"step": 22800
},
{
"epoch": 0.5420374928990721,
"grad_norm": 0.3674164116382599,
"learning_rate": 1.3547356274225526e-05,
"loss": 2.4061,
"step": 22900
},
{
"epoch": 0.5444044688505965,
"grad_norm": 0.3376142978668213,
"learning_rate": 1.3605941355741634e-05,
"loss": 2.4158,
"step": 23000
},
{
"epoch": 0.5467714448021208,
"grad_norm": 0.3908544182777405,
"learning_rate": 1.3665118205757908e-05,
"loss": 2.4022,
"step": 23100
},
{
"epoch": 0.5491384207536452,
"grad_norm": 0.38587990403175354,
"learning_rate": 1.3724295055774182e-05,
"loss": 2.4171,
"step": 23200
},
{
"epoch": 0.5515053967051695,
"grad_norm": 0.3695133924484253,
"learning_rate": 1.3783471905790456e-05,
"loss": 2.3997,
"step": 23300
},
{
"epoch": 0.5538723726566938,
"grad_norm": 0.3392127454280853,
"learning_rate": 1.384264875580673e-05,
"loss": 2.4157,
"step": 23400
},
{
"epoch": 0.5562393486082181,
"grad_norm": 0.3664696216583252,
"learning_rate": 1.3901825605823004e-05,
"loss": 2.4123,
"step": 23500
},
{
"epoch": 0.5586063245597425,
"grad_norm": 0.3691762387752533,
"learning_rate": 1.3961002455839276e-05,
"loss": 2.3994,
"step": 23600
},
{
"epoch": 0.5609733005112668,
"grad_norm": 0.3565746247768402,
"learning_rate": 1.402017930585555e-05,
"loss": 2.4027,
"step": 23700
},
{
"epoch": 0.5633402764627912,
"grad_norm": 0.3518475890159607,
"learning_rate": 1.4079356155871824e-05,
"loss": 2.3937,
"step": 23800
},
{
"epoch": 0.5657072524143155,
"grad_norm": 0.34867557883262634,
"learning_rate": 1.4138533005888098e-05,
"loss": 2.4,
"step": 23900
},
{
"epoch": 0.5680742283658398,
"grad_norm": 0.35145652294158936,
"learning_rate": 1.4197709855904371e-05,
"loss": 2.4044,
"step": 24000
},
{
"epoch": 0.5704412043173641,
"grad_norm": 0.3380683958530426,
"learning_rate": 1.4256886705920645e-05,
"loss": 2.4139,
"step": 24100
},
{
"epoch": 0.5728081802688885,
"grad_norm": 0.3554782569408417,
"learning_rate": 1.431606355593692e-05,
"loss": 2.395,
"step": 24200
},
{
"epoch": 0.5751751562204128,
"grad_norm": 0.39881500601768494,
"learning_rate": 1.4375240405953193e-05,
"loss": 2.3942,
"step": 24300
},
{
"epoch": 0.5775421321719372,
"grad_norm": 0.37088507413864136,
"learning_rate": 1.4434417255969465e-05,
"loss": 2.4092,
"step": 24400
},
{
"epoch": 0.5799091081234614,
"grad_norm": 0.3711656630039215,
"learning_rate": 1.449359410598574e-05,
"loss": 2.4184,
"step": 24500
},
{
"epoch": 0.5822760840749858,
"grad_norm": 0.33910948038101196,
"learning_rate": 1.4552770956002013e-05,
"loss": 2.3916,
"step": 24600
},
{
"epoch": 0.5846430600265101,
"grad_norm": 0.35600873827934265,
"learning_rate": 1.4611947806018287e-05,
"loss": 2.4008,
"step": 24700
},
{
"epoch": 0.5870100359780345,
"grad_norm": 0.35309475660324097,
"learning_rate": 1.4671124656034561e-05,
"loss": 2.3979,
"step": 24800
},
{
"epoch": 0.5893770119295588,
"grad_norm": 0.3425716459751129,
"learning_rate": 1.4730301506050835e-05,
"loss": 2.4015,
"step": 24900
},
{
"epoch": 0.5917439878810832,
"grad_norm": 0.3652407228946686,
"learning_rate": 1.4789478356067109e-05,
"loss": 2.3957,
"step": 25000
},
{
"epoch": 0.5941109638326074,
"grad_norm": 0.3365596830844879,
"learning_rate": 1.4848655206083383e-05,
"loss": 2.3913,
"step": 25100
},
{
"epoch": 0.5964779397841318,
"grad_norm": 0.35885608196258545,
"learning_rate": 1.4907832056099655e-05,
"loss": 2.3903,
"step": 25200
},
{
"epoch": 0.5988449157356561,
"grad_norm": 0.38684821128845215,
"learning_rate": 1.4966417137615765e-05,
"loss": 2.3876,
"step": 25300
},
{
"epoch": 0.6012118916871805,
"grad_norm": 0.3497035503387451,
"learning_rate": 1.5025593987632039e-05,
"loss": 2.3874,
"step": 25400
},
{
"epoch": 0.6035788676387048,
"grad_norm": 0.3431876599788666,
"learning_rate": 1.5084770837648313e-05,
"loss": 2.39,
"step": 25500
},
{
"epoch": 0.6059458435902291,
"grad_norm": 0.35600966215133667,
"learning_rate": 1.5143947687664587e-05,
"loss": 2.4009,
"step": 25600
},
{
"epoch": 0.6083128195417534,
"grad_norm": 0.33623310923576355,
"learning_rate": 1.5203124537680861e-05,
"loss": 2.3981,
"step": 25700
},
{
"epoch": 0.6106797954932778,
"grad_norm": 0.33237648010253906,
"learning_rate": 1.5262301387697135e-05,
"loss": 2.4036,
"step": 25800
},
{
"epoch": 0.6130467714448021,
"grad_norm": 0.35398033261299133,
"learning_rate": 1.532147823771341e-05,
"loss": 2.3988,
"step": 25900
},
{
"epoch": 0.6154137473963265,
"grad_norm": 0.47366973757743835,
"learning_rate": 1.5380655087729683e-05,
"loss": 2.4013,
"step": 26000
},
{
"epoch": 0.6177807233478508,
"grad_norm": 0.339417427778244,
"learning_rate": 1.5439831937745957e-05,
"loss": 2.4069,
"step": 26100
},
{
"epoch": 0.6201476992993751,
"grad_norm": 0.3327637016773224,
"learning_rate": 1.5499008787762227e-05,
"loss": 2.3921,
"step": 26200
},
{
"epoch": 0.6225146752508994,
"grad_norm": 0.3412494659423828,
"learning_rate": 1.55581856377785e-05,
"loss": 2.379,
"step": 26300
},
{
"epoch": 0.6248816512024238,
"grad_norm": 0.3637641668319702,
"learning_rate": 1.5617362487794775e-05,
"loss": 2.3911,
"step": 26400
},
{
"epoch": 0.6272486271539481,
"grad_norm": 0.4117577373981476,
"learning_rate": 1.567653933781105e-05,
"loss": 2.391,
"step": 26500
},
{
"epoch": 0.6296156031054725,
"grad_norm": 0.3605392575263977,
"learning_rate": 1.5735716187827323e-05,
"loss": 2.3961,
"step": 26600
},
{
"epoch": 0.6319825790569967,
"grad_norm": 0.35646742582321167,
"learning_rate": 1.5794893037843597e-05,
"loss": 2.3969,
"step": 26700
},
{
"epoch": 0.6343495550085211,
"grad_norm": 0.3432878851890564,
"learning_rate": 1.585406988785987e-05,
"loss": 2.3939,
"step": 26800
},
{
"epoch": 0.6367165309600454,
"grad_norm": 0.3541545569896698,
"learning_rate": 1.5913246737876145e-05,
"loss": 2.4079,
"step": 26900
},
{
"epoch": 0.6390835069115698,
"grad_norm": 0.3709736168384552,
"learning_rate": 1.597242358789242e-05,
"loss": 2.4119,
"step": 27000
},
{
"epoch": 0.6414504828630941,
"grad_norm": 0.32629159092903137,
"learning_rate": 1.6031600437908692e-05,
"loss": 2.3905,
"step": 27100
},
{
"epoch": 0.6438174588146185,
"grad_norm": 0.4810309410095215,
"learning_rate": 1.6090777287924966e-05,
"loss": 2.3926,
"step": 27200
},
{
"epoch": 0.6461844347661427,
"grad_norm": 0.37358030676841736,
"learning_rate": 1.614995413794124e-05,
"loss": 2.3836,
"step": 27300
},
{
"epoch": 0.6485514107176671,
"grad_norm": 0.36473044753074646,
"learning_rate": 1.6209130987957514e-05,
"loss": 2.39,
"step": 27400
},
{
"epoch": 0.6509183866691914,
"grad_norm": 0.32987740635871887,
"learning_rate": 1.6268307837973788e-05,
"loss": 2.3925,
"step": 27500
},
{
"epoch": 0.6532853626207158,
"grad_norm": 0.34442269802093506,
"learning_rate": 1.6327484687990062e-05,
"loss": 2.4023,
"step": 27600
},
{
"epoch": 0.6556523385722401,
"grad_norm": 0.3745739161968231,
"learning_rate": 1.6386661538006333e-05,
"loss": 2.4047,
"step": 27700
},
{
"epoch": 0.6580193145237644,
"grad_norm": 0.3746493458747864,
"learning_rate": 1.6445838388022607e-05,
"loss": 2.4005,
"step": 27800
},
{
"epoch": 0.6603862904752887,
"grad_norm": 0.32949355244636536,
"learning_rate": 1.650501523803888e-05,
"loss": 2.3875,
"step": 27900
},
{
"epoch": 0.6627532664268131,
"grad_norm": 0.331719309091568,
"learning_rate": 1.6564192088055154e-05,
"loss": 2.3876,
"step": 28000
},
{
"epoch": 0.6651202423783374,
"grad_norm": 0.34970593452453613,
"learning_rate": 1.662336893807143e-05,
"loss": 2.3995,
"step": 28100
},
{
"epoch": 0.6674872183298618,
"grad_norm": 0.3494050204753876,
"learning_rate": 1.6682545788087702e-05,
"loss": 2.3852,
"step": 28200
},
{
"epoch": 0.6698541942813862,
"grad_norm": 0.31740233302116394,
"learning_rate": 1.6741722638103973e-05,
"loss": 2.3953,
"step": 28300
},
{
"epoch": 0.6722211702329104,
"grad_norm": 0.3360515236854553,
"learning_rate": 1.6800899488120247e-05,
"loss": 2.3911,
"step": 28400
},
{
"epoch": 0.6745881461844347,
"grad_norm": 0.3421274721622467,
"learning_rate": 1.685948456963636e-05,
"loss": 2.404,
"step": 28500
},
{
"epoch": 0.6769551221359591,
"grad_norm": 0.33647575974464417,
"learning_rate": 1.6918661419652632e-05,
"loss": 2.3986,
"step": 28600
},
{
"epoch": 0.6793220980874835,
"grad_norm": 0.33582180738449097,
"learning_rate": 1.6977838269668906e-05,
"loss": 2.3948,
"step": 28700
},
{
"epoch": 0.6816890740390078,
"grad_norm": 0.34744688868522644,
"learning_rate": 1.703701511968518e-05,
"loss": 2.3921,
"step": 28800
},
{
"epoch": 0.684056049990532,
"grad_norm": 0.3513332009315491,
"learning_rate": 1.7096191969701454e-05,
"loss": 2.397,
"step": 28900
},
{
"epoch": 0.6864230259420564,
"grad_norm": 0.35616153478622437,
"learning_rate": 1.7155368819717728e-05,
"loss": 2.3922,
"step": 29000
},
{
"epoch": 0.6887900018935808,
"grad_norm": 0.3601691424846649,
"learning_rate": 1.7214545669734002e-05,
"loss": 2.3886,
"step": 29100
},
{
"epoch": 0.6911569778451051,
"grad_norm": 0.3415214419364929,
"learning_rate": 1.7273722519750276e-05,
"loss": 2.3836,
"step": 29200
},
{
"epoch": 0.6935239537966295,
"grad_norm": 0.3496253788471222,
"learning_rate": 1.733289936976655e-05,
"loss": 2.3832,
"step": 29300
},
{
"epoch": 0.6958909297481538,
"grad_norm": 0.32848358154296875,
"learning_rate": 1.7392076219782824e-05,
"loss": 2.3839,
"step": 29400
},
{
"epoch": 0.698257905699678,
"grad_norm": 0.3362344801425934,
"learning_rate": 1.7451253069799098e-05,
"loss": 2.3878,
"step": 29500
},
{
"epoch": 0.7006248816512024,
"grad_norm": 0.34034013748168945,
"learning_rate": 1.751042991981537e-05,
"loss": 2.3841,
"step": 29600
},
{
"epoch": 0.7029918576027268,
"grad_norm": 0.34850838780403137,
"learning_rate": 1.7569606769831646e-05,
"loss": 2.3893,
"step": 29700
},
{
"epoch": 0.7053588335542511,
"grad_norm": 0.34481024742126465,
"learning_rate": 1.762878361984792e-05,
"loss": 2.3746,
"step": 29800
},
{
"epoch": 0.7077258095057755,
"grad_norm": 0.319324254989624,
"learning_rate": 1.768796046986419e-05,
"loss": 2.3909,
"step": 29900
},
{
"epoch": 0.7100927854572997,
"grad_norm": 0.3310067057609558,
"learning_rate": 1.7747137319880464e-05,
"loss": 2.3859,
"step": 30000
},
{
"epoch": 0.712459761408824,
"grad_norm": 0.34449535608291626,
"learning_rate": 1.7806314169896738e-05,
"loss": 2.4031,
"step": 30100
},
{
"epoch": 0.7148267373603484,
"grad_norm": 0.36738091707229614,
"learning_rate": 1.7865491019913012e-05,
"loss": 2.3877,
"step": 30200
},
{
"epoch": 0.7171937133118728,
"grad_norm": 0.3570147752761841,
"learning_rate": 1.7924667869929286e-05,
"loss": 2.3921,
"step": 30300
},
{
"epoch": 0.7195606892633971,
"grad_norm": 0.32705631852149963,
"learning_rate": 1.798384471994556e-05,
"loss": 2.3844,
"step": 30400
},
{
"epoch": 0.7219276652149215,
"grad_norm": 0.3508467972278595,
"learning_rate": 1.804302156996183e-05,
"loss": 2.375,
"step": 30500
},
{
"epoch": 0.7242946411664457,
"grad_norm": 0.3959505558013916,
"learning_rate": 1.8102198419978104e-05,
"loss": 2.3893,
"step": 30600
},
{
"epoch": 0.7266616171179701,
"grad_norm": 0.3338560163974762,
"learning_rate": 1.8161375269994378e-05,
"loss": 2.3803,
"step": 30700
},
{
"epoch": 0.7290285930694944,
"grad_norm": 0.3438529968261719,
"learning_rate": 1.8220552120010652e-05,
"loss": 2.383,
"step": 30800
},
{
"epoch": 0.7313955690210188,
"grad_norm": 0.34159713983535767,
"learning_rate": 1.8279728970026926e-05,
"loss": 2.381,
"step": 30900
},
{
"epoch": 0.7337625449725431,
"grad_norm": 0.38974571228027344,
"learning_rate": 1.83389058200432e-05,
"loss": 2.3779,
"step": 31000
},
{
"epoch": 0.7361295209240674,
"grad_norm": 0.3364710211753845,
"learning_rate": 1.8398082670059474e-05,
"loss": 2.3846,
"step": 31100
},
{
"epoch": 0.7384964968755917,
"grad_norm": 0.39294859766960144,
"learning_rate": 1.8457259520075748e-05,
"loss": 2.3857,
"step": 31200
},
{
"epoch": 0.7408634728271161,
"grad_norm": 0.35359159111976624,
"learning_rate": 1.851643637009202e-05,
"loss": 2.3821,
"step": 31300
},
{
"epoch": 0.7432304487786404,
"grad_norm": 0.37089574337005615,
"learning_rate": 1.8575613220108295e-05,
"loss": 2.394,
"step": 31400
},
{
"epoch": 0.7455974247301648,
"grad_norm": 0.32074281573295593,
"learning_rate": 1.863479007012457e-05,
"loss": 2.3854,
"step": 31500
},
{
"epoch": 0.7479644006816891,
"grad_norm": 0.3406684696674347,
"learning_rate": 1.8693966920140843e-05,
"loss": 2.3822,
"step": 31600
},
{
"epoch": 0.7503313766332134,
"grad_norm": 0.3442894220352173,
"learning_rate": 1.8753143770157117e-05,
"loss": 2.3782,
"step": 31700
},
{
"epoch": 0.7526983525847377,
"grad_norm": 0.3537774682044983,
"learning_rate": 1.881172885167323e-05,
"loss": 2.3846,
"step": 31800
},
{
"epoch": 0.7550653285362621,
"grad_norm": 0.31586501002311707,
"learning_rate": 1.8870905701689503e-05,
"loss": 2.3876,
"step": 31900
},
{
"epoch": 0.7574323044877864,
"grad_norm": 0.35079076886177063,
"learning_rate": 1.8930082551705777e-05,
"loss": 2.3891,
"step": 32000
},
{
"epoch": 0.7597992804393108,
"grad_norm": 0.3363019824028015,
"learning_rate": 1.8989259401722047e-05,
"loss": 2.3933,
"step": 32100
},
{
"epoch": 0.762166256390835,
"grad_norm": 0.32039549946784973,
"learning_rate": 1.904843625173832e-05,
"loss": 2.3585,
"step": 32200
},
{
"epoch": 0.7645332323423594,
"grad_norm": 0.33742275834083557,
"learning_rate": 1.9107613101754595e-05,
"loss": 2.3809,
"step": 32300
},
{
"epoch": 0.7669002082938837,
"grad_norm": 0.3437131941318512,
"learning_rate": 1.916678995177087e-05,
"loss": 2.3811,
"step": 32400
},
{
"epoch": 0.7692671842454081,
"grad_norm": 0.3589881658554077,
"learning_rate": 1.9225966801787143e-05,
"loss": 2.3763,
"step": 32500
},
{
"epoch": 0.7716341601969324,
"grad_norm": 0.36550530791282654,
"learning_rate": 1.9285143651803414e-05,
"loss": 2.3804,
"step": 32600
},
{
"epoch": 0.7740011361484568,
"grad_norm": 0.3241026699542999,
"learning_rate": 1.9344320501819688e-05,
"loss": 2.3908,
"step": 32700
},
{
"epoch": 0.776368112099981,
"grad_norm": 0.33091387152671814,
"learning_rate": 1.940349735183596e-05,
"loss": 2.378,
"step": 32800
},
{
"epoch": 0.7787350880515054,
"grad_norm": 0.31871795654296875,
"learning_rate": 1.9462674201852235e-05,
"loss": 2.3837,
"step": 32900
},
{
"epoch": 0.7811020640030297,
"grad_norm": 0.331828773021698,
"learning_rate": 1.952185105186851e-05,
"loss": 2.3774,
"step": 33000
},
{
"epoch": 0.7834690399545541,
"grad_norm": 0.33192068338394165,
"learning_rate": 1.9581027901884783e-05,
"loss": 2.3812,
"step": 33100
},
{
"epoch": 0.7858360159060784,
"grad_norm": 0.3415600657463074,
"learning_rate": 1.9640204751901057e-05,
"loss": 2.3754,
"step": 33200
},
{
"epoch": 0.7882029918576027,
"grad_norm": 0.30927810072898865,
"learning_rate": 1.969938160191733e-05,
"loss": 2.3844,
"step": 33300
},
{
"epoch": 0.790569967809127,
"grad_norm": 0.3214524984359741,
"learning_rate": 1.9758558451933605e-05,
"loss": 2.3678,
"step": 33400
},
{
"epoch": 0.7929369437606514,
"grad_norm": 0.3286936581134796,
"learning_rate": 1.981773530194988e-05,
"loss": 2.3848,
"step": 33500
},
{
"epoch": 0.7953039197121757,
"grad_norm": 0.33375072479248047,
"learning_rate": 1.9876912151966153e-05,
"loss": 2.3737,
"step": 33600
},
{
"epoch": 0.7976708956637001,
"grad_norm": 0.3241300582885742,
"learning_rate": 1.9936089001982427e-05,
"loss": 2.3662,
"step": 33700
},
{
"epoch": 0.8000378716152244,
"grad_norm": 0.34323224425315857,
"learning_rate": 1.99952658519987e-05,
"loss": 2.3809,
"step": 33800
},
{
"epoch": 0.8024048475667487,
"grad_norm": 0.3225324749946594,
"learning_rate": 1.9994152275965527e-05,
"loss": 2.3724,
"step": 33900
},
{
"epoch": 0.804771823518273,
"grad_norm": 0.3365699350833893,
"learning_rate": 1.997453922456623e-05,
"loss": 2.3759,
"step": 34000
},
{
"epoch": 0.8071387994697974,
"grad_norm": 0.32580050826072693,
"learning_rate": 1.994114372491635e-05,
"loss": 2.3733,
"step": 34100
},
{
"epoch": 0.8095057754213217,
"grad_norm": 0.3402758836746216,
"learning_rate": 1.989455103627163e-05,
"loss": 2.3742,
"step": 34200
},
{
"epoch": 0.8118727513728461,
"grad_norm": 0.3205104470252991,
"learning_rate": 1.983388438172617e-05,
"loss": 2.3704,
"step": 34300
},
{
"epoch": 0.8142397273243703,
"grad_norm": 0.3125210404396057,
"learning_rate": 1.975962963057375e-05,
"loss": 2.3652,
"step": 34400
},
{
"epoch": 0.8166067032758947,
"grad_norm": 0.3083760142326355,
"learning_rate": 1.9671889385274698e-05,
"loss": 2.3782,
"step": 34500
},
{
"epoch": 0.818973679227419,
"grad_norm": 0.3169231116771698,
"learning_rate": 1.9570784882044856e-05,
"loss": 2.3826,
"step": 34600
},
{
"epoch": 0.8213406551789434,
"grad_norm": 0.30974331498146057,
"learning_rate": 1.945645582333587e-05,
"loss": 2.3741,
"step": 34700
},
{
"epoch": 0.8237076311304677,
"grad_norm": 0.34712207317352295,
"learning_rate": 1.93290601847995e-05,
"loss": 2.3839,
"step": 34800
},
{
"epoch": 0.8260746070819921,
"grad_norm": 0.3297557234764099,
"learning_rate": 1.918877399700279e-05,
"loss": 2.3762,
"step": 34900
},
{
"epoch": 0.8284415830335163,
"grad_norm": 0.3331148326396942,
"learning_rate": 1.9035791102195484e-05,
"loss": 2.3759,
"step": 35000
},
{
"epoch": 0.8308085589850407,
"grad_norm": 0.3134233057498932,
"learning_rate": 1.8870322886466053e-05,
"loss": 2.3715,
"step": 35100
},
{
"epoch": 0.833175534936565,
"grad_norm": 0.3077858090400696,
"learning_rate": 1.8692597987656205e-05,
"loss": 2.3652,
"step": 35200
},
{
"epoch": 0.8355425108880894,
"grad_norm": 0.3141195476055145,
"learning_rate": 1.8502861979437626e-05,
"loss": 2.3677,
"step": 35300
},
{
"epoch": 0.8379094868396137,
"grad_norm": 0.3238203525543213,
"learning_rate": 1.8301377031987363e-05,
"loss": 2.368,
"step": 35400
},
{
"epoch": 0.840276462791138,
"grad_norm": 0.32180941104888916,
"learning_rate": 1.8088421549730826e-05,
"loss": 2.3654,
"step": 35500
},
{
"epoch": 0.8426434387426623,
"grad_norm": 0.3173375427722931,
"learning_rate": 1.7864289786652865e-05,
"loss": 2.3708,
"step": 35600
},
{
"epoch": 0.8450104146941867,
"grad_norm": 0.3098245859146118,
"learning_rate": 1.762929143970854e-05,
"loss": 2.3847,
"step": 35700
},
{
"epoch": 0.847377390645711,
"grad_norm": 0.3169116675853729,
"learning_rate": 1.7383751220895348e-05,
"loss": 2.3849,
"step": 35800
},
{
"epoch": 0.8497443665972354,
"grad_norm": 0.2940201461315155,
"learning_rate": 1.7128008408578232e-05,
"loss": 2.3777,
"step": 35900
},
{
"epoch": 0.8521113425487598,
"grad_norm": 0.3399713635444641,
"learning_rate": 1.686241637868734e-05,
"loss": 2.3686,
"step": 36000
},
{
"epoch": 0.854478318500284,
"grad_norm": 0.319431871175766,
"learning_rate": 1.658734211643625e-05,
"loss": 2.3656,
"step": 36100
},
{
"epoch": 0.8568452944518083,
"grad_norm": 0.3360809087753296,
"learning_rate": 1.6303165709235443e-05,
"loss": 2.3782,
"step": 36200
},
{
"epoch": 0.8592122704033327,
"grad_norm": 0.3362599015235901,
"learning_rate": 1.6010279821501603e-05,
"loss": 2.3838,
"step": 36300
},
{
"epoch": 0.861579246354857,
"grad_norm": 0.32247358560562134,
"learning_rate": 1.5709089152088488e-05,
"loss": 2.3708,
"step": 36400
},
{
"epoch": 0.8639462223063814,
"grad_norm": 0.3041239380836487,
"learning_rate": 1.5400009875089087e-05,
"loss": 2.3754,
"step": 36500
},
{
"epoch": 0.8663131982579056,
"grad_norm": 0.3329671323299408,
"learning_rate": 1.5083469064781687e-05,
"loss": 2.3611,
"step": 36600
},
{
"epoch": 0.86868017420943,
"grad_norm": 0.3081216812133789,
"learning_rate": 1.475990410551448e-05,
"loss": 2.3697,
"step": 36700
},
{
"epoch": 0.8710471501609544,
"grad_norm": 0.3056845963001251,
"learning_rate": 1.4429762087344101e-05,
"loss": 2.3602,
"step": 36800
},
{
"epoch": 0.8734141261124787,
"grad_norm": 0.3348017632961273,
"learning_rate": 1.4093499188263166e-05,
"loss": 2.3688,
"step": 36900
},
{
"epoch": 0.8757811020640031,
"grad_norm": 0.3079584240913391,
"learning_rate": 1.3751580043870465e-05,
"loss": 2.3741,
"step": 37000
},
{
"epoch": 0.8781480780155274,
"grad_norm": 0.33923518657684326,
"learning_rate": 1.3407972225319847e-05,
"loss": 2.3628,
"step": 37100
},
{
"epoch": 0.8805150539670517,
"grad_norm": 0.325127512216568,
"learning_rate": 1.3056209752459611e-05,
"loss": 2.3621,
"step": 37200
},
{
"epoch": 0.882882029918576,
"grad_norm": 0.32378092408180237,
"learning_rate": 1.270022432234713e-05,
"loss": 2.3662,
"step": 37300
},
{
"epoch": 0.8852490058701004,
"grad_norm": 0.3274565637111664,
"learning_rate": 1.2340507822442868e-05,
"loss": 2.3665,
"step": 37400
},
{
"epoch": 0.8876159818216247,
"grad_norm": 0.33395031094551086,
"learning_rate": 1.1977557295661108e-05,
"loss": 2.3616,
"step": 37500
},
{
"epoch": 0.8899829577731491,
"grad_norm": 0.3172805607318878,
"learning_rate": 1.1611874253574492e-05,
"loss": 2.3676,
"step": 37600
},
{
"epoch": 0.8923499337246733,
"grad_norm": 0.34134411811828613,
"learning_rate": 1.1243963983443936e-05,
"loss": 2.361,
"step": 37700
},
{
"epoch": 0.8947169096761977,
"grad_norm": 0.3161686658859253,
"learning_rate": 1.0874334850031435e-05,
"loss": 2.3653,
"step": 37800
},
{
"epoch": 0.897083885627722,
"grad_norm": 0.31952333450317383,
"learning_rate": 1.0503497593160507e-05,
"loss": 2.3689,
"step": 37900
},
{
"epoch": 0.8994508615792464,
"grad_norm": 0.3199727237224579,
"learning_rate": 1.0131964621994832e-05,
"loss": 2.3679,
"step": 38000
},
{
"epoch": 0.9018178375307707,
"grad_norm": 0.3248251676559448,
"learning_rate": 9.760249307010301e-06,
"loss": 2.3718,
"step": 38100
},
{
"epoch": 0.9041848134822951,
"grad_norm": 0.32015731930732727,
"learning_rate": 9.388865270638724e-06,
"loss": 2.3594,
"step": 38200
},
{
"epoch": 0.9065517894338193,
"grad_norm": 0.336444228887558,
"learning_rate": 9.018325677563413e-06,
"loss": 2.3677,
"step": 38300
},
{
"epoch": 0.9089187653853437,
"grad_norm": 0.3233816623687744,
"learning_rate": 8.649142525647271e-06,
"loss": 2.3651,
"step": 38400
},
{
"epoch": 0.911285741336868,
"grad_norm": 0.3261754512786865,
"learning_rate": 8.281825938473116e-06,
"loss": 2.3586,
"step": 38500
},
{
"epoch": 0.9136527172883924,
"grad_norm": 0.30703264474868774,
"learning_rate": 7.916883460473865e-06,
"loss": 2.3668,
"step": 38600
},
{
"epoch": 0.9160196932399167,
"grad_norm": 0.3308265507221222,
"learning_rate": 7.554819355626455e-06,
"loss": 2.3536,
"step": 38700
},
{
"epoch": 0.918386669191441,
"grad_norm": 0.35222479701042175,
"learning_rate": 7.196133910678582e-06,
"loss": 2.3635,
"step": 38800
},
{
"epoch": 0.9207536451429653,
"grad_norm": 0.3244943618774414,
"learning_rate": 6.841322743871041e-06,
"loss": 2.3705,
"step": 38900
},
{
"epoch": 0.9231206210944897,
"grad_norm": 0.3116552233695984,
"learning_rate": 6.490876120110827e-06,
"loss": 2.3611,
"step": 39000
},
{
"epoch": 0.925487597046014,
"grad_norm": 0.33249032497406006,
"learning_rate": 6.145278273541281e-06,
"loss": 2.3585,
"step": 39100
},
{
"epoch": 0.9278545729975384,
"grad_norm": 0.3132554888725281,
"learning_rate": 5.805006738445294e-06,
"loss": 2.368,
"step": 39200
},
{
"epoch": 0.9302215489490627,
"grad_norm": 0.3264056444168091,
"learning_rate": 5.4705316894061765e-06,
"loss": 2.3635,
"step": 39300
},
{
"epoch": 0.932588524900587,
"grad_norm": 0.3539658188819885,
"learning_rate": 5.142315291637857e-06,
"loss": 2.3624,
"step": 39400
},
{
"epoch": 0.9349555008521113,
"grad_norm": 0.32513052225112915,
"learning_rate": 4.823991412773918e-06,
"loss": 2.3714,
"step": 39500
},
{
"epoch": 0.9373224768036357,
"grad_norm": 0.3148360252380371,
"learning_rate": 4.509569863501355e-06,
"loss": 2.3587,
"step": 39600
},
{
"epoch": 0.93968945275516,
"grad_norm": 0.33431729674339294,
"learning_rate": 4.202734786899464e-06,
"loss": 2.3719,
"step": 39700
},
{
"epoch": 0.9420564287066844,
"grad_norm": 0.3179948031902313,
"learning_rate": 3.903910156293686e-06,
"loss": 2.3668,
"step": 39800
},
{
"epoch": 0.9444234046582086,
"grad_norm": 0.3322046101093292,
"learning_rate": 3.613508876472357e-06,
"loss": 2.3645,
"step": 39900
},
{
"epoch": 0.946790380609733,
"grad_norm": 0.328708291053772,
"learning_rate": 3.331932213150203e-06,
"loss": 2.3592,
"step": 40000
},
{
"epoch": 0.9491573565612573,
"grad_norm": 0.31054648756980896,
"learning_rate": 3.0595692385142717e-06,
"loss": 2.373,
"step": 40100
},
{
"epoch": 0.9515243325127817,
"grad_norm": 0.3367001414299011,
"learning_rate": 2.79679629361839e-06,
"loss": 2.3614,
"step": 40200
},
{
"epoch": 0.953891308464306,
"grad_norm": 0.32708635926246643,
"learning_rate": 2.543976468369088e-06,
"loss": 2.3541,
"step": 40300
},
{
"epoch": 0.9562582844158304,
"grad_norm": 0.32008349895477295,
"learning_rate": 2.301459099821417e-06,
"loss": 2.3742,
"step": 40400
},
{
"epoch": 0.9586252603673546,
"grad_norm": 0.3486650288105011,
"learning_rate": 2.0695792894779788e-06,
"loss": 2.3553,
"step": 40500
},
{
"epoch": 0.960992236318879,
"grad_norm": 0.32483014464378357,
"learning_rate": 1.8486574402580858e-06,
"loss": 2.3573,
"step": 40600
},
{
"epoch": 0.9633592122704033,
"grad_norm": 0.3283023536205292,
"learning_rate": 1.6389988137769153e-06,
"loss": 2.3715,
"step": 40700
},
{
"epoch": 0.9657261882219277,
"grad_norm": 0.3350400924682617,
"learning_rate": 1.4408931085463206e-06,
"loss": 2.3757,
"step": 40800
},
{
"epoch": 0.968093164173452,
"grad_norm": 0.32112061977386475,
"learning_rate": 1.2564174493396274e-06,
"loss": 2.3816,
"step": 40900
},
{
"epoch": 0.9704601401249763,
"grad_norm": 0.3097105324268341,
"learning_rate": 1.0821003902626947e-06,
"loss": 2.365,
"step": 41000
},
{
"epoch": 0.9728271160765006,
"grad_norm": 0.3082149028778076,
"learning_rate": 9.201057540173219e-07,
"loss": 2.3691,
"step": 41100
},
{
"epoch": 0.975194092028025,
"grad_norm": 0.31467440724372864,
"learning_rate": 7.706573787819616e-07,
"loss": 2.3787,
"step": 41200
},
{
"epoch": 0.9775610679795493,
"grad_norm": 0.3216908872127533,
"learning_rate": 6.339617667770615e-07,
"loss": 2.3821,
"step": 41300
},
{
"epoch": 0.9799280439310737,
"grad_norm": 0.32618415355682373,
"learning_rate": 5.102077989279552e-07,
"loss": 2.3609,
"step": 41400
},
{
"epoch": 0.982295019882598,
"grad_norm": 0.32504287362098694,
"learning_rate": 3.9956647387621507e-07,
"loss": 2.3646,
"step": 41500
},
{
"epoch": 0.9846619958341223,
"grad_norm": 0.31874212622642517,
"learning_rate": 3.0219067170006445e-07,
"loss": 2.3579,
"step": 41600
},
{
"epoch": 0.9870289717856466,
"grad_norm": 0.34173473715782166,
"learning_rate": 2.182149426703606e-07,
"loss": 2.3719,
"step": 41700
},
{
"epoch": 0.989395947737171,
"grad_norm": 0.32773253321647644,
"learning_rate": 1.4775532133402547e-07,
"loss": 2.3625,
"step": 41800
},
{
"epoch": 0.9917629236886953,
"grad_norm": 0.34616851806640625,
"learning_rate": 9.090916618180623e-08,
"loss": 2.3645,
"step": 41900
},
{
"epoch": 0.9941298996402197,
"grad_norm": 0.3184524476528168,
"learning_rate": 4.775502512193164e-08,
"loss": 2.3658,
"step": 42000
},
{
"epoch": 0.9964968755917439,
"grad_norm": 0.349345862865448,
"learning_rate": 1.835252694552425e-08,
"loss": 2.3604,
"step": 42100
},
{
"epoch": 0.9988638515432683,
"grad_norm": 0.33537670969963074,
"learning_rate": 2.742298933747778e-09,
"loss": 2.3624,
"step": 42200
},
{
"epoch": 1.0,
"step": 42248,
"total_flos": 6.236990962447417e+18,
"train_loss": 2.428118334464832,
"train_runtime": 22646.6509,
"train_samples_per_second": 29.848,
"train_steps_per_second": 1.866
}
],
"logging_steps": 100,
"max_steps": 42248,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.236990962447417e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}