lvp_llama3_8b / trainer_state.json
Add checkpoint
d565b21 verified
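The rest of this file is the raw trainer_state.json written by the Hugging Face Trainer. As a minimal sketch of how it can be consumed (an illustration only, not part of the checkpoint: it assumes a local copy named trainer_state.json and that matplotlib is installed), the snippet below loads the file with Python's json module and plots the logged training loss against step, using the same keys that appear in log_history below ("step", "loss", "learning_rate", "grad_norm").

import json

import matplotlib.pyplot as plt

# Load the Trainer state dumped alongside the checkpoint.
with open("trainer_state.json") as f:
    state = json.load(f)

# log_history is a list of dicts; loss-logging entries carry "step" and "loss".
entries = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in entries]
losses = [e["loss"] for e in entries]

plt.plot(steps, losses)
plt.xlabel("step")
plt.ylabel("training loss")
plt.title(f"epoch {state['epoch']} / global_step {state['global_step']}")
plt.show()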
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 10000,
"global_step": 12869,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007770611547128759,
"grad_norm": 10.867119295233653,
"learning_rate": 7.770007770007771e-08,
"loss": 1.9304,
"step": 10
},
{
"epoch": 0.0015541223094257517,
"grad_norm": 10.849708826912844,
"learning_rate": 1.5540015540015542e-07,
"loss": 1.7731,
"step": 20
},
{
"epoch": 0.002331183464138628,
"grad_norm": 14.155064770562149,
"learning_rate": 2.3310023310023313e-07,
"loss": 1.8856,
"step": 30
},
{
"epoch": 0.0031082446188515035,
"grad_norm": 7.8437428614229106,
"learning_rate": 3.1080031080031084e-07,
"loss": 1.7444,
"step": 40
},
{
"epoch": 0.0038853057735643796,
"grad_norm": 6.831881710614827,
"learning_rate": 3.885003885003885e-07,
"loss": 1.8265,
"step": 50
},
{
"epoch": 0.004662366928277256,
"grad_norm": 4.832603626709541,
"learning_rate": 4.6620046620046626e-07,
"loss": 1.6188,
"step": 60
},
{
"epoch": 0.005439428082990132,
"grad_norm": 4.682650839871238,
"learning_rate": 5.43900543900544e-07,
"loss": 1.4275,
"step": 70
},
{
"epoch": 0.006216489237703007,
"grad_norm": 3.4425826085380953,
"learning_rate": 6.216006216006217e-07,
"loss": 1.2748,
"step": 80
},
{
"epoch": 0.006993550392415883,
"grad_norm": 2.5177730139033967,
"learning_rate": 6.993006993006994e-07,
"loss": 1.3117,
"step": 90
},
{
"epoch": 0.007770611547128759,
"grad_norm": 3.7240853256217887,
"learning_rate": 7.77000777000777e-07,
"loss": 1.3679,
"step": 100
},
{
"epoch": 0.008547672701841634,
"grad_norm": 3.326103554099626,
"learning_rate": 8.547008547008548e-07,
"loss": 1.2847,
"step": 110
},
{
"epoch": 0.009324733856554511,
"grad_norm": 3.06027479016113,
"learning_rate": 9.324009324009325e-07,
"loss": 1.4199,
"step": 120
},
{
"epoch": 0.010101795011267387,
"grad_norm": 3.3944711373731336,
"learning_rate": 1.01010101010101e-06,
"loss": 1.2416,
"step": 130
},
{
"epoch": 0.010878856165980264,
"grad_norm": 2.8783458689024943,
"learning_rate": 1.087801087801088e-06,
"loss": 1.2386,
"step": 140
},
{
"epoch": 0.011655917320693139,
"grad_norm": 4.560753080777367,
"learning_rate": 1.1655011655011655e-06,
"loss": 1.1683,
"step": 150
},
{
"epoch": 0.012432978475406014,
"grad_norm": 3.2610287866769823,
"learning_rate": 1.2432012432012434e-06,
"loss": 1.3331,
"step": 160
},
{
"epoch": 0.013210039630118891,
"grad_norm": 3.5630097050518494,
"learning_rate": 1.320901320901321e-06,
"loss": 1.3022,
"step": 170
},
{
"epoch": 0.013987100784831766,
"grad_norm": 3.190644560112282,
"learning_rate": 1.3986013986013987e-06,
"loss": 1.2078,
"step": 180
},
{
"epoch": 0.014764161939544641,
"grad_norm": 2.7424883315006667,
"learning_rate": 1.4763014763014764e-06,
"loss": 1.2883,
"step": 190
},
{
"epoch": 0.015541223094257518,
"grad_norm": 3.578715680372139,
"learning_rate": 1.554001554001554e-06,
"loss": 1.2041,
"step": 200
},
{
"epoch": 0.016318284248970395,
"grad_norm": 3.2571998176648207,
"learning_rate": 1.6317016317016318e-06,
"loss": 1.2505,
"step": 210
},
{
"epoch": 0.01709534540368327,
"grad_norm": 3.4399528179608327,
"learning_rate": 1.7094017094017097e-06,
"loss": 1.2012,
"step": 220
},
{
"epoch": 0.017872406558396146,
"grad_norm": 3.8729257141905116,
"learning_rate": 1.7871017871017873e-06,
"loss": 1.3179,
"step": 230
},
{
"epoch": 0.018649467713109023,
"grad_norm": 3.6027496697475616,
"learning_rate": 1.864801864801865e-06,
"loss": 1.2437,
"step": 240
},
{
"epoch": 0.019426528867821896,
"grad_norm": 3.6431878968740072,
"learning_rate": 1.9425019425019425e-06,
"loss": 1.1645,
"step": 250
},
{
"epoch": 0.020203590022534773,
"grad_norm": 2.857881707560637,
"learning_rate": 2.02020202020202e-06,
"loss": 1.1369,
"step": 260
},
{
"epoch": 0.02098065117724765,
"grad_norm": 2.8739221855243042,
"learning_rate": 2.0979020979020983e-06,
"loss": 1.1846,
"step": 270
},
{
"epoch": 0.021757712331960527,
"grad_norm": 3.5028112168977557,
"learning_rate": 2.175602175602176e-06,
"loss": 1.2122,
"step": 280
},
{
"epoch": 0.0225347734866734,
"grad_norm": 3.4640995610274445,
"learning_rate": 2.2533022533022537e-06,
"loss": 1.1927,
"step": 290
},
{
"epoch": 0.023311834641386277,
"grad_norm": 3.379264646936701,
"learning_rate": 2.331002331002331e-06,
"loss": 1.258,
"step": 300
},
{
"epoch": 0.024088895796099154,
"grad_norm": 2.4371515340367385,
"learning_rate": 2.408702408702409e-06,
"loss": 1.1477,
"step": 310
},
{
"epoch": 0.024865956950812028,
"grad_norm": 3.014613121507287,
"learning_rate": 2.4864024864024867e-06,
"loss": 1.1715,
"step": 320
},
{
"epoch": 0.025643018105524905,
"grad_norm": 3.0458793192067715,
"learning_rate": 2.564102564102564e-06,
"loss": 1.1353,
"step": 330
},
{
"epoch": 0.026420079260237782,
"grad_norm": 2.9917200999353906,
"learning_rate": 2.641802641802642e-06,
"loss": 1.1992,
"step": 340
},
{
"epoch": 0.027197140414950655,
"grad_norm": 2.6599563280716985,
"learning_rate": 2.7195027195027198e-06,
"loss": 1.1782,
"step": 350
},
{
"epoch": 0.027974201569663532,
"grad_norm": 3.0009575544324454,
"learning_rate": 2.7972027972027974e-06,
"loss": 1.2762,
"step": 360
},
{
"epoch": 0.02875126272437641,
"grad_norm": 2.8774035033800343,
"learning_rate": 2.874902874902875e-06,
"loss": 1.2687,
"step": 370
},
{
"epoch": 0.029528323879089283,
"grad_norm": 3.11771455020667,
"learning_rate": 2.952602952602953e-06,
"loss": 1.207,
"step": 380
},
{
"epoch": 0.03030538503380216,
"grad_norm": 3.6810769724431345,
"learning_rate": 3.0303030303030305e-06,
"loss": 1.2037,
"step": 390
},
{
"epoch": 0.031082446188515037,
"grad_norm": 2.5507766565385084,
"learning_rate": 3.108003108003108e-06,
"loss": 1.163,
"step": 400
},
{
"epoch": 0.031859507343227914,
"grad_norm": 2.9816770527812686,
"learning_rate": 3.1857031857031863e-06,
"loss": 1.1592,
"step": 410
},
{
"epoch": 0.03263656849794079,
"grad_norm": 2.591410551140759,
"learning_rate": 3.2634032634032635e-06,
"loss": 1.0411,
"step": 420
},
{
"epoch": 0.03341362965265366,
"grad_norm": 3.1328334298888345,
"learning_rate": 3.3411033411033412e-06,
"loss": 1.1438,
"step": 430
},
{
"epoch": 0.03419069080736654,
"grad_norm": 2.9537075236771675,
"learning_rate": 3.4188034188034193e-06,
"loss": 1.1713,
"step": 440
},
{
"epoch": 0.034967751962079414,
"grad_norm": 4.35570272552757,
"learning_rate": 3.4965034965034966e-06,
"loss": 1.2358,
"step": 450
},
{
"epoch": 0.03574481311679229,
"grad_norm": 2.4749714488159613,
"learning_rate": 3.5742035742035747e-06,
"loss": 1.1325,
"step": 460
},
{
"epoch": 0.03652187427150517,
"grad_norm": 2.770830578293701,
"learning_rate": 3.651903651903652e-06,
"loss": 1.1979,
"step": 470
},
{
"epoch": 0.037298935426218045,
"grad_norm": 3.2166027135563793,
"learning_rate": 3.72960372960373e-06,
"loss": 1.1605,
"step": 480
},
{
"epoch": 0.03807599658093092,
"grad_norm": 2.843605809275243,
"learning_rate": 3.8073038073038077e-06,
"loss": 1.2299,
"step": 490
},
{
"epoch": 0.03885305773564379,
"grad_norm": 2.959678568881321,
"learning_rate": 3.885003885003885e-06,
"loss": 1.2634,
"step": 500
},
{
"epoch": 0.03963011889035667,
"grad_norm": 2.5622873834599367,
"learning_rate": 3.962703962703963e-06,
"loss": 1.1137,
"step": 510
},
{
"epoch": 0.040407180045069546,
"grad_norm": 3.086457018563733,
"learning_rate": 4.04040404040404e-06,
"loss": 1.2407,
"step": 520
},
{
"epoch": 0.04118424119978242,
"grad_norm": 4.106519986211115,
"learning_rate": 4.1181041181041185e-06,
"loss": 1.1239,
"step": 530
},
{
"epoch": 0.0419613023544953,
"grad_norm": 2.7183745936305312,
"learning_rate": 4.195804195804197e-06,
"loss": 1.1746,
"step": 540
},
{
"epoch": 0.04273836350920818,
"grad_norm": 2.703894165918197,
"learning_rate": 4.273504273504274e-06,
"loss": 1.1105,
"step": 550
},
{
"epoch": 0.043515424663921054,
"grad_norm": 2.4867862713355686,
"learning_rate": 4.351204351204352e-06,
"loss": 1.1258,
"step": 560
},
{
"epoch": 0.044292485818633924,
"grad_norm": 2.838440814840756,
"learning_rate": 4.428904428904429e-06,
"loss": 1.0962,
"step": 570
},
{
"epoch": 0.0450695469733468,
"grad_norm": 2.1466654271023162,
"learning_rate": 4.506604506604507e-06,
"loss": 1.1085,
"step": 580
},
{
"epoch": 0.04584660812805968,
"grad_norm": 2.5468209985419477,
"learning_rate": 4.5843045843045846e-06,
"loss": 1.1391,
"step": 590
},
{
"epoch": 0.046623669282772555,
"grad_norm": 2.7865905520731493,
"learning_rate": 4.662004662004662e-06,
"loss": 1.1387,
"step": 600
},
{
"epoch": 0.04740073043748543,
"grad_norm": 2.1644371566582827,
"learning_rate": 4.73970473970474e-06,
"loss": 1.1499,
"step": 610
},
{
"epoch": 0.04817779159219831,
"grad_norm": 2.4913053508847667,
"learning_rate": 4.817404817404818e-06,
"loss": 1.1007,
"step": 620
},
{
"epoch": 0.04895485274691118,
"grad_norm": 3.001782928715834,
"learning_rate": 4.895104895104895e-06,
"loss": 1.2201,
"step": 630
},
{
"epoch": 0.049731913901624056,
"grad_norm": 2.2758391555971835,
"learning_rate": 4.972804972804973e-06,
"loss": 1.2219,
"step": 640
},
{
"epoch": 0.05050897505633693,
"grad_norm": 2.2066322284819155,
"learning_rate": 5.0505050505050515e-06,
"loss": 1.1288,
"step": 650
},
{
"epoch": 0.05128603621104981,
"grad_norm": 2.5894704735788263,
"learning_rate": 5.128205128205128e-06,
"loss": 1.1989,
"step": 660
},
{
"epoch": 0.05206309736576269,
"grad_norm": 2.952941171933437,
"learning_rate": 5.205905205905206e-06,
"loss": 1.1165,
"step": 670
},
{
"epoch": 0.052840158520475564,
"grad_norm": 2.7070115957706946,
"learning_rate": 5.283605283605284e-06,
"loss": 1.1954,
"step": 680
},
{
"epoch": 0.05361721967518844,
"grad_norm": 2.2390053746810668,
"learning_rate": 5.361305361305362e-06,
"loss": 1.1219,
"step": 690
},
{
"epoch": 0.05439428082990131,
"grad_norm": 2.5396421668929774,
"learning_rate": 5.4390054390054395e-06,
"loss": 1.2285,
"step": 700
},
{
"epoch": 0.05517134198461419,
"grad_norm": 1.881059123798051,
"learning_rate": 5.516705516705518e-06,
"loss": 1.074,
"step": 710
},
{
"epoch": 0.055948403139327064,
"grad_norm": 2.4027627997044285,
"learning_rate": 5.594405594405595e-06,
"loss": 1.1395,
"step": 720
},
{
"epoch": 0.05672546429403994,
"grad_norm": 2.135346477937923,
"learning_rate": 5.672105672105672e-06,
"loss": 1.0813,
"step": 730
},
{
"epoch": 0.05750252544875282,
"grad_norm": 2.673768837075326,
"learning_rate": 5.74980574980575e-06,
"loss": 1.117,
"step": 740
},
{
"epoch": 0.058279586603465695,
"grad_norm": 2.1561152898986924,
"learning_rate": 5.827505827505828e-06,
"loss": 1.0868,
"step": 750
},
{
"epoch": 0.059056647758178565,
"grad_norm": 2.198404569968455,
"learning_rate": 5.905205905205906e-06,
"loss": 1.0985,
"step": 760
},
{
"epoch": 0.05983370891289144,
"grad_norm": 1.9218754267874707,
"learning_rate": 5.982905982905983e-06,
"loss": 1.1303,
"step": 770
},
{
"epoch": 0.06061077006760432,
"grad_norm": 2.070293097589863,
"learning_rate": 6.060606060606061e-06,
"loss": 1.056,
"step": 780
},
{
"epoch": 0.061387831222317196,
"grad_norm": 2.2102128154144833,
"learning_rate": 6.138306138306139e-06,
"loss": 1.1511,
"step": 790
},
{
"epoch": 0.06216489237703007,
"grad_norm": 2.9020079791880438,
"learning_rate": 6.216006216006216e-06,
"loss": 1.0976,
"step": 800
},
{
"epoch": 0.06294195353174295,
"grad_norm": 3.1669273668699733,
"learning_rate": 6.2937062937062944e-06,
"loss": 1.1263,
"step": 810
},
{
"epoch": 0.06371901468645583,
"grad_norm": 1.9847375481750156,
"learning_rate": 6.3714063714063726e-06,
"loss": 1.0923,
"step": 820
},
{
"epoch": 0.0644960758411687,
"grad_norm": 2.09531371322368,
"learning_rate": 6.449106449106449e-06,
"loss": 1.0821,
"step": 830
},
{
"epoch": 0.06527313699588158,
"grad_norm": 3.287845612968483,
"learning_rate": 6.526806526806527e-06,
"loss": 1.0614,
"step": 840
},
{
"epoch": 0.06605019815059446,
"grad_norm": 2.2662925493592083,
"learning_rate": 6.604506604506605e-06,
"loss": 1.1021,
"step": 850
},
{
"epoch": 0.06682725930530732,
"grad_norm": 2.4839925554425717,
"learning_rate": 6.6822066822066824e-06,
"loss": 1.1501,
"step": 860
},
{
"epoch": 0.0676043204600202,
"grad_norm": 2.331604369524609,
"learning_rate": 6.7599067599067605e-06,
"loss": 1.1742,
"step": 870
},
{
"epoch": 0.06838138161473307,
"grad_norm": 2.3590829029315583,
"learning_rate": 6.837606837606839e-06,
"loss": 1.1163,
"step": 880
},
{
"epoch": 0.06915844276944595,
"grad_norm": 2.892618110874262,
"learning_rate": 6.915306915306917e-06,
"loss": 1.1725,
"step": 890
},
{
"epoch": 0.06993550392415883,
"grad_norm": 2.384306036165181,
"learning_rate": 6.993006993006993e-06,
"loss": 1.1418,
"step": 900
},
{
"epoch": 0.0707125650788717,
"grad_norm": 2.614333186214158,
"learning_rate": 7.070707070707071e-06,
"loss": 1.1369,
"step": 910
},
{
"epoch": 0.07148962623358458,
"grad_norm": 2.7443403027281428,
"learning_rate": 7.148407148407149e-06,
"loss": 1.1719,
"step": 920
},
{
"epoch": 0.07226668738829746,
"grad_norm": 2.115919721191313,
"learning_rate": 7.226107226107227e-06,
"loss": 1.1642,
"step": 930
},
{
"epoch": 0.07304374854301034,
"grad_norm": 2.4226282288227052,
"learning_rate": 7.303807303807304e-06,
"loss": 1.1609,
"step": 940
},
{
"epoch": 0.07382080969772321,
"grad_norm": 2.3322158120159657,
"learning_rate": 7.381507381507382e-06,
"loss": 1.0715,
"step": 950
},
{
"epoch": 0.07459787085243609,
"grad_norm": 2.216105414113714,
"learning_rate": 7.45920745920746e-06,
"loss": 1.0836,
"step": 960
},
{
"epoch": 0.07537493200714897,
"grad_norm": 2.080845937826623,
"learning_rate": 7.536907536907537e-06,
"loss": 1.1298,
"step": 970
},
{
"epoch": 0.07615199316186184,
"grad_norm": 1.9845987743197342,
"learning_rate": 7.6146076146076155e-06,
"loss": 1.1375,
"step": 980
},
{
"epoch": 0.07692905431657471,
"grad_norm": 2.052691449449282,
"learning_rate": 7.692307692307694e-06,
"loss": 1.1501,
"step": 990
},
{
"epoch": 0.07770611547128758,
"grad_norm": 1.9263855972921253,
"learning_rate": 7.77000777000777e-06,
"loss": 1.1237,
"step": 1000
},
{
"epoch": 0.07848317662600046,
"grad_norm": 2.9671775698526934,
"learning_rate": 7.847707847707848e-06,
"loss": 1.0918,
"step": 1010
},
{
"epoch": 0.07926023778071334,
"grad_norm": 2.2929478440651394,
"learning_rate": 7.925407925407926e-06,
"loss": 1.0848,
"step": 1020
},
{
"epoch": 0.08003729893542622,
"grad_norm": 1.6757069320789237,
"learning_rate": 8.003108003108003e-06,
"loss": 1.1209,
"step": 1030
},
{
"epoch": 0.08081436009013909,
"grad_norm": 2.093200645109728,
"learning_rate": 8.08080808080808e-06,
"loss": 1.0866,
"step": 1040
},
{
"epoch": 0.08159142124485197,
"grad_norm": 2.408927649486391,
"learning_rate": 8.158508158508159e-06,
"loss": 1.0934,
"step": 1050
},
{
"epoch": 0.08236848239956485,
"grad_norm": 2.2763929710773643,
"learning_rate": 8.236208236208237e-06,
"loss": 1.1081,
"step": 1060
},
{
"epoch": 0.08314554355427772,
"grad_norm": 2.329064562776198,
"learning_rate": 8.313908313908315e-06,
"loss": 1.1366,
"step": 1070
},
{
"epoch": 0.0839226047089906,
"grad_norm": 1.9093884379628574,
"learning_rate": 8.391608391608393e-06,
"loss": 1.0907,
"step": 1080
},
{
"epoch": 0.08469966586370348,
"grad_norm": 2.0666971265552694,
"learning_rate": 8.46930846930847e-06,
"loss": 1.1396,
"step": 1090
},
{
"epoch": 0.08547672701841635,
"grad_norm": 2.6618881870416833,
"learning_rate": 8.547008547008548e-06,
"loss": 1.1204,
"step": 1100
},
{
"epoch": 0.08625378817312923,
"grad_norm": 2.5811056119151115,
"learning_rate": 8.624708624708626e-06,
"loss": 1.1067,
"step": 1110
},
{
"epoch": 0.08703084932784211,
"grad_norm": 2.4891841510360697,
"learning_rate": 8.702408702408704e-06,
"loss": 1.0186,
"step": 1120
},
{
"epoch": 0.08780791048255497,
"grad_norm": 1.9964291348885184,
"learning_rate": 8.78010878010878e-06,
"loss": 1.0534,
"step": 1130
},
{
"epoch": 0.08858497163726785,
"grad_norm": 1.8380639056753707,
"learning_rate": 8.857808857808858e-06,
"loss": 1.1259,
"step": 1140
},
{
"epoch": 0.08936203279198073,
"grad_norm": 2.026492546755725,
"learning_rate": 8.935508935508937e-06,
"loss": 1.1357,
"step": 1150
},
{
"epoch": 0.0901390939466936,
"grad_norm": 2.3881102752793546,
"learning_rate": 9.013209013209015e-06,
"loss": 1.1451,
"step": 1160
},
{
"epoch": 0.09091615510140648,
"grad_norm": 2.3516814013111578,
"learning_rate": 9.090909090909091e-06,
"loss": 1.1304,
"step": 1170
},
{
"epoch": 0.09169321625611936,
"grad_norm": 2.2625675458737255,
"learning_rate": 9.168609168609169e-06,
"loss": 1.1289,
"step": 1180
},
{
"epoch": 0.09247027741083223,
"grad_norm": 1.8601808712202859,
"learning_rate": 9.246309246309247e-06,
"loss": 1.0705,
"step": 1190
},
{
"epoch": 0.09324733856554511,
"grad_norm": 2.45830074558663,
"learning_rate": 9.324009324009324e-06,
"loss": 1.0006,
"step": 1200
},
{
"epoch": 0.09402439972025799,
"grad_norm": 2.2208723337033747,
"learning_rate": 9.401709401709402e-06,
"loss": 1.0642,
"step": 1210
},
{
"epoch": 0.09480146087497086,
"grad_norm": 2.992927987309589,
"learning_rate": 9.47940947940948e-06,
"loss": 1.1099,
"step": 1220
},
{
"epoch": 0.09557852202968374,
"grad_norm": 2.2404447843072526,
"learning_rate": 9.557109557109558e-06,
"loss": 1.1046,
"step": 1230
},
{
"epoch": 0.09635558318439662,
"grad_norm": 2.027188334095754,
"learning_rate": 9.634809634809636e-06,
"loss": 1.1388,
"step": 1240
},
{
"epoch": 0.0971326443391095,
"grad_norm": 1.9884931664591046,
"learning_rate": 9.712509712509714e-06,
"loss": 1.1093,
"step": 1250
},
{
"epoch": 0.09790970549382236,
"grad_norm": 2.2780803616241,
"learning_rate": 9.79020979020979e-06,
"loss": 1.0973,
"step": 1260
},
{
"epoch": 0.09868676664853523,
"grad_norm": 2.482851610024637,
"learning_rate": 9.867909867909869e-06,
"loss": 1.1074,
"step": 1270
},
{
"epoch": 0.09946382780324811,
"grad_norm": 2.1809979058393547,
"learning_rate": 9.945609945609947e-06,
"loss": 1.1111,
"step": 1280
},
{
"epoch": 0.10024088895796099,
"grad_norm": 2.653036244084716,
"learning_rate": 9.999998344553621e-06,
"loss": 1.0539,
"step": 1290
},
{
"epoch": 0.10101795011267387,
"grad_norm": 2.1782834112618144,
"learning_rate": 9.99996891442626e-06,
"loss": 1.1277,
"step": 1300
},
{
"epoch": 0.10179501126738674,
"grad_norm": 2.0794830642914532,
"learning_rate": 9.999902696850819e-06,
"loss": 1.1028,
"step": 1310
},
{
"epoch": 0.10257207242209962,
"grad_norm": 1.9588777456228414,
"learning_rate": 9.999799692314491e-06,
"loss": 1.0799,
"step": 1320
},
{
"epoch": 0.1033491335768125,
"grad_norm": 1.8109731584724105,
"learning_rate": 9.999659901575142e-06,
"loss": 1.0387,
"step": 1330
},
{
"epoch": 0.10412619473152537,
"grad_norm": 1.496513992331799,
"learning_rate": 9.999483325661283e-06,
"loss": 1.0982,
"step": 1340
},
{
"epoch": 0.10490325588623825,
"grad_norm": 1.9418465016002184,
"learning_rate": 9.999269965872081e-06,
"loss": 1.1873,
"step": 1350
},
{
"epoch": 0.10568031704095113,
"grad_norm": 1.8814020449439044,
"learning_rate": 9.999019823777335e-06,
"loss": 1.1121,
"step": 1360
},
{
"epoch": 0.106457378195664,
"grad_norm": 2.5624116813963083,
"learning_rate": 9.998732901217474e-06,
"loss": 1.1057,
"step": 1370
},
{
"epoch": 0.10723443935037688,
"grad_norm": 2.8084481900607767,
"learning_rate": 9.998409200303543e-06,
"loss": 1.0796,
"step": 1380
},
{
"epoch": 0.10801150050508974,
"grad_norm": 2.5585637275706827,
"learning_rate": 9.998048723417184e-06,
"loss": 1.0911,
"step": 1390
},
{
"epoch": 0.10878856165980262,
"grad_norm": 1.8486528676878824,
"learning_rate": 9.997651473210614e-06,
"loss": 1.1027,
"step": 1400
},
{
"epoch": 0.1095656228145155,
"grad_norm": 1.6756625698252106,
"learning_rate": 9.99721745260662e-06,
"loss": 0.9892,
"step": 1410
},
{
"epoch": 0.11034268396922838,
"grad_norm": 1.7980527241240165,
"learning_rate": 9.996746664798523e-06,
"loss": 1.0714,
"step": 1420
},
{
"epoch": 0.11111974512394125,
"grad_norm": 2.965648407184345,
"learning_rate": 9.996239113250158e-06,
"loss": 1.1627,
"step": 1430
},
{
"epoch": 0.11189680627865413,
"grad_norm": 2.58378967500062,
"learning_rate": 9.995694801695856e-06,
"loss": 1.1338,
"step": 1440
},
{
"epoch": 0.112673867433367,
"grad_norm": 2.3312493063488104,
"learning_rate": 9.995113734140409e-06,
"loss": 1.0527,
"step": 1450
},
{
"epoch": 0.11345092858807988,
"grad_norm": 1.7987672632076395,
"learning_rate": 9.99449591485904e-06,
"loss": 1.1463,
"step": 1460
},
{
"epoch": 0.11422798974279276,
"grad_norm": 2.2447963047423674,
"learning_rate": 9.993841348397377e-06,
"loss": 1.0993,
"step": 1470
},
{
"epoch": 0.11500505089750564,
"grad_norm": 2.3307589401248983,
"learning_rate": 9.993150039571417e-06,
"loss": 1.1,
"step": 1480
},
{
"epoch": 0.11578211205221851,
"grad_norm": 2.4461716652591377,
"learning_rate": 9.992421993467488e-06,
"loss": 1.1223,
"step": 1490
},
{
"epoch": 0.11655917320693139,
"grad_norm": 2.325560003259248,
"learning_rate": 9.991657215442215e-06,
"loss": 1.1016,
"step": 1500
},
{
"epoch": 0.11733623436164427,
"grad_norm": 2.324019330722723,
"learning_rate": 9.99085571112248e-06,
"loss": 1.102,
"step": 1510
},
{
"epoch": 0.11811329551635713,
"grad_norm": 2.184804872790777,
"learning_rate": 9.990017486405379e-06,
"loss": 1.0691,
"step": 1520
},
{
"epoch": 0.11889035667107001,
"grad_norm": 2.3778750559007946,
"learning_rate": 9.989142547458182e-06,
"loss": 1.0902,
"step": 1530
},
{
"epoch": 0.11966741782578288,
"grad_norm": 1.9170168154911298,
"learning_rate": 9.988230900718279e-06,
"loss": 1.0755,
"step": 1540
},
{
"epoch": 0.12044447898049576,
"grad_norm": 2.242423744369333,
"learning_rate": 9.987282552893146e-06,
"loss": 1.0557,
"step": 1550
},
{
"epoch": 0.12122154013520864,
"grad_norm": 2.4290588197619574,
"learning_rate": 9.986297510960284e-06,
"loss": 1.0472,
"step": 1560
},
{
"epoch": 0.12199860128992152,
"grad_norm": 2.4366241079551596,
"learning_rate": 9.985275782167175e-06,
"loss": 1.0249,
"step": 1570
},
{
"epoch": 0.12277566244463439,
"grad_norm": 2.6491566316518673,
"learning_rate": 9.984217374031225e-06,
"loss": 1.0816,
"step": 1580
},
{
"epoch": 0.12355272359934727,
"grad_norm": 2.159316756547971,
"learning_rate": 9.983122294339708e-06,
"loss": 1.078,
"step": 1590
},
{
"epoch": 0.12432978475406015,
"grad_norm": 2.0761579284967944,
"learning_rate": 9.981990551149714e-06,
"loss": 1.0913,
"step": 1600
},
{
"epoch": 0.12510684590877302,
"grad_norm": 2.528857689821478,
"learning_rate": 9.980822152788082e-06,
"loss": 1.1034,
"step": 1610
},
{
"epoch": 0.1258839070634859,
"grad_norm": 1.5046304989897192,
"learning_rate": 9.979617107851343e-06,
"loss": 1.114,
"step": 1620
},
{
"epoch": 0.12666096821819878,
"grad_norm": 2.2475747257064707,
"learning_rate": 9.97837542520566e-06,
"loss": 1.0558,
"step": 1630
},
{
"epoch": 0.12743802937291165,
"grad_norm": 2.016387639571554,
"learning_rate": 9.977097113986755e-06,
"loss": 1.1429,
"step": 1640
},
{
"epoch": 0.12821509052762453,
"grad_norm": 2.246062301174424,
"learning_rate": 9.97578218359985e-06,
"loss": 1.0643,
"step": 1650
},
{
"epoch": 0.1289921516823374,
"grad_norm": 2.7312095064634323,
"learning_rate": 9.974430643719591e-06,
"loss": 1.0671,
"step": 1660
},
{
"epoch": 0.12976921283705029,
"grad_norm": 1.813294617554991,
"learning_rate": 9.973042504289978e-06,
"loss": 0.9926,
"step": 1670
},
{
"epoch": 0.13054627399176316,
"grad_norm": 2.2812471968380095,
"learning_rate": 9.971617775524301e-06,
"loss": 1.0825,
"step": 1680
},
{
"epoch": 0.13132333514647604,
"grad_norm": 1.756937891360179,
"learning_rate": 9.970156467905048e-06,
"loss": 1.0673,
"step": 1690
},
{
"epoch": 0.13210039630118892,
"grad_norm": 2.082158585539177,
"learning_rate": 9.968658592183842e-06,
"loss": 1.1994,
"step": 1700
},
{
"epoch": 0.1328774574559018,
"grad_norm": 1.9267534200786023,
"learning_rate": 9.967124159381359e-06,
"loss": 1.1162,
"step": 1710
},
{
"epoch": 0.13365451861061464,
"grad_norm": 3.0547406918856748,
"learning_rate": 9.965553180787239e-06,
"loss": 1.0263,
"step": 1720
},
{
"epoch": 0.13443157976532752,
"grad_norm": 1.7665942406417015,
"learning_rate": 9.963945667960017e-06,
"loss": 0.9662,
"step": 1730
},
{
"epoch": 0.1352086409200404,
"grad_norm": 1.8418454319389166,
"learning_rate": 9.962301632727022e-06,
"loss": 1.0806,
"step": 1740
},
{
"epoch": 0.13598570207475327,
"grad_norm": 1.7673330680317212,
"learning_rate": 9.960621087184303e-06,
"loss": 1.0801,
"step": 1750
},
{
"epoch": 0.13676276322946615,
"grad_norm": 2.206590428660935,
"learning_rate": 9.95890404369653e-06,
"loss": 1.1432,
"step": 1760
},
{
"epoch": 0.13753982438417903,
"grad_norm": 2.2302577958801195,
"learning_rate": 9.957150514896919e-06,
"loss": 1.152,
"step": 1770
},
{
"epoch": 0.1383168855388919,
"grad_norm": 2.0260327381346794,
"learning_rate": 9.95536051368711e-06,
"loss": 1.0658,
"step": 1780
},
{
"epoch": 0.13909394669360478,
"grad_norm": 1.5644692783168082,
"learning_rate": 9.953534053237108e-06,
"loss": 1.0604,
"step": 1790
},
{
"epoch": 0.13987100784831766,
"grad_norm": 1.738578328297917,
"learning_rate": 9.951671146985159e-06,
"loss": 0.9911,
"step": 1800
},
{
"epoch": 0.14064806900303053,
"grad_norm": 1.6603612609497798,
"learning_rate": 9.949771808637657e-06,
"loss": 1.0849,
"step": 1810
},
{
"epoch": 0.1414251301577434,
"grad_norm": 2.031511681498179,
"learning_rate": 9.947836052169056e-06,
"loss": 0.9919,
"step": 1820
},
{
"epoch": 0.1422021913124563,
"grad_norm": 1.5044981498939936,
"learning_rate": 9.945863891821749e-06,
"loss": 0.9996,
"step": 1830
},
{
"epoch": 0.14297925246716917,
"grad_norm": 2.293059765739188,
"learning_rate": 9.943855342105979e-06,
"loss": 1.0394,
"step": 1840
},
{
"epoch": 0.14375631362188204,
"grad_norm": 1.9478707992466775,
"learning_rate": 9.941810417799719e-06,
"loss": 0.9964,
"step": 1850
},
{
"epoch": 0.14453337477659492,
"grad_norm": 1.5149400216960562,
"learning_rate": 9.939729133948572e-06,
"loss": 1.0521,
"step": 1860
},
{
"epoch": 0.1453104359313078,
"grad_norm": 2.2351667693118524,
"learning_rate": 9.93761150586566e-06,
"loss": 1.1685,
"step": 1870
},
{
"epoch": 0.14608749708602067,
"grad_norm": 3.4005405751624087,
"learning_rate": 9.935457549131504e-06,
"loss": 1.0859,
"step": 1880
},
{
"epoch": 0.14686455824073355,
"grad_norm": 2.1781460644900257,
"learning_rate": 9.933267279593919e-06,
"loss": 1.037,
"step": 1890
},
{
"epoch": 0.14764161939544643,
"grad_norm": 2.432585604447532,
"learning_rate": 9.931040713367888e-06,
"loss": 1.0816,
"step": 1900
},
{
"epoch": 0.1484186805501593,
"grad_norm": 1.834847415817245,
"learning_rate": 9.928777866835454e-06,
"loss": 1.0843,
"step": 1910
},
{
"epoch": 0.14919574170487218,
"grad_norm": 1.7231188780918039,
"learning_rate": 9.926478756645586e-06,
"loss": 1.0286,
"step": 1920
},
{
"epoch": 0.14997280285958506,
"grad_norm": 2.113770754133767,
"learning_rate": 9.924143399714072e-06,
"loss": 1.0627,
"step": 1930
},
{
"epoch": 0.15074986401429794,
"grad_norm": 2.3994884363588036,
"learning_rate": 9.92177181322338e-06,
"loss": 1.0116,
"step": 1940
},
{
"epoch": 0.1515269251690108,
"grad_norm": 2.0230342364705454,
"learning_rate": 9.919364014622545e-06,
"loss": 1.0606,
"step": 1950
},
{
"epoch": 0.1523039863237237,
"grad_norm": 2.1208192115487816,
"learning_rate": 9.91692002162703e-06,
"loss": 1.0623,
"step": 1960
},
{
"epoch": 0.15308104747843657,
"grad_norm": 1.954692914861481,
"learning_rate": 9.914439852218598e-06,
"loss": 1.036,
"step": 1970
},
{
"epoch": 0.15385810863314942,
"grad_norm": 2.4424599661840394,
"learning_rate": 9.911923524645184e-06,
"loss": 1.0592,
"step": 1980
},
{
"epoch": 0.1546351697878623,
"grad_norm": 1.7002048061692303,
"learning_rate": 9.909371057420756e-06,
"loss": 1.1009,
"step": 1990
},
{
"epoch": 0.15541223094257517,
"grad_norm": 1.6400522184059512,
"learning_rate": 9.906782469325183e-06,
"loss": 1.0584,
"step": 2000
},
{
"epoch": 0.15618929209728805,
"grad_norm": 1.9086125071696802,
"learning_rate": 9.904157779404095e-06,
"loss": 1.027,
"step": 2010
},
{
"epoch": 0.15696635325200092,
"grad_norm": 2.0429187558374284,
"learning_rate": 9.901497006968737e-06,
"loss": 1.0366,
"step": 2020
},
{
"epoch": 0.1577434144067138,
"grad_norm": 1.9839452672457782,
"learning_rate": 9.89880017159584e-06,
"loss": 1.0253,
"step": 2030
},
{
"epoch": 0.15852047556142668,
"grad_norm": 1.9239243059085187,
"learning_rate": 9.896067293127462e-06,
"loss": 1.0809,
"step": 2040
},
{
"epoch": 0.15929753671613955,
"grad_norm": 2.116977455932609,
"learning_rate": 9.893298391670857e-06,
"loss": 1.0288,
"step": 2050
},
{
"epoch": 0.16007459787085243,
"grad_norm": 1.9256786973087672,
"learning_rate": 9.890493487598315e-06,
"loss": 1.062,
"step": 2060
},
{
"epoch": 0.1608516590255653,
"grad_norm": 1.770000631025023,
"learning_rate": 9.887652601547011e-06,
"loss": 1.029,
"step": 2070
},
{
"epoch": 0.16162872018027818,
"grad_norm": 2.0460739758835715,
"learning_rate": 9.884775754418872e-06,
"loss": 1.0978,
"step": 2080
},
{
"epoch": 0.16240578133499106,
"grad_norm": 1.8387960887988681,
"learning_rate": 9.881862967380398e-06,
"loss": 1.0499,
"step": 2090
},
{
"epoch": 0.16318284248970394,
"grad_norm": 2.0055836577178145,
"learning_rate": 9.878914261862524e-06,
"loss": 1.0964,
"step": 2100
},
{
"epoch": 0.16395990364441682,
"grad_norm": 1.7868218097590607,
"learning_rate": 9.875929659560455e-06,
"loss": 1.0277,
"step": 2110
},
{
"epoch": 0.1647369647991297,
"grad_norm": 2.1063589192373424,
"learning_rate": 9.872909182433509e-06,
"loss": 1.1237,
"step": 2120
},
{
"epoch": 0.16551402595384257,
"grad_norm": 2.2482455806975365,
"learning_rate": 9.869852852704951e-06,
"loss": 1.069,
"step": 2130
},
{
"epoch": 0.16629108710855545,
"grad_norm": 1.7191931035624053,
"learning_rate": 9.866760692861837e-06,
"loss": 1.0432,
"step": 2140
},
{
"epoch": 0.16706814826326832,
"grad_norm": 1.9822067032337325,
"learning_rate": 9.863632725654841e-06,
"loss": 1.0966,
"step": 2150
},
{
"epoch": 0.1678452094179812,
"grad_norm": 1.5154087879613518,
"learning_rate": 9.860468974098093e-06,
"loss": 0.9731,
"step": 2160
},
{
"epoch": 0.16862227057269408,
"grad_norm": 2.109259264636941,
"learning_rate": 9.85726946146901e-06,
"loss": 1.075,
"step": 2170
},
{
"epoch": 0.16939933172740695,
"grad_norm": 2.264076822727728,
"learning_rate": 9.854034211308114e-06,
"loss": 1.0237,
"step": 2180
},
{
"epoch": 0.17017639288211983,
"grad_norm": 1.892118264625731,
"learning_rate": 9.850763247418876e-06,
"loss": 1.0245,
"step": 2190
},
{
"epoch": 0.1709534540368327,
"grad_norm": 2.0853632303159535,
"learning_rate": 9.847456593867525e-06,
"loss": 1.0026,
"step": 2200
},
{
"epoch": 0.17173051519154559,
"grad_norm": 1.9677334934726516,
"learning_rate": 9.844114274982885e-06,
"loss": 1.0431,
"step": 2210
},
{
"epoch": 0.17250757634625846,
"grad_norm": 2.2830817893790103,
"learning_rate": 9.840736315356183e-06,
"loss": 1.0943,
"step": 2220
},
{
"epoch": 0.17328463750097134,
"grad_norm": 1.575442825346659,
"learning_rate": 9.837322739840877e-06,
"loss": 1.0007,
"step": 2230
},
{
"epoch": 0.17406169865568422,
"grad_norm": 2.2367315093018134,
"learning_rate": 9.833873573552472e-06,
"loss": 1.0301,
"step": 2240
},
{
"epoch": 0.17483875981039707,
"grad_norm": 2.24222375291448,
"learning_rate": 9.830388841868329e-06,
"loss": 1.0919,
"step": 2250
},
{
"epoch": 0.17561582096510994,
"grad_norm": 1.872156214913949,
"learning_rate": 9.826868570427484e-06,
"loss": 1.0933,
"step": 2260
},
{
"epoch": 0.17639288211982282,
"grad_norm": 2.200623982755955,
"learning_rate": 9.823312785130457e-06,
"loss": 1.0556,
"step": 2270
},
{
"epoch": 0.1771699432745357,
"grad_norm": 2.0166726180309547,
"learning_rate": 9.819721512139069e-06,
"loss": 1.0136,
"step": 2280
},
{
"epoch": 0.17794700442924857,
"grad_norm": 2.3268106459403155,
"learning_rate": 9.816094777876233e-06,
"loss": 1.0609,
"step": 2290
},
{
"epoch": 0.17872406558396145,
"grad_norm": 2.5483756559425097,
"learning_rate": 9.812432609025778e-06,
"loss": 1.1066,
"step": 2300
},
{
"epoch": 0.17950112673867433,
"grad_norm": 1.5050242159549674,
"learning_rate": 9.808735032532239e-06,
"loss": 1.0461,
"step": 2310
},
{
"epoch": 0.1802781878933872,
"grad_norm": 1.7444888511627248,
"learning_rate": 9.805002075600668e-06,
"loss": 0.9875,
"step": 2320
},
{
"epoch": 0.18105524904810008,
"grad_norm": 2.1359724957586295,
"learning_rate": 9.801233765696423e-06,
"loss": 1.0032,
"step": 2330
},
{
"epoch": 0.18183231020281296,
"grad_norm": 2.0933731292318214,
"learning_rate": 9.797430130544983e-06,
"loss": 1.0092,
"step": 2340
},
{
"epoch": 0.18260937135752583,
"grad_norm": 1.7774756159015281,
"learning_rate": 9.793591198131724e-06,
"loss": 0.9708,
"step": 2350
},
{
"epoch": 0.1833864325122387,
"grad_norm": 1.9057742144891412,
"learning_rate": 9.789716996701729e-06,
"loss": 1.0716,
"step": 2360
},
{
"epoch": 0.1841634936669516,
"grad_norm": 1.6679562880223004,
"learning_rate": 9.78580755475957e-06,
"loss": 1.0184,
"step": 2370
},
{
"epoch": 0.18494055482166447,
"grad_norm": 2.036953279006188,
"learning_rate": 9.781862901069105e-06,
"loss": 0.988,
"step": 2380
},
{
"epoch": 0.18571761597637734,
"grad_norm": 2.0964552627447777,
"learning_rate": 9.777883064653266e-06,
"loss": 1.0113,
"step": 2390
},
{
"epoch": 0.18649467713109022,
"grad_norm": 1.6106495155390417,
"learning_rate": 9.773868074793838e-06,
"loss": 1.0423,
"step": 2400
},
{
"epoch": 0.1872717382858031,
"grad_norm": 2.8770640128408456,
"learning_rate": 9.76981796103125e-06,
"loss": 1.0398,
"step": 2410
},
{
"epoch": 0.18804879944051597,
"grad_norm": 2.0693212678122843,
"learning_rate": 9.76573275316436e-06,
"loss": 1.0045,
"step": 2420
},
{
"epoch": 0.18882586059522885,
"grad_norm": 2.0119207294765213,
"learning_rate": 9.761612481250225e-06,
"loss": 1.0224,
"step": 2430
},
{
"epoch": 0.18960292174994173,
"grad_norm": 2.223557066379335,
"learning_rate": 9.757457175603893e-06,
"loss": 1.0773,
"step": 2440
},
{
"epoch": 0.1903799829046546,
"grad_norm": 1.9108372181328375,
"learning_rate": 9.753266866798174e-06,
"loss": 1.0526,
"step": 2450
},
{
"epoch": 0.19115704405936748,
"grad_norm": 2.1959378359625177,
"learning_rate": 9.749041585663411e-06,
"loss": 1.1138,
"step": 2460
},
{
"epoch": 0.19193410521408036,
"grad_norm": 2.0485148481555218,
"learning_rate": 9.74478136328726e-06,
"loss": 1.0884,
"step": 2470
},
{
"epoch": 0.19271116636879324,
"grad_norm": 1.8565352764102319,
"learning_rate": 9.740486231014461e-06,
"loss": 1.0099,
"step": 2480
},
{
"epoch": 0.1934882275235061,
"grad_norm": 1.5302635825343132,
"learning_rate": 9.736156220446597e-06,
"loss": 1.0461,
"step": 2490
},
{
"epoch": 0.194265288678219,
"grad_norm": 1.5954264600641812,
"learning_rate": 9.731791363441876e-06,
"loss": 0.9655,
"step": 2500
},
{
"epoch": 0.19504234983293184,
"grad_norm": 1.7822694013944302,
"learning_rate": 9.727391692114887e-06,
"loss": 1.0542,
"step": 2510
},
{
"epoch": 0.19581941098764472,
"grad_norm": 2.424667963877112,
"learning_rate": 9.722957238836366e-06,
"loss": 1.0331,
"step": 2520
},
{
"epoch": 0.1965964721423576,
"grad_norm": 2.3703044008316487,
"learning_rate": 9.718488036232963e-06,
"loss": 1.0926,
"step": 2530
},
{
"epoch": 0.19737353329707047,
"grad_norm": 2.1530662223107955,
"learning_rate": 9.713984117186993e-06,
"loss": 1.0121,
"step": 2540
},
{
"epoch": 0.19815059445178335,
"grad_norm": 1.9314173573162179,
"learning_rate": 9.7094455148362e-06,
"loss": 1.0475,
"step": 2550
},
{
"epoch": 0.19892765560649622,
"grad_norm": 1.9777777372602399,
"learning_rate": 9.704872262573508e-06,
"loss": 1.0105,
"step": 2560
},
{
"epoch": 0.1997047167612091,
"grad_norm": 2.2819001107312546,
"learning_rate": 9.700264394046787e-06,
"loss": 0.948,
"step": 2570
},
{
"epoch": 0.20048177791592198,
"grad_norm": 1.7288416994808482,
"learning_rate": 9.69562194315859e-06,
"loss": 1.0458,
"step": 2580
},
{
"epoch": 0.20125883907063485,
"grad_norm": 1.7457323208199687,
"learning_rate": 9.690944944065914e-06,
"loss": 1.0476,
"step": 2590
},
{
"epoch": 0.20203590022534773,
"grad_norm": 2.47172385268511,
"learning_rate": 9.686233431179944e-06,
"loss": 1.0115,
"step": 2600
},
{
"epoch": 0.2028129613800606,
"grad_norm": 2.38182568324136,
"learning_rate": 9.681487439165804e-06,
"loss": 1.0733,
"step": 2610
},
{
"epoch": 0.20359002253477348,
"grad_norm": 2.1251613678643153,
"learning_rate": 9.676707002942299e-06,
"loss": 1.1202,
"step": 2620
},
{
"epoch": 0.20436708368948636,
"grad_norm": 2.3331174035594158,
"learning_rate": 9.671892157681656e-06,
"loss": 0.9892,
"step": 2630
},
{
"epoch": 0.20514414484419924,
"grad_norm": 2.5297296744464597,
"learning_rate": 9.66704293880927e-06,
"loss": 1.0913,
"step": 2640
},
{
"epoch": 0.20592120599891212,
"grad_norm": 1.9953398885425944,
"learning_rate": 9.662159382003438e-06,
"loss": 0.9739,
"step": 2650
},
{
"epoch": 0.206698267153625,
"grad_norm": 1.9554157695142245,
"learning_rate": 9.657241523195106e-06,
"loss": 1.0062,
"step": 2660
},
{
"epoch": 0.20747532830833787,
"grad_norm": 1.9681771655746416,
"learning_rate": 9.652289398567591e-06,
"loss": 0.9645,
"step": 2670
},
{
"epoch": 0.20825238946305075,
"grad_norm": 1.6398427617567763,
"learning_rate": 9.647303044556327e-06,
"loss": 1.0691,
"step": 2680
},
{
"epoch": 0.20902945061776362,
"grad_norm": 2.191033664996454,
"learning_rate": 9.642282497848587e-06,
"loss": 1.0046,
"step": 2690
},
{
"epoch": 0.2098065117724765,
"grad_norm": 2.422573387512772,
"learning_rate": 9.637227795383223e-06,
"loss": 1.0334,
"step": 2700
},
{
"epoch": 0.21058357292718938,
"grad_norm": 2.2231115952498817,
"learning_rate": 9.63213897435039e-06,
"loss": 1.0092,
"step": 2710
},
{
"epoch": 0.21136063408190225,
"grad_norm": 1.5887335858791765,
"learning_rate": 9.627016072191263e-06,
"loss": 1.0601,
"step": 2720
},
{
"epoch": 0.21213769523661513,
"grad_norm": 2.079071610960163,
"learning_rate": 9.62185912659778e-06,
"loss": 1.0089,
"step": 2730
},
{
"epoch": 0.212914756391328,
"grad_norm": 2.6538511902261672,
"learning_rate": 9.616668175512347e-06,
"loss": 1.0996,
"step": 2740
},
{
"epoch": 0.21369181754604089,
"grad_norm": 2.2366602617889675,
"learning_rate": 9.611443257127573e-06,
"loss": 0.995,
"step": 2750
},
{
"epoch": 0.21446887870075376,
"grad_norm": 1.9923272374726597,
"learning_rate": 9.60618440988598e-06,
"loss": 1.0588,
"step": 2760
},
{
"epoch": 0.2152459398554666,
"grad_norm": 1.933851579802707,
"learning_rate": 9.60089167247972e-06,
"loss": 1.0677,
"step": 2770
},
{
"epoch": 0.2160230010101795,
"grad_norm": 1.7051761690927782,
"learning_rate": 9.595565083850298e-06,
"loss": 0.9761,
"step": 2780
},
{
"epoch": 0.21680006216489237,
"grad_norm": 2.760621047319595,
"learning_rate": 9.590204683188275e-06,
"loss": 1.0485,
"step": 2790
},
{
"epoch": 0.21757712331960524,
"grad_norm": 2.164361791637637,
"learning_rate": 9.584810509932993e-06,
"loss": 1.0935,
"step": 2800
},
{
"epoch": 0.21835418447431812,
"grad_norm": 2.1290187047633387,
"learning_rate": 9.579382603772269e-06,
"loss": 1.0242,
"step": 2810
},
{
"epoch": 0.219131245629031,
"grad_norm": 2.4594545836748796,
"learning_rate": 9.573921004642117e-06,
"loss": 1.0066,
"step": 2820
},
{
"epoch": 0.21990830678374387,
"grad_norm": 2.211316974662037,
"learning_rate": 9.568425752726442e-06,
"loss": 0.9617,
"step": 2830
},
{
"epoch": 0.22068536793845675,
"grad_norm": 2.914326191682928,
"learning_rate": 9.562896888456758e-06,
"loss": 1.0298,
"step": 2840
},
{
"epoch": 0.22146242909316963,
"grad_norm": 1.8033463375470347,
"learning_rate": 9.557334452511879e-06,
"loss": 0.9536,
"step": 2850
},
{
"epoch": 0.2222394902478825,
"grad_norm": 2.1801243317191856,
"learning_rate": 9.551738485817622e-06,
"loss": 0.951,
"step": 2860
},
{
"epoch": 0.22301655140259538,
"grad_norm": 2.1629577942104183,
"learning_rate": 9.546109029546511e-06,
"loss": 0.9987,
"step": 2870
},
{
"epoch": 0.22379361255730826,
"grad_norm": 1.3716114805711197,
"learning_rate": 9.540446125117468e-06,
"loss": 0.969,
"step": 2880
},
{
"epoch": 0.22457067371202113,
"grad_norm": 1.9483284357069952,
"learning_rate": 9.534749814195516e-06,
"loss": 1.0039,
"step": 2890
},
{
"epoch": 0.225347734866734,
"grad_norm": 2.0793028495715697,
"learning_rate": 9.529020138691463e-06,
"loss": 0.9743,
"step": 2900
},
{
"epoch": 0.2261247960214469,
"grad_norm": 2.3579800092596646,
"learning_rate": 9.523257140761595e-06,
"loss": 0.9396,
"step": 2910
},
{
"epoch": 0.22690185717615977,
"grad_norm": 1.9666592282727686,
"learning_rate": 9.517460862807378e-06,
"loss": 1.0413,
"step": 2920
},
{
"epoch": 0.22767891833087264,
"grad_norm": 2.332398520531907,
"learning_rate": 9.51163134747513e-06,
"loss": 0.9895,
"step": 2930
},
{
"epoch": 0.22845597948558552,
"grad_norm": 2.0112812087397853,
"learning_rate": 9.505768637655717e-06,
"loss": 1.026,
"step": 2940
},
{
"epoch": 0.2292330406402984,
"grad_norm": 1.30588230567386,
"learning_rate": 9.499872776484234e-06,
"loss": 0.9389,
"step": 2950
},
{
"epoch": 0.23001010179501127,
"grad_norm": 2.4882043492951107,
"learning_rate": 9.493943807339686e-06,
"loss": 1.0177,
"step": 2960
},
{
"epoch": 0.23078716294972415,
"grad_norm": 2.472037249258304,
"learning_rate": 9.487981773844673e-06,
"loss": 1.0865,
"step": 2970
},
{
"epoch": 0.23156422410443703,
"grad_norm": 2.3974288694298864,
"learning_rate": 9.48198671986507e-06,
"loss": 1.1025,
"step": 2980
},
{
"epoch": 0.2323412852591499,
"grad_norm": 1.8931995855209747,
"learning_rate": 9.475958689509697e-06,
"loss": 1.0401,
"step": 2990
},
{
"epoch": 0.23311834641386278,
"grad_norm": 1.7588453721284736,
"learning_rate": 9.469897727130001e-06,
"loss": 1.026,
"step": 3000
},
{
"epoch": 0.23389540756857566,
"grad_norm": 2.25782280175551,
"learning_rate": 9.463803877319727e-06,
"loss": 1.045,
"step": 3010
},
{
"epoch": 0.23467246872328854,
"grad_norm": 2.062470298217632,
"learning_rate": 9.45767718491459e-06,
"loss": 0.9873,
"step": 3020
},
{
"epoch": 0.2354495298780014,
"grad_norm": 2.235317636179408,
"learning_rate": 9.451517694991947e-06,
"loss": 0.9935,
"step": 3030
},
{
"epoch": 0.23622659103271426,
"grad_norm": 1.8159214167836841,
"learning_rate": 9.445325452870459e-06,
"loss": 0.9837,
"step": 3040
},
{
"epoch": 0.23700365218742714,
"grad_norm": 2.530492729153044,
"learning_rate": 9.439100504109772e-06,
"loss": 1.0975,
"step": 3050
},
{
"epoch": 0.23778071334214002,
"grad_norm": 1.9008032910522048,
"learning_rate": 9.432842894510164e-06,
"loss": 0.975,
"step": 3060
},
{
"epoch": 0.2385577744968529,
"grad_norm": 1.340909447158594,
"learning_rate": 9.42655267011222e-06,
"loss": 0.8966,
"step": 3070
},
{
"epoch": 0.23933483565156577,
"grad_norm": 2.3032534649906053,
"learning_rate": 9.420229877196484e-06,
"loss": 0.899,
"step": 3080
},
{
"epoch": 0.24011189680627865,
"grad_norm": 3.3083719276637815,
"learning_rate": 9.413874562283136e-06,
"loss": 1.0154,
"step": 3090
},
{
"epoch": 0.24088895796099152,
"grad_norm": 1.7584921998647791,
"learning_rate": 9.407486772131624e-06,
"loss": 0.9767,
"step": 3100
},
{
"epoch": 0.2416660191157044,
"grad_norm": 2.9427356878313686,
"learning_rate": 9.401066553740343e-06,
"loss": 0.9662,
"step": 3110
},
{
"epoch": 0.24244308027041728,
"grad_norm": 2.1699016387323233,
"learning_rate": 9.394613954346274e-06,
"loss": 0.9713,
"step": 3120
},
{
"epoch": 0.24322014142513015,
"grad_norm": 2.1813371019451653,
"learning_rate": 9.388129021424648e-06,
"loss": 0.9555,
"step": 3130
},
{
"epoch": 0.24399720257984303,
"grad_norm": 1.9891788565996813,
"learning_rate": 9.381611802688586e-06,
"loss": 1.0036,
"step": 3140
},
{
"epoch": 0.2447742637345559,
"grad_norm": 2.3143675049942014,
"learning_rate": 9.375062346088759e-06,
"loss": 0.971,
"step": 3150
},
{
"epoch": 0.24555132488926878,
"grad_norm": 2.6629770871009155,
"learning_rate": 9.368480699813021e-06,
"loss": 0.9176,
"step": 3160
},
{
"epoch": 0.24632838604398166,
"grad_norm": 2.1132958055867808,
"learning_rate": 9.36186691228607e-06,
"loss": 0.8972,
"step": 3170
},
{
"epoch": 0.24710544719869454,
"grad_norm": 2.029313412599108,
"learning_rate": 9.35522103216908e-06,
"loss": 0.9154,
"step": 3180
},
{
"epoch": 0.24788250835340742,
"grad_norm": 1.4578868082629726,
"learning_rate": 9.34854310835935e-06,
"loss": 1.036,
"step": 3190
},
{
"epoch": 0.2486595695081203,
"grad_norm": 2.1062999698802503,
"learning_rate": 9.341833189989942e-06,
"loss": 0.8603,
"step": 3200
},
{
"epoch": 0.24943663066283317,
"grad_norm": 2.6614219310606892,
"learning_rate": 9.335091326429313e-06,
"loss": 0.9924,
"step": 3210
},
{
"epoch": 0.25021369181754605,
"grad_norm": 2.0301151705921665,
"learning_rate": 9.328317567280968e-06,
"loss": 0.953,
"step": 3220
},
{
"epoch": 0.2509907529722589,
"grad_norm": 1.9699445720729638,
"learning_rate": 9.321511962383077e-06,
"loss": 0.9379,
"step": 3230
},
{
"epoch": 0.2517678141269718,
"grad_norm": 2.2607361825721854,
"learning_rate": 9.314674561808117e-06,
"loss": 0.986,
"step": 3240
},
{
"epoch": 0.2525448752816847,
"grad_norm": 1.839113470172114,
"learning_rate": 9.307805415862507e-06,
"loss": 0.9541,
"step": 3250
},
{
"epoch": 0.25332193643639755,
"grad_norm": 1.8823362594556383,
"learning_rate": 9.300904575086232e-06,
"loss": 0.9203,
"step": 3260
},
{
"epoch": 0.25409899759111043,
"grad_norm": 2.259964303887286,
"learning_rate": 9.293972090252468e-06,
"loss": 0.9679,
"step": 3270
},
{
"epoch": 0.2548760587458233,
"grad_norm": 2.058151781656702,
"learning_rate": 9.287008012367221e-06,
"loss": 1.0023,
"step": 3280
},
{
"epoch": 0.2556531199005362,
"grad_norm": 2.306218040399529,
"learning_rate": 9.280012392668938e-06,
"loss": 1.0326,
"step": 3290
},
{
"epoch": 0.25643018105524906,
"grad_norm": 2.218261287466935,
"learning_rate": 9.272985282628138e-06,
"loss": 0.988,
"step": 3300
},
{
"epoch": 0.25720724220996194,
"grad_norm": 2.8185774692963146,
"learning_rate": 9.265926733947035e-06,
"loss": 0.9237,
"step": 3310
},
{
"epoch": 0.2579843033646748,
"grad_norm": 1.966754798605311,
"learning_rate": 9.258836798559148e-06,
"loss": 0.8764,
"step": 3320
},
{
"epoch": 0.2587613645193877,
"grad_norm": 2.907713378609492,
"learning_rate": 9.251715528628926e-06,
"loss": 0.9781,
"step": 3330
},
{
"epoch": 0.25953842567410057,
"grad_norm": 2.5867766624212107,
"learning_rate": 9.244562976551368e-06,
"loss": 0.9835,
"step": 3340
},
{
"epoch": 0.26031548682881345,
"grad_norm": 2.659891863331392,
"learning_rate": 9.237379194951626e-06,
"loss": 0.9438,
"step": 3350
},
{
"epoch": 0.2610925479835263,
"grad_norm": 1.8970250029232214,
"learning_rate": 9.230164236684628e-06,
"loss": 0.9617,
"step": 3360
},
{
"epoch": 0.2618696091382392,
"grad_norm": 1.4823476343052233,
"learning_rate": 9.222918154834684e-06,
"loss": 1.0756,
"step": 3370
},
{
"epoch": 0.2626466702929521,
"grad_norm": 2.1930418016202577,
"learning_rate": 9.215641002715097e-06,
"loss": 1.0523,
"step": 3380
},
{
"epoch": 0.26342373144766495,
"grad_norm": 1.8533472991342042,
"learning_rate": 9.208332833867772e-06,
"loss": 0.8869,
"step": 3390
},
{
"epoch": 0.26420079260237783,
"grad_norm": 2.184383922916281,
"learning_rate": 9.200993702062821e-06,
"loss": 0.9808,
"step": 3400
},
{
"epoch": 0.2649778537570907,
"grad_norm": 2.510050570387309,
"learning_rate": 9.193623661298164e-06,
"loss": 0.9156,
"step": 3410
},
{
"epoch": 0.2657549149118036,
"grad_norm": 2.546343372247806,
"learning_rate": 9.186222765799137e-06,
"loss": 0.9764,
"step": 3420
},
{
"epoch": 0.2665319760665164,
"grad_norm": 1.5693684379771662,
"learning_rate": 9.17879107001809e-06,
"loss": 0.9491,
"step": 3430
},
{
"epoch": 0.2673090372212293,
"grad_norm": 2.2264963076350544,
"learning_rate": 9.171328628633987e-06,
"loss": 0.9796,
"step": 3440
},
{
"epoch": 0.26808609837594216,
"grad_norm": 1.8513099710874061,
"learning_rate": 9.163835496552006e-06,
"loss": 0.9294,
"step": 3450
},
{
"epoch": 0.26886315953065504,
"grad_norm": 2.1369479039679913,
"learning_rate": 9.15631172890313e-06,
"loss": 0.9428,
"step": 3460
},
{
"epoch": 0.2696402206853679,
"grad_norm": 2.1701410069417806,
"learning_rate": 9.148757381043745e-06,
"loss": 0.9497,
"step": 3470
},
{
"epoch": 0.2704172818400808,
"grad_norm": 2.4018785001267102,
"learning_rate": 9.141172508555234e-06,
"loss": 0.9611,
"step": 3480
},
{
"epoch": 0.27119434299479367,
"grad_norm": 2.5173991790204346,
"learning_rate": 9.133557167243565e-06,
"loss": 0.9233,
"step": 3490
},
{
"epoch": 0.27197140414950655,
"grad_norm": 2.5138075382856497,
"learning_rate": 9.125911413138877e-06,
"loss": 0.9203,
"step": 3500
},
{
"epoch": 0.2727484653042194,
"grad_norm": 2.898893363605526,
"learning_rate": 9.11823530249508e-06,
"loss": 0.8849,
"step": 3510
},
{
"epoch": 0.2735255264589323,
"grad_norm": 1.6850916480287021,
"learning_rate": 9.11052889178943e-06,
"loss": 0.875,
"step": 3520
},
{
"epoch": 0.2743025876136452,
"grad_norm": 2.3316883827873447,
"learning_rate": 9.102792237722114e-06,
"loss": 1.0095,
"step": 3530
},
{
"epoch": 0.27507964876835805,
"grad_norm": 2.1632200172689298,
"learning_rate": 9.095025397215838e-06,
"loss": 0.9276,
"step": 3540
},
{
"epoch": 0.27585670992307093,
"grad_norm": 2.8796310855009795,
"learning_rate": 9.087228427415405e-06,
"loss": 0.9235,
"step": 3550
},
{
"epoch": 0.2766337710777838,
"grad_norm": 2.0564562085035023,
"learning_rate": 9.079401385687299e-06,
"loss": 0.9491,
"step": 3560
},
{
"epoch": 0.2774108322324967,
"grad_norm": 2.608162831191934,
"learning_rate": 9.071544329619253e-06,
"loss": 0.9458,
"step": 3570
},
{
"epoch": 0.27818789338720956,
"grad_norm": 2.521963823842101,
"learning_rate": 9.063657317019838e-06,
"loss": 0.9137,
"step": 3580
},
{
"epoch": 0.27896495454192244,
"grad_norm": 2.535651222771701,
"learning_rate": 9.055740405918026e-06,
"loss": 0.9567,
"step": 3590
},
{
"epoch": 0.2797420156966353,
"grad_norm": 2.2389260303888476,
"learning_rate": 9.04779365456277e-06,
"loss": 0.9689,
"step": 3600
},
{
"epoch": 0.2805190768513482,
"grad_norm": 1.7592398575015094,
"learning_rate": 9.039817121422575e-06,
"loss": 0.9177,
"step": 3610
},
{
"epoch": 0.28129613800606107,
"grad_norm": 1.8002755024191208,
"learning_rate": 9.031810865185066e-06,
"loss": 0.9407,
"step": 3620
},
{
"epoch": 0.28207319916077395,
"grad_norm": 2.3928408034774082,
"learning_rate": 9.023774944756555e-06,
"loss": 0.9863,
"step": 3630
},
{
"epoch": 0.2828502603154868,
"grad_norm": 2.395034750902151,
"learning_rate": 9.015709419261612e-06,
"loss": 0.9869,
"step": 3640
},
{
"epoch": 0.2836273214701997,
"grad_norm": 2.3890411242782466,
"learning_rate": 9.007614348042626e-06,
"loss": 0.909,
"step": 3650
},
{
"epoch": 0.2844043826249126,
"grad_norm": 2.2350831565472107,
"learning_rate": 8.999489790659368e-06,
"loss": 0.8966,
"step": 3660
},
{
"epoch": 0.28518144377962545,
"grad_norm": 3.694934035517618,
"learning_rate": 8.991335806888558e-06,
"loss": 0.9765,
"step": 3670
},
{
"epoch": 0.28595850493433833,
"grad_norm": 3.0768679656946794,
"learning_rate": 8.983152456723419e-06,
"loss": 0.9859,
"step": 3680
},
{
"epoch": 0.2867355660890512,
"grad_norm": 2.4664124428796548,
"learning_rate": 8.97493980037324e-06,
"loss": 0.9534,
"step": 3690
},
{
"epoch": 0.2875126272437641,
"grad_norm": 2.245723206050526,
"learning_rate": 8.96669789826293e-06,
"loss": 0.9482,
"step": 3700
},
{
"epoch": 0.28828968839847696,
"grad_norm": 2.355965037185437,
"learning_rate": 8.958426811032576e-06,
"loss": 0.8993,
"step": 3710
},
{
"epoch": 0.28906674955318984,
"grad_norm": 2.43480515736849,
"learning_rate": 8.950126599536993e-06,
"loss": 0.9597,
"step": 3720
},
{
"epoch": 0.2898438107079027,
"grad_norm": 2.5741426103315304,
"learning_rate": 8.941797324845284e-06,
"loss": 0.9499,
"step": 3730
},
{
"epoch": 0.2906208718626156,
"grad_norm": 2.490107440300966,
"learning_rate": 8.933439048240376e-06,
"loss": 0.8834,
"step": 3740
},
{
"epoch": 0.29139793301732847,
"grad_norm": 2.079854760599078,
"learning_rate": 8.92505183121859e-06,
"loss": 0.9257,
"step": 3750
},
{
"epoch": 0.29217499417204135,
"grad_norm": 2.520480318994419,
"learning_rate": 8.91663573548917e-06,
"loss": 0.9679,
"step": 3760
},
{
"epoch": 0.2929520553267542,
"grad_norm": 1.8583413033492335,
"learning_rate": 8.908190822973838e-06,
"loss": 0.8838,
"step": 3770
},
{
"epoch": 0.2937291164814671,
"grad_norm": 2.3837910942670177,
"learning_rate": 8.899717155806337e-06,
"loss": 0.8847,
"step": 3780
},
{
"epoch": 0.29450617763618,
"grad_norm": 2.711696676240023,
"learning_rate": 8.891214796331973e-06,
"loss": 0.9878,
"step": 3790
},
{
"epoch": 0.29528323879089285,
"grad_norm": 2.502641692502333,
"learning_rate": 8.882683807107154e-06,
"loss": 0.9536,
"step": 3800
},
{
"epoch": 0.29606029994560573,
"grad_norm": 2.3453784276871708,
"learning_rate": 8.874124250898937e-06,
"loss": 0.8787,
"step": 3810
},
{
"epoch": 0.2968373611003186,
"grad_norm": 1.8832906440195756,
"learning_rate": 8.865536190684559e-06,
"loss": 0.9384,
"step": 3820
},
{
"epoch": 0.2976144222550315,
"grad_norm": 2.1009680565481514,
"learning_rate": 8.856919689650977e-06,
"loss": 0.8934,
"step": 3830
},
{
"epoch": 0.29839148340974436,
"grad_norm": 1.9232637840358615,
"learning_rate": 8.848274811194402e-06,
"loss": 0.9733,
"step": 3840
},
{
"epoch": 0.29916854456445724,
"grad_norm": 2.807204409009,
"learning_rate": 8.839601618919833e-06,
"loss": 0.9018,
"step": 3850
},
{
"epoch": 0.2999456057191701,
"grad_norm": 2.0589460869005065,
"learning_rate": 8.830900176640587e-06,
"loss": 0.9858,
"step": 3860
},
{
"epoch": 0.300722666873883,
"grad_norm": 2.454773689152951,
"learning_rate": 8.822170548377835e-06,
"loss": 0.9769,
"step": 3870
},
{
"epoch": 0.30149972802859587,
"grad_norm": 2.128683839495848,
"learning_rate": 8.813412798360126e-06,
"loss": 0.8856,
"step": 3880
},
{
"epoch": 0.30227678918330875,
"grad_norm": 2.4279634048337213,
"learning_rate": 8.804626991022915e-06,
"loss": 0.9671,
"step": 3890
},
{
"epoch": 0.3030538503380216,
"grad_norm": 3.6045788043321894,
"learning_rate": 8.79581319100809e-06,
"loss": 0.8933,
"step": 3900
},
{
"epoch": 0.3038309114927345,
"grad_norm": 2.1672482233441084,
"learning_rate": 8.786971463163495e-06,
"loss": 0.9564,
"step": 3910
},
{
"epoch": 0.3046079726474474,
"grad_norm": 2.1636428752933328,
"learning_rate": 8.778101872542458e-06,
"loss": 0.9913,
"step": 3920
},
{
"epoch": 0.30538503380216026,
"grad_norm": 2.871516588464275,
"learning_rate": 8.769204484403304e-06,
"loss": 0.8939,
"step": 3930
},
{
"epoch": 0.30616209495687313,
"grad_norm": 2.2048100149121814,
"learning_rate": 8.760279364208879e-06,
"loss": 0.8993,
"step": 3940
},
{
"epoch": 0.306939156111586,
"grad_norm": 2.0054550377532343,
"learning_rate": 8.751326577626075e-06,
"loss": 0.9712,
"step": 3950
},
{
"epoch": 0.30771621726629883,
"grad_norm": 1.941321214144556,
"learning_rate": 8.742346190525332e-06,
"loss": 0.9545,
"step": 3960
},
{
"epoch": 0.3084932784210117,
"grad_norm": 2.3634949614963743,
"learning_rate": 8.733338268980166e-06,
"loss": 0.887,
"step": 3970
},
{
"epoch": 0.3092703395757246,
"grad_norm": 3.5243533187865403,
"learning_rate": 8.72430287926668e-06,
"loss": 0.8955,
"step": 3980
},
{
"epoch": 0.31004740073043746,
"grad_norm": 2.3622243989894747,
"learning_rate": 8.715240087863072e-06,
"loss": 0.8944,
"step": 3990
},
{
"epoch": 0.31082446188515034,
"grad_norm": 1.946906851098621,
"learning_rate": 8.70614996144915e-06,
"loss": 0.8534,
"step": 4000
},
{
"epoch": 0.3116015230398632,
"grad_norm": 2.268588081924812,
"learning_rate": 8.697032566905842e-06,
"loss": 0.8884,
"step": 4010
},
{
"epoch": 0.3123785841945761,
"grad_norm": 2.183711381325099,
"learning_rate": 8.6878879713147e-06,
"loss": 0.9143,
"step": 4020
},
{
"epoch": 0.31315564534928897,
"grad_norm": 2.627681687760923,
"learning_rate": 8.678716241957408e-06,
"loss": 0.8835,
"step": 4030
},
{
"epoch": 0.31393270650400185,
"grad_norm": 1.443133233680791,
"learning_rate": 8.669517446315292e-06,
"loss": 0.9273,
"step": 4040
},
{
"epoch": 0.3147097676587147,
"grad_norm": 2.393245491803305,
"learning_rate": 8.660291652068813e-06,
"loss": 0.9162,
"step": 4050
},
{
"epoch": 0.3154868288134276,
"grad_norm": 2.2137742145203987,
"learning_rate": 8.65103892709708e-06,
"loss": 0.9558,
"step": 4060
},
{
"epoch": 0.3162638899681405,
"grad_norm": 1.8575771555594642,
"learning_rate": 8.641759339477345e-06,
"loss": 0.9469,
"step": 4070
},
{
"epoch": 0.31704095112285335,
"grad_norm": 2.3987640931014496,
"learning_rate": 8.632452957484498e-06,
"loss": 0.8976,
"step": 4080
},
{
"epoch": 0.31781801227756623,
"grad_norm": 2.6592688199749612,
"learning_rate": 8.62311984959058e-06,
"loss": 0.8577,
"step": 4090
},
{
"epoch": 0.3185950734322791,
"grad_norm": 2.6015155100334226,
"learning_rate": 8.613760084464258e-06,
"loss": 0.8989,
"step": 4100
},
{
"epoch": 0.319372134586992,
"grad_norm": 3.2861649632260903,
"learning_rate": 8.604373730970334e-06,
"loss": 0.9379,
"step": 4110
},
{
"epoch": 0.32014919574170486,
"grad_norm": 2.2805290644540315,
"learning_rate": 8.59496085816924e-06,
"loss": 0.9307,
"step": 4120
},
{
"epoch": 0.32092625689641774,
"grad_norm": 1.9526498942261281,
"learning_rate": 8.585521535316517e-06,
"loss": 0.9789,
"step": 4130
},
{
"epoch": 0.3217033180511306,
"grad_norm": 2.1513380917456923,
"learning_rate": 8.576055831862317e-06,
"loss": 0.9632,
"step": 4140
},
{
"epoch": 0.3224803792058435,
"grad_norm": 2.9117768462597273,
"learning_rate": 8.56656381745089e-06,
"loss": 0.8607,
"step": 4150
},
{
"epoch": 0.32325744036055637,
"grad_norm": 1.6037295849873296,
"learning_rate": 8.557045561920066e-06,
"loss": 0.9062,
"step": 4160
},
{
"epoch": 0.32403450151526925,
"grad_norm": 2.3047029595748745,
"learning_rate": 8.547501135300747e-06,
"loss": 0.8982,
"step": 4170
},
{
"epoch": 0.3248115626699821,
"grad_norm": 2.414214418200032,
"learning_rate": 8.537930607816386e-06,
"loss": 0.952,
"step": 4180
},
{
"epoch": 0.325588623824695,
"grad_norm": 2.6048634749383037,
"learning_rate": 8.528334049882482e-06,
"loss": 0.9004,
"step": 4190
},
{
"epoch": 0.3263656849794079,
"grad_norm": 2.090591332073441,
"learning_rate": 8.51871153210605e-06,
"loss": 0.9109,
"step": 4200
},
{
"epoch": 0.32714274613412075,
"grad_norm": 2.039137230473015,
"learning_rate": 8.5090631252851e-06,
"loss": 0.8622,
"step": 4210
},
{
"epoch": 0.32791980728883363,
"grad_norm": 1.3644794656877728,
"learning_rate": 8.499388900408131e-06,
"loss": 0.8932,
"step": 4220
},
{
"epoch": 0.3286968684435465,
"grad_norm": 1.9869041419127695,
"learning_rate": 8.489688928653593e-06,
"loss": 0.8921,
"step": 4230
},
{
"epoch": 0.3294739295982594,
"grad_norm": 2.1198129652125908,
"learning_rate": 8.479963281389369e-06,
"loss": 0.9178,
"step": 4240
},
{
"epoch": 0.33025099075297226,
"grad_norm": 2.922298668933732,
"learning_rate": 8.470212030172254e-06,
"loss": 0.8541,
"step": 4250
},
{
"epoch": 0.33102805190768514,
"grad_norm": 2.862204782837741,
"learning_rate": 8.460435246747425e-06,
"loss": 0.9081,
"step": 4260
},
{
"epoch": 0.331805113062398,
"grad_norm": 2.4866367731953103,
"learning_rate": 8.45063300304791e-06,
"loss": 0.9563,
"step": 4270
},
{
"epoch": 0.3325821742171109,
"grad_norm": 3.6054620715626298,
"learning_rate": 8.440805371194064e-06,
"loss": 0.8762,
"step": 4280
},
{
"epoch": 0.33335923537182377,
"grad_norm": 1.357274089384285,
"learning_rate": 8.430952423493038e-06,
"loss": 0.89,
"step": 4290
},
{
"epoch": 0.33413629652653665,
"grad_norm": 2.462550588436075,
"learning_rate": 8.42107423243824e-06,
"loss": 0.8998,
"step": 4300
},
{
"epoch": 0.3349133576812495,
"grad_norm": 2.4758376060526337,
"learning_rate": 8.41117087070881e-06,
"loss": 0.8602,
"step": 4310
},
{
"epoch": 0.3356904188359624,
"grad_norm": 2.670924674405534,
"learning_rate": 8.401242411169085e-06,
"loss": 0.9091,
"step": 4320
},
{
"epoch": 0.3364674799906753,
"grad_norm": 2.4965212229622855,
"learning_rate": 8.391288926868055e-06,
"loss": 0.905,
"step": 4330
},
{
"epoch": 0.33724454114538815,
"grad_norm": 2.6193244431141105,
"learning_rate": 8.381310491038835e-06,
"loss": 0.8834,
"step": 4340
},
{
"epoch": 0.33802160230010103,
"grad_norm": 2.639094468488719,
"learning_rate": 8.371307177098114e-06,
"loss": 0.9659,
"step": 4350
},
{
"epoch": 0.3387986634548139,
"grad_norm": 1.844532803490863,
"learning_rate": 8.361279058645634e-06,
"loss": 0.8736,
"step": 4360
},
{
"epoch": 0.3395757246095268,
"grad_norm": 3.4447047963873647,
"learning_rate": 8.351226209463628e-06,
"loss": 0.8564,
"step": 4370
},
{
"epoch": 0.34035278576423966,
"grad_norm": 2.0546081486698773,
"learning_rate": 8.341148703516291e-06,
"loss": 0.929,
"step": 4380
},
{
"epoch": 0.34112984691895254,
"grad_norm": 2.498839246884663,
"learning_rate": 8.331046614949228e-06,
"loss": 0.8663,
"step": 4390
},
{
"epoch": 0.3419069080736654,
"grad_norm": 2.574109259388575,
"learning_rate": 8.320920018088912e-06,
"loss": 0.9137,
"step": 4400
},
{
"epoch": 0.3426839692283783,
"grad_norm": 3.1393397756280206,
"learning_rate": 8.310768987442139e-06,
"loss": 0.9368,
"step": 4410
},
{
"epoch": 0.34346103038309117,
"grad_norm": 3.20210731977578,
"learning_rate": 8.300593597695476e-06,
"loss": 0.9299,
"step": 4420
},
{
"epoch": 0.34423809153780405,
"grad_norm": 3.5589792979708994,
"learning_rate": 8.290393923714713e-06,
"loss": 0.9587,
"step": 4430
},
{
"epoch": 0.3450151526925169,
"grad_norm": 2.8541415351108825,
"learning_rate": 8.280170040544312e-06,
"loss": 0.8605,
"step": 4440
},
{
"epoch": 0.3457922138472298,
"grad_norm": 2.0518411713546554,
"learning_rate": 8.269922023406851e-06,
"loss": 0.7918,
"step": 4450
},
{
"epoch": 0.3465692750019427,
"grad_norm": 2.699406909968831,
"learning_rate": 8.259649947702485e-06,
"loss": 0.873,
"step": 4460
},
{
"epoch": 0.34734633615665556,
"grad_norm": 3.0919334403019425,
"learning_rate": 8.24935388900837e-06,
"loss": 0.8373,
"step": 4470
},
{
"epoch": 0.34812339731136843,
"grad_norm": 2.9019624759746305,
"learning_rate": 8.239033923078124e-06,
"loss": 0.9174,
"step": 4480
},
{
"epoch": 0.34890045846608125,
"grad_norm": 2.1140460699445764,
"learning_rate": 8.228690125841258e-06,
"loss": 0.8672,
"step": 4490
},
{
"epoch": 0.34967751962079413,
"grad_norm": 3.0197408308584146,
"learning_rate": 8.218322573402629e-06,
"loss": 0.8523,
"step": 4500
},
{
"epoch": 0.350454580775507,
"grad_norm": 2.657040743922122,
"learning_rate": 8.20793134204187e-06,
"loss": 0.8497,
"step": 4510
},
{
"epoch": 0.3512316419302199,
"grad_norm": 3.4478785002624903,
"learning_rate": 8.197516508212832e-06,
"loss": 0.9144,
"step": 4520
},
{
"epoch": 0.35200870308493276,
"grad_norm": 2.615501805261325,
"learning_rate": 8.187078148543026e-06,
"loss": 0.8521,
"step": 4530
},
{
"epoch": 0.35278576423964564,
"grad_norm": 2.7673910964569566,
"learning_rate": 8.176616339833048e-06,
"loss": 0.9834,
"step": 4540
},
{
"epoch": 0.3535628253943585,
"grad_norm": 3.110704979833664,
"learning_rate": 8.166131159056028e-06,
"loss": 0.9291,
"step": 4550
},
{
"epoch": 0.3543398865490714,
"grad_norm": 2.382239717418457,
"learning_rate": 8.155622683357056e-06,
"loss": 0.962,
"step": 4560
},
{
"epoch": 0.35511694770378427,
"grad_norm": 2.974819074830629,
"learning_rate": 8.14509099005261e-06,
"loss": 0.9076,
"step": 4570
},
{
"epoch": 0.35589400885849715,
"grad_norm": 2.025484177379498,
"learning_rate": 8.13453615663e-06,
"loss": 0.9316,
"step": 4580
},
{
"epoch": 0.35667107001321,
"grad_norm": 2.490523269053249,
"learning_rate": 8.123958260746781e-06,
"loss": 0.9202,
"step": 4590
},
{
"epoch": 0.3574481311679229,
"grad_norm": 2.4151860798523566,
"learning_rate": 8.113357380230198e-06,
"loss": 0.8332,
"step": 4600
},
{
"epoch": 0.3582251923226358,
"grad_norm": 2.994576094392819,
"learning_rate": 8.102733593076608e-06,
"loss": 0.907,
"step": 4610
},
{
"epoch": 0.35900225347734865,
"grad_norm": 2.2570861805827898,
"learning_rate": 8.092086977450896e-06,
"loss": 0.892,
"step": 4620
},
{
"epoch": 0.35977931463206153,
"grad_norm": 1.9441465953568793,
"learning_rate": 8.081417611685914e-06,
"loss": 0.8221,
"step": 4630
},
{
"epoch": 0.3605563757867744,
"grad_norm": 2.9229560639134,
"learning_rate": 8.0707255742819e-06,
"loss": 0.8765,
"step": 4640
},
{
"epoch": 0.3613334369414873,
"grad_norm": 3.3085405723587216,
"learning_rate": 8.060010943905894e-06,
"loss": 0.8406,
"step": 4650
},
{
"epoch": 0.36211049809620016,
"grad_norm": 2.7364277865283624,
"learning_rate": 8.049273799391171e-06,
"loss": 0.8282,
"step": 4660
},
{
"epoch": 0.36288755925091304,
"grad_norm": 2.483155933386303,
"learning_rate": 8.038514219736648e-06,
"loss": 0.9325,
"step": 4670
},
{
"epoch": 0.3636646204056259,
"grad_norm": 3.132743137231315,
"learning_rate": 8.027732284106316e-06,
"loss": 0.8662,
"step": 4680
},
{
"epoch": 0.3644416815603388,
"grad_norm": 2.9308723735400233,
"learning_rate": 8.016928071828644e-06,
"loss": 0.876,
"step": 4690
},
{
"epoch": 0.36521874271505167,
"grad_norm": 2.4289372656874058,
"learning_rate": 8.006101662396011e-06,
"loss": 0.8752,
"step": 4700
},
{
"epoch": 0.36599580386976455,
"grad_norm": 3.5005034837842794,
"learning_rate": 7.995253135464103e-06,
"loss": 0.8211,
"step": 4710
},
{
"epoch": 0.3667728650244774,
"grad_norm": 2.6219168824993897,
"learning_rate": 7.984382570851341e-06,
"loss": 0.8963,
"step": 4720
},
{
"epoch": 0.3675499261791903,
"grad_norm": 2.6913591077446544,
"learning_rate": 7.973490048538291e-06,
"loss": 0.8135,
"step": 4730
},
{
"epoch": 0.3683269873339032,
"grad_norm": 3.323688764018341,
"learning_rate": 7.962575648667068e-06,
"loss": 0.8394,
"step": 4740
},
{
"epoch": 0.36910404848861605,
"grad_norm": 1.9160655382592797,
"learning_rate": 7.951639451540759e-06,
"loss": 0.8373,
"step": 4750
},
{
"epoch": 0.36988110964332893,
"grad_norm": 2.2592953806408977,
"learning_rate": 7.940681537622816e-06,
"loss": 0.8717,
"step": 4760
},
{
"epoch": 0.3706581707980418,
"grad_norm": 2.4625597781213933,
"learning_rate": 7.92970198753648e-06,
"loss": 0.8353,
"step": 4770
},
{
"epoch": 0.3714352319527547,
"grad_norm": 2.547595160954955,
"learning_rate": 7.918700882064181e-06,
"loss": 0.8747,
"step": 4780
},
{
"epoch": 0.37221229310746756,
"grad_norm": 3.276135067674202,
"learning_rate": 7.907678302146939e-06,
"loss": 0.8997,
"step": 4790
},
{
"epoch": 0.37298935426218044,
"grad_norm": 3.036723238718559,
"learning_rate": 7.896634328883777e-06,
"loss": 0.8189,
"step": 4800
},
{
"epoch": 0.3737664154168933,
"grad_norm": 2.0650698930773093,
"learning_rate": 7.885569043531118e-06,
"loss": 0.8454,
"step": 4810
},
{
"epoch": 0.3745434765716062,
"grad_norm": 3.760117109301269,
"learning_rate": 7.874482527502192e-06,
"loss": 0.8213,
"step": 4820
},
{
"epoch": 0.37532053772631907,
"grad_norm": 3.531426821109854,
"learning_rate": 7.863374862366428e-06,
"loss": 0.8113,
"step": 4830
},
{
"epoch": 0.37609759888103195,
"grad_norm": 2.3515365517581164,
"learning_rate": 7.85224612984887e-06,
"loss": 0.8064,
"step": 4840
},
{
"epoch": 0.3768746600357448,
"grad_norm": 1.8840341910034588,
"learning_rate": 7.841096411829561e-06,
"loss": 0.8683,
"step": 4850
},
{
"epoch": 0.3776517211904577,
"grad_norm": 2.38418725628485,
"learning_rate": 7.829925790342942e-06,
"loss": 0.7812,
"step": 4860
},
{
"epoch": 0.3784287823451706,
"grad_norm": 2.4785026498656615,
"learning_rate": 7.818734347577258e-06,
"loss": 0.8119,
"step": 4870
},
{
"epoch": 0.37920584349988345,
"grad_norm": 3.137259786348735,
"learning_rate": 7.807522165873945e-06,
"loss": 0.8764,
"step": 4880
},
{
"epoch": 0.37998290465459633,
"grad_norm": 2.8359325177369845,
"learning_rate": 7.796289327727022e-06,
"loss": 0.7978,
"step": 4890
},
{
"epoch": 0.3807599658093092,
"grad_norm": 3.158128777649866,
"learning_rate": 7.7850359157825e-06,
"loss": 0.8412,
"step": 4900
},
{
"epoch": 0.3815370269640221,
"grad_norm": 3.501006126578136,
"learning_rate": 7.773762012837751e-06,
"loss": 0.8779,
"step": 4910
},
{
"epoch": 0.38231408811873496,
"grad_norm": 2.468978859483751,
"learning_rate": 7.762467701840914e-06,
"loss": 0.8813,
"step": 4920
},
{
"epoch": 0.38309114927344784,
"grad_norm": 3.0067259204153634,
"learning_rate": 7.751153065890284e-06,
"loss": 0.7915,
"step": 4930
},
{
"epoch": 0.3838682104281607,
"grad_norm": 3.9988455962849865,
"learning_rate": 7.739818188233693e-06,
"loss": 0.8698,
"step": 4940
},
{
"epoch": 0.3846452715828736,
"grad_norm": 2.8749069871202746,
"learning_rate": 7.728463152267905e-06,
"loss": 0.8986,
"step": 4950
},
{
"epoch": 0.38542233273758647,
"grad_norm": 1.8557781579247277,
"learning_rate": 7.717088041538e-06,
"loss": 0.836,
"step": 4960
},
{
"epoch": 0.38619939389229935,
"grad_norm": 2.554552315654769,
"learning_rate": 7.705692939736754e-06,
"loss": 0.905,
"step": 4970
},
{
"epoch": 0.3869764550470122,
"grad_norm": 3.253478052848826,
"learning_rate": 7.694277930704035e-06,
"loss": 0.8877,
"step": 4980
},
{
"epoch": 0.3877535162017251,
"grad_norm": 2.8816016322900095,
"learning_rate": 7.682843098426173e-06,
"loss": 0.9017,
"step": 4990
},
{
"epoch": 0.388530577356438,
"grad_norm": 3.6095277498188465,
"learning_rate": 7.671388527035353e-06,
"loss": 0.839,
"step": 5000
},
{
"epoch": 0.38930763851115086,
"grad_norm": 2.872689759467288,
"learning_rate": 7.659914300808987e-06,
"loss": 0.8551,
"step": 5010
},
{
"epoch": 0.3900846996658637,
"grad_norm": 3.57677819644193,
"learning_rate": 7.6484205041691e-06,
"loss": 0.9367,
"step": 5020
},
{
"epoch": 0.39086176082057655,
"grad_norm": 2.516301941871412,
"learning_rate": 7.63690722168171e-06,
"loss": 0.8439,
"step": 5030
},
{
"epoch": 0.39163882197528943,
"grad_norm": 3.6976446460324985,
"learning_rate": 7.625374538056196e-06,
"loss": 0.9143,
"step": 5040
},
{
"epoch": 0.3924158831300023,
"grad_norm": 2.4108959760850976,
"learning_rate": 7.61382253814469e-06,
"loss": 0.8488,
"step": 5050
},
{
"epoch": 0.3931929442847152,
"grad_norm": 3.575102830931404,
"learning_rate": 7.6022513069414375e-06,
"loss": 0.9244,
"step": 5060
},
{
"epoch": 0.39397000543942806,
"grad_norm": 2.5214806607432156,
"learning_rate": 7.5906609295821785e-06,
"loss": 0.7828,
"step": 5070
},
{
"epoch": 0.39474706659414094,
"grad_norm": 2.2256882514741267,
"learning_rate": 7.57905149134353e-06,
"loss": 0.8343,
"step": 5080
},
{
"epoch": 0.3955241277488538,
"grad_norm": 2.9737799015299915,
"learning_rate": 7.567423077642342e-06,
"loss": 0.8029,
"step": 5090
},
{
"epoch": 0.3963011889035667,
"grad_norm": 2.1814937586449474,
"learning_rate": 7.555775774035077e-06,
"loss": 0.8595,
"step": 5100
},
{
"epoch": 0.39707825005827957,
"grad_norm": 3.756192351660152,
"learning_rate": 7.544109666217186e-06,
"loss": 0.8058,
"step": 5110
},
{
"epoch": 0.39785531121299245,
"grad_norm": 2.3316584523565544,
"learning_rate": 7.532424840022468e-06,
"loss": 0.8203,
"step": 5120
},
{
"epoch": 0.3986323723677053,
"grad_norm": 3.3303069401649195,
"learning_rate": 7.520721381422444e-06,
"loss": 0.8766,
"step": 5130
},
{
"epoch": 0.3994094335224182,
"grad_norm": 2.7096079276885203,
"learning_rate": 7.5089993765257295e-06,
"loss": 0.8252,
"step": 5140
},
{
"epoch": 0.4001864946771311,
"grad_norm": 2.7989889775088987,
"learning_rate": 7.497258911577385e-06,
"loss": 0.8241,
"step": 5150
},
{
"epoch": 0.40096355583184395,
"grad_norm": 2.8348088908838833,
"learning_rate": 7.485500072958298e-06,
"loss": 0.8047,
"step": 5160
},
{
"epoch": 0.40174061698655683,
"grad_norm": 2.7178413634018206,
"learning_rate": 7.4737229471845384e-06,
"loss": 0.8469,
"step": 5170
},
{
"epoch": 0.4025176781412697,
"grad_norm": 2.653716140680188,
"learning_rate": 7.46192762090673e-06,
"loss": 0.8986,
"step": 5180
},
{
"epoch": 0.4032947392959826,
"grad_norm": 3.2114074118987097,
"learning_rate": 7.450114180909396e-06,
"loss": 0.8572,
"step": 5190
},
{
"epoch": 0.40407180045069546,
"grad_norm": 2.5594692675174904,
"learning_rate": 7.438282714110346e-06,
"loss": 0.8348,
"step": 5200
},
{
"epoch": 0.40484886160540834,
"grad_norm": 2.570719975580699,
"learning_rate": 7.4264333075600094e-06,
"loss": 0.817,
"step": 5210
},
{
"epoch": 0.4056259227601212,
"grad_norm": 1.7904273430264364,
"learning_rate": 7.414566048440815e-06,
"loss": 0.773,
"step": 5220
},
{
"epoch": 0.4064029839148341,
"grad_norm": 3.1160992335315836,
"learning_rate": 7.4026810240665455e-06,
"loss": 0.8406,
"step": 5230
},
{
"epoch": 0.40718004506954697,
"grad_norm": 2.879056289166062,
"learning_rate": 7.390778321881684e-06,
"loss": 0.8299,
"step": 5240
},
{
"epoch": 0.40795710622425985,
"grad_norm": 3.4705886843955134,
"learning_rate": 7.378858029460785e-06,
"loss": 0.8443,
"step": 5250
},
{
"epoch": 0.4087341673789727,
"grad_norm": 3.0683501999483203,
"learning_rate": 7.366920234507819e-06,
"loss": 0.8563,
"step": 5260
},
{
"epoch": 0.4095112285336856,
"grad_norm": 3.0155201359764248,
"learning_rate": 7.354965024855536e-06,
"loss": 0.7995,
"step": 5270
},
{
"epoch": 0.4102882896883985,
"grad_norm": 3.7649992863447594,
"learning_rate": 7.342992488464813e-06,
"loss": 0.8513,
"step": 5280
},
{
"epoch": 0.41106535084311135,
"grad_norm": 2.766804831311677,
"learning_rate": 7.331002713424012e-06,
"loss": 0.818,
"step": 5290
},
{
"epoch": 0.41184241199782423,
"grad_norm": 3.759592880394652,
"learning_rate": 7.3189957879483235e-06,
"loss": 0.8724,
"step": 5300
},
{
"epoch": 0.4126194731525371,
"grad_norm": 3.069207342018398,
"learning_rate": 7.3069718003791276e-06,
"loss": 0.8836,
"step": 5310
},
{
"epoch": 0.41339653430725,
"grad_norm": 3.3679689130107433,
"learning_rate": 7.29493083918334e-06,
"loss": 0.8408,
"step": 5320
},
{
"epoch": 0.41417359546196286,
"grad_norm": 3.1614295846456244,
"learning_rate": 7.282872992952757e-06,
"loss": 0.796,
"step": 5330
},
{
"epoch": 0.41495065661667574,
"grad_norm": 2.3615270875399905,
"learning_rate": 7.270798350403407e-06,
"loss": 0.7622,
"step": 5340
},
{
"epoch": 0.4157277177713886,
"grad_norm": 4.796953025378249,
"learning_rate": 7.2587070003749015e-06,
"loss": 0.8264,
"step": 5350
},
{
"epoch": 0.4165047789261015,
"grad_norm": 3.137452467564146,
"learning_rate": 7.246599031829775e-06,
"loss": 0.7943,
"step": 5360
},
{
"epoch": 0.41728184008081437,
"grad_norm": 3.0340412586302064,
"learning_rate": 7.234474533852834e-06,
"loss": 0.8368,
"step": 5370
},
{
"epoch": 0.41805890123552725,
"grad_norm": 3.5888770778936627,
"learning_rate": 7.222333595650502e-06,
"loss": 0.8416,
"step": 5380
},
{
"epoch": 0.4188359623902401,
"grad_norm": 1.602353309028904,
"learning_rate": 7.210176306550161e-06,
"loss": 0.8347,
"step": 5390
},
{
"epoch": 0.419613023544953,
"grad_norm": 4.051100900717811,
"learning_rate": 7.198002755999495e-06,
"loss": 0.8079,
"step": 5400
},
{
"epoch": 0.4203900846996659,
"grad_norm": 2.6685524323790215,
"learning_rate": 7.185813033565832e-06,
"loss": 0.8434,
"step": 5410
},
{
"epoch": 0.42116714585437875,
"grad_norm": 2.729322434976506,
"learning_rate": 7.1736072289354875e-06,
"loss": 0.8578,
"step": 5420
},
{
"epoch": 0.42194420700909163,
"grad_norm": 2.906073044503289,
"learning_rate": 7.161385431913098e-06,
"loss": 0.7804,
"step": 5430
},
{
"epoch": 0.4227212681638045,
"grad_norm": 2.290496693909145,
"learning_rate": 7.149147732420971e-06,
"loss": 0.8248,
"step": 5440
},
{
"epoch": 0.4234983293185174,
"grad_norm": 5.010159443056758,
"learning_rate": 7.1368942204984094e-06,
"loss": 0.8057,
"step": 5450
},
{
"epoch": 0.42427539047323026,
"grad_norm": 2.536646021262698,
"learning_rate": 7.124624986301062e-06,
"loss": 0.8439,
"step": 5460
},
{
"epoch": 0.42505245162794314,
"grad_norm": 2.9421994943957364,
"learning_rate": 7.112340120100255e-06,
"loss": 0.8744,
"step": 5470
},
{
"epoch": 0.425829512782656,
"grad_norm": 3.3641741595063888,
"learning_rate": 7.100039712282323e-06,
"loss": 0.8211,
"step": 5480
},
{
"epoch": 0.4266065739373689,
"grad_norm": 4.096933321696819,
"learning_rate": 7.0877238533479535e-06,
"loss": 0.838,
"step": 5490
},
{
"epoch": 0.42738363509208177,
"grad_norm": 3.4094346710709678,
"learning_rate": 7.075392633911513e-06,
"loss": 0.8409,
"step": 5500
},
{
"epoch": 0.42816069624679465,
"grad_norm": 1.993447683519007,
"learning_rate": 7.063046144700383e-06,
"loss": 0.8555,
"step": 5510
},
{
"epoch": 0.4289377574015075,
"grad_norm": 2.5909937579577256,
"learning_rate": 7.050684476554299e-06,
"loss": 0.822,
"step": 5520
},
{
"epoch": 0.4297148185562204,
"grad_norm": 3.3258757291630716,
"learning_rate": 7.038307720424668e-06,
"loss": 0.8538,
"step": 5530
},
{
"epoch": 0.4304918797109332,
"grad_norm": 3.6044299100524535,
"learning_rate": 7.025915967373911e-06,
"loss": 0.7909,
"step": 5540
},
{
"epoch": 0.4312689408656461,
"grad_norm": 2.945760411127075,
"learning_rate": 7.013509308574788e-06,
"loss": 0.7084,
"step": 5550
},
{
"epoch": 0.432046002020359,
"grad_norm": 3.9943856557515405,
"learning_rate": 7.001087835309734e-06,
"loss": 0.8192,
"step": 5560
},
{
"epoch": 0.43282306317507185,
"grad_norm": 3.9363696932078094,
"learning_rate": 6.988651638970175e-06,
"loss": 0.7937,
"step": 5570
},
{
"epoch": 0.43360012432978473,
"grad_norm": 2.7961832443632697,
"learning_rate": 6.976200811055867e-06,
"loss": 0.8409,
"step": 5580
},
{
"epoch": 0.4343771854844976,
"grad_norm": 3.573733698773883,
"learning_rate": 6.963735443174213e-06,
"loss": 0.8,
"step": 5590
},
{
"epoch": 0.4351542466392105,
"grad_norm": 1.861238869282892,
"learning_rate": 6.9512556270395996e-06,
"loss": 0.8202,
"step": 5600
},
{
"epoch": 0.43593130779392336,
"grad_norm": 3.435004374927387,
"learning_rate": 6.938761454472718e-06,
"loss": 0.7907,
"step": 5610
},
{
"epoch": 0.43670836894863624,
"grad_norm": 2.169031421644934,
"learning_rate": 6.926253017399882e-06,
"loss": 0.7455,
"step": 5620
},
{
"epoch": 0.4374854301033491,
"grad_norm": 2.639119266804599,
"learning_rate": 6.913730407852359e-06,
"loss": 0.7798,
"step": 5630
},
{
"epoch": 0.438262491258062,
"grad_norm": 3.0923108923433653,
"learning_rate": 6.9011937179656956e-06,
"loss": 0.86,
"step": 5640
},
{
"epoch": 0.43903955241277487,
"grad_norm": 3.4778690753111974,
"learning_rate": 6.888643039979025e-06,
"loss": 0.8565,
"step": 5650
},
{
"epoch": 0.43981661356748775,
"grad_norm": 2.019550042044677,
"learning_rate": 6.8760784662344085e-06,
"loss": 0.8222,
"step": 5660
},
{
"epoch": 0.4405936747222006,
"grad_norm": 2.531115492821316,
"learning_rate": 6.863500089176141e-06,
"loss": 0.7994,
"step": 5670
},
{
"epoch": 0.4413707358769135,
"grad_norm": 3.623980012450744,
"learning_rate": 6.850908001350076e-06,
"loss": 0.8085,
"step": 5680
},
{
"epoch": 0.4421477970316264,
"grad_norm": 2.874269072854778,
"learning_rate": 6.838302295402944e-06,
"loss": 0.8206,
"step": 5690
},
{
"epoch": 0.44292485818633925,
"grad_norm": 3.3046693857663767,
"learning_rate": 6.825683064081673e-06,
"loss": 0.7733,
"step": 5700
},
{
"epoch": 0.44370191934105213,
"grad_norm": 2.820815832528071,
"learning_rate": 6.813050400232705e-06,
"loss": 0.7684,
"step": 5710
},
{
"epoch": 0.444478980495765,
"grad_norm": 3.2657952823703513,
"learning_rate": 6.800404396801309e-06,
"loss": 0.8003,
"step": 5720
},
{
"epoch": 0.4452560416504779,
"grad_norm": 3.316944889654959,
"learning_rate": 6.787745146830903e-06,
"loss": 0.8037,
"step": 5730
},
{
"epoch": 0.44603310280519076,
"grad_norm": 3.850090302427542,
"learning_rate": 6.775072743462368e-06,
"loss": 0.7318,
"step": 5740
},
{
"epoch": 0.44681016395990364,
"grad_norm": 2.488942618483238,
"learning_rate": 6.762387279933355e-06,
"loss": 0.7842,
"step": 5750
},
{
"epoch": 0.4475872251146165,
"grad_norm": 3.9986923312061537,
"learning_rate": 6.749688849577616e-06,
"loss": 0.7452,
"step": 5760
},
{
"epoch": 0.4483642862693294,
"grad_norm": 3.174677745330878,
"learning_rate": 6.736977545824299e-06,
"loss": 0.7755,
"step": 5770
},
{
"epoch": 0.44914134742404227,
"grad_norm": 3.008290639491103,
"learning_rate": 6.72425346219727e-06,
"loss": 0.7483,
"step": 5780
},
{
"epoch": 0.44991840857875515,
"grad_norm": 3.7842544499599335,
"learning_rate": 6.711516692314426e-06,
"loss": 0.8714,
"step": 5790
},
{
"epoch": 0.450695469733468,
"grad_norm": 3.595279361244756,
"learning_rate": 6.698767329887001e-06,
"loss": 0.8087,
"step": 5800
},
{
"epoch": 0.4514725308881809,
"grad_norm": 3.2985766841264974,
"learning_rate": 6.686005468718879e-06,
"loss": 0.7593,
"step": 5810
},
{
"epoch": 0.4522495920428938,
"grad_norm": 3.3364617948252855,
"learning_rate": 6.673231202705906e-06,
"loss": 0.744,
"step": 5820
},
{
"epoch": 0.45302665319760665,
"grad_norm": 1.6739208971136896,
"learning_rate": 6.660444625835194e-06,
"loss": 0.7233,
"step": 5830
},
{
"epoch": 0.45380371435231953,
"grad_norm": 2.908524261261958,
"learning_rate": 6.647645832184437e-06,
"loss": 0.7726,
"step": 5840
},
{
"epoch": 0.4545807755070324,
"grad_norm": 3.741049911001574,
"learning_rate": 6.634834915921211e-06,
"loss": 0.7414,
"step": 5850
},
{
"epoch": 0.4553578366617453,
"grad_norm": 3.525582515759396,
"learning_rate": 6.6220119713022855e-06,
"loss": 0.7431,
"step": 5860
},
{
"epoch": 0.45613489781645816,
"grad_norm": 3.6441156387339446,
"learning_rate": 6.609177092672927e-06,
"loss": 0.8191,
"step": 5870
},
{
"epoch": 0.45691195897117104,
"grad_norm": 3.235190279824699,
"learning_rate": 6.596330374466212e-06,
"loss": 0.7609,
"step": 5880
},
{
"epoch": 0.4576890201258839,
"grad_norm": 2.6003682513249555,
"learning_rate": 6.5834719112023215e-06,
"loss": 0.7252,
"step": 5890
},
{
"epoch": 0.4584660812805968,
"grad_norm": 4.03595764942659,
"learning_rate": 6.570601797487854e-06,
"loss": 0.8437,
"step": 5900
},
{
"epoch": 0.45924314243530967,
"grad_norm": 2.7068297821785943,
"learning_rate": 6.557720128015127e-06,
"loss": 0.8236,
"step": 5910
},
{
"epoch": 0.46002020359002255,
"grad_norm": 3.4599815225643495,
"learning_rate": 6.544826997561479e-06,
"loss": 0.7797,
"step": 5920
},
{
"epoch": 0.4607972647447354,
"grad_norm": 3.773628994151356,
"learning_rate": 6.531922500988572e-06,
"loss": 0.751,
"step": 5930
},
{
"epoch": 0.4615743258994483,
"grad_norm": 2.2173873623143563,
"learning_rate": 6.519006733241697e-06,
"loss": 0.7701,
"step": 5940
},
{
"epoch": 0.4623513870541612,
"grad_norm": 3.033174067089371,
"learning_rate": 6.506079789349074e-06,
"loss": 0.7682,
"step": 5950
},
{
"epoch": 0.46312844820887406,
"grad_norm": 4.1166433622525584,
"learning_rate": 6.493141764421145e-06,
"loss": 0.8537,
"step": 5960
},
{
"epoch": 0.46390550936358693,
"grad_norm": 3.131603304402972,
"learning_rate": 6.48019275364989e-06,
"loss": 0.7729,
"step": 5970
},
{
"epoch": 0.4646825705182998,
"grad_norm": 3.0925113977774674,
"learning_rate": 6.46723285230811e-06,
"loss": 0.7959,
"step": 5980
},
{
"epoch": 0.4654596316730127,
"grad_norm": 4.214785149959189,
"learning_rate": 6.454262155748741e-06,
"loss": 0.771,
"step": 5990
},
{
"epoch": 0.46623669282772556,
"grad_norm": 4.231644528802966,
"learning_rate": 6.4412807594041396e-06,
"loss": 0.8038,
"step": 6000
},
{
"epoch": 0.46701375398243844,
"grad_norm": 3.077252834668561,
"learning_rate": 6.428288758785387e-06,
"loss": 0.7784,
"step": 6010
},
{
"epoch": 0.4677908151371513,
"grad_norm": 3.710905060380187,
"learning_rate": 6.415286249481591e-06,
"loss": 0.7705,
"step": 6020
},
{
"epoch": 0.4685678762918642,
"grad_norm": 4.489857568139187,
"learning_rate": 6.402273327159169e-06,
"loss": 0.7182,
"step": 6030
},
{
"epoch": 0.46934493744657707,
"grad_norm": 3.852955528938296,
"learning_rate": 6.389250087561162e-06,
"loss": 0.7736,
"step": 6040
},
{
"epoch": 0.47012199860128995,
"grad_norm": 3.9025918987862878,
"learning_rate": 6.376216626506513e-06,
"loss": 0.7431,
"step": 6050
},
{
"epoch": 0.4708990597560028,
"grad_norm": 3.4097364478378203,
"learning_rate": 6.363173039889373e-06,
"loss": 0.7973,
"step": 6060
},
{
"epoch": 0.47167612091071565,
"grad_norm": 4.33473272302523,
"learning_rate": 6.350119423678391e-06,
"loss": 0.7898,
"step": 6070
},
{
"epoch": 0.4724531820654285,
"grad_norm": 3.679757021095654,
"learning_rate": 6.3370558739160096e-06,
"loss": 0.7576,
"step": 6080
},
{
"epoch": 0.4732302432201414,
"grad_norm": 3.9057618817922033,
"learning_rate": 6.32398248671776e-06,
"loss": 0.7725,
"step": 6090
},
{
"epoch": 0.4740073043748543,
"grad_norm": 3.403797504220692,
"learning_rate": 6.310899358271549e-06,
"loss": 0.8273,
"step": 6100
},
{
"epoch": 0.47478436552956715,
"grad_norm": 2.2498527490634936,
"learning_rate": 6.2978065848369594e-06,
"loss": 0.7365,
"step": 6110
},
{
"epoch": 0.47556142668428003,
"grad_norm": 3.5041131745023777,
"learning_rate": 6.284704262744532e-06,
"loss": 0.7739,
"step": 6120
},
{
"epoch": 0.4763384878389929,
"grad_norm": 3.236195246500179,
"learning_rate": 6.271592488395064e-06,
"loss": 0.769,
"step": 6130
},
{
"epoch": 0.4771155489937058,
"grad_norm": 4.227426671695652,
"learning_rate": 6.2584713582589015e-06,
"loss": 0.801,
"step": 6140
},
{
"epoch": 0.47789261014841866,
"grad_norm": 2.395986835968045,
"learning_rate": 6.2453409688752244e-06,
"loss": 0.7343,
"step": 6150
},
{
"epoch": 0.47866967130313154,
"grad_norm": 3.050933140103267,
"learning_rate": 6.232201416851332e-06,
"loss": 0.7774,
"step": 6160
},
{
"epoch": 0.4794467324578444,
"grad_norm": 3.680174317755052,
"learning_rate": 6.219052798861948e-06,
"loss": 0.8151,
"step": 6170
},
{
"epoch": 0.4802237936125573,
"grad_norm": 3.282669805242103,
"learning_rate": 6.205895211648489e-06,
"loss": 0.7851,
"step": 6180
},
{
"epoch": 0.48100085476727017,
"grad_norm": 3.0746449279394454,
"learning_rate": 6.192728752018373e-06,
"loss": 0.8465,
"step": 6190
},
{
"epoch": 0.48177791592198305,
"grad_norm": 3.6239050452367345,
"learning_rate": 6.179553516844291e-06,
"loss": 0.7675,
"step": 6200
},
{
"epoch": 0.4825549770766959,
"grad_norm": 2.4293135613154706,
"learning_rate": 6.1663696030635e-06,
"loss": 0.7459,
"step": 6210
},
{
"epoch": 0.4833320382314088,
"grad_norm": 2.4717149655776716,
"learning_rate": 6.153177107677112e-06,
"loss": 0.7385,
"step": 6220
},
{
"epoch": 0.4841090993861217,
"grad_norm": 3.7011954863420424,
"learning_rate": 6.139976127749381e-06,
"loss": 0.7594,
"step": 6230
},
{
"epoch": 0.48488616054083455,
"grad_norm": 3.580923341493924,
"learning_rate": 6.126766760406982e-06,
"loss": 0.7504,
"step": 6240
},
{
"epoch": 0.48566322169554743,
"grad_norm": 3.7474824398696054,
"learning_rate": 6.1135491028383e-06,
"loss": 0.8189,
"step": 6250
},
{
"epoch": 0.4864402828502603,
"grad_norm": 4.008525494927905,
"learning_rate": 6.100323252292721e-06,
"loss": 0.8037,
"step": 6260
},
{
"epoch": 0.4872173440049732,
"grad_norm": 4.533137670554457,
"learning_rate": 6.087089306079907e-06,
"loss": 0.7396,
"step": 6270
},
{
"epoch": 0.48799440515968606,
"grad_norm": 3.577325942559521,
"learning_rate": 6.073847361569085e-06,
"loss": 0.7712,
"step": 6280
},
{
"epoch": 0.48877146631439894,
"grad_norm": 3.4785892916574226,
"learning_rate": 6.06059751618833e-06,
"loss": 0.7744,
"step": 6290
},
{
"epoch": 0.4895485274691118,
"grad_norm": 2.726294641729152,
"learning_rate": 6.047339867423849e-06,
"loss": 0.739,
"step": 6300
},
{
"epoch": 0.4903255886238247,
"grad_norm": 3.2923367667657244,
"learning_rate": 6.034074512819259e-06,
"loss": 0.7921,
"step": 6310
},
{
"epoch": 0.49110264977853757,
"grad_norm": 2.5138919730315163,
"learning_rate": 6.020801549974879e-06,
"loss": 0.7627,
"step": 6320
},
{
"epoch": 0.49187971093325045,
"grad_norm": 3.0639205838133923,
"learning_rate": 6.007521076546999e-06,
"loss": 0.6908,
"step": 6330
},
{
"epoch": 0.4926567720879633,
"grad_norm": 5.28489991162866,
"learning_rate": 5.994233190247174e-06,
"loss": 0.6984,
"step": 6340
},
{
"epoch": 0.4934338332426762,
"grad_norm": 3.1930218466849665,
"learning_rate": 5.9809379888414975e-06,
"loss": 0.7312,
"step": 6350
},
{
"epoch": 0.4942108943973891,
"grad_norm": 2.140853783592497,
"learning_rate": 5.967635570149881e-06,
"loss": 0.739,
"step": 6360
},
{
"epoch": 0.49498795555210195,
"grad_norm": 2.6520877753384706,
"learning_rate": 5.9543260320453445e-06,
"loss": 0.7115,
"step": 6370
},
{
"epoch": 0.49576501670681483,
"grad_norm": 3.5362571286933693,
"learning_rate": 5.941009472453283e-06,
"loss": 0.7313,
"step": 6380
},
{
"epoch": 0.4965420778615277,
"grad_norm": 3.1479357916202173,
"learning_rate": 5.927685989350755e-06,
"loss": 0.7689,
"step": 6390
},
{
"epoch": 0.4973191390162406,
"grad_norm": 4.239286662147043,
"learning_rate": 5.914355680765757e-06,
"loss": 0.7209,
"step": 6400
},
{
"epoch": 0.49809620017095346,
"grad_norm": 4.168222516693175,
"learning_rate": 5.901018644776509e-06,
"loss": 0.7151,
"step": 6410
},
{
"epoch": 0.49887326132566634,
"grad_norm": 2.857843662958384,
"learning_rate": 5.8876749795107214e-06,
"loss": 0.768,
"step": 6420
},
{
"epoch": 0.4996503224803792,
"grad_norm": 3.52360411131157,
"learning_rate": 5.874324783144885e-06,
"loss": 0.8139,
"step": 6430
},
{
"epoch": 0.5004273836350921,
"grad_norm": 3.657211308302993,
"learning_rate": 5.860968153903542e-06,
"loss": 0.6869,
"step": 6440
},
{
"epoch": 0.501204444789805,
"grad_norm": 2.331407753002653,
"learning_rate": 5.847605190058563e-06,
"loss": 0.747,
"step": 6450
},
{
"epoch": 0.5019815059445178,
"grad_norm": 3.7182364487724713,
"learning_rate": 5.8342359899284286e-06,
"loss": 0.7425,
"step": 6460
},
{
"epoch": 0.5027585670992307,
"grad_norm": 3.5617096002819926,
"learning_rate": 5.8208606518775e-06,
"loss": 0.7474,
"step": 6470
},
{
"epoch": 0.5035356282539436,
"grad_norm": 3.1283143308974477,
"learning_rate": 5.807479274315302e-06,
"loss": 0.7354,
"step": 6480
},
{
"epoch": 0.5043126894086565,
"grad_norm": 3.183649544594623,
"learning_rate": 5.79409195569579e-06,
"loss": 0.7693,
"step": 6490
},
{
"epoch": 0.5050897505633694,
"grad_norm": 4.183143639793591,
"learning_rate": 5.780698794516636e-06,
"loss": 0.7159,
"step": 6500
},
{
"epoch": 0.5058668117180822,
"grad_norm": 3.3530863093489613,
"learning_rate": 5.767299889318496e-06,
"loss": 0.7258,
"step": 6510
},
{
"epoch": 0.5066438728727951,
"grad_norm": 3.4594325919428703,
"learning_rate": 5.75389533868429e-06,
"loss": 0.831,
"step": 6520
},
{
"epoch": 0.507420934027508,
"grad_norm": 2.9431596981070642,
"learning_rate": 5.7404852412384725e-06,
"loss": 0.6962,
"step": 6530
},
{
"epoch": 0.5081979951822209,
"grad_norm": 3.0367905793947894,
"learning_rate": 5.72706969564631e-06,
"loss": 0.7612,
"step": 6540
},
{
"epoch": 0.5089750563369337,
"grad_norm": 2.429198874828814,
"learning_rate": 5.713648800613154e-06,
"loss": 0.7464,
"step": 6550
},
{
"epoch": 0.5097521174916466,
"grad_norm": 3.4346659673155964,
"learning_rate": 5.700222654883712e-06,
"loss": 0.784,
"step": 6560
},
{
"epoch": 0.5105291786463595,
"grad_norm": 3.412520275752024,
"learning_rate": 5.686791357241329e-06,
"loss": 0.7418,
"step": 6570
},
{
"epoch": 0.5113062398010724,
"grad_norm": 3.5500533489754957,
"learning_rate": 5.673355006507251e-06,
"loss": 0.7931,
"step": 6580
},
{
"epoch": 0.5120833009557852,
"grad_norm": 3.3785219578924073,
"learning_rate": 5.659913701539903e-06,
"loss": 0.7255,
"step": 6590
},
{
"epoch": 0.5128603621104981,
"grad_norm": 2.8478099507815493,
"learning_rate": 5.646467541234162e-06,
"loss": 0.6869,
"step": 6600
},
{
"epoch": 0.513637423265211,
"grad_norm": 4.116946216809252,
"learning_rate": 5.633016624520627e-06,
"loss": 0.723,
"step": 6610
},
{
"epoch": 0.5144144844199239,
"grad_norm": 4.278208268527751,
"learning_rate": 5.619561050364897e-06,
"loss": 0.7021,
"step": 6620
},
{
"epoch": 0.5151915455746368,
"grad_norm": 3.9380435048254068,
"learning_rate": 5.606100917766829e-06,
"loss": 0.7289,
"step": 6630
},
{
"epoch": 0.5159686067293496,
"grad_norm": 3.035312643544745,
"learning_rate": 5.592636325759829e-06,
"loss": 0.6616,
"step": 6640
},
{
"epoch": 0.5167456678840625,
"grad_norm": 4.67293135855067,
"learning_rate": 5.579167373410108e-06,
"loss": 0.6983,
"step": 6650
},
{
"epoch": 0.5175227290387754,
"grad_norm": 4.655170532587341,
"learning_rate": 5.565694159815955e-06,
"loss": 0.7799,
"step": 6660
},
{
"epoch": 0.5182997901934883,
"grad_norm": 3.3764468867138193,
"learning_rate": 5.552216784107022e-06,
"loss": 0.7443,
"step": 6670
},
{
"epoch": 0.5190768513482011,
"grad_norm": 3.441315238146844,
"learning_rate": 5.538735345443573e-06,
"loss": 0.7195,
"step": 6680
},
{
"epoch": 0.519853912502914,
"grad_norm": 4.575454800944016,
"learning_rate": 5.525249943015771e-06,
"loss": 0.7499,
"step": 6690
},
{
"epoch": 0.5206309736576269,
"grad_norm": 5.206336978319692,
"learning_rate": 5.511760676042941e-06,
"loss": 0.7462,
"step": 6700
},
{
"epoch": 0.5214080348123398,
"grad_norm": 2.782422183265534,
"learning_rate": 5.498267643772842e-06,
"loss": 0.6735,
"step": 6710
},
{
"epoch": 0.5221850959670526,
"grad_norm": 4.799976665563157,
"learning_rate": 5.484770945480935e-06,
"loss": 0.7432,
"step": 6720
},
{
"epoch": 0.5229621571217655,
"grad_norm": 3.68056618328099,
"learning_rate": 5.471270680469656e-06,
"loss": 0.7086,
"step": 6730
},
{
"epoch": 0.5237392182764784,
"grad_norm": 4.337600776833273,
"learning_rate": 5.457766948067682e-06,
"loss": 0.6972,
"step": 6740
},
{
"epoch": 0.5245162794311913,
"grad_norm": 2.9170786823925754,
"learning_rate": 5.4442598476292e-06,
"loss": 0.697,
"step": 6750
},
{
"epoch": 0.5252933405859042,
"grad_norm": 3.389813065457727,
"learning_rate": 5.430749478533182e-06,
"loss": 0.6823,
"step": 6760
},
{
"epoch": 0.526070401740617,
"grad_norm": 4.405810375053449,
"learning_rate": 5.417235940182646e-06,
"loss": 0.6954,
"step": 6770
},
{
"epoch": 0.5268474628953299,
"grad_norm": 3.745948791175591,
"learning_rate": 5.403719332003925e-06,
"loss": 0.7129,
"step": 6780
},
{
"epoch": 0.5276245240500428,
"grad_norm": 3.5759861354998095,
"learning_rate": 5.390199753445945e-06,
"loss": 0.7457,
"step": 6790
},
{
"epoch": 0.5284015852047557,
"grad_norm": 3.133292740862389,
"learning_rate": 5.376677303979481e-06,
"loss": 0.716,
"step": 6800
},
{
"epoch": 0.5291786463594685,
"grad_norm": 3.6994792177101536,
"learning_rate": 5.3631520830964335e-06,
"loss": 0.7075,
"step": 6810
},
{
"epoch": 0.5299557075141814,
"grad_norm": 4.2709254391755875,
"learning_rate": 5.349624190309095e-06,
"loss": 0.6646,
"step": 6820
},
{
"epoch": 0.5307327686688943,
"grad_norm": 4.757235420288998,
"learning_rate": 5.3360937251494145e-06,
"loss": 0.7197,
"step": 6830
},
{
"epoch": 0.5315098298236072,
"grad_norm": 3.970395562121448,
"learning_rate": 5.322560787168266e-06,
"loss": 0.7113,
"step": 6840
},
{
"epoch": 0.53228689097832,
"grad_norm": 3.4076129510381636,
"learning_rate": 5.30902547593472e-06,
"loss": 0.7051,
"step": 6850
},
{
"epoch": 0.5330639521330328,
"grad_norm": 4.69605182138137,
"learning_rate": 5.29548789103531e-06,
"loss": 0.7044,
"step": 6860
},
{
"epoch": 0.5338410132877457,
"grad_norm": 3.804895971708535,
"learning_rate": 5.281948132073293e-06,
"loss": 0.7119,
"step": 6870
},
{
"epoch": 0.5346180744424586,
"grad_norm": 3.6916149040278596,
"learning_rate": 5.2684062986679245e-06,
"loss": 0.7208,
"step": 6880
},
{
"epoch": 0.5353951355971714,
"grad_norm": 3.0845852262650775,
"learning_rate": 5.254862490453723e-06,
"loss": 0.6855,
"step": 6890
},
{
"epoch": 0.5361721967518843,
"grad_norm": 4.685912874705627,
"learning_rate": 5.241316807079735e-06,
"loss": 0.7176,
"step": 6900
},
{
"epoch": 0.5369492579065972,
"grad_norm": 2.9240144110586157,
"learning_rate": 5.227769348208808e-06,
"loss": 0.7158,
"step": 6910
},
{
"epoch": 0.5377263190613101,
"grad_norm": 3.258492056259544,
"learning_rate": 5.214220213516849e-06,
"loss": 0.6492,
"step": 6920
},
{
"epoch": 0.538503380216023,
"grad_norm": 4.273950085839226,
"learning_rate": 5.200669502692092e-06,
"loss": 0.6784,
"step": 6930
},
{
"epoch": 0.5392804413707358,
"grad_norm": 2.6079076529513503,
"learning_rate": 5.187117315434374e-06,
"loss": 0.6969,
"step": 6940
},
{
"epoch": 0.5400575025254487,
"grad_norm": 3.4737447133789847,
"learning_rate": 5.173563751454393e-06,
"loss": 0.7804,
"step": 6950
},
{
"epoch": 0.5408345636801616,
"grad_norm": 4.786817720128349,
"learning_rate": 5.160008910472971e-06,
"loss": 0.6805,
"step": 6960
},
{
"epoch": 0.5416116248348745,
"grad_norm": 3.7701770083150197,
"learning_rate": 5.146452892220334e-06,
"loss": 0.7214,
"step": 6970
},
{
"epoch": 0.5423886859895873,
"grad_norm": 3.7554811031983344,
"learning_rate": 5.132895796435363e-06,
"loss": 0.6417,
"step": 6980
},
{
"epoch": 0.5431657471443002,
"grad_norm": 3.5547381426364097,
"learning_rate": 5.119337722864871e-06,
"loss": 0.6636,
"step": 6990
},
{
"epoch": 0.5439428082990131,
"grad_norm": 5.011611632534712,
"learning_rate": 5.1057787712628645e-06,
"loss": 0.6869,
"step": 7000
},
{
"epoch": 0.544719869453726,
"grad_norm": 3.833252076719035,
"learning_rate": 5.092219041389809e-06,
"loss": 0.698,
"step": 7010
},
{
"epoch": 0.5454969306084388,
"grad_norm": 3.94968001273636,
"learning_rate": 5.0786586330118936e-06,
"loss": 0.6499,
"step": 7020
},
{
"epoch": 0.5462739917631517,
"grad_norm": 4.652418519560147,
"learning_rate": 5.065097645900305e-06,
"loss": 0.7365,
"step": 7030
},
{
"epoch": 0.5470510529178646,
"grad_norm": 3.4688260249453333,
"learning_rate": 5.051536179830485e-06,
"loss": 0.7244,
"step": 7040
},
{
"epoch": 0.5478281140725775,
"grad_norm": 3.507980085656876,
"learning_rate": 5.0379743345814e-06,
"loss": 0.6463,
"step": 7050
},
{
"epoch": 0.5486051752272904,
"grad_norm": 4.08415517826481,
"learning_rate": 5.024412209934806e-06,
"loss": 0.7134,
"step": 7060
},
{
"epoch": 0.5493822363820032,
"grad_norm": 3.1430434027718848,
"learning_rate": 5.010849905674513e-06,
"loss": 0.6646,
"step": 7070
},
{
"epoch": 0.5501592975367161,
"grad_norm": 1.7398353080625177,
"learning_rate": 4.997287521585657e-06,
"loss": 0.6604,
"step": 7080
},
{
"epoch": 0.550936358691429,
"grad_norm": 3.6616218145390356,
"learning_rate": 4.983725157453956e-06,
"loss": 0.6713,
"step": 7090
},
{
"epoch": 0.5517134198461419,
"grad_norm": 3.811153246818418,
"learning_rate": 4.9701629130649834e-06,
"loss": 0.7095,
"step": 7100
},
{
"epoch": 0.5524904810008547,
"grad_norm": 4.929016419712588,
"learning_rate": 4.956600888203433e-06,
"loss": 0.6714,
"step": 7110
},
{
"epoch": 0.5532675421555676,
"grad_norm": 3.4541756616239927,
"learning_rate": 4.943039182652383e-06,
"loss": 0.7235,
"step": 7120
},
{
"epoch": 0.5540446033102805,
"grad_norm": 4.095722371398238,
"learning_rate": 4.929477896192561e-06,
"loss": 0.8093,
"step": 7130
},
{
"epoch": 0.5548216644649934,
"grad_norm": 4.870666395156222,
"learning_rate": 4.915917128601611e-06,
"loss": 0.7031,
"step": 7140
},
{
"epoch": 0.5555987256197062,
"grad_norm": 3.448418758510041,
"learning_rate": 4.902356979653361e-06,
"loss": 0.7084,
"step": 7150
},
{
"epoch": 0.5563757867744191,
"grad_norm": 3.829159584215915,
"learning_rate": 4.8887975491170845e-06,
"loss": 0.7181,
"step": 7160
},
{
"epoch": 0.557152847929132,
"grad_norm": 3.555777208653401,
"learning_rate": 4.875238936756774e-06,
"loss": 0.6763,
"step": 7170
},
{
"epoch": 0.5579299090838449,
"grad_norm": 2.5493937496001187,
"learning_rate": 4.861681242330397e-06,
"loss": 0.6756,
"step": 7180
},
{
"epoch": 0.5587069702385578,
"grad_norm": 3.3198532718689813,
"learning_rate": 4.84812456558917e-06,
"loss": 0.6644,
"step": 7190
},
{
"epoch": 0.5594840313932706,
"grad_norm": 3.829290955616477,
"learning_rate": 4.834569006276823e-06,
"loss": 0.6786,
"step": 7200
},
{
"epoch": 0.5602610925479835,
"grad_norm": 2.592783541640363,
"learning_rate": 4.821014664128859e-06,
"loss": 0.7156,
"step": 7210
},
{
"epoch": 0.5610381537026964,
"grad_norm": 4.188978510013467,
"learning_rate": 4.807461638871835e-06,
"loss": 0.7262,
"step": 7220
},
{
"epoch": 0.5618152148574093,
"grad_norm": 3.069522579226053,
"learning_rate": 4.79391003022261e-06,
"loss": 0.6989,
"step": 7230
},
{
"epoch": 0.5625922760121221,
"grad_norm": 4.039799899118001,
"learning_rate": 4.780359937887625e-06,
"loss": 0.6682,
"step": 7240
},
{
"epoch": 0.563369337166835,
"grad_norm": 4.6623197649536126,
"learning_rate": 4.766811461562163e-06,
"loss": 0.6464,
"step": 7250
},
{
"epoch": 0.5641463983215479,
"grad_norm": 5.438968217638661,
"learning_rate": 4.753264700929619e-06,
"loss": 0.6507,
"step": 7260
},
{
"epoch": 0.5649234594762608,
"grad_norm": 4.0222533809812,
"learning_rate": 4.739719755660761e-06,
"loss": 0.7014,
"step": 7270
},
{
"epoch": 0.5657005206309736,
"grad_norm": 4.058570524163514,
"learning_rate": 4.726176725413004e-06,
"loss": 0.693,
"step": 7280
},
{
"epoch": 0.5664775817856865,
"grad_norm": 3.3787013409423445,
"learning_rate": 4.712635709829672e-06,
"loss": 0.6591,
"step": 7290
},
{
"epoch": 0.5672546429403994,
"grad_norm": 3.3640659595948708,
"learning_rate": 4.699096808539264e-06,
"loss": 0.7431,
"step": 7300
},
{
"epoch": 0.5680317040951123,
"grad_norm": 3.1238662551833616,
"learning_rate": 4.685560121154729e-06,
"loss": 0.6474,
"step": 7310
},
{
"epoch": 0.5688087652498252,
"grad_norm": 2.452949406434516,
"learning_rate": 4.672025747272721e-06,
"loss": 0.6816,
"step": 7320
},
{
"epoch": 0.569585826404538,
"grad_norm": 3.127308776747053,
"learning_rate": 4.658493786472874e-06,
"loss": 0.6741,
"step": 7330
},
{
"epoch": 0.5703628875592509,
"grad_norm": 3.9891903397041455,
"learning_rate": 4.644964338317069e-06,
"loss": 0.7111,
"step": 7340
},
{
"epoch": 0.5711399487139638,
"grad_norm": 3.495751965003335,
"learning_rate": 4.631437502348697e-06,
"loss": 0.6552,
"step": 7350
},
{
"epoch": 0.5719170098686767,
"grad_norm": 3.436449484433345,
"learning_rate": 4.617913378091935e-06,
"loss": 0.6893,
"step": 7360
},
{
"epoch": 0.5726940710233895,
"grad_norm": 3.0865849237950784,
"learning_rate": 4.604392065051003e-06,
"loss": 0.7376,
"step": 7370
},
{
"epoch": 0.5734711321781024,
"grad_norm": 4.474788471571803,
"learning_rate": 4.590873662709441e-06,
"loss": 0.6914,
"step": 7380
},
{
"epoch": 0.5742481933328153,
"grad_norm": 2.91533419260106,
"learning_rate": 4.577358270529371e-06,
"loss": 0.6414,
"step": 7390
},
{
"epoch": 0.5750252544875282,
"grad_norm": 4.0797704361429785,
"learning_rate": 4.5638459879507685e-06,
"loss": 0.6661,
"step": 7400
},
{
"epoch": 0.575802315642241,
"grad_norm": 4.709772893333078,
"learning_rate": 4.550336914390734e-06,
"loss": 0.6594,
"step": 7410
},
{
"epoch": 0.5765793767969539,
"grad_norm": 4.564968479413114,
"learning_rate": 4.536831149242752e-06,
"loss": 0.6672,
"step": 7420
},
{
"epoch": 0.5773564379516668,
"grad_norm": 4.056479158493849,
"learning_rate": 4.5233287918759645e-06,
"loss": 0.708,
"step": 7430
},
{
"epoch": 0.5781334991063797,
"grad_norm": 3.645071188138108,
"learning_rate": 4.509829941634447e-06,
"loss": 0.686,
"step": 7440
},
{
"epoch": 0.5789105602610926,
"grad_norm": 3.7318479118380044,
"learning_rate": 4.496334697836466e-06,
"loss": 0.6866,
"step": 7450
},
{
"epoch": 0.5796876214158054,
"grad_norm": 3.6748150242674384,
"learning_rate": 4.482843159773753e-06,
"loss": 0.701,
"step": 7460
},
{
"epoch": 0.5804646825705183,
"grad_norm": 3.532495775566941,
"learning_rate": 4.46935542671078e-06,
"loss": 0.6266,
"step": 7470
},
{
"epoch": 0.5812417437252312,
"grad_norm": 3.917282093097207,
"learning_rate": 4.455871597884016e-06,
"loss": 0.6965,
"step": 7480
},
{
"epoch": 0.5820188048799441,
"grad_norm": 3.541326700374132,
"learning_rate": 4.4423917725012125e-06,
"loss": 0.6256,
"step": 7490
},
{
"epoch": 0.5827958660346569,
"grad_norm": 2.8073311337818088,
"learning_rate": 4.428916049740657e-06,
"loss": 0.5885,
"step": 7500
},
{
"epoch": 0.5835729271893698,
"grad_norm": 3.374101386732686,
"learning_rate": 4.41544452875046e-06,
"loss": 0.6549,
"step": 7510
},
{
"epoch": 0.5843499883440827,
"grad_norm": 4.325578617573067,
"learning_rate": 4.401977308647811e-06,
"loss": 0.6566,
"step": 7520
},
{
"epoch": 0.5851270494987956,
"grad_norm": 4.915536833619769,
"learning_rate": 4.38851448851826e-06,
"loss": 0.6687,
"step": 7530
},
{
"epoch": 0.5859041106535084,
"grad_norm": 3.6537787425693544,
"learning_rate": 4.3750561674149815e-06,
"loss": 0.6292,
"step": 7540
},
{
"epoch": 0.5866811718082213,
"grad_norm": 2.9777148243481335,
"learning_rate": 4.3616024443580475e-06,
"loss": 0.6541,
"step": 7550
},
{
"epoch": 0.5874582329629342,
"grad_norm": 3.5260018889623455,
"learning_rate": 4.348153418333703e-06,
"loss": 0.667,
"step": 7560
},
{
"epoch": 0.5882352941176471,
"grad_norm": 3.7174490457010654,
"learning_rate": 4.334709188293631e-06,
"loss": 0.6419,
"step": 7570
},
{
"epoch": 0.58901235527236,
"grad_norm": 3.4684662206499355,
"learning_rate": 4.321269853154231e-06,
"loss": 0.65,
"step": 7580
},
{
"epoch": 0.5897894164270728,
"grad_norm": 3.1882054970304083,
"learning_rate": 4.307835511795883e-06,
"loss": 0.622,
"step": 7590
},
{
"epoch": 0.5905664775817857,
"grad_norm": 4.381319562804776,
"learning_rate": 4.294406263062235e-06,
"loss": 0.6422,
"step": 7600
},
{
"epoch": 0.5913435387364986,
"grad_norm": 3.724730362444138,
"learning_rate": 4.280982205759453e-06,
"loss": 0.664,
"step": 7610
},
{
"epoch": 0.5921205998912115,
"grad_norm": 3.2942646676430027,
"learning_rate": 4.267563438655517e-06,
"loss": 0.6834,
"step": 7620
},
{
"epoch": 0.5928976610459243,
"grad_norm": 3.9059709080382445,
"learning_rate": 4.254150060479479e-06,
"loss": 0.6773,
"step": 7630
},
{
"epoch": 0.5936747222006372,
"grad_norm": 3.2926775490538867,
"learning_rate": 4.240742169920744e-06,
"loss": 0.6612,
"step": 7640
},
{
"epoch": 0.5944517833553501,
"grad_norm": 3.721480675397905,
"learning_rate": 4.22733986562834e-06,
"loss": 0.5946,
"step": 7650
},
{
"epoch": 0.595228844510063,
"grad_norm": 3.6657313410284282,
"learning_rate": 4.213943246210195e-06,
"loss": 0.6839,
"step": 7660
},
{
"epoch": 0.5960059056647758,
"grad_norm": 3.555216109953286,
"learning_rate": 4.200552410232411e-06,
"loss": 0.6839,
"step": 7670
},
{
"epoch": 0.5967829668194887,
"grad_norm": 4.24437071856819,
"learning_rate": 4.187167456218536e-06,
"loss": 0.7096,
"step": 7680
},
{
"epoch": 0.5975600279742016,
"grad_norm": 3.760444842640791,
"learning_rate": 4.173788482648841e-06,
"loss": 0.6495,
"step": 7690
},
{
"epoch": 0.5983370891289145,
"grad_norm": 3.2749111360276086,
"learning_rate": 4.1604155879595985e-06,
"loss": 0.6266,
"step": 7700
},
{
"epoch": 0.5991141502836274,
"grad_norm": 4.05061726263054,
"learning_rate": 4.147048870542358e-06,
"loss": 0.6682,
"step": 7710
},
{
"epoch": 0.5998912114383402,
"grad_norm": 4.177296915658458,
"learning_rate": 4.133688428743209e-06,
"loss": 0.6504,
"step": 7720
},
{
"epoch": 0.6006682725930531,
"grad_norm": 3.4374499956078997,
"learning_rate": 4.120334360862078e-06,
"loss": 0.6068,
"step": 7730
},
{
"epoch": 0.601445333747766,
"grad_norm": 3.7771571359160374,
"learning_rate": 4.106986765151992e-06,
"loss": 0.6811,
"step": 7740
},
{
"epoch": 0.6022223949024789,
"grad_norm": 2.755405096701383,
"learning_rate": 4.093645739818357e-06,
"loss": 0.6374,
"step": 7750
},
{
"epoch": 0.6029994560571917,
"grad_norm": 4.718012688255332,
"learning_rate": 4.080311383018239e-06,
"loss": 0.7078,
"step": 7760
},
{
"epoch": 0.6037765172119046,
"grad_norm": 2.894912540809299,
"learning_rate": 4.06698379285964e-06,
"loss": 0.6759,
"step": 7770
},
{
"epoch": 0.6045535783666175,
"grad_norm": 3.025336800067562,
"learning_rate": 4.0536630674007734e-06,
"loss": 0.6109,
"step": 7780
},
{
"epoch": 0.6053306395213304,
"grad_norm": 3.2614510795042126,
"learning_rate": 4.040349304649351e-06,
"loss": 0.685,
"step": 7790
},
{
"epoch": 0.6061077006760432,
"grad_norm": 2.800252117497351,
"learning_rate": 4.027042602561853e-06,
"loss": 0.6498,
"step": 7800
},
{
"epoch": 0.6068847618307561,
"grad_norm": 3.8460226274586122,
"learning_rate": 4.013743059042808e-06,
"loss": 0.6977,
"step": 7810
},
{
"epoch": 0.607661822985469,
"grad_norm": 3.771896387641876,
"learning_rate": 4.0004507719440795e-06,
"loss": 0.6635,
"step": 7820
},
{
"epoch": 0.6084388841401819,
"grad_norm": 3.1786304501140092,
"learning_rate": 3.987165839064141e-06,
"loss": 0.6758,
"step": 7830
},
{
"epoch": 0.6092159452948948,
"grad_norm": 5.015425132509244,
"learning_rate": 3.973888358147353e-06,
"loss": 0.623,
"step": 7840
},
{
"epoch": 0.6099930064496076,
"grad_norm": 4.27847425835873,
"learning_rate": 3.9606184268832525e-06,
"loss": 0.6758,
"step": 7850
},
{
"epoch": 0.6107700676043205,
"grad_norm": 3.3936214832633507,
"learning_rate": 3.947356142905827e-06,
"loss": 0.6132,
"step": 7860
},
{
"epoch": 0.6115471287590334,
"grad_norm": 2.5020153230654896,
"learning_rate": 3.934101603792802e-06,
"loss": 0.6084,
"step": 7870
},
{
"epoch": 0.6123241899137463,
"grad_norm": 3.0348186320695936,
"learning_rate": 3.920854907064912e-06,
"loss": 0.6277,
"step": 7880
},
{
"epoch": 0.6131012510684591,
"grad_norm": 4.926182627828219,
"learning_rate": 3.907616150185205e-06,
"loss": 0.6746,
"step": 7890
},
{
"epoch": 0.613878312223172,
"grad_norm": 4.0423507052637735,
"learning_rate": 3.894385430558297e-06,
"loss": 0.6112,
"step": 7900
},
{
"epoch": 0.6146553733778849,
"grad_norm": 3.549727749823181,
"learning_rate": 3.881162845529678e-06,
"loss": 0.6219,
"step": 7910
},
{
"epoch": 0.6154324345325977,
"grad_norm": 4.713227361162499,
"learning_rate": 3.867948492384983e-06,
"loss": 0.6693,
"step": 7920
},
{
"epoch": 0.6162094956873105,
"grad_norm": 3.471848373352376,
"learning_rate": 3.854742468349283e-06,
"loss": 0.6833,
"step": 7930
},
{
"epoch": 0.6169865568420234,
"grad_norm": 7.217595191023394,
"learning_rate": 3.841544870586369e-06,
"loss": 0.6947,
"step": 7940
},
{
"epoch": 0.6177636179967363,
"grad_norm": 2.9040989631629976,
"learning_rate": 3.828355796198029e-06,
"loss": 0.6342,
"step": 7950
},
{
"epoch": 0.6185406791514492,
"grad_norm": 3.7080878359935268,
"learning_rate": 3.815175342223349e-06,
"loss": 0.6267,
"step": 7960
},
{
"epoch": 0.619317740306162,
"grad_norm": 4.731993499154974,
"learning_rate": 3.80200360563798e-06,
"loss": 0.6319,
"step": 7970
},
{
"epoch": 0.6200948014608749,
"grad_norm": 3.2422107203395267,
"learning_rate": 3.7888406833534447e-06,
"loss": 0.6219,
"step": 7980
},
{
"epoch": 0.6208718626155878,
"grad_norm": 2.7384103955014565,
"learning_rate": 3.7756866722164055e-06,
"loss": 0.6304,
"step": 7990
},
{
"epoch": 0.6216489237703007,
"grad_norm": 4.934854236839532,
"learning_rate": 3.7625416690079674e-06,
"loss": 0.5913,
"step": 8000
},
{
"epoch": 0.6224259849250136,
"grad_norm": 5.278185394532136,
"learning_rate": 3.749405770442954e-06,
"loss": 0.6062,
"step": 8010
},
{
"epoch": 0.6232030460797264,
"grad_norm": 3.745775463675437,
"learning_rate": 3.7362790731692045e-06,
"loss": 0.5785,
"step": 8020
},
{
"epoch": 0.6239801072344393,
"grad_norm": 3.0793776700444893,
"learning_rate": 3.7231616737668587e-06,
"loss": 0.6212,
"step": 8030
},
{
"epoch": 0.6247571683891522,
"grad_norm": 4.616140309647705,
"learning_rate": 3.710053668747644e-06,
"loss": 0.6978,
"step": 8040
},
{
"epoch": 0.6255342295438651,
"grad_norm": 2.266055763696263,
"learning_rate": 3.696955154554174e-06,
"loss": 0.6677,
"step": 8050
},
{
"epoch": 0.6263112906985779,
"grad_norm": 3.167710349649831,
"learning_rate": 3.6838662275592285e-06,
"loss": 0.5961,
"step": 8060
},
{
"epoch": 0.6270883518532908,
"grad_norm": 3.6679021169417583,
"learning_rate": 3.670786984065049e-06,
"loss": 0.5932,
"step": 8070
},
{
"epoch": 0.6278654130080037,
"grad_norm": 4.807394417840595,
"learning_rate": 3.657717520302635e-06,
"loss": 0.6507,
"step": 8080
},
{
"epoch": 0.6286424741627166,
"grad_norm": 2.8567195928058697,
"learning_rate": 3.6446579324310283e-06,
"loss": 0.5622,
"step": 8090
},
{
"epoch": 0.6294195353174294,
"grad_norm": 4.87655399348002,
"learning_rate": 3.6316083165366066e-06,
"loss": 0.6807,
"step": 8100
},
{
"epoch": 0.6301965964721423,
"grad_norm": 3.7014748147970886,
"learning_rate": 3.61856876863238e-06,
"loss": 0.6127,
"step": 8110
},
{
"epoch": 0.6309736576268552,
"grad_norm": 3.9766985471750482,
"learning_rate": 3.6055393846572863e-06,
"loss": 0.6355,
"step": 8120
},
{
"epoch": 0.6317507187815681,
"grad_norm": 5.176163354598203,
"learning_rate": 3.592520260475474e-06,
"loss": 0.5764,
"step": 8130
},
{
"epoch": 0.632527779936281,
"grad_norm": 3.3915897413256273,
"learning_rate": 3.579511491875614e-06,
"loss": 0.5824,
"step": 8140
},
{
"epoch": 0.6333048410909938,
"grad_norm": 2.968301217496569,
"learning_rate": 3.5665131745701796e-06,
"loss": 0.6927,
"step": 8150
},
{
"epoch": 0.6340819022457067,
"grad_norm": 3.4049937558114367,
"learning_rate": 3.5535254041947487e-06,
"loss": 0.6589,
"step": 8160
},
{
"epoch": 0.6348589634004196,
"grad_norm": 3.0490199659476223,
"learning_rate": 3.5405482763073006e-06,
"loss": 0.6264,
"step": 8170
},
{
"epoch": 0.6356360245551325,
"grad_norm": 4.610543482084557,
"learning_rate": 3.5275818863875176e-06,
"loss": 0.6298,
"step": 8180
},
{
"epoch": 0.6364130857098453,
"grad_norm": 3.792284286942197,
"learning_rate": 3.5146263298360676e-06,
"loss": 0.6409,
"step": 8190
},
{
"epoch": 0.6371901468645582,
"grad_norm": 4.791463361046891,
"learning_rate": 3.501681701973917e-06,
"loss": 0.5988,
"step": 8200
},
{
"epoch": 0.6379672080192711,
"grad_norm": 2.946227557833364,
"learning_rate": 3.488748098041623e-06,
"loss": 0.56,
"step": 8210
},
{
"epoch": 0.638744269173984,
"grad_norm": 3.9143118513649013,
"learning_rate": 3.4758256131986333e-06,
"loss": 0.6102,
"step": 8220
},
{
"epoch": 0.6395213303286968,
"grad_norm": 7.013871477575305,
"learning_rate": 3.4629143425225893e-06,
"loss": 0.6887,
"step": 8230
},
{
"epoch": 0.6402983914834097,
"grad_norm": 3.771798826744058,
"learning_rate": 3.4500143810086194e-06,
"loss": 0.6373,
"step": 8240
},
{
"epoch": 0.6410754526381226,
"grad_norm": 3.132474576222066,
"learning_rate": 3.437125823568646e-06,
"loss": 0.6452,
"step": 8250
},
{
"epoch": 0.6418525137928355,
"grad_norm": 4.0341361359246,
"learning_rate": 3.4242487650306867e-06,
"loss": 0.65,
"step": 8260
},
{
"epoch": 0.6426295749475484,
"grad_norm": 3.489817034481266,
"learning_rate": 3.4113833001381575e-06,
"loss": 0.6041,
"step": 8270
},
{
"epoch": 0.6434066361022612,
"grad_norm": 4.207948013742414,
"learning_rate": 3.398529523549169e-06,
"loss": 0.6047,
"step": 8280
},
{
"epoch": 0.6441836972569741,
"grad_norm": 3.300977059658827,
"learning_rate": 3.3856875298358365e-06,
"loss": 0.6619,
"step": 8290
},
{
"epoch": 0.644960758411687,
"grad_norm": 3.8241041070180413,
"learning_rate": 3.3728574134835846e-06,
"loss": 0.6198,
"step": 8300
},
{
"epoch": 0.6457378195663999,
"grad_norm": 3.875014176616493,
"learning_rate": 3.360039268890446e-06,
"loss": 0.6003,
"step": 8310
},
{
"epoch": 0.6465148807211127,
"grad_norm": 3.2752573740495556,
"learning_rate": 3.347233190366375e-06,
"loss": 0.6101,
"step": 8320
},
{
"epoch": 0.6472919418758256,
"grad_norm": 3.8745882003993177,
"learning_rate": 3.3344392721325458e-06,
"loss": 0.6248,
"step": 8330
},
{
"epoch": 0.6480690030305385,
"grad_norm": 2.942894246587158,
"learning_rate": 3.3216576083206637e-06,
"loss": 0.6087,
"step": 8340
},
{
"epoch": 0.6488460641852514,
"grad_norm": 2.990495379975504,
"learning_rate": 3.308888292972273e-06,
"loss": 0.5888,
"step": 8350
},
{
"epoch": 0.6496231253399642,
"grad_norm": 3.376642101090337,
"learning_rate": 3.2961314200380616e-06,
"loss": 0.637,
"step": 8360
},
{
"epoch": 0.6504001864946771,
"grad_norm": 3.4092448553804156,
"learning_rate": 3.2833870833771753e-06,
"loss": 0.6105,
"step": 8370
},
{
"epoch": 0.65117724764939,
"grad_norm": 5.292717322884515,
"learning_rate": 3.270655376756521e-06,
"loss": 0.579,
"step": 8380
},
{
"epoch": 0.6519543088041029,
"grad_norm": 3.7225346348995982,
"learning_rate": 3.25793639385008e-06,
"loss": 0.6072,
"step": 8390
},
{
"epoch": 0.6527313699588158,
"grad_norm": 3.656912994279593,
"learning_rate": 3.2452302282382185e-06,
"loss": 0.5656,
"step": 8400
},
{
"epoch": 0.6535084311135286,
"grad_norm": 5.191851471827204,
"learning_rate": 3.232536973407e-06,
"loss": 0.6353,
"step": 8410
},
{
"epoch": 0.6542854922682415,
"grad_norm": 4.5342622406097135,
"learning_rate": 3.2198567227474954e-06,
"loss": 0.6239,
"step": 8420
},
{
"epoch": 0.6550625534229544,
"grad_norm": 3.2997906214128507,
"learning_rate": 3.207189569555096e-06,
"loss": 0.6493,
"step": 8430
},
{
"epoch": 0.6558396145776673,
"grad_norm": 3.7417655823104092,
"learning_rate": 3.194535607028832e-06,
"loss": 0.5765,
"step": 8440
},
{
"epoch": 0.6566166757323801,
"grad_norm": 4.1174225350073685,
"learning_rate": 3.1818949282706764e-06,
"loss": 0.584,
"step": 8450
},
{
"epoch": 0.657393736887093,
"grad_norm": 5.288074659352862,
"learning_rate": 3.1692676262848732e-06,
"loss": 0.5846,
"step": 8460
},
{
"epoch": 0.6581707980418059,
"grad_norm": 6.8794935144127285,
"learning_rate": 3.1566537939772433e-06,
"loss": 0.6164,
"step": 8470
},
{
"epoch": 0.6589478591965188,
"grad_norm": 3.369610724208555,
"learning_rate": 3.1440535241545035e-06,
"loss": 0.5667,
"step": 8480
},
{
"epoch": 0.6597249203512316,
"grad_norm": 2.700055960128087,
"learning_rate": 3.131466909523582e-06,
"loss": 0.5729,
"step": 8490
},
{
"epoch": 0.6605019815059445,
"grad_norm": 4.481552377327523,
"learning_rate": 3.118894042690945e-06,
"loss": 0.5639,
"step": 8500
},
{
"epoch": 0.6612790426606574,
"grad_norm": 5.130216388568981,
"learning_rate": 3.1063350161619025e-06,
"loss": 0.5904,
"step": 8510
},
{
"epoch": 0.6620561038153703,
"grad_norm": 4.00502225199317,
"learning_rate": 3.093789922339936e-06,
"loss": 0.5998,
"step": 8520
},
{
"epoch": 0.6628331649700832,
"grad_norm": 3.774461462354705,
"learning_rate": 3.081258853526018e-06,
"loss": 0.5886,
"step": 8530
},
{
"epoch": 0.663610226124796,
"grad_norm": 2.821168583180078,
"learning_rate": 3.0687419019179285e-06,
"loss": 0.6011,
"step": 8540
},
{
"epoch": 0.6643872872795089,
"grad_norm": 4.63573425963788,
"learning_rate": 3.0562391596095833e-06,
"loss": 0.61,
"step": 8550
},
{
"epoch": 0.6651643484342218,
"grad_norm": 4.151701829585363,
"learning_rate": 3.0437507185903516e-06,
"loss": 0.6334,
"step": 8560
},
{
"epoch": 0.6659414095889347,
"grad_norm": 3.1823244853803097,
"learning_rate": 3.0312766707443784e-06,
"loss": 0.6492,
"step": 8570
},
{
"epoch": 0.6667184707436475,
"grad_norm": 3.494168616800063,
"learning_rate": 3.0188171078499117e-06,
"loss": 0.6293,
"step": 8580
},
{
"epoch": 0.6674955318983604,
"grad_norm": 3.007455561802234,
"learning_rate": 3.0063721215786274e-06,
"loss": 0.6125,
"step": 8590
},
{
"epoch": 0.6682725930530733,
"grad_norm": 4.328591303423522,
"learning_rate": 2.99394180349495e-06,
"loss": 0.6152,
"step": 8600
},
{
"epoch": 0.6690496542077862,
"grad_norm": 3.0920402812840413,
"learning_rate": 2.981526245055387e-06,
"loss": 0.5768,
"step": 8610
},
{
"epoch": 0.669826715362499,
"grad_norm": 2.9353592413440155,
"learning_rate": 2.9691255376078464e-06,
"loss": 0.542,
"step": 8620
},
{
"epoch": 0.6706037765172119,
"grad_norm": 3.882400088723547,
"learning_rate": 2.9567397723909725e-06,
"loss": 0.519,
"step": 8630
},
{
"epoch": 0.6713808376719248,
"grad_norm": 4.783097703300002,
"learning_rate": 2.944369040533471e-06,
"loss": 0.6396,
"step": 8640
},
{
"epoch": 0.6721578988266377,
"grad_norm": 4.770262430972376,
"learning_rate": 2.9320134330534367e-06,
"loss": 0.6385,
"step": 8650
},
{
"epoch": 0.6729349599813506,
"grad_norm": 3.1574059447890486,
"learning_rate": 2.919673040857693e-06,
"loss": 0.5935,
"step": 8660
},
{
"epoch": 0.6737120211360634,
"grad_norm": 3.945392779400959,
"learning_rate": 2.9073479547411087e-06,
"loss": 0.6041,
"step": 8670
},
{
"epoch": 0.6744890822907763,
"grad_norm": 3.834570241650989,
"learning_rate": 2.89503826538594e-06,
"loss": 0.5603,
"step": 8680
},
{
"epoch": 0.6752661434454892,
"grad_norm": 3.322325574324924,
"learning_rate": 2.882744063361165e-06,
"loss": 0.5839,
"step": 8690
},
{
"epoch": 0.6760432046002021,
"grad_norm": 5.400737978025128,
"learning_rate": 2.870465439121807e-06,
"loss": 0.6,
"step": 8700
},
{
"epoch": 0.6768202657549149,
"grad_norm": 3.7907802256324614,
"learning_rate": 2.8582024830082796e-06,
"loss": 0.6255,
"step": 8710
},
{
"epoch": 0.6775973269096278,
"grad_norm": 3.912677923882123,
"learning_rate": 2.845955285245715e-06,
"loss": 0.5545,
"step": 8720
},
{
"epoch": 0.6783743880643407,
"grad_norm": 4.941243247209147,
"learning_rate": 2.833723935943301e-06,
"loss": 0.5684,
"step": 8730
},
{
"epoch": 0.6791514492190536,
"grad_norm": 3.289971837418658,
"learning_rate": 2.821508525093627e-06,
"loss": 0.6519,
"step": 8740
},
{
"epoch": 0.6799285103737664,
"grad_norm": 3.939920814084507,
"learning_rate": 2.8093091425720097e-06,
"loss": 0.6229,
"step": 8750
},
{
"epoch": 0.6807055715284793,
"grad_norm": 4.336532929599707,
"learning_rate": 2.797125878135837e-06,
"loss": 0.5641,
"step": 8760
},
{
"epoch": 0.6814826326831922,
"grad_norm": 3.322566385669406,
"learning_rate": 2.784958821423907e-06,
"loss": 0.6232,
"step": 8770
},
{
"epoch": 0.6822596938379051,
"grad_norm": 4.200430984375038,
"learning_rate": 2.7728080619557702e-06,
"loss": 0.5977,
"step": 8780
},
{
"epoch": 0.683036754992618,
"grad_norm": 3.740176445426232,
"learning_rate": 2.760673689131068e-06,
"loss": 0.6185,
"step": 8790
},
{
"epoch": 0.6838138161473308,
"grad_norm": 2.1066076609366613,
"learning_rate": 2.7485557922288776e-06,
"loss": 0.6274,
"step": 8800
},
{
"epoch": 0.6845908773020437,
"grad_norm": 2.8053182283923213,
"learning_rate": 2.736454460407055e-06,
"loss": 0.6181,
"step": 8810
},
{
"epoch": 0.6853679384567566,
"grad_norm": 3.437087088984394,
"learning_rate": 2.724369782701578e-06,
"loss": 0.621,
"step": 8820
},
{
"epoch": 0.6861449996114695,
"grad_norm": 3.0623391960294595,
"learning_rate": 2.7123018480258876e-06,
"loss": 0.5441,
"step": 8830
},
{
"epoch": 0.6869220607661823,
"grad_norm": 4.447855889156802,
"learning_rate": 2.7002507451702394e-06,
"loss": 0.5498,
"step": 8840
},
{
"epoch": 0.6876991219208952,
"grad_norm": 3.328238936470799,
"learning_rate": 2.688216562801052e-06,
"loss": 0.5992,
"step": 8850
},
{
"epoch": 0.6884761830756081,
"grad_norm": 4.421506555636393,
"learning_rate": 2.6761993894602444e-06,
"loss": 0.5945,
"step": 8860
},
{
"epoch": 0.689253244230321,
"grad_norm": 5.322591815355897,
"learning_rate": 2.664199313564598e-06,
"loss": 0.5958,
"step": 8870
},
{
"epoch": 0.6900303053850338,
"grad_norm": 3.7611828384663393,
"learning_rate": 2.652216423405093e-06,
"loss": 0.5645,
"step": 8880
},
{
"epoch": 0.6908073665397467,
"grad_norm": 3.3085304945194176,
"learning_rate": 2.6402508071462685e-06,
"loss": 0.5821,
"step": 8890
},
{
"epoch": 0.6915844276944596,
"grad_norm": 4.5103793305482105,
"learning_rate": 2.6283025528255685e-06,
"loss": 0.6111,
"step": 8900
},
{
"epoch": 0.6923614888491725,
"grad_norm": 3.2568624242920623,
"learning_rate": 2.6163717483526953e-06,
"loss": 0.5546,
"step": 8910
},
{
"epoch": 0.6931385500038854,
"grad_norm": 2.973519357151336,
"learning_rate": 2.6044584815089667e-06,
"loss": 0.5685,
"step": 8920
},
{
"epoch": 0.6939156111585982,
"grad_norm": 3.5837020468987166,
"learning_rate": 2.592562839946664e-06,
"loss": 0.5456,
"step": 8930
},
{
"epoch": 0.6946926723133111,
"grad_norm": 4.064184411405787,
"learning_rate": 2.5806849111883913e-06,
"loss": 0.559,
"step": 8940
},
{
"epoch": 0.695469733468024,
"grad_norm": 3.3437426814478406,
"learning_rate": 2.56882478262643e-06,
"loss": 0.5538,
"step": 8950
},
{
"epoch": 0.6962467946227369,
"grad_norm": 3.107677218552789,
"learning_rate": 2.556982541522094e-06,
"loss": 0.5383,
"step": 8960
},
{
"epoch": 0.6970238557774496,
"grad_norm": 2.882272796253547,
"learning_rate": 2.5451582750050896e-06,
"loss": 0.5698,
"step": 8970
},
{
"epoch": 0.6978009169321625,
"grad_norm": 3.2190081599711164,
"learning_rate": 2.5333520700728793e-06,
"loss": 0.5581,
"step": 8980
},
{
"epoch": 0.6985779780868754,
"grad_norm": 4.12751667992376,
"learning_rate": 2.521564013590031e-06,
"loss": 0.5334,
"step": 8990
},
{
"epoch": 0.6993550392415883,
"grad_norm": 4.145588694570731,
"learning_rate": 2.509794192287588e-06,
"loss": 0.561,
"step": 9000
},
{
"epoch": 0.7001321003963011,
"grad_norm": 3.155212860949128,
"learning_rate": 2.498042692762426e-06,
"loss": 0.5418,
"step": 9010
},
{
"epoch": 0.700909161551014,
"grad_norm": 3.2632869764204897,
"learning_rate": 2.4863096014766193e-06,
"loss": 0.5411,
"step": 9020
},
{
"epoch": 0.7016862227057269,
"grad_norm": 4.001715026222935,
"learning_rate": 2.474595004756799e-06,
"loss": 0.5589,
"step": 9030
},
{
"epoch": 0.7024632838604398,
"grad_norm": 3.3415316677677325,
"learning_rate": 2.4628989887935266e-06,
"loss": 0.537,
"step": 9040
},
{
"epoch": 0.7032403450151526,
"grad_norm": 5.797689446433965,
"learning_rate": 2.4512216396406552e-06,
"loss": 0.6243,
"step": 9050
},
{
"epoch": 0.7040174061698655,
"grad_norm": 4.284101589916973,
"learning_rate": 2.4395630432146926e-06,
"loss": 0.5817,
"step": 9060
},
{
"epoch": 0.7047944673245784,
"grad_norm": 3.211724547014886,
"learning_rate": 2.427923285294174e-06,
"loss": 0.5788,
"step": 9070
},
{
"epoch": 0.7055715284792913,
"grad_norm": 3.2055910232947085,
"learning_rate": 2.4163024515190293e-06,
"loss": 0.5311,
"step": 9080
},
{
"epoch": 0.7063485896340042,
"grad_norm": 4.255051995836248,
"learning_rate": 2.4047006273899527e-06,
"loss": 0.5713,
"step": 9090
},
{
"epoch": 0.707125650788717,
"grad_norm": 4.597394692328588,
"learning_rate": 2.393117898267779e-06,
"loss": 0.6031,
"step": 9100
},
{
"epoch": 0.7079027119434299,
"grad_norm": 3.2150862347569933,
"learning_rate": 2.3815543493728454e-06,
"loss": 0.5594,
"step": 9110
},
{
"epoch": 0.7086797730981428,
"grad_norm": 4.683878110698539,
"learning_rate": 2.370010065784372e-06,
"loss": 0.5461,
"step": 9120
},
{
"epoch": 0.7094568342528557,
"grad_norm": 4.033438486304492,
"learning_rate": 2.358485132439831e-06,
"loss": 0.5815,
"step": 9130
},
{
"epoch": 0.7102338954075685,
"grad_norm": 3.3703523652063168,
"learning_rate": 2.3469796341343315e-06,
"loss": 0.5247,
"step": 9140
},
{
"epoch": 0.7110109565622814,
"grad_norm": 4.325956291425198,
"learning_rate": 2.33549365551998e-06,
"loss": 0.5387,
"step": 9150
},
{
"epoch": 0.7117880177169943,
"grad_norm": 2.490947555344077,
"learning_rate": 2.3240272811052738e-06,
"loss": 0.5776,
"step": 9160
},
{
"epoch": 0.7125650788717072,
"grad_norm": 4.949535189967038,
"learning_rate": 2.3125805952544666e-06,
"loss": 0.5842,
"step": 9170
},
{
"epoch": 0.71334214002642,
"grad_norm": 3.670543908233672,
"learning_rate": 2.301153682186954e-06,
"loss": 0.53,
"step": 9180
},
{
"epoch": 0.7141192011811329,
"grad_norm": 4.866130796619525,
"learning_rate": 2.289746625976653e-06,
"loss": 0.5681,
"step": 9190
},
{
"epoch": 0.7148962623358458,
"grad_norm": 3.4112599844471467,
"learning_rate": 2.2783595105513832e-06,
"loss": 0.5575,
"step": 9200
},
{
"epoch": 0.7156733234905587,
"grad_norm": 3.844471466545408,
"learning_rate": 2.266992419692247e-06,
"loss": 0.5716,
"step": 9210
},
{
"epoch": 0.7164503846452716,
"grad_norm": 3.3046961399811474,
"learning_rate": 2.2556454370330195e-06,
"loss": 0.5431,
"step": 9220
},
{
"epoch": 0.7172274457999844,
"grad_norm": 2.960816022759597,
"learning_rate": 2.2443186460595277e-06,
"loss": 0.5502,
"step": 9230
},
{
"epoch": 0.7180045069546973,
"grad_norm": 3.7931643481456794,
"learning_rate": 2.2330121301090362e-06,
"loss": 0.5844,
"step": 9240
},
{
"epoch": 0.7187815681094102,
"grad_norm": 3.4283490865176853,
"learning_rate": 2.221725972369635e-06,
"loss": 0.5568,
"step": 9250
},
{
"epoch": 0.7195586292641231,
"grad_norm": 4.3583902590026895,
"learning_rate": 2.210460255879629e-06,
"loss": 0.5173,
"step": 9260
},
{
"epoch": 0.7203356904188359,
"grad_norm": 3.653581931257441,
"learning_rate": 2.1992150635269233e-06,
"loss": 0.5229,
"step": 9270
},
{
"epoch": 0.7211127515735488,
"grad_norm": 4.770502864647989,
"learning_rate": 2.187990478048423e-06,
"loss": 0.5761,
"step": 9280
},
{
"epoch": 0.7218898127282617,
"grad_norm": 3.878473847618142,
"learning_rate": 2.1767865820294093e-06,
"loss": 0.4937,
"step": 9290
},
{
"epoch": 0.7226668738829746,
"grad_norm": 3.9771101901252157,
"learning_rate": 2.165603457902945e-06,
"loss": 0.5237,
"step": 9300
},
{
"epoch": 0.7234439350376874,
"grad_norm": 3.533717896030411,
"learning_rate": 2.1544411879492597e-06,
"loss": 0.5743,
"step": 9310
},
{
"epoch": 0.7242209961924003,
"grad_norm": 3.65695725762207,
"learning_rate": 2.143299854295149e-06,
"loss": 0.5824,
"step": 9320
},
{
"epoch": 0.7249980573471132,
"grad_norm": 2.492214523438049,
"learning_rate": 2.13217953891337e-06,
"loss": 0.5274,
"step": 9330
},
{
"epoch": 0.7257751185018261,
"grad_norm": 3.386138297909339,
"learning_rate": 2.121080323622038e-06,
"loss": 0.5612,
"step": 9340
},
{
"epoch": 0.726552179656539,
"grad_norm": 3.9436014142777096,
"learning_rate": 2.1100022900840208e-06,
"loss": 0.5317,
"step": 9350
},
{
"epoch": 0.7273292408112518,
"grad_norm": 4.412376927983859,
"learning_rate": 2.0989455198063415e-06,
"loss": 0.574,
"step": 9360
},
{
"epoch": 0.7281063019659647,
"grad_norm": 2.3279248382650737,
"learning_rate": 2.0879100941395787e-06,
"loss": 0.5289,
"step": 9370
},
{
"epoch": 0.7288833631206776,
"grad_norm": 4.637433311164565,
"learning_rate": 2.076896094277265e-06,
"loss": 0.5622,
"step": 9380
},
{
"epoch": 0.7296604242753905,
"grad_norm": 4.904954853760184,
"learning_rate": 2.065903601255297e-06,
"loss": 0.5176,
"step": 9390
},
{
"epoch": 0.7304374854301033,
"grad_norm": 3.729037710128586,
"learning_rate": 2.0549326959513287e-06,
"loss": 0.5315,
"step": 9400
},
{
"epoch": 0.7312145465848162,
"grad_norm": 3.5966860873794966,
"learning_rate": 2.0439834590841833e-06,
"loss": 0.5177,
"step": 9410
},
{
"epoch": 0.7319916077395291,
"grad_norm": 4.464459321144577,
"learning_rate": 2.0330559712132614e-06,
"loss": 0.5484,
"step": 9420
},
{
"epoch": 0.732768668894242,
"grad_norm": 4.589314499941277,
"learning_rate": 2.022150312737939e-06,
"loss": 0.5467,
"step": 9430
},
{
"epoch": 0.7335457300489548,
"grad_norm": 4.017841935745773,
"learning_rate": 2.0112665638969842e-06,
"loss": 0.5266,
"step": 9440
},
{
"epoch": 0.7343227912036677,
"grad_norm": 3.0931816369991703,
"learning_rate": 2.0004048047679624e-06,
"loss": 0.5767,
"step": 9450
},
{
"epoch": 0.7350998523583806,
"grad_norm": 4.495169108132031,
"learning_rate": 1.9895651152666538e-06,
"loss": 0.5613,
"step": 9460
},
{
"epoch": 0.7358769135130935,
"grad_norm": 4.1470825704755,
"learning_rate": 1.978747575146455e-06,
"loss": 0.5111,
"step": 9470
},
{
"epoch": 0.7366539746678064,
"grad_norm": 4.197560473624663,
"learning_rate": 1.967952263997801e-06,
"loss": 0.5538,
"step": 9480
},
{
"epoch": 0.7374310358225192,
"grad_norm": 3.7319528048077246,
"learning_rate": 1.9571792612475747e-06,
"loss": 0.5741,
"step": 9490
},
{
"epoch": 0.7382080969772321,
"grad_norm": 5.01956999231008,
"learning_rate": 1.9464286461585223e-06,
"loss": 0.5357,
"step": 9500
},
{
"epoch": 0.738985158131945,
"grad_norm": 3.7344522235830264,
"learning_rate": 1.9357004978286777e-06,
"loss": 0.5369,
"step": 9510
},
{
"epoch": 0.7397622192866579,
"grad_norm": 5.534900941588667,
"learning_rate": 1.924994895190772e-06,
"loss": 0.547,
"step": 9520
},
{
"epoch": 0.7405392804413707,
"grad_norm": 3.544511900994509,
"learning_rate": 1.9143119170116534e-06,
"loss": 0.5365,
"step": 9530
},
{
"epoch": 0.7413163415960836,
"grad_norm": 3.617025368147638,
"learning_rate": 1.9036516418917128e-06,
"loss": 0.576,
"step": 9540
},
{
"epoch": 0.7420934027507965,
"grad_norm": 2.717825183803928,
"learning_rate": 1.8930141482643005e-06,
"loss": 0.5528,
"step": 9550
},
{
"epoch": 0.7428704639055094,
"grad_norm": 3.8576185713414732,
"learning_rate": 1.88239951439515e-06,
"loss": 0.5505,
"step": 9560
},
{
"epoch": 0.7436475250602222,
"grad_norm": 5.360570148700179,
"learning_rate": 1.8718078183818094e-06,
"loss": 0.547,
"step": 9570
},
{
"epoch": 0.7444245862149351,
"grad_norm": 3.9702986251974126,
"learning_rate": 1.8612391381530548e-06,
"loss": 0.5361,
"step": 9580
},
{
"epoch": 0.745201647369648,
"grad_norm": 4.210077667591901,
"learning_rate": 1.8506935514683244e-06,
"loss": 0.5558,
"step": 9590
},
{
"epoch": 0.7459787085243609,
"grad_norm": 4.27553292233449,
"learning_rate": 1.8401711359171438e-06,
"loss": 0.5406,
"step": 9600
},
{
"epoch": 0.7467557696790738,
"grad_norm": 5.023769063952561,
"learning_rate": 1.82967196891856e-06,
"loss": 0.5345,
"step": 9610
},
{
"epoch": 0.7475328308337866,
"grad_norm": 3.7148918067051353,
"learning_rate": 1.819196127720565e-06,
"loss": 0.5417,
"step": 9620
},
{
"epoch": 0.7483098919884995,
"grad_norm": 4.636272948323283,
"learning_rate": 1.808743689399528e-06,
"loss": 0.5792,
"step": 9630
},
{
"epoch": 0.7490869531432124,
"grad_norm": 3.103713105912325,
"learning_rate": 1.798314730859637e-06,
"loss": 0.5527,
"step": 9640
},
{
"epoch": 0.7498640142979253,
"grad_norm": 3.204765078923141,
"learning_rate": 1.787909328832323e-06,
"loss": 0.5491,
"step": 9650
},
{
"epoch": 0.7506410754526381,
"grad_norm": 4.894522393499138,
"learning_rate": 1.7775275598756974e-06,
"loss": 0.5553,
"step": 9660
},
{
"epoch": 0.751418136607351,
"grad_norm": 3.428628239034369,
"learning_rate": 1.7671695003739935e-06,
"loss": 0.5143,
"step": 9670
},
{
"epoch": 0.7521951977620639,
"grad_norm": 4.535044446134579,
"learning_rate": 1.7568352265369987e-06,
"loss": 0.5291,
"step": 9680
},
{
"epoch": 0.7529722589167768,
"grad_norm": 4.546057980769502,
"learning_rate": 1.7465248143995011e-06,
"loss": 0.5271,
"step": 9690
},
{
"epoch": 0.7537493200714896,
"grad_norm": 3.6725535134363785,
"learning_rate": 1.7362383398207189e-06,
"loss": 0.5665,
"step": 9700
},
{
"epoch": 0.7545263812262025,
"grad_norm": 3.3515951674477793,
"learning_rate": 1.725975878483757e-06,
"loss": 0.5282,
"step": 9710
},
{
"epoch": 0.7553034423809154,
"grad_norm": 4.187132180488078,
"learning_rate": 1.7157375058950349e-06,
"loss": 0.5572,
"step": 9720
},
{
"epoch": 0.7560805035356283,
"grad_norm": 3.013413844455128,
"learning_rate": 1.705523297383741e-06,
"loss": 0.5502,
"step": 9730
},
{
"epoch": 0.7568575646903412,
"grad_norm": 4.034990404281864,
"learning_rate": 1.6953333281012745e-06,
"loss": 0.5557,
"step": 9740
},
{
"epoch": 0.757634625845054,
"grad_norm": 4.2869070311052475,
"learning_rate": 1.6851676730206978e-06,
"loss": 0.5067,
"step": 9750
},
{
"epoch": 0.7584116869997669,
"grad_norm": 2.55851587794808,
"learning_rate": 1.6750264069361755e-06,
"loss": 0.521,
"step": 9760
},
{
"epoch": 0.7591887481544798,
"grad_norm": 3.860783467248806,
"learning_rate": 1.664909604462432e-06,
"loss": 0.5162,
"step": 9770
},
{
"epoch": 0.7599658093091927,
"grad_norm": 2.964535685167722,
"learning_rate": 1.6548173400341988e-06,
"loss": 0.4662,
"step": 9780
},
{
"epoch": 0.7607428704639055,
"grad_norm": 4.5148211810505,
"learning_rate": 1.6447496879056667e-06,
"loss": 0.5326,
"step": 9790
},
{
"epoch": 0.7615199316186184,
"grad_norm": 2.9731810276505595,
"learning_rate": 1.6347067221499441e-06,
"loss": 0.5221,
"step": 9800
},
{
"epoch": 0.7622969927733313,
"grad_norm": 4.225015592243322,
"learning_rate": 1.6246885166585081e-06,
"loss": 0.5404,
"step": 9810
},
{
"epoch": 0.7630740539280442,
"grad_norm": 4.195775975703309,
"learning_rate": 1.6146951451406583e-06,
"loss": 0.4837,
"step": 9820
},
{
"epoch": 0.763851115082757,
"grad_norm": 2.77408092127348,
"learning_rate": 1.604726681122979e-06,
"loss": 0.4849,
"step": 9830
},
{
"epoch": 0.7646281762374699,
"grad_norm": 4.215861830136612,
"learning_rate": 1.5947831979487966e-06,
"loss": 0.5925,
"step": 9840
},
{
"epoch": 0.7654052373921828,
"grad_norm": 3.1030479659610393,
"learning_rate": 1.5848647687776397e-06,
"loss": 0.5019,
"step": 9850
},
{
"epoch": 0.7661822985468957,
"grad_norm": 3.926045471634979,
"learning_rate": 1.574971466584701e-06,
"loss": 0.5124,
"step": 9860
},
{
"epoch": 0.7669593597016086,
"grad_norm": 4.015070211236076,
"learning_rate": 1.5651033641603041e-06,
"loss": 0.5314,
"step": 9870
},
{
"epoch": 0.7677364208563214,
"grad_norm": 3.649601860518483,
"learning_rate": 1.555260534109359e-06,
"loss": 0.5089,
"step": 9880
},
{
"epoch": 0.7685134820110343,
"grad_norm": 3.604893647217938,
"learning_rate": 1.5454430488508359e-06,
"loss": 0.5472,
"step": 9890
},
{
"epoch": 0.7692905431657472,
"grad_norm": 3.3095117069291624,
"learning_rate": 1.5356509806172315e-06,
"loss": 0.5168,
"step": 9900
},
{
"epoch": 0.7700676043204601,
"grad_norm": 3.8970071625899445,
"learning_rate": 1.525884401454033e-06,
"loss": 0.5485,
"step": 9910
},
{
"epoch": 0.7708446654751729,
"grad_norm": 2.80658001169654,
"learning_rate": 1.5161433832191902e-06,
"loss": 0.5044,
"step": 9920
},
{
"epoch": 0.7716217266298858,
"grad_norm": 3.1868297865512214,
"learning_rate": 1.5064279975825923e-06,
"loss": 0.4934,
"step": 9930
},
{
"epoch": 0.7723987877845987,
"grad_norm": 3.0425811492999366,
"learning_rate": 1.4967383160255316e-06,
"loss": 0.5183,
"step": 9940
},
{
"epoch": 0.7731758489393116,
"grad_norm": 4.54933754793044,
"learning_rate": 1.4870744098401819e-06,
"loss": 0.5306,
"step": 9950
},
{
"epoch": 0.7739529100940244,
"grad_norm": 3.931701576666515,
"learning_rate": 1.4774363501290755e-06,
"loss": 0.5415,
"step": 9960
},
{
"epoch": 0.7747299712487373,
"grad_norm": 3.282020379585411,
"learning_rate": 1.4678242078045756e-06,
"loss": 0.5421,
"step": 9970
},
{
"epoch": 0.7755070324034502,
"grad_norm": 3.2735246508623366,
"learning_rate": 1.4582380535883622e-06,
"loss": 0.5452,
"step": 9980
},
{
"epoch": 0.7762840935581631,
"grad_norm": 3.2961538894269067,
"learning_rate": 1.4486779580109012e-06,
"loss": 0.5254,
"step": 9990
},
{
"epoch": 0.777061154712876,
"grad_norm": 4.499334024075413,
"learning_rate": 1.4391439914109367e-06,
"loss": 0.4899,
"step": 10000
},
{
"epoch": 0.777061154712876,
"eval_loss": 0.5171714425086975,
"eval_runtime": 472.4039,
"eval_samples_per_second": 22.94,
"eval_steps_per_second": 2.868,
"step": 10000
},
{
"epoch": 0.7778382158675888,
"grad_norm": 4.012283871593952,
"learning_rate": 1.429636223934963e-06,
"loss": 0.4927,
"step": 10010
},
{
"epoch": 0.7786152770223017,
"grad_norm": 3.483797094263642,
"learning_rate": 1.4201547255367165e-06,
"loss": 0.5085,
"step": 10020
},
{
"epoch": 0.7793923381770145,
"grad_norm": 4.75329332254169,
"learning_rate": 1.4106995659766547e-06,
"loss": 0.5058,
"step": 10030
},
{
"epoch": 0.7801693993317274,
"grad_norm": 3.68815778033119,
"learning_rate": 1.4012708148214522e-06,
"loss": 0.5265,
"step": 10040
},
{
"epoch": 0.7809464604864402,
"grad_norm": 3.4635761925286306,
"learning_rate": 1.3918685414434763e-06,
"loss": 0.4623,
"step": 10050
},
{
"epoch": 0.7817235216411531,
"grad_norm": 4.024245798823526,
"learning_rate": 1.3824928150202866e-06,
"loss": 0.4865,
"step": 10060
},
{
"epoch": 0.782500582795866,
"grad_norm": 3.876558527294442,
"learning_rate": 1.3731437045341218e-06,
"loss": 0.5297,
"step": 10070
},
{
"epoch": 0.7832776439505789,
"grad_norm": 4.13041441043086,
"learning_rate": 1.363821278771391e-06,
"loss": 0.5588,
"step": 10080
},
{
"epoch": 0.7840547051052917,
"grad_norm": 4.828512693632229,
"learning_rate": 1.3545256063221745e-06,
"loss": 0.5241,
"step": 10090
},
{
"epoch": 0.7848317662600046,
"grad_norm": 3.330489049598463,
"learning_rate": 1.3452567555797085e-06,
"loss": 0.5351,
"step": 10100
},
{
"epoch": 0.7856088274147175,
"grad_norm": 3.577340154782965,
"learning_rate": 1.3360147947398927e-06,
"loss": 0.4874,
"step": 10110
},
{
"epoch": 0.7863858885694304,
"grad_norm": 4.201117799816586,
"learning_rate": 1.3267997918007792e-06,
"loss": 0.5148,
"step": 10120
},
{
"epoch": 0.7871629497241432,
"grad_norm": 2.5965256135200643,
"learning_rate": 1.3176118145620775e-06,
"loss": 0.4988,
"step": 10130
},
{
"epoch": 0.7879400108788561,
"grad_norm": 2.397365078889302,
"learning_rate": 1.3084509306246562e-06,
"loss": 0.4687,
"step": 10140
},
{
"epoch": 0.788717072033569,
"grad_norm": 5.5016070521496,
"learning_rate": 1.29931720739004e-06,
"loss": 0.518,
"step": 10150
},
{
"epoch": 0.7894941331882819,
"grad_norm": 4.9408112199928444,
"learning_rate": 1.2902107120599249e-06,
"loss": 0.5312,
"step": 10160
},
{
"epoch": 0.7902711943429948,
"grad_norm": 3.557763106103323,
"learning_rate": 1.2811315116356698e-06,
"loss": 0.5196,
"step": 10170
},
{
"epoch": 0.7910482554977076,
"grad_norm": 4.192138798834655,
"learning_rate": 1.2720796729178115e-06,
"loss": 0.527,
"step": 10180
},
{
"epoch": 0.7918253166524205,
"grad_norm": 3.586108157059095,
"learning_rate": 1.2630552625055763e-06,
"loss": 0.5347,
"step": 10190
},
{
"epoch": 0.7926023778071334,
"grad_norm": 3.9368756234903195,
"learning_rate": 1.2540583467963817e-06,
"loss": 0.4811,
"step": 10200
},
{
"epoch": 0.7933794389618463,
"grad_norm": 4.518574036325759,
"learning_rate": 1.245088991985352e-06,
"loss": 0.5086,
"step": 10210
},
{
"epoch": 0.7941565001165591,
"grad_norm": 3.850061816242949,
"learning_rate": 1.2361472640648347e-06,
"loss": 0.4862,
"step": 10220
},
{
"epoch": 0.794933561271272,
"grad_norm": 3.5644700141713064,
"learning_rate": 1.227233228823908e-06,
"loss": 0.5303,
"step": 10230
},
{
"epoch": 0.7957106224259849,
"grad_norm": 2.1351987055036985,
"learning_rate": 1.2183469518479018e-06,
"loss": 0.5179,
"step": 10240
},
{
"epoch": 0.7964876835806978,
"grad_norm": 5.25048528063306,
"learning_rate": 1.2094884985179117e-06,
"loss": 0.5318,
"step": 10250
},
{
"epoch": 0.7972647447354106,
"grad_norm": 2.1615227439546745,
"learning_rate": 1.200657934010323e-06,
"loss": 0.4547,
"step": 10260
},
{
"epoch": 0.7980418058901235,
"grad_norm": 2.6751655695167154,
"learning_rate": 1.1918553232963237e-06,
"loss": 0.5134,
"step": 10270
},
{
"epoch": 0.7988188670448364,
"grad_norm": 4.159654861888376,
"learning_rate": 1.1830807311414355e-06,
"loss": 0.524,
"step": 10280
},
{
"epoch": 0.7995959281995493,
"grad_norm": 3.6944240100922214,
"learning_rate": 1.1743342221050314e-06,
"loss": 0.5175,
"step": 10290
},
{
"epoch": 0.8003729893542622,
"grad_norm": 4.133885672495875,
"learning_rate": 1.1656158605398599e-06,
"loss": 0.4854,
"step": 10300
},
{
"epoch": 0.801150050508975,
"grad_norm": 4.0354219471053305,
"learning_rate": 1.1569257105915743e-06,
"loss": 0.5293,
"step": 10310
},
{
"epoch": 0.8019271116636879,
"grad_norm": 4.987229671719538,
"learning_rate": 1.1482638361982595e-06,
"loss": 0.5067,
"step": 10320
},
{
"epoch": 0.8027041728184008,
"grad_norm": 4.060534061900532,
"learning_rate": 1.1396303010899623e-06,
"loss": 0.5031,
"step": 10330
},
{
"epoch": 0.8034812339731137,
"grad_norm": 3.8027639891295615,
"learning_rate": 1.131025168788225e-06,
"loss": 0.5339,
"step": 10340
},
{
"epoch": 0.8042582951278265,
"grad_norm": 4.5696870186179215,
"learning_rate": 1.122448502605611e-06,
"loss": 0.5187,
"step": 10350
},
{
"epoch": 0.8050353562825394,
"grad_norm": 3.4544068898990257,
"learning_rate": 1.1139003656452451e-06,
"loss": 0.5012,
"step": 10360
},
{
"epoch": 0.8058124174372523,
"grad_norm": 4.024795478219517,
"learning_rate": 1.1053808208003463e-06,
"loss": 0.5039,
"step": 10370
},
{
"epoch": 0.8065894785919652,
"grad_norm": 3.451121303154774,
"learning_rate": 1.0968899307537688e-06,
"loss": 0.5096,
"step": 10380
},
{
"epoch": 0.807366539746678,
"grad_norm": 3.5430435341751374,
"learning_rate": 1.088427757977535e-06,
"loss": 0.4995,
"step": 10390
},
{
"epoch": 0.8081436009013909,
"grad_norm": 3.3568799457193315,
"learning_rate": 1.0799943647323823e-06,
"loss": 0.4896,
"step": 10400
},
{
"epoch": 0.8089206620561038,
"grad_norm": 2.7324998256576265,
"learning_rate": 1.071589813067298e-06,
"loss": 0.4757,
"step": 10410
},
{
"epoch": 0.8096977232108167,
"grad_norm": 3.114681260826415,
"learning_rate": 1.0632141648190685e-06,
"loss": 0.5033,
"step": 10420
},
{
"epoch": 0.8104747843655296,
"grad_norm": 3.7347524196800856,
"learning_rate": 1.054867481611822e-06,
"loss": 0.4849,
"step": 10430
},
{
"epoch": 0.8112518455202424,
"grad_norm": 2.4431545580868423,
"learning_rate": 1.046549824856574e-06,
"loss": 0.4344,
"step": 10440
},
{
"epoch": 0.8120289066749553,
"grad_norm": 3.370757705323888,
"learning_rate": 1.038261255750781e-06,
"loss": 0.4419,
"step": 10450
},
{
"epoch": 0.8128059678296682,
"grad_norm": 4.176509993840626,
"learning_rate": 1.0300018352778817e-06,
"loss": 0.4905,
"step": 10460
},
{
"epoch": 0.8135830289843811,
"grad_norm": 4.2860515845724505,
"learning_rate": 1.0217716242068525e-06,
"loss": 0.4989,
"step": 10470
},
{
"epoch": 0.8143600901390939,
"grad_norm": 2.914685646542763,
"learning_rate": 1.0135706830917663e-06,
"loss": 0.4527,
"step": 10480
},
{
"epoch": 0.8151371512938068,
"grad_norm": 4.781204814322438,
"learning_rate": 1.0053990722713347e-06,
"loss": 0.5185,
"step": 10490
},
{
"epoch": 0.8159142124485197,
"grad_norm": 4.336551191079965,
"learning_rate": 9.97256851868474e-07,
"loss": 0.5453,
"step": 10500
},
{
"epoch": 0.8166912736032326,
"grad_norm": 3.980153258528895,
"learning_rate": 9.891440817898569e-07,
"loss": 0.4476,
"step": 10510
},
{
"epoch": 0.8174683347579454,
"grad_norm": 4.803099851628047,
"learning_rate": 9.810608217254785e-07,
"loss": 0.4535,
"step": 10520
},
{
"epoch": 0.8182453959126583,
"grad_norm": 5.434746877487003,
"learning_rate": 9.730071311482104e-07,
"loss": 0.5266,
"step": 10530
},
{
"epoch": 0.8190224570673712,
"grad_norm": 4.132134349770947,
"learning_rate": 9.649830693133649e-07,
"loss": 0.4794,
"step": 10540
},
{
"epoch": 0.8197995182220841,
"grad_norm": 3.8042895258614657,
"learning_rate": 9.569886952582613e-07,
"loss": 0.4857,
"step": 10550
},
{
"epoch": 0.820576579376797,
"grad_norm": 4.505324473871432,
"learning_rate": 9.49024067801787e-07,
"loss": 0.4773,
"step": 10560
},
{
"epoch": 0.8213536405315098,
"grad_norm": 4.085373275991255,
"learning_rate": 9.410892455439724e-07,
"loss": 0.5123,
"step": 10570
},
{
"epoch": 0.8221307016862227,
"grad_norm": 2.8077333631243047,
"learning_rate": 9.331842868655538e-07,
"loss": 0.4766,
"step": 10580
},
{
"epoch": 0.8229077628409356,
"grad_norm": 4.995807097173484,
"learning_rate": 9.253092499275435e-07,
"loss": 0.5059,
"step": 10590
},
{
"epoch": 0.8236848239956485,
"grad_norm": 3.0312698428527085,
"learning_rate": 9.174641926708028e-07,
"loss": 0.5072,
"step": 10600
},
{
"epoch": 0.8244618851503613,
"grad_norm": 3.6228940116700166,
"learning_rate": 9.096491728156187e-07,
"loss": 0.5157,
"step": 10610
},
{
"epoch": 0.8252389463050742,
"grad_norm": 4.4841778480785885,
"learning_rate": 9.018642478612755e-07,
"loss": 0.5325,
"step": 10620
},
{
"epoch": 0.8260160074597871,
"grad_norm": 3.7081609263257596,
"learning_rate": 8.941094750856349e-07,
"loss": 0.5225,
"step": 10630
},
{
"epoch": 0.8267930686145,
"grad_norm": 2.9403067849013493,
"learning_rate": 8.863849115447121e-07,
"loss": 0.4859,
"step": 10640
},
{
"epoch": 0.8275701297692128,
"grad_norm": 3.9121829857836925,
"learning_rate": 8.786906140722551e-07,
"loss": 0.4704,
"step": 10650
},
{
"epoch": 0.8283471909239257,
"grad_norm": 3.7718616897098234,
"learning_rate": 8.710266392793293e-07,
"loss": 0.5054,
"step": 10660
},
{
"epoch": 0.8291242520786386,
"grad_norm": 3.108303958961309,
"learning_rate": 8.633930435539023e-07,
"loss": 0.5006,
"step": 10670
},
{
"epoch": 0.8299013132333515,
"grad_norm": 2.5549313563071725,
"learning_rate": 8.557898830604239e-07,
"loss": 0.4795,
"step": 10680
},
{
"epoch": 0.8306783743880644,
"grad_norm": 3.459144570766454,
"learning_rate": 8.48217213739414e-07,
"loss": 0.5052,
"step": 10690
},
{
"epoch": 0.8314554355427772,
"grad_norm": 3.8583077857999992,
"learning_rate": 8.406750913070582e-07,
"loss": 0.5121,
"step": 10700
},
{
"epoch": 0.8322324966974901,
"grad_norm": 3.963740775603707,
"learning_rate": 8.33163571254787e-07,
"loss": 0.4949,
"step": 10710
},
{
"epoch": 0.833009557852203,
"grad_norm": 4.576071555267779,
"learning_rate": 8.256827088488756e-07,
"loss": 0.488,
"step": 10720
},
{
"epoch": 0.8337866190069159,
"grad_norm": 4.018939367025651,
"learning_rate": 8.182325591300333e-07,
"loss": 0.4584,
"step": 10730
},
{
"epoch": 0.8345636801616287,
"grad_norm": 5.537702555635495,
"learning_rate": 8.10813176912999e-07,
"loss": 0.5078,
"step": 10740
},
{
"epoch": 0.8353407413163416,
"grad_norm": 4.521346564196193,
"learning_rate": 8.03424616786142e-07,
"loss": 0.5017,
"step": 10750
},
{
"epoch": 0.8361178024710545,
"grad_norm": 4.426790844413774,
"learning_rate": 7.960669331110521e-07,
"loss": 0.4832,
"step": 10760
},
{
"epoch": 0.8368948636257674,
"grad_norm": 4.986892159186973,
"learning_rate": 7.887401800221495e-07,
"loss": 0.5278,
"step": 10770
},
{
"epoch": 0.8376719247804802,
"grad_norm": 3.034636301392233,
"learning_rate": 7.814444114262786e-07,
"loss": 0.4996,
"step": 10780
},
{
"epoch": 0.8384489859351931,
"grad_norm": 2.63148766912681,
"learning_rate": 7.741796810023139e-07,
"loss": 0.4839,
"step": 10790
},
{
"epoch": 0.839226047089906,
"grad_norm": 4.33674902614418,
"learning_rate": 7.669460422007657e-07,
"loss": 0.439,
"step": 10800
},
{
"epoch": 0.8400031082446189,
"grad_norm": 4.048856363638596,
"learning_rate": 7.597435482433896e-07,
"loss": 0.4783,
"step": 10810
},
{
"epoch": 0.8407801693993318,
"grad_norm": 3.925372203600619,
"learning_rate": 7.525722521227885e-07,
"loss": 0.5017,
"step": 10820
},
{
"epoch": 0.8415572305540446,
"grad_norm": 2.3654265887367054,
"learning_rate": 7.45432206602027e-07,
"loss": 0.5123,
"step": 10830
},
{
"epoch": 0.8423342917087575,
"grad_norm": 3.754610906804235,
"learning_rate": 7.383234642142422e-07,
"loss": 0.4907,
"step": 10840
},
{
"epoch": 0.8431113528634704,
"grad_norm": 4.1554282145692625,
"learning_rate": 7.312460772622565e-07,
"loss": 0.5107,
"step": 10850
},
{
"epoch": 0.8438884140181833,
"grad_norm": 3.319418655291393,
"learning_rate": 7.242000978181963e-07,
"loss": 0.5048,
"step": 10860
},
{
"epoch": 0.8446654751728961,
"grad_norm": 4.374110046424012,
"learning_rate": 7.171855777231058e-07,
"loss": 0.4617,
"step": 10870
},
{
"epoch": 0.845442536327609,
"grad_norm": 4.441680587693151,
"learning_rate": 7.102025685865622e-07,
"loss": 0.4959,
"step": 10880
},
{
"epoch": 0.8462195974823219,
"grad_norm": 2.8350312541634803,
"learning_rate": 7.032511217863031e-07,
"loss": 0.4677,
"step": 10890
},
{
"epoch": 0.8469966586370348,
"grad_norm": 3.982485022264907,
"learning_rate": 6.963312884678441e-07,
"loss": 0.4954,
"step": 10900
},
{
"epoch": 0.8477737197917476,
"grad_norm": 4.590377956407083,
"learning_rate": 6.894431195441037e-07,
"loss": 0.5297,
"step": 10910
},
{
"epoch": 0.8485507809464605,
"grad_norm": 2.408789067882966,
"learning_rate": 6.825866656950264e-07,
"loss": 0.445,
"step": 10920
},
{
"epoch": 0.8493278421011734,
"grad_norm": 4.694687311202965,
"learning_rate": 6.757619773672169e-07,
"loss": 0.493,
"step": 10930
},
{
"epoch": 0.8501049032558863,
"grad_norm": 4.491758478617379,
"learning_rate": 6.689691047735597e-07,
"loss": 0.5153,
"step": 10940
},
{
"epoch": 0.8508819644105992,
"grad_norm": 3.959513693411194,
"learning_rate": 6.62208097892853e-07,
"loss": 0.4797,
"step": 10950
},
{
"epoch": 0.851659025565312,
"grad_norm": 5.339647237399662,
"learning_rate": 6.554790064694471e-07,
"loss": 0.4897,
"step": 10960
},
{
"epoch": 0.8524360867200249,
"grad_norm": 4.541122198536199,
"learning_rate": 6.487818800128692e-07,
"loss": 0.4698,
"step": 10970
},
{
"epoch": 0.8532131478747378,
"grad_norm": 4.7468681798060395,
"learning_rate": 6.421167677974622e-07,
"loss": 0.5016,
"step": 10980
},
{
"epoch": 0.8539902090294507,
"grad_norm": 4.381332344102587,
"learning_rate": 6.354837188620278e-07,
"loss": 0.51,
"step": 10990
},
{
"epoch": 0.8547672701841635,
"grad_norm": 4.1592821906223705,
"learning_rate": 6.288827820094562e-07,
"loss": 0.4875,
"step": 11000
},
{
"epoch": 0.8555443313388764,
"grad_norm": 5.029800475729443,
"learning_rate": 6.223140058063737e-07,
"loss": 0.4549,
"step": 11010
},
{
"epoch": 0.8563213924935893,
"grad_norm": 3.254886843193101,
"learning_rate": 6.157774385827847e-07,
"loss": 0.4314,
"step": 11020
},
{
"epoch": 0.8570984536483022,
"grad_norm": 3.434364877703452,
"learning_rate": 6.092731284317111e-07,
"loss": 0.4654,
"step": 11030
},
{
"epoch": 0.857875514803015,
"grad_norm": 4.488825872633713,
"learning_rate": 6.028011232088471e-07,
"loss": 0.482,
"step": 11040
},
{
"epoch": 0.8586525759577279,
"grad_norm": 3.0602137297514638,
"learning_rate": 5.963614705321996e-07,
"loss": 0.4618,
"step": 11050
},
{
"epoch": 0.8594296371124408,
"grad_norm": 4.827196277112413,
"learning_rate": 5.899542177817413e-07,
"loss": 0.4525,
"step": 11060
},
{
"epoch": 0.8602066982671537,
"grad_norm": 4.39228489153871,
"learning_rate": 5.835794120990607e-07,
"loss": 0.5458,
"step": 11070
},
{
"epoch": 0.8609837594218664,
"grad_norm": 4.013851924684146,
"learning_rate": 5.772371003870147e-07,
"loss": 0.521,
"step": 11080
},
{
"epoch": 0.8617608205765793,
"grad_norm": 4.599909020480007,
"learning_rate": 5.709273293093865e-07,
"loss": 0.4641,
"step": 11090
},
{
"epoch": 0.8625378817312922,
"grad_norm": 3.522635100581711,
"learning_rate": 5.646501452905406e-07,
"loss": 0.4613,
"step": 11100
},
{
"epoch": 0.8633149428860051,
"grad_norm": 4.170720600102606,
"learning_rate": 5.584055945150807e-07,
"loss": 0.4533,
"step": 11110
},
{
"epoch": 0.864092004040718,
"grad_norm": 5.0485560375944365,
"learning_rate": 5.521937229275087e-07,
"loss": 0.4584,
"step": 11120
},
{
"epoch": 0.8648690651954308,
"grad_norm": 4.0298286961319105,
"learning_rate": 5.460145762318903e-07,
"loss": 0.5072,
"step": 11130
},
{
"epoch": 0.8656461263501437,
"grad_norm": 3.963316318056793,
"learning_rate": 5.398681998915145e-07,
"loss": 0.454,
"step": 11140
},
{
"epoch": 0.8664231875048566,
"grad_norm": 2.2989684529089076,
"learning_rate": 5.337546391285647e-07,
"loss": 0.4753,
"step": 11150
},
{
"epoch": 0.8672002486595695,
"grad_norm": 4.488811638369375,
"learning_rate": 5.276739389237778e-07,
"loss": 0.452,
"step": 11160
},
{
"epoch": 0.8679773098142823,
"grad_norm": 4.7387272438267605,
"learning_rate": 5.216261440161236e-07,
"loss": 0.4891,
"step": 11170
},
{
"epoch": 0.8687543709689952,
"grad_norm": 5.278573940043423,
"learning_rate": 5.156112989024653e-07,
"loss": 0.477,
"step": 11180
},
{
"epoch": 0.8695314321237081,
"grad_norm": 3.9270271390134828,
"learning_rate": 5.096294478372382e-07,
"loss": 0.465,
"step": 11190
},
{
"epoch": 0.870308493278421,
"grad_norm": 3.2023556593268427,
"learning_rate": 5.036806348321238e-07,
"loss": 0.4654,
"step": 11200
},
{
"epoch": 0.8710855544331338,
"grad_norm": 4.204967484017854,
"learning_rate": 4.977649036557225e-07,
"loss": 0.4933,
"step": 11210
},
{
"epoch": 0.8718626155878467,
"grad_norm": 3.8562465627781743,
"learning_rate": 4.918822978332377e-07,
"loss": 0.4487,
"step": 11220
},
{
"epoch": 0.8726396767425596,
"grad_norm": 2.7494815741242484,
"learning_rate": 4.860328606461485e-07,
"loss": 0.4637,
"step": 11230
},
{
"epoch": 0.8734167378972725,
"grad_norm": 2.1088033052796895,
"learning_rate": 4.802166351318965e-07,
"loss": 0.4899,
"step": 11240
},
{
"epoch": 0.8741937990519854,
"grad_norm": 5.113207022204942,
"learning_rate": 4.7443366408356673e-07,
"loss": 0.5035,
"step": 11250
},
{
"epoch": 0.8749708602066982,
"grad_norm": 3.993509884814402,
"learning_rate": 4.6868399004957266e-07,
"loss": 0.4983,
"step": 11260
},
{
"epoch": 0.8757479213614111,
"grad_norm": 6.019062769443196,
"learning_rate": 4.6296765533334345e-07,
"loss": 0.5127,
"step": 11270
},
{
"epoch": 0.876524982516124,
"grad_norm": 3.42141410170646,
"learning_rate": 4.57284701993016e-07,
"loss": 0.4686,
"step": 11280
},
{
"epoch": 0.8773020436708369,
"grad_norm": 4.401665485132851,
"learning_rate": 4.5163517184111885e-07,
"loss": 0.4423,
"step": 11290
},
{
"epoch": 0.8780791048255497,
"grad_norm": 1.7965008908739462,
"learning_rate": 4.460191064442704e-07,
"loss": 0.5013,
"step": 11300
},
{
"epoch": 0.8788561659802626,
"grad_norm": 4.038506349330642,
"learning_rate": 4.4043654712287e-07,
"loss": 0.4681,
"step": 11310
},
{
"epoch": 0.8796332271349755,
"grad_norm": 2.6713825342303084,
"learning_rate": 4.348875349507953e-07,
"loss": 0.4723,
"step": 11320
},
{
"epoch": 0.8804102882896884,
"grad_norm": 2.5242881927131493,
"learning_rate": 4.293721107551002e-07,
"loss": 0.4948,
"step": 11330
},
{
"epoch": 0.8811873494444012,
"grad_norm": 3.089605520005084,
"learning_rate": 4.23890315115712e-07,
"loss": 0.4837,
"step": 11340
},
{
"epoch": 0.8819644105991141,
"grad_norm": 4.640356219725602,
"learning_rate": 4.184421883651374e-07,
"loss": 0.4594,
"step": 11350
},
{
"epoch": 0.882741471753827,
"grad_norm": 4.452516441213523,
"learning_rate": 4.1302777058816136e-07,
"loss": 0.5087,
"step": 11360
},
{
"epoch": 0.8835185329085399,
"grad_norm": 3.2814252714146903,
"learning_rate": 4.076471016215533e-07,
"loss": 0.4585,
"step": 11370
},
{
"epoch": 0.8842955940632528,
"grad_norm": 4.17360304036643,
"learning_rate": 4.023002210537763e-07,
"loss": 0.4808,
"step": 11380
},
{
"epoch": 0.8850726552179656,
"grad_norm": 3.4710617417209897,
"learning_rate": 3.9698716822469175e-07,
"loss": 0.4764,
"step": 11390
},
{
"epoch": 0.8858497163726785,
"grad_norm": 4.94630365171049,
"learning_rate": 3.917079822252756e-07,
"loss": 0.4676,
"step": 11400
},
{
"epoch": 0.8866267775273914,
"grad_norm": 3.9963020658849295,
"learning_rate": 3.864627018973244e-07,
"loss": 0.4594,
"step": 11410
},
{
"epoch": 0.8874038386821043,
"grad_norm": 4.149575936577817,
"learning_rate": 3.8125136583317404e-07,
"loss": 0.4408,
"step": 11420
},
{
"epoch": 0.8881808998368171,
"grad_norm": 4.0908393768408535,
"learning_rate": 3.760740123754125e-07,
"loss": 0.4906,
"step": 11430
},
{
"epoch": 0.88895796099153,
"grad_norm": 3.2442681217314413,
"learning_rate": 3.709306796166029e-07,
"loss": 0.4602,
"step": 11440
},
{
"epoch": 0.8897350221462429,
"grad_norm": 3.2062024108356786,
"learning_rate": 3.658214053989967e-07,
"loss": 0.4291,
"step": 11450
},
{
"epoch": 0.8905120833009558,
"grad_norm": 3.070354137183584,
"learning_rate": 3.6074622731426036e-07,
"loss": 0.4704,
"step": 11460
},
{
"epoch": 0.8912891444556686,
"grad_norm": 3.7959986708913136,
"learning_rate": 3.557051827031954e-07,
"loss": 0.4694,
"step": 11470
},
{
"epoch": 0.8920662056103815,
"grad_norm": 4.3724752517742145,
"learning_rate": 3.506983086554666e-07,
"loss": 0.4679,
"step": 11480
},
{
"epoch": 0.8928432667650944,
"grad_norm": 4.7403654025736035,
"learning_rate": 3.4572564200932634e-07,
"loss": 0.5283,
"step": 11490
},
{
"epoch": 0.8936203279198073,
"grad_norm": 4.243101118629279,
"learning_rate": 3.4078721935134397e-07,
"loss": 0.5125,
"step": 11500
},
{
"epoch": 0.8943973890745202,
"grad_norm": 4.475859170580614,
"learning_rate": 3.3588307701614144e-07,
"loss": 0.4869,
"step": 11510
},
{
"epoch": 0.895174450229233,
"grad_norm": 4.052974333086782,
"learning_rate": 3.310132510861169e-07,
"loss": 0.497,
"step": 11520
},
{
"epoch": 0.8959515113839459,
"grad_norm": 3.373865018498319,
"learning_rate": 3.2617777739118894e-07,
"loss": 0.4441,
"step": 11530
},
{
"epoch": 0.8967285725386588,
"grad_norm": 3.276175321494806,
"learning_rate": 3.213766915085248e-07,
"loss": 0.4451,
"step": 11540
},
{
"epoch": 0.8975056336933717,
"grad_norm": 3.908380664561767,
"learning_rate": 3.1661002876228473e-07,
"loss": 0.4243,
"step": 11550
},
{
"epoch": 0.8982826948480845,
"grad_norm": 2.6868106053772003,
"learning_rate": 3.118778242233572e-07,
"loss": 0.4427,
"step": 11560
},
{
"epoch": 0.8990597560027974,
"grad_norm": 3.3557801815767285,
"learning_rate": 3.0718011270910455e-07,
"loss": 0.4702,
"step": 11570
},
{
"epoch": 0.8998368171575103,
"grad_norm": 3.473766818324853,
"learning_rate": 3.02516928783107e-07,
"loss": 0.4744,
"step": 11580
},
{
"epoch": 0.9006138783122232,
"grad_norm": 3.8754395433857503,
"learning_rate": 2.978883067549032e-07,
"loss": 0.4519,
"step": 11590
},
{
"epoch": 0.901390939466936,
"grad_norm": 4.145319857126792,
"learning_rate": 2.9329428067974454e-07,
"loss": 0.4612,
"step": 11600
},
{
"epoch": 0.9021680006216489,
"grad_norm": 3.4197421104899424,
"learning_rate": 2.8873488435833983e-07,
"loss": 0.46,
"step": 11610
},
{
"epoch": 0.9029450617763618,
"grad_norm": 5.689929153660378,
"learning_rate": 2.8421015133660856e-07,
"loss": 0.4345,
"step": 11620
},
{
"epoch": 0.9037221229310747,
"grad_norm": 2.292957288599791,
"learning_rate": 2.797201149054335e-07,
"loss": 0.4454,
"step": 11630
},
{
"epoch": 0.9044991840857876,
"grad_norm": 4.486223577334596,
"learning_rate": 2.752648081004183e-07,
"loss": 0.4593,
"step": 11640
},
{
"epoch": 0.9052762452405004,
"grad_norm": 3.8405561325920745,
"learning_rate": 2.7084426370163954e-07,
"loss": 0.4888,
"step": 11650
},
{
"epoch": 0.9060533063952133,
"grad_norm": 3.406878245329023,
"learning_rate": 2.6645851423340806e-07,
"loss": 0.4558,
"step": 11660
},
{
"epoch": 0.9068303675499262,
"grad_norm": 4.950678382840644,
"learning_rate": 2.621075919640309e-07,
"loss": 0.4762,
"step": 11670
},
{
"epoch": 0.9076074287046391,
"grad_norm": 3.322238216032584,
"learning_rate": 2.577915289055727e-07,
"loss": 0.4759,
"step": 11680
},
{
"epoch": 0.9083844898593519,
"grad_norm": 3.3945486166885006,
"learning_rate": 2.535103568136205e-07,
"loss": 0.4955,
"step": 11690
},
{
"epoch": 0.9091615510140648,
"grad_norm": 3.8694072275201945,
"learning_rate": 2.492641071870489e-07,
"loss": 0.5166,
"step": 11700
},
{
"epoch": 0.9099386121687777,
"grad_norm": 4.7651096314002865,
"learning_rate": 2.450528112677886e-07,
"loss": 0.4971,
"step": 11710
},
{
"epoch": 0.9107156733234906,
"grad_norm": 4.469927022538459,
"learning_rate": 2.408765000406005e-07,
"loss": 0.4796,
"step": 11720
},
{
"epoch": 0.9114927344782034,
"grad_norm": 4.519223313466715,
"learning_rate": 2.367352042328408e-07,
"loss": 0.4685,
"step": 11730
},
{
"epoch": 0.9122697956329163,
"grad_norm": 3.963061942219626,
"learning_rate": 2.3262895431424015e-07,
"loss": 0.4851,
"step": 11740
},
{
"epoch": 0.9130468567876292,
"grad_norm": 2.4524133862796313,
"learning_rate": 2.2855778049667653e-07,
"loss": 0.4534,
"step": 11750
},
{
"epoch": 0.9138239179423421,
"grad_norm": 2.834722369254088,
"learning_rate": 2.2452171273395716e-07,
"loss": 0.4548,
"step": 11760
},
{
"epoch": 0.914600979097055,
"grad_norm": 3.662017876045297,
"learning_rate": 2.2052078072159143e-07,
"loss": 0.4596,
"step": 11770
},
{
"epoch": 0.9153780402517678,
"grad_norm": 4.021945589966396,
"learning_rate": 2.1655501389657941e-07,
"loss": 0.4744,
"step": 11780
},
{
"epoch": 0.9161551014064807,
"grad_norm": 3.251036017263966,
"learning_rate": 2.126244414371903e-07,
"loss": 0.4575,
"step": 11790
},
{
"epoch": 0.9169321625611936,
"grad_norm": 3.351594261133528,
"learning_rate": 2.087290922627494e-07,
"loss": 0.4722,
"step": 11800
},
{
"epoch": 0.9177092237159065,
"grad_norm": 2.9100443321260645,
"learning_rate": 2.0486899503342595e-07,
"loss": 0.4781,
"step": 11810
},
{
"epoch": 0.9184862848706193,
"grad_norm": 5.769177396129288,
"learning_rate": 2.010441781500233e-07,
"loss": 0.4561,
"step": 11820
},
{
"epoch": 0.9192633460253322,
"grad_norm": 3.6257554055271703,
"learning_rate": 1.9725466975376585e-07,
"loss": 0.4628,
"step": 11830
},
{
"epoch": 0.9200404071800451,
"grad_norm": 5.698219899736846,
"learning_rate": 1.9350049772609568e-07,
"loss": 0.4849,
"step": 11840
},
{
"epoch": 0.920817468334758,
"grad_norm": 5.10283696189389,
"learning_rate": 1.8978168968846632e-07,
"loss": 0.4584,
"step": 11850
},
{
"epoch": 0.9215945294894708,
"grad_norm": 2.4057166233933107,
"learning_rate": 1.8609827300213877e-07,
"loss": 0.4575,
"step": 11860
},
{
"epoch": 0.9223715906441837,
"grad_norm": 4.039902041938024,
"learning_rate": 1.8245027476798295e-07,
"loss": 0.4237,
"step": 11870
},
{
"epoch": 0.9231486517988966,
"grad_norm": 5.0104310640190155,
"learning_rate": 1.7883772182627378e-07,
"loss": 0.4609,
"step": 11880
},
{
"epoch": 0.9239257129536095,
"grad_norm": 3.235199066685605,
"learning_rate": 1.7526064075649718e-07,
"loss": 0.4725,
"step": 11890
},
{
"epoch": 0.9247027741083224,
"grad_norm": 5.7000179030429,
"learning_rate": 1.7171905787715436e-07,
"loss": 0.4844,
"step": 11900
},
{
"epoch": 0.9254798352630352,
"grad_norm": 4.833515226751012,
"learning_rate": 1.6821299924556557e-07,
"loss": 0.4711,
"step": 11910
},
{
"epoch": 0.9262568964177481,
"grad_norm": 4.541973195325704,
"learning_rate": 1.647424906576811e-07,
"loss": 0.4536,
"step": 11920
},
{
"epoch": 0.927033957572461,
"grad_norm": 3.1471929054096464,
"learning_rate": 1.613075576478923e-07,
"loss": 0.461,
"step": 11930
},
{
"epoch": 0.9278110187271739,
"grad_norm": 5.155810640275875,
"learning_rate": 1.5790822548883921e-07,
"loss": 0.4619,
"step": 11940
},
{
"epoch": 0.9285880798818867,
"grad_norm": 4.815168413187984,
"learning_rate": 1.545445191912287e-07,
"loss": 0.4811,
"step": 11950
},
{
"epoch": 0.9293651410365996,
"grad_norm": 4.039603939657306,
"learning_rate": 1.5121646350364784e-07,
"loss": 0.4677,
"step": 11960
},
{
"epoch": 0.9301422021913125,
"grad_norm": 3.0484480106622565,
"learning_rate": 1.4792408291238514e-07,
"loss": 0.4621,
"step": 11970
},
{
"epoch": 0.9309192633460254,
"grad_norm": 3.30445623378334,
"learning_rate": 1.4466740164124582e-07,
"loss": 0.423,
"step": 11980
},
{
"epoch": 0.9316963245007382,
"grad_norm": 5.507483370884143,
"learning_rate": 1.4144644365137906e-07,
"loss": 0.4395,
"step": 11990
},
{
"epoch": 0.9324733856554511,
"grad_norm": 4.472623280485502,
"learning_rate": 1.382612326410959e-07,
"loss": 0.4407,
"step": 12000
},
{
"epoch": 0.933250446810164,
"grad_norm": 4.43958885227866,
"learning_rate": 1.3511179204570014e-07,
"loss": 0.4594,
"step": 12010
},
{
"epoch": 0.9340275079648769,
"grad_norm": 4.219831856666021,
"learning_rate": 1.3199814503731144e-07,
"loss": 0.4935,
"step": 12020
},
{
"epoch": 0.9348045691195898,
"grad_norm": 3.6973725388649887,
"learning_rate": 1.289203145246981e-07,
"loss": 0.4163,
"step": 12030
},
{
"epoch": 0.9355816302743026,
"grad_norm": 3.7631001641207087,
"learning_rate": 1.258783231531069e-07,
"loss": 0.4795,
"step": 12040
},
{
"epoch": 0.9363586914290155,
"grad_norm": 4.91235572426644,
"learning_rate": 1.2287219330409716e-07,
"loss": 0.4763,
"step": 12050
},
{
"epoch": 0.9371357525837284,
"grad_norm": 5.162011591962256,
"learning_rate": 1.1990194709537496e-07,
"loss": 0.4663,
"step": 12060
},
{
"epoch": 0.9379128137384413,
"grad_norm": 4.847494906904684,
"learning_rate": 1.1696760638063243e-07,
"loss": 0.4638,
"step": 12070
},
{
"epoch": 0.9386898748931541,
"grad_norm": 3.4104319145126203,
"learning_rate": 1.1406919274938477e-07,
"loss": 0.5046,
"step": 12080
},
{
"epoch": 0.939466936047867,
"grad_norm": 3.5111768971357793,
"learning_rate": 1.112067275268125e-07,
"loss": 0.4713,
"step": 12090
},
{
"epoch": 0.9402439972025799,
"grad_norm": 2.618733293064988,
"learning_rate": 1.083802317736049e-07,
"loss": 0.4698,
"step": 12100
},
{
"epoch": 0.9410210583572928,
"grad_norm": 3.9733552104692333,
"learning_rate": 1.0558972628580522e-07,
"loss": 0.5037,
"step": 12110
},
{
"epoch": 0.9417981195120056,
"grad_norm": 3.876212810601272,
"learning_rate": 1.0283523159465514e-07,
"loss": 0.4538,
"step": 12120
},
{
"epoch": 0.9425751806667185,
"grad_norm": 3.2929997252443193,
"learning_rate": 1.0011676796644776e-07,
"loss": 0.4606,
"step": 12130
},
{
"epoch": 0.9433522418214313,
"grad_norm": 4.672469430214036,
"learning_rate": 9.743435540237433e-08,
"loss": 0.4695,
"step": 12140
},
{
"epoch": 0.9441293029761442,
"grad_norm": 3.462179429994501,
"learning_rate": 9.478801363838052e-08,
"loss": 0.448,
"step": 12150
},
{
"epoch": 0.944906364130857,
"grad_norm": 4.83408662775341,
"learning_rate": 9.217776214501984e-08,
"loss": 0.484,
"step": 12160
},
{
"epoch": 0.9456834252855699,
"grad_norm": 4.438140209760804,
"learning_rate": 8.960362012730983e-08,
"loss": 0.4603,
"step": 12170
},
{
"epoch": 0.9464604864402828,
"grad_norm": 3.4973998750156543,
"learning_rate": 8.706560652459062e-08,
"loss": 0.4249,
"step": 12180
},
{
"epoch": 0.9472375475949957,
"grad_norm": 4.089297315615882,
"learning_rate": 8.456374001038769e-08,
"loss": 0.4491,
"step": 12190
},
{
"epoch": 0.9480146087497086,
"grad_norm": 4.084120402865338,
"learning_rate": 8.209803899227209e-08,
"loss": 0.4535,
"step": 12200
},
{
"epoch": 0.9487916699044214,
"grad_norm": 3.941907831396277,
"learning_rate": 7.966852161172711e-08,
"loss": 0.4496,
"step": 12210
},
{
"epoch": 0.9495687310591343,
"grad_norm": 2.833823469782505,
"learning_rate": 7.727520574401127e-08,
"loss": 0.4243,
"step": 12220
},
{
"epoch": 0.9503457922138472,
"grad_norm": 4.551822537590359,
"learning_rate": 7.49181089980322e-08,
"loss": 0.4582,
"step": 12230
},
{
"epoch": 0.9511228533685601,
"grad_norm": 3.043933176817138,
"learning_rate": 7.259724871621188e-08,
"loss": 0.5034,
"step": 12240
},
{
"epoch": 0.9518999145232729,
"grad_norm": 3.4621240444267665,
"learning_rate": 7.031264197436161e-08,
"loss": 0.4268,
"step": 12250
},
{
"epoch": 0.9526769756779858,
"grad_norm": 4.056375247941382,
"learning_rate": 6.806430558155719e-08,
"loss": 0.4745,
"step": 12260
},
{
"epoch": 0.9534540368326987,
"grad_norm": 4.535857419133766,
"learning_rate": 6.585225608001178e-08,
"loss": 0.4308,
"step": 12270
},
{
"epoch": 0.9542310979874116,
"grad_norm": 2.4310495050933816,
"learning_rate": 6.367650974495875e-08,
"loss": 0.4222,
"step": 12280
},
{
"epoch": 0.9550081591421244,
"grad_norm": 1.8480746534853145,
"learning_rate": 6.153708258452851e-08,
"loss": 0.4637,
"step": 12290
},
{
"epoch": 0.9557852202968373,
"grad_norm": 4.469852603004664,
"learning_rate": 5.943399033963182e-08,
"loss": 0.4771,
"step": 12300
},
{
"epoch": 0.9565622814515502,
"grad_norm": 3.2674434265539745,
"learning_rate": 5.7367248483845005e-08,
"loss": 0.4866,
"step": 12310
},
{
"epoch": 0.9573393426062631,
"grad_norm": 2.005250278061698,
"learning_rate": 5.533687222329332e-08,
"loss": 0.4144,
"step": 12320
},
{
"epoch": 0.958116403760976,
"grad_norm": 2.6745479068375824,
"learning_rate": 5.3342876496542126e-08,
"loss": 0.4685,
"step": 12330
},
{
"epoch": 0.9588934649156888,
"grad_norm": 2.9539394159745815,
"learning_rate": 5.138527597448595e-08,
"loss": 0.4639,
"step": 12340
},
{
"epoch": 0.9596705260704017,
"grad_norm": 4.017786152412138,
"learning_rate": 4.946408506023958e-08,
"loss": 0.442,
"step": 12350
},
{
"epoch": 0.9604475872251146,
"grad_norm": 3.850870480799147,
"learning_rate": 4.757931788903325e-08,
"loss": 0.4304,
"step": 12360
},
{
"epoch": 0.9612246483798275,
"grad_norm": 3.0544561131913586,
"learning_rate": 4.573098832810818e-08,
"loss": 0.4478,
"step": 12370
},
{
"epoch": 0.9620017095345403,
"grad_norm": 5.80094396671801,
"learning_rate": 4.391910997661397e-08,
"loss": 0.4821,
"step": 12380
},
{
"epoch": 0.9627787706892532,
"grad_norm": 3.5119994742694773,
"learning_rate": 4.214369616550973e-08,
"loss": 0.4362,
"step": 12390
},
{
"epoch": 0.9635558318439661,
"grad_norm": 4.914214488501594,
"learning_rate": 4.040475995746529e-08,
"loss": 0.4375,
"step": 12400
},
{
"epoch": 0.964332892998679,
"grad_norm": 3.0958335114663322,
"learning_rate": 3.8702314146766284e-08,
"loss": 0.4565,
"step": 12410
},
{
"epoch": 0.9651099541533918,
"grad_norm": 3.156460394460856,
"learning_rate": 3.7036371259216994e-08,
"loss": 0.4625,
"step": 12420
},
{
"epoch": 0.9658870153081047,
"grad_norm": 3.411808395407994,
"learning_rate": 3.540694355205099e-08,
"loss": 0.4403,
"step": 12430
},
{
"epoch": 0.9666640764628176,
"grad_norm": 3.823904951701004,
"learning_rate": 3.381404301384117e-08,
"loss": 0.4446,
"step": 12440
},
{
"epoch": 0.9674411376175305,
"grad_norm": 3.762167967184466,
"learning_rate": 3.225768136440821e-08,
"loss": 0.4588,
"step": 12450
},
{
"epoch": 0.9682181987722434,
"grad_norm": 3.0475232787033835,
"learning_rate": 3.0737870054739496e-08,
"loss": 0.4643,
"step": 12460
},
{
"epoch": 0.9689952599269562,
"grad_norm": 3.768638648169802,
"learning_rate": 2.925462026689918e-08,
"loss": 0.4438,
"step": 12470
},
{
"epoch": 0.9697723210816691,
"grad_norm": 3.1658559982961942,
"learning_rate": 2.7807942913950504e-08,
"loss": 0.4872,
"step": 12480
},
{
"epoch": 0.970549382236382,
"grad_norm": 4.351267552340424,
"learning_rate": 2.6397848639874156e-08,
"loss": 0.4828,
"step": 12490
},
{
"epoch": 0.9713264433910949,
"grad_norm": 4.485145700676859,
"learning_rate": 2.502434781948726e-08,
"loss": 0.4754,
"step": 12500
},
{
"epoch": 0.9721035045458077,
"grad_norm": 3.772459688285439,
"learning_rate": 2.3687450558370627e-08,
"loss": 0.4425,
"step": 12510
},
{
"epoch": 0.9728805657005206,
"grad_norm": 2.3671995254376474,
"learning_rate": 2.2387166692794392e-08,
"loss": 0.4698,
"step": 12520
},
{
"epoch": 0.9736576268552335,
"grad_norm": 4.4933944619724,
"learning_rate": 2.1123505789642507e-08,
"loss": 0.4746,
"step": 12530
},
{
"epoch": 0.9744346880099464,
"grad_norm": 4.072967347229249,
"learning_rate": 1.989647714634446e-08,
"loss": 0.4646,
"step": 12540
},
{
"epoch": 0.9752117491646592,
"grad_norm": 3.8176450930369965,
"learning_rate": 1.8706089790807014e-08,
"loss": 0.4885,
"step": 12550
},
{
"epoch": 0.9759888103193721,
"grad_norm": 5.406894035256226,
"learning_rate": 1.7552352481347013e-08,
"loss": 0.4495,
"step": 12560
},
{
"epoch": 0.976765871474085,
"grad_norm": 2.91507715459867,
"learning_rate": 1.6435273706627564e-08,
"loss": 0.4498,
"step": 12570
},
{
"epoch": 0.9775429326287979,
"grad_norm": 3.455836019853387,
"learning_rate": 1.5354861685595855e-08,
"loss": 0.4679,
"step": 12580
},
{
"epoch": 0.9783199937835108,
"grad_norm": 3.065621924437169,
"learning_rate": 1.4311124367420992e-08,
"loss": 0.424,
"step": 12590
},
{
"epoch": 0.9790970549382236,
"grad_norm": 5.408364243129198,
"learning_rate": 1.3304069431437362e-08,
"loss": 0.4582,
"step": 12600
},
{
"epoch": 0.9798741160929365,
"grad_norm": 3.9623851369922485,
"learning_rate": 1.2333704287087467e-08,
"loss": 0.4733,
"step": 12610
},
{
"epoch": 0.9806511772476494,
"grad_norm": 3.6951264488478976,
"learning_rate": 1.1400036073866416e-08,
"loss": 0.46,
"step": 12620
},
{
"epoch": 0.9814282384023623,
"grad_norm": 2.8637927854551233,
"learning_rate": 1.0503071661271957e-08,
"loss": 0.4449,
"step": 12630
},
{
"epoch": 0.9822052995570751,
"grad_norm": 3.2568596741604523,
"learning_rate": 9.642817648750636e-09,
"loss": 0.4644,
"step": 12640
},
{
"epoch": 0.982982360711788,
"grad_norm": 4.000380462168666,
"learning_rate": 8.819280365652827e-09,
"loss": 0.4525,
"step": 12650
},
{
"epoch": 0.9837594218665009,
"grad_norm": 4.048475764438385,
"learning_rate": 8.032465871182227e-09,
"loss": 0.4586,
"step": 12660
},
{
"epoch": 0.9845364830212138,
"grad_norm": 3.2880203159325307,
"learning_rate": 7.282379954354768e-09,
"loss": 0.4334,
"step": 12670
},
{
"epoch": 0.9853135441759266,
"grad_norm": 4.0643620339312605,
"learning_rate": 6.569028133954214e-09,
"loss": 0.4458,
"step": 12680
},
{
"epoch": 0.9860906053306395,
"grad_norm": 4.549795834627539,
"learning_rate": 5.892415658491634e-09,
"loss": 0.4554,
"step": 12690
},
{
"epoch": 0.9868676664853524,
"grad_norm": 2.533413360663321,
"learning_rate": 5.252547506167105e-09,
"loss": 0.4535,
"step": 12700
},
{
"epoch": 0.9876447276400653,
"grad_norm": 3.539581600293753,
"learning_rate": 4.649428384833065e-09,
"loss": 0.4591,
"step": 12710
},
{
"epoch": 0.9884217887947782,
"grad_norm": 3.392398736378723,
"learning_rate": 4.083062731960463e-09,
"loss": 0.4609,
"step": 12720
},
{
"epoch": 0.989198849949491,
"grad_norm": 3.8523386314806305,
"learning_rate": 3.5534547146043318e-09,
"loss": 0.4601,
"step": 12730
},
{
"epoch": 0.9899759111042039,
"grad_norm": 4.270954545588355,
"learning_rate": 3.060608229373818e-09,
"loss": 0.4578,
"step": 12740
},
{
"epoch": 0.9907529722589168,
"grad_norm": 3.428519580605601,
"learning_rate": 2.6045269024049802e-09,
"loss": 0.4564,
"step": 12750
},
{
"epoch": 0.9915300334136297,
"grad_norm": 3.2452332555408683,
"learning_rate": 2.1852140893319218e-09,
"loss": 0.4291,
"step": 12760
},
{
"epoch": 0.9923070945683425,
"grad_norm": 2.788077697667321,
"learning_rate": 1.8026728752634781e-09,
"loss": 0.4726,
"step": 12770
},
{
"epoch": 0.9930841557230554,
"grad_norm": 4.551637132581418,
"learning_rate": 1.4569060747610109e-09,
"loss": 0.4655,
"step": 12780
},
{
"epoch": 0.9938612168777683,
"grad_norm": 3.8202797170955614,
"learning_rate": 1.1479162318150939e-09,
"loss": 0.4136,
"step": 12790
},
{
"epoch": 0.9946382780324812,
"grad_norm": 4.028638686891394,
"learning_rate": 8.757056198294145e-10,
"loss": 0.4866,
"step": 12800
},
{
"epoch": 0.995415339187194,
"grad_norm": 3.652876594672518,
"learning_rate": 6.402762416035657e-10,
"loss": 0.4361,
"step": 12810
},
{
"epoch": 0.9961924003419069,
"grad_norm": 3.900658925525932,
"learning_rate": 4.4162982931750255e-10,
"loss": 0.4366,
"step": 12820
},
{
"epoch": 0.9969694614966198,
"grad_norm": 4.0222498355179,
"learning_rate": 2.7976784451877457e-10,
"loss": 0.5075,
"step": 12830
},
{
"epoch": 0.9977465226513327,
"grad_norm": 3.2583568617059995,
"learning_rate": 1.5469147811308926e-10,
"loss": 0.438,
"step": 12840
},
{
"epoch": 0.9985235838060456,
"grad_norm": 3.7370678626951936,
"learning_rate": 6.640165035431967e-11,
"loss": 0.4643,
"step": 12850
},
{
"epoch": 0.9993006449607584,
"grad_norm": 4.275291482479352,
"learning_rate": 1.4899010837288174e-11,
"loss": 0.4825,
"step": 12860
},
{
"epoch": 1.0,
"step": 12869,
"total_flos": 626183508787200.0,
"train_loss": 0.7687553066651168,
"train_runtime": 40960.0959,
"train_samples_per_second": 5.027,
"train_steps_per_second": 0.314
}
],
"logging_steps": 10,
"max_steps": 12869,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 20000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 626183508787200.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}