ahmedheakl's picture
End of training
3bf05c4 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.999028560326404,
"eval_steps": 500,
"global_step": 7719,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0038857586943850785,
"grad_norm": 6.868951908344346,
"learning_rate": 5.181347150259067e-07,
"loss": 1.0933,
"step": 10
},
{
"epoch": 0.007771517388770157,
"grad_norm": 5.837615214745706,
"learning_rate": 1.0362694300518134e-06,
"loss": 1.0482,
"step": 20
},
{
"epoch": 0.011657276083155236,
"grad_norm": 3.7584728670248264,
"learning_rate": 1.5544041450777204e-06,
"loss": 0.9528,
"step": 30
},
{
"epoch": 0.015543034777540314,
"grad_norm": 2.430849231904627,
"learning_rate": 2.072538860103627e-06,
"loss": 0.7595,
"step": 40
},
{
"epoch": 0.019428793471925394,
"grad_norm": 1.9877795062873533,
"learning_rate": 2.5906735751295338e-06,
"loss": 0.5683,
"step": 50
},
{
"epoch": 0.02331455216631047,
"grad_norm": 1.1748343816269307,
"learning_rate": 3.1088082901554407e-06,
"loss": 0.393,
"step": 60
},
{
"epoch": 0.02720031086069555,
"grad_norm": 1.2009973092317858,
"learning_rate": 3.6269430051813476e-06,
"loss": 0.2619,
"step": 70
},
{
"epoch": 0.031086069555080628,
"grad_norm": 1.0407742203046373,
"learning_rate": 4.145077720207254e-06,
"loss": 0.2057,
"step": 80
},
{
"epoch": 0.03497182824946571,
"grad_norm": 1.1295981297490019,
"learning_rate": 4.663212435233161e-06,
"loss": 0.1671,
"step": 90
},
{
"epoch": 0.03885758694385079,
"grad_norm": 1.1504503826508636,
"learning_rate": 5.1813471502590676e-06,
"loss": 0.155,
"step": 100
},
{
"epoch": 0.04274334563823587,
"grad_norm": 0.7676430272864883,
"learning_rate": 5.699481865284975e-06,
"loss": 0.1451,
"step": 110
},
{
"epoch": 0.04662910433262094,
"grad_norm": 1.1281617722679143,
"learning_rate": 6.217616580310881e-06,
"loss": 0.1436,
"step": 120
},
{
"epoch": 0.05051486302700602,
"grad_norm": 1.042258959715133,
"learning_rate": 6.735751295336788e-06,
"loss": 0.1295,
"step": 130
},
{
"epoch": 0.0544006217213911,
"grad_norm": 1.0836324717622774,
"learning_rate": 7.253886010362695e-06,
"loss": 0.1149,
"step": 140
},
{
"epoch": 0.05828638041577618,
"grad_norm": 1.4204705637615596,
"learning_rate": 7.772020725388602e-06,
"loss": 0.1268,
"step": 150
},
{
"epoch": 0.062172139110161256,
"grad_norm": 1.0083666041980668,
"learning_rate": 8.290155440414507e-06,
"loss": 0.1182,
"step": 160
},
{
"epoch": 0.06605789780454634,
"grad_norm": 0.833955996150633,
"learning_rate": 8.808290155440415e-06,
"loss": 0.1037,
"step": 170
},
{
"epoch": 0.06994365649893142,
"grad_norm": 0.762829972908706,
"learning_rate": 9.326424870466322e-06,
"loss": 0.1047,
"step": 180
},
{
"epoch": 0.0738294151933165,
"grad_norm": 1.018781373894918,
"learning_rate": 9.844559585492228e-06,
"loss": 0.1059,
"step": 190
},
{
"epoch": 0.07771517388770158,
"grad_norm": 0.8149796744145043,
"learning_rate": 1.0362694300518135e-05,
"loss": 0.1094,
"step": 200
},
{
"epoch": 0.08160093258208666,
"grad_norm": 1.0562790321924551,
"learning_rate": 1.0880829015544042e-05,
"loss": 0.0978,
"step": 210
},
{
"epoch": 0.08548669127647174,
"grad_norm": 0.8392051142898549,
"learning_rate": 1.139896373056995e-05,
"loss": 0.1074,
"step": 220
},
{
"epoch": 0.0893724499708568,
"grad_norm": 0.8606053025510271,
"learning_rate": 1.1917098445595855e-05,
"loss": 0.0936,
"step": 230
},
{
"epoch": 0.09325820866524188,
"grad_norm": 0.8010752114927957,
"learning_rate": 1.2435233160621763e-05,
"loss": 0.084,
"step": 240
},
{
"epoch": 0.09714396735962696,
"grad_norm": 0.8865363433827388,
"learning_rate": 1.2953367875647668e-05,
"loss": 0.0849,
"step": 250
},
{
"epoch": 0.10102972605401205,
"grad_norm": 1.0150630528923608,
"learning_rate": 1.3471502590673576e-05,
"loss": 0.0941,
"step": 260
},
{
"epoch": 0.10491548474839713,
"grad_norm": 1.1564503498169056,
"learning_rate": 1.3989637305699483e-05,
"loss": 0.0884,
"step": 270
},
{
"epoch": 0.1088012434427822,
"grad_norm": 0.7425602409996528,
"learning_rate": 1.450777202072539e-05,
"loss": 0.0904,
"step": 280
},
{
"epoch": 0.11268700213716729,
"grad_norm": 1.010495639141353,
"learning_rate": 1.5025906735751296e-05,
"loss": 0.0877,
"step": 290
},
{
"epoch": 0.11657276083155237,
"grad_norm": 0.7312574416672747,
"learning_rate": 1.5544041450777204e-05,
"loss": 0.0833,
"step": 300
},
{
"epoch": 0.12045851952593743,
"grad_norm": 0.8186188565897778,
"learning_rate": 1.606217616580311e-05,
"loss": 0.082,
"step": 310
},
{
"epoch": 0.12434427822032251,
"grad_norm": 0.6725849359490335,
"learning_rate": 1.6580310880829015e-05,
"loss": 0.0773,
"step": 320
},
{
"epoch": 0.1282300369147076,
"grad_norm": 0.6044289285905357,
"learning_rate": 1.7098445595854924e-05,
"loss": 0.0793,
"step": 330
},
{
"epoch": 0.13211579560909267,
"grad_norm": 0.648628136760092,
"learning_rate": 1.761658031088083e-05,
"loss": 0.0748,
"step": 340
},
{
"epoch": 0.13600155430347777,
"grad_norm": 0.7654085124832696,
"learning_rate": 1.813471502590674e-05,
"loss": 0.0656,
"step": 350
},
{
"epoch": 0.13988731299786283,
"grad_norm": 0.6998519217873228,
"learning_rate": 1.8652849740932644e-05,
"loss": 0.0841,
"step": 360
},
{
"epoch": 0.1437730716922479,
"grad_norm": 0.5847858909362429,
"learning_rate": 1.917098445595855e-05,
"loss": 0.0692,
"step": 370
},
{
"epoch": 0.147658830386633,
"grad_norm": 0.7494257595598578,
"learning_rate": 1.9689119170984456e-05,
"loss": 0.0783,
"step": 380
},
{
"epoch": 0.15154458908101806,
"grad_norm": 0.7368291090701112,
"learning_rate": 2.0207253886010365e-05,
"loss": 0.0672,
"step": 390
},
{
"epoch": 0.15543034777540315,
"grad_norm": 0.6162219355087554,
"learning_rate": 2.072538860103627e-05,
"loss": 0.0768,
"step": 400
},
{
"epoch": 0.15931610646978822,
"grad_norm": 1.0522389801039482,
"learning_rate": 2.124352331606218e-05,
"loss": 0.0753,
"step": 410
},
{
"epoch": 0.16320186516417332,
"grad_norm": 0.6631859133005514,
"learning_rate": 2.1761658031088085e-05,
"loss": 0.0853,
"step": 420
},
{
"epoch": 0.16708762385855838,
"grad_norm": 0.8284655987791505,
"learning_rate": 2.227979274611399e-05,
"loss": 0.0644,
"step": 430
},
{
"epoch": 0.17097338255294348,
"grad_norm": 0.6911832435770443,
"learning_rate": 2.27979274611399e-05,
"loss": 0.0623,
"step": 440
},
{
"epoch": 0.17485914124732854,
"grad_norm": 0.5902621273486757,
"learning_rate": 2.3316062176165802e-05,
"loss": 0.077,
"step": 450
},
{
"epoch": 0.1787448999417136,
"grad_norm": 0.7554872423202403,
"learning_rate": 2.383419689119171e-05,
"loss": 0.0798,
"step": 460
},
{
"epoch": 0.1826306586360987,
"grad_norm": 0.6639542635427919,
"learning_rate": 2.435233160621762e-05,
"loss": 0.0786,
"step": 470
},
{
"epoch": 0.18651641733048377,
"grad_norm": 0.7784648823322334,
"learning_rate": 2.4870466321243526e-05,
"loss": 0.0814,
"step": 480
},
{
"epoch": 0.19040217602486886,
"grad_norm": 0.6900832905628588,
"learning_rate": 2.5388601036269435e-05,
"loss": 0.0787,
"step": 490
},
{
"epoch": 0.19428793471925393,
"grad_norm": 0.6434291181769316,
"learning_rate": 2.5906735751295337e-05,
"loss": 0.0694,
"step": 500
},
{
"epoch": 0.19817369341363902,
"grad_norm": 0.6727244264583743,
"learning_rate": 2.6424870466321246e-05,
"loss": 0.0696,
"step": 510
},
{
"epoch": 0.2020594521080241,
"grad_norm": 0.5145508471366054,
"learning_rate": 2.694300518134715e-05,
"loss": 0.0762,
"step": 520
},
{
"epoch": 0.20594521080240916,
"grad_norm": 0.5458188017984406,
"learning_rate": 2.746113989637306e-05,
"loss": 0.067,
"step": 530
},
{
"epoch": 0.20983096949679425,
"grad_norm": 0.615156585002907,
"learning_rate": 2.7979274611398966e-05,
"loss": 0.0674,
"step": 540
},
{
"epoch": 0.21371672819117932,
"grad_norm": 0.5503543665259718,
"learning_rate": 2.8497409326424872e-05,
"loss": 0.0764,
"step": 550
},
{
"epoch": 0.2176024868855644,
"grad_norm": 0.6637963707127615,
"learning_rate": 2.901554404145078e-05,
"loss": 0.0666,
"step": 560
},
{
"epoch": 0.22148824557994948,
"grad_norm": 0.5116291190440031,
"learning_rate": 2.9533678756476683e-05,
"loss": 0.0648,
"step": 570
},
{
"epoch": 0.22537400427433457,
"grad_norm": 0.6270451144717418,
"learning_rate": 3.0051813471502592e-05,
"loss": 0.0634,
"step": 580
},
{
"epoch": 0.22925976296871964,
"grad_norm": 0.5329413421123648,
"learning_rate": 3.0569948186528505e-05,
"loss": 0.0605,
"step": 590
},
{
"epoch": 0.23314552166310473,
"grad_norm": 0.7863078930627488,
"learning_rate": 3.108808290155441e-05,
"loss": 0.0611,
"step": 600
},
{
"epoch": 0.2370312803574898,
"grad_norm": 0.5641860345765728,
"learning_rate": 3.1606217616580316e-05,
"loss": 0.0713,
"step": 610
},
{
"epoch": 0.24091703905187487,
"grad_norm": 0.6298950201322868,
"learning_rate": 3.212435233160622e-05,
"loss": 0.062,
"step": 620
},
{
"epoch": 0.24480279774625996,
"grad_norm": 0.5576657376337026,
"learning_rate": 3.264248704663213e-05,
"loss": 0.0729,
"step": 630
},
{
"epoch": 0.24868855644064503,
"grad_norm": 0.5849778617897893,
"learning_rate": 3.316062176165803e-05,
"loss": 0.0663,
"step": 640
},
{
"epoch": 0.2525743151350301,
"grad_norm": 0.6862718900926282,
"learning_rate": 3.367875647668394e-05,
"loss": 0.0702,
"step": 650
},
{
"epoch": 0.2564600738294152,
"grad_norm": 0.620792145685114,
"learning_rate": 3.419689119170985e-05,
"loss": 0.0647,
"step": 660
},
{
"epoch": 0.2603458325238003,
"grad_norm": 0.5377947147204513,
"learning_rate": 3.471502590673575e-05,
"loss": 0.0629,
"step": 670
},
{
"epoch": 0.26423159121818535,
"grad_norm": 0.5488958030608141,
"learning_rate": 3.523316062176166e-05,
"loss": 0.0671,
"step": 680
},
{
"epoch": 0.2681173499125704,
"grad_norm": 0.5954204106913672,
"learning_rate": 3.575129533678757e-05,
"loss": 0.064,
"step": 690
},
{
"epoch": 0.27200310860695553,
"grad_norm": 0.5519696206558653,
"learning_rate": 3.626943005181348e-05,
"loss": 0.055,
"step": 700
},
{
"epoch": 0.2758888673013406,
"grad_norm": 0.6671381342735235,
"learning_rate": 3.6787564766839386e-05,
"loss": 0.0566,
"step": 710
},
{
"epoch": 0.27977462599572567,
"grad_norm": 0.46839311589453947,
"learning_rate": 3.730569948186529e-05,
"loss": 0.0655,
"step": 720
},
{
"epoch": 0.28366038469011073,
"grad_norm": 0.49157316560679837,
"learning_rate": 3.78238341968912e-05,
"loss": 0.0535,
"step": 730
},
{
"epoch": 0.2875461433844958,
"grad_norm": 0.49892347263304754,
"learning_rate": 3.83419689119171e-05,
"loss": 0.0674,
"step": 740
},
{
"epoch": 0.2914319020788809,
"grad_norm": 0.5208448100813172,
"learning_rate": 3.886010362694301e-05,
"loss": 0.0511,
"step": 750
},
{
"epoch": 0.295317660773266,
"grad_norm": 0.2894917388721045,
"learning_rate": 3.937823834196891e-05,
"loss": 0.0568,
"step": 760
},
{
"epoch": 0.29920341946765106,
"grad_norm": 0.532926439876675,
"learning_rate": 3.989637305699482e-05,
"loss": 0.0565,
"step": 770
},
{
"epoch": 0.3030891781620361,
"grad_norm": 0.49821621909804464,
"learning_rate": 3.999986911657599e-05,
"loss": 0.0718,
"step": 780
},
{
"epoch": 0.30697493685642124,
"grad_norm": 0.4380644811650154,
"learning_rate": 3.99993374056019e-05,
"loss": 0.0578,
"step": 790
},
{
"epoch": 0.3108606955508063,
"grad_norm": 0.47585349907521746,
"learning_rate": 3.999839669772912e-05,
"loss": 0.0523,
"step": 800
},
{
"epoch": 0.3147464542451914,
"grad_norm": 0.31641494499233286,
"learning_rate": 3.999704701219562e-05,
"loss": 0.0563,
"step": 810
},
{
"epoch": 0.31863221293957644,
"grad_norm": 0.48142926317508933,
"learning_rate": 3.999528837660319e-05,
"loss": 0.0587,
"step": 820
},
{
"epoch": 0.3225179716339615,
"grad_norm": 0.48754014218594893,
"learning_rate": 3.999312082691682e-05,
"loss": 0.0659,
"step": 830
},
{
"epoch": 0.32640373032834663,
"grad_norm": 0.6302273551444069,
"learning_rate": 3.9990544407464044e-05,
"loss": 0.0617,
"step": 840
},
{
"epoch": 0.3302894890227317,
"grad_norm": 0.4908822983385492,
"learning_rate": 3.9987559170934e-05,
"loss": 0.0573,
"step": 850
},
{
"epoch": 0.33417524771711676,
"grad_norm": 0.3297602359842017,
"learning_rate": 3.9984165178376316e-05,
"loss": 0.0491,
"step": 860
},
{
"epoch": 0.33806100641150183,
"grad_norm": 0.419433697037015,
"learning_rate": 3.9980362499199915e-05,
"loss": 0.0555,
"step": 870
},
{
"epoch": 0.34194676510588695,
"grad_norm": 0.5429807682582697,
"learning_rate": 3.997615121117159e-05,
"loss": 0.0501,
"step": 880
},
{
"epoch": 0.345832523800272,
"grad_norm": 0.5045801179188242,
"learning_rate": 3.997153140041437e-05,
"loss": 0.0527,
"step": 890
},
{
"epoch": 0.3497182824946571,
"grad_norm": 0.4119373168535617,
"learning_rate": 3.9966503161405786e-05,
"loss": 0.0579,
"step": 900
},
{
"epoch": 0.35360404118904215,
"grad_norm": 0.3652010084324409,
"learning_rate": 3.996106659697597e-05,
"loss": 0.0394,
"step": 910
},
{
"epoch": 0.3574897998834272,
"grad_norm": 0.49911111921282303,
"learning_rate": 3.9955221818305504e-05,
"loss": 0.0618,
"step": 920
},
{
"epoch": 0.36137555857781234,
"grad_norm": 0.4691524202220399,
"learning_rate": 3.994896894492316e-05,
"loss": 0.0449,
"step": 930
},
{
"epoch": 0.3652613172721974,
"grad_norm": 0.4522219450999137,
"learning_rate": 3.9942308104703464e-05,
"loss": 0.0637,
"step": 940
},
{
"epoch": 0.36914707596658247,
"grad_norm": 0.3759169728636205,
"learning_rate": 3.993523943386408e-05,
"loss": 0.0556,
"step": 950
},
{
"epoch": 0.37303283466096754,
"grad_norm": 0.4043985895545187,
"learning_rate": 3.9927763076963026e-05,
"loss": 0.0497,
"step": 960
},
{
"epoch": 0.37691859335535266,
"grad_norm": 0.36167729094148904,
"learning_rate": 3.99198791868957e-05,
"loss": 0.0443,
"step": 970
},
{
"epoch": 0.3808043520497377,
"grad_norm": 0.43188183331177943,
"learning_rate": 3.991158792489178e-05,
"loss": 0.0492,
"step": 980
},
{
"epoch": 0.3846901107441228,
"grad_norm": 0.591934470494083,
"learning_rate": 3.9902889460511895e-05,
"loss": 0.0433,
"step": 990
},
{
"epoch": 0.38857586943850786,
"grad_norm": 0.38540473325489366,
"learning_rate": 3.989378397164419e-05,
"loss": 0.062,
"step": 1000
},
{
"epoch": 0.3924616281328929,
"grad_norm": 0.4686847716122605,
"learning_rate": 3.988427164450067e-05,
"loss": 0.053,
"step": 1010
},
{
"epoch": 0.39634738682727805,
"grad_norm": 0.44830904799588184,
"learning_rate": 3.98743526736134e-05,
"loss": 0.0546,
"step": 1020
},
{
"epoch": 0.4002331455216631,
"grad_norm": 0.38427649669131925,
"learning_rate": 3.986402726183051e-05,
"loss": 0.044,
"step": 1030
},
{
"epoch": 0.4041189042160482,
"grad_norm": 0.3797043059039449,
"learning_rate": 3.985329562031207e-05,
"loss": 0.0507,
"step": 1040
},
{
"epoch": 0.40800466291043325,
"grad_norm": 0.4645705602868689,
"learning_rate": 3.9842157968525755e-05,
"loss": 0.0488,
"step": 1050
},
{
"epoch": 0.4118904216048183,
"grad_norm": 0.45995234050567774,
"learning_rate": 3.9830614534242365e-05,
"loss": 0.0504,
"step": 1060
},
{
"epoch": 0.41577618029920344,
"grad_norm": 0.6048279655888865,
"learning_rate": 3.981866555353115e-05,
"loss": 0.0554,
"step": 1070
},
{
"epoch": 0.4196619389935885,
"grad_norm": 0.4778278100735776,
"learning_rate": 3.9806311270755026e-05,
"loss": 0.0504,
"step": 1080
},
{
"epoch": 0.42354769768797357,
"grad_norm": 0.2819163749757861,
"learning_rate": 3.9793551938565513e-05,
"loss": 0.0473,
"step": 1090
},
{
"epoch": 0.42743345638235863,
"grad_norm": 0.34078755188718823,
"learning_rate": 3.978038781789764e-05,
"loss": 0.0513,
"step": 1100
},
{
"epoch": 0.43131921507674376,
"grad_norm": 0.42583054736380277,
"learning_rate": 3.9766819177964535e-05,
"loss": 0.0469,
"step": 1110
},
{
"epoch": 0.4352049737711288,
"grad_norm": 0.5037049238340003,
"learning_rate": 3.975284629625198e-05,
"loss": 0.0552,
"step": 1120
},
{
"epoch": 0.4390907324655139,
"grad_norm": 0.4127984535316269,
"learning_rate": 3.973846945851271e-05,
"loss": 0.0431,
"step": 1130
},
{
"epoch": 0.44297649115989896,
"grad_norm": 0.4024463238112972,
"learning_rate": 3.972368895876056e-05,
"loss": 0.0442,
"step": 1140
},
{
"epoch": 0.446862249854284,
"grad_norm": 0.34036792334896143,
"learning_rate": 3.970850509926448e-05,
"loss": 0.0429,
"step": 1150
},
{
"epoch": 0.45074800854866914,
"grad_norm": 0.43486746219653427,
"learning_rate": 3.969291819054232e-05,
"loss": 0.0465,
"step": 1160
},
{
"epoch": 0.4546337672430542,
"grad_norm": 0.3533700876931675,
"learning_rate": 3.9676928551354524e-05,
"loss": 0.0442,
"step": 1170
},
{
"epoch": 0.4585195259374393,
"grad_norm": 0.4418640383769793,
"learning_rate": 3.9660536508697545e-05,
"loss": 0.0433,
"step": 1180
},
{
"epoch": 0.46240528463182434,
"grad_norm": 0.45970453713501874,
"learning_rate": 3.9643742397797236e-05,
"loss": 0.0511,
"step": 1190
},
{
"epoch": 0.46629104332620946,
"grad_norm": 0.35306399118899623,
"learning_rate": 3.9626546562101936e-05,
"loss": 0.0508,
"step": 1200
},
{
"epoch": 0.47017680202059453,
"grad_norm": 0.3211095806787999,
"learning_rate": 3.960894935327546e-05,
"loss": 0.039,
"step": 1210
},
{
"epoch": 0.4740625607149796,
"grad_norm": 0.36627785482679437,
"learning_rate": 3.9590951131189934e-05,
"loss": 0.0558,
"step": 1220
},
{
"epoch": 0.47794831940936466,
"grad_norm": 0.421462123124338,
"learning_rate": 3.957255226391839e-05,
"loss": 0.0497,
"step": 1230
},
{
"epoch": 0.48183407810374973,
"grad_norm": 0.3353485956514776,
"learning_rate": 3.955375312772729e-05,
"loss": 0.0384,
"step": 1240
},
{
"epoch": 0.48571983679813485,
"grad_norm": 11.666925837495032,
"learning_rate": 3.9534554107068786e-05,
"loss": 0.0472,
"step": 1250
},
{
"epoch": 0.4896055954925199,
"grad_norm": 0.4841474924556115,
"learning_rate": 3.9514955594572874e-05,
"loss": 0.0487,
"step": 1260
},
{
"epoch": 0.493491354186905,
"grad_norm": 0.3248614577525704,
"learning_rate": 3.9494957991039386e-05,
"loss": 0.0548,
"step": 1270
},
{
"epoch": 0.49737711288129005,
"grad_norm": 0.3970898472778102,
"learning_rate": 3.947456170542976e-05,
"loss": 0.0489,
"step": 1280
},
{
"epoch": 0.5012628715756752,
"grad_norm": 0.4161749757032391,
"learning_rate": 3.945376715485868e-05,
"loss": 0.0526,
"step": 1290
},
{
"epoch": 0.5051486302700602,
"grad_norm": 0.3452729447702609,
"learning_rate": 3.9432574764585574e-05,
"loss": 0.048,
"step": 1300
},
{
"epoch": 0.5090343889644453,
"grad_norm": 0.47170016327450176,
"learning_rate": 3.9410984968005904e-05,
"loss": 0.0405,
"step": 1310
},
{
"epoch": 0.5129201476588304,
"grad_norm": 0.42610644718786006,
"learning_rate": 3.938899820664229e-05,
"loss": 0.0458,
"step": 1320
},
{
"epoch": 0.5168059063532154,
"grad_norm": 0.3408319733233165,
"learning_rate": 3.936661493013548e-05,
"loss": 0.0391,
"step": 1330
},
{
"epoch": 0.5206916650476006,
"grad_norm": 0.30711332622300047,
"learning_rate": 3.934383559623518e-05,
"loss": 0.042,
"step": 1340
},
{
"epoch": 0.5245774237419856,
"grad_norm": 0.5571217272080251,
"learning_rate": 3.932066067079066e-05,
"loss": 0.0431,
"step": 1350
},
{
"epoch": 0.5284631824363707,
"grad_norm": 0.4027909880638002,
"learning_rate": 3.929709062774127e-05,
"loss": 0.0374,
"step": 1360
},
{
"epoch": 0.5323489411307558,
"grad_norm": 0.30456945378456746,
"learning_rate": 3.9273125949106675e-05,
"loss": 0.0424,
"step": 1370
},
{
"epoch": 0.5362346998251408,
"grad_norm": 0.3964517011394958,
"learning_rate": 3.924876712497711e-05,
"loss": 0.0467,
"step": 1380
},
{
"epoch": 0.540120458519526,
"grad_norm": 0.3649307036795635,
"learning_rate": 3.9224014653503226e-05,
"loss": 0.0497,
"step": 1390
},
{
"epoch": 0.5440062172139111,
"grad_norm": 0.3198268167473866,
"learning_rate": 3.919886904088601e-05,
"loss": 0.0456,
"step": 1400
},
{
"epoch": 0.5478919759082961,
"grad_norm": 0.3463770316931819,
"learning_rate": 3.917333080136638e-05,
"loss": 0.0368,
"step": 1410
},
{
"epoch": 0.5517777346026812,
"grad_norm": 0.35644328141911197,
"learning_rate": 3.9147400457214674e-05,
"loss": 0.0481,
"step": 1420
},
{
"epoch": 0.5556634932970662,
"grad_norm": 0.40914490698280265,
"learning_rate": 3.9121078538719975e-05,
"loss": 0.0474,
"step": 1430
},
{
"epoch": 0.5595492519914513,
"grad_norm": 0.3427548933748361,
"learning_rate": 3.9094365584179264e-05,
"loss": 0.0385,
"step": 1440
},
{
"epoch": 0.5634350106858365,
"grad_norm": 0.4352394889252363,
"learning_rate": 3.906726213988642e-05,
"loss": 0.0383,
"step": 1450
},
{
"epoch": 0.5673207693802215,
"grad_norm": 0.29442224609914946,
"learning_rate": 3.903976876012105e-05,
"loss": 0.0499,
"step": 1460
},
{
"epoch": 0.5712065280746066,
"grad_norm": 0.2407859708001586,
"learning_rate": 3.901188600713712e-05,
"loss": 0.0459,
"step": 1470
},
{
"epoch": 0.5750922867689916,
"grad_norm": 0.3994426590406398,
"learning_rate": 3.89836144511515e-05,
"loss": 0.0399,
"step": 1480
},
{
"epoch": 0.5789780454633767,
"grad_norm": 0.3839601198021419,
"learning_rate": 3.895495467033229e-05,
"loss": 0.0455,
"step": 1490
},
{
"epoch": 0.5828638041577618,
"grad_norm": 0.5414728833934257,
"learning_rate": 3.8925907250786966e-05,
"loss": 0.0428,
"step": 1500
},
{
"epoch": 0.5867495628521469,
"grad_norm": 0.4855025115080349,
"learning_rate": 3.8896472786550444e-05,
"loss": 0.0415,
"step": 1510
},
{
"epoch": 0.590635321546532,
"grad_norm": 0.3542282158759638,
"learning_rate": 3.886665187957289e-05,
"loss": 0.05,
"step": 1520
},
{
"epoch": 0.594521080240917,
"grad_norm": 0.32139501424521916,
"learning_rate": 3.883644513970744e-05,
"loss": 0.0386,
"step": 1530
},
{
"epoch": 0.5984068389353021,
"grad_norm": 0.32361000000718626,
"learning_rate": 3.8805853184697694e-05,
"loss": 0.0407,
"step": 1540
},
{
"epoch": 0.6022925976296872,
"grad_norm": 0.2699532107849867,
"learning_rate": 3.877487664016513e-05,
"loss": 0.035,
"step": 1550
},
{
"epoch": 0.6061783563240722,
"grad_norm": 0.31749658242328194,
"learning_rate": 3.8743516139596244e-05,
"loss": 0.0449,
"step": 1560
},
{
"epoch": 0.6100641150184574,
"grad_norm": 0.38042009032251006,
"learning_rate": 3.871177232432969e-05,
"loss": 0.0433,
"step": 1570
},
{
"epoch": 0.6139498737128425,
"grad_norm": 0.27994580128185775,
"learning_rate": 3.867964584354305e-05,
"loss": 0.0375,
"step": 1580
},
{
"epoch": 0.6178356324072275,
"grad_norm": 0.44326692252464484,
"learning_rate": 3.864713735423964e-05,
"loss": 0.0527,
"step": 1590
},
{
"epoch": 0.6217213911016126,
"grad_norm": 0.338368734908726,
"learning_rate": 3.861424752123506e-05,
"loss": 0.0356,
"step": 1600
},
{
"epoch": 0.6256071497959976,
"grad_norm": 0.31262748879439145,
"learning_rate": 3.858097701714358e-05,
"loss": 0.0391,
"step": 1610
},
{
"epoch": 0.6294929084903828,
"grad_norm": 0.37350424104203495,
"learning_rate": 3.8547326522364386e-05,
"loss": 0.0437,
"step": 1620
},
{
"epoch": 0.6333786671847679,
"grad_norm": 0.3610320718135361,
"learning_rate": 3.851329672506768e-05,
"loss": 0.0412,
"step": 1630
},
{
"epoch": 0.6372644258791529,
"grad_norm": 0.46752288451915147,
"learning_rate": 3.847888832118059e-05,
"loss": 0.0426,
"step": 1640
},
{
"epoch": 0.641150184573538,
"grad_norm": 0.42134006799700907,
"learning_rate": 3.844410201437296e-05,
"loss": 0.0598,
"step": 1650
},
{
"epoch": 0.645035943267923,
"grad_norm": 0.31290423259787065,
"learning_rate": 3.840893851604294e-05,
"loss": 0.0346,
"step": 1660
},
{
"epoch": 0.6489217019623081,
"grad_norm": 0.40257644426810313,
"learning_rate": 3.837339854530243e-05,
"loss": 0.0452,
"step": 1670
},
{
"epoch": 0.6528074606566933,
"grad_norm": 0.35775285851697575,
"learning_rate": 3.833748282896241e-05,
"loss": 0.0375,
"step": 1680
},
{
"epoch": 0.6566932193510783,
"grad_norm": 0.39620710106307544,
"learning_rate": 3.8301192101518034e-05,
"loss": 0.0389,
"step": 1690
},
{
"epoch": 0.6605789780454634,
"grad_norm": 0.35116612697915695,
"learning_rate": 3.8264527105133655e-05,
"loss": 0.0416,
"step": 1700
},
{
"epoch": 0.6644647367398484,
"grad_norm": 0.3593763085620626,
"learning_rate": 3.822748858962759e-05,
"loss": 0.039,
"step": 1710
},
{
"epoch": 0.6683504954342335,
"grad_norm": 0.3636137986839074,
"learning_rate": 3.8190077312456837e-05,
"loss": 0.0437,
"step": 1720
},
{
"epoch": 0.6722362541286186,
"grad_norm": 0.3070384884366638,
"learning_rate": 3.815229403870156e-05,
"loss": 0.0441,
"step": 1730
},
{
"epoch": 0.6761220128230037,
"grad_norm": 0.452855427559548,
"learning_rate": 3.811413954104944e-05,
"loss": 0.0476,
"step": 1740
},
{
"epoch": 0.6800077715173888,
"grad_norm": 0.3645618682663316,
"learning_rate": 3.80756145997799e-05,
"loss": 0.044,
"step": 1750
},
{
"epoch": 0.6838935302117739,
"grad_norm": 0.3236245721921845,
"learning_rate": 3.8036720002748116e-05,
"loss": 0.0392,
"step": 1760
},
{
"epoch": 0.6877792889061589,
"grad_norm": 0.3755492942938037,
"learning_rate": 3.79974565453689e-05,
"loss": 0.0391,
"step": 1770
},
{
"epoch": 0.691665047600544,
"grad_norm": 0.32387171524447206,
"learning_rate": 3.795782503060049e-05,
"loss": 0.0387,
"step": 1780
},
{
"epoch": 0.695550806294929,
"grad_norm": 0.34576851613064813,
"learning_rate": 3.791782626892806e-05,
"loss": 0.041,
"step": 1790
},
{
"epoch": 0.6994365649893142,
"grad_norm": 0.3841280667838816,
"learning_rate": 3.7877461078347184e-05,
"loss": 0.0421,
"step": 1800
},
{
"epoch": 0.7033223236836993,
"grad_norm": 0.3143570778817021,
"learning_rate": 3.78367302843471e-05,
"loss": 0.0471,
"step": 1810
},
{
"epoch": 0.7072080823780843,
"grad_norm": 0.27227430217810716,
"learning_rate": 3.7795634719893824e-05,
"loss": 0.0368,
"step": 1820
},
{
"epoch": 0.7110938410724694,
"grad_norm": 0.7857732487925049,
"learning_rate": 3.7754175225413116e-05,
"loss": 0.0459,
"step": 1830
},
{
"epoch": 0.7149795997668544,
"grad_norm": 0.3672886777669717,
"learning_rate": 3.771235264877331e-05,
"loss": 0.0337,
"step": 1840
},
{
"epoch": 0.7188653584612396,
"grad_norm": 0.35164352388978687,
"learning_rate": 3.7670167845267934e-05,
"loss": 0.0385,
"step": 1850
},
{
"epoch": 0.7227511171556247,
"grad_norm": 0.40071179887944486,
"learning_rate": 3.762762167759827e-05,
"loss": 0.0467,
"step": 1860
},
{
"epoch": 0.7266368758500097,
"grad_norm": 0.4969346423226924,
"learning_rate": 3.758471501585567e-05,
"loss": 0.0356,
"step": 1870
},
{
"epoch": 0.7305226345443948,
"grad_norm": 0.3649736322224178,
"learning_rate": 3.7541448737503785e-05,
"loss": 0.0417,
"step": 1880
},
{
"epoch": 0.7344083932387798,
"grad_norm": 0.3412635607398391,
"learning_rate": 3.749782372736061e-05,
"loss": 0.036,
"step": 1890
},
{
"epoch": 0.7382941519331649,
"grad_norm": 0.2934505710318939,
"learning_rate": 3.74538408775804e-05,
"loss": 0.0442,
"step": 1900
},
{
"epoch": 0.7421799106275501,
"grad_norm": 0.1935092511725485,
"learning_rate": 3.740950108763541e-05,
"loss": 0.0429,
"step": 1910
},
{
"epoch": 0.7460656693219351,
"grad_norm": 0.2578986935595479,
"learning_rate": 3.73648052642975e-05,
"loss": 0.0392,
"step": 1920
},
{
"epoch": 0.7499514280163202,
"grad_norm": 0.28980154790022605,
"learning_rate": 3.7319754321619625e-05,
"loss": 0.0395,
"step": 1930
},
{
"epoch": 0.7538371867107053,
"grad_norm": 0.2637087037771915,
"learning_rate": 3.7274349180917094e-05,
"loss": 0.0415,
"step": 1940
},
{
"epoch": 0.7577229454050903,
"grad_norm": 0.2854361833691479,
"learning_rate": 3.722859077074875e-05,
"loss": 0.0425,
"step": 1950
},
{
"epoch": 0.7616087040994755,
"grad_norm": 0.2742967763695032,
"learning_rate": 3.718248002689799e-05,
"loss": 0.0352,
"step": 1960
},
{
"epoch": 0.7654944627938605,
"grad_norm": 0.3604576092657196,
"learning_rate": 3.7136017892353626e-05,
"loss": 0.0413,
"step": 1970
},
{
"epoch": 0.7693802214882456,
"grad_norm": 0.2855589921061211,
"learning_rate": 3.7089205317290564e-05,
"loss": 0.0458,
"step": 1980
},
{
"epoch": 0.7732659801826307,
"grad_norm": 0.3073335348834244,
"learning_rate": 3.7042043259050444e-05,
"loss": 0.0333,
"step": 1990
},
{
"epoch": 0.7771517388770157,
"grad_norm": 0.28916930774412825,
"learning_rate": 3.699453268212199e-05,
"loss": 0.0424,
"step": 2000
},
{
"epoch": 0.7810374975714008,
"grad_norm": 0.3314165818828528,
"learning_rate": 3.694667455812131e-05,
"loss": 0.0395,
"step": 2010
},
{
"epoch": 0.7849232562657859,
"grad_norm": 0.39228463874894487,
"learning_rate": 3.6898469865772055e-05,
"loss": 0.0377,
"step": 2020
},
{
"epoch": 0.788809014960171,
"grad_norm": 0.19523419184608223,
"learning_rate": 3.684991959088537e-05,
"loss": 0.0362,
"step": 2030
},
{
"epoch": 0.7926947736545561,
"grad_norm": 0.29565378498547584,
"learning_rate": 3.680102472633974e-05,
"loss": 0.0354,
"step": 2040
},
{
"epoch": 0.7965805323489411,
"grad_norm": 0.3948911652054535,
"learning_rate": 3.675178627206068e-05,
"loss": 0.0411,
"step": 2050
},
{
"epoch": 0.8004662910433262,
"grad_norm": 0.3555760694304167,
"learning_rate": 3.6702205235000315e-05,
"loss": 0.0409,
"step": 2060
},
{
"epoch": 0.8043520497377112,
"grad_norm": 0.33270407018218245,
"learning_rate": 3.665228262911676e-05,
"loss": 0.0306,
"step": 2070
},
{
"epoch": 0.8082378084320964,
"grad_norm": 0.5008566240131939,
"learning_rate": 3.660201947535338e-05,
"loss": 0.0415,
"step": 2080
},
{
"epoch": 0.8121235671264815,
"grad_norm": 0.3354219331000289,
"learning_rate": 3.655141680161793e-05,
"loss": 0.0362,
"step": 2090
},
{
"epoch": 0.8160093258208665,
"grad_norm": 0.39746802052357905,
"learning_rate": 3.650047564276152e-05,
"loss": 0.0418,
"step": 2100
},
{
"epoch": 0.8198950845152516,
"grad_norm": 0.2251580580435554,
"learning_rate": 3.644919704055748e-05,
"loss": 0.0394,
"step": 2110
},
{
"epoch": 0.8237808432096366,
"grad_norm": 0.2699540407370164,
"learning_rate": 3.639758204368001e-05,
"loss": 0.0384,
"step": 2120
},
{
"epoch": 0.8276666019040217,
"grad_norm": 0.2866910109291188,
"learning_rate": 3.6345631707682744e-05,
"loss": 0.0357,
"step": 2130
},
{
"epoch": 0.8315523605984069,
"grad_norm": 0.2713788435071836,
"learning_rate": 3.6293347094977224e-05,
"loss": 0.0409,
"step": 2140
},
{
"epoch": 0.8354381192927919,
"grad_norm": 0.3195763251154283,
"learning_rate": 3.624072927481107e-05,
"loss": 0.0403,
"step": 2150
},
{
"epoch": 0.839323877987177,
"grad_norm": 0.2983473462358474,
"learning_rate": 3.618777932324621e-05,
"loss": 0.0341,
"step": 2160
},
{
"epoch": 0.8432096366815621,
"grad_norm": 0.29251828904033916,
"learning_rate": 3.613449832313683e-05,
"loss": 0.036,
"step": 2170
},
{
"epoch": 0.8470953953759471,
"grad_norm": 0.23448459997179785,
"learning_rate": 3.608088736410718e-05,
"loss": 0.0338,
"step": 2180
},
{
"epoch": 0.8509811540703323,
"grad_norm": 0.2757212169813267,
"learning_rate": 3.6026947542529415e-05,
"loss": 0.0409,
"step": 2190
},
{
"epoch": 0.8548669127647173,
"grad_norm": 0.33239782308647875,
"learning_rate": 3.597267996150106e-05,
"loss": 0.0374,
"step": 2200
},
{
"epoch": 0.8587526714591024,
"grad_norm": 0.33403655068983207,
"learning_rate": 3.591808573082249e-05,
"loss": 0.0357,
"step": 2210
},
{
"epoch": 0.8626384301534875,
"grad_norm": 0.2378398763830906,
"learning_rate": 3.586316596697426e-05,
"loss": 0.0344,
"step": 2220
},
{
"epoch": 0.8665241888478725,
"grad_norm": 0.3216096369297081,
"learning_rate": 3.580792179309422e-05,
"loss": 0.0382,
"step": 2230
},
{
"epoch": 0.8704099475422576,
"grad_norm": 0.37090625174379876,
"learning_rate": 3.5752354338954594e-05,
"loss": 0.0383,
"step": 2240
},
{
"epoch": 0.8742957062366427,
"grad_norm": 0.26521085264464783,
"learning_rate": 3.569646474093885e-05,
"loss": 0.0346,
"step": 2250
},
{
"epoch": 0.8781814649310278,
"grad_norm": 0.30913299216593815,
"learning_rate": 3.564025414201846e-05,
"loss": 0.0373,
"step": 2260
},
{
"epoch": 0.8820672236254129,
"grad_norm": 0.3053308654147181,
"learning_rate": 3.558372369172956e-05,
"loss": 0.0339,
"step": 2270
},
{
"epoch": 0.8859529823197979,
"grad_norm": 0.2351240040993444,
"learning_rate": 3.552687454614938e-05,
"loss": 0.0331,
"step": 2280
},
{
"epoch": 0.889838741014183,
"grad_norm": 0.37288388901346237,
"learning_rate": 3.546970786787264e-05,
"loss": 0.0361,
"step": 2290
},
{
"epoch": 0.893724499708568,
"grad_norm": 0.44083971894002005,
"learning_rate": 3.541222482598779e-05,
"loss": 0.0418,
"step": 2300
},
{
"epoch": 0.8976102584029532,
"grad_norm": 0.321928529274947,
"learning_rate": 3.5354426596053066e-05,
"loss": 0.0296,
"step": 2310
},
{
"epoch": 0.9014960170973383,
"grad_norm": 0.28686426327838627,
"learning_rate": 3.529631436007246e-05,
"loss": 0.0324,
"step": 2320
},
{
"epoch": 0.9053817757917233,
"grad_norm": 0.22912536137270445,
"learning_rate": 3.523788930647157e-05,
"loss": 0.0429,
"step": 2330
},
{
"epoch": 0.9092675344861084,
"grad_norm": 0.3085807580953211,
"learning_rate": 3.5179152630073256e-05,
"loss": 0.0313,
"step": 2340
},
{
"epoch": 0.9131532931804935,
"grad_norm": 0.34303257055116365,
"learning_rate": 3.512010553207325e-05,
"loss": 0.0371,
"step": 2350
},
{
"epoch": 0.9170390518748786,
"grad_norm": 0.2772929349898867,
"learning_rate": 3.506074922001554e-05,
"loss": 0.0328,
"step": 2360
},
{
"epoch": 0.9209248105692637,
"grad_norm": 0.30788741712363193,
"learning_rate": 3.500108490776774e-05,
"loss": 0.0402,
"step": 2370
},
{
"epoch": 0.9248105692636487,
"grad_norm": 0.3455599730329004,
"learning_rate": 3.494111381549618e-05,
"loss": 0.0321,
"step": 2380
},
{
"epoch": 0.9286963279580338,
"grad_norm": 0.4074372387990901,
"learning_rate": 3.4880837169641056e-05,
"loss": 0.0384,
"step": 2390
},
{
"epoch": 0.9325820866524189,
"grad_norm": 0.2793409167713601,
"learning_rate": 3.482025620289125e-05,
"loss": 0.0317,
"step": 2400
},
{
"epoch": 0.9364678453468039,
"grad_norm": 0.3747211944134066,
"learning_rate": 3.4759372154159185e-05,
"loss": 0.0339,
"step": 2410
},
{
"epoch": 0.9403536040411891,
"grad_norm": 0.2805669269123183,
"learning_rate": 3.469818626855546e-05,
"loss": 0.0339,
"step": 2420
},
{
"epoch": 0.9442393627355741,
"grad_norm": 0.40602944782370093,
"learning_rate": 3.463669979736343e-05,
"loss": 0.042,
"step": 2430
},
{
"epoch": 0.9481251214299592,
"grad_norm": 0.225237503944127,
"learning_rate": 3.457491399801353e-05,
"loss": 0.034,
"step": 2440
},
{
"epoch": 0.9520108801243443,
"grad_norm": 0.37947299716439314,
"learning_rate": 3.451283013405764e-05,
"loss": 0.0342,
"step": 2450
},
{
"epoch": 0.9558966388187293,
"grad_norm": 0.4204039088153023,
"learning_rate": 3.445044947514322e-05,
"loss": 0.0396,
"step": 2460
},
{
"epoch": 0.9597823975131144,
"grad_norm": 0.2632256881229335,
"learning_rate": 3.438777329698733e-05,
"loss": 0.0391,
"step": 2470
},
{
"epoch": 0.9636681562074995,
"grad_norm": 0.31582365898299897,
"learning_rate": 3.432480288135057e-05,
"loss": 0.0425,
"step": 2480
},
{
"epoch": 0.9675539149018846,
"grad_norm": 0.28143075132786943,
"learning_rate": 3.426153951601082e-05,
"loss": 0.0343,
"step": 2490
},
{
"epoch": 0.9714396735962697,
"grad_norm": 0.24769109092709082,
"learning_rate": 3.419798449473698e-05,
"loss": 0.034,
"step": 2500
},
{
"epoch": 0.9753254322906547,
"grad_norm": 0.33732830476966047,
"learning_rate": 3.413413911726241e-05,
"loss": 0.0405,
"step": 2510
},
{
"epoch": 0.9792111909850398,
"grad_norm": 0.32757343882785267,
"learning_rate": 3.407000468925845e-05,
"loss": 0.0339,
"step": 2520
},
{
"epoch": 0.983096949679425,
"grad_norm": 0.30532242889882966,
"learning_rate": 3.4005582522307664e-05,
"loss": 0.0342,
"step": 2530
},
{
"epoch": 0.98698270837381,
"grad_norm": 0.25099007166252546,
"learning_rate": 3.394087393387702e-05,
"loss": 0.0336,
"step": 2540
},
{
"epoch": 0.9908684670681951,
"grad_norm": 0.32436067338283625,
"learning_rate": 3.387588024729096e-05,
"loss": 0.0399,
"step": 2550
},
{
"epoch": 0.9947542257625801,
"grad_norm": 0.4181129951433982,
"learning_rate": 3.3810602791704325e-05,
"loss": 0.0361,
"step": 2560
},
{
"epoch": 0.9986399844569652,
"grad_norm": 0.3470403621225102,
"learning_rate": 3.374504290207519e-05,
"loss": 0.0343,
"step": 2570
},
{
"epoch": 1.0023314552166311,
"grad_norm": 0.26511898565361997,
"learning_rate": 3.367920191913755e-05,
"loss": 0.0321,
"step": 2580
},
{
"epoch": 1.0062172139110162,
"grad_norm": 0.2702160755775835,
"learning_rate": 3.3613081189373914e-05,
"loss": 0.0314,
"step": 2590
},
{
"epoch": 1.0101029726054012,
"grad_norm": 0.3118992978670784,
"learning_rate": 3.3546682064987735e-05,
"loss": 0.033,
"step": 2600
},
{
"epoch": 1.0139887312997864,
"grad_norm": 0.23058487649261575,
"learning_rate": 3.34800059038758e-05,
"loss": 0.0239,
"step": 2610
},
{
"epoch": 1.0178744899941714,
"grad_norm": 0.36097802187023126,
"learning_rate": 3.341305406960045e-05,
"loss": 0.033,
"step": 2620
},
{
"epoch": 1.0217602486885564,
"grad_norm": 0.28013397849201116,
"learning_rate": 3.3345827931361666e-05,
"loss": 0.0255,
"step": 2630
},
{
"epoch": 1.0256460073829414,
"grad_norm": 0.2029992608524442,
"learning_rate": 3.32783288639691e-05,
"loss": 0.0244,
"step": 2640
},
{
"epoch": 1.0295317660773267,
"grad_norm": 0.3553166383478555,
"learning_rate": 3.321055824781394e-05,
"loss": 0.0273,
"step": 2650
},
{
"epoch": 1.0334175247717117,
"grad_norm": 0.24984422004673013,
"learning_rate": 3.31425174688407e-05,
"loss": 0.0296,
"step": 2660
},
{
"epoch": 1.0373032834660967,
"grad_norm": 0.18909677592052765,
"learning_rate": 3.307420791851887e-05,
"loss": 0.0273,
"step": 2670
},
{
"epoch": 1.041189042160482,
"grad_norm": 0.436082933063616,
"learning_rate": 3.3005630993814416e-05,
"loss": 0.041,
"step": 2680
},
{
"epoch": 1.045074800854867,
"grad_norm": 0.23112116433945518,
"learning_rate": 3.293678809716129e-05,
"loss": 0.0304,
"step": 2690
},
{
"epoch": 1.048960559549252,
"grad_norm": 0.2941866407157775,
"learning_rate": 3.28676806364327e-05,
"loss": 0.0296,
"step": 2700
},
{
"epoch": 1.0528463182436372,
"grad_norm": 0.2235983248751329,
"learning_rate": 3.279831002491232e-05,
"loss": 0.0276,
"step": 2710
},
{
"epoch": 1.0567320769380222,
"grad_norm": 0.3904007075572595,
"learning_rate": 3.27286776812654e-05,
"loss": 0.0347,
"step": 2720
},
{
"epoch": 1.0606178356324072,
"grad_norm": 0.2861997613717581,
"learning_rate": 3.2658785029509746e-05,
"loss": 0.0264,
"step": 2730
},
{
"epoch": 1.0645035943267924,
"grad_norm": 0.3566819164029042,
"learning_rate": 3.258863349898659e-05,
"loss": 0.0277,
"step": 2740
},
{
"epoch": 1.0683893530211774,
"grad_norm": 0.18390854912707696,
"learning_rate": 3.251822452433141e-05,
"loss": 0.0307,
"step": 2750
},
{
"epoch": 1.0722751117155624,
"grad_norm": 0.3062331302207858,
"learning_rate": 3.244755954544449e-05,
"loss": 0.0312,
"step": 2760
},
{
"epoch": 1.0761608704099475,
"grad_norm": 0.3047851916691974,
"learning_rate": 3.2376640007461595e-05,
"loss": 0.0339,
"step": 2770
},
{
"epoch": 1.0800466291043327,
"grad_norm": 0.2300653857497523,
"learning_rate": 3.230546736072432e-05,
"loss": 0.0297,
"step": 2780
},
{
"epoch": 1.0839323877987177,
"grad_norm": 0.20305416815265903,
"learning_rate": 3.2234043060750464e-05,
"loss": 0.0324,
"step": 2790
},
{
"epoch": 1.0878181464931027,
"grad_norm": 0.24117134662351375,
"learning_rate": 3.216236856820429e-05,
"loss": 0.0225,
"step": 2800
},
{
"epoch": 1.091703905187488,
"grad_norm": 0.23857567814841038,
"learning_rate": 3.2090445348866616e-05,
"loss": 0.037,
"step": 2810
},
{
"epoch": 1.095589663881873,
"grad_norm": 0.20744357816037623,
"learning_rate": 3.201827487360485e-05,
"loss": 0.0343,
"step": 2820
},
{
"epoch": 1.099475422576258,
"grad_norm": 0.3148313503304088,
"learning_rate": 3.194585861834292e-05,
"loss": 0.0329,
"step": 2830
},
{
"epoch": 1.103361181270643,
"grad_norm": 0.2597458756041528,
"learning_rate": 3.187319806403108e-05,
"loss": 0.0324,
"step": 2840
},
{
"epoch": 1.1072469399650282,
"grad_norm": 0.37462454725250943,
"learning_rate": 3.180029469661563e-05,
"loss": 0.0298,
"step": 2850
},
{
"epoch": 1.1111326986594132,
"grad_norm": 0.26990355348646067,
"learning_rate": 3.172715000700851e-05,
"loss": 0.0256,
"step": 2860
},
{
"epoch": 1.1150184573537982,
"grad_norm": 0.2193580972033405,
"learning_rate": 3.165376549105686e-05,
"loss": 0.035,
"step": 2870
},
{
"epoch": 1.1189042160481835,
"grad_norm": 0.2966997098167283,
"learning_rate": 3.158014264951234e-05,
"loss": 0.0341,
"step": 2880
},
{
"epoch": 1.1227899747425685,
"grad_norm": 0.3014871881210635,
"learning_rate": 3.150628298800055e-05,
"loss": 0.0328,
"step": 2890
},
{
"epoch": 1.1266757334369535,
"grad_norm": 0.33425883934391964,
"learning_rate": 3.1432188016990154e-05,
"loss": 0.0262,
"step": 2900
},
{
"epoch": 1.1305614921313387,
"grad_norm": 0.23135238874039699,
"learning_rate": 3.1357859251762005e-05,
"loss": 0.0349,
"step": 2910
},
{
"epoch": 1.1344472508257237,
"grad_norm": 0.2957844330256544,
"learning_rate": 3.1283298212378204e-05,
"loss": 0.0308,
"step": 2920
},
{
"epoch": 1.1383330095201087,
"grad_norm": 0.309553215175613,
"learning_rate": 3.120850642365094e-05,
"loss": 0.0378,
"step": 2930
},
{
"epoch": 1.142218768214494,
"grad_norm": 0.27357475926464664,
"learning_rate": 3.113348541511139e-05,
"loss": 0.0315,
"step": 2940
},
{
"epoch": 1.146104526908879,
"grad_norm": 0.2922282115073422,
"learning_rate": 3.1058236720978357e-05,
"loss": 0.0207,
"step": 2950
},
{
"epoch": 1.149990285603264,
"grad_norm": 0.30551015741363674,
"learning_rate": 3.0982761880126956e-05,
"loss": 0.0309,
"step": 2960
},
{
"epoch": 1.153876044297649,
"grad_norm": 0.19603484059399878,
"learning_rate": 3.090706243605712e-05,
"loss": 0.0275,
"step": 2970
},
{
"epoch": 1.1577618029920342,
"grad_norm": 0.12126922576885169,
"learning_rate": 3.083113993686202e-05,
"loss": 0.0214,
"step": 2980
},
{
"epoch": 1.1616475616864192,
"grad_norm": 0.24570529004069103,
"learning_rate": 3.075499593519643e-05,
"loss": 0.0313,
"step": 2990
},
{
"epoch": 1.1655333203808043,
"grad_norm": 0.21996601041925384,
"learning_rate": 3.067863198824499e-05,
"loss": 0.0322,
"step": 3000
},
{
"epoch": 1.1694190790751895,
"grad_norm": 0.26523791368864613,
"learning_rate": 3.0602049657690275e-05,
"loss": 0.033,
"step": 3010
},
{
"epoch": 1.1733048377695745,
"grad_norm": 0.3743941590932805,
"learning_rate": 3.0525250509680975e-05,
"loss": 0.0315,
"step": 3020
},
{
"epoch": 1.1771905964639595,
"grad_norm": 0.29800477638021083,
"learning_rate": 3.0448236114799798e-05,
"loss": 0.0314,
"step": 3030
},
{
"epoch": 1.1810763551583447,
"grad_norm": 0.41913805025695794,
"learning_rate": 3.0371008048031335e-05,
"loss": 0.0252,
"step": 3040
},
{
"epoch": 1.1849621138527298,
"grad_norm": 0.2972323169585253,
"learning_rate": 3.029356788872992e-05,
"loss": 0.0284,
"step": 3050
},
{
"epoch": 1.1888478725471148,
"grad_norm": 0.2548640539234551,
"learning_rate": 3.0215917220587264e-05,
"loss": 0.0259,
"step": 3060
},
{
"epoch": 1.1927336312415,
"grad_norm": 0.18370703051409343,
"learning_rate": 3.013805763160009e-05,
"loss": 0.023,
"step": 3070
},
{
"epoch": 1.196619389935885,
"grad_norm": 0.17576524808213698,
"learning_rate": 3.0059990714037678e-05,
"loss": 0.0268,
"step": 3080
},
{
"epoch": 1.20050514863027,
"grad_norm": 0.25633777261633506,
"learning_rate": 2.9981718064409284e-05,
"loss": 0.0307,
"step": 3090
},
{
"epoch": 1.204390907324655,
"grad_norm": 0.27763678833721916,
"learning_rate": 2.9903241283431472e-05,
"loss": 0.0279,
"step": 3100
},
{
"epoch": 1.2082766660190403,
"grad_norm": 0.3087663315027572,
"learning_rate": 2.9824561975995427e-05,
"loss": 0.0276,
"step": 3110
},
{
"epoch": 1.2121624247134253,
"grad_norm": 0.28744858586674227,
"learning_rate": 2.974568175113409e-05,
"loss": 0.024,
"step": 3120
},
{
"epoch": 1.2160481834078103,
"grad_norm": 0.21935445525902675,
"learning_rate": 2.9666602221989267e-05,
"loss": 0.0286,
"step": 3130
},
{
"epoch": 1.2199339421021955,
"grad_norm": 0.23246090578146508,
"learning_rate": 2.958732500577864e-05,
"loss": 0.0212,
"step": 3140
},
{
"epoch": 1.2238197007965805,
"grad_norm": 0.2698352542678301,
"learning_rate": 2.9507851723762716e-05,
"loss": 0.0308,
"step": 3150
},
{
"epoch": 1.2277054594909655,
"grad_norm": 0.4180270105674614,
"learning_rate": 2.9428184001211616e-05,
"loss": 0.0212,
"step": 3160
},
{
"epoch": 1.2315912181853508,
"grad_norm": 0.2841163727116711,
"learning_rate": 2.9348323467371897e-05,
"loss": 0.0237,
"step": 3170
},
{
"epoch": 1.2354769768797358,
"grad_norm": 0.3283735446941676,
"learning_rate": 2.9268271755433198e-05,
"loss": 0.0268,
"step": 3180
},
{
"epoch": 1.2393627355741208,
"grad_norm": 0.17964907995508142,
"learning_rate": 2.9188030502494853e-05,
"loss": 0.026,
"step": 3190
},
{
"epoch": 1.243248494268506,
"grad_norm": 0.3739453975454048,
"learning_rate": 2.9107601349532406e-05,
"loss": 0.0351,
"step": 3200
},
{
"epoch": 1.247134252962891,
"grad_norm": 0.22041723340395544,
"learning_rate": 2.9026985941364053e-05,
"loss": 0.0289,
"step": 3210
},
{
"epoch": 1.251020011657276,
"grad_norm": 0.16977718972527425,
"learning_rate": 2.8946185926617012e-05,
"loss": 0.0227,
"step": 3220
},
{
"epoch": 1.254905770351661,
"grad_norm": 0.3419426868688641,
"learning_rate": 2.88652029576938e-05,
"loss": 0.0265,
"step": 3230
},
{
"epoch": 1.2587915290460463,
"grad_norm": 0.22438684261772954,
"learning_rate": 2.878403869073843e-05,
"loss": 0.0262,
"step": 3240
},
{
"epoch": 1.2626772877404313,
"grad_norm": 0.30489902115039463,
"learning_rate": 2.8702694785602587e-05,
"loss": 0.0314,
"step": 3250
},
{
"epoch": 1.2665630464348163,
"grad_norm": 0.2434714851331239,
"learning_rate": 2.8621172905811613e-05,
"loss": 0.0275,
"step": 3260
},
{
"epoch": 1.2704488051292016,
"grad_norm": 0.27588341692139456,
"learning_rate": 2.8539474718530543e-05,
"loss": 0.0264,
"step": 3270
},
{
"epoch": 1.2743345638235866,
"grad_norm": 0.29322871157127156,
"learning_rate": 2.8457601894529997e-05,
"loss": 0.0375,
"step": 3280
},
{
"epoch": 1.2782203225179716,
"grad_norm": 0.19868867147652092,
"learning_rate": 2.8375556108151995e-05,
"loss": 0.029,
"step": 3290
},
{
"epoch": 1.2821060812123566,
"grad_norm": 0.3711457646768087,
"learning_rate": 2.829333903727574e-05,
"loss": 0.0327,
"step": 3300
},
{
"epoch": 1.2859918399067418,
"grad_norm": 0.28271933767183866,
"learning_rate": 2.821095236328328e-05,
"loss": 0.0261,
"step": 3310
},
{
"epoch": 1.2898775986011268,
"grad_norm": 0.23298730366288653,
"learning_rate": 2.812839777102514e-05,
"loss": 0.029,
"step": 3320
},
{
"epoch": 1.293763357295512,
"grad_norm": 0.23059435959988767,
"learning_rate": 2.8045676948785873e-05,
"loss": 0.0321,
"step": 3330
},
{
"epoch": 1.297649115989897,
"grad_norm": 0.19452335917095895,
"learning_rate": 2.7962791588249492e-05,
"loss": 0.029,
"step": 3340
},
{
"epoch": 1.301534874684282,
"grad_norm": 0.30937914926531507,
"learning_rate": 2.7879743384464942e-05,
"loss": 0.027,
"step": 3350
},
{
"epoch": 1.305420633378667,
"grad_norm": 0.30675762635230686,
"learning_rate": 2.7796534035811378e-05,
"loss": 0.0248,
"step": 3360
},
{
"epoch": 1.3093063920730523,
"grad_norm": 0.14564788288302782,
"learning_rate": 2.7713165243963444e-05,
"loss": 0.0242,
"step": 3370
},
{
"epoch": 1.3131921507674373,
"grad_norm": 0.3611570999157121,
"learning_rate": 2.7629638713856503e-05,
"loss": 0.0313,
"step": 3380
},
{
"epoch": 1.3170779094618223,
"grad_norm": 0.15900271035042685,
"learning_rate": 2.7545956153651712e-05,
"loss": 0.0246,
"step": 3390
},
{
"epoch": 1.3209636681562076,
"grad_norm": 0.3433667317929698,
"learning_rate": 2.746211927470117e-05,
"loss": 0.0269,
"step": 3400
},
{
"epoch": 1.3248494268505926,
"grad_norm": 0.28124492243082266,
"learning_rate": 2.737812979151284e-05,
"loss": 0.0245,
"step": 3410
},
{
"epoch": 1.3287351855449776,
"grad_norm": 0.187688472206659,
"learning_rate": 2.7293989421715542e-05,
"loss": 0.0253,
"step": 3420
},
{
"epoch": 1.3326209442393626,
"grad_norm": 0.2829084594578858,
"learning_rate": 2.720969988602379e-05,
"loss": 0.0207,
"step": 3430
},
{
"epoch": 1.3365067029337478,
"grad_norm": 0.3016772133080438,
"learning_rate": 2.7125262908202633e-05,
"loss": 0.0281,
"step": 3440
},
{
"epoch": 1.3403924616281329,
"grad_norm": 0.2338038212652911,
"learning_rate": 2.7040680215032377e-05,
"loss": 0.0322,
"step": 3450
},
{
"epoch": 1.344278220322518,
"grad_norm": 0.23012741652116103,
"learning_rate": 2.6955953536273285e-05,
"loss": 0.0342,
"step": 3460
},
{
"epoch": 1.348163979016903,
"grad_norm": 0.1789676480348028,
"learning_rate": 2.6871084604630214e-05,
"loss": 0.0302,
"step": 3470
},
{
"epoch": 1.3520497377112881,
"grad_norm": 0.2847505802479265,
"learning_rate": 2.6786075155717147e-05,
"loss": 0.0341,
"step": 3480
},
{
"epoch": 1.3559354964056731,
"grad_norm": 0.21765574449058714,
"learning_rate": 2.6700926928021736e-05,
"loss": 0.0287,
"step": 3490
},
{
"epoch": 1.3598212551000584,
"grad_norm": 0.28617645696636324,
"learning_rate": 2.6615641662869714e-05,
"loss": 0.0307,
"step": 3500
},
{
"epoch": 1.3637070137944434,
"grad_norm": 0.34355897351030124,
"learning_rate": 2.6530221104389316e-05,
"loss": 0.0232,
"step": 3510
},
{
"epoch": 1.3675927724888284,
"grad_norm": 0.19082168638857241,
"learning_rate": 2.6444666999475593e-05,
"loss": 0.0301,
"step": 3520
},
{
"epoch": 1.3714785311832136,
"grad_norm": 0.27997736990150557,
"learning_rate": 2.635898109775468e-05,
"loss": 0.0237,
"step": 3530
},
{
"epoch": 1.3753642898775986,
"grad_norm": 0.2234204944915347,
"learning_rate": 2.6273165151548047e-05,
"loss": 0.0271,
"step": 3540
},
{
"epoch": 1.3792500485719836,
"grad_norm": 0.2201998240178073,
"learning_rate": 2.6187220915836627e-05,
"loss": 0.0292,
"step": 3550
},
{
"epoch": 1.3831358072663686,
"grad_norm": 0.14320713366865984,
"learning_rate": 2.6101150148224928e-05,
"loss": 0.0288,
"step": 3560
},
{
"epoch": 1.3870215659607539,
"grad_norm": 0.2116339664682065,
"learning_rate": 2.601495460890513e-05,
"loss": 0.0269,
"step": 3570
},
{
"epoch": 1.3909073246551389,
"grad_norm": 0.24959055444114717,
"learning_rate": 2.5928636060621036e-05,
"loss": 0.0337,
"step": 3580
},
{
"epoch": 1.3947930833495241,
"grad_norm": 0.2954702102324182,
"learning_rate": 2.5842196268632068e-05,
"loss": 0.0228,
"step": 3590
},
{
"epoch": 1.3986788420439091,
"grad_norm": 0.38085393262983136,
"learning_rate": 2.5755637000677124e-05,
"loss": 0.0247,
"step": 3600
},
{
"epoch": 1.4025646007382941,
"grad_norm": 0.21224922937032303,
"learning_rate": 2.566896002693845e-05,
"loss": 0.0252,
"step": 3610
},
{
"epoch": 1.4064503594326792,
"grad_norm": 0.3607238719844902,
"learning_rate": 2.5582167120005467e-05,
"loss": 0.0258,
"step": 3620
},
{
"epoch": 1.4103361181270644,
"grad_norm": 0.19260362661029898,
"learning_rate": 2.549526005483844e-05,
"loss": 0.0235,
"step": 3630
},
{
"epoch": 1.4142218768214494,
"grad_norm": 0.26994983197422656,
"learning_rate": 2.5408240608732277e-05,
"loss": 0.0253,
"step": 3640
},
{
"epoch": 1.4181076355158344,
"grad_norm": 0.29929031715132187,
"learning_rate": 2.5321110561280106e-05,
"loss": 0.0275,
"step": 3650
},
{
"epoch": 1.4219933942102196,
"grad_norm": 0.3196437552669507,
"learning_rate": 2.523387169433692e-05,
"loss": 0.0311,
"step": 3660
},
{
"epoch": 1.4258791529046047,
"grad_norm": 0.2296744794340079,
"learning_rate": 2.514652579198312e-05,
"loss": 0.0278,
"step": 3670
},
{
"epoch": 1.4297649115989897,
"grad_norm": 0.16892616731103247,
"learning_rate": 2.5059074640488047e-05,
"loss": 0.0263,
"step": 3680
},
{
"epoch": 1.4336506702933747,
"grad_norm": 0.2394458582673369,
"learning_rate": 2.497152002827345e-05,
"loss": 0.0277,
"step": 3690
},
{
"epoch": 1.43753642898776,
"grad_norm": 0.1974720107762794,
"learning_rate": 2.488386374587688e-05,
"loss": 0.0288,
"step": 3700
},
{
"epoch": 1.441422187682145,
"grad_norm": 0.21920589738520604,
"learning_rate": 2.479610758591511e-05,
"loss": 0.0216,
"step": 3710
},
{
"epoch": 1.4453079463765302,
"grad_norm": 0.21707363978296285,
"learning_rate": 2.4708253343047456e-05,
"loss": 0.0292,
"step": 3720
},
{
"epoch": 1.4491937050709152,
"grad_norm": 0.18601726681673092,
"learning_rate": 2.4620302813939093e-05,
"loss": 0.0245,
"step": 3730
},
{
"epoch": 1.4530794637653002,
"grad_norm": 0.2635531513187159,
"learning_rate": 2.4532257797224287e-05,
"loss": 0.0295,
"step": 3740
},
{
"epoch": 1.4569652224596852,
"grad_norm": 0.3251174321742817,
"learning_rate": 2.4444120093469632e-05,
"loss": 0.0298,
"step": 3750
},
{
"epoch": 1.4608509811540702,
"grad_norm": 0.4977849600583359,
"learning_rate": 2.4355891505137216e-05,
"loss": 0.0325,
"step": 3760
},
{
"epoch": 1.4647367398484554,
"grad_norm": 0.1750593962332665,
"learning_rate": 2.4267573836547768e-05,
"loss": 0.022,
"step": 3770
},
{
"epoch": 1.4686224985428404,
"grad_norm": 0.271292018124954,
"learning_rate": 2.417916889384374e-05,
"loss": 0.0281,
"step": 3780
},
{
"epoch": 1.4725082572372257,
"grad_norm": 0.2221308512528936,
"learning_rate": 2.4090678484952416e-05,
"loss": 0.0209,
"step": 3790
},
{
"epoch": 1.4763940159316107,
"grad_norm": 0.3356499834645714,
"learning_rate": 2.400210441954888e-05,
"loss": 0.0286,
"step": 3800
},
{
"epoch": 1.4802797746259957,
"grad_norm": 0.19837629045900065,
"learning_rate": 2.3913448509019047e-05,
"loss": 0.0268,
"step": 3810
},
{
"epoch": 1.4841655333203807,
"grad_norm": 0.22984338791117062,
"learning_rate": 2.3824712566422613e-05,
"loss": 0.0324,
"step": 3820
},
{
"epoch": 1.488051292014766,
"grad_norm": 0.149938685223149,
"learning_rate": 2.3735898406455945e-05,
"loss": 0.0204,
"step": 3830
},
{
"epoch": 1.491937050709151,
"grad_norm": 0.2297420680782252,
"learning_rate": 2.364700784541504e-05,
"loss": 0.0281,
"step": 3840
},
{
"epoch": 1.495822809403536,
"grad_norm": 0.21161937179194085,
"learning_rate": 2.3558042701158294e-05,
"loss": 0.0296,
"step": 3850
},
{
"epoch": 1.4997085680979212,
"grad_norm": 0.2681086390248988,
"learning_rate": 2.34690047930694e-05,
"loss": 0.024,
"step": 3860
},
{
"epoch": 1.5035943267923062,
"grad_norm": 0.26044197357045645,
"learning_rate": 2.337989594202009e-05,
"loss": 0.0242,
"step": 3870
},
{
"epoch": 1.5074800854866912,
"grad_norm": 0.2581684322985289,
"learning_rate": 2.3290717970332918e-05,
"loss": 0.0236,
"step": 3880
},
{
"epoch": 1.5113658441810762,
"grad_norm": 0.23713590285989272,
"learning_rate": 2.3201472701744013e-05,
"loss": 0.0251,
"step": 3890
},
{
"epoch": 1.5152516028754615,
"grad_norm": 0.2501508071933915,
"learning_rate": 2.3112161961365724e-05,
"loss": 0.0301,
"step": 3900
},
{
"epoch": 1.5191373615698465,
"grad_norm": 0.24802013587982075,
"learning_rate": 2.302278757564937e-05,
"loss": 0.0261,
"step": 3910
},
{
"epoch": 1.5230231202642317,
"grad_norm": 0.3445587520016882,
"learning_rate": 2.2933351372347822e-05,
"loss": 0.0279,
"step": 3920
},
{
"epoch": 1.5269088789586167,
"grad_norm": 0.22828900210653533,
"learning_rate": 2.2843855180478167e-05,
"loss": 0.0224,
"step": 3930
},
{
"epoch": 1.5307946376530017,
"grad_norm": 0.14993393496314023,
"learning_rate": 2.2754300830284287e-05,
"loss": 0.0228,
"step": 3940
},
{
"epoch": 1.5346803963473867,
"grad_norm": 0.22112935196732578,
"learning_rate": 2.266469015319943e-05,
"loss": 0.0298,
"step": 3950
},
{
"epoch": 1.5385661550417717,
"grad_norm": 0.19193655052829417,
"learning_rate": 2.2575024981808763e-05,
"loss": 0.0253,
"step": 3960
},
{
"epoch": 1.542451913736157,
"grad_norm": 0.2670658908912582,
"learning_rate": 2.2485307149811894e-05,
"loss": 0.029,
"step": 3970
},
{
"epoch": 1.5463376724305422,
"grad_norm": 0.2243861088423283,
"learning_rate": 2.2395538491985338e-05,
"loss": 0.0246,
"step": 3980
},
{
"epoch": 1.5502234311249272,
"grad_norm": 0.22493381727938802,
"learning_rate": 2.230572084414507e-05,
"loss": 0.0253,
"step": 3990
},
{
"epoch": 1.5541091898193122,
"grad_norm": 0.2866864450702556,
"learning_rate": 2.2215856043108896e-05,
"loss": 0.0291,
"step": 4000
},
{
"epoch": 1.5579949485136972,
"grad_norm": 0.16815062034208167,
"learning_rate": 2.212594592665896e-05,
"loss": 0.025,
"step": 4010
},
{
"epoch": 1.5618807072080823,
"grad_norm": 0.2612232980649845,
"learning_rate": 2.2035992333504127e-05,
"loss": 0.0326,
"step": 4020
},
{
"epoch": 1.5657664659024675,
"grad_norm": 0.33854813622270197,
"learning_rate": 2.1945997103242344e-05,
"loss": 0.0292,
"step": 4030
},
{
"epoch": 1.5696522245968525,
"grad_norm": 0.2617347504462213,
"learning_rate": 2.1855962076323115e-05,
"loss": 0.021,
"step": 4040
},
{
"epoch": 1.5735379832912377,
"grad_norm": 0.1548116373638503,
"learning_rate": 2.1765889094009762e-05,
"loss": 0.0269,
"step": 4050
},
{
"epoch": 1.5774237419856227,
"grad_norm": 0.19918562909410628,
"learning_rate": 2.167577999834185e-05,
"loss": 0.0237,
"step": 4060
},
{
"epoch": 1.5813095006800078,
"grad_norm": 0.11447983969077741,
"learning_rate": 2.1585636632097446e-05,
"loss": 0.0235,
"step": 4070
},
{
"epoch": 1.5851952593743928,
"grad_norm": 0.1598181388976202,
"learning_rate": 2.1495460838755492e-05,
"loss": 0.0249,
"step": 4080
},
{
"epoch": 1.5890810180687778,
"grad_norm": 0.1998709332001755,
"learning_rate": 2.140525446245808e-05,
"loss": 0.0223,
"step": 4090
},
{
"epoch": 1.592966776763163,
"grad_norm": 0.23042732127911256,
"learning_rate": 2.1315019347972723e-05,
"loss": 0.025,
"step": 4100
},
{
"epoch": 1.5968525354575482,
"grad_norm": 0.28671695920142976,
"learning_rate": 2.1224757340654672e-05,
"loss": 0.0282,
"step": 4110
},
{
"epoch": 1.6007382941519332,
"grad_norm": 0.2013168208153807,
"learning_rate": 2.1134470286409118e-05,
"loss": 0.0223,
"step": 4120
},
{
"epoch": 1.6046240528463183,
"grad_norm": 0.22132045622532287,
"learning_rate": 2.10441600316535e-05,
"loss": 0.0209,
"step": 4130
},
{
"epoch": 1.6085098115407033,
"grad_norm": 0.18246569785225308,
"learning_rate": 2.095382842327971e-05,
"loss": 0.0258,
"step": 4140
},
{
"epoch": 1.6123955702350883,
"grad_norm": 0.15024515227995486,
"learning_rate": 2.086347730861633e-05,
"loss": 0.018,
"step": 4150
},
{
"epoch": 1.6162813289294735,
"grad_norm": 0.21028730383896185,
"learning_rate": 2.077310853539086e-05,
"loss": 0.0246,
"step": 4160
},
{
"epoch": 1.6201670876238585,
"grad_norm": 0.18683373176287732,
"learning_rate": 2.068272395169193e-05,
"loss": 0.0207,
"step": 4170
},
{
"epoch": 1.6240528463182438,
"grad_norm": 0.2838911905964799,
"learning_rate": 2.0592325405931498e-05,
"loss": 0.0282,
"step": 4180
},
{
"epoch": 1.6279386050126288,
"grad_norm": 0.28170440553062887,
"learning_rate": 2.050191474680705e-05,
"loss": 0.0226,
"step": 4190
},
{
"epoch": 1.6318243637070138,
"grad_norm": 0.20578915632228978,
"learning_rate": 2.04114938232638e-05,
"loss": 0.0221,
"step": 4200
},
{
"epoch": 1.6357101224013988,
"grad_norm": 0.3124892105648976,
"learning_rate": 2.0321064484456875e-05,
"loss": 0.0199,
"step": 4210
},
{
"epoch": 1.6395958810957838,
"grad_norm": 0.1570324139378916,
"learning_rate": 2.0230628579713505e-05,
"loss": 0.0255,
"step": 4220
},
{
"epoch": 1.643481639790169,
"grad_norm": 0.24972195531663,
"learning_rate": 2.0140187958495187e-05,
"loss": 0.0252,
"step": 4230
},
{
"epoch": 1.6473673984845543,
"grad_norm": 0.2018721430297068,
"learning_rate": 2.004974447035988e-05,
"loss": 0.0225,
"step": 4240
},
{
"epoch": 1.6512531571789393,
"grad_norm": 0.19112522378491964,
"learning_rate": 1.9959299964924156e-05,
"loss": 0.024,
"step": 4250
},
{
"epoch": 1.6551389158733243,
"grad_norm": 0.14928748310133091,
"learning_rate": 1.9868856291825417e-05,
"loss": 0.0199,
"step": 4260
},
{
"epoch": 1.6590246745677093,
"grad_norm": 0.22380334602146915,
"learning_rate": 1.9778415300684033e-05,
"loss": 0.0274,
"step": 4270
},
{
"epoch": 1.6629104332620943,
"grad_norm": 0.24112965797496708,
"learning_rate": 1.9687978841065514e-05,
"loss": 0.0217,
"step": 4280
},
{
"epoch": 1.6667961919564795,
"grad_norm": 0.16406141595970072,
"learning_rate": 1.9597548762442712e-05,
"loss": 0.0283,
"step": 4290
},
{
"epoch": 1.6706819506508646,
"grad_norm": 0.1948101464396577,
"learning_rate": 1.9507126914157973e-05,
"loss": 0.0251,
"step": 4300
},
{
"epoch": 1.6745677093452498,
"grad_norm": 0.2956340770250799,
"learning_rate": 1.941671514538536e-05,
"loss": 0.0185,
"step": 4310
},
{
"epoch": 1.6784534680396348,
"grad_norm": 0.12177397943751574,
"learning_rate": 1.9326315305092746e-05,
"loss": 0.0263,
"step": 4320
},
{
"epoch": 1.6823392267340198,
"grad_norm": 0.19695542020311577,
"learning_rate": 1.923592924200412e-05,
"loss": 0.021,
"step": 4330
},
{
"epoch": 1.6862249854284048,
"grad_norm": 0.21181764645601958,
"learning_rate": 1.9145558804561686e-05,
"loss": 0.0254,
"step": 4340
},
{
"epoch": 1.6901107441227898,
"grad_norm": 0.20176994374162602,
"learning_rate": 1.90552058408881e-05,
"loss": 0.0247,
"step": 4350
},
{
"epoch": 1.693996502817175,
"grad_norm": 0.23272965832055748,
"learning_rate": 1.8964872198748694e-05,
"loss": 0.0207,
"step": 4360
},
{
"epoch": 1.6978822615115603,
"grad_norm": 0.17179212412190722,
"learning_rate": 1.8874559725513618e-05,
"loss": 0.0236,
"step": 4370
},
{
"epoch": 1.7017680202059453,
"grad_norm": 0.2088323094557007,
"learning_rate": 1.8784270268120148e-05,
"loss": 0.0215,
"step": 4380
},
{
"epoch": 1.7056537789003303,
"grad_norm": 0.25618504375528284,
"learning_rate": 1.869400567303486e-05,
"loss": 0.0253,
"step": 4390
},
{
"epoch": 1.7095395375947153,
"grad_norm": 0.29335176738438906,
"learning_rate": 1.8603767786215886e-05,
"loss": 0.0247,
"step": 4400
},
{
"epoch": 1.7134252962891003,
"grad_norm": 0.19297739454353136,
"learning_rate": 1.8513558453075145e-05,
"loss": 0.0176,
"step": 4410
},
{
"epoch": 1.7173110549834856,
"grad_norm": 0.21907186035510737,
"learning_rate": 1.8423379518440637e-05,
"loss": 0.0252,
"step": 4420
},
{
"epoch": 1.7211968136778706,
"grad_norm": 0.1662707063311211,
"learning_rate": 1.833323282651869e-05,
"loss": 0.0264,
"step": 4430
},
{
"epoch": 1.7250825723722558,
"grad_norm": 0.2782775098963104,
"learning_rate": 1.824312022085625e-05,
"loss": 0.0256,
"step": 4440
},
{
"epoch": 1.7289683310666408,
"grad_norm": 0.2244270504850629,
"learning_rate": 1.8153043544303187e-05,
"loss": 0.022,
"step": 4450
},
{
"epoch": 1.7328540897610258,
"grad_norm": 0.19984321027915625,
"learning_rate": 1.806300463897459e-05,
"loss": 0.0258,
"step": 4460
},
{
"epoch": 1.7367398484554109,
"grad_norm": 0.34358544847861805,
"learning_rate": 1.7973005346213112e-05,
"loss": 0.0247,
"step": 4470
},
{
"epoch": 1.7406256071497959,
"grad_norm": 0.258634512592744,
"learning_rate": 1.7883047506551323e-05,
"loss": 0.0241,
"step": 4480
},
{
"epoch": 1.744511365844181,
"grad_norm": 0.2993875451174172,
"learning_rate": 1.779313295967404e-05,
"loss": 0.025,
"step": 4490
},
{
"epoch": 1.748397124538566,
"grad_norm": 0.2524644984996203,
"learning_rate": 1.7703263544380712e-05,
"loss": 0.0256,
"step": 4500
},
{
"epoch": 1.7522828832329513,
"grad_norm": 0.15203592796958135,
"learning_rate": 1.7613441098547844e-05,
"loss": 0.0277,
"step": 4510
},
{
"epoch": 1.7561686419273363,
"grad_norm": 0.3339303036694701,
"learning_rate": 1.7523667459091372e-05,
"loss": 0.0253,
"step": 4520
},
{
"epoch": 1.7600544006217214,
"grad_norm": 0.10575616905501665,
"learning_rate": 1.743394446192915e-05,
"loss": 0.0239,
"step": 4530
},
{
"epoch": 1.7639401593161064,
"grad_norm": 0.23336439363322206,
"learning_rate": 1.734427394194331e-05,
"loss": 0.0177,
"step": 4540
},
{
"epoch": 1.7678259180104916,
"grad_norm": 0.18372002116915823,
"learning_rate": 1.725465773294286e-05,
"loss": 0.0238,
"step": 4550
},
{
"epoch": 1.7717116767048766,
"grad_norm": 0.4517812923928783,
"learning_rate": 1.7165097667626085e-05,
"loss": 0.0241,
"step": 4560
},
{
"epoch": 1.7755974353992618,
"grad_norm": 0.3032985382795743,
"learning_rate": 1.7075595577543112e-05,
"loss": 0.0246,
"step": 4570
},
{
"epoch": 1.7794831940936469,
"grad_norm": 0.2883198267970469,
"learning_rate": 1.698615329305846e-05,
"loss": 0.0363,
"step": 4580
},
{
"epoch": 1.7833689527880319,
"grad_norm": 0.212183942746847,
"learning_rate": 1.6896772643313545e-05,
"loss": 0.0184,
"step": 4590
},
{
"epoch": 1.7872547114824169,
"grad_norm": 0.20532524865736027,
"learning_rate": 1.6807455456189375e-05,
"loss": 0.0238,
"step": 4600
},
{
"epoch": 1.791140470176802,
"grad_norm": 0.34333620427983147,
"learning_rate": 1.671820355826909e-05,
"loss": 0.0262,
"step": 4610
},
{
"epoch": 1.7950262288711871,
"grad_norm": 0.20001883941281645,
"learning_rate": 1.6629018774800626e-05,
"loss": 0.0272,
"step": 4620
},
{
"epoch": 1.7989119875655721,
"grad_norm": 0.34691765585073897,
"learning_rate": 1.6539902929659398e-05,
"loss": 0.025,
"step": 4630
},
{
"epoch": 1.8027977462599574,
"grad_norm": 0.2742832126748164,
"learning_rate": 1.6450857845310995e-05,
"loss": 0.0235,
"step": 4640
},
{
"epoch": 1.8066835049543424,
"grad_norm": 0.28750335882588557,
"learning_rate": 1.6361885342773928e-05,
"loss": 0.0217,
"step": 4650
},
{
"epoch": 1.8105692636487274,
"grad_norm": 0.20616162983525588,
"learning_rate": 1.6272987241582344e-05,
"loss": 0.0234,
"step": 4660
},
{
"epoch": 1.8144550223431124,
"grad_norm": 0.28032518340338625,
"learning_rate": 1.6184165359748873e-05,
"loss": 0.0241,
"step": 4670
},
{
"epoch": 1.8183407810374974,
"grad_norm": 0.19596210439624756,
"learning_rate": 1.6095421513727393e-05,
"loss": 0.0195,
"step": 4680
},
{
"epoch": 1.8222265397318826,
"grad_norm": 0.23644823136897267,
"learning_rate": 1.600675751837591e-05,
"loss": 0.0219,
"step": 4690
},
{
"epoch": 1.8261122984262679,
"grad_norm": 0.2630361162326872,
"learning_rate": 1.591817518691947e-05,
"loss": 0.0228,
"step": 4700
},
{
"epoch": 1.8299980571206529,
"grad_norm": 0.2595409653034801,
"learning_rate": 1.582967633091303e-05,
"loss": 0.0231,
"step": 4710
},
{
"epoch": 1.833883815815038,
"grad_norm": 0.3326391117683167,
"learning_rate": 1.5741262760204424e-05,
"loss": 0.0291,
"step": 4720
},
{
"epoch": 1.837769574509423,
"grad_norm": 0.24035275119514785,
"learning_rate": 1.5652936282897365e-05,
"loss": 0.0219,
"step": 4730
},
{
"epoch": 1.841655333203808,
"grad_norm": 0.244068103767282,
"learning_rate": 1.5564698705314457e-05,
"loss": 0.0224,
"step": 4740
},
{
"epoch": 1.8455410918981932,
"grad_norm": 0.24784872474552172,
"learning_rate": 1.5476551831960283e-05,
"loss": 0.0269,
"step": 4750
},
{
"epoch": 1.8494268505925782,
"grad_norm": 0.2820854981793686,
"learning_rate": 1.5388497465484427e-05,
"loss": 0.0236,
"step": 4760
},
{
"epoch": 1.8533126092869634,
"grad_norm": 0.18652076792233116,
"learning_rate": 1.5300537406644707e-05,
"loss": 0.0248,
"step": 4770
},
{
"epoch": 1.8571983679813484,
"grad_norm": 0.21164601939248687,
"learning_rate": 1.5212673454270275e-05,
"loss": 0.0187,
"step": 4780
},
{
"epoch": 1.8610841266757334,
"grad_norm": 0.19650678419372633,
"learning_rate": 1.5124907405224857e-05,
"loss": 0.0202,
"step": 4790
},
{
"epoch": 1.8649698853701184,
"grad_norm": 0.19671419185547953,
"learning_rate": 1.5037241054370031e-05,
"loss": 0.0226,
"step": 4800
},
{
"epoch": 1.8688556440645034,
"grad_norm": 0.28963451060296624,
"learning_rate": 1.4949676194528443e-05,
"loss": 0.0212,
"step": 4810
},
{
"epoch": 1.8727414027588887,
"grad_norm": 0.2291236266085277,
"learning_rate": 1.4862214616447246e-05,
"loss": 0.0265,
"step": 4820
},
{
"epoch": 1.876627161453274,
"grad_norm": 0.21720512675401074,
"learning_rate": 1.4774858108761399e-05,
"loss": 0.0247,
"step": 4830
},
{
"epoch": 1.880512920147659,
"grad_norm": 0.24043869727722034,
"learning_rate": 1.4687608457957131e-05,
"loss": 0.0237,
"step": 4840
},
{
"epoch": 1.884398678842044,
"grad_norm": 0.3039350538024942,
"learning_rate": 1.4600467448335377e-05,
"loss": 0.0209,
"step": 4850
},
{
"epoch": 1.888284437536429,
"grad_norm": 0.27910808051006536,
"learning_rate": 1.4513436861975309e-05,
"loss": 0.0239,
"step": 4860
},
{
"epoch": 1.892170196230814,
"grad_norm": 0.186631017294127,
"learning_rate": 1.4426518478697898e-05,
"loss": 0.0267,
"step": 4870
},
{
"epoch": 1.8960559549251992,
"grad_norm": 0.1881734007091214,
"learning_rate": 1.4339714076029485e-05,
"loss": 0.0218,
"step": 4880
},
{
"epoch": 1.8999417136195842,
"grad_norm": 0.13892970586490075,
"learning_rate": 1.4253025429165464e-05,
"loss": 0.0212,
"step": 4890
},
{
"epoch": 1.9038274723139694,
"grad_norm": 0.288495543969294,
"learning_rate": 1.4166454310933941e-05,
"loss": 0.0209,
"step": 4900
},
{
"epoch": 1.9077132310083544,
"grad_norm": 0.2555017196077912,
"learning_rate": 1.4080002491759519e-05,
"loss": 0.0244,
"step": 4910
},
{
"epoch": 1.9115989897027394,
"grad_norm": 0.18634334770662844,
"learning_rate": 1.3993671739627072e-05,
"loss": 0.0217,
"step": 4920
},
{
"epoch": 1.9154847483971245,
"grad_norm": 0.3164288797558731,
"learning_rate": 1.3907463820045589e-05,
"loss": 0.0222,
"step": 4930
},
{
"epoch": 1.9193705070915095,
"grad_norm": 0.2818084680815898,
"learning_rate": 1.382138049601205e-05,
"loss": 0.0218,
"step": 4940
},
{
"epoch": 1.9232562657858947,
"grad_norm": 0.36520600715684526,
"learning_rate": 1.3735423527975416e-05,
"loss": 0.0268,
"step": 4950
},
{
"epoch": 1.92714202448028,
"grad_norm": 0.29834225940452946,
"learning_rate": 1.3649594673800585e-05,
"loss": 0.027,
"step": 4960
},
{
"epoch": 1.931027783174665,
"grad_norm": 0.24025465966717943,
"learning_rate": 1.3563895688732476e-05,
"loss": 0.0201,
"step": 4970
},
{
"epoch": 1.93491354186905,
"grad_norm": 0.36547751994851313,
"learning_rate": 1.3478328325360108e-05,
"loss": 0.0175,
"step": 4980
},
{
"epoch": 1.938799300563435,
"grad_norm": 0.30354949309324647,
"learning_rate": 1.3392894333580757e-05,
"loss": 0.0222,
"step": 4990
},
{
"epoch": 1.94268505925782,
"grad_norm": 0.23953269705131022,
"learning_rate": 1.3307595460564196e-05,
"loss": 0.0245,
"step": 5000
},
{
"epoch": 1.9465708179522052,
"grad_norm": 0.25015753705105026,
"learning_rate": 1.3222433450716939e-05,
"loss": 0.0273,
"step": 5010
},
{
"epoch": 1.9504565766465902,
"grad_norm": 0.1601753057955312,
"learning_rate": 1.3137410045646593e-05,
"loss": 0.0223,
"step": 5020
},
{
"epoch": 1.9543423353409755,
"grad_norm": 0.2242880052150341,
"learning_rate": 1.3052526984126192e-05,
"loss": 0.0183,
"step": 5030
},
{
"epoch": 1.9582280940353605,
"grad_norm": 0.2607444099422183,
"learning_rate": 1.2967786002058712e-05,
"loss": 0.0241,
"step": 5040
},
{
"epoch": 1.9621138527297455,
"grad_norm": 0.24290540890508416,
"learning_rate": 1.2883188832441496e-05,
"loss": 0.0226,
"step": 5050
},
{
"epoch": 1.9659996114241305,
"grad_norm": 0.3409873620160428,
"learning_rate": 1.2798737205330869e-05,
"loss": 0.041,
"step": 5060
},
{
"epoch": 1.9698853701185155,
"grad_norm": 0.1874649872758006,
"learning_rate": 1.2714432847806721e-05,
"loss": 0.0256,
"step": 5070
},
{
"epoch": 1.9737711288129007,
"grad_norm": 0.18360888920333324,
"learning_rate": 1.263027748393721e-05,
"loss": 0.0219,
"step": 5080
},
{
"epoch": 1.9776568875072857,
"grad_norm": 0.23427360199598463,
"learning_rate": 1.2546272834743496e-05,
"loss": 0.0221,
"step": 5090
},
{
"epoch": 1.981542646201671,
"grad_norm": 0.31230044573711585,
"learning_rate": 1.2462420618164548e-05,
"loss": 0.0204,
"step": 5100
},
{
"epoch": 1.985428404896056,
"grad_norm": 0.250237465475584,
"learning_rate": 1.2378722549022e-05,
"loss": 0.02,
"step": 5110
},
{
"epoch": 1.989314163590441,
"grad_norm": 0.24977255925651687,
"learning_rate": 1.2295180338985092e-05,
"loss": 0.0221,
"step": 5120
},
{
"epoch": 1.993199922284826,
"grad_norm": 0.3918157450288067,
"learning_rate": 1.2211795696535664e-05,
"loss": 0.0264,
"step": 5130
},
{
"epoch": 1.9970856809792112,
"grad_norm": 0.24230360473123272,
"learning_rate": 1.2128570326933224e-05,
"loss": 0.0167,
"step": 5140
},
{
"epoch": 2.000777151738877,
"grad_norm": 0.19021696383018177,
"learning_rate": 1.2045505932180069e-05,
"loss": 0.0227,
"step": 5150
},
{
"epoch": 2.0046629104332623,
"grad_norm": 0.1996009218747256,
"learning_rate": 1.1962604210986455e-05,
"loss": 0.0214,
"step": 5160
},
{
"epoch": 2.0085486691276473,
"grad_norm": 0.25730395478860757,
"learning_rate": 1.18798668587359e-05,
"loss": 0.0209,
"step": 5170
},
{
"epoch": 2.0124344278220323,
"grad_norm": 0.1771816589400498,
"learning_rate": 1.179729556745048e-05,
"loss": 0.0188,
"step": 5180
},
{
"epoch": 2.0163201865164173,
"grad_norm": 0.1992628058679068,
"learning_rate": 1.171489202575624e-05,
"loss": 0.0159,
"step": 5190
},
{
"epoch": 2.0202059452108023,
"grad_norm": 0.15396300645984096,
"learning_rate": 1.163265791884868e-05,
"loss": 0.0165,
"step": 5200
},
{
"epoch": 2.0240917039051873,
"grad_norm": 0.2379685245286237,
"learning_rate": 1.1550594928458224e-05,
"loss": 0.011,
"step": 5210
},
{
"epoch": 2.027977462599573,
"grad_norm": 0.1501164586957791,
"learning_rate": 1.14687047328159e-05,
"loss": 0.0124,
"step": 5220
},
{
"epoch": 2.031863221293958,
"grad_norm": 0.22785045077674615,
"learning_rate": 1.138698900661901e-05,
"loss": 0.0173,
"step": 5230
},
{
"epoch": 2.035748979988343,
"grad_norm": 0.19489104500055154,
"learning_rate": 1.130544942099685e-05,
"loss": 0.0156,
"step": 5240
},
{
"epoch": 2.039634738682728,
"grad_norm": 0.2578847419234991,
"learning_rate": 1.1224087643476525e-05,
"loss": 0.0138,
"step": 5250
},
{
"epoch": 2.043520497377113,
"grad_norm": 0.24662743252166966,
"learning_rate": 1.1142905337948905e-05,
"loss": 0.0145,
"step": 5260
},
{
"epoch": 2.047406256071498,
"grad_norm": 0.2704090578235268,
"learning_rate": 1.1061904164634547e-05,
"loss": 0.0201,
"step": 5270
},
{
"epoch": 2.051292014765883,
"grad_norm": 0.2277257711569673,
"learning_rate": 1.0981085780049783e-05,
"loss": 0.0156,
"step": 5280
},
{
"epoch": 2.0551777734602683,
"grad_norm": 0.24393861223637625,
"learning_rate": 1.0900451836972779e-05,
"loss": 0.0157,
"step": 5290
},
{
"epoch": 2.0590635321546533,
"grad_norm": 0.22148212227817382,
"learning_rate": 1.0820003984409809e-05,
"loss": 0.0182,
"step": 5300
},
{
"epoch": 2.0629492908490383,
"grad_norm": 0.17873179551577242,
"learning_rate": 1.0739743867561484e-05,
"loss": 0.0223,
"step": 5310
},
{
"epoch": 2.0668350495434233,
"grad_norm": 0.26424010200649295,
"learning_rate": 1.0659673127789123e-05,
"loss": 0.0149,
"step": 5320
},
{
"epoch": 2.0707208082378084,
"grad_norm": 0.14038451344369343,
"learning_rate": 1.0579793402581208e-05,
"loss": 0.0125,
"step": 5330
},
{
"epoch": 2.0746065669321934,
"grad_norm": 0.22811068872894222,
"learning_rate": 1.050010632551983e-05,
"loss": 0.0132,
"step": 5340
},
{
"epoch": 2.078492325626579,
"grad_norm": 0.20031881335249865,
"learning_rate": 1.0420613526247356e-05,
"loss": 0.0166,
"step": 5350
},
{
"epoch": 2.082378084320964,
"grad_norm": 0.2633505577038377,
"learning_rate": 1.0341316630433062e-05,
"loss": 0.0146,
"step": 5360
},
{
"epoch": 2.086263843015349,
"grad_norm": 0.18868039147588853,
"learning_rate": 1.0262217259739897e-05,
"loss": 0.0117,
"step": 5370
},
{
"epoch": 2.090149601709734,
"grad_norm": 0.1483051784876369,
"learning_rate": 1.0183317031791318e-05,
"loss": 0.0175,
"step": 5380
},
{
"epoch": 2.094035360404119,
"grad_norm": 0.1594861493991722,
"learning_rate": 1.0104617560138205e-05,
"loss": 0.0117,
"step": 5390
},
{
"epoch": 2.097921119098504,
"grad_norm": 0.26030303220977924,
"learning_rate": 1.0026120454225877e-05,
"loss": 0.018,
"step": 5400
},
{
"epoch": 2.101806877792889,
"grad_norm": 0.2308638280548361,
"learning_rate": 9.947827319361152e-06,
"loss": 0.0178,
"step": 5410
},
{
"epoch": 2.1056926364872743,
"grad_norm": 0.10900704882374063,
"learning_rate": 9.869739756679551e-06,
"loss": 0.0165,
"step": 5420
},
{
"epoch": 2.1095783951816593,
"grad_norm": 0.21796588094597985,
"learning_rate": 9.791859363112521e-06,
"loss": 0.0191,
"step": 5430
},
{
"epoch": 2.1134641538760444,
"grad_norm": 0.1672673980678181,
"learning_rate": 9.714187731354805e-06,
"loss": 0.0177,
"step": 5440
},
{
"epoch": 2.1173499125704294,
"grad_norm": 0.2684028767592411,
"learning_rate": 9.63672644983185e-06,
"loss": 0.0162,
"step": 5450
},
{
"epoch": 2.1212356712648144,
"grad_norm": 0.2649724905943757,
"learning_rate": 9.559477102667331e-06,
"loss": 0.0159,
"step": 5460
},
{
"epoch": 2.1251214299591994,
"grad_norm": 0.1855609816934914,
"learning_rate": 9.482441269650762e-06,
"loss": 0.0168,
"step": 5470
},
{
"epoch": 2.129007188653585,
"grad_norm": 0.20799545221446245,
"learning_rate": 9.405620526205173e-06,
"loss": 0.0229,
"step": 5480
},
{
"epoch": 2.13289294734797,
"grad_norm": 0.2169295403294615,
"learning_rate": 9.329016443354899e-06,
"loss": 0.0166,
"step": 5490
},
{
"epoch": 2.136778706042355,
"grad_norm": 0.20155345013696116,
"learning_rate": 9.25263058769347e-06,
"loss": 0.0164,
"step": 5500
},
{
"epoch": 2.14066446473674,
"grad_norm": 0.24455184564544635,
"learning_rate": 9.176464521351517e-06,
"loss": 0.0137,
"step": 5510
},
{
"epoch": 2.144550223431125,
"grad_norm": 0.14491602829427766,
"learning_rate": 9.100519801964913e-06,
"loss": 0.0124,
"step": 5520
},
{
"epoch": 2.14843598212551,
"grad_norm": 0.31403014524681455,
"learning_rate": 9.024797982642841e-06,
"loss": 0.0199,
"step": 5530
},
{
"epoch": 2.152321740819895,
"grad_norm": 0.16133633738818282,
"learning_rate": 8.949300611936065e-06,
"loss": 0.0177,
"step": 5540
},
{
"epoch": 2.1562074995142804,
"grad_norm": 0.34872137324864605,
"learning_rate": 8.874029233805269e-06,
"loss": 0.0176,
"step": 5550
},
{
"epoch": 2.1600932582086654,
"grad_norm": 0.09809795832817098,
"learning_rate": 8.798985387589436e-06,
"loss": 0.0108,
"step": 5560
},
{
"epoch": 2.1639790169030504,
"grad_norm": 0.2159359078852452,
"learning_rate": 8.724170607974454e-06,
"loss": 0.0147,
"step": 5570
},
{
"epoch": 2.1678647755974354,
"grad_norm": 0.21468205780280344,
"learning_rate": 8.649586424961645e-06,
"loss": 0.0141,
"step": 5580
},
{
"epoch": 2.1717505342918204,
"grad_norm": 0.2249780220781056,
"learning_rate": 8.575234363836526e-06,
"loss": 0.0186,
"step": 5590
},
{
"epoch": 2.1756362929862054,
"grad_norm": 0.24124720508302283,
"learning_rate": 8.501115945137577e-06,
"loss": 0.0144,
"step": 5600
},
{
"epoch": 2.1795220516805904,
"grad_norm": 0.26759279337799535,
"learning_rate": 8.427232684625186e-06,
"loss": 0.0193,
"step": 5610
},
{
"epoch": 2.183407810374976,
"grad_norm": 0.2297544777314867,
"learning_rate": 8.353586093250642e-06,
"loss": 0.0147,
"step": 5620
},
{
"epoch": 2.187293569069361,
"grad_norm": 0.23619415311251218,
"learning_rate": 8.280177677125214e-06,
"loss": 0.0166,
"step": 5630
},
{
"epoch": 2.191179327763746,
"grad_norm": 0.1943560804911627,
"learning_rate": 8.207008937489364e-06,
"loss": 0.0172,
"step": 5640
},
{
"epoch": 2.195065086458131,
"grad_norm": 0.22431037235771287,
"learning_rate": 8.134081370682038e-06,
"loss": 0.0161,
"step": 5650
},
{
"epoch": 2.198950845152516,
"grad_norm": 0.21729765338858503,
"learning_rate": 8.061396468110074e-06,
"loss": 0.0175,
"step": 5660
},
{
"epoch": 2.202836603846901,
"grad_norm": 0.18338237613630443,
"learning_rate": 7.988955716217719e-06,
"loss": 0.0106,
"step": 5670
},
{
"epoch": 2.206722362541286,
"grad_norm": 0.20665577127269266,
"learning_rate": 7.916760596456197e-06,
"loss": 0.0138,
"step": 5680
},
{
"epoch": 2.2106081212356714,
"grad_norm": 0.2316552326389069,
"learning_rate": 7.84481258525341e-06,
"loss": 0.0141,
"step": 5690
},
{
"epoch": 2.2144938799300564,
"grad_norm": 0.26108396067769657,
"learning_rate": 7.773113153983787e-06,
"loss": 0.0169,
"step": 5700
},
{
"epoch": 2.2183796386244414,
"grad_norm": 0.2243677380846199,
"learning_rate": 7.701663768938146e-06,
"loss": 0.0145,
"step": 5710
},
{
"epoch": 2.2222653973188264,
"grad_norm": 0.18542682485202877,
"learning_rate": 7.630465891293766e-06,
"loss": 0.0184,
"step": 5720
},
{
"epoch": 2.2261511560132115,
"grad_norm": 0.16604306041103264,
"learning_rate": 7.559520977084416e-06,
"loss": 0.0134,
"step": 5730
},
{
"epoch": 2.2300369147075965,
"grad_norm": 0.15271062795031004,
"learning_rate": 7.4888304771706675e-06,
"loss": 0.0174,
"step": 5740
},
{
"epoch": 2.233922673401982,
"grad_norm": 0.22620464359511355,
"learning_rate": 7.418395837210177e-06,
"loss": 0.0183,
"step": 5750
},
{
"epoch": 2.237808432096367,
"grad_norm": 0.1932286630859644,
"learning_rate": 7.34821849762813e-06,
"loss": 0.015,
"step": 5760
},
{
"epoch": 2.241694190790752,
"grad_norm": 0.20920536899184783,
"learning_rate": 7.278299893587784e-06,
"loss": 0.0128,
"step": 5770
},
{
"epoch": 2.245579949485137,
"grad_norm": 0.1681609340326629,
"learning_rate": 7.20864145496112e-06,
"loss": 0.0139,
"step": 5780
},
{
"epoch": 2.249465708179522,
"grad_norm": 0.23030013864097168,
"learning_rate": 7.139244606299603e-06,
"loss": 0.0202,
"step": 5790
},
{
"epoch": 2.253351466873907,
"grad_norm": 0.20837500136002646,
"learning_rate": 7.070110766805045e-06,
"loss": 0.0151,
"step": 5800
},
{
"epoch": 2.257237225568292,
"grad_norm": 0.09892626670584977,
"learning_rate": 7.001241350300585e-06,
"loss": 0.0139,
"step": 5810
},
{
"epoch": 2.2611229842626774,
"grad_norm": 0.2421628202226961,
"learning_rate": 6.932637765201767e-06,
"loss": 0.0167,
"step": 5820
},
{
"epoch": 2.2650087429570624,
"grad_norm": 0.22005154896696727,
"learning_rate": 6.86430141448775e-06,
"loss": 0.0163,
"step": 5830
},
{
"epoch": 2.2688945016514475,
"grad_norm": 0.18906159377923581,
"learning_rate": 6.796233695672611e-06,
"loss": 0.0141,
"step": 5840
},
{
"epoch": 2.2727802603458325,
"grad_norm": 0.1882303122476742,
"learning_rate": 6.728436000776759e-06,
"loss": 0.0167,
"step": 5850
},
{
"epoch": 2.2766660190402175,
"grad_norm": 0.20942395612766407,
"learning_rate": 6.6609097162984785e-06,
"loss": 0.0167,
"step": 5860
},
{
"epoch": 2.2805517777346025,
"grad_norm": 0.17048648869426325,
"learning_rate": 6.593656223185565e-06,
"loss": 0.0158,
"step": 5870
},
{
"epoch": 2.284437536428988,
"grad_norm": 0.18912326858465023,
"learning_rate": 6.526676896807092e-06,
"loss": 0.0172,
"step": 5880
},
{
"epoch": 2.288323295123373,
"grad_norm": 0.22429991946158165,
"learning_rate": 6.459973106925272e-06,
"loss": 0.0138,
"step": 5890
},
{
"epoch": 2.292209053817758,
"grad_norm": 0.2719200154391883,
"learning_rate": 6.393546217667464e-06,
"loss": 0.0137,
"step": 5900
},
{
"epoch": 2.296094812512143,
"grad_norm": 0.23193225915033114,
"learning_rate": 6.327397587498254e-06,
"loss": 0.0202,
"step": 5910
},
{
"epoch": 2.299980571206528,
"grad_norm": 0.1773945719864865,
"learning_rate": 6.26152856919169e-06,
"loss": 0.0269,
"step": 5920
},
{
"epoch": 2.303866329900913,
"grad_norm": 0.18357413512611562,
"learning_rate": 6.19594050980361e-06,
"loss": 0.0133,
"step": 5930
},
{
"epoch": 2.307752088595298,
"grad_norm": 0.10329466907615475,
"learning_rate": 6.130634750644102e-06,
"loss": 0.0243,
"step": 5940
},
{
"epoch": 2.3116378472896835,
"grad_norm": 0.26127302357511983,
"learning_rate": 6.0656126272500485e-06,
"loss": 0.0155,
"step": 5950
},
{
"epoch": 2.3155236059840685,
"grad_norm": 0.15622640887182293,
"learning_rate": 6.000875469357841e-06,
"loss": 0.0101,
"step": 5960
},
{
"epoch": 2.3194093646784535,
"grad_norm": 0.253685898111279,
"learning_rate": 5.936424600876194e-06,
"loss": 0.0189,
"step": 5970
},
{
"epoch": 2.3232951233728385,
"grad_norm": 0.29404993122697953,
"learning_rate": 5.872261339859038e-06,
"loss": 0.0184,
"step": 5980
},
{
"epoch": 2.3271808820672235,
"grad_norm": 0.14796428743903128,
"learning_rate": 5.8083869984785836e-06,
"loss": 0.0147,
"step": 5990
},
{
"epoch": 2.3310666407616085,
"grad_norm": 0.20683726050768395,
"learning_rate": 5.7448028829984745e-06,
"loss": 0.0188,
"step": 6000
},
{
"epoch": 2.334952399455994,
"grad_norm": 0.21092382019033648,
"learning_rate": 5.681510293747092e-06,
"loss": 0.016,
"step": 6010
},
{
"epoch": 2.338838158150379,
"grad_norm": 0.21550234513080346,
"learning_rate": 5.618510525090966e-06,
"loss": 0.0189,
"step": 6020
},
{
"epoch": 2.342723916844764,
"grad_norm": 0.16535694850993649,
"learning_rate": 5.555804865408279e-06,
"loss": 0.0191,
"step": 6030
},
{
"epoch": 2.346609675539149,
"grad_norm": 0.21745148122170188,
"learning_rate": 5.4933945970625225e-06,
"loss": 0.0137,
"step": 6040
},
{
"epoch": 2.350495434233534,
"grad_norm": 0.19584922849210273,
"learning_rate": 5.431280996376294e-06,
"loss": 0.0184,
"step": 6050
},
{
"epoch": 2.354381192927919,
"grad_norm": 0.2120146691664677,
"learning_rate": 5.369465333605172e-06,
"loss": 0.0136,
"step": 6060
},
{
"epoch": 2.358266951622304,
"grad_norm": 0.23504940817120928,
"learning_rate": 5.307948872911772e-06,
"loss": 0.0108,
"step": 6070
},
{
"epoch": 2.3621527103166895,
"grad_norm": 0.2544407938266695,
"learning_rate": 5.246732872339852e-06,
"loss": 0.0162,
"step": 6080
},
{
"epoch": 2.3660384690110745,
"grad_norm": 0.1787782566980142,
"learning_rate": 5.185818583788596e-06,
"loss": 0.0167,
"step": 6090
},
{
"epoch": 2.3699242277054595,
"grad_norm": 0.18779821624668364,
"learning_rate": 5.125207252987034e-06,
"loss": 0.0164,
"step": 6100
},
{
"epoch": 2.3738099863998445,
"grad_norm": 0.23079303888097902,
"learning_rate": 5.064900119468544e-06,
"loss": 0.0172,
"step": 6110
},
{
"epoch": 2.3776957450942295,
"grad_norm": 0.15381004609158025,
"learning_rate": 5.004898416545529e-06,
"loss": 0.0184,
"step": 6120
},
{
"epoch": 2.3815815037886146,
"grad_norm": 0.23537158479264925,
"learning_rate": 4.945203371284147e-06,
"loss": 0.0123,
"step": 6130
},
{
"epoch": 2.385467262483,
"grad_norm": 0.15335724927915312,
"learning_rate": 4.8858162044792654e-06,
"loss": 0.0102,
"step": 6140
},
{
"epoch": 2.389353021177385,
"grad_norm": 0.2832041836288288,
"learning_rate": 4.826738130629473e-06,
"loss": 0.0149,
"step": 6150
},
{
"epoch": 2.39323877987177,
"grad_norm": 0.27813345843062076,
"learning_rate": 4.767970357912246e-06,
"loss": 0.0165,
"step": 6160
},
{
"epoch": 2.397124538566155,
"grad_norm": 0.15264755246893946,
"learning_rate": 4.7095140881592395e-06,
"loss": 0.0153,
"step": 6170
},
{
"epoch": 2.40101029726054,
"grad_norm": 0.24434480556286678,
"learning_rate": 4.65137051683171e-06,
"loss": 0.0203,
"step": 6180
},
{
"epoch": 2.404896055954925,
"grad_norm": 0.17160358108568669,
"learning_rate": 4.593540832996071e-06,
"loss": 0.0126,
"step": 6190
},
{
"epoch": 2.40878181464931,
"grad_norm": 0.2220871800365551,
"learning_rate": 4.53602621929957e-06,
"loss": 0.0174,
"step": 6200
},
{
"epoch": 2.4126675733436955,
"grad_norm": 0.19387829594662281,
"learning_rate": 4.478827851946102e-06,
"loss": 0.0131,
"step": 6210
},
{
"epoch": 2.4165533320380805,
"grad_norm": 0.2720443716921079,
"learning_rate": 4.421946900672165e-06,
"loss": 0.0174,
"step": 6220
},
{
"epoch": 2.4204390907324655,
"grad_norm": 0.3727208802934669,
"learning_rate": 4.365384528722931e-06,
"loss": 0.0152,
"step": 6230
},
{
"epoch": 2.4243248494268506,
"grad_norm": 0.2178296686386825,
"learning_rate": 4.309141892828459e-06,
"loss": 0.0142,
"step": 6240
},
{
"epoch": 2.4282106081212356,
"grad_norm": 0.22728601802630374,
"learning_rate": 4.2532201431800344e-06,
"loss": 0.0115,
"step": 6250
},
{
"epoch": 2.4320963668156206,
"grad_norm": 0.23843429430202012,
"learning_rate": 4.197620423406657e-06,
"loss": 0.0157,
"step": 6260
},
{
"epoch": 2.435982125510006,
"grad_norm": 0.19116227547407497,
"learning_rate": 4.1423438705516415e-06,
"loss": 0.0154,
"step": 6270
},
{
"epoch": 2.439867884204391,
"grad_norm": 0.22767774713522007,
"learning_rate": 4.087391615049374e-06,
"loss": 0.0141,
"step": 6280
},
{
"epoch": 2.443753642898776,
"grad_norm": 0.29789802335991833,
"learning_rate": 4.03276478070219e-06,
"loss": 0.0159,
"step": 6290
},
{
"epoch": 2.447639401593161,
"grad_norm": 0.24719135376807377,
"learning_rate": 3.978464484657392e-06,
"loss": 0.0157,
"step": 6300
},
{
"epoch": 2.451525160287546,
"grad_norm": 0.32391016652290383,
"learning_rate": 3.924491837384406e-06,
"loss": 0.0149,
"step": 6310
},
{
"epoch": 2.455410918981931,
"grad_norm": 0.20232458142688142,
"learning_rate": 3.87084794265206e-06,
"loss": 0.015,
"step": 6320
},
{
"epoch": 2.459296677676316,
"grad_norm": 0.2712629616510874,
"learning_rate": 3.817533897506036e-06,
"loss": 0.0133,
"step": 6330
},
{
"epoch": 2.4631824363707016,
"grad_norm": 0.17142782056379016,
"learning_rate": 3.764550792246411e-06,
"loss": 0.0178,
"step": 6340
},
{
"epoch": 2.4670681950650866,
"grad_norm": 0.2625497363608154,
"learning_rate": 3.7118997104053557e-06,
"loss": 0.0177,
"step": 6350
},
{
"epoch": 2.4709539537594716,
"grad_norm": 0.23418764898240915,
"learning_rate": 3.659581728725017e-06,
"loss": 0.0216,
"step": 6360
},
{
"epoch": 2.4748397124538566,
"grad_norm": 0.2421515106857572,
"learning_rate": 3.607597917135448e-06,
"loss": 0.0102,
"step": 6370
},
{
"epoch": 2.4787254711482416,
"grad_norm": 0.18002030517689566,
"learning_rate": 3.5559493387327603e-06,
"loss": 0.0125,
"step": 6380
},
{
"epoch": 2.4826112298426266,
"grad_norm": 0.22127119378874585,
"learning_rate": 3.5046370497573558e-06,
"loss": 0.0171,
"step": 6390
},
{
"epoch": 2.486496988537012,
"grad_norm": 0.24067053535828525,
"learning_rate": 3.4536620995723524e-06,
"loss": 0.0131,
"step": 6400
},
{
"epoch": 2.490382747231397,
"grad_norm": 0.3548945672530695,
"learning_rate": 3.4030255306421254e-06,
"loss": 0.0172,
"step": 6410
},
{
"epoch": 2.494268505925782,
"grad_norm": 0.34342268520149605,
"learning_rate": 3.3527283785109565e-06,
"loss": 0.0134,
"step": 6420
},
{
"epoch": 2.498154264620167,
"grad_norm": 0.26685487402837554,
"learning_rate": 3.3027716717818925e-06,
"loss": 0.0121,
"step": 6430
},
{
"epoch": 2.502040023314552,
"grad_norm": 0.23865361157570286,
"learning_rate": 3.2531564320956745e-06,
"loss": 0.0134,
"step": 6440
},
{
"epoch": 2.505925782008937,
"grad_norm": 0.2262219648398464,
"learning_rate": 3.2038836741098756e-06,
"loss": 0.0164,
"step": 6450
},
{
"epoch": 2.509811540703322,
"grad_norm": 0.11415732152468076,
"learning_rate": 3.15495440547815e-06,
"loss": 0.0126,
"step": 6460
},
{
"epoch": 2.513697299397707,
"grad_norm": 0.1637749047242322,
"learning_rate": 3.1063696268296063e-06,
"loss": 0.0152,
"step": 6470
},
{
"epoch": 2.5175830580920926,
"grad_norm": 0.19217591389578745,
"learning_rate": 3.0581303317483367e-06,
"loss": 0.0171,
"step": 6480
},
{
"epoch": 2.5214688167864776,
"grad_norm": 0.1744921556168132,
"learning_rate": 3.0102375067531375e-06,
"loss": 0.0133,
"step": 6490
},
{
"epoch": 2.5253545754808626,
"grad_norm": 0.19272649371157077,
"learning_rate": 2.962692131277296e-06,
"loss": 0.0131,
"step": 6500
},
{
"epoch": 2.5292403341752476,
"grad_norm": 0.23696107664845892,
"learning_rate": 2.9154951776485905e-06,
"loss": 0.0169,
"step": 6510
},
{
"epoch": 2.5331260928696326,
"grad_norm": 0.16637778487180105,
"learning_rate": 2.8686476110693796e-06,
"loss": 0.0167,
"step": 6520
},
{
"epoch": 2.537011851564018,
"grad_norm": 0.22456349132177617,
"learning_rate": 2.822150389596867e-06,
"loss": 0.0134,
"step": 6530
},
{
"epoch": 2.540897610258403,
"grad_norm": 0.3349504222512524,
"learning_rate": 2.7760044641235295e-06,
"loss": 0.0144,
"step": 6540
},
{
"epoch": 2.544783368952788,
"grad_norm": 0.23230866877649609,
"learning_rate": 2.730210778357649e-06,
"loss": 0.0124,
"step": 6550
},
{
"epoch": 2.548669127647173,
"grad_norm": 0.2876662681394313,
"learning_rate": 2.6847702688040357e-06,
"loss": 0.0174,
"step": 6560
},
{
"epoch": 2.552554886341558,
"grad_norm": 0.24429770597495445,
"learning_rate": 2.6396838647448353e-06,
"loss": 0.0153,
"step": 6570
},
{
"epoch": 2.556440645035943,
"grad_norm": 0.18614206123634575,
"learning_rate": 2.594952488220577e-06,
"loss": 0.0138,
"step": 6580
},
{
"epoch": 2.560326403730328,
"grad_norm": 0.1707554494222318,
"learning_rate": 2.550577054011274e-06,
"loss": 0.0097,
"step": 6590
},
{
"epoch": 2.564212162424713,
"grad_norm": 0.14383952391367794,
"learning_rate": 2.5065584696177414e-06,
"loss": 0.0149,
"step": 6600
},
{
"epoch": 2.5680979211190986,
"grad_norm": 0.18379892796393424,
"learning_rate": 2.4628976352430376e-06,
"loss": 0.011,
"step": 6610
},
{
"epoch": 2.5719836798134836,
"grad_norm": 0.24670126413024498,
"learning_rate": 2.419595443774023e-06,
"loss": 0.0133,
"step": 6620
},
{
"epoch": 2.5758694385078686,
"grad_norm": 0.20077779669194387,
"learning_rate": 2.3766527807631422e-06,
"loss": 0.0143,
"step": 6630
},
{
"epoch": 2.5797551972022537,
"grad_norm": 0.22864135011568798,
"learning_rate": 2.33407052441029e-06,
"loss": 0.0195,
"step": 6640
},
{
"epoch": 2.5836409558966387,
"grad_norm": 0.1525621735784211,
"learning_rate": 2.291849545544853e-06,
"loss": 0.0186,
"step": 6650
},
{
"epoch": 2.587526714591024,
"grad_norm": 0.23345566519875988,
"learning_rate": 2.249990707607912e-06,
"loss": 0.0134,
"step": 6660
},
{
"epoch": 2.591412473285409,
"grad_norm": 0.2280149763833634,
"learning_rate": 2.2084948666345695e-06,
"loss": 0.0139,
"step": 6670
},
{
"epoch": 2.595298231979794,
"grad_norm": 0.26595705322222424,
"learning_rate": 2.1673628712364538e-06,
"loss": 0.0142,
"step": 6680
},
{
"epoch": 2.599183990674179,
"grad_norm": 0.3089658507273733,
"learning_rate": 2.126595562584357e-06,
"loss": 0.0182,
"step": 6690
},
{
"epoch": 2.603069749368564,
"grad_norm": 0.2145635655464109,
"learning_rate": 2.0861937743910456e-06,
"loss": 0.0143,
"step": 6700
},
{
"epoch": 2.606955508062949,
"grad_norm": 0.22760812179389878,
"learning_rate": 2.046158332894195e-06,
"loss": 0.014,
"step": 6710
},
{
"epoch": 2.610841266757334,
"grad_norm": 0.20004208111148616,
"learning_rate": 2.006490056839496e-06,
"loss": 0.0138,
"step": 6720
},
{
"epoch": 2.614727025451719,
"grad_norm": 0.1855756971876611,
"learning_rate": 1.9671897574639233e-06,
"loss": 0.0157,
"step": 6730
},
{
"epoch": 2.6186127841461047,
"grad_norm": 0.23769731157003207,
"learning_rate": 1.928258238479133e-06,
"loss": 0.0164,
"step": 6740
},
{
"epoch": 2.6224985428404897,
"grad_norm": 0.17194198866392246,
"learning_rate": 1.8896962960550214e-06,
"loss": 0.0123,
"step": 6750
},
{
"epoch": 2.6263843015348747,
"grad_norm": 0.25716223538323435,
"learning_rate": 1.8515047188034651e-06,
"loss": 0.0132,
"step": 6760
},
{
"epoch": 2.6302700602292597,
"grad_norm": 0.23923230278799787,
"learning_rate": 1.8136842877621697e-06,
"loss": 0.0132,
"step": 6770
},
{
"epoch": 2.6341558189236447,
"grad_norm": 0.18419856300682974,
"learning_rate": 1.7762357763787097e-06,
"loss": 0.011,
"step": 6780
},
{
"epoch": 2.63804157761803,
"grad_norm": 0.1915730985861053,
"learning_rate": 1.7391599504947043e-06,
"loss": 0.013,
"step": 6790
},
{
"epoch": 2.641927336312415,
"grad_norm": 0.1566125187690401,
"learning_rate": 1.702457568330167e-06,
"loss": 0.0116,
"step": 6800
},
{
"epoch": 2.6458130950068,
"grad_norm": 0.16344705005127574,
"learning_rate": 1.666129380467989e-06,
"loss": 0.0124,
"step": 6810
},
{
"epoch": 2.649698853701185,
"grad_norm": 0.19446375972044477,
"learning_rate": 1.63017612983859e-06,
"loss": 0.0154,
"step": 6820
},
{
"epoch": 2.65358461239557,
"grad_norm": 0.2483349003041754,
"learning_rate": 1.5945985517047336e-06,
"loss": 0.0176,
"step": 6830
},
{
"epoch": 2.657470371089955,
"grad_norm": 0.1788660816854284,
"learning_rate": 1.5593973736464718e-06,
"loss": 0.0158,
"step": 6840
},
{
"epoch": 2.66135612978434,
"grad_norm": 0.31388430244796356,
"learning_rate": 1.5245733155462937e-06,
"loss": 0.0117,
"step": 6850
},
{
"epoch": 2.6652418884787252,
"grad_norm": 0.09656141608090874,
"learning_rate": 1.4901270895743803e-06,
"loss": 0.0179,
"step": 6860
},
{
"epoch": 2.6691276471731107,
"grad_norm": 0.2572799089734027,
"learning_rate": 1.4560594001740503e-06,
"loss": 0.0125,
"step": 6870
},
{
"epoch": 2.6730134058674957,
"grad_norm": 0.19767701320137746,
"learning_rate": 1.4223709440473466e-06,
"loss": 0.0149,
"step": 6880
},
{
"epoch": 2.6768991645618807,
"grad_norm": 0.2720993871756462,
"learning_rate": 1.3890624101407957e-06,
"loss": 0.0142,
"step": 6890
},
{
"epoch": 2.6807849232562657,
"grad_norm": 0.26464941453049734,
"learning_rate": 1.356134479631328e-06,
"loss": 0.0146,
"step": 6900
},
{
"epoch": 2.6846706819506507,
"grad_norm": 0.19524351475983762,
"learning_rate": 1.3235878259123226e-06,
"loss": 0.0182,
"step": 6910
},
{
"epoch": 2.688556440645036,
"grad_norm": 0.26891449363176717,
"learning_rate": 1.2914231145798462e-06,
"loss": 0.0185,
"step": 6920
},
{
"epoch": 2.692442199339421,
"grad_norm": 0.3011765177624915,
"learning_rate": 1.2596410034190543e-06,
"loss": 0.014,
"step": 6930
},
{
"epoch": 2.696327958033806,
"grad_norm": 0.3020550635806977,
"learning_rate": 1.228242142390721e-06,
"loss": 0.0128,
"step": 6940
},
{
"epoch": 2.700213716728191,
"grad_norm": 0.24847929057204657,
"learning_rate": 1.1972271736179653e-06,
"loss": 0.0126,
"step": 6950
},
{
"epoch": 2.7040994754225762,
"grad_norm": 0.14385502966292424,
"learning_rate": 1.166596731373102e-06,
"loss": 0.0142,
"step": 6960
},
{
"epoch": 2.7079852341169612,
"grad_norm": 0.27969175657328743,
"learning_rate": 1.1363514420646738e-06,
"loss": 0.0122,
"step": 6970
},
{
"epoch": 2.7118709928113462,
"grad_norm": 0.23975714101635787,
"learning_rate": 1.1064919242246486e-06,
"loss": 0.018,
"step": 6980
},
{
"epoch": 2.7157567515057313,
"grad_norm": 0.18202493395247812,
"learning_rate": 1.0770187884957673e-06,
"loss": 0.0161,
"step": 6990
},
{
"epoch": 2.7196425102001167,
"grad_norm": 0.15583830698569281,
"learning_rate": 1.0479326376190602e-06,
"loss": 0.0121,
"step": 7000
},
{
"epoch": 2.7235282688945017,
"grad_norm": 0.23832508481487272,
"learning_rate": 1.0192340664214995e-06,
"loss": 0.0154,
"step": 7010
},
{
"epoch": 2.7274140275888867,
"grad_norm": 0.16847478281366923,
"learning_rate": 9.909236618038665e-07,
"loss": 0.0146,
"step": 7020
},
{
"epoch": 2.7312997862832717,
"grad_norm": 0.30569535019277033,
"learning_rate": 9.630020027287213e-07,
"loss": 0.0181,
"step": 7030
},
{
"epoch": 2.7351855449776568,
"grad_norm": 0.23680858512407482,
"learning_rate": 9.354696602085833e-07,
"loss": 0.0179,
"step": 7040
},
{
"epoch": 2.739071303672042,
"grad_norm": 0.24801887023205305,
"learning_rate": 9.083271972942431e-07,
"loss": 0.0178,
"step": 7050
},
{
"epoch": 2.7429570623664272,
"grad_norm": 0.18244403571549045,
"learning_rate": 8.815751690632423e-07,
"loss": 0.0134,
"step": 7060
},
{
"epoch": 2.7468428210608122,
"grad_norm": 0.3177601765145437,
"learning_rate": 8.552141226085408e-07,
"loss": 0.0135,
"step": 7070
},
{
"epoch": 2.7507285797551972,
"grad_norm": 0.2478869375091213,
"learning_rate": 8.292445970273055e-07,
"loss": 0.0131,
"step": 7080
},
{
"epoch": 2.7546143384495823,
"grad_norm": 0.16351159527021386,
"learning_rate": 8.03667123409908e-07,
"loss": 0.0116,
"step": 7090
},
{
"epoch": 2.7585000971439673,
"grad_norm": 0.16164434978464962,
"learning_rate": 7.784822248290424e-07,
"loss": 0.0151,
"step": 7100
},
{
"epoch": 2.7623858558383523,
"grad_norm": 0.19368227723734358,
"learning_rate": 7.53690416329047e-07,
"loss": 0.0115,
"step": 7110
},
{
"epoch": 2.7662716145327373,
"grad_norm": 0.19844203086570225,
"learning_rate": 7.292922049153528e-07,
"loss": 0.0158,
"step": 7120
},
{
"epoch": 2.7701573732271227,
"grad_norm": 0.12080846948403764,
"learning_rate": 7.052880895441339e-07,
"loss": 0.0117,
"step": 7130
},
{
"epoch": 2.7740431319215078,
"grad_norm": 0.15523717768836803,
"learning_rate": 6.816785611120913e-07,
"loss": 0.0133,
"step": 7140
},
{
"epoch": 2.7779288906158928,
"grad_norm": 0.25279826141773354,
"learning_rate": 6.584641024464122e-07,
"loss": 0.0108,
"step": 7150
},
{
"epoch": 2.7818146493102778,
"grad_norm": 0.1026232646491208,
"learning_rate": 6.356451882949088e-07,
"loss": 0.0137,
"step": 7160
},
{
"epoch": 2.785700408004663,
"grad_norm": 0.1843621477660204,
"learning_rate": 6.132222853162972e-07,
"loss": 0.0171,
"step": 7170
},
{
"epoch": 2.7895861666990482,
"grad_norm": 0.2974243267699617,
"learning_rate": 5.911958520706562e-07,
"loss": 0.0105,
"step": 7180
},
{
"epoch": 2.7934719253934333,
"grad_norm": 0.1450131937905085,
"learning_rate": 5.695663390100548e-07,
"loss": 0.0181,
"step": 7190
},
{
"epoch": 2.7973576840878183,
"grad_norm": 0.19449961006516722,
"learning_rate": 5.483341884693327e-07,
"loss": 0.0151,
"step": 7200
},
{
"epoch": 2.8012434427822033,
"grad_norm": 0.231835400931022,
"learning_rate": 5.274998346570659e-07,
"loss": 0.0174,
"step": 7210
},
{
"epoch": 2.8051292014765883,
"grad_norm": 0.18990330928555402,
"learning_rate": 5.070637036466753e-07,
"loss": 0.0163,
"step": 7220
},
{
"epoch": 2.8090149601709733,
"grad_norm": 0.19725672735140468,
"learning_rate": 4.870262133677072e-07,
"loss": 0.0117,
"step": 7230
},
{
"epoch": 2.8129007188653583,
"grad_norm": 0.3493633046330908,
"learning_rate": 4.6738777359731866e-07,
"loss": 0.017,
"step": 7240
},
{
"epoch": 2.8167864775597433,
"grad_norm": 0.2268035823210108,
"learning_rate": 4.481487859518563e-07,
"loss": 0.0112,
"step": 7250
},
{
"epoch": 2.8206722362541288,
"grad_norm": 0.26841266073002745,
"learning_rate": 4.293096438786726e-07,
"loss": 0.0176,
"step": 7260
},
{
"epoch": 2.824557994948514,
"grad_norm": 0.20423668146731536,
"learning_rate": 4.108707326480632e-07,
"loss": 0.0128,
"step": 7270
},
{
"epoch": 2.828443753642899,
"grad_norm": 0.1743068137356259,
"learning_rate": 3.9283242934539555e-07,
"loss": 0.0111,
"step": 7280
},
{
"epoch": 2.832329512337284,
"grad_norm": 0.2340589582843017,
"learning_rate": 3.751951028633971e-07,
"loss": 0.0125,
"step": 7290
},
{
"epoch": 2.836215271031669,
"grad_norm": 0.21054160485426174,
"learning_rate": 3.5795911389461033e-07,
"loss": 0.012,
"step": 7300
},
{
"epoch": 2.8401010297260543,
"grad_norm": 0.17738056254213855,
"learning_rate": 3.411248149240165e-07,
"loss": 0.0131,
"step": 7310
},
{
"epoch": 2.8439867884204393,
"grad_norm": 0.18120809069881422,
"learning_rate": 3.24692550221819e-07,
"loss": 0.0126,
"step": 7320
},
{
"epoch": 2.8478725471148243,
"grad_norm": 0.27728578365868417,
"learning_rate": 3.086626558364203e-07,
"loss": 0.0133,
"step": 7330
},
{
"epoch": 2.8517583058092093,
"grad_norm": 0.15083855508982252,
"learning_rate": 2.93035459587534e-07,
"loss": 0.0111,
"step": 7340
},
{
"epoch": 2.8556440645035943,
"grad_norm": 0.20682581691820806,
"learning_rate": 2.7781128105949015e-07,
"loss": 0.0158,
"step": 7350
},
{
"epoch": 2.8595298231979793,
"grad_norm": 0.19888880451656954,
"learning_rate": 2.6299043159468963e-07,
"loss": 0.0199,
"step": 7360
},
{
"epoch": 2.8634155818923643,
"grad_norm": 0.24115048647296294,
"learning_rate": 2.485732142872488e-07,
"loss": 0.0119,
"step": 7370
},
{
"epoch": 2.8673013405867493,
"grad_norm": 0.20653770018813603,
"learning_rate": 2.3455992397679595e-07,
"loss": 0.0158,
"step": 7380
},
{
"epoch": 2.871187099281135,
"grad_norm": 0.18076322308667522,
"learning_rate": 2.2095084724243598e-07,
"loss": 0.0125,
"step": 7390
},
{
"epoch": 2.87507285797552,
"grad_norm": 0.20736237506128907,
"learning_rate": 2.0774626239690176e-07,
"loss": 0.0117,
"step": 7400
},
{
"epoch": 2.878958616669905,
"grad_norm": 0.195234932120182,
"learning_rate": 1.9494643948084979e-07,
"loss": 0.0141,
"step": 7410
},
{
"epoch": 2.88284437536429,
"grad_norm": 0.2350430358349683,
"learning_rate": 1.8255164025734684e-07,
"loss": 0.0123,
"step": 7420
},
{
"epoch": 2.886730134058675,
"grad_norm": 0.2192321643235776,
"learning_rate": 1.7056211820651425e-07,
"loss": 0.0128,
"step": 7430
},
{
"epoch": 2.8906158927530603,
"grad_norm": 0.2649035408787872,
"learning_rate": 1.5897811852033873e-07,
"loss": 0.0174,
"step": 7440
},
{
"epoch": 2.8945016514474453,
"grad_norm": 0.1967812235705591,
"learning_rate": 1.4779987809766528e-07,
"loss": 0.0143,
"step": 7450
},
{
"epoch": 2.8983874101418303,
"grad_norm": 0.1249890086380848,
"learning_rate": 1.3702762553935656e-07,
"loss": 0.0119,
"step": 7460
},
{
"epoch": 2.9022731688362153,
"grad_norm": 0.22978595448983524,
"learning_rate": 1.2666158114359894e-07,
"loss": 0.0146,
"step": 7470
},
{
"epoch": 2.9061589275306003,
"grad_norm": 0.16965468919403492,
"learning_rate": 1.1670195690141939e-07,
"loss": 0.0138,
"step": 7480
},
{
"epoch": 2.9100446862249854,
"grad_norm": 0.28228206459702576,
"learning_rate": 1.0714895649233781e-07,
"loss": 0.0129,
"step": 7490
},
{
"epoch": 2.9139304449193704,
"grad_norm": 0.20835241859756748,
"learning_rate": 9.800277528020153e-08,
"loss": 0.0151,
"step": 7500
},
{
"epoch": 2.9178162036137554,
"grad_norm": 0.16424147151499843,
"learning_rate": 8.926360030919513e-08,
"loss": 0.0149,
"step": 7510
},
{
"epoch": 2.9217019623081404,
"grad_norm": 0.1759921900549206,
"learning_rate": 8.093161030001462e-08,
"loss": 0.0108,
"step": 7520
},
{
"epoch": 2.925587721002526,
"grad_norm": 0.2025940367611509,
"learning_rate": 7.300697564620596e-08,
"loss": 0.0124,
"step": 7530
},
{
"epoch": 2.929473479696911,
"grad_norm": 0.3091292487177528,
"learning_rate": 6.548985841069e-08,
"loss": 0.0171,
"step": 7540
},
{
"epoch": 2.933359238391296,
"grad_norm": 0.18550760451795564,
"learning_rate": 5.8380412322440736e-08,
"loss": 0.0098,
"step": 7550
},
{
"epoch": 2.937244997085681,
"grad_norm": 0.2399355018987987,
"learning_rate": 5.167878277334559e-08,
"loss": 0.0105,
"step": 7560
},
{
"epoch": 2.9411307557800663,
"grad_norm": 0.2129680096079058,
"learning_rate": 4.538510681523001e-08,
"loss": 0.0151,
"step": 7570
},
{
"epoch": 2.9450165144744513,
"grad_norm": 0.15339355552498427,
"learning_rate": 3.949951315705303e-08,
"loss": 0.0154,
"step": 7580
},
{
"epoch": 2.9489022731688364,
"grad_norm": 0.20862935241082844,
"learning_rate": 3.4022122162282736e-08,
"loss": 0.0114,
"step": 7590
},
{
"epoch": 2.9527880318632214,
"grad_norm": 0.23246847497172307,
"learning_rate": 2.895304584642711e-08,
"loss": 0.0177,
"step": 7600
},
{
"epoch": 2.9566737905576064,
"grad_norm": 0.20519849079048713,
"learning_rate": 2.429238787474475e-08,
"loss": 0.0126,
"step": 7610
},
{
"epoch": 2.9605595492519914,
"grad_norm": 0.19918834292961288,
"learning_rate": 2.004024356012435e-08,
"loss": 0.0161,
"step": 7620
},
{
"epoch": 2.9644453079463764,
"grad_norm": 0.19800296749468346,
"learning_rate": 1.6196699861139586e-08,
"loss": 0.0151,
"step": 7630
},
{
"epoch": 2.9683310666407614,
"grad_norm": 0.1967950909462482,
"learning_rate": 1.2761835380268317e-08,
"loss": 0.0159,
"step": 7640
},
{
"epoch": 2.9722168253351464,
"grad_norm": 0.2624813305392548,
"learning_rate": 9.735720362282763e-09,
"loss": 0.016,
"step": 7650
},
{
"epoch": 2.976102584029532,
"grad_norm": 0.24779472153476578,
"learning_rate": 7.1184166928151e-09,
"loss": 0.0145,
"step": 7660
},
{
"epoch": 2.979988342723917,
"grad_norm": 0.25286229822920325,
"learning_rate": 4.90997789709402e-09,
"loss": 0.0131,
"step": 7670
},
{
"epoch": 2.983874101418302,
"grad_norm": 0.1409302028113683,
"learning_rate": 3.1104491388478375e-09,
"loss": 0.0154,
"step": 7680
},
{
"epoch": 2.987759860112687,
"grad_norm": 0.24024651305037117,
"learning_rate": 1.719867219378557e-09,
"loss": 0.0142,
"step": 7690
},
{
"epoch": 2.991645618807072,
"grad_norm": 0.1761325503880173,
"learning_rate": 7.382605768113671e-10,
"loss": 0.0123,
"step": 7700
},
{
"epoch": 2.9955313775014574,
"grad_norm": 0.17792976628823454,
"learning_rate": 1.6564928551732195e-10,
"loss": 0.0113,
"step": 7710
},
{
"epoch": 2.999028560326404,
"step": 7719,
"total_flos": 301241258213376.0,
"train_loss": 0.03801638325974335,
"train_runtime": 73801.1602,
"train_samples_per_second": 1.674,
"train_steps_per_second": 0.105
}
],
"logging_steps": 10,
"max_steps": 7719,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 301241258213376.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}