{ "best_metric": 0.59325110912323, "best_model_checkpoint": "autotrain-i56bj-d90g7/checkpoint-13053", "epoch": 3.0, "eval_steps": 500, "global_step": 13053, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005745805561939784, "grad_norm": 18.06277084350586, "learning_rate": 9.571209800918836e-07, "loss": 1.3779, "step": 25 }, { "epoch": 0.011491611123879569, "grad_norm": 15.910608291625977, "learning_rate": 1.914241960183767e-06, "loss": 1.1469, "step": 50 }, { "epoch": 0.01723741668581935, "grad_norm": 23.948566436767578, "learning_rate": 2.871362940275651e-06, "loss": 1.2746, "step": 75 }, { "epoch": 0.022983222247759137, "grad_norm": 25.929588317871094, "learning_rate": 3.828483920367534e-06, "loss": 1.2256, "step": 100 }, { "epoch": 0.02872902780969892, "grad_norm": 16.358503341674805, "learning_rate": 4.785604900459419e-06, "loss": 1.1802, "step": 125 }, { "epoch": 0.0344748333716387, "grad_norm": 11.254109382629395, "learning_rate": 5.742725880551302e-06, "loss": 1.0634, "step": 150 }, { "epoch": 0.04022063893357849, "grad_norm": 28.07424545288086, "learning_rate": 6.699846860643186e-06, "loss": 1.315, "step": 175 }, { "epoch": 0.045966444495518274, "grad_norm": 16.383939743041992, "learning_rate": 7.656967840735069e-06, "loss": 1.17, "step": 200 }, { "epoch": 0.051712250057458053, "grad_norm": 22.179977416992188, "learning_rate": 8.614088820826952e-06, "loss": 1.0369, "step": 225 }, { "epoch": 0.05745805561939784, "grad_norm": 13.887266159057617, "learning_rate": 9.571209800918838e-06, "loss": 1.0267, "step": 250 }, { "epoch": 0.06320386118133763, "grad_norm": 19.803924560546875, "learning_rate": 1.052833078101072e-05, "loss": 1.0433, "step": 275 }, { "epoch": 0.0689496667432774, "grad_norm": 13.864017486572266, "learning_rate": 1.1485451761102605e-05, "loss": 1.0669, "step": 300 }, { "epoch": 0.0746954723052172, "grad_norm": 8.618718147277832, "learning_rate": 1.2442572741194487e-05, "loss": 1.0531, "step": 325 }, { "epoch": 0.08044127786715698, "grad_norm": 13.573511123657227, "learning_rate": 1.3399693721286372e-05, "loss": 1.084, "step": 350 }, { "epoch": 0.08618708342909676, "grad_norm": 15.934146881103516, "learning_rate": 1.4356814701378254e-05, "loss": 1.0139, "step": 375 }, { "epoch": 0.09193288899103655, "grad_norm": 17.006610870361328, "learning_rate": 1.5313935681470137e-05, "loss": 1.0439, "step": 400 }, { "epoch": 0.09767869455297633, "grad_norm": 15.182875633239746, "learning_rate": 1.6271056661562023e-05, "loss": 1.1166, "step": 425 }, { "epoch": 0.10342450011491611, "grad_norm": 8.765186309814453, "learning_rate": 1.7228177641653905e-05, "loss": 0.9745, "step": 450 }, { "epoch": 0.1091703056768559, "grad_norm": 23.071510314941406, "learning_rate": 1.818529862174579e-05, "loss": 1.2088, "step": 475 }, { "epoch": 0.11491611123879568, "grad_norm": 16.805238723754883, "learning_rate": 1.9142419601837675e-05, "loss": 0.9662, "step": 500 }, { "epoch": 0.12066191680073546, "grad_norm": 16.67304039001465, "learning_rate": 2.0099540581929557e-05, "loss": 1.068, "step": 525 }, { "epoch": 0.12640772236267525, "grad_norm": 20.331649780273438, "learning_rate": 2.105666156202144e-05, "loss": 1.0481, "step": 550 }, { "epoch": 0.13215352792461504, "grad_norm": 18.258352279663086, "learning_rate": 2.2013782542113324e-05, "loss": 0.9451, "step": 575 }, { "epoch": 0.1378993334865548, "grad_norm": 43.41960906982422, "learning_rate": 2.297090352220521e-05, "loss": 1.2434, "step": 600 }, { "epoch": 0.1436451390484946, "grad_norm": 14.547021865844727, "learning_rate": 2.392802450229709e-05, "loss": 0.884, "step": 625 }, { "epoch": 0.1493909446104344, "grad_norm": 19.18711280822754, "learning_rate": 2.4885145482388973e-05, "loss": 1.1211, "step": 650 }, { "epoch": 0.15513675017237416, "grad_norm": 12.683270454406738, "learning_rate": 2.584226646248086e-05, "loss": 1.1419, "step": 675 }, { "epoch": 0.16088255573431395, "grad_norm": 12.683286666870117, "learning_rate": 2.6799387442572744e-05, "loss": 0.9859, "step": 700 }, { "epoch": 0.16662836129625375, "grad_norm": 17.171892166137695, "learning_rate": 2.775650842266463e-05, "loss": 1.1642, "step": 725 }, { "epoch": 0.1723741668581935, "grad_norm": 16.442211151123047, "learning_rate": 2.8713629402756508e-05, "loss": 1.116, "step": 750 }, { "epoch": 0.1781199724201333, "grad_norm": 16.613231658935547, "learning_rate": 2.9670750382848396e-05, "loss": 0.882, "step": 775 }, { "epoch": 0.1838657779820731, "grad_norm": 18.972503662109375, "learning_rate": 3.0627871362940275e-05, "loss": 0.9289, "step": 800 }, { "epoch": 0.18961158354401286, "grad_norm": 12.045616149902344, "learning_rate": 3.158499234303216e-05, "loss": 1.0779, "step": 825 }, { "epoch": 0.19535738910595266, "grad_norm": 10.648724555969238, "learning_rate": 3.2542113323124045e-05, "loss": 1.0184, "step": 850 }, { "epoch": 0.20110319466789245, "grad_norm": 12.485301971435547, "learning_rate": 3.349923430321593e-05, "loss": 0.9873, "step": 875 }, { "epoch": 0.20684900022983221, "grad_norm": 23.347667694091797, "learning_rate": 3.445635528330781e-05, "loss": 1.1136, "step": 900 }, { "epoch": 0.212594805791772, "grad_norm": 22.95267105102539, "learning_rate": 3.54134762633997e-05, "loss": 0.9551, "step": 925 }, { "epoch": 0.2183406113537118, "grad_norm": 17.17110824584961, "learning_rate": 3.637059724349158e-05, "loss": 1.1647, "step": 950 }, { "epoch": 0.22408641691565157, "grad_norm": 11.29631519317627, "learning_rate": 3.732771822358346e-05, "loss": 1.0254, "step": 975 }, { "epoch": 0.22983222247759136, "grad_norm": 18.52350425720215, "learning_rate": 3.828483920367535e-05, "loss": 0.9791, "step": 1000 }, { "epoch": 0.23557802803953115, "grad_norm": 24.836822509765625, "learning_rate": 3.924196018376723e-05, "loss": 1.0647, "step": 1025 }, { "epoch": 0.24132383360147092, "grad_norm": 12.823792457580566, "learning_rate": 4.0199081163859114e-05, "loss": 0.9141, "step": 1050 }, { "epoch": 0.2470696391634107, "grad_norm": 11.589208602905273, "learning_rate": 4.1156202143950996e-05, "loss": 1.1041, "step": 1075 }, { "epoch": 0.2528154447253505, "grad_norm": 21.15415382385254, "learning_rate": 4.211332312404288e-05, "loss": 1.0452, "step": 1100 }, { "epoch": 0.25856125028729027, "grad_norm": 15.532148361206055, "learning_rate": 4.3070444104134766e-05, "loss": 0.9655, "step": 1125 }, { "epoch": 0.2643070558492301, "grad_norm": 25.468048095703125, "learning_rate": 4.402756508422665e-05, "loss": 0.9519, "step": 1150 }, { "epoch": 0.27005286141116985, "grad_norm": 15.610981941223145, "learning_rate": 4.498468606431853e-05, "loss": 1.0293, "step": 1175 }, { "epoch": 0.2757986669731096, "grad_norm": 17.443798065185547, "learning_rate": 4.594180704441042e-05, "loss": 0.9748, "step": 1200 }, { "epoch": 0.28154447253504944, "grad_norm": 9.4945650100708, "learning_rate": 4.68989280245023e-05, "loss": 1.0328, "step": 1225 }, { "epoch": 0.2872902780969892, "grad_norm": 14.447216033935547, "learning_rate": 4.785604900459418e-05, "loss": 1.0917, "step": 1250 }, { "epoch": 0.29303608365892897, "grad_norm": 15.632061958312988, "learning_rate": 4.881316998468607e-05, "loss": 1.1806, "step": 1275 }, { "epoch": 0.2987818892208688, "grad_norm": 11.10702896118164, "learning_rate": 4.9770290964777946e-05, "loss": 0.9064, "step": 1300 }, { "epoch": 0.30452769478280856, "grad_norm": 14.962434768676758, "learning_rate": 4.991912828807355e-05, "loss": 1.0403, "step": 1325 }, { "epoch": 0.3102735003447483, "grad_norm": 7.605042934417725, "learning_rate": 4.9812718140801915e-05, "loss": 1.0427, "step": 1350 }, { "epoch": 0.31601930590668814, "grad_norm": 15.476686477661133, "learning_rate": 4.9706307993530265e-05, "loss": 1.1261, "step": 1375 }, { "epoch": 0.3217651114686279, "grad_norm": 15.57099723815918, "learning_rate": 4.959989784625862e-05, "loss": 1.0964, "step": 1400 }, { "epoch": 0.32751091703056767, "grad_norm": 19.747634887695312, "learning_rate": 4.949348769898698e-05, "loss": 1.1034, "step": 1425 }, { "epoch": 0.3332567225925075, "grad_norm": 16.88288116455078, "learning_rate": 4.9387077551715334e-05, "loss": 1.0307, "step": 1450 }, { "epoch": 0.33900252815444726, "grad_norm": 14.693648338317871, "learning_rate": 4.928066740444369e-05, "loss": 1.0929, "step": 1475 }, { "epoch": 0.344748333716387, "grad_norm": 19.459819793701172, "learning_rate": 4.917425725717205e-05, "loss": 1.0146, "step": 1500 }, { "epoch": 0.35049413927832684, "grad_norm": 13.217524528503418, "learning_rate": 4.90678471099004e-05, "loss": 1.1282, "step": 1525 }, { "epoch": 0.3562399448402666, "grad_norm": 16.83650016784668, "learning_rate": 4.896143696262876e-05, "loss": 0.938, "step": 1550 }, { "epoch": 0.3619857504022064, "grad_norm": 15.622553825378418, "learning_rate": 4.885502681535711e-05, "loss": 1.1303, "step": 1575 }, { "epoch": 0.3677315559641462, "grad_norm": 18.172714233398438, "learning_rate": 4.8748616668085473e-05, "loss": 1.0853, "step": 1600 }, { "epoch": 0.37347736152608596, "grad_norm": 25.019132614135742, "learning_rate": 4.864220652081383e-05, "loss": 1.012, "step": 1625 }, { "epoch": 0.3792231670880257, "grad_norm": 21.372913360595703, "learning_rate": 4.853579637354218e-05, "loss": 1.1325, "step": 1650 }, { "epoch": 0.38496897264996555, "grad_norm": 18.59086036682129, "learning_rate": 4.842938622627054e-05, "loss": 1.0089, "step": 1675 }, { "epoch": 0.3907147782119053, "grad_norm": 10.926801681518555, "learning_rate": 4.832297607899889e-05, "loss": 0.8773, "step": 1700 }, { "epoch": 0.3964605837738451, "grad_norm": 59.61377716064453, "learning_rate": 4.8216565931727256e-05, "loss": 1.0163, "step": 1725 }, { "epoch": 0.4022063893357849, "grad_norm": 12.620614051818848, "learning_rate": 4.8110155784455606e-05, "loss": 1.1281, "step": 1750 }, { "epoch": 0.40795219489772466, "grad_norm": 32.67042541503906, "learning_rate": 4.800374563718397e-05, "loss": 1.0002, "step": 1775 }, { "epoch": 0.41369800045966443, "grad_norm": 20.0575008392334, "learning_rate": 4.789733548991232e-05, "loss": 1.0719, "step": 1800 }, { "epoch": 0.41944380602160425, "grad_norm": 13.503483772277832, "learning_rate": 4.7790925342640675e-05, "loss": 1.1353, "step": 1825 }, { "epoch": 0.425189611583544, "grad_norm": 21.6424560546875, "learning_rate": 4.768451519536903e-05, "loss": 0.9627, "step": 1850 }, { "epoch": 0.4309354171454838, "grad_norm": 44.92792892456055, "learning_rate": 4.757810504809739e-05, "loss": 0.949, "step": 1875 }, { "epoch": 0.4366812227074236, "grad_norm": 16.71977424621582, "learning_rate": 4.7471694900825745e-05, "loss": 0.9162, "step": 1900 }, { "epoch": 0.44242702826936336, "grad_norm": 16.909381866455078, "learning_rate": 4.73652847535541e-05, "loss": 0.9035, "step": 1925 }, { "epoch": 0.44817283383130313, "grad_norm": 17.321500778198242, "learning_rate": 4.725887460628246e-05, "loss": 1.0233, "step": 1950 }, { "epoch": 0.45391863939324295, "grad_norm": 24.249675750732422, "learning_rate": 4.7152464459010814e-05, "loss": 1.1362, "step": 1975 }, { "epoch": 0.4596644449551827, "grad_norm": 19.54537010192871, "learning_rate": 4.704605431173917e-05, "loss": 0.9597, "step": 2000 }, { "epoch": 0.4654102505171225, "grad_norm": 10.297558784484863, "learning_rate": 4.693964416446753e-05, "loss": 1.0074, "step": 2025 }, { "epoch": 0.4711560560790623, "grad_norm": 17.777502059936523, "learning_rate": 4.6833234017195884e-05, "loss": 1.0467, "step": 2050 }, { "epoch": 0.47690186164100207, "grad_norm": 25.018428802490234, "learning_rate": 4.6726823869924234e-05, "loss": 1.0512, "step": 2075 }, { "epoch": 0.48264766720294183, "grad_norm": 12.033584594726562, "learning_rate": 4.66204137226526e-05, "loss": 1.0026, "step": 2100 }, { "epoch": 0.48839347276488165, "grad_norm": 11.690512657165527, "learning_rate": 4.651400357538095e-05, "loss": 1.0681, "step": 2125 }, { "epoch": 0.4941392783268214, "grad_norm": 16.697847366333008, "learning_rate": 4.640759342810931e-05, "loss": 1.1947, "step": 2150 }, { "epoch": 0.4998850838887612, "grad_norm": 16.905946731567383, "learning_rate": 4.630118328083766e-05, "loss": 1.0865, "step": 2175 }, { "epoch": 0.505630889450701, "grad_norm": 12.390742301940918, "learning_rate": 4.619477313356602e-05, "loss": 1.0294, "step": 2200 }, { "epoch": 0.5113766950126408, "grad_norm": 15.317625045776367, "learning_rate": 4.608836298629437e-05, "loss": 1.1887, "step": 2225 }, { "epoch": 0.5171225005745805, "grad_norm": 15.924113273620605, "learning_rate": 4.598195283902273e-05, "loss": 1.0063, "step": 2250 }, { "epoch": 0.5228683061365204, "grad_norm": 11.048434257507324, "learning_rate": 4.5875542691751086e-05, "loss": 0.886, "step": 2275 }, { "epoch": 0.5286141116984602, "grad_norm": 16.680204391479492, "learning_rate": 4.576913254447944e-05, "loss": 1.1075, "step": 2300 }, { "epoch": 0.5343599172603999, "grad_norm": 14.081778526306152, "learning_rate": 4.56627223972078e-05, "loss": 1.1626, "step": 2325 }, { "epoch": 0.5401057228223397, "grad_norm": 25.836400985717773, "learning_rate": 4.5556312249936155e-05, "loss": 1.0299, "step": 2350 }, { "epoch": 0.5458515283842795, "grad_norm": 14.950949668884277, "learning_rate": 4.544990210266451e-05, "loss": 1.0186, "step": 2375 }, { "epoch": 0.5515973339462192, "grad_norm": 13.539344787597656, "learning_rate": 4.534349195539287e-05, "loss": 1.0529, "step": 2400 }, { "epoch": 0.5573431395081591, "grad_norm": 11.866837501525879, "learning_rate": 4.5237081808121225e-05, "loss": 1.0269, "step": 2425 }, { "epoch": 0.5630889450700989, "grad_norm": 23.597551345825195, "learning_rate": 4.513067166084958e-05, "loss": 1.0083, "step": 2450 }, { "epoch": 0.5688347506320386, "grad_norm": 15.675888061523438, "learning_rate": 4.502426151357794e-05, "loss": 1.0386, "step": 2475 }, { "epoch": 0.5745805561939784, "grad_norm": 12.158834457397461, "learning_rate": 4.491785136630629e-05, "loss": 1.1082, "step": 2500 }, { "epoch": 0.5803263617559182, "grad_norm": 14.904141426086426, "learning_rate": 4.481144121903465e-05, "loss": 1.0783, "step": 2525 }, { "epoch": 0.5860721673178579, "grad_norm": 29.646265029907227, "learning_rate": 4.4705031071763e-05, "loss": 1.0602, "step": 2550 }, { "epoch": 0.5918179728797978, "grad_norm": 18.369321823120117, "learning_rate": 4.4598620924491364e-05, "loss": 0.9937, "step": 2575 }, { "epoch": 0.5975637784417376, "grad_norm": 12.290249824523926, "learning_rate": 4.4492210777219714e-05, "loss": 0.9421, "step": 2600 }, { "epoch": 0.6033095840036773, "grad_norm": 18.439208984375, "learning_rate": 4.438580062994808e-05, "loss": 1.0252, "step": 2625 }, { "epoch": 0.6090553895656171, "grad_norm": 22.04486846923828, "learning_rate": 4.4279390482676434e-05, "loss": 0.8886, "step": 2650 }, { "epoch": 0.6148011951275569, "grad_norm": 22.83307647705078, "learning_rate": 4.4172980335404783e-05, "loss": 0.9552, "step": 2675 }, { "epoch": 0.6205470006894966, "grad_norm": 19.321020126342773, "learning_rate": 4.406657018813315e-05, "loss": 1.0092, "step": 2700 }, { "epoch": 0.6262928062514365, "grad_norm": 14.00854778289795, "learning_rate": 4.3960160040861496e-05, "loss": 0.8749, "step": 2725 }, { "epoch": 0.6320386118133763, "grad_norm": 7.2084269523620605, "learning_rate": 4.385374989358986e-05, "loss": 1.0578, "step": 2750 }, { "epoch": 0.637784417375316, "grad_norm": 12.39623737335205, "learning_rate": 4.374733974631821e-05, "loss": 1.0612, "step": 2775 }, { "epoch": 0.6435302229372558, "grad_norm": 26.27788734436035, "learning_rate": 4.3640929599046566e-05, "loss": 0.968, "step": 2800 }, { "epoch": 0.6492760284991956, "grad_norm": 11.55069637298584, "learning_rate": 4.353451945177492e-05, "loss": 0.9956, "step": 2825 }, { "epoch": 0.6550218340611353, "grad_norm": 17.346981048583984, "learning_rate": 4.342810930450328e-05, "loss": 0.984, "step": 2850 }, { "epoch": 0.6607676396230752, "grad_norm": 11.784706115722656, "learning_rate": 4.3321699157231636e-05, "loss": 1.0254, "step": 2875 }, { "epoch": 0.666513445185015, "grad_norm": 14.726810455322266, "learning_rate": 4.321528900995999e-05, "loss": 0.9698, "step": 2900 }, { "epoch": 0.6722592507469547, "grad_norm": 18.386871337890625, "learning_rate": 4.310887886268835e-05, "loss": 0.9413, "step": 2925 }, { "epoch": 0.6780050563088945, "grad_norm": 29.44155502319336, "learning_rate": 4.3002468715416705e-05, "loss": 1.0491, "step": 2950 }, { "epoch": 0.6837508618708343, "grad_norm": 15.0762939453125, "learning_rate": 4.289605856814506e-05, "loss": 0.8545, "step": 2975 }, { "epoch": 0.689496667432774, "grad_norm": 21.399852752685547, "learning_rate": 4.278964842087342e-05, "loss": 1.0221, "step": 3000 }, { "epoch": 0.6952424729947139, "grad_norm": 20.22688865661621, "learning_rate": 4.2683238273601775e-05, "loss": 0.9472, "step": 3025 }, { "epoch": 0.7009882785566537, "grad_norm": 21.75887680053711, "learning_rate": 4.257682812633013e-05, "loss": 1.1666, "step": 3050 }, { "epoch": 0.7067340841185934, "grad_norm": 19.081144332885742, "learning_rate": 4.247041797905849e-05, "loss": 1.0754, "step": 3075 }, { "epoch": 0.7124798896805332, "grad_norm": 11.539175987243652, "learning_rate": 4.236400783178684e-05, "loss": 0.9631, "step": 3100 }, { "epoch": 0.718225695242473, "grad_norm": 19.64055824279785, "learning_rate": 4.22575976845152e-05, "loss": 1.0522, "step": 3125 }, { "epoch": 0.7239715008044127, "grad_norm": 15.160033226013184, "learning_rate": 4.215118753724355e-05, "loss": 0.8811, "step": 3150 }, { "epoch": 0.7297173063663526, "grad_norm": 12.39316177368164, "learning_rate": 4.2044777389971914e-05, "loss": 0.8472, "step": 3175 }, { "epoch": 0.7354631119282924, "grad_norm": 7.085254669189453, "learning_rate": 4.1938367242700264e-05, "loss": 0.9831, "step": 3200 }, { "epoch": 0.7412089174902321, "grad_norm": 15.343351364135742, "learning_rate": 4.183195709542862e-05, "loss": 1.084, "step": 3225 }, { "epoch": 0.7469547230521719, "grad_norm": 11.71967601776123, "learning_rate": 4.1725546948156977e-05, "loss": 1.0677, "step": 3250 }, { "epoch": 0.7527005286141117, "grad_norm": 14.376879692077637, "learning_rate": 4.161913680088533e-05, "loss": 0.9266, "step": 3275 }, { "epoch": 0.7584463341760515, "grad_norm": 12.832955360412598, "learning_rate": 4.151272665361369e-05, "loss": 1.069, "step": 3300 }, { "epoch": 0.7641921397379913, "grad_norm": 8.518636703491211, "learning_rate": 4.1406316506342046e-05, "loss": 1.0591, "step": 3325 }, { "epoch": 0.7699379452999311, "grad_norm": 13.678732872009277, "learning_rate": 4.12999063590704e-05, "loss": 0.8587, "step": 3350 }, { "epoch": 0.7756837508618708, "grad_norm": 16.9919376373291, "learning_rate": 4.119349621179876e-05, "loss": 1.0463, "step": 3375 }, { "epoch": 0.7814295564238106, "grad_norm": 19.827529907226562, "learning_rate": 4.1087086064527116e-05, "loss": 1.0071, "step": 3400 }, { "epoch": 0.7871753619857504, "grad_norm": 16.30826759338379, "learning_rate": 4.098067591725547e-05, "loss": 0.9613, "step": 3425 }, { "epoch": 0.7929211675476902, "grad_norm": 19.365869522094727, "learning_rate": 4.087426576998383e-05, "loss": 0.8334, "step": 3450 }, { "epoch": 0.79866697310963, "grad_norm": 10.79730224609375, "learning_rate": 4.076785562271218e-05, "loss": 0.9526, "step": 3475 }, { "epoch": 0.8044127786715698, "grad_norm": 10.032505989074707, "learning_rate": 4.066144547544054e-05, "loss": 1.0273, "step": 3500 }, { "epoch": 0.8101585842335095, "grad_norm": 15.712605476379395, "learning_rate": 4.055503532816889e-05, "loss": 1.0953, "step": 3525 }, { "epoch": 0.8159043897954493, "grad_norm": 18.788259506225586, "learning_rate": 4.0448625180897255e-05, "loss": 1.1593, "step": 3550 }, { "epoch": 0.8216501953573891, "grad_norm": 12.3018159866333, "learning_rate": 4.0342215033625605e-05, "loss": 0.943, "step": 3575 }, { "epoch": 0.8273960009193289, "grad_norm": 26.096878051757812, "learning_rate": 4.023580488635397e-05, "loss": 0.9589, "step": 3600 }, { "epoch": 0.8331418064812687, "grad_norm": 9.981711387634277, "learning_rate": 4.0129394739082324e-05, "loss": 0.9157, "step": 3625 }, { "epoch": 0.8388876120432085, "grad_norm": 19.37105941772461, "learning_rate": 4.0022984591810674e-05, "loss": 1.1807, "step": 3650 }, { "epoch": 0.8446334176051482, "grad_norm": 20.733781814575195, "learning_rate": 3.991657444453904e-05, "loss": 0.9934, "step": 3675 }, { "epoch": 0.850379223167088, "grad_norm": 27.081636428833008, "learning_rate": 3.981016429726739e-05, "loss": 1.1039, "step": 3700 }, { "epoch": 0.8561250287290278, "grad_norm": 20.710731506347656, "learning_rate": 3.970375414999575e-05, "loss": 1.0698, "step": 3725 }, { "epoch": 0.8618708342909676, "grad_norm": 12.305147171020508, "learning_rate": 3.95973440027241e-05, "loss": 0.9086, "step": 3750 }, { "epoch": 0.8676166398529074, "grad_norm": 8.804189682006836, "learning_rate": 3.949093385545246e-05, "loss": 0.8438, "step": 3775 }, { "epoch": 0.8733624454148472, "grad_norm": 14.99170207977295, "learning_rate": 3.938452370818081e-05, "loss": 0.8631, "step": 3800 }, { "epoch": 0.8791082509767869, "grad_norm": 18.036231994628906, "learning_rate": 3.927811356090917e-05, "loss": 1.0129, "step": 3825 }, { "epoch": 0.8848540565387267, "grad_norm": 11.981534957885742, "learning_rate": 3.9171703413637526e-05, "loss": 0.9894, "step": 3850 }, { "epoch": 0.8905998621006666, "grad_norm": 15.030149459838867, "learning_rate": 3.906529326636588e-05, "loss": 1.0499, "step": 3875 }, { "epoch": 0.8963456676626063, "grad_norm": 15.344982147216797, "learning_rate": 3.895888311909424e-05, "loss": 0.9134, "step": 3900 }, { "epoch": 0.9020914732245461, "grad_norm": 25.36429214477539, "learning_rate": 3.8852472971822596e-05, "loss": 0.9781, "step": 3925 }, { "epoch": 0.9078372787864859, "grad_norm": 14.226202011108398, "learning_rate": 3.874606282455095e-05, "loss": 0.8312, "step": 3950 }, { "epoch": 0.9135830843484256, "grad_norm": 43.66263198852539, "learning_rate": 3.863965267727931e-05, "loss": 0.9736, "step": 3975 }, { "epoch": 0.9193288899103654, "grad_norm": 23.34619140625, "learning_rate": 3.8533242530007665e-05, "loss": 1.0665, "step": 4000 }, { "epoch": 0.9250746954723053, "grad_norm": 25.17789649963379, "learning_rate": 3.842683238273602e-05, "loss": 1.0652, "step": 4025 }, { "epoch": 0.930820501034245, "grad_norm": 9.696635246276855, "learning_rate": 3.832042223546438e-05, "loss": 0.867, "step": 4050 }, { "epoch": 0.9365663065961848, "grad_norm": 9.90356731414795, "learning_rate": 3.821401208819273e-05, "loss": 0.7973, "step": 4075 }, { "epoch": 0.9423121121581246, "grad_norm": 22.405452728271484, "learning_rate": 3.810760194092109e-05, "loss": 0.8214, "step": 4100 }, { "epoch": 0.9480579177200643, "grad_norm": 24.39476776123047, "learning_rate": 3.800119179364944e-05, "loss": 0.9049, "step": 4125 }, { "epoch": 0.9538037232820041, "grad_norm": 9.647842407226562, "learning_rate": 3.7894781646377804e-05, "loss": 0.8708, "step": 4150 }, { "epoch": 0.959549528843944, "grad_norm": 26.49906349182129, "learning_rate": 3.7788371499106154e-05, "loss": 0.9184, "step": 4175 }, { "epoch": 0.9652953344058837, "grad_norm": 20.982925415039062, "learning_rate": 3.768196135183451e-05, "loss": 0.9092, "step": 4200 }, { "epoch": 0.9710411399678235, "grad_norm": 11.060940742492676, "learning_rate": 3.757555120456287e-05, "loss": 0.8978, "step": 4225 }, { "epoch": 0.9767869455297633, "grad_norm": 9.029313087463379, "learning_rate": 3.7469141057291224e-05, "loss": 0.7428, "step": 4250 }, { "epoch": 0.982532751091703, "grad_norm": 10.593424797058105, "learning_rate": 3.736273091001958e-05, "loss": 0.9591, "step": 4275 }, { "epoch": 0.9882785566536428, "grad_norm": 15.341836929321289, "learning_rate": 3.725632076274794e-05, "loss": 0.8476, "step": 4300 }, { "epoch": 0.9940243622155827, "grad_norm": 13.930924415588379, "learning_rate": 3.714991061547629e-05, "loss": 1.0338, "step": 4325 }, { "epoch": 0.9997701677775224, "grad_norm": 10.704158782958984, "learning_rate": 3.704350046820465e-05, "loss": 0.923, "step": 4350 }, { "epoch": 1.0, "eval_gen_len": 26.2909, "eval_loss": 0.734386682510376, "eval_rouge1": 83.7735, "eval_rouge2": 66.5715, "eval_rougeL": 81.6112, "eval_rougeLsum": 81.6279, "eval_runtime": 2086.9324, "eval_samples_per_second": 1.39, "eval_steps_per_second": 0.348, "step": 4351 }, { "epoch": 1.0055159733394623, "grad_norm": 7.619024276733398, "learning_rate": 3.6937090320933006e-05, "loss": 0.5463, "step": 4375 }, { "epoch": 1.011261778901402, "grad_norm": 6.126559734344482, "learning_rate": 3.683068017366136e-05, "loss": 0.5629, "step": 4400 }, { "epoch": 1.0170075844633417, "grad_norm": 20.718738555908203, "learning_rate": 3.672427002638972e-05, "loss": 0.5278, "step": 4425 }, { "epoch": 1.0227533900252817, "grad_norm": 6.006888389587402, "learning_rate": 3.6617859879118076e-05, "loss": 0.5893, "step": 4450 }, { "epoch": 1.0284991955872214, "grad_norm": 14.341240882873535, "learning_rate": 3.651144973184643e-05, "loss": 0.6282, "step": 4475 }, { "epoch": 1.034245001149161, "grad_norm": 16.240156173706055, "learning_rate": 3.640503958457478e-05, "loss": 0.6328, "step": 4500 }, { "epoch": 1.039990806711101, "grad_norm": 8.577178955078125, "learning_rate": 3.6298629437303145e-05, "loss": 0.6274, "step": 4525 }, { "epoch": 1.0457366122730407, "grad_norm": 13.634942054748535, "learning_rate": 3.6192219290031495e-05, "loss": 0.6302, "step": 4550 }, { "epoch": 1.0514824178349804, "grad_norm": 13.95753002166748, "learning_rate": 3.608580914275986e-05, "loss": 0.6617, "step": 4575 }, { "epoch": 1.0572282233969204, "grad_norm": 16.2097110748291, "learning_rate": 3.597939899548821e-05, "loss": 0.612, "step": 4600 }, { "epoch": 1.06297402895886, "grad_norm": 17.29051971435547, "learning_rate": 3.5872988848216565e-05, "loss": 0.5602, "step": 4625 }, { "epoch": 1.0687198345207998, "grad_norm": 11.388086318969727, "learning_rate": 3.576657870094493e-05, "loss": 0.4798, "step": 4650 }, { "epoch": 1.0744656400827397, "grad_norm": 13.426841735839844, "learning_rate": 3.566016855367328e-05, "loss": 0.5345, "step": 4675 }, { "epoch": 1.0802114456446794, "grad_norm": 14.696466445922852, "learning_rate": 3.555375840640164e-05, "loss": 0.6485, "step": 4700 }, { "epoch": 1.0859572512066191, "grad_norm": 12.150015830993652, "learning_rate": 3.544734825912999e-05, "loss": 0.6087, "step": 4725 }, { "epoch": 1.091703056768559, "grad_norm": 9.841524124145508, "learning_rate": 3.534093811185835e-05, "loss": 0.5868, "step": 4750 }, { "epoch": 1.0974488623304988, "grad_norm": 8.271934509277344, "learning_rate": 3.5234527964586704e-05, "loss": 0.5462, "step": 4775 }, { "epoch": 1.1031946678924385, "grad_norm": 7.936254978179932, "learning_rate": 3.512811781731506e-05, "loss": 0.5379, "step": 4800 }, { "epoch": 1.1089404734543784, "grad_norm": 11.007326126098633, "learning_rate": 3.502170767004342e-05, "loss": 0.5492, "step": 4825 }, { "epoch": 1.1146862790163181, "grad_norm": 19.896902084350586, "learning_rate": 3.4915297522771773e-05, "loss": 0.6375, "step": 4850 }, { "epoch": 1.1204320845782578, "grad_norm": 22.039011001586914, "learning_rate": 3.480888737550013e-05, "loss": 0.5828, "step": 4875 }, { "epoch": 1.1261778901401978, "grad_norm": 6.415574550628662, "learning_rate": 3.4702477228228486e-05, "loss": 0.4254, "step": 4900 }, { "epoch": 1.1319236957021375, "grad_norm": 12.866458892822266, "learning_rate": 3.459606708095684e-05, "loss": 0.566, "step": 4925 }, { "epoch": 1.1376695012640772, "grad_norm": 22.729032516479492, "learning_rate": 3.44896569336852e-05, "loss": 0.5428, "step": 4950 }, { "epoch": 1.143415306826017, "grad_norm": 11.15864372253418, "learning_rate": 3.4383246786413556e-05, "loss": 0.628, "step": 4975 }, { "epoch": 1.1491611123879568, "grad_norm": 9.711397171020508, "learning_rate": 3.427683663914191e-05, "loss": 0.4856, "step": 5000 }, { "epoch": 1.1549069179498965, "grad_norm": 13.280930519104004, "learning_rate": 3.417042649187027e-05, "loss": 0.5113, "step": 5025 }, { "epoch": 1.1606527235118365, "grad_norm": 35.520687103271484, "learning_rate": 3.406401634459862e-05, "loss": 0.4617, "step": 5050 }, { "epoch": 1.1663985290737762, "grad_norm": 31.962560653686523, "learning_rate": 3.395760619732698e-05, "loss": 0.5678, "step": 5075 }, { "epoch": 1.1721443346357159, "grad_norm": 18.267778396606445, "learning_rate": 3.385119605005533e-05, "loss": 0.5897, "step": 5100 }, { "epoch": 1.1778901401976558, "grad_norm": 16.51255989074707, "learning_rate": 3.3744785902783695e-05, "loss": 0.5746, "step": 5125 }, { "epoch": 1.1836359457595955, "grad_norm": 6.914166450500488, "learning_rate": 3.3638375755512045e-05, "loss": 0.593, "step": 5150 }, { "epoch": 1.1893817513215352, "grad_norm": 10.85730266571045, "learning_rate": 3.35319656082404e-05, "loss": 0.5905, "step": 5175 }, { "epoch": 1.1951275568834752, "grad_norm": 17.28081512451172, "learning_rate": 3.342555546096876e-05, "loss": 0.5473, "step": 5200 }, { "epoch": 1.2008733624454149, "grad_norm": 13.884358406066895, "learning_rate": 3.3319145313697114e-05, "loss": 0.5982, "step": 5225 }, { "epoch": 1.2066191680073546, "grad_norm": 9.417092323303223, "learning_rate": 3.321273516642547e-05, "loss": 0.6085, "step": 5250 }, { "epoch": 1.2123649735692945, "grad_norm": 10.19940185546875, "learning_rate": 3.310632501915383e-05, "loss": 0.56, "step": 5275 }, { "epoch": 1.2181107791312342, "grad_norm": 17.75829315185547, "learning_rate": 3.2999914871882184e-05, "loss": 0.5455, "step": 5300 }, { "epoch": 1.223856584693174, "grad_norm": 13.822357177734375, "learning_rate": 3.289350472461054e-05, "loss": 0.5122, "step": 5325 }, { "epoch": 1.2296023902551139, "grad_norm": 6.675373077392578, "learning_rate": 3.27870945773389e-05, "loss": 0.6045, "step": 5350 }, { "epoch": 1.2353481958170536, "grad_norm": 12.710549354553223, "learning_rate": 3.2680684430067254e-05, "loss": 0.5715, "step": 5375 }, { "epoch": 1.2410940013789933, "grad_norm": 14.400224685668945, "learning_rate": 3.257427428279561e-05, "loss": 0.5742, "step": 5400 }, { "epoch": 1.2468398069409332, "grad_norm": 14.449501037597656, "learning_rate": 3.2467864135523967e-05, "loss": 0.6402, "step": 5425 }, { "epoch": 1.252585612502873, "grad_norm": 10.527132034301758, "learning_rate": 3.236145398825232e-05, "loss": 0.6159, "step": 5450 }, { "epoch": 1.2583314180648126, "grad_norm": 10.82150650024414, "learning_rate": 3.225504384098067e-05, "loss": 0.5755, "step": 5475 }, { "epoch": 1.2640772236267526, "grad_norm": 7.481512069702148, "learning_rate": 3.2148633693709036e-05, "loss": 0.5496, "step": 5500 }, { "epoch": 1.2698230291886923, "grad_norm": 14.726329803466797, "learning_rate": 3.2042223546437386e-05, "loss": 0.4598, "step": 5525 }, { "epoch": 1.275568834750632, "grad_norm": 17.87427520751953, "learning_rate": 3.193581339916575e-05, "loss": 0.5756, "step": 5550 }, { "epoch": 1.281314640312572, "grad_norm": 10.091878890991211, "learning_rate": 3.18294032518941e-05, "loss": 0.5296, "step": 5575 }, { "epoch": 1.2870604458745116, "grad_norm": 20.161104202270508, "learning_rate": 3.1722993104622455e-05, "loss": 0.5092, "step": 5600 }, { "epoch": 1.2928062514364513, "grad_norm": 26.924951553344727, "learning_rate": 3.161658295735082e-05, "loss": 0.5627, "step": 5625 }, { "epoch": 1.2985520569983913, "grad_norm": 11.02285099029541, "learning_rate": 3.151017281007917e-05, "loss": 0.5273, "step": 5650 }, { "epoch": 1.304297862560331, "grad_norm": 26.79828643798828, "learning_rate": 3.140376266280753e-05, "loss": 0.4949, "step": 5675 }, { "epoch": 1.3100436681222707, "grad_norm": 13.996374130249023, "learning_rate": 3.129735251553588e-05, "loss": 0.5387, "step": 5700 }, { "epoch": 1.3157894736842106, "grad_norm": 18.909332275390625, "learning_rate": 3.119094236826424e-05, "loss": 0.5246, "step": 5725 }, { "epoch": 1.3215352792461503, "grad_norm": 13.408002853393555, "learning_rate": 3.1084532220992595e-05, "loss": 0.4895, "step": 5750 }, { "epoch": 1.32728108480809, "grad_norm": 13.344757080078125, "learning_rate": 3.097812207372095e-05, "loss": 0.4562, "step": 5775 }, { "epoch": 1.33302689037003, "grad_norm": 16.318279266357422, "learning_rate": 3.087171192644931e-05, "loss": 0.6114, "step": 5800 }, { "epoch": 1.3387726959319697, "grad_norm": 14.15111255645752, "learning_rate": 3.0765301779177664e-05, "loss": 0.5826, "step": 5825 }, { "epoch": 1.3445185014939094, "grad_norm": 10.535929679870605, "learning_rate": 3.065889163190602e-05, "loss": 0.546, "step": 5850 }, { "epoch": 1.3502643070558493, "grad_norm": 14.012350082397461, "learning_rate": 3.055248148463438e-05, "loss": 0.6077, "step": 5875 }, { "epoch": 1.356010112617789, "grad_norm": 15.3707914352417, "learning_rate": 3.044607133736273e-05, "loss": 0.5442, "step": 5900 }, { "epoch": 1.3617559181797287, "grad_norm": 28.015796661376953, "learning_rate": 3.033966119009109e-05, "loss": 0.4813, "step": 5925 }, { "epoch": 1.3675017237416687, "grad_norm": 10.887642860412598, "learning_rate": 3.0233251042819443e-05, "loss": 0.6075, "step": 5950 }, { "epoch": 1.3732475293036084, "grad_norm": 13.379786491394043, "learning_rate": 3.0126840895547803e-05, "loss": 0.6452, "step": 5975 }, { "epoch": 1.378993334865548, "grad_norm": 19.68934440612793, "learning_rate": 3.0020430748276156e-05, "loss": 0.5204, "step": 6000 }, { "epoch": 1.384739140427488, "grad_norm": 11.586589813232422, "learning_rate": 2.991402060100451e-05, "loss": 0.4538, "step": 6025 }, { "epoch": 1.3904849459894277, "grad_norm": 16.921403884887695, "learning_rate": 2.980761045373287e-05, "loss": 0.6548, "step": 6050 }, { "epoch": 1.3962307515513674, "grad_norm": 12.915703773498535, "learning_rate": 2.9701200306461226e-05, "loss": 0.5857, "step": 6075 }, { "epoch": 1.4019765571133074, "grad_norm": 11.560894966125488, "learning_rate": 2.9594790159189582e-05, "loss": 0.6351, "step": 6100 }, { "epoch": 1.407722362675247, "grad_norm": 12.070367813110352, "learning_rate": 2.948838001191794e-05, "loss": 0.5254, "step": 6125 }, { "epoch": 1.4134681682371868, "grad_norm": 6.56376314163208, "learning_rate": 2.9381969864646292e-05, "loss": 0.489, "step": 6150 }, { "epoch": 1.4192139737991267, "grad_norm": 7.615925312042236, "learning_rate": 2.9275559717374652e-05, "loss": 0.4299, "step": 6175 }, { "epoch": 1.4249597793610664, "grad_norm": 26.307531356811523, "learning_rate": 2.9169149570103005e-05, "loss": 0.5573, "step": 6200 }, { "epoch": 1.4307055849230061, "grad_norm": 17.957910537719727, "learning_rate": 2.9062739422831365e-05, "loss": 0.5165, "step": 6225 }, { "epoch": 1.436451390484946, "grad_norm": 5.903622150421143, "learning_rate": 2.8956329275559718e-05, "loss": 0.6125, "step": 6250 }, { "epoch": 1.4421971960468858, "grad_norm": 71.71666717529297, "learning_rate": 2.8849919128288078e-05, "loss": 0.5896, "step": 6275 }, { "epoch": 1.4479430016088255, "grad_norm": 7.325091361999512, "learning_rate": 2.874350898101643e-05, "loss": 0.5207, "step": 6300 }, { "epoch": 1.4536888071707654, "grad_norm": 7.871235370635986, "learning_rate": 2.8637098833744784e-05, "loss": 0.4792, "step": 6325 }, { "epoch": 1.4594346127327051, "grad_norm": 3.092621326446533, "learning_rate": 2.8530688686473144e-05, "loss": 0.4546, "step": 6350 }, { "epoch": 1.4651804182946448, "grad_norm": 11.23499870300293, "learning_rate": 2.8424278539201497e-05, "loss": 0.4942, "step": 6375 }, { "epoch": 1.4709262238565848, "grad_norm": 16.10143280029297, "learning_rate": 2.8317868391929857e-05, "loss": 0.4757, "step": 6400 }, { "epoch": 1.4766720294185245, "grad_norm": 13.106595039367676, "learning_rate": 2.821145824465821e-05, "loss": 0.5392, "step": 6425 }, { "epoch": 1.4824178349804642, "grad_norm": 8.037747383117676, "learning_rate": 2.8105048097386567e-05, "loss": 0.5916, "step": 6450 }, { "epoch": 1.4881636405424041, "grad_norm": 14.93556022644043, "learning_rate": 2.7998637950114927e-05, "loss": 0.564, "step": 6475 }, { "epoch": 1.4939094461043438, "grad_norm": 10.39865493774414, "learning_rate": 2.789222780284328e-05, "loss": 0.5026, "step": 6500 }, { "epoch": 1.4996552516662836, "grad_norm": 10.64941120147705, "learning_rate": 2.778581765557164e-05, "loss": 0.4656, "step": 6525 }, { "epoch": 1.5054010572282235, "grad_norm": 10.483504295349121, "learning_rate": 2.7679407508299993e-05, "loss": 0.5126, "step": 6550 }, { "epoch": 1.5111468627901632, "grad_norm": 7.613571643829346, "learning_rate": 2.7572997361028346e-05, "loss": 0.5551, "step": 6575 }, { "epoch": 1.516892668352103, "grad_norm": 14.762700080871582, "learning_rate": 2.7466587213756706e-05, "loss": 0.7115, "step": 6600 }, { "epoch": 1.5226384739140428, "grad_norm": 15.398651123046875, "learning_rate": 2.736017706648506e-05, "loss": 0.5283, "step": 6625 }, { "epoch": 1.5283842794759825, "grad_norm": 5.248310089111328, "learning_rate": 2.725376691921342e-05, "loss": 0.4443, "step": 6650 }, { "epoch": 1.5341300850379223, "grad_norm": 11.633146286010742, "learning_rate": 2.7147356771941772e-05, "loss": 0.4418, "step": 6675 }, { "epoch": 1.5398758905998622, "grad_norm": 13.065744400024414, "learning_rate": 2.704094662467013e-05, "loss": 0.5505, "step": 6700 }, { "epoch": 1.545621696161802, "grad_norm": 10.502464294433594, "learning_rate": 2.6934536477398485e-05, "loss": 0.5268, "step": 6725 }, { "epoch": 1.5513675017237416, "grad_norm": 42.0969352722168, "learning_rate": 2.6828126330126842e-05, "loss": 0.4736, "step": 6750 }, { "epoch": 1.5571133072856815, "grad_norm": 6.398384094238281, "learning_rate": 2.6721716182855198e-05, "loss": 0.5209, "step": 6775 }, { "epoch": 1.5628591128476212, "grad_norm": 3.2121517658233643, "learning_rate": 2.6615306035583555e-05, "loss": 0.5577, "step": 6800 }, { "epoch": 1.568604918409561, "grad_norm": 48.83503723144531, "learning_rate": 2.6508895888311915e-05, "loss": 0.5862, "step": 6825 }, { "epoch": 1.5743507239715009, "grad_norm": 10.185980796813965, "learning_rate": 2.6402485741040268e-05, "loss": 0.5377, "step": 6850 }, { "epoch": 1.5800965295334406, "grad_norm": 5.717852592468262, "learning_rate": 2.629607559376862e-05, "loss": 0.447, "step": 6875 }, { "epoch": 1.5858423350953803, "grad_norm": 10.820260047912598, "learning_rate": 2.618966544649698e-05, "loss": 0.5178, "step": 6900 }, { "epoch": 1.5915881406573202, "grad_norm": 7.412465572357178, "learning_rate": 2.6083255299225334e-05, "loss": 0.5001, "step": 6925 }, { "epoch": 1.59733394621926, "grad_norm": 23.02524757385254, "learning_rate": 2.5976845151953694e-05, "loss": 0.4905, "step": 6950 }, { "epoch": 1.6030797517811997, "grad_norm": 7.75547981262207, "learning_rate": 2.5870435004682047e-05, "loss": 0.592, "step": 6975 }, { "epoch": 1.6088255573431396, "grad_norm": 44.980621337890625, "learning_rate": 2.57640248574104e-05, "loss": 0.5954, "step": 7000 }, { "epoch": 1.6145713629050793, "grad_norm": 8.900413513183594, "learning_rate": 2.565761471013876e-05, "loss": 0.5001, "step": 7025 }, { "epoch": 1.620317168467019, "grad_norm": 12.892452239990234, "learning_rate": 2.5551204562867113e-05, "loss": 0.5148, "step": 7050 }, { "epoch": 1.626062974028959, "grad_norm": 17.05414390563965, "learning_rate": 2.5444794415595473e-05, "loss": 0.5795, "step": 7075 }, { "epoch": 1.6318087795908987, "grad_norm": 15.482218742370605, "learning_rate": 2.533838426832383e-05, "loss": 0.5684, "step": 7100 }, { "epoch": 1.6375545851528384, "grad_norm": 7.356892108917236, "learning_rate": 2.5231974121052183e-05, "loss": 0.5952, "step": 7125 }, { "epoch": 1.6433003907147783, "grad_norm": 16.71323013305664, "learning_rate": 2.5125563973780543e-05, "loss": 0.7416, "step": 7150 }, { "epoch": 1.649046196276718, "grad_norm": 12.637115478515625, "learning_rate": 2.5019153826508896e-05, "loss": 0.6563, "step": 7175 }, { "epoch": 1.6547920018386577, "grad_norm": 13.602217674255371, "learning_rate": 2.4912743679237252e-05, "loss": 0.4979, "step": 7200 }, { "epoch": 1.6605378074005976, "grad_norm": 10.359466552734375, "learning_rate": 2.480633353196561e-05, "loss": 0.5067, "step": 7225 }, { "epoch": 1.6662836129625374, "grad_norm": 7.33120059967041, "learning_rate": 2.4699923384693965e-05, "loss": 0.4785, "step": 7250 }, { "epoch": 1.672029418524477, "grad_norm": 9.278757095336914, "learning_rate": 2.4593513237422322e-05, "loss": 0.4941, "step": 7275 }, { "epoch": 1.677775224086417, "grad_norm": 6.453770160675049, "learning_rate": 2.448710309015068e-05, "loss": 0.5033, "step": 7300 }, { "epoch": 1.6835210296483567, "grad_norm": 8.040416717529297, "learning_rate": 2.4380692942879035e-05, "loss": 0.4577, "step": 7325 }, { "epoch": 1.6892668352102964, "grad_norm": 13.009758949279785, "learning_rate": 2.4274282795607388e-05, "loss": 0.5811, "step": 7350 }, { "epoch": 1.6950126407722363, "grad_norm": 12.394170761108398, "learning_rate": 2.4167872648335745e-05, "loss": 0.4945, "step": 7375 }, { "epoch": 1.700758446334176, "grad_norm": 16.887958526611328, "learning_rate": 2.40614625010641e-05, "loss": 0.5528, "step": 7400 }, { "epoch": 1.7065042518961158, "grad_norm": 5.690896511077881, "learning_rate": 2.3955052353792458e-05, "loss": 0.4227, "step": 7425 }, { "epoch": 1.7122500574580557, "grad_norm": 5.632653713226318, "learning_rate": 2.3848642206520818e-05, "loss": 0.6466, "step": 7450 }, { "epoch": 1.7179958630199954, "grad_norm": 14.108399391174316, "learning_rate": 2.374223205924917e-05, "loss": 0.4858, "step": 7475 }, { "epoch": 1.7237416685819351, "grad_norm": 12.757282257080078, "learning_rate": 2.3635821911977527e-05, "loss": 0.5619, "step": 7500 }, { "epoch": 1.729487474143875, "grad_norm": 3.3859102725982666, "learning_rate": 2.3529411764705884e-05, "loss": 0.5808, "step": 7525 }, { "epoch": 1.7352332797058148, "grad_norm": 13.51447582244873, "learning_rate": 2.342300161743424e-05, "loss": 0.5598, "step": 7550 }, { "epoch": 1.7409790852677545, "grad_norm": 12.655472755432129, "learning_rate": 2.3316591470162597e-05, "loss": 0.4214, "step": 7575 }, { "epoch": 1.7467248908296944, "grad_norm": 22.180927276611328, "learning_rate": 2.3210181322890953e-05, "loss": 0.5946, "step": 7600 }, { "epoch": 1.752470696391634, "grad_norm": 11.127065658569336, "learning_rate": 2.3103771175619306e-05, "loss": 0.6091, "step": 7625 }, { "epoch": 1.7582165019535738, "grad_norm": 14.059673309326172, "learning_rate": 2.2997361028347663e-05, "loss": 0.5442, "step": 7650 }, { "epoch": 1.7639623075155137, "grad_norm": 9.362860679626465, "learning_rate": 2.289095088107602e-05, "loss": 0.5622, "step": 7675 }, { "epoch": 1.7697081130774535, "grad_norm": 11.744709968566895, "learning_rate": 2.2784540733804376e-05, "loss": 0.4207, "step": 7700 }, { "epoch": 1.7754539186393932, "grad_norm": 11.617082595825195, "learning_rate": 2.2678130586532732e-05, "loss": 0.5576, "step": 7725 }, { "epoch": 1.781199724201333, "grad_norm": 7.126068592071533, "learning_rate": 2.257172043926109e-05, "loss": 0.5096, "step": 7750 }, { "epoch": 1.7869455297632728, "grad_norm": 6.728802680969238, "learning_rate": 2.2465310291989445e-05, "loss": 0.3561, "step": 7775 }, { "epoch": 1.7926913353252125, "grad_norm": 6.852474212646484, "learning_rate": 2.2358900144717802e-05, "loss": 0.5567, "step": 7800 }, { "epoch": 1.7984371408871525, "grad_norm": 9.070609092712402, "learning_rate": 2.225248999744616e-05, "loss": 0.6369, "step": 7825 }, { "epoch": 1.8041829464490922, "grad_norm": 12.296309471130371, "learning_rate": 2.2146079850174515e-05, "loss": 0.5887, "step": 7850 }, { "epoch": 1.8099287520110319, "grad_norm": 13.876431465148926, "learning_rate": 2.203966970290287e-05, "loss": 0.5461, "step": 7875 }, { "epoch": 1.8156745575729718, "grad_norm": 8.236191749572754, "learning_rate": 2.1933259555631225e-05, "loss": 0.4896, "step": 7900 }, { "epoch": 1.8214203631349115, "grad_norm": 32.38478088378906, "learning_rate": 2.182684940835958e-05, "loss": 0.5381, "step": 7925 }, { "epoch": 1.8271661686968512, "grad_norm": 7.219331741333008, "learning_rate": 2.1720439261087938e-05, "loss": 0.4077, "step": 7950 }, { "epoch": 1.8329119742587912, "grad_norm": 10.3890962600708, "learning_rate": 2.1614029113816294e-05, "loss": 0.547, "step": 7975 }, { "epoch": 1.8386577798207309, "grad_norm": 7.473533630371094, "learning_rate": 2.150761896654465e-05, "loss": 0.5187, "step": 8000 }, { "epoch": 1.8444035853826706, "grad_norm": 9.898606300354004, "learning_rate": 2.1401208819273007e-05, "loss": 0.5642, "step": 8025 }, { "epoch": 1.8501493909446105, "grad_norm": 10.344517707824707, "learning_rate": 2.129479867200136e-05, "loss": 0.4615, "step": 8050 }, { "epoch": 1.8558951965065502, "grad_norm": 34.38172912597656, "learning_rate": 2.118838852472972e-05, "loss": 0.5293, "step": 8075 }, { "epoch": 1.86164100206849, "grad_norm": 12.77538776397705, "learning_rate": 2.1081978377458077e-05, "loss": 0.4196, "step": 8100 }, { "epoch": 1.8673868076304299, "grad_norm": 9.176766395568848, "learning_rate": 2.0975568230186433e-05, "loss": 0.4989, "step": 8125 }, { "epoch": 1.8731326131923696, "grad_norm": 9.476819038391113, "learning_rate": 2.086915808291479e-05, "loss": 0.5131, "step": 8150 }, { "epoch": 1.8788784187543093, "grad_norm": 12.82066822052002, "learning_rate": 2.0762747935643143e-05, "loss": 0.5161, "step": 8175 }, { "epoch": 1.8846242243162492, "grad_norm": 6.587464332580566, "learning_rate": 2.06563377883715e-05, "loss": 0.5613, "step": 8200 }, { "epoch": 1.890370029878189, "grad_norm": 14.604435920715332, "learning_rate": 2.0549927641099856e-05, "loss": 0.5146, "step": 8225 }, { "epoch": 1.8961158354401286, "grad_norm": 25.261781692504883, "learning_rate": 2.0443517493828213e-05, "loss": 0.4847, "step": 8250 }, { "epoch": 1.9018616410020686, "grad_norm": 15.835251808166504, "learning_rate": 2.033710734655657e-05, "loss": 0.5851, "step": 8275 }, { "epoch": 1.9076074465640083, "grad_norm": 25.110139846801758, "learning_rate": 2.0230697199284926e-05, "loss": 0.4597, "step": 8300 }, { "epoch": 1.913353252125948, "grad_norm": 24.496837615966797, "learning_rate": 2.012428705201328e-05, "loss": 0.5487, "step": 8325 }, { "epoch": 1.919099057687888, "grad_norm": 11.73768424987793, "learning_rate": 2.0017876904741635e-05, "loss": 0.4469, "step": 8350 }, { "epoch": 1.9248448632498276, "grad_norm": 10.91761589050293, "learning_rate": 1.9911466757469992e-05, "loss": 0.6221, "step": 8375 }, { "epoch": 1.9305906688117673, "grad_norm": 5.8950724601745605, "learning_rate": 1.9805056610198348e-05, "loss": 0.447, "step": 8400 }, { "epoch": 1.9363364743737073, "grad_norm": 8.51844310760498, "learning_rate": 1.9698646462926705e-05, "loss": 0.5028, "step": 8425 }, { "epoch": 1.942082279935647, "grad_norm": 8.514192581176758, "learning_rate": 1.9592236315655065e-05, "loss": 0.5334, "step": 8450 }, { "epoch": 1.9478280854975867, "grad_norm": 17.922142028808594, "learning_rate": 1.9485826168383418e-05, "loss": 0.5718, "step": 8475 }, { "epoch": 1.9535738910595266, "grad_norm": 16.389118194580078, "learning_rate": 1.9379416021111774e-05, "loss": 0.44, "step": 8500 }, { "epoch": 1.9593196966214663, "grad_norm": 21.868207931518555, "learning_rate": 1.927300587384013e-05, "loss": 0.5156, "step": 8525 }, { "epoch": 1.965065502183406, "grad_norm": 16.75226593017578, "learning_rate": 1.9166595726568487e-05, "loss": 0.5354, "step": 8550 }, { "epoch": 1.970811307745346, "grad_norm": 10.739360809326172, "learning_rate": 1.9060185579296844e-05, "loss": 0.4722, "step": 8575 }, { "epoch": 1.9765571133072857, "grad_norm": 12.929302215576172, "learning_rate": 1.8953775432025197e-05, "loss": 0.551, "step": 8600 }, { "epoch": 1.9823029188692254, "grad_norm": 17.41213035583496, "learning_rate": 1.8847365284753554e-05, "loss": 0.5017, "step": 8625 }, { "epoch": 1.9880487244311653, "grad_norm": 5.805027484893799, "learning_rate": 1.874095513748191e-05, "loss": 0.5878, "step": 8650 }, { "epoch": 1.993794529993105, "grad_norm": 11.490523338317871, "learning_rate": 1.8634544990210267e-05, "loss": 0.5627, "step": 8675 }, { "epoch": 1.9995403355550447, "grad_norm": 13.531903266906738, "learning_rate": 1.8528134842938623e-05, "loss": 0.5863, "step": 8700 }, { "epoch": 2.0, "eval_gen_len": 26.2389, "eval_loss": 0.6237149238586426, "eval_rouge1": 86.4062, "eval_rouge2": 71.9313, "eval_rougeL": 84.7508, "eval_rougeLsum": 84.7631, "eval_runtime": 2064.9224, "eval_samples_per_second": 1.405, "eval_steps_per_second": 0.352, "step": 8702 }, { "epoch": 2.0052861411169847, "grad_norm": 9.042722702026367, "learning_rate": 1.842172469566698e-05, "loss": 0.3464, "step": 8725 }, { "epoch": 2.0110319466789246, "grad_norm": 29.441604614257812, "learning_rate": 1.8315314548395336e-05, "loss": 0.3289, "step": 8750 }, { "epoch": 2.016777752240864, "grad_norm": 6.572937488555908, "learning_rate": 1.8208904401123693e-05, "loss": 0.2474, "step": 8775 }, { "epoch": 2.022523557802804, "grad_norm": 5.759756088256836, "learning_rate": 1.810249425385205e-05, "loss": 0.2546, "step": 8800 }, { "epoch": 2.028269363364744, "grad_norm": 13.856329917907715, "learning_rate": 1.7996084106580406e-05, "loss": 0.2817, "step": 8825 }, { "epoch": 2.0340151689266834, "grad_norm": 11.948515892028809, "learning_rate": 1.7889673959308762e-05, "loss": 0.3477, "step": 8850 }, { "epoch": 2.0397609744886234, "grad_norm": 7.310947418212891, "learning_rate": 1.7783263812037115e-05, "loss": 0.2743, "step": 8875 }, { "epoch": 2.0455067800505633, "grad_norm": 8.42832088470459, "learning_rate": 1.7676853664765472e-05, "loss": 0.3924, "step": 8900 }, { "epoch": 2.051252585612503, "grad_norm": 8.289580345153809, "learning_rate": 1.757044351749383e-05, "loss": 0.3083, "step": 8925 }, { "epoch": 2.0569983911744427, "grad_norm": 5.149430751800537, "learning_rate": 1.7464033370222185e-05, "loss": 0.301, "step": 8950 }, { "epoch": 2.0627441967363827, "grad_norm": 10.75927734375, "learning_rate": 1.735762322295054e-05, "loss": 0.3943, "step": 8975 }, { "epoch": 2.068490002298322, "grad_norm": 7.442399501800537, "learning_rate": 1.7251213075678898e-05, "loss": 0.29, "step": 9000 }, { "epoch": 2.074235807860262, "grad_norm": 7.649430751800537, "learning_rate": 1.714480292840725e-05, "loss": 0.2922, "step": 9025 }, { "epoch": 2.079981613422202, "grad_norm": 14.108525276184082, "learning_rate": 1.7038392781135608e-05, "loss": 0.3514, "step": 9050 }, { "epoch": 2.0857274189841415, "grad_norm": 15.770670890808105, "learning_rate": 1.6931982633863968e-05, "loss": 0.302, "step": 9075 }, { "epoch": 2.0914732245460814, "grad_norm": 10.099161148071289, "learning_rate": 1.6825572486592324e-05, "loss": 0.2739, "step": 9100 }, { "epoch": 2.0972190301080214, "grad_norm": 9.134102821350098, "learning_rate": 1.671916233932068e-05, "loss": 0.3254, "step": 9125 }, { "epoch": 2.102964835669961, "grad_norm": 19.84739875793457, "learning_rate": 1.6612752192049037e-05, "loss": 0.2823, "step": 9150 }, { "epoch": 2.1087106412319008, "grad_norm": 8.968664169311523, "learning_rate": 1.650634204477739e-05, "loss": 0.2593, "step": 9175 }, { "epoch": 2.1144564467938407, "grad_norm": 3.067753791809082, "learning_rate": 1.6399931897505747e-05, "loss": 0.262, "step": 9200 }, { "epoch": 2.12020225235578, "grad_norm": 2.221193552017212, "learning_rate": 1.6293521750234103e-05, "loss": 0.3989, "step": 9225 }, { "epoch": 2.12594805791772, "grad_norm": 8.303793907165527, "learning_rate": 1.618711160296246e-05, "loss": 0.2848, "step": 9250 }, { "epoch": 2.13169386347966, "grad_norm": 8.77238941192627, "learning_rate": 1.6080701455690816e-05, "loss": 0.273, "step": 9275 }, { "epoch": 2.1374396690415995, "grad_norm": 9.391227722167969, "learning_rate": 1.597429130841917e-05, "loss": 0.2648, "step": 9300 }, { "epoch": 2.1431854746035395, "grad_norm": 7.9943389892578125, "learning_rate": 1.5867881161147526e-05, "loss": 0.2839, "step": 9325 }, { "epoch": 2.1489312801654794, "grad_norm": 8.543972969055176, "learning_rate": 1.5761471013875882e-05, "loss": 0.2569, "step": 9350 }, { "epoch": 2.154677085727419, "grad_norm": 6.456871032714844, "learning_rate": 1.565506086660424e-05, "loss": 0.2514, "step": 9375 }, { "epoch": 2.160422891289359, "grad_norm": 10.609663009643555, "learning_rate": 1.5548650719332595e-05, "loss": 0.3059, "step": 9400 }, { "epoch": 2.1661686968512988, "grad_norm": 12.719677925109863, "learning_rate": 1.5442240572060952e-05, "loss": 0.2691, "step": 9425 }, { "epoch": 2.1719145024132382, "grad_norm": 6.143183708190918, "learning_rate": 1.533583042478931e-05, "loss": 0.2422, "step": 9450 }, { "epoch": 2.177660307975178, "grad_norm": 20.21449089050293, "learning_rate": 1.5229420277517665e-05, "loss": 0.3001, "step": 9475 }, { "epoch": 2.183406113537118, "grad_norm": 4.3389973640441895, "learning_rate": 1.5123010130246022e-05, "loss": 0.2947, "step": 9500 }, { "epoch": 2.1891519190990576, "grad_norm": 10.764538764953613, "learning_rate": 1.5016599982974378e-05, "loss": 0.2926, "step": 9525 }, { "epoch": 2.1948977246609975, "grad_norm": 5.7259321212768555, "learning_rate": 1.4910189835702735e-05, "loss": 0.281, "step": 9550 }, { "epoch": 2.2006435302229375, "grad_norm": 7.039416790008545, "learning_rate": 1.4803779688431091e-05, "loss": 0.2707, "step": 9575 }, { "epoch": 2.206389335784877, "grad_norm": 9.577095985412598, "learning_rate": 1.4697369541159444e-05, "loss": 0.3298, "step": 9600 }, { "epoch": 2.212135141346817, "grad_norm": 5.981830596923828, "learning_rate": 1.45909593938878e-05, "loss": 0.3344, "step": 9625 }, { "epoch": 2.217880946908757, "grad_norm": 5.922014236450195, "learning_rate": 1.4484549246616157e-05, "loss": 0.2731, "step": 9650 }, { "epoch": 2.2236267524706963, "grad_norm": 9.63442325592041, "learning_rate": 1.4378139099344515e-05, "loss": 0.3236, "step": 9675 }, { "epoch": 2.2293725580326362, "grad_norm": 15.069372177124023, "learning_rate": 1.4271728952072872e-05, "loss": 0.3373, "step": 9700 }, { "epoch": 2.235118363594576, "grad_norm": 9.148941993713379, "learning_rate": 1.4165318804801225e-05, "loss": 0.3207, "step": 9725 }, { "epoch": 2.2408641691565157, "grad_norm": 6.5600385665893555, "learning_rate": 1.4058908657529582e-05, "loss": 0.2614, "step": 9750 }, { "epoch": 2.2466099747184556, "grad_norm": 12.141286849975586, "learning_rate": 1.3952498510257938e-05, "loss": 0.3223, "step": 9775 }, { "epoch": 2.2523557802803955, "grad_norm": 6.805424213409424, "learning_rate": 1.3846088362986295e-05, "loss": 0.3697, "step": 9800 }, { "epoch": 2.258101585842335, "grad_norm": 13.576851844787598, "learning_rate": 1.3739678215714651e-05, "loss": 0.36, "step": 9825 }, { "epoch": 2.263847391404275, "grad_norm": 5.8922576904296875, "learning_rate": 1.363326806844301e-05, "loss": 0.2679, "step": 9850 }, { "epoch": 2.2695931969662144, "grad_norm": 3.609133720397949, "learning_rate": 1.3526857921171363e-05, "loss": 0.298, "step": 9875 }, { "epoch": 2.2753390025281544, "grad_norm": 10.248851776123047, "learning_rate": 1.3420447773899719e-05, "loss": 0.3184, "step": 9900 }, { "epoch": 2.2810848080900943, "grad_norm": 8.504976272583008, "learning_rate": 1.3314037626628076e-05, "loss": 0.3057, "step": 9925 }, { "epoch": 2.286830613652034, "grad_norm": 3.3555614948272705, "learning_rate": 1.3207627479356432e-05, "loss": 0.2886, "step": 9950 }, { "epoch": 2.2925764192139737, "grad_norm": 6.76616096496582, "learning_rate": 1.3101217332084789e-05, "loss": 0.2674, "step": 9975 }, { "epoch": 2.2983222247759136, "grad_norm": 9.159728050231934, "learning_rate": 1.2994807184813143e-05, "loss": 0.3071, "step": 10000 }, { "epoch": 2.304068030337853, "grad_norm": 18.63709831237793, "learning_rate": 1.28883970375415e-05, "loss": 0.2902, "step": 10025 }, { "epoch": 2.309813835899793, "grad_norm": 1.2971268892288208, "learning_rate": 1.2781986890269857e-05, "loss": 0.322, "step": 10050 }, { "epoch": 2.315559641461733, "grad_norm": 11.695150375366211, "learning_rate": 1.2675576742998213e-05, "loss": 0.2625, "step": 10075 }, { "epoch": 2.321305447023673, "grad_norm": 17.39932632446289, "learning_rate": 1.256916659572657e-05, "loss": 0.2826, "step": 10100 }, { "epoch": 2.3270512525856124, "grad_norm": 5.18493127822876, "learning_rate": 1.2462756448454924e-05, "loss": 0.2516, "step": 10125 }, { "epoch": 2.3327970581475523, "grad_norm": 16.71261978149414, "learning_rate": 1.2356346301183281e-05, "loss": 0.3449, "step": 10150 }, { "epoch": 2.338542863709492, "grad_norm": 10.863037109375, "learning_rate": 1.2249936153911637e-05, "loss": 0.2464, "step": 10175 }, { "epoch": 2.3442886692714318, "grad_norm": 4.433645725250244, "learning_rate": 1.2143526006639994e-05, "loss": 0.3322, "step": 10200 }, { "epoch": 2.3500344748333717, "grad_norm": 7.951370716094971, "learning_rate": 1.203711585936835e-05, "loss": 0.3509, "step": 10225 }, { "epoch": 2.3557802803953116, "grad_norm": 10.264397621154785, "learning_rate": 1.1930705712096705e-05, "loss": 0.2962, "step": 10250 }, { "epoch": 2.361526085957251, "grad_norm": 8.093633651733398, "learning_rate": 1.1824295564825062e-05, "loss": 0.4046, "step": 10275 }, { "epoch": 2.367271891519191, "grad_norm": 12.440337181091309, "learning_rate": 1.1717885417553418e-05, "loss": 0.3199, "step": 10300 }, { "epoch": 2.3730176970811305, "grad_norm": 7.12372350692749, "learning_rate": 1.1611475270281775e-05, "loss": 0.2804, "step": 10325 }, { "epoch": 2.3787635026430705, "grad_norm": 13.004124641418457, "learning_rate": 1.1505065123010131e-05, "loss": 0.254, "step": 10350 }, { "epoch": 2.3845093082050104, "grad_norm": 8.472222328186035, "learning_rate": 1.1398654975738488e-05, "loss": 0.3731, "step": 10375 }, { "epoch": 2.3902551137669503, "grad_norm": 4.128398895263672, "learning_rate": 1.1292244828466843e-05, "loss": 0.263, "step": 10400 }, { "epoch": 2.39600091932889, "grad_norm": 10.770340919494629, "learning_rate": 1.11858346811952e-05, "loss": 0.305, "step": 10425 }, { "epoch": 2.4017467248908297, "grad_norm": 12.2786226272583, "learning_rate": 1.1079424533923556e-05, "loss": 0.3019, "step": 10450 }, { "epoch": 2.4074925304527692, "grad_norm": 6.7361063957214355, "learning_rate": 1.0973014386651912e-05, "loss": 0.3148, "step": 10475 }, { "epoch": 2.413238336014709, "grad_norm": 9.457742691040039, "learning_rate": 1.0866604239380269e-05, "loss": 0.2865, "step": 10500 }, { "epoch": 2.418984141576649, "grad_norm": 3.7694594860076904, "learning_rate": 1.0760194092108624e-05, "loss": 0.246, "step": 10525 }, { "epoch": 2.424729947138589, "grad_norm": 7.730304718017578, "learning_rate": 1.065378394483698e-05, "loss": 0.335, "step": 10550 }, { "epoch": 2.4304757527005285, "grad_norm": 10.514723777770996, "learning_rate": 1.0547373797565337e-05, "loss": 0.2765, "step": 10575 }, { "epoch": 2.4362215582624684, "grad_norm": 5.863102436065674, "learning_rate": 1.0440963650293691e-05, "loss": 0.2872, "step": 10600 }, { "epoch": 2.441967363824408, "grad_norm": 11.08028793334961, "learning_rate": 1.0334553503022048e-05, "loss": 0.2951, "step": 10625 }, { "epoch": 2.447713169386348, "grad_norm": 7.661574363708496, "learning_rate": 1.0228143355750404e-05, "loss": 0.2148, "step": 10650 }, { "epoch": 2.453458974948288, "grad_norm": 8.94846248626709, "learning_rate": 1.0121733208478761e-05, "loss": 0.278, "step": 10675 }, { "epoch": 2.4592047805102277, "grad_norm": 4.757375717163086, "learning_rate": 1.0015323061207118e-05, "loss": 0.2581, "step": 10700 }, { "epoch": 2.464950586072167, "grad_norm": 5.684369087219238, "learning_rate": 9.908912913935474e-06, "loss": 0.2654, "step": 10725 }, { "epoch": 2.470696391634107, "grad_norm": 8.778314590454102, "learning_rate": 9.802502766663829e-06, "loss": 0.3054, "step": 10750 }, { "epoch": 2.476442197196047, "grad_norm": 9.916199684143066, "learning_rate": 9.696092619392185e-06, "loss": 0.2818, "step": 10775 }, { "epoch": 2.4821880027579866, "grad_norm": 14.061298370361328, "learning_rate": 9.589682472120542e-06, "loss": 0.2872, "step": 10800 }, { "epoch": 2.4879338083199265, "grad_norm": 3.405550241470337, "learning_rate": 9.483272324848898e-06, "loss": 0.2891, "step": 10825 }, { "epoch": 2.4936796138818664, "grad_norm": 13.360318183898926, "learning_rate": 9.376862177577255e-06, "loss": 0.2764, "step": 10850 }, { "epoch": 2.499425419443806, "grad_norm": 9.567176818847656, "learning_rate": 9.27045203030561e-06, "loss": 0.3287, "step": 10875 }, { "epoch": 2.505171225005746, "grad_norm": 22.88526725769043, "learning_rate": 9.164041883033966e-06, "loss": 0.2867, "step": 10900 }, { "epoch": 2.5109170305676853, "grad_norm": 8.176691055297852, "learning_rate": 9.057631735762323e-06, "loss": 0.313, "step": 10925 }, { "epoch": 2.5166628361296253, "grad_norm": 9.478363990783691, "learning_rate": 8.951221588490678e-06, "loss": 0.3178, "step": 10950 }, { "epoch": 2.522408641691565, "grad_norm": 11.561506271362305, "learning_rate": 8.844811441219034e-06, "loss": 0.3263, "step": 10975 }, { "epoch": 2.528154447253505, "grad_norm": 21.103870391845703, "learning_rate": 8.738401293947392e-06, "loss": 0.392, "step": 11000 }, { "epoch": 2.5339002528154446, "grad_norm": 9.002408981323242, "learning_rate": 8.631991146675747e-06, "loss": 0.2399, "step": 11025 }, { "epoch": 2.5396460583773846, "grad_norm": 12.6602201461792, "learning_rate": 8.525580999404104e-06, "loss": 0.3234, "step": 11050 }, { "epoch": 2.545391863939324, "grad_norm": 7.497732639312744, "learning_rate": 8.41917085213246e-06, "loss": 0.3384, "step": 11075 }, { "epoch": 2.551137669501264, "grad_norm": 7.916686058044434, "learning_rate": 8.312760704860815e-06, "loss": 0.262, "step": 11100 }, { "epoch": 2.556883475063204, "grad_norm": 9.539471626281738, "learning_rate": 8.206350557589172e-06, "loss": 0.2634, "step": 11125 }, { "epoch": 2.562629280625144, "grad_norm": 10.455008506774902, "learning_rate": 8.099940410317528e-06, "loss": 0.354, "step": 11150 }, { "epoch": 2.5683750861870833, "grad_norm": 13.299657821655273, "learning_rate": 7.993530263045885e-06, "loss": 0.2795, "step": 11175 }, { "epoch": 2.5741208917490233, "grad_norm": 11.104608535766602, "learning_rate": 7.887120115774241e-06, "loss": 0.2768, "step": 11200 }, { "epoch": 2.5798666973109627, "grad_norm": 7.9573493003845215, "learning_rate": 7.780709968502596e-06, "loss": 0.3175, "step": 11225 }, { "epoch": 2.5856125028729027, "grad_norm": 6.329565525054932, "learning_rate": 7.674299821230952e-06, "loss": 0.3128, "step": 11250 }, { "epoch": 2.5913583084348426, "grad_norm": 6.937751770019531, "learning_rate": 7.56788967395931e-06, "loss": 0.2232, "step": 11275 }, { "epoch": 2.5971041139967825, "grad_norm": 8.591595649719238, "learning_rate": 7.461479526687665e-06, "loss": 0.2984, "step": 11300 }, { "epoch": 2.602849919558722, "grad_norm": 9.453631401062012, "learning_rate": 7.355069379416021e-06, "loss": 0.2491, "step": 11325 }, { "epoch": 2.608595725120662, "grad_norm": 6.31212854385376, "learning_rate": 7.248659232144378e-06, "loss": 0.3115, "step": 11350 }, { "epoch": 2.6143415306826014, "grad_norm": 6.614916801452637, "learning_rate": 7.142249084872733e-06, "loss": 0.2884, "step": 11375 }, { "epoch": 2.6200873362445414, "grad_norm": 8.102603912353516, "learning_rate": 7.03583893760109e-06, "loss": 0.2584, "step": 11400 }, { "epoch": 2.6258331418064813, "grad_norm": 4.263225555419922, "learning_rate": 6.929428790329446e-06, "loss": 0.2357, "step": 11425 }, { "epoch": 2.6315789473684212, "grad_norm": 8.729228019714355, "learning_rate": 6.823018643057802e-06, "loss": 0.2669, "step": 11450 }, { "epoch": 2.6373247529303607, "grad_norm": 6.767869472503662, "learning_rate": 6.716608495786159e-06, "loss": 0.2634, "step": 11475 }, { "epoch": 2.6430705584923007, "grad_norm": 6.409708499908447, "learning_rate": 6.610198348514515e-06, "loss": 0.2606, "step": 11500 }, { "epoch": 2.64881636405424, "grad_norm": 6.780210018157959, "learning_rate": 6.503788201242871e-06, "loss": 0.2801, "step": 11525 }, { "epoch": 2.65456216961618, "grad_norm": 9.45854663848877, "learning_rate": 6.397378053971227e-06, "loss": 0.2315, "step": 11550 }, { "epoch": 2.66030797517812, "grad_norm": 8.216423988342285, "learning_rate": 6.290967906699584e-06, "loss": 0.2897, "step": 11575 }, { "epoch": 2.66605378074006, "grad_norm": 10.603631973266602, "learning_rate": 6.1845577594279395e-06, "loss": 0.2884, "step": 11600 }, { "epoch": 2.6717995863019994, "grad_norm": 18.06307601928711, "learning_rate": 6.078147612156296e-06, "loss": 0.2767, "step": 11625 }, { "epoch": 2.6775453918639394, "grad_norm": 8.301953315734863, "learning_rate": 5.971737464884652e-06, "loss": 0.305, "step": 11650 }, { "epoch": 2.683291197425879, "grad_norm": 3.897515296936035, "learning_rate": 5.865327317613007e-06, "loss": 0.2622, "step": 11675 }, { "epoch": 2.689037002987819, "grad_norm": 11.311945915222168, "learning_rate": 5.758917170341364e-06, "loss": 0.2443, "step": 11700 }, { "epoch": 2.6947828085497587, "grad_norm": 11.095697402954102, "learning_rate": 5.65250702306972e-06, "loss": 0.2892, "step": 11725 }, { "epoch": 2.7005286141116986, "grad_norm": 4.947134017944336, "learning_rate": 5.546096875798076e-06, "loss": 0.325, "step": 11750 }, { "epoch": 2.706274419673638, "grad_norm": 7.467726707458496, "learning_rate": 5.439686728526433e-06, "loss": 0.3117, "step": 11775 }, { "epoch": 2.712020225235578, "grad_norm": 9.866068840026855, "learning_rate": 5.333276581254789e-06, "loss": 0.3054, "step": 11800 }, { "epoch": 2.7177660307975176, "grad_norm": 8.738960266113281, "learning_rate": 5.226866433983145e-06, "loss": 0.3043, "step": 11825 }, { "epoch": 2.7235118363594575, "grad_norm": 12.69951057434082, "learning_rate": 5.120456286711501e-06, "loss": 0.2877, "step": 11850 }, { "epoch": 2.7292576419213974, "grad_norm": 12.757490158081055, "learning_rate": 5.014046139439857e-06, "loss": 0.3413, "step": 11875 }, { "epoch": 2.7350034474833373, "grad_norm": 8.377620697021484, "learning_rate": 4.9076359921682135e-06, "loss": 0.2688, "step": 11900 }, { "epoch": 2.740749253045277, "grad_norm": 10.275845527648926, "learning_rate": 4.801225844896569e-06, "loss": 0.2605, "step": 11925 }, { "epoch": 2.7464950586072168, "grad_norm": 8.497511863708496, "learning_rate": 4.694815697624926e-06, "loss": 0.3221, "step": 11950 }, { "epoch": 2.7522408641691563, "grad_norm": 2.008751153945923, "learning_rate": 4.588405550353282e-06, "loss": 0.276, "step": 11975 }, { "epoch": 2.757986669731096, "grad_norm": 9.221789360046387, "learning_rate": 4.481995403081638e-06, "loss": 0.2717, "step": 12000 }, { "epoch": 2.763732475293036, "grad_norm": 12.64124870300293, "learning_rate": 4.375585255809994e-06, "loss": 0.3022, "step": 12025 }, { "epoch": 2.769478280854976, "grad_norm": 7.941061496734619, "learning_rate": 4.26917510853835e-06, "loss": 0.2974, "step": 12050 }, { "epoch": 2.7752240864169155, "grad_norm": 11.986599922180176, "learning_rate": 4.1627649612667066e-06, "loss": 0.2933, "step": 12075 }, { "epoch": 2.7809698919788555, "grad_norm": 8.333921432495117, "learning_rate": 4.056354813995063e-06, "loss": 0.3154, "step": 12100 }, { "epoch": 2.786715697540795, "grad_norm": 8.222199440002441, "learning_rate": 3.949944666723419e-06, "loss": 0.2618, "step": 12125 }, { "epoch": 2.792461503102735, "grad_norm": 5.828108310699463, "learning_rate": 3.843534519451775e-06, "loss": 0.3117, "step": 12150 }, { "epoch": 2.798207308664675, "grad_norm": 4.506043910980225, "learning_rate": 3.7371243721801314e-06, "loss": 0.2742, "step": 12175 }, { "epoch": 2.8039531142266148, "grad_norm": 6.610835552215576, "learning_rate": 3.6307142249084875e-06, "loss": 0.2821, "step": 12200 }, { "epoch": 2.8096989197885542, "grad_norm": 9.316131591796875, "learning_rate": 3.524304077636843e-06, "loss": 0.3721, "step": 12225 }, { "epoch": 2.815444725350494, "grad_norm": 13.79557991027832, "learning_rate": 3.4178939303652e-06, "loss": 0.3067, "step": 12250 }, { "epoch": 2.8211905309124337, "grad_norm": 3.6889641284942627, "learning_rate": 3.3114837830935558e-06, "loss": 0.3536, "step": 12275 }, { "epoch": 2.8269363364743736, "grad_norm": 9.081113815307617, "learning_rate": 3.205073635821912e-06, "loss": 0.2818, "step": 12300 }, { "epoch": 2.8326821420363135, "grad_norm": 6.250842571258545, "learning_rate": 3.0986634885502684e-06, "loss": 0.2985, "step": 12325 }, { "epoch": 2.8384279475982535, "grad_norm": 7.818601608276367, "learning_rate": 2.9922533412786245e-06, "loss": 0.3299, "step": 12350 }, { "epoch": 2.844173753160193, "grad_norm": 3.7356948852539062, "learning_rate": 2.8858431940069806e-06, "loss": 0.3068, "step": 12375 }, { "epoch": 2.849919558722133, "grad_norm": 8.613180160522461, "learning_rate": 2.7794330467353367e-06, "loss": 0.2359, "step": 12400 }, { "epoch": 2.8556653642840724, "grad_norm": 5.272718906402588, "learning_rate": 2.673022899463693e-06, "loss": 0.2459, "step": 12425 }, { "epoch": 2.8614111698460123, "grad_norm": 5.97663688659668, "learning_rate": 2.5666127521920493e-06, "loss": 0.3391, "step": 12450 }, { "epoch": 2.8671569754079522, "grad_norm": 9.63822078704834, "learning_rate": 2.4602026049204054e-06, "loss": 0.3068, "step": 12475 }, { "epoch": 2.872902780969892, "grad_norm": 4.8020172119140625, "learning_rate": 2.3537924576487615e-06, "loss": 0.2816, "step": 12500 }, { "epoch": 2.8786485865318316, "grad_norm": 8.880352020263672, "learning_rate": 2.2473823103771176e-06, "loss": 0.2288, "step": 12525 }, { "epoch": 2.8843943920937716, "grad_norm": 12.863115310668945, "learning_rate": 2.1409721631054737e-06, "loss": 0.2539, "step": 12550 }, { "epoch": 2.890140197655711, "grad_norm": 18.137331008911133, "learning_rate": 2.03456201583383e-06, "loss": 0.2567, "step": 12575 }, { "epoch": 2.895886003217651, "grad_norm": 11.29713249206543, "learning_rate": 1.9281518685621863e-06, "loss": 0.2979, "step": 12600 }, { "epoch": 2.901631808779591, "grad_norm": 5.792328834533691, "learning_rate": 1.8217417212905424e-06, "loss": 0.3037, "step": 12625 }, { "epoch": 2.907377614341531, "grad_norm": 4.29570198059082, "learning_rate": 1.7153315740188987e-06, "loss": 0.2398, "step": 12650 }, { "epoch": 2.9131234199034703, "grad_norm": 9.291726112365723, "learning_rate": 1.6089214267472546e-06, "loss": 0.3226, "step": 12675 }, { "epoch": 2.9188692254654103, "grad_norm": 11.283625602722168, "learning_rate": 1.5025112794756109e-06, "loss": 0.3077, "step": 12700 }, { "epoch": 2.9246150310273498, "grad_norm": 9.59054946899414, "learning_rate": 1.396101132203967e-06, "loss": 0.2839, "step": 12725 }, { "epoch": 2.9303608365892897, "grad_norm": 6.355679035186768, "learning_rate": 1.2896909849323233e-06, "loss": 0.2673, "step": 12750 }, { "epoch": 2.9361066421512296, "grad_norm": 4.829615116119385, "learning_rate": 1.1832808376606794e-06, "loss": 0.3218, "step": 12775 }, { "epoch": 2.9418524477131696, "grad_norm": 3.804570198059082, "learning_rate": 1.0768706903890355e-06, "loss": 0.2438, "step": 12800 }, { "epoch": 2.947598253275109, "grad_norm": 9.905104637145996, "learning_rate": 9.704605431173918e-07, "loss": 0.3214, "step": 12825 }, { "epoch": 2.953344058837049, "grad_norm": 19.192649841308594, "learning_rate": 8.64050395845748e-07, "loss": 0.2366, "step": 12850 }, { "epoch": 2.9590898643989885, "grad_norm": 8.720338821411133, "learning_rate": 7.576402485741041e-07, "loss": 0.3157, "step": 12875 }, { "epoch": 2.9648356699609284, "grad_norm": 11.466854095458984, "learning_rate": 6.512301013024603e-07, "loss": 0.3257, "step": 12900 }, { "epoch": 2.9705814755228683, "grad_norm": 12.517078399658203, "learning_rate": 5.448199540308164e-07, "loss": 0.3519, "step": 12925 }, { "epoch": 2.9763272810848083, "grad_norm": 7.879445552825928, "learning_rate": 4.384098067591726e-07, "loss": 0.3597, "step": 12950 }, { "epoch": 2.9820730866467478, "grad_norm": 7.040919780731201, "learning_rate": 3.3199965948752875e-07, "loss": 0.354, "step": 12975 }, { "epoch": 2.9878188922086877, "grad_norm": 5.912668228149414, "learning_rate": 2.255895122158849e-07, "loss": 0.2328, "step": 13000 }, { "epoch": 2.993564697770627, "grad_norm": 2.8012125492095947, "learning_rate": 1.1917936494424109e-07, "loss": 0.2318, "step": 13025 }, { "epoch": 2.999310503332567, "grad_norm": 5.125675201416016, "learning_rate": 1.2769217672597258e-08, "loss": 0.2961, "step": 13050 }, { "epoch": 3.0, "eval_gen_len": 26.3516, "eval_loss": 0.59325110912323, "eval_rouge1": 87.5985, "eval_rouge2": 74.3003, "eval_rougeL": 86.0508, "eval_rougeLsum": 86.0787, "eval_runtime": 2117.538, "eval_samples_per_second": 1.37, "eval_steps_per_second": 0.343, "step": 13053 } ], "logging_steps": 25, "max_steps": 13053, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.01 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 197710094794752.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }