{ "best_metric": 4.248934268951416, "best_model_checkpoint": "autotrain-l6hey-orl0t/checkpoint-8938", "epoch": 2.0, "eval_steps": 500, "global_step": 8938, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005594092638174088, "grad_norm": 34.332366943359375, "learning_rate": 7.829977628635347e-07, "loss": 6.4897, "step": 25 }, { "epoch": 0.011188185276348177, "grad_norm": 23.168872833251953, "learning_rate": 1.7151379567486951e-06, "loss": 6.3738, "step": 50 }, { "epoch": 0.016782277914522265, "grad_norm": 20.714399337768555, "learning_rate": 2.6472781506338553e-06, "loss": 5.5122, "step": 75 }, { "epoch": 0.022376370552696354, "grad_norm": 17.31004524230957, "learning_rate": 3.542132736763609e-06, "loss": 5.5352, "step": 100 }, { "epoch": 0.027970463190870442, "grad_norm": 20.792905807495117, "learning_rate": 4.47427293064877e-06, "loss": 5.3902, "step": 125 }, { "epoch": 0.03356455582904453, "grad_norm": 29.867664337158203, "learning_rate": 5.40641312453393e-06, "loss": 5.0552, "step": 150 }, { "epoch": 0.039158648467218615, "grad_norm": 17.143095016479492, "learning_rate": 6.338553318419091e-06, "loss": 5.0792, "step": 175 }, { "epoch": 0.04475274110539271, "grad_norm": 16.778640747070312, "learning_rate": 7.270693512304251e-06, "loss": 5.0412, "step": 200 }, { "epoch": 0.05034683374356679, "grad_norm": 21.074262619018555, "learning_rate": 8.20283370618941e-06, "loss": 4.8163, "step": 225 }, { "epoch": 0.055940926381740884, "grad_norm": 21.221630096435547, "learning_rate": 9.134973900074571e-06, "loss": 4.9409, "step": 250 }, { "epoch": 0.06153501901991497, "grad_norm": 21.189231872558594, "learning_rate": 1.006711409395973e-05, "loss": 5.0649, "step": 275 }, { "epoch": 0.06712911165808906, "grad_norm": 14.992874145507812, "learning_rate": 1.0999254287844893e-05, "loss": 4.7542, "step": 300 }, { "epoch": 0.07272320429626315, "grad_norm": 15.733444213867188, "learning_rate": 1.1931394481730052e-05, "loss": 4.5938, "step": 325 }, { "epoch": 0.07831729693443723, "grad_norm": 23.274600982666016, "learning_rate": 1.2863534675615213e-05, "loss": 4.6138, "step": 350 }, { "epoch": 0.08391138957261132, "grad_norm": 17.92934799194336, "learning_rate": 1.3795674869500374e-05, "loss": 4.8809, "step": 375 }, { "epoch": 0.08950548221078541, "grad_norm": 18.286834716796875, "learning_rate": 1.4727815063385533e-05, "loss": 4.6919, "step": 400 }, { "epoch": 0.09509957484895949, "grad_norm": 20.009130477905273, "learning_rate": 1.5659955257270695e-05, "loss": 4.7557, "step": 425 }, { "epoch": 0.10069366748713358, "grad_norm": 20.796175003051758, "learning_rate": 1.6592095451155853e-05, "loss": 4.8059, "step": 450 }, { "epoch": 0.10628776012530768, "grad_norm": 18.083595275878906, "learning_rate": 1.7524235645041014e-05, "loss": 4.86, "step": 475 }, { "epoch": 0.11188185276348177, "grad_norm": 17.57659339904785, "learning_rate": 1.8456375838926178e-05, "loss": 4.568, "step": 500 }, { "epoch": 0.11747594540165585, "grad_norm": 20.379024505615234, "learning_rate": 1.9388516032811335e-05, "loss": 4.9103, "step": 525 }, { "epoch": 0.12307003803982994, "grad_norm": 16.093093872070312, "learning_rate": 2.0320656226696496e-05, "loss": 4.5315, "step": 550 }, { "epoch": 0.12866413067800403, "grad_norm": 31.44219970703125, "learning_rate": 2.1252796420581657e-05, "loss": 4.6423, "step": 575 }, { "epoch": 0.13425822331617812, "grad_norm": 16.757986068725586, "learning_rate": 2.2184936614466818e-05, "loss": 4.5965, "step": 600 }, { "epoch": 0.1398523159543522, "grad_norm": 16.31531524658203, "learning_rate": 2.311707680835198e-05, "loss": 4.6928, "step": 625 }, { "epoch": 0.1454464085925263, "grad_norm": 13.83728313446045, "learning_rate": 2.4049217002237136e-05, "loss": 4.5197, "step": 650 }, { "epoch": 0.15104050123070037, "grad_norm": 17.00248146057129, "learning_rate": 2.49813571961223e-05, "loss": 4.4602, "step": 675 }, { "epoch": 0.15663459386887446, "grad_norm": 22.1146183013916, "learning_rate": 2.5913497390007457e-05, "loss": 4.4573, "step": 700 }, { "epoch": 0.16222868650704855, "grad_norm": 16.24863624572754, "learning_rate": 2.6845637583892618e-05, "loss": 4.6094, "step": 725 }, { "epoch": 0.16782277914522264, "grad_norm": 15.607491493225098, "learning_rate": 2.777777777777778e-05, "loss": 4.4851, "step": 750 }, { "epoch": 0.17341687178339674, "grad_norm": 17.399606704711914, "learning_rate": 2.8709917971662943e-05, "loss": 4.7641, "step": 775 }, { "epoch": 0.17901096442157083, "grad_norm": 12.437596321105957, "learning_rate": 2.9642058165548097e-05, "loss": 4.6065, "step": 800 }, { "epoch": 0.18460505705974492, "grad_norm": 16.83686637878418, "learning_rate": 3.057419835943326e-05, "loss": 4.471, "step": 825 }, { "epoch": 0.19019914969791898, "grad_norm": 17.172122955322266, "learning_rate": 3.150633855331842e-05, "loss": 4.7266, "step": 850 }, { "epoch": 0.19579324233609308, "grad_norm": 18.03239631652832, "learning_rate": 3.243847874720358e-05, "loss": 4.46, "step": 875 }, { "epoch": 0.20138733497426717, "grad_norm": 11.44616985321045, "learning_rate": 3.3370618941088744e-05, "loss": 4.2646, "step": 900 }, { "epoch": 0.20698142761244126, "grad_norm": 18.69893455505371, "learning_rate": 3.43027591349739e-05, "loss": 4.5796, "step": 925 }, { "epoch": 0.21257552025061535, "grad_norm": 26.265470504760742, "learning_rate": 3.523489932885906e-05, "loss": 4.476, "step": 950 }, { "epoch": 0.21816961288878944, "grad_norm": 28.28611946105957, "learning_rate": 3.616703952274422e-05, "loss": 4.9502, "step": 975 }, { "epoch": 0.22376370552696354, "grad_norm": 20.53813362121582, "learning_rate": 3.709917971662939e-05, "loss": 4.5447, "step": 1000 }, { "epoch": 0.22935779816513763, "grad_norm": 14.012741088867188, "learning_rate": 3.8031319910514545e-05, "loss": 4.7348, "step": 1025 }, { "epoch": 0.2349518908033117, "grad_norm": 15.572279930114746, "learning_rate": 3.89634601043997e-05, "loss": 4.7246, "step": 1050 }, { "epoch": 0.24054598344148578, "grad_norm": 13.881638526916504, "learning_rate": 3.9895600298284866e-05, "loss": 4.6181, "step": 1075 }, { "epoch": 0.24614007607965988, "grad_norm": 12.749186515808105, "learning_rate": 4.0827740492170024e-05, "loss": 4.3893, "step": 1100 }, { "epoch": 0.251734168717834, "grad_norm": 17.131837844848633, "learning_rate": 4.175988068605519e-05, "loss": 4.3859, "step": 1125 }, { "epoch": 0.25732826135600806, "grad_norm": 13.731075286865234, "learning_rate": 4.2692020879940345e-05, "loss": 4.4761, "step": 1150 }, { "epoch": 0.2629223539941821, "grad_norm": 15.064730644226074, "learning_rate": 4.36241610738255e-05, "loss": 4.4364, "step": 1175 }, { "epoch": 0.26851644663235624, "grad_norm": 13.714836120605469, "learning_rate": 4.455630126771067e-05, "loss": 4.5969, "step": 1200 }, { "epoch": 0.2741105392705303, "grad_norm": 14.1889009475708, "learning_rate": 4.5488441461595824e-05, "loss": 4.9771, "step": 1225 }, { "epoch": 0.2797046319087044, "grad_norm": 14.543293952941895, "learning_rate": 4.642058165548099e-05, "loss": 4.6537, "step": 1250 }, { "epoch": 0.2852987245468785, "grad_norm": 18.1214542388916, "learning_rate": 4.735272184936615e-05, "loss": 4.6208, "step": 1275 }, { "epoch": 0.2908928171850526, "grad_norm": 13.347675323486328, "learning_rate": 4.82848620432513e-05, "loss": 4.6199, "step": 1300 }, { "epoch": 0.2964869098232267, "grad_norm": 17.083444595336914, "learning_rate": 4.921700223713647e-05, "loss": 4.8805, "step": 1325 }, { "epoch": 0.30208100246140074, "grad_norm": 24.343929290771484, "learning_rate": 4.998342449859108e-05, "loss": 4.6357, "step": 1350 }, { "epoch": 0.30767509509957486, "grad_norm": 16.355289459228516, "learning_rate": 4.987982761478535e-05, "loss": 4.674, "step": 1375 }, { "epoch": 0.3132691877377489, "grad_norm": 20.73784828186035, "learning_rate": 4.978037460633184e-05, "loss": 4.6211, "step": 1400 }, { "epoch": 0.31886328037592304, "grad_norm": 13.50486946105957, "learning_rate": 4.9676777722526106e-05, "loss": 4.7257, "step": 1425 }, { "epoch": 0.3244573730140971, "grad_norm": 14.019612312316895, "learning_rate": 4.9573180838720376e-05, "loss": 4.6509, "step": 1450 }, { "epoch": 0.3300514656522712, "grad_norm": 18.581026077270508, "learning_rate": 4.946958395491464e-05, "loss": 4.3779, "step": 1475 }, { "epoch": 0.3356455582904453, "grad_norm": 12.192296981811523, "learning_rate": 4.93659870711089e-05, "loss": 4.2878, "step": 1500 }, { "epoch": 0.34123965092861935, "grad_norm": 12.066230773925781, "learning_rate": 4.926239018730317e-05, "loss": 4.5682, "step": 1525 }, { "epoch": 0.3468337435667935, "grad_norm": 10.833414077758789, "learning_rate": 4.9158793303497436e-05, "loss": 4.1772, "step": 1550 }, { "epoch": 0.35242783620496754, "grad_norm": 10.834297180175781, "learning_rate": 4.905519641969169e-05, "loss": 4.6879, "step": 1575 }, { "epoch": 0.35802192884314166, "grad_norm": 12.870365142822266, "learning_rate": 4.895159953588596e-05, "loss": 4.7267, "step": 1600 }, { "epoch": 0.3636160214813157, "grad_norm": 10.991371154785156, "learning_rate": 4.8848002652080226e-05, "loss": 4.6822, "step": 1625 }, { "epoch": 0.36921011411948984, "grad_norm": 14.67126178741455, "learning_rate": 4.874440576827449e-05, "loss": 4.3277, "step": 1650 }, { "epoch": 0.3748042067576639, "grad_norm": 13.142556190490723, "learning_rate": 4.864080888446876e-05, "loss": 4.513, "step": 1675 }, { "epoch": 0.38039829939583797, "grad_norm": 12.647134780883789, "learning_rate": 4.853721200066302e-05, "loss": 4.6069, "step": 1700 }, { "epoch": 0.3859923920340121, "grad_norm": 12.48798942565918, "learning_rate": 4.8433615116857286e-05, "loss": 4.846, "step": 1725 }, { "epoch": 0.39158648467218615, "grad_norm": 10.90004825592041, "learning_rate": 4.833001823305155e-05, "loss": 4.4091, "step": 1750 }, { "epoch": 0.39718057731036027, "grad_norm": 14.13915729522705, "learning_rate": 4.822642134924581e-05, "loss": 4.4192, "step": 1775 }, { "epoch": 0.40277466994853434, "grad_norm": 13.102397918701172, "learning_rate": 4.812282446544008e-05, "loss": 4.5444, "step": 1800 }, { "epoch": 0.40836876258670846, "grad_norm": 12.227831840515137, "learning_rate": 4.8019227581634346e-05, "loss": 4.712, "step": 1825 }, { "epoch": 0.4139628552248825, "grad_norm": 15.215840339660645, "learning_rate": 4.791563069782861e-05, "loss": 4.6632, "step": 1850 }, { "epoch": 0.41955694786305664, "grad_norm": 18.023183822631836, "learning_rate": 4.781203381402288e-05, "loss": 4.1778, "step": 1875 }, { "epoch": 0.4251510405012307, "grad_norm": 13.230670928955078, "learning_rate": 4.770843693021714e-05, "loss": 4.4197, "step": 1900 }, { "epoch": 0.43074513313940477, "grad_norm": 14.070335388183594, "learning_rate": 4.7604840046411407e-05, "loss": 4.6564, "step": 1925 }, { "epoch": 0.4363392257775789, "grad_norm": 20.313472747802734, "learning_rate": 4.750124316260567e-05, "loss": 4.5667, "step": 1950 }, { "epoch": 0.44193331841575295, "grad_norm": 15.953713417053223, "learning_rate": 4.739764627879993e-05, "loss": 4.5235, "step": 1975 }, { "epoch": 0.44752741105392707, "grad_norm": 10.95453929901123, "learning_rate": 4.7294049394994197e-05, "loss": 4.5047, "step": 2000 }, { "epoch": 0.45312150369210114, "grad_norm": 15.660309791564941, "learning_rate": 4.719045251118847e-05, "loss": 4.1537, "step": 2025 }, { "epoch": 0.45871559633027525, "grad_norm": 11.03080940246582, "learning_rate": 4.708685562738273e-05, "loss": 4.3448, "step": 2050 }, { "epoch": 0.4643096889684493, "grad_norm": 10.359949111938477, "learning_rate": 4.698325874357699e-05, "loss": 4.3634, "step": 2075 }, { "epoch": 0.4699037816066234, "grad_norm": 25.401718139648438, "learning_rate": 4.6879661859771263e-05, "loss": 4.2735, "step": 2100 }, { "epoch": 0.4754978742447975, "grad_norm": 14.679646492004395, "learning_rate": 4.677606497596553e-05, "loss": 4.4465, "step": 2125 }, { "epoch": 0.48109196688297157, "grad_norm": 11.817214965820312, "learning_rate": 4.667246809215979e-05, "loss": 4.629, "step": 2150 }, { "epoch": 0.4866860595211457, "grad_norm": 10.622258186340332, "learning_rate": 4.656887120835405e-05, "loss": 4.4189, "step": 2175 }, { "epoch": 0.49228015215931975, "grad_norm": 15.188981056213379, "learning_rate": 4.646527432454832e-05, "loss": 4.2514, "step": 2200 }, { "epoch": 0.49787424479749387, "grad_norm": 14.326010704040527, "learning_rate": 4.636167744074259e-05, "loss": 4.3532, "step": 2225 }, { "epoch": 0.503468337435668, "grad_norm": 16.31020164489746, "learning_rate": 4.625808055693685e-05, "loss": 4.2333, "step": 2250 }, { "epoch": 0.509062430073842, "grad_norm": 18.346088409423828, "learning_rate": 4.6154483673131113e-05, "loss": 4.4483, "step": 2275 }, { "epoch": 0.5146565227120161, "grad_norm": 15.169132232666016, "learning_rate": 4.6050886789325384e-05, "loss": 4.3814, "step": 2300 }, { "epoch": 0.5202506153501902, "grad_norm": 10.011805534362793, "learning_rate": 4.594728990551964e-05, "loss": 4.6235, "step": 2325 }, { "epoch": 0.5258447079883642, "grad_norm": 11.23599624633789, "learning_rate": 4.584369302171391e-05, "loss": 4.5861, "step": 2350 }, { "epoch": 0.5314388006265384, "grad_norm": 10.484898567199707, "learning_rate": 4.5740096137908174e-05, "loss": 4.3894, "step": 2375 }, { "epoch": 0.5370328932647125, "grad_norm": 12.192333221435547, "learning_rate": 4.563649925410244e-05, "loss": 4.448, "step": 2400 }, { "epoch": 0.5426269859028866, "grad_norm": 15.25631332397461, "learning_rate": 4.553290237029671e-05, "loss": 4.3787, "step": 2425 }, { "epoch": 0.5482210785410606, "grad_norm": 10.27658748626709, "learning_rate": 4.542930548649097e-05, "loss": 4.4333, "step": 2450 }, { "epoch": 0.5538151711792347, "grad_norm": 16.127513885498047, "learning_rate": 4.5325708602685234e-05, "loss": 4.1792, "step": 2475 }, { "epoch": 0.5594092638174089, "grad_norm": 10.214879035949707, "learning_rate": 4.5222111718879504e-05, "loss": 4.6276, "step": 2500 }, { "epoch": 0.5650033564555829, "grad_norm": 10.60392951965332, "learning_rate": 4.511851483507376e-05, "loss": 4.5925, "step": 2525 }, { "epoch": 0.570597449093757, "grad_norm": 11.073285102844238, "learning_rate": 4.5014917951268024e-05, "loss": 4.4202, "step": 2550 }, { "epoch": 0.5761915417319311, "grad_norm": 14.709641456604004, "learning_rate": 4.4911321067462294e-05, "loss": 4.2758, "step": 2575 }, { "epoch": 0.5817856343701052, "grad_norm": 12.901021957397461, "learning_rate": 4.480772418365656e-05, "loss": 4.3067, "step": 2600 }, { "epoch": 0.5873797270082792, "grad_norm": 11.27915096282959, "learning_rate": 4.470412729985082e-05, "loss": 4.0111, "step": 2625 }, { "epoch": 0.5929738196464533, "grad_norm": 15.488706588745117, "learning_rate": 4.460053041604509e-05, "loss": 4.2671, "step": 2650 }, { "epoch": 0.5985679122846275, "grad_norm": 13.603848457336426, "learning_rate": 4.4496933532239354e-05, "loss": 4.2589, "step": 2675 }, { "epoch": 0.6041620049228015, "grad_norm": 12.827610969543457, "learning_rate": 4.439333664843362e-05, "loss": 4.6476, "step": 2700 }, { "epoch": 0.6097560975609756, "grad_norm": 16.806106567382812, "learning_rate": 4.428973976462788e-05, "loss": 4.6188, "step": 2725 }, { "epoch": 0.6153501901991497, "grad_norm": 14.598794937133789, "learning_rate": 4.4186142880822144e-05, "loss": 4.4195, "step": 2750 }, { "epoch": 0.6209442828373238, "grad_norm": 10.380790710449219, "learning_rate": 4.4082545997016414e-05, "loss": 4.4634, "step": 2775 }, { "epoch": 0.6265383754754978, "grad_norm": 12.62149715423584, "learning_rate": 4.397894911321068e-05, "loss": 4.4751, "step": 2800 }, { "epoch": 0.632132468113672, "grad_norm": 10.467231750488281, "learning_rate": 4.387535222940494e-05, "loss": 4.2898, "step": 2825 }, { "epoch": 0.6377265607518461, "grad_norm": 12.244780540466309, "learning_rate": 4.377175534559921e-05, "loss": 4.3242, "step": 2850 }, { "epoch": 0.6433206533900201, "grad_norm": 12.39667797088623, "learning_rate": 4.3668158461793474e-05, "loss": 4.2754, "step": 2875 }, { "epoch": 0.6489147460281942, "grad_norm": 12.747861862182617, "learning_rate": 4.356456157798773e-05, "loss": 4.286, "step": 2900 }, { "epoch": 0.6545088386663683, "grad_norm": 22.809650421142578, "learning_rate": 4.3460964694182e-05, "loss": 4.3482, "step": 2925 }, { "epoch": 0.6601029313045425, "grad_norm": 10.659783363342285, "learning_rate": 4.3357367810376264e-05, "loss": 4.2066, "step": 2950 }, { "epoch": 0.6656970239427165, "grad_norm": 14.72028636932373, "learning_rate": 4.325377092657053e-05, "loss": 4.4523, "step": 2975 }, { "epoch": 0.6712911165808906, "grad_norm": 10.166138648986816, "learning_rate": 4.31501740427648e-05, "loss": 4.3615, "step": 3000 }, { "epoch": 0.6768852092190647, "grad_norm": 12.992719650268555, "learning_rate": 4.304657715895906e-05, "loss": 4.5061, "step": 3025 }, { "epoch": 0.6824793018572387, "grad_norm": 11.16627311706543, "learning_rate": 4.2942980275153324e-05, "loss": 4.5588, "step": 3050 }, { "epoch": 0.6880733944954128, "grad_norm": 10.317902565002441, "learning_rate": 4.2839383391347594e-05, "loss": 4.3462, "step": 3075 }, { "epoch": 0.693667487133587, "grad_norm": 12.662385940551758, "learning_rate": 4.273578650754185e-05, "loss": 4.5361, "step": 3100 }, { "epoch": 0.6992615797717611, "grad_norm": 16.921144485473633, "learning_rate": 4.263218962373612e-05, "loss": 4.2344, "step": 3125 }, { "epoch": 0.7048556724099351, "grad_norm": 17.665006637573242, "learning_rate": 4.2528592739930384e-05, "loss": 4.2197, "step": 3150 }, { "epoch": 0.7104497650481092, "grad_norm": 9.232120513916016, "learning_rate": 4.242499585612465e-05, "loss": 4.511, "step": 3175 }, { "epoch": 0.7160438576862833, "grad_norm": 12.514689445495605, "learning_rate": 4.232139897231892e-05, "loss": 4.5251, "step": 3200 }, { "epoch": 0.7216379503244573, "grad_norm": 11.398234367370605, "learning_rate": 4.221780208851318e-05, "loss": 4.1084, "step": 3225 }, { "epoch": 0.7272320429626314, "grad_norm": 8.08095932006836, "learning_rate": 4.2114205204707444e-05, "loss": 4.1601, "step": 3250 }, { "epoch": 0.7328261356008056, "grad_norm": 12.109641075134277, "learning_rate": 4.2010608320901714e-05, "loss": 4.274, "step": 3275 }, { "epoch": 0.7384202282389797, "grad_norm": 10.819121360778809, "learning_rate": 4.190701143709597e-05, "loss": 4.5686, "step": 3300 }, { "epoch": 0.7440143208771537, "grad_norm": 11.090829849243164, "learning_rate": 4.180341455329024e-05, "loss": 4.5401, "step": 3325 }, { "epoch": 0.7496084135153278, "grad_norm": 11.24759578704834, "learning_rate": 4.1699817669484504e-05, "loss": 4.4797, "step": 3350 }, { "epoch": 0.7552025061535019, "grad_norm": 10.916013717651367, "learning_rate": 4.159622078567877e-05, "loss": 4.1258, "step": 3375 }, { "epoch": 0.7607965987916759, "grad_norm": 11.953822135925293, "learning_rate": 4.149262390187304e-05, "loss": 4.3313, "step": 3400 }, { "epoch": 0.7663906914298501, "grad_norm": 16.665861129760742, "learning_rate": 4.13890270180673e-05, "loss": 4.2789, "step": 3425 }, { "epoch": 0.7719847840680242, "grad_norm": 11.539497375488281, "learning_rate": 4.1285430134261564e-05, "loss": 4.5436, "step": 3450 }, { "epoch": 0.7775788767061983, "grad_norm": 11.955995559692383, "learning_rate": 4.118183325045583e-05, "loss": 4.1149, "step": 3475 }, { "epoch": 0.7831729693443723, "grad_norm": 15.087596893310547, "learning_rate": 4.107823636665009e-05, "loss": 4.3523, "step": 3500 }, { "epoch": 0.7887670619825464, "grad_norm": 14.733497619628906, "learning_rate": 4.0974639482844354e-05, "loss": 4.2913, "step": 3525 }, { "epoch": 0.7943611546207205, "grad_norm": 10.125676155090332, "learning_rate": 4.0871042599038624e-05, "loss": 4.3078, "step": 3550 }, { "epoch": 0.7999552472588946, "grad_norm": 11.222993850708008, "learning_rate": 4.076744571523289e-05, "loss": 4.4803, "step": 3575 }, { "epoch": 0.8055493398970687, "grad_norm": 10.871453285217285, "learning_rate": 4.066384883142715e-05, "loss": 4.5347, "step": 3600 }, { "epoch": 0.8111434325352428, "grad_norm": 10.571527481079102, "learning_rate": 4.056025194762142e-05, "loss": 4.1658, "step": 3625 }, { "epoch": 0.8167375251734169, "grad_norm": 12.036689758300781, "learning_rate": 4.0456655063815685e-05, "loss": 4.3284, "step": 3650 }, { "epoch": 0.8223316178115909, "grad_norm": 13.614919662475586, "learning_rate": 4.035305818000995e-05, "loss": 4.1228, "step": 3675 }, { "epoch": 0.827925710449765, "grad_norm": 12.302602767944336, "learning_rate": 4.024946129620421e-05, "loss": 4.0459, "step": 3700 }, { "epoch": 0.8335198030879392, "grad_norm": 13.102405548095703, "learning_rate": 4.0145864412398474e-05, "loss": 4.4641, "step": 3725 }, { "epoch": 0.8391138957261133, "grad_norm": 12.71800422668457, "learning_rate": 4.0042267528592745e-05, "loss": 4.206, "step": 3750 }, { "epoch": 0.8447079883642873, "grad_norm": 13.687782287597656, "learning_rate": 3.993867064478701e-05, "loss": 4.388, "step": 3775 }, { "epoch": 0.8503020810024614, "grad_norm": 16.609664916992188, "learning_rate": 3.983507376098127e-05, "loss": 4.1664, "step": 3800 }, { "epoch": 0.8558961736406355, "grad_norm": 16.731786727905273, "learning_rate": 3.973147687717554e-05, "loss": 4.3956, "step": 3825 }, { "epoch": 0.8614902662788095, "grad_norm": 17.152843475341797, "learning_rate": 3.96278799933698e-05, "loss": 4.3338, "step": 3850 }, { "epoch": 0.8670843589169837, "grad_norm": 14.199774742126465, "learning_rate": 3.952428310956406e-05, "loss": 4.4609, "step": 3875 }, { "epoch": 0.8726784515551578, "grad_norm": 11.605820655822754, "learning_rate": 3.942068622575833e-05, "loss": 4.1688, "step": 3900 }, { "epoch": 0.8782725441933319, "grad_norm": 11.608319282531738, "learning_rate": 3.9317089341952595e-05, "loss": 4.2122, "step": 3925 }, { "epoch": 0.8838666368315059, "grad_norm": 12.212512016296387, "learning_rate": 3.921349245814686e-05, "loss": 4.2482, "step": 3950 }, { "epoch": 0.88946072946968, "grad_norm": 12.425273895263672, "learning_rate": 3.910989557434113e-05, "loss": 4.5128, "step": 3975 }, { "epoch": 0.8950548221078541, "grad_norm": 14.292542457580566, "learning_rate": 3.900629869053539e-05, "loss": 4.5421, "step": 4000 }, { "epoch": 0.9006489147460282, "grad_norm": 9.911199569702148, "learning_rate": 3.8902701806729655e-05, "loss": 4.3237, "step": 4025 }, { "epoch": 0.9062430073842023, "grad_norm": 6.303875923156738, "learning_rate": 3.879910492292392e-05, "loss": 4.195, "step": 4050 }, { "epoch": 0.9118371000223764, "grad_norm": 8.433326721191406, "learning_rate": 3.869550803911818e-05, "loss": 4.1758, "step": 4075 }, { "epoch": 0.9174311926605505, "grad_norm": 12.792257308959961, "learning_rate": 3.859191115531245e-05, "loss": 4.1538, "step": 4100 }, { "epoch": 0.9230252852987245, "grad_norm": 10.63804817199707, "learning_rate": 3.8488314271506715e-05, "loss": 4.2415, "step": 4125 }, { "epoch": 0.9286193779368986, "grad_norm": 10.244176864624023, "learning_rate": 3.838471738770098e-05, "loss": 4.624, "step": 4150 }, { "epoch": 0.9342134705750728, "grad_norm": 14.590502738952637, "learning_rate": 3.828112050389525e-05, "loss": 4.261, "step": 4175 }, { "epoch": 0.9398075632132468, "grad_norm": 17.149826049804688, "learning_rate": 3.817752362008951e-05, "loss": 4.0573, "step": 4200 }, { "epoch": 0.9454016558514209, "grad_norm": 10.837606430053711, "learning_rate": 3.8073926736283775e-05, "loss": 4.2502, "step": 4225 }, { "epoch": 0.950995748489595, "grad_norm": 13.960970878601074, "learning_rate": 3.797032985247804e-05, "loss": 4.1431, "step": 4250 }, { "epoch": 0.9565898411277691, "grad_norm": 10.603372573852539, "learning_rate": 3.78667329686723e-05, "loss": 4.2805, "step": 4275 }, { "epoch": 0.9621839337659431, "grad_norm": 14.068360328674316, "learning_rate": 3.776313608486657e-05, "loss": 4.2771, "step": 4300 }, { "epoch": 0.9677780264041173, "grad_norm": 12.487285614013672, "learning_rate": 3.7659539201060835e-05, "loss": 4.4749, "step": 4325 }, { "epoch": 0.9733721190422914, "grad_norm": 11.214025497436523, "learning_rate": 3.75559423172551e-05, "loss": 4.1715, "step": 4350 }, { "epoch": 0.9789662116804654, "grad_norm": 12.117270469665527, "learning_rate": 3.745234543344937e-05, "loss": 4.1669, "step": 4375 }, { "epoch": 0.9845603043186395, "grad_norm": 7.657718181610107, "learning_rate": 3.734874854964363e-05, "loss": 4.1935, "step": 4400 }, { "epoch": 0.9901543969568136, "grad_norm": 12.381430625915527, "learning_rate": 3.724929554119012e-05, "loss": 4.3682, "step": 4425 }, { "epoch": 0.9957484895949877, "grad_norm": 12.749862670898438, "learning_rate": 3.7145698657384385e-05, "loss": 4.2554, "step": 4450 }, { "epoch": 1.0, "eval_gen_len": 67.1128, "eval_loss": 4.290805816650391, "eval_rouge1": 26.1327, "eval_rouge2": 10.0836, "eval_rougeL": 24.9862, "eval_rougeLsum": 25.321, "eval_runtime": 1004.8822, "eval_samples_per_second": 1.112, "eval_steps_per_second": 0.279, "step": 4469 }, { "epoch": 1.0013425822331619, "grad_norm": 15.127477645874023, "learning_rate": 3.7042101773578655e-05, "loss": 3.9898, "step": 4475 }, { "epoch": 1.006936674871336, "grad_norm": 10.354872703552246, "learning_rate": 3.693850488977292e-05, "loss": 3.623, "step": 4500 }, { "epoch": 1.0125307675095099, "grad_norm": 13.131593704223633, "learning_rate": 3.683490800596718e-05, "loss": 3.7952, "step": 4525 }, { "epoch": 1.018124860147684, "grad_norm": 10.636340141296387, "learning_rate": 3.673131112216145e-05, "loss": 3.4462, "step": 4550 }, { "epoch": 1.0237189527858581, "grad_norm": 14.03700065612793, "learning_rate": 3.662771423835571e-05, "loss": 3.4108, "step": 4575 }, { "epoch": 1.0293130454240322, "grad_norm": 21.94048309326172, "learning_rate": 3.652411735454997e-05, "loss": 3.7664, "step": 4600 }, { "epoch": 1.0349071380622064, "grad_norm": 11.382323265075684, "learning_rate": 3.642052047074424e-05, "loss": 3.7189, "step": 4625 }, { "epoch": 1.0405012307003805, "grad_norm": 11.167036056518555, "learning_rate": 3.6316923586938505e-05, "loss": 3.7947, "step": 4650 }, { "epoch": 1.0460953233385544, "grad_norm": 13.024956703186035, "learning_rate": 3.621332670313277e-05, "loss": 3.6083, "step": 4675 }, { "epoch": 1.0516894159767285, "grad_norm": 11.757680892944336, "learning_rate": 3.610972981932704e-05, "loss": 3.5998, "step": 4700 }, { "epoch": 1.0572835086149026, "grad_norm": 14.893111228942871, "learning_rate": 3.60061329355213e-05, "loss": 3.6173, "step": 4725 }, { "epoch": 1.0628776012530767, "grad_norm": 9.222747802734375, "learning_rate": 3.5902536051715565e-05, "loss": 3.6453, "step": 4750 }, { "epoch": 1.0684716938912509, "grad_norm": 25.488880157470703, "learning_rate": 3.579893916790983e-05, "loss": 3.3486, "step": 4775 }, { "epoch": 1.074065786529425, "grad_norm": 10.05694580078125, "learning_rate": 3.569534228410409e-05, "loss": 3.7705, "step": 4800 }, { "epoch": 1.079659879167599, "grad_norm": 12.402889251708984, "learning_rate": 3.559174540029836e-05, "loss": 3.3739, "step": 4825 }, { "epoch": 1.085253971805773, "grad_norm": 10.890093803405762, "learning_rate": 3.5488148516492625e-05, "loss": 3.7975, "step": 4850 }, { "epoch": 1.090848064443947, "grad_norm": 12.410653114318848, "learning_rate": 3.538455163268689e-05, "loss": 3.5483, "step": 4875 }, { "epoch": 1.0964421570821212, "grad_norm": 11.636606216430664, "learning_rate": 3.528095474888116e-05, "loss": 3.5182, "step": 4900 }, { "epoch": 1.1020362497202953, "grad_norm": 14.367986679077148, "learning_rate": 3.517735786507542e-05, "loss": 3.9954, "step": 4925 }, { "epoch": 1.1076303423584695, "grad_norm": 10.753607749938965, "learning_rate": 3.5073760981269685e-05, "loss": 3.8582, "step": 4950 }, { "epoch": 1.1132244349966436, "grad_norm": 9.407801628112793, "learning_rate": 3.497016409746395e-05, "loss": 3.6181, "step": 4975 }, { "epoch": 1.1188185276348177, "grad_norm": 9.98642349243164, "learning_rate": 3.486656721365821e-05, "loss": 3.5989, "step": 5000 }, { "epoch": 1.1244126202729916, "grad_norm": 9.880094528198242, "learning_rate": 3.476297032985248e-05, "loss": 3.658, "step": 5025 }, { "epoch": 1.1300067129111657, "grad_norm": 14.001792907714844, "learning_rate": 3.4659373446046745e-05, "loss": 3.7073, "step": 5050 }, { "epoch": 1.1356008055493398, "grad_norm": 18.54832649230957, "learning_rate": 3.455577656224101e-05, "loss": 3.7806, "step": 5075 }, { "epoch": 1.141194898187514, "grad_norm": 9.804744720458984, "learning_rate": 3.445217967843528e-05, "loss": 3.6671, "step": 5100 }, { "epoch": 1.146788990825688, "grad_norm": 8.401939392089844, "learning_rate": 3.434858279462954e-05, "loss": 3.8566, "step": 5125 }, { "epoch": 1.1523830834638622, "grad_norm": 10.120752334594727, "learning_rate": 3.42449859108238e-05, "loss": 3.8143, "step": 5150 }, { "epoch": 1.1579771761020363, "grad_norm": 16.10240364074707, "learning_rate": 3.414138902701807e-05, "loss": 3.7451, "step": 5175 }, { "epoch": 1.1635712687402102, "grad_norm": 10.377949714660645, "learning_rate": 3.403779214321233e-05, "loss": 3.6082, "step": 5200 }, { "epoch": 1.1691653613783843, "grad_norm": 10.826866149902344, "learning_rate": 3.3934195259406595e-05, "loss": 3.8001, "step": 5225 }, { "epoch": 1.1747594540165585, "grad_norm": 10.02441120147705, "learning_rate": 3.3830598375600866e-05, "loss": 3.6624, "step": 5250 }, { "epoch": 1.1803535466547326, "grad_norm": 15.683877944946289, "learning_rate": 3.372700149179513e-05, "loss": 3.5834, "step": 5275 }, { "epoch": 1.1859476392929067, "grad_norm": 11.696283340454102, "learning_rate": 3.362340460798939e-05, "loss": 3.6069, "step": 5300 }, { "epoch": 1.1915417319310808, "grad_norm": 13.27725601196289, "learning_rate": 3.351980772418366e-05, "loss": 3.6614, "step": 5325 }, { "epoch": 1.197135824569255, "grad_norm": 11.811793327331543, "learning_rate": 3.341621084037792e-05, "loss": 3.6186, "step": 5350 }, { "epoch": 1.2027299172074288, "grad_norm": 30.400972366333008, "learning_rate": 3.331261395657219e-05, "loss": 3.6704, "step": 5375 }, { "epoch": 1.208324009845603, "grad_norm": 11.845870018005371, "learning_rate": 3.320901707276645e-05, "loss": 3.6688, "step": 5400 }, { "epoch": 1.213918102483777, "grad_norm": 14.447372436523438, "learning_rate": 3.3105420188960716e-05, "loss": 3.8172, "step": 5425 }, { "epoch": 1.2195121951219512, "grad_norm": 9.492889404296875, "learning_rate": 3.3001823305154986e-05, "loss": 3.5741, "step": 5450 }, { "epoch": 1.2251062877601253, "grad_norm": 12.105642318725586, "learning_rate": 3.289822642134925e-05, "loss": 3.5402, "step": 5475 }, { "epoch": 1.2307003803982994, "grad_norm": 7.518635272979736, "learning_rate": 3.279462953754351e-05, "loss": 3.7333, "step": 5500 }, { "epoch": 1.2362944730364736, "grad_norm": 11.485749244689941, "learning_rate": 3.269103265373778e-05, "loss": 3.8156, "step": 5525 }, { "epoch": 1.2418885656746474, "grad_norm": 11.726677894592285, "learning_rate": 3.258743576993204e-05, "loss": 3.937, "step": 5550 }, { "epoch": 1.2474826583128216, "grad_norm": 13.454861640930176, "learning_rate": 3.24838388861263e-05, "loss": 3.5792, "step": 5575 }, { "epoch": 1.2530767509509957, "grad_norm": 17.696428298950195, "learning_rate": 3.238024200232057e-05, "loss": 3.5828, "step": 5600 }, { "epoch": 1.2586708435891698, "grad_norm": 12.128670692443848, "learning_rate": 3.2276645118514836e-05, "loss": 3.6379, "step": 5625 }, { "epoch": 1.264264936227344, "grad_norm": 11.507698059082031, "learning_rate": 3.21730482347091e-05, "loss": 3.5192, "step": 5650 }, { "epoch": 1.269859028865518, "grad_norm": 11.207321166992188, "learning_rate": 3.206945135090337e-05, "loss": 3.885, "step": 5675 }, { "epoch": 1.2754531215036922, "grad_norm": 9.954567909240723, "learning_rate": 3.196585446709763e-05, "loss": 3.6138, "step": 5700 }, { "epoch": 1.281047214141866, "grad_norm": 9.7274751663208, "learning_rate": 3.1862257583291896e-05, "loss": 3.7395, "step": 5725 }, { "epoch": 1.2866413067800404, "grad_norm": 10.094833374023438, "learning_rate": 3.175866069948616e-05, "loss": 3.6801, "step": 5750 }, { "epoch": 1.2922353994182143, "grad_norm": 12.403266906738281, "learning_rate": 3.165506381568042e-05, "loss": 3.6891, "step": 5775 }, { "epoch": 1.2978294920563884, "grad_norm": 13.569632530212402, "learning_rate": 3.155146693187469e-05, "loss": 3.392, "step": 5800 }, { "epoch": 1.3034235846945625, "grad_norm": 10.21789836883545, "learning_rate": 3.1447870048068956e-05, "loss": 3.4392, "step": 5825 }, { "epoch": 1.3090176773327367, "grad_norm": 9.875311851501465, "learning_rate": 3.134427316426322e-05, "loss": 3.6576, "step": 5850 }, { "epoch": 1.3146117699709108, "grad_norm": 13.931588172912598, "learning_rate": 3.124067628045749e-05, "loss": 3.869, "step": 5875 }, { "epoch": 1.3202058626090847, "grad_norm": 9.532690048217773, "learning_rate": 3.113707939665175e-05, "loss": 3.7711, "step": 5900 }, { "epoch": 1.325799955247259, "grad_norm": 9.777695655822754, "learning_rate": 3.1033482512846016e-05, "loss": 3.4998, "step": 5925 }, { "epoch": 1.331394047885433, "grad_norm": 10.633079528808594, "learning_rate": 3.092988562904028e-05, "loss": 3.6, "step": 5950 }, { "epoch": 1.336988140523607, "grad_norm": 11.918797492980957, "learning_rate": 3.082628874523454e-05, "loss": 3.6084, "step": 5975 }, { "epoch": 1.3425822331617812, "grad_norm": 11.27762222290039, "learning_rate": 3.072269186142881e-05, "loss": 3.6122, "step": 6000 }, { "epoch": 1.3481763257999553, "grad_norm": 18.506898880004883, "learning_rate": 3.0619094977623076e-05, "loss": 3.2751, "step": 6025 }, { "epoch": 1.3537704184381294, "grad_norm": 21.525028228759766, "learning_rate": 3.051549809381734e-05, "loss": 3.4856, "step": 6050 }, { "epoch": 1.3593645110763033, "grad_norm": 13.029548645019531, "learning_rate": 3.0411901210011606e-05, "loss": 3.5935, "step": 6075 }, { "epoch": 1.3649586037144776, "grad_norm": 9.536824226379395, "learning_rate": 3.0308304326205873e-05, "loss": 3.569, "step": 6100 }, { "epoch": 1.3705526963526515, "grad_norm": 11.298256874084473, "learning_rate": 3.0204707442400133e-05, "loss": 3.8319, "step": 6125 }, { "epoch": 1.3761467889908257, "grad_norm": 8.461627006530762, "learning_rate": 3.0101110558594396e-05, "loss": 3.7151, "step": 6150 }, { "epoch": 1.3817408816289998, "grad_norm": 8.925524711608887, "learning_rate": 2.9997513674788663e-05, "loss": 3.5296, "step": 6175 }, { "epoch": 1.387334974267174, "grad_norm": 12.109110832214355, "learning_rate": 2.989391679098293e-05, "loss": 3.4691, "step": 6200 }, { "epoch": 1.392929066905348, "grad_norm": 9.915245056152344, "learning_rate": 2.9790319907177193e-05, "loss": 3.5228, "step": 6225 }, { "epoch": 1.398523159543522, "grad_norm": 12.354578018188477, "learning_rate": 2.968672302337146e-05, "loss": 3.5543, "step": 6250 }, { "epoch": 1.4041172521816963, "grad_norm": 13.05074405670166, "learning_rate": 2.9583126139565726e-05, "loss": 3.5762, "step": 6275 }, { "epoch": 1.4097113448198701, "grad_norm": 9.809946060180664, "learning_rate": 2.9479529255759986e-05, "loss": 3.4119, "step": 6300 }, { "epoch": 1.4153054374580443, "grad_norm": 10.61130142211914, "learning_rate": 2.937593237195425e-05, "loss": 3.5493, "step": 6325 }, { "epoch": 1.4208995300962184, "grad_norm": 8.541769027709961, "learning_rate": 2.9272335488148516e-05, "loss": 3.607, "step": 6350 }, { "epoch": 1.4264936227343925, "grad_norm": 14.672987937927246, "learning_rate": 2.9168738604342783e-05, "loss": 3.3681, "step": 6375 }, { "epoch": 1.4320877153725666, "grad_norm": 16.417062759399414, "learning_rate": 2.9065141720537046e-05, "loss": 3.581, "step": 6400 }, { "epoch": 1.4376818080107405, "grad_norm": 11.234773635864258, "learning_rate": 2.8961544836731313e-05, "loss": 3.8083, "step": 6425 }, { "epoch": 1.4432759006489149, "grad_norm": 8.443843841552734, "learning_rate": 2.885794795292558e-05, "loss": 3.4413, "step": 6450 }, { "epoch": 1.4488699932870888, "grad_norm": 11.358126640319824, "learning_rate": 2.8754351069119843e-05, "loss": 3.5337, "step": 6475 }, { "epoch": 1.4544640859252629, "grad_norm": 15.18321418762207, "learning_rate": 2.8650754185314106e-05, "loss": 3.7784, "step": 6500 }, { "epoch": 1.460058178563437, "grad_norm": 10.894759178161621, "learning_rate": 2.854715730150837e-05, "loss": 3.736, "step": 6525 }, { "epoch": 1.4656522712016111, "grad_norm": 8.780854225158691, "learning_rate": 2.8443560417702636e-05, "loss": 3.5959, "step": 6550 }, { "epoch": 1.4712463638397852, "grad_norm": 16.36128044128418, "learning_rate": 2.8339963533896903e-05, "loss": 3.6979, "step": 6575 }, { "epoch": 1.4768404564779591, "grad_norm": 10.243796348571777, "learning_rate": 2.8236366650091166e-05, "loss": 3.6813, "step": 6600 }, { "epoch": 1.4824345491161335, "grad_norm": 13.104743003845215, "learning_rate": 2.8132769766285433e-05, "loss": 3.6853, "step": 6625 }, { "epoch": 1.4880286417543074, "grad_norm": 11.391397476196289, "learning_rate": 2.80291728824797e-05, "loss": 3.5911, "step": 6650 }, { "epoch": 1.4936227343924815, "grad_norm": 11.6659574508667, "learning_rate": 2.792557599867396e-05, "loss": 3.5579, "step": 6675 }, { "epoch": 1.4992168270306556, "grad_norm": 11.695647239685059, "learning_rate": 2.7821979114868223e-05, "loss": 3.5243, "step": 6700 }, { "epoch": 1.5048109196688297, "grad_norm": 9.701094627380371, "learning_rate": 2.771838223106249e-05, "loss": 3.6699, "step": 6725 }, { "epoch": 1.5104050123070039, "grad_norm": 15.949247360229492, "learning_rate": 2.7614785347256757e-05, "loss": 3.7613, "step": 6750 }, { "epoch": 1.5159991049451778, "grad_norm": 13.379142761230469, "learning_rate": 2.751118846345102e-05, "loss": 3.7097, "step": 6775 }, { "epoch": 1.521593197583352, "grad_norm": 10.693124771118164, "learning_rate": 2.7407591579645287e-05, "loss": 3.7779, "step": 6800 }, { "epoch": 1.527187290221526, "grad_norm": 7.9651312828063965, "learning_rate": 2.7303994695839553e-05, "loss": 3.6414, "step": 6825 }, { "epoch": 1.5327813828597001, "grad_norm": 11.157812118530273, "learning_rate": 2.7200397812033817e-05, "loss": 3.6576, "step": 6850 }, { "epoch": 1.5383754754978742, "grad_norm": 12.323993682861328, "learning_rate": 2.7096800928228077e-05, "loss": 3.4723, "step": 6875 }, { "epoch": 1.5439695681360484, "grad_norm": 10.302526473999023, "learning_rate": 2.6993204044422343e-05, "loss": 3.7213, "step": 6900 }, { "epoch": 1.5495636607742225, "grad_norm": 10.622782707214355, "learning_rate": 2.688960716061661e-05, "loss": 3.582, "step": 6925 }, { "epoch": 1.5551577534123964, "grad_norm": 12.592206001281738, "learning_rate": 2.6786010276810873e-05, "loss": 3.524, "step": 6950 }, { "epoch": 1.5607518460505707, "grad_norm": 10.893891334533691, "learning_rate": 2.668241339300514e-05, "loss": 3.4272, "step": 6975 }, { "epoch": 1.5663459386887446, "grad_norm": 11.728677749633789, "learning_rate": 2.6578816509199407e-05, "loss": 3.5618, "step": 7000 }, { "epoch": 1.5719400313269187, "grad_norm": 14.411259651184082, "learning_rate": 2.647521962539367e-05, "loss": 3.6041, "step": 7025 }, { "epoch": 1.5775341239650928, "grad_norm": 12.213258743286133, "learning_rate": 2.6371622741587937e-05, "loss": 3.5627, "step": 7050 }, { "epoch": 1.583128216603267, "grad_norm": 12.341785430908203, "learning_rate": 2.6268025857782197e-05, "loss": 3.631, "step": 7075 }, { "epoch": 1.588722309241441, "grad_norm": 22.097862243652344, "learning_rate": 2.6164428973976464e-05, "loss": 3.5977, "step": 7100 }, { "epoch": 1.594316401879615, "grad_norm": 11.355073928833008, "learning_rate": 2.6060832090170727e-05, "loss": 3.6045, "step": 7125 }, { "epoch": 1.5999104945177893, "grad_norm": 12.087318420410156, "learning_rate": 2.5957235206364994e-05, "loss": 3.5971, "step": 7150 }, { "epoch": 1.6055045871559632, "grad_norm": 12.888108253479004, "learning_rate": 2.585363832255926e-05, "loss": 3.4293, "step": 7175 }, { "epoch": 1.6110986797941373, "grad_norm": 11.495614051818848, "learning_rate": 2.5750041438753524e-05, "loss": 3.9471, "step": 7200 }, { "epoch": 1.6166927724323115, "grad_norm": 12.585895538330078, "learning_rate": 2.564644455494779e-05, "loss": 3.5671, "step": 7225 }, { "epoch": 1.6222868650704856, "grad_norm": 8.79129409790039, "learning_rate": 2.554284767114205e-05, "loss": 3.6453, "step": 7250 }, { "epoch": 1.6278809577086597, "grad_norm": 10.552870750427246, "learning_rate": 2.5439250787336317e-05, "loss": 3.5649, "step": 7275 }, { "epoch": 1.6334750503468336, "grad_norm": 15.649266242980957, "learning_rate": 2.533565390353058e-05, "loss": 3.5291, "step": 7300 }, { "epoch": 1.639069142985008, "grad_norm": 13.944864273071289, "learning_rate": 2.5232057019724847e-05, "loss": 3.3558, "step": 7325 }, { "epoch": 1.6446632356231818, "grad_norm": 14.092317581176758, "learning_rate": 2.5128460135919114e-05, "loss": 3.6211, "step": 7350 }, { "epoch": 1.650257328261356, "grad_norm": 12.18454647064209, "learning_rate": 2.5024863252113377e-05, "loss": 3.7732, "step": 7375 }, { "epoch": 1.65585142089953, "grad_norm": 10.779006958007812, "learning_rate": 2.492126636830764e-05, "loss": 3.6804, "step": 7400 }, { "epoch": 1.6614455135377042, "grad_norm": 10.292470932006836, "learning_rate": 2.4817669484501907e-05, "loss": 3.5316, "step": 7425 }, { "epoch": 1.6670396061758783, "grad_norm": 13.621623039245605, "learning_rate": 2.4714072600696174e-05, "loss": 3.4568, "step": 7450 }, { "epoch": 1.6726336988140522, "grad_norm": 11.503890991210938, "learning_rate": 2.4610475716890437e-05, "loss": 3.3641, "step": 7475 }, { "epoch": 1.6782277914522266, "grad_norm": 13.564423561096191, "learning_rate": 2.45068788330847e-05, "loss": 3.5669, "step": 7500 }, { "epoch": 1.6838218840904005, "grad_norm": 9.360475540161133, "learning_rate": 2.4403281949278967e-05, "loss": 3.6688, "step": 7525 }, { "epoch": 1.6894159767285746, "grad_norm": 13.366150856018066, "learning_rate": 2.4299685065473234e-05, "loss": 3.3199, "step": 7550 }, { "epoch": 1.6950100693667487, "grad_norm": 10.094856262207031, "learning_rate": 2.4196088181667497e-05, "loss": 3.4633, "step": 7575 }, { "epoch": 1.7006041620049228, "grad_norm": 9.134540557861328, "learning_rate": 2.409249129786176e-05, "loss": 3.6078, "step": 7600 }, { "epoch": 1.706198254643097, "grad_norm": 11.252095222473145, "learning_rate": 2.3988894414056027e-05, "loss": 3.7198, "step": 7625 }, { "epoch": 1.7117923472812708, "grad_norm": 6.201746940612793, "learning_rate": 2.388529753025029e-05, "loss": 3.5264, "step": 7650 }, { "epoch": 1.7173864399194452, "grad_norm": 12.331997871398926, "learning_rate": 2.3781700646444557e-05, "loss": 3.4556, "step": 7675 }, { "epoch": 1.722980532557619, "grad_norm": 12.859444618225098, "learning_rate": 2.367810376263882e-05, "loss": 3.4533, "step": 7700 }, { "epoch": 1.7285746251957932, "grad_norm": 13.243956565856934, "learning_rate": 2.3574506878833087e-05, "loss": 3.7355, "step": 7725 }, { "epoch": 1.7341687178339673, "grad_norm": 11.75436019897461, "learning_rate": 2.347090999502735e-05, "loss": 3.6108, "step": 7750 }, { "epoch": 1.7397628104721414, "grad_norm": 13.429585456848145, "learning_rate": 2.3367313111221614e-05, "loss": 3.7925, "step": 7775 }, { "epoch": 1.7453569031103155, "grad_norm": 11.077943801879883, "learning_rate": 2.326371622741588e-05, "loss": 3.6788, "step": 7800 }, { "epoch": 1.7509509957484894, "grad_norm": 7.700258731842041, "learning_rate": 2.3160119343610144e-05, "loss": 3.4562, "step": 7825 }, { "epoch": 1.7565450883866638, "grad_norm": 11.34974479675293, "learning_rate": 2.305652245980441e-05, "loss": 3.3855, "step": 7850 }, { "epoch": 1.7621391810248377, "grad_norm": 11.598840713500977, "learning_rate": 2.2952925575998674e-05, "loss": 3.6097, "step": 7875 }, { "epoch": 1.7677332736630118, "grad_norm": 11.69258975982666, "learning_rate": 2.284932869219294e-05, "loss": 3.6146, "step": 7900 }, { "epoch": 1.773327366301186, "grad_norm": 10.501328468322754, "learning_rate": 2.2745731808387204e-05, "loss": 3.6597, "step": 7925 }, { "epoch": 1.77892145893936, "grad_norm": 12.03715705871582, "learning_rate": 2.264213492458147e-05, "loss": 3.6975, "step": 7950 }, { "epoch": 1.7845155515775342, "grad_norm": 13.386404991149902, "learning_rate": 2.2538538040775734e-05, "loss": 3.6164, "step": 7975 }, { "epoch": 1.790109644215708, "grad_norm": 9.877335548400879, "learning_rate": 2.2434941156969997e-05, "loss": 3.614, "step": 8000 }, { "epoch": 1.7957037368538824, "grad_norm": 12.025654792785645, "learning_rate": 2.2335488148516494e-05, "loss": 3.743, "step": 8025 }, { "epoch": 1.8012978294920563, "grad_norm": 12.508997917175293, "learning_rate": 2.2231891264710757e-05, "loss": 3.7309, "step": 8050 }, { "epoch": 1.8068919221302304, "grad_norm": 12.994415283203125, "learning_rate": 2.2128294380905024e-05, "loss": 3.5284, "step": 8075 }, { "epoch": 1.8124860147684045, "grad_norm": 12.859843254089355, "learning_rate": 2.202469749709929e-05, "loss": 3.686, "step": 8100 }, { "epoch": 1.8180801074065787, "grad_norm": 15.91470718383789, "learning_rate": 2.192110061329355e-05, "loss": 3.3395, "step": 8125 }, { "epoch": 1.8236742000447528, "grad_norm": 10.755178451538086, "learning_rate": 2.1817503729487817e-05, "loss": 3.6407, "step": 8150 }, { "epoch": 1.8292682926829267, "grad_norm": 10.679194450378418, "learning_rate": 2.1713906845682084e-05, "loss": 3.5509, "step": 8175 }, { "epoch": 1.834862385321101, "grad_norm": 18.6633243560791, "learning_rate": 2.1610309961876348e-05, "loss": 3.5166, "step": 8200 }, { "epoch": 1.840456477959275, "grad_norm": 7.9321112632751465, "learning_rate": 2.150671307807061e-05, "loss": 3.36, "step": 8225 }, { "epoch": 1.8460505705974493, "grad_norm": 10.757131576538086, "learning_rate": 2.1403116194264878e-05, "loss": 3.4811, "step": 8250 }, { "epoch": 1.8516446632356232, "grad_norm": 15.632428169250488, "learning_rate": 2.1299519310459144e-05, "loss": 3.5502, "step": 8275 }, { "epoch": 1.8572387558737973, "grad_norm": 19.17276954650879, "learning_rate": 2.1195922426653408e-05, "loss": 3.5601, "step": 8300 }, { "epoch": 1.8628328485119714, "grad_norm": 11.047025680541992, "learning_rate": 2.109232554284767e-05, "loss": 3.6373, "step": 8325 }, { "epoch": 1.8684269411501453, "grad_norm": 15.699575424194336, "learning_rate": 2.0988728659041938e-05, "loss": 3.7378, "step": 8350 }, { "epoch": 1.8740210337883196, "grad_norm": 13.09723949432373, "learning_rate": 2.08851317752362e-05, "loss": 3.8717, "step": 8375 }, { "epoch": 1.8796151264264935, "grad_norm": 8.441289901733398, "learning_rate": 2.0781534891430468e-05, "loss": 3.2558, "step": 8400 }, { "epoch": 1.8852092190646679, "grad_norm": 12.046778678894043, "learning_rate": 2.067793800762473e-05, "loss": 3.3936, "step": 8425 }, { "epoch": 1.8908033117028418, "grad_norm": 10.983031272888184, "learning_rate": 2.0574341123818998e-05, "loss": 3.4321, "step": 8450 }, { "epoch": 1.896397404341016, "grad_norm": 12.274590492248535, "learning_rate": 2.047074424001326e-05, "loss": 3.4638, "step": 8475 }, { "epoch": 1.90199149697919, "grad_norm": 15.135939598083496, "learning_rate": 2.0367147356207524e-05, "loss": 3.5686, "step": 8500 }, { "epoch": 1.907585589617364, "grad_norm": 11.194721221923828, "learning_rate": 2.026355047240179e-05, "loss": 3.7067, "step": 8525 }, { "epoch": 1.9131796822555382, "grad_norm": 15.062312126159668, "learning_rate": 2.0159953588596054e-05, "loss": 3.5299, "step": 8550 }, { "epoch": 1.9187737748937121, "grad_norm": 12.282342910766602, "learning_rate": 2.005635670479032e-05, "loss": 3.5803, "step": 8575 }, { "epoch": 1.9243678675318865, "grad_norm": 14.33022689819336, "learning_rate": 1.9952759820984584e-05, "loss": 3.5835, "step": 8600 }, { "epoch": 1.9299619601700604, "grad_norm": 8.249588966369629, "learning_rate": 1.984916293717885e-05, "loss": 3.2694, "step": 8625 }, { "epoch": 1.9355560528082345, "grad_norm": 9.1649169921875, "learning_rate": 1.9745566053373115e-05, "loss": 3.6105, "step": 8650 }, { "epoch": 1.9411501454464086, "grad_norm": 8.755537986755371, "learning_rate": 1.964196916956738e-05, "loss": 3.5554, "step": 8675 }, { "epoch": 1.9467442380845825, "grad_norm": 8.148399353027344, "learning_rate": 1.9538372285761645e-05, "loss": 3.5073, "step": 8700 }, { "epoch": 1.9523383307227569, "grad_norm": 10.864067077636719, "learning_rate": 1.943477540195591e-05, "loss": 3.6951, "step": 8725 }, { "epoch": 1.9579324233609308, "grad_norm": 13.049738883972168, "learning_rate": 1.9331178518150175e-05, "loss": 3.5186, "step": 8750 }, { "epoch": 1.963526515999105, "grad_norm": 5.955536842346191, "learning_rate": 1.922758163434444e-05, "loss": 3.479, "step": 8775 }, { "epoch": 1.969120608637279, "grad_norm": 12.35295581817627, "learning_rate": 1.9123984750538705e-05, "loss": 3.5103, "step": 8800 }, { "epoch": 1.9747147012754531, "grad_norm": 9.945602416992188, "learning_rate": 1.9020387866732968e-05, "loss": 3.6919, "step": 8825 }, { "epoch": 1.9803087939136272, "grad_norm": 9.716672897338867, "learning_rate": 1.8916790982927235e-05, "loss": 3.7097, "step": 8850 }, { "epoch": 1.9859028865518011, "grad_norm": 7.3651041984558105, "learning_rate": 1.88131940991215e-05, "loss": 3.4111, "step": 8875 }, { "epoch": 1.9914969791899755, "grad_norm": 11.258004188537598, "learning_rate": 1.8709597215315765e-05, "loss": 3.4201, "step": 8900 }, { "epoch": 1.9970910718281494, "grad_norm": 14.962812423706055, "learning_rate": 1.8606000331510028e-05, "loss": 3.4142, "step": 8925 }, { "epoch": 2.0, "eval_gen_len": 61.8854, "eval_loss": 4.248934268951416, "eval_rouge1": 25.7685, "eval_rouge2": 9.8226, "eval_rougeL": 24.6426, "eval_rougeLsum": 24.9756, "eval_runtime": 700.259, "eval_samples_per_second": 1.595, "eval_steps_per_second": 0.4, "step": 8938 } ], "logging_steps": 25, "max_steps": 13407, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.01 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1151792816578560.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }