{ "best_metric": 0.4798590838909149, "best_model_checkpoint": "saves/CADICA_qwenvl_stenosis_detect_scale4/lora/sft/checkpoint-1100", "epoch": 0.28328611898017, "eval_steps": 50, "global_step": 1100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012876641771825909, "grad_norm": 13.245840411597928, "learning_rate": 2.9411764705882355e-06, "loss": 2.8889, "num_input_tokens_seen": 52840, "step": 5 }, { "epoch": 0.0025753283543651817, "grad_norm": 12.237619501215374, "learning_rate": 5.882352941176471e-06, "loss": 2.8165, "num_input_tokens_seen": 105528, "step": 10 }, { "epoch": 0.0038629925315477724, "grad_norm": 16.29688816410412, "learning_rate": 8.823529411764707e-06, "loss": 2.8363, "num_input_tokens_seen": 158768, "step": 15 }, { "epoch": 0.0051506567087303634, "grad_norm": 11.576419511120797, "learning_rate": 1.1764705882352942e-05, "loss": 2.6853, "num_input_tokens_seen": 210816, "step": 20 }, { "epoch": 0.006438320885912954, "grad_norm": 6.9672256792859, "learning_rate": 1.4705882352941177e-05, "loss": 2.2992, "num_input_tokens_seen": 262936, "step": 25 }, { "epoch": 0.007725985063095545, "grad_norm": 3.1837818528204305, "learning_rate": 1.7647058823529414e-05, "loss": 1.8923, "num_input_tokens_seen": 315264, "step": 30 }, { "epoch": 0.009013649240278136, "grad_norm": 2.835950303969337, "learning_rate": 2.058823529411765e-05, "loss": 1.6984, "num_input_tokens_seen": 367840, "step": 35 }, { "epoch": 0.010301313417460727, "grad_norm": 2.223740001042382, "learning_rate": 2.3529411764705884e-05, "loss": 1.6434, "num_input_tokens_seen": 420112, "step": 40 }, { "epoch": 0.011588977594643318, "grad_norm": 1.9880935044313244, "learning_rate": 2.647058823529412e-05, "loss": 1.4659, "num_input_tokens_seen": 472728, "step": 45 }, { "epoch": 0.012876641771825908, "grad_norm": 1.7151131700495934, "learning_rate": 2.9411764705882354e-05, "loss": 1.3506, "num_input_tokens_seen": 524648, "step": 50 }, { "epoch": 0.012876641771825908, "eval_loss": 1.1727452278137207, "eval_runtime": 66.3207, "eval_samples_per_second": 1.809, "eval_steps_per_second": 0.452, "num_input_tokens_seen": 524648, "step": 50 }, { "epoch": 0.014164305949008499, "grad_norm": 1.47475981537851, "learning_rate": 3.235294117647059e-05, "loss": 1.1455, "num_input_tokens_seen": 576472, "step": 55 }, { "epoch": 0.01545197012619109, "grad_norm": 1.7476693647440722, "learning_rate": 3.529411764705883e-05, "loss": 0.9971, "num_input_tokens_seen": 628056, "step": 60 }, { "epoch": 0.01673963430337368, "grad_norm": 1.3384365493212875, "learning_rate": 3.8235294117647055e-05, "loss": 0.9073, "num_input_tokens_seen": 680448, "step": 65 }, { "epoch": 0.018027298480556272, "grad_norm": 0.9014358219807773, "learning_rate": 4.11764705882353e-05, "loss": 0.8386, "num_input_tokens_seen": 733664, "step": 70 }, { "epoch": 0.01931496265773886, "grad_norm": 0.8007820009902022, "learning_rate": 4.411764705882353e-05, "loss": 0.7827, "num_input_tokens_seen": 786096, "step": 75 }, { "epoch": 0.020602626834921454, "grad_norm": 0.6701003454307716, "learning_rate": 4.705882352941177e-05, "loss": 0.7814, "num_input_tokens_seen": 838192, "step": 80 }, { "epoch": 0.021890291012104043, "grad_norm": 0.8973165751658843, "learning_rate": 5e-05, "loss": 0.7297, "num_input_tokens_seen": 890112, "step": 85 }, { "epoch": 0.023177955189286635, "grad_norm": 0.9060968630490469, "learning_rate": 5.294117647058824e-05, "loss": 0.7894, "num_input_tokens_seen": 943472, "step": 90 }, { "epoch": 0.024465619366469224, "grad_norm": 0.9520214202472889, "learning_rate": 5.588235294117647e-05, "loss": 0.7758, "num_input_tokens_seen": 996872, "step": 95 }, { "epoch": 0.025753283543651816, "grad_norm": 0.8226006535044261, "learning_rate": 5.882352941176471e-05, "loss": 0.7577, "num_input_tokens_seen": 1049816, "step": 100 }, { "epoch": 0.025753283543651816, "eval_loss": 0.7517351508140564, "eval_runtime": 38.7829, "eval_samples_per_second": 3.094, "eval_steps_per_second": 0.774, "num_input_tokens_seen": 1049816, "step": 100 }, { "epoch": 0.027040947720834405, "grad_norm": 0.7251208491150668, "learning_rate": 6.176470588235295e-05, "loss": 0.7579, "num_input_tokens_seen": 1102584, "step": 105 }, { "epoch": 0.028328611898016998, "grad_norm": 0.8217419839297042, "learning_rate": 6.470588235294118e-05, "loss": 0.7659, "num_input_tokens_seen": 1155512, "step": 110 }, { "epoch": 0.029616276075199587, "grad_norm": 0.6768053879888967, "learning_rate": 6.764705882352942e-05, "loss": 0.7469, "num_input_tokens_seen": 1207976, "step": 115 }, { "epoch": 0.03090394025238218, "grad_norm": 1.9562630849642013, "learning_rate": 7.058823529411765e-05, "loss": 0.7353, "num_input_tokens_seen": 1259776, "step": 120 }, { "epoch": 0.03219160442956477, "grad_norm": 0.6439041597153087, "learning_rate": 7.352941176470589e-05, "loss": 0.7537, "num_input_tokens_seen": 1312760, "step": 125 }, { "epoch": 0.03347926860674736, "grad_norm": 0.6124318582166212, "learning_rate": 7.647058823529411e-05, "loss": 0.7669, "num_input_tokens_seen": 1365616, "step": 130 }, { "epoch": 0.03476693278392995, "grad_norm": 0.7593534002488418, "learning_rate": 7.941176470588235e-05, "loss": 0.722, "num_input_tokens_seen": 1417544, "step": 135 }, { "epoch": 0.036054596961112545, "grad_norm": 0.7827834651032061, "learning_rate": 8.23529411764706e-05, "loss": 0.7502, "num_input_tokens_seen": 1469856, "step": 140 }, { "epoch": 0.037342261138295134, "grad_norm": 0.5444126155596626, "learning_rate": 8.529411764705883e-05, "loss": 0.7174, "num_input_tokens_seen": 1521496, "step": 145 }, { "epoch": 0.03862992531547772, "grad_norm": 0.40878703812837747, "learning_rate": 8.823529411764706e-05, "loss": 0.7018, "num_input_tokens_seen": 1573376, "step": 150 }, { "epoch": 0.03862992531547772, "eval_loss": 0.7309949994087219, "eval_runtime": 38.2005, "eval_samples_per_second": 3.141, "eval_steps_per_second": 0.785, "num_input_tokens_seen": 1573376, "step": 150 }, { "epoch": 0.03991758949266031, "grad_norm": 0.5536144453733772, "learning_rate": 9.11764705882353e-05, "loss": 0.738, "num_input_tokens_seen": 1626136, "step": 155 }, { "epoch": 0.04120525366984291, "grad_norm": 0.5151715191704441, "learning_rate": 9.411764705882353e-05, "loss": 0.7579, "num_input_tokens_seen": 1678760, "step": 160 }, { "epoch": 0.042492917847025496, "grad_norm": 0.5209077394596254, "learning_rate": 9.705882352941177e-05, "loss": 0.7502, "num_input_tokens_seen": 1731240, "step": 165 }, { "epoch": 0.043780582024208085, "grad_norm": 0.721213601237688, "learning_rate": 0.0001, "loss": 0.7448, "num_input_tokens_seen": 1783816, "step": 170 }, { "epoch": 0.045068246201390674, "grad_norm": 0.48666007914879555, "learning_rate": 9.999940874631277e-05, "loss": 0.6648, "num_input_tokens_seen": 1834592, "step": 175 }, { "epoch": 0.04635591037857327, "grad_norm": 0.5136600613696797, "learning_rate": 9.999763499923432e-05, "loss": 0.7759, "num_input_tokens_seen": 1888176, "step": 180 }, { "epoch": 0.04764357455575586, "grad_norm": 0.6706281530046975, "learning_rate": 9.999467880071402e-05, "loss": 0.7167, "num_input_tokens_seen": 1940280, "step": 185 }, { "epoch": 0.04893123873293845, "grad_norm": 0.5159139445497618, "learning_rate": 9.999054022066641e-05, "loss": 0.7483, "num_input_tokens_seen": 1993096, "step": 190 }, { "epoch": 0.050218902910121044, "grad_norm": 0.40251006129746847, "learning_rate": 9.998521935696953e-05, "loss": 0.7464, "num_input_tokens_seen": 2045648, "step": 195 }, { "epoch": 0.05150656708730363, "grad_norm": 0.4811730853311867, "learning_rate": 9.997871633546257e-05, "loss": 0.7594, "num_input_tokens_seen": 2099008, "step": 200 }, { "epoch": 0.05150656708730363, "eval_loss": 0.7274295687675476, "eval_runtime": 38.079, "eval_samples_per_second": 3.151, "eval_steps_per_second": 0.788, "num_input_tokens_seen": 2099008, "step": 200 }, { "epoch": 0.05279423126448622, "grad_norm": 0.591934959695668, "learning_rate": 9.997103130994296e-05, "loss": 0.706, "num_input_tokens_seen": 2151680, "step": 205 }, { "epoch": 0.05408189544166881, "grad_norm": 0.48253717444489286, "learning_rate": 9.996216446216267e-05, "loss": 0.7186, "num_input_tokens_seen": 2203784, "step": 210 }, { "epoch": 0.055369559618851406, "grad_norm": 0.5274315079401322, "learning_rate": 9.995211600182397e-05, "loss": 0.7009, "num_input_tokens_seen": 2255632, "step": 215 }, { "epoch": 0.056657223796033995, "grad_norm": 0.32879215224292613, "learning_rate": 9.994088616657444e-05, "loss": 0.6801, "num_input_tokens_seen": 2308096, "step": 220 }, { "epoch": 0.057944887973216584, "grad_norm": 0.37171195071448215, "learning_rate": 9.992847522200133e-05, "loss": 0.7569, "num_input_tokens_seen": 2361168, "step": 225 }, { "epoch": 0.05923255215039917, "grad_norm": 0.4120941016934064, "learning_rate": 9.99148834616253e-05, "loss": 0.7402, "num_input_tokens_seen": 2413896, "step": 230 }, { "epoch": 0.06052021632758177, "grad_norm": 0.5998680948310651, "learning_rate": 9.990011120689351e-05, "loss": 0.7191, "num_input_tokens_seen": 2466136, "step": 235 }, { "epoch": 0.06180788050476436, "grad_norm": 0.538488141249078, "learning_rate": 9.988415880717194e-05, "loss": 0.7274, "num_input_tokens_seen": 2518848, "step": 240 }, { "epoch": 0.06309554468194695, "grad_norm": 0.4393093124760277, "learning_rate": 9.986702663973722e-05, "loss": 0.7704, "num_input_tokens_seen": 2572384, "step": 245 }, { "epoch": 0.06438320885912954, "grad_norm": 0.6116643616510118, "learning_rate": 9.98487151097676e-05, "loss": 0.7346, "num_input_tokens_seen": 2625352, "step": 250 }, { "epoch": 0.06438320885912954, "eval_loss": 0.7181503176689148, "eval_runtime": 38.0986, "eval_samples_per_second": 3.15, "eval_steps_per_second": 0.787, "num_input_tokens_seen": 2625352, "step": 250 }, { "epoch": 0.06567087303631212, "grad_norm": 0.41200227731339506, "learning_rate": 9.98292246503335e-05, "loss": 0.7408, "num_input_tokens_seen": 2678216, "step": 255 }, { "epoch": 0.06695853721349472, "grad_norm": 0.44521059732114987, "learning_rate": 9.980855572238714e-05, "loss": 0.7044, "num_input_tokens_seen": 2730664, "step": 260 }, { "epoch": 0.06824620139067732, "grad_norm": 0.571896859428363, "learning_rate": 9.978670881475172e-05, "loss": 0.7334, "num_input_tokens_seen": 2783584, "step": 265 }, { "epoch": 0.0695338655678599, "grad_norm": 0.3907697039722125, "learning_rate": 9.976368444410985e-05, "loss": 0.7075, "num_input_tokens_seen": 2836152, "step": 270 }, { "epoch": 0.0708215297450425, "grad_norm": 0.4507806825752261, "learning_rate": 9.973948315499126e-05, "loss": 0.7039, "num_input_tokens_seen": 2887808, "step": 275 }, { "epoch": 0.07210919392222509, "grad_norm": 0.41330504132984697, "learning_rate": 9.971410551976002e-05, "loss": 0.6953, "num_input_tokens_seen": 2939656, "step": 280 }, { "epoch": 0.07339685809940767, "grad_norm": 0.4625671909482009, "learning_rate": 9.968755213860094e-05, "loss": 0.7022, "num_input_tokens_seen": 2991632, "step": 285 }, { "epoch": 0.07468452227659027, "grad_norm": 0.6553627840267285, "learning_rate": 9.96598236395054e-05, "loss": 0.6796, "num_input_tokens_seen": 3043616, "step": 290 }, { "epoch": 0.07597218645377285, "grad_norm": 0.5157886895754477, "learning_rate": 9.96309206782565e-05, "loss": 0.7346, "num_input_tokens_seen": 3096920, "step": 295 }, { "epoch": 0.07725985063095545, "grad_norm": 0.5672965149433489, "learning_rate": 9.960084393841355e-05, "loss": 0.6815, "num_input_tokens_seen": 3149032, "step": 300 }, { "epoch": 0.07725985063095545, "eval_loss": 0.7073924541473389, "eval_runtime": 38.1842, "eval_samples_per_second": 3.143, "eval_steps_per_second": 0.786, "num_input_tokens_seen": 3149032, "step": 300 }, { "epoch": 0.07854751480813804, "grad_norm": 0.4479276285203507, "learning_rate": 9.956959413129585e-05, "loss": 0.7208, "num_input_tokens_seen": 3201560, "step": 305 }, { "epoch": 0.07983517898532062, "grad_norm": 0.368457437106614, "learning_rate": 9.953717199596598e-05, "loss": 0.7144, "num_input_tokens_seen": 3254632, "step": 310 }, { "epoch": 0.08112284316250322, "grad_norm": 0.5531413254856732, "learning_rate": 9.95035782992122e-05, "loss": 0.6861, "num_input_tokens_seen": 3306432, "step": 315 }, { "epoch": 0.08241050733968582, "grad_norm": 0.41513991799613037, "learning_rate": 9.94688138355304e-05, "loss": 0.6836, "num_input_tokens_seen": 3358392, "step": 320 }, { "epoch": 0.0836981715168684, "grad_norm": 0.47052274706452957, "learning_rate": 9.943287942710527e-05, "loss": 0.7353, "num_input_tokens_seen": 3411424, "step": 325 }, { "epoch": 0.08498583569405099, "grad_norm": 0.6322586593511644, "learning_rate": 9.939577592379088e-05, "loss": 0.6774, "num_input_tokens_seen": 3462992, "step": 330 }, { "epoch": 0.08627349987123359, "grad_norm": 0.4129597798905344, "learning_rate": 9.935750420309055e-05, "loss": 0.7331, "num_input_tokens_seen": 3516136, "step": 335 }, { "epoch": 0.08756116404841617, "grad_norm": 0.4031509882699161, "learning_rate": 9.931806517013612e-05, "loss": 0.6939, "num_input_tokens_seen": 3568360, "step": 340 }, { "epoch": 0.08884882822559877, "grad_norm": 0.4444358747076587, "learning_rate": 9.927745975766654e-05, "loss": 0.7158, "num_input_tokens_seen": 3620696, "step": 345 }, { "epoch": 0.09013649240278135, "grad_norm": 0.5290547365449167, "learning_rate": 9.923568892600578e-05, "loss": 0.6932, "num_input_tokens_seen": 3673152, "step": 350 }, { "epoch": 0.09013649240278135, "eval_loss": 0.7044599056243896, "eval_runtime": 38.2709, "eval_samples_per_second": 3.136, "eval_steps_per_second": 0.784, "num_input_tokens_seen": 3673152, "step": 350 }, { "epoch": 0.09142415657996394, "grad_norm": 0.47530311368359207, "learning_rate": 9.91927536630402e-05, "loss": 0.6778, "num_input_tokens_seen": 3725296, "step": 355 }, { "epoch": 0.09271182075714654, "grad_norm": 0.38913022785688944, "learning_rate": 9.91486549841951e-05, "loss": 0.6857, "num_input_tokens_seen": 3777552, "step": 360 }, { "epoch": 0.09399948493432912, "grad_norm": 0.4834773141333328, "learning_rate": 9.91033939324107e-05, "loss": 0.7184, "num_input_tokens_seen": 3830200, "step": 365 }, { "epoch": 0.09528714911151172, "grad_norm": 0.5862045807150876, "learning_rate": 9.905697157811761e-05, "loss": 0.7196, "num_input_tokens_seen": 3883200, "step": 370 }, { "epoch": 0.09657481328869431, "grad_norm": 0.4576971522205563, "learning_rate": 9.900938901921131e-05, "loss": 0.6914, "num_input_tokens_seen": 3935576, "step": 375 }, { "epoch": 0.0978624774658769, "grad_norm": 0.49551517524520683, "learning_rate": 9.896064738102635e-05, "loss": 0.6681, "num_input_tokens_seen": 3987624, "step": 380 }, { "epoch": 0.09915014164305949, "grad_norm": 0.8198390819787913, "learning_rate": 9.891074781630966e-05, "loss": 0.6723, "num_input_tokens_seen": 4039680, "step": 385 }, { "epoch": 0.10043780582024209, "grad_norm": 0.7034626469978683, "learning_rate": 9.885969150519331e-05, "loss": 0.6498, "num_input_tokens_seen": 4091216, "step": 390 }, { "epoch": 0.10172546999742467, "grad_norm": 0.8838075623197742, "learning_rate": 9.88074796551666e-05, "loss": 0.7311, "num_input_tokens_seen": 4144264, "step": 395 }, { "epoch": 0.10301313417460727, "grad_norm": 0.7342758386202114, "learning_rate": 9.875411350104744e-05, "loss": 0.7089, "num_input_tokens_seen": 4197072, "step": 400 }, { "epoch": 0.10301313417460727, "eval_loss": 0.6847750544548035, "eval_runtime": 37.9238, "eval_samples_per_second": 3.164, "eval_steps_per_second": 0.791, "num_input_tokens_seen": 4197072, "step": 400 }, { "epoch": 0.10430079835178985, "grad_norm": 0.8113533605928532, "learning_rate": 9.86995943049533e-05, "loss": 0.7021, "num_input_tokens_seen": 4249656, "step": 405 }, { "epoch": 0.10558846252897244, "grad_norm": 1.1772677082041305, "learning_rate": 9.864392335627117e-05, "loss": 0.6943, "num_input_tokens_seen": 4302944, "step": 410 }, { "epoch": 0.10687612670615504, "grad_norm": 1.6493280510697776, "learning_rate": 9.858710197162721e-05, "loss": 0.7146, "num_input_tokens_seen": 4355480, "step": 415 }, { "epoch": 0.10816379088333762, "grad_norm": 3.0159798803441715, "learning_rate": 9.852913149485556e-05, "loss": 0.6312, "num_input_tokens_seen": 4407688, "step": 420 }, { "epoch": 0.10945145506052022, "grad_norm": 1.7981196843056153, "learning_rate": 9.847001329696653e-05, "loss": 0.6877, "num_input_tokens_seen": 4459736, "step": 425 }, { "epoch": 0.11073911923770281, "grad_norm": 1.5783278376799834, "learning_rate": 9.840974877611422e-05, "loss": 0.6975, "num_input_tokens_seen": 4512928, "step": 430 }, { "epoch": 0.1120267834148854, "grad_norm": 3.306646516615779, "learning_rate": 9.834833935756344e-05, "loss": 0.651, "num_input_tokens_seen": 4565840, "step": 435 }, { "epoch": 0.11331444759206799, "grad_norm": 2.3184973874904005, "learning_rate": 9.828578649365601e-05, "loss": 0.685, "num_input_tokens_seen": 4618168, "step": 440 }, { "epoch": 0.11460211176925057, "grad_norm": 1.602690016495642, "learning_rate": 9.822209166377635e-05, "loss": 0.6258, "num_input_tokens_seen": 4669784, "step": 445 }, { "epoch": 0.11588977594643317, "grad_norm": 2.6770797227308196, "learning_rate": 9.815725637431662e-05, "loss": 0.6732, "num_input_tokens_seen": 4722528, "step": 450 }, { "epoch": 0.11588977594643317, "eval_loss": 0.6526497006416321, "eval_runtime": 39.085, "eval_samples_per_second": 3.07, "eval_steps_per_second": 0.768, "num_input_tokens_seen": 4722528, "step": 450 }, { "epoch": 0.11717744012361576, "grad_norm": 2.1823349329218074, "learning_rate": 9.809128215864097e-05, "loss": 0.6544, "num_input_tokens_seen": 4774400, "step": 455 }, { "epoch": 0.11846510430079835, "grad_norm": 1.434521593914191, "learning_rate": 9.802417057704931e-05, "loss": 0.652, "num_input_tokens_seen": 4826704, "step": 460 }, { "epoch": 0.11975276847798094, "grad_norm": 2.399754385687283, "learning_rate": 9.795592321674045e-05, "loss": 0.6582, "num_input_tokens_seen": 4880072, "step": 465 }, { "epoch": 0.12104043265516354, "grad_norm": 3.9235176077985536, "learning_rate": 9.788654169177453e-05, "loss": 0.6506, "num_input_tokens_seen": 4931968, "step": 470 }, { "epoch": 0.12232809683234612, "grad_norm": 3.659330745777227, "learning_rate": 9.781602764303487e-05, "loss": 0.6551, "num_input_tokens_seen": 4983656, "step": 475 }, { "epoch": 0.12361576100952872, "grad_norm": 1.9670601503398757, "learning_rate": 9.774438273818911e-05, "loss": 0.6978, "num_input_tokens_seen": 5036528, "step": 480 }, { "epoch": 0.12490342518671131, "grad_norm": 1.308580869419328, "learning_rate": 9.767160867164979e-05, "loss": 0.6407, "num_input_tokens_seen": 5088768, "step": 485 }, { "epoch": 0.1261910893638939, "grad_norm": 1.7349486072682865, "learning_rate": 9.759770716453436e-05, "loss": 0.6641, "num_input_tokens_seen": 5142080, "step": 490 }, { "epoch": 0.1274787535410765, "grad_norm": 2.993327939872198, "learning_rate": 9.752267996462434e-05, "loss": 0.6588, "num_input_tokens_seen": 5194432, "step": 495 }, { "epoch": 0.12876641771825909, "grad_norm": 2.6430988002320976, "learning_rate": 9.744652884632406e-05, "loss": 0.6304, "num_input_tokens_seen": 5246640, "step": 500 }, { "epoch": 0.12876641771825909, "eval_loss": 0.6272165775299072, "eval_runtime": 39.4177, "eval_samples_per_second": 3.044, "eval_steps_per_second": 0.761, "num_input_tokens_seen": 5246640, "step": 500 }, { "epoch": 0.13005408189544168, "grad_norm": 2.6047672112920286, "learning_rate": 9.736925561061871e-05, "loss": 0.5741, "num_input_tokens_seen": 5299024, "step": 505 }, { "epoch": 0.13134174607262425, "grad_norm": 2.4706517190834063, "learning_rate": 9.729086208503174e-05, "loss": 0.6535, "num_input_tokens_seen": 5352664, "step": 510 }, { "epoch": 0.13262941024980685, "grad_norm": 2.031672226684599, "learning_rate": 9.721135012358156e-05, "loss": 0.6081, "num_input_tokens_seen": 5406008, "step": 515 }, { "epoch": 0.13391707442698944, "grad_norm": 2.773997809426142, "learning_rate": 9.713072160673777e-05, "loss": 0.6792, "num_input_tokens_seen": 5459368, "step": 520 }, { "epoch": 0.13520473860417204, "grad_norm": 5.083057729524855, "learning_rate": 9.704897844137673e-05, "loss": 0.6821, "num_input_tokens_seen": 5512960, "step": 525 }, { "epoch": 0.13649240278135463, "grad_norm": 3.0440654843385584, "learning_rate": 9.696612256073633e-05, "loss": 0.5835, "num_input_tokens_seen": 5565368, "step": 530 }, { "epoch": 0.1377800669585372, "grad_norm": 3.7400231170971323, "learning_rate": 9.688215592437039e-05, "loss": 0.6129, "num_input_tokens_seen": 5618008, "step": 535 }, { "epoch": 0.1390677311357198, "grad_norm": 6.340287952379529, "learning_rate": 9.679708051810221e-05, "loss": 0.5765, "num_input_tokens_seen": 5670072, "step": 540 }, { "epoch": 0.1403553953129024, "grad_norm": 3.6351560550229207, "learning_rate": 9.67108983539777e-05, "loss": 0.6325, "num_input_tokens_seen": 5722936, "step": 545 }, { "epoch": 0.141643059490085, "grad_norm": 3.8363425916745117, "learning_rate": 9.662361147021779e-05, "loss": 0.5596, "num_input_tokens_seen": 5774880, "step": 550 }, { "epoch": 0.141643059490085, "eval_loss": 0.5832681059837341, "eval_runtime": 38.2495, "eval_samples_per_second": 3.137, "eval_steps_per_second": 0.784, "num_input_tokens_seen": 5774880, "step": 550 }, { "epoch": 0.14293072366726758, "grad_norm": 3.911447203674744, "learning_rate": 9.653522193117013e-05, "loss": 0.5073, "num_input_tokens_seen": 5826608, "step": 555 }, { "epoch": 0.14421838784445018, "grad_norm": 3.3501835856945763, "learning_rate": 9.644573182726035e-05, "loss": 0.5652, "num_input_tokens_seen": 5879776, "step": 560 }, { "epoch": 0.14550605202163275, "grad_norm": 8.75758822201328, "learning_rate": 9.63551432749426e-05, "loss": 0.5727, "num_input_tokens_seen": 5932888, "step": 565 }, { "epoch": 0.14679371619881534, "grad_norm": 4.351029258458384, "learning_rate": 9.626345841664953e-05, "loss": 0.6251, "num_input_tokens_seen": 5984648, "step": 570 }, { "epoch": 0.14808138037599794, "grad_norm": 7.617020699535255, "learning_rate": 9.617067942074153e-05, "loss": 0.6508, "num_input_tokens_seen": 6037000, "step": 575 }, { "epoch": 0.14936904455318054, "grad_norm": 7.293430172750479, "learning_rate": 9.607680848145558e-05, "loss": 0.6686, "num_input_tokens_seen": 6090512, "step": 580 }, { "epoch": 0.15065670873036313, "grad_norm": 3.3635276124166653, "learning_rate": 9.598184781885318e-05, "loss": 0.5793, "num_input_tokens_seen": 6143320, "step": 585 }, { "epoch": 0.1519443729075457, "grad_norm": 2.7589160396339407, "learning_rate": 9.588579967876806e-05, "loss": 0.5954, "num_input_tokens_seen": 6195720, "step": 590 }, { "epoch": 0.1532320370847283, "grad_norm": 1.582169884399532, "learning_rate": 9.578866633275288e-05, "loss": 0.5644, "num_input_tokens_seen": 6247592, "step": 595 }, { "epoch": 0.1545197012619109, "grad_norm": 3.891844940061855, "learning_rate": 9.569045007802559e-05, "loss": 0.5794, "num_input_tokens_seen": 6299656, "step": 600 }, { "epoch": 0.1545197012619109, "eval_loss": 0.6039358973503113, "eval_runtime": 38.3138, "eval_samples_per_second": 3.132, "eval_steps_per_second": 0.783, "num_input_tokens_seen": 6299656, "step": 600 }, { "epoch": 0.1558073654390935, "grad_norm": 5.90634634073773, "learning_rate": 9.55911532374151e-05, "loss": 0.6106, "num_input_tokens_seen": 6351680, "step": 605 }, { "epoch": 0.15709502961627608, "grad_norm": 3.5429043559071034, "learning_rate": 9.549077815930636e-05, "loss": 0.5812, "num_input_tokens_seen": 6403648, "step": 610 }, { "epoch": 0.15838269379345868, "grad_norm": 2.8753548663225144, "learning_rate": 9.538932721758474e-05, "loss": 0.5992, "num_input_tokens_seen": 6456328, "step": 615 }, { "epoch": 0.15967035797064125, "grad_norm": 2.4013005755622467, "learning_rate": 9.528680281157999e-05, "loss": 0.587, "num_input_tokens_seen": 6509024, "step": 620 }, { "epoch": 0.16095802214782384, "grad_norm": 3.860358696946306, "learning_rate": 9.518320736600943e-05, "loss": 0.5836, "num_input_tokens_seen": 6561336, "step": 625 }, { "epoch": 0.16224568632500644, "grad_norm": 3.187917212328382, "learning_rate": 9.507854333092063e-05, "loss": 0.5913, "num_input_tokens_seen": 6614024, "step": 630 }, { "epoch": 0.16353335050218903, "grad_norm": 3.5342177024321586, "learning_rate": 9.497281318163346e-05, "loss": 0.5693, "num_input_tokens_seen": 6666416, "step": 635 }, { "epoch": 0.16482101467937163, "grad_norm": 3.90374612709263, "learning_rate": 9.486601941868154e-05, "loss": 0.572, "num_input_tokens_seen": 6718200, "step": 640 }, { "epoch": 0.1661086788565542, "grad_norm": 4.4270591027201665, "learning_rate": 9.475816456775313e-05, "loss": 0.6111, "num_input_tokens_seen": 6771256, "step": 645 }, { "epoch": 0.1673963430337368, "grad_norm": 5.04761388655614, "learning_rate": 9.464925117963133e-05, "loss": 0.5959, "num_input_tokens_seen": 6824008, "step": 650 }, { "epoch": 0.1673963430337368, "eval_loss": 0.5542036890983582, "eval_runtime": 68.9048, "eval_samples_per_second": 1.742, "eval_steps_per_second": 0.435, "num_input_tokens_seen": 6824008, "step": 650 }, { "epoch": 0.1686840072109194, "grad_norm": 3.428410481447858, "learning_rate": 9.453928183013385e-05, "loss": 0.5344, "num_input_tokens_seen": 6875432, "step": 655 }, { "epoch": 0.16997167138810199, "grad_norm": 2.9137495299009846, "learning_rate": 9.442825912005202e-05, "loss": 0.56, "num_input_tokens_seen": 6927768, "step": 660 }, { "epoch": 0.17125933556528458, "grad_norm": 4.2956604210715925, "learning_rate": 9.431618567508933e-05, "loss": 0.5701, "num_input_tokens_seen": 6980544, "step": 665 }, { "epoch": 0.17254699974246718, "grad_norm": 4.3977584083656405, "learning_rate": 9.420306414579925e-05, "loss": 0.5604, "num_input_tokens_seen": 7032584, "step": 670 }, { "epoch": 0.17383466391964975, "grad_norm": 4.48381006313936, "learning_rate": 9.408889720752266e-05, "loss": 0.5763, "num_input_tokens_seen": 7085048, "step": 675 }, { "epoch": 0.17512232809683234, "grad_norm": 2.189534287393346, "learning_rate": 9.397368756032445e-05, "loss": 0.5962, "num_input_tokens_seen": 7137952, "step": 680 }, { "epoch": 0.17640999227401494, "grad_norm": 3.34591241093722, "learning_rate": 9.385743792892982e-05, "loss": 0.5935, "num_input_tokens_seen": 7190584, "step": 685 }, { "epoch": 0.17769765645119753, "grad_norm": 2.7509902524242507, "learning_rate": 9.374015106265968e-05, "loss": 0.5267, "num_input_tokens_seen": 7243440, "step": 690 }, { "epoch": 0.17898532062838013, "grad_norm": 2.322454948468365, "learning_rate": 9.362182973536569e-05, "loss": 0.5351, "num_input_tokens_seen": 7295568, "step": 695 }, { "epoch": 0.1802729848055627, "grad_norm": 3.4615171229405046, "learning_rate": 9.35024767453647e-05, "loss": 0.5014, "num_input_tokens_seen": 7347040, "step": 700 }, { "epoch": 0.1802729848055627, "eval_loss": 0.5440100431442261, "eval_runtime": 39.1181, "eval_samples_per_second": 3.068, "eval_steps_per_second": 0.767, "num_input_tokens_seen": 7347040, "step": 700 }, { "epoch": 0.1815606489827453, "grad_norm": 4.815426816055898, "learning_rate": 9.338209491537257e-05, "loss": 0.543, "num_input_tokens_seen": 7399584, "step": 705 }, { "epoch": 0.1828483131599279, "grad_norm": 7.294932559918336, "learning_rate": 9.326068709243727e-05, "loss": 0.4995, "num_input_tokens_seen": 7452928, "step": 710 }, { "epoch": 0.18413597733711048, "grad_norm": 3.6946433405013495, "learning_rate": 9.313825614787177e-05, "loss": 0.5109, "num_input_tokens_seen": 7505112, "step": 715 }, { "epoch": 0.18542364151429308, "grad_norm": 4.339671310261357, "learning_rate": 9.301480497718593e-05, "loss": 0.4932, "num_input_tokens_seen": 7557608, "step": 720 }, { "epoch": 0.18671130569147568, "grad_norm": 11.604530853746237, "learning_rate": 9.289033650001817e-05, "loss": 0.5573, "num_input_tokens_seen": 7610048, "step": 725 }, { "epoch": 0.18799896986865824, "grad_norm": 5.990020165378009, "learning_rate": 9.276485366006634e-05, "loss": 0.5305, "num_input_tokens_seen": 7662056, "step": 730 }, { "epoch": 0.18928663404584084, "grad_norm": 4.709895983169237, "learning_rate": 9.263835942501807e-05, "loss": 0.5369, "num_input_tokens_seen": 7713656, "step": 735 }, { "epoch": 0.19057429822302344, "grad_norm": 4.873824727341975, "learning_rate": 9.251085678648072e-05, "loss": 0.5397, "num_input_tokens_seen": 7765992, "step": 740 }, { "epoch": 0.19186196240020603, "grad_norm": 3.288968567031419, "learning_rate": 9.238234875991046e-05, "loss": 0.5116, "num_input_tokens_seen": 7818448, "step": 745 }, { "epoch": 0.19314962657738863, "grad_norm": 4.778741391076671, "learning_rate": 9.225283838454111e-05, "loss": 0.541, "num_input_tokens_seen": 7870520, "step": 750 }, { "epoch": 0.19314962657738863, "eval_loss": 0.5273815989494324, "eval_runtime": 39.1812, "eval_samples_per_second": 3.063, "eval_steps_per_second": 0.766, "num_input_tokens_seen": 7870520, "step": 750 }, { "epoch": 0.1944372907545712, "grad_norm": 4.544356566141105, "learning_rate": 9.21223287233121e-05, "loss": 0.4961, "num_input_tokens_seen": 7922736, "step": 755 }, { "epoch": 0.1957249549317538, "grad_norm": 7.025876813077666, "learning_rate": 9.199082286279622e-05, "loss": 0.4956, "num_input_tokens_seen": 7975304, "step": 760 }, { "epoch": 0.1970126191089364, "grad_norm": 4.9360968239249985, "learning_rate": 9.185832391312644e-05, "loss": 0.4997, "num_input_tokens_seen": 8027448, "step": 765 }, { "epoch": 0.19830028328611898, "grad_norm": 10.528361984915874, "learning_rate": 9.172483500792244e-05, "loss": 0.5214, "num_input_tokens_seen": 8080944, "step": 770 }, { "epoch": 0.19958794746330158, "grad_norm": 9.264531258094065, "learning_rate": 9.159035930421658e-05, "loss": 0.6098, "num_input_tokens_seen": 8133392, "step": 775 }, { "epoch": 0.20087561164048418, "grad_norm": 1.9709167614209242, "learning_rate": 9.145489998237902e-05, "loss": 0.5046, "num_input_tokens_seen": 8185360, "step": 780 }, { "epoch": 0.20216327581766674, "grad_norm": 7.5915211434567595, "learning_rate": 9.131846024604274e-05, "loss": 0.5803, "num_input_tokens_seen": 8237672, "step": 785 }, { "epoch": 0.20345093999484934, "grad_norm": 3.251682970663388, "learning_rate": 9.11810433220276e-05, "loss": 0.5365, "num_input_tokens_seen": 8289688, "step": 790 }, { "epoch": 0.20473860417203193, "grad_norm": 4.341533737034294, "learning_rate": 9.104265246026415e-05, "loss": 0.5259, "num_input_tokens_seen": 8341624, "step": 795 }, { "epoch": 0.20602626834921453, "grad_norm": 5.463180544339495, "learning_rate": 9.090329093371666e-05, "loss": 0.5291, "num_input_tokens_seen": 8393696, "step": 800 }, { "epoch": 0.20602626834921453, "eval_loss": 0.5219093561172485, "eval_runtime": 39.7455, "eval_samples_per_second": 3.019, "eval_steps_per_second": 0.755, "num_input_tokens_seen": 8393696, "step": 800 }, { "epoch": 0.20731393252639713, "grad_norm": 4.254130676908817, "learning_rate": 9.076296203830579e-05, "loss": 0.5449, "num_input_tokens_seen": 8446496, "step": 805 }, { "epoch": 0.2086015967035797, "grad_norm": 5.6525741285524145, "learning_rate": 9.062166909283062e-05, "loss": 0.5625, "num_input_tokens_seen": 8499544, "step": 810 }, { "epoch": 0.2098892608807623, "grad_norm": 3.8041246225911345, "learning_rate": 9.047941543889014e-05, "loss": 0.5564, "num_input_tokens_seen": 8552568, "step": 815 }, { "epoch": 0.2111769250579449, "grad_norm": 3.803732280546421, "learning_rate": 9.033620444080428e-05, "loss": 0.5487, "num_input_tokens_seen": 8605560, "step": 820 }, { "epoch": 0.21246458923512748, "grad_norm": 2.8518948364927925, "learning_rate": 9.019203948553422e-05, "loss": 0.5719, "num_input_tokens_seen": 8657704, "step": 825 }, { "epoch": 0.21375225341231008, "grad_norm": 3.939376115862177, "learning_rate": 9.004692398260244e-05, "loss": 0.5235, "num_input_tokens_seen": 8711088, "step": 830 }, { "epoch": 0.21503991758949267, "grad_norm": 6.635912128499916, "learning_rate": 8.9900861364012e-05, "loss": 0.5566, "num_input_tokens_seen": 8763712, "step": 835 }, { "epoch": 0.21632758176667524, "grad_norm": 3.7547407090496687, "learning_rate": 8.975385508416532e-05, "loss": 0.482, "num_input_tokens_seen": 8815760, "step": 840 }, { "epoch": 0.21761524594385784, "grad_norm": 4.093006904445721, "learning_rate": 8.960590861978265e-05, "loss": 0.5046, "num_input_tokens_seen": 8867720, "step": 845 }, { "epoch": 0.21890291012104043, "grad_norm": 11.397392997722068, "learning_rate": 8.945702546981969e-05, "loss": 0.5063, "num_input_tokens_seen": 8919608, "step": 850 }, { "epoch": 0.21890291012104043, "eval_loss": 0.5525640249252319, "eval_runtime": 39.0469, "eval_samples_per_second": 3.073, "eval_steps_per_second": 0.768, "num_input_tokens_seen": 8919608, "step": 850 }, { "epoch": 0.22019057429822303, "grad_norm": 4.339535962830116, "learning_rate": 8.930720915538487e-05, "loss": 0.5853, "num_input_tokens_seen": 8971048, "step": 855 }, { "epoch": 0.22147823847540563, "grad_norm": 6.118436891847819, "learning_rate": 8.915646321965614e-05, "loss": 0.5534, "num_input_tokens_seen": 9022936, "step": 860 }, { "epoch": 0.2227659026525882, "grad_norm": 3.3997835203618667, "learning_rate": 8.900479122779712e-05, "loss": 0.5623, "num_input_tokens_seen": 9075336, "step": 865 }, { "epoch": 0.2240535668297708, "grad_norm": 4.188326935911128, "learning_rate": 8.885219676687277e-05, "loss": 0.5561, "num_input_tokens_seen": 9127688, "step": 870 }, { "epoch": 0.22534123100695339, "grad_norm": 5.220175192497493, "learning_rate": 8.869868344576459e-05, "loss": 0.5449, "num_input_tokens_seen": 9180624, "step": 875 }, { "epoch": 0.22662889518413598, "grad_norm": 2.2022914161050577, "learning_rate": 8.854425489508532e-05, "loss": 0.5062, "num_input_tokens_seen": 9233176, "step": 880 }, { "epoch": 0.22791655936131858, "grad_norm": 4.62379059067999, "learning_rate": 8.838891476709288e-05, "loss": 0.5033, "num_input_tokens_seen": 9286688, "step": 885 }, { "epoch": 0.22920422353850115, "grad_norm": 3.639684630492015, "learning_rate": 8.823266673560426e-05, "loss": 0.4845, "num_input_tokens_seen": 9339600, "step": 890 }, { "epoch": 0.23049188771568374, "grad_norm": 4.131757647310936, "learning_rate": 8.807551449590846e-05, "loss": 0.5595, "num_input_tokens_seen": 9391536, "step": 895 }, { "epoch": 0.23177955189286634, "grad_norm": 4.771128685196347, "learning_rate": 8.791746176467907e-05, "loss": 0.5251, "num_input_tokens_seen": 9443616, "step": 900 }, { "epoch": 0.23177955189286634, "eval_loss": 0.49604204297065735, "eval_runtime": 39.5289, "eval_samples_per_second": 3.036, "eval_steps_per_second": 0.759, "num_input_tokens_seen": 9443616, "step": 900 }, { "epoch": 0.23306721607004893, "grad_norm": 6.849781513397169, "learning_rate": 8.775851227988656e-05, "loss": 0.5774, "num_input_tokens_seen": 9497304, "step": 905 }, { "epoch": 0.23435488024723153, "grad_norm": 2.526801567699946, "learning_rate": 8.759866980070963e-05, "loss": 0.5441, "num_input_tokens_seen": 9549416, "step": 910 }, { "epoch": 0.23564254442441412, "grad_norm": 3.1008408808291503, "learning_rate": 8.743793810744654e-05, "loss": 0.4898, "num_input_tokens_seen": 9601800, "step": 915 }, { "epoch": 0.2369302086015967, "grad_norm": 4.120824184689494, "learning_rate": 8.727632100142551e-05, "loss": 0.4681, "num_input_tokens_seen": 9653600, "step": 920 }, { "epoch": 0.2382178727787793, "grad_norm": 5.251488809494114, "learning_rate": 8.711382230491493e-05, "loss": 0.4946, "num_input_tokens_seen": 9707224, "step": 925 }, { "epoch": 0.23950553695596188, "grad_norm": 6.885034741125289, "learning_rate": 8.695044586103296e-05, "loss": 0.5517, "num_input_tokens_seen": 9760096, "step": 930 }, { "epoch": 0.24079320113314448, "grad_norm": 4.6246077239626855, "learning_rate": 8.678619553365659e-05, "loss": 0.6064, "num_input_tokens_seen": 9812672, "step": 935 }, { "epoch": 0.24208086531032708, "grad_norm": 5.621020693846077, "learning_rate": 8.662107520733027e-05, "loss": 0.5398, "num_input_tokens_seen": 9866200, "step": 940 }, { "epoch": 0.24336852948750964, "grad_norm": 3.1921985322817092, "learning_rate": 8.64550887871741e-05, "loss": 0.5068, "num_input_tokens_seen": 9918160, "step": 945 }, { "epoch": 0.24465619366469224, "grad_norm": 2.3689648161336465, "learning_rate": 8.628824019879137e-05, "loss": 0.5862, "num_input_tokens_seen": 9970600, "step": 950 }, { "epoch": 0.24465619366469224, "eval_loss": 0.5085262656211853, "eval_runtime": 39.0437, "eval_samples_per_second": 3.073, "eval_steps_per_second": 0.768, "num_input_tokens_seen": 9970600, "step": 950 }, { "epoch": 0.24594385784187484, "grad_norm": 2.8827978223065363, "learning_rate": 8.612053338817581e-05, "loss": 0.4549, "num_input_tokens_seen": 10022248, "step": 955 }, { "epoch": 0.24723152201905743, "grad_norm": 6.662877258417003, "learning_rate": 8.595197232161824e-05, "loss": 0.4791, "num_input_tokens_seen": 10075280, "step": 960 }, { "epoch": 0.24851918619624003, "grad_norm": 8.140970355143077, "learning_rate": 8.578256098561275e-05, "loss": 0.4833, "num_input_tokens_seen": 10128392, "step": 965 }, { "epoch": 0.24980685037342262, "grad_norm": 3.243184767888501, "learning_rate": 8.561230338676239e-05, "loss": 0.4672, "num_input_tokens_seen": 10180720, "step": 970 }, { "epoch": 0.2510945145506052, "grad_norm": 6.588760068173114, "learning_rate": 8.544120355168451e-05, "loss": 0.5205, "num_input_tokens_seen": 10233256, "step": 975 }, { "epoch": 0.2523821787277878, "grad_norm": 2.6240987196110837, "learning_rate": 8.526926552691544e-05, "loss": 0.5124, "num_input_tokens_seen": 10284928, "step": 980 }, { "epoch": 0.2536698429049704, "grad_norm": 8.242761558538728, "learning_rate": 8.509649337881483e-05, "loss": 0.5034, "num_input_tokens_seen": 10338208, "step": 985 }, { "epoch": 0.254957507082153, "grad_norm": 8.922137566500533, "learning_rate": 8.492289119346943e-05, "loss": 0.5226, "num_input_tokens_seen": 10390224, "step": 990 }, { "epoch": 0.25624517125933555, "grad_norm": 4.922275874717211, "learning_rate": 8.474846307659658e-05, "loss": 0.5399, "num_input_tokens_seen": 10443080, "step": 995 }, { "epoch": 0.25753283543651817, "grad_norm": 6.866585614783304, "learning_rate": 8.457321315344694e-05, "loss": 0.483, "num_input_tokens_seen": 10495592, "step": 1000 }, { "epoch": 0.25753283543651817, "eval_loss": 0.5305114388465881, "eval_runtime": 38.9297, "eval_samples_per_second": 3.082, "eval_steps_per_second": 0.771, "num_input_tokens_seen": 10495592, "step": 1000 }, { "epoch": 0.25882049961370074, "grad_norm": 8.233033578002926, "learning_rate": 8.439714556870704e-05, "loss": 0.568, "num_input_tokens_seen": 10548136, "step": 1005 }, { "epoch": 0.26010816379088336, "grad_norm": 5.3701298824478485, "learning_rate": 8.422026448640124e-05, "loss": 0.4335, "num_input_tokens_seen": 10600048, "step": 1010 }, { "epoch": 0.26139582796806593, "grad_norm": 5.491882026124958, "learning_rate": 8.40425740897932e-05, "loss": 0.5385, "num_input_tokens_seen": 10652160, "step": 1015 }, { "epoch": 0.2626834921452485, "grad_norm": 5.479941792055548, "learning_rate": 8.386407858128706e-05, "loss": 0.5171, "num_input_tokens_seen": 10705208, "step": 1020 }, { "epoch": 0.2639711563224311, "grad_norm": 3.489116106033337, "learning_rate": 8.368478218232787e-05, "loss": 0.5201, "num_input_tokens_seen": 10758688, "step": 1025 }, { "epoch": 0.2652588204996137, "grad_norm": 5.923123692460237, "learning_rate": 8.350468913330192e-05, "loss": 0.5521, "num_input_tokens_seen": 10811408, "step": 1030 }, { "epoch": 0.2665464846767963, "grad_norm": 2.7605406738569824, "learning_rate": 8.33238036934364e-05, "loss": 0.4938, "num_input_tokens_seen": 10864144, "step": 1035 }, { "epoch": 0.2678341488539789, "grad_norm": 5.500647711838314, "learning_rate": 8.31421301406986e-05, "loss": 0.4828, "num_input_tokens_seen": 10916952, "step": 1040 }, { "epoch": 0.26912181303116145, "grad_norm": 6.823855575342733, "learning_rate": 8.29596727716949e-05, "loss": 0.5491, "num_input_tokens_seen": 10968824, "step": 1045 }, { "epoch": 0.2704094772083441, "grad_norm": 5.409054743152559, "learning_rate": 8.277643590156894e-05, "loss": 0.4628, "num_input_tokens_seen": 11021656, "step": 1050 }, { "epoch": 0.2704094772083441, "eval_loss": 0.5039986371994019, "eval_runtime": 40.3009, "eval_samples_per_second": 2.978, "eval_steps_per_second": 0.744, "num_input_tokens_seen": 11021656, "step": 1050 }, { "epoch": 0.27169714138552664, "grad_norm": 3.2588151986321994, "learning_rate": 8.259242386389973e-05, "loss": 0.4586, "num_input_tokens_seen": 11074336, "step": 1055 }, { "epoch": 0.27298480556270927, "grad_norm": 12.995641199019554, "learning_rate": 8.240764101059912e-05, "loss": 0.4939, "num_input_tokens_seen": 11126776, "step": 1060 }, { "epoch": 0.27427246973989183, "grad_norm": 8.713479932798109, "learning_rate": 8.222209171180883e-05, "loss": 0.4978, "num_input_tokens_seen": 11179680, "step": 1065 }, { "epoch": 0.2755601339170744, "grad_norm": 3.6728132957332016, "learning_rate": 8.203578035579715e-05, "loss": 0.5695, "num_input_tokens_seen": 11231616, "step": 1070 }, { "epoch": 0.276847798094257, "grad_norm": 9.661110166832387, "learning_rate": 8.184871134885513e-05, "loss": 0.4635, "num_input_tokens_seen": 11283720, "step": 1075 }, { "epoch": 0.2781354622714396, "grad_norm": 5.4096015474623576, "learning_rate": 8.166088911519235e-05, "loss": 0.4974, "num_input_tokens_seen": 11336144, "step": 1080 }, { "epoch": 0.2794231264486222, "grad_norm": 5.353663008589148, "learning_rate": 8.147231809683236e-05, "loss": 0.4439, "num_input_tokens_seen": 11389128, "step": 1085 }, { "epoch": 0.2807107906258048, "grad_norm": 3.863008112890598, "learning_rate": 8.128300275350756e-05, "loss": 0.4368, "num_input_tokens_seen": 11441864, "step": 1090 }, { "epoch": 0.2819984548029874, "grad_norm": 5.545035623030093, "learning_rate": 8.109294756255375e-05, "loss": 0.4895, "num_input_tokens_seen": 11494880, "step": 1095 }, { "epoch": 0.28328611898017, "grad_norm": 5.124762488175073, "learning_rate": 8.090215701880419e-05, "loss": 0.4825, "num_input_tokens_seen": 11547008, "step": 1100 }, { "epoch": 0.28328611898017, "eval_loss": 0.4798590838909149, "eval_runtime": 40.6942, "eval_samples_per_second": 2.949, "eval_steps_per_second": 0.737, "num_input_tokens_seen": 11547008, "step": 1100 } ], "logging_steps": 5, "max_steps": 3400, "num_input_tokens_seen": 11547008, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 761815920803840.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }