{ "best_metric": 0.6319106221199036, "best_model_checkpoint": "saves/CADICA_qwenvl_stenosis_classily_scale4_frozenVision/lora/sft/checkpoint-1600", "epoch": 1.750965748132887, "eval_steps": 50, "global_step": 3400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0025753283543651817, "grad_norm": 21.336819681898895, "learning_rate": 2.9411764705882355e-06, "loss": 3.0444, "num_input_tokens_seen": 58496, "step": 5 }, { "epoch": 0.0051506567087303634, "grad_norm": 20.576623155848594, "learning_rate": 5.882352941176471e-06, "loss": 2.9824, "num_input_tokens_seen": 116960, "step": 10 }, { "epoch": 0.007725985063095545, "grad_norm": 22.989873871108518, "learning_rate": 8.823529411764707e-06, "loss": 2.8371, "num_input_tokens_seen": 175448, "step": 15 }, { "epoch": 0.010301313417460727, "grad_norm": 19.533434089690918, "learning_rate": 1.1764705882352942e-05, "loss": 2.5198, "num_input_tokens_seen": 233944, "step": 20 }, { "epoch": 0.012876641771825908, "grad_norm": 12.509494197145006, "learning_rate": 1.4705882352941177e-05, "loss": 1.772, "num_input_tokens_seen": 292416, "step": 25 }, { "epoch": 0.01545197012619109, "grad_norm": 3.6901887027066667, "learning_rate": 1.7647058823529414e-05, "loss": 1.2263, "num_input_tokens_seen": 350904, "step": 30 }, { "epoch": 0.018027298480556272, "grad_norm": 2.3996076770849744, "learning_rate": 2.058823529411765e-05, "loss": 1.0102, "num_input_tokens_seen": 409384, "step": 35 }, { "epoch": 0.020602626834921454, "grad_norm": 0.9253415848864577, "learning_rate": 2.3529411764705884e-05, "loss": 0.9378, "num_input_tokens_seen": 467864, "step": 40 }, { "epoch": 0.023177955189286635, "grad_norm": 1.1966244115097795, "learning_rate": 2.647058823529412e-05, "loss": 0.9265, "num_input_tokens_seen": 526384, "step": 45 }, { "epoch": 0.025753283543651816, "grad_norm": 1.853648349752417, "learning_rate": 2.9411764705882354e-05, "loss": 0.9157, "num_input_tokens_seen": 584856, "step": 50 }, { "epoch": 0.025753283543651816, "eval_loss": 0.9191630482673645, "eval_runtime": 36.6123, "eval_samples_per_second": 1.639, "eval_steps_per_second": 0.41, "num_input_tokens_seen": 584856, "step": 50 }, { "epoch": 0.028328611898016998, "grad_norm": 0.8294990584587586, "learning_rate": 3.235294117647059e-05, "loss": 0.9009, "num_input_tokens_seen": 643344, "step": 55 }, { "epoch": 0.03090394025238218, "grad_norm": 0.8278765532866457, "learning_rate": 3.529411764705883e-05, "loss": 0.9063, "num_input_tokens_seen": 701808, "step": 60 }, { "epoch": 0.03347926860674736, "grad_norm": 0.7285901101792476, "learning_rate": 3.8235294117647055e-05, "loss": 0.9031, "num_input_tokens_seen": 760304, "step": 65 }, { "epoch": 0.036054596961112545, "grad_norm": 0.5341783688819233, "learning_rate": 4.11764705882353e-05, "loss": 0.8991, "num_input_tokens_seen": 818760, "step": 70 }, { "epoch": 0.03862992531547772, "grad_norm": 0.46059313680988906, "learning_rate": 4.411764705882353e-05, "loss": 0.9055, "num_input_tokens_seen": 877256, "step": 75 }, { "epoch": 0.04120525366984291, "grad_norm": 0.8194379237293679, "learning_rate": 4.705882352941177e-05, "loss": 0.9092, "num_input_tokens_seen": 935752, "step": 80 }, { "epoch": 0.043780582024208085, "grad_norm": 0.6745093544830881, "learning_rate": 5e-05, "loss": 0.9069, "num_input_tokens_seen": 994216, "step": 85 }, { "epoch": 0.04635591037857327, "grad_norm": 0.2894672897884604, "learning_rate": 5.294117647058824e-05, "loss": 0.8924, "num_input_tokens_seen": 1052704, "step": 90 }, { "epoch": 0.04893123873293845, "grad_norm": 0.5108489024576455, "learning_rate": 5.588235294117647e-05, "loss": 0.9059, "num_input_tokens_seen": 1111176, "step": 95 }, { "epoch": 0.05150656708730363, "grad_norm": 0.40317180386305224, "learning_rate": 5.882352941176471e-05, "loss": 0.901, "num_input_tokens_seen": 1169664, "step": 100 }, { "epoch": 0.05150656708730363, "eval_loss": 0.9077914953231812, "eval_runtime": 16.8879, "eval_samples_per_second": 3.553, "eval_steps_per_second": 0.888, "num_input_tokens_seen": 1169664, "step": 100 }, { "epoch": 0.05408189544166881, "grad_norm": 0.412918917979438, "learning_rate": 6.176470588235295e-05, "loss": 0.9159, "num_input_tokens_seen": 1228112, "step": 105 }, { "epoch": 0.056657223796033995, "grad_norm": 0.34797408069968117, "learning_rate": 6.470588235294118e-05, "loss": 0.91, "num_input_tokens_seen": 1286608, "step": 110 }, { "epoch": 0.05923255215039917, "grad_norm": 0.27558494796967653, "learning_rate": 6.764705882352942e-05, "loss": 0.9047, "num_input_tokens_seen": 1345072, "step": 115 }, { "epoch": 0.06180788050476436, "grad_norm": 0.5422134023513459, "learning_rate": 7.058823529411765e-05, "loss": 0.9022, "num_input_tokens_seen": 1403544, "step": 120 }, { "epoch": 0.06438320885912954, "grad_norm": 0.4452796218739235, "learning_rate": 7.352941176470589e-05, "loss": 0.9081, "num_input_tokens_seen": 1462024, "step": 125 }, { "epoch": 0.06695853721349472, "grad_norm": 0.5632558160730559, "learning_rate": 7.647058823529411e-05, "loss": 0.8939, "num_input_tokens_seen": 1520528, "step": 130 }, { "epoch": 0.0695338655678599, "grad_norm": 0.3383115884436812, "learning_rate": 7.941176470588235e-05, "loss": 0.9029, "num_input_tokens_seen": 1579024, "step": 135 }, { "epoch": 0.07210919392222509, "grad_norm": 0.3506611095466577, "learning_rate": 8.23529411764706e-05, "loss": 0.9014, "num_input_tokens_seen": 1637504, "step": 140 }, { "epoch": 0.07468452227659027, "grad_norm": 0.6328034405712752, "learning_rate": 8.529411764705883e-05, "loss": 0.9053, "num_input_tokens_seen": 1696024, "step": 145 }, { "epoch": 0.07725985063095545, "grad_norm": 0.3511657661506363, "learning_rate": 8.823529411764706e-05, "loss": 0.9032, "num_input_tokens_seen": 1754512, "step": 150 }, { "epoch": 0.07725985063095545, "eval_loss": 0.8962129950523376, "eval_runtime": 17.0673, "eval_samples_per_second": 3.515, "eval_steps_per_second": 0.879, "num_input_tokens_seen": 1754512, "step": 150 }, { "epoch": 0.07983517898532062, "grad_norm": 0.4047681172482029, "learning_rate": 9.11764705882353e-05, "loss": 0.8985, "num_input_tokens_seen": 1812976, "step": 155 }, { "epoch": 0.08241050733968582, "grad_norm": 0.37729033726569733, "learning_rate": 9.411764705882353e-05, "loss": 0.8949, "num_input_tokens_seen": 1871464, "step": 160 }, { "epoch": 0.08498583569405099, "grad_norm": 0.4655744785034158, "learning_rate": 9.705882352941177e-05, "loss": 0.9069, "num_input_tokens_seen": 1929928, "step": 165 }, { "epoch": 0.08756116404841617, "grad_norm": 0.30643056878817176, "learning_rate": 0.0001, "loss": 0.9049, "num_input_tokens_seen": 1988432, "step": 170 }, { "epoch": 0.09013649240278135, "grad_norm": 0.39944696269496754, "learning_rate": 9.999940874631277e-05, "loss": 0.9026, "num_input_tokens_seen": 2046920, "step": 175 }, { "epoch": 0.09271182075714654, "grad_norm": 0.31301259106593154, "learning_rate": 9.999763499923432e-05, "loss": 0.8984, "num_input_tokens_seen": 2105392, "step": 180 }, { "epoch": 0.09528714911151172, "grad_norm": 0.4309753054454554, "learning_rate": 9.999467880071402e-05, "loss": 0.9057, "num_input_tokens_seen": 2163872, "step": 185 }, { "epoch": 0.0978624774658769, "grad_norm": 0.262930252305763, "learning_rate": 9.999054022066641e-05, "loss": 0.9078, "num_input_tokens_seen": 2222352, "step": 190 }, { "epoch": 0.10043780582024209, "grad_norm": 0.22073598270887426, "learning_rate": 9.998521935696953e-05, "loss": 0.9028, "num_input_tokens_seen": 2280800, "step": 195 }, { "epoch": 0.10301313417460727, "grad_norm": 0.23764668792524696, "learning_rate": 9.997871633546257e-05, "loss": 0.9053, "num_input_tokens_seen": 2339304, "step": 200 }, { "epoch": 0.10301313417460727, "eval_loss": 0.8982028961181641, "eval_runtime": 16.9118, "eval_samples_per_second": 3.548, "eval_steps_per_second": 0.887, "num_input_tokens_seen": 2339304, "step": 200 }, { "epoch": 0.10558846252897244, "grad_norm": 0.6222576114383499, "learning_rate": 9.997103130994296e-05, "loss": 0.9003, "num_input_tokens_seen": 2397808, "step": 205 }, { "epoch": 0.10816379088333762, "grad_norm": 0.2983149992592585, "learning_rate": 9.996216446216267e-05, "loss": 0.8969, "num_input_tokens_seen": 2456288, "step": 210 }, { "epoch": 0.11073911923770281, "grad_norm": 0.3505370510576513, "learning_rate": 9.995211600182397e-05, "loss": 0.9114, "num_input_tokens_seen": 2514784, "step": 215 }, { "epoch": 0.11331444759206799, "grad_norm": 0.3683806652106065, "learning_rate": 9.994088616657444e-05, "loss": 0.899, "num_input_tokens_seen": 2573240, "step": 220 }, { "epoch": 0.11588977594643317, "grad_norm": 0.21111769827155855, "learning_rate": 9.992847522200133e-05, "loss": 0.898, "num_input_tokens_seen": 2631672, "step": 225 }, { "epoch": 0.11846510430079835, "grad_norm": 0.3426987181783304, "learning_rate": 9.99148834616253e-05, "loss": 0.9006, "num_input_tokens_seen": 2690112, "step": 230 }, { "epoch": 0.12104043265516354, "grad_norm": 0.236983209071443, "learning_rate": 9.990011120689351e-05, "loss": 0.8973, "num_input_tokens_seen": 2748608, "step": 235 }, { "epoch": 0.12361576100952872, "grad_norm": 0.4575208248826409, "learning_rate": 9.988415880717194e-05, "loss": 0.8885, "num_input_tokens_seen": 2807080, "step": 240 }, { "epoch": 0.1261910893638939, "grad_norm": 0.5470317919414993, "learning_rate": 9.986702663973722e-05, "loss": 0.9066, "num_input_tokens_seen": 2865520, "step": 245 }, { "epoch": 0.12876641771825909, "grad_norm": 0.4992479706331095, "learning_rate": 9.98487151097676e-05, "loss": 0.9098, "num_input_tokens_seen": 2924016, "step": 250 }, { "epoch": 0.12876641771825909, "eval_loss": 0.8956434726715088, "eval_runtime": 17.4804, "eval_samples_per_second": 3.432, "eval_steps_per_second": 0.858, "num_input_tokens_seen": 2924016, "step": 250 }, { "epoch": 0.13134174607262425, "grad_norm": 0.3762164361984238, "learning_rate": 9.98292246503335e-05, "loss": 0.8987, "num_input_tokens_seen": 2982520, "step": 255 }, { "epoch": 0.13391707442698944, "grad_norm": 0.6447043002410199, "learning_rate": 9.980855572238714e-05, "loss": 0.9036, "num_input_tokens_seen": 3041008, "step": 260 }, { "epoch": 0.13649240278135463, "grad_norm": 0.5308092769971742, "learning_rate": 9.978670881475172e-05, "loss": 0.8961, "num_input_tokens_seen": 3099464, "step": 265 }, { "epoch": 0.1390677311357198, "grad_norm": 0.508333330469703, "learning_rate": 9.976368444410985e-05, "loss": 0.9012, "num_input_tokens_seen": 3157944, "step": 270 }, { "epoch": 0.141643059490085, "grad_norm": 0.6801788563719119, "learning_rate": 9.973948315499126e-05, "loss": 0.8985, "num_input_tokens_seen": 3216448, "step": 275 }, { "epoch": 0.14421838784445018, "grad_norm": 0.6933074703933572, "learning_rate": 9.971410551976002e-05, "loss": 0.9114, "num_input_tokens_seen": 3274928, "step": 280 }, { "epoch": 0.14679371619881534, "grad_norm": 0.21208820897494882, "learning_rate": 9.968755213860094e-05, "loss": 0.8886, "num_input_tokens_seen": 3333408, "step": 285 }, { "epoch": 0.14936904455318054, "grad_norm": 0.5791422669000065, "learning_rate": 9.96598236395054e-05, "loss": 0.8929, "num_input_tokens_seen": 3391896, "step": 290 }, { "epoch": 0.1519443729075457, "grad_norm": 0.3460368893191152, "learning_rate": 9.96309206782565e-05, "loss": 0.9091, "num_input_tokens_seen": 3450392, "step": 295 }, { "epoch": 0.1545197012619109, "grad_norm": 0.22425222135997747, "learning_rate": 9.960084393841355e-05, "loss": 0.8893, "num_input_tokens_seen": 3508888, "step": 300 }, { "epoch": 0.1545197012619109, "eval_loss": 0.8908902406692505, "eval_runtime": 16.9521, "eval_samples_per_second": 3.539, "eval_steps_per_second": 0.885, "num_input_tokens_seen": 3508888, "step": 300 }, { "epoch": 0.15709502961627608, "grad_norm": 0.23111596622064604, "learning_rate": 9.956959413129585e-05, "loss": 0.9056, "num_input_tokens_seen": 3567368, "step": 305 }, { "epoch": 0.15967035797064125, "grad_norm": 0.3918406894807393, "learning_rate": 9.953717199596598e-05, "loss": 0.8982, "num_input_tokens_seen": 3625848, "step": 310 }, { "epoch": 0.16224568632500644, "grad_norm": 0.22081666860189372, "learning_rate": 9.95035782992122e-05, "loss": 0.8968, "num_input_tokens_seen": 3684336, "step": 315 }, { "epoch": 0.16482101467937163, "grad_norm": 0.18024383676398176, "learning_rate": 9.94688138355304e-05, "loss": 0.8975, "num_input_tokens_seen": 3742800, "step": 320 }, { "epoch": 0.1673963430337368, "grad_norm": 0.3866897344302321, "learning_rate": 9.943287942710527e-05, "loss": 0.9061, "num_input_tokens_seen": 3801280, "step": 325 }, { "epoch": 0.16997167138810199, "grad_norm": 0.4804151381712559, "learning_rate": 9.939577592379088e-05, "loss": 0.8948, "num_input_tokens_seen": 3859792, "step": 330 }, { "epoch": 0.17254699974246718, "grad_norm": 0.35878231707669056, "learning_rate": 9.935750420309055e-05, "loss": 0.9063, "num_input_tokens_seen": 3918272, "step": 335 }, { "epoch": 0.17512232809683234, "grad_norm": 0.8713957774909928, "learning_rate": 9.931806517013612e-05, "loss": 0.8952, "num_input_tokens_seen": 3976760, "step": 340 }, { "epoch": 0.17769765645119753, "grad_norm": 0.6671526212854116, "learning_rate": 9.927745975766654e-05, "loss": 0.9136, "num_input_tokens_seen": 4035240, "step": 345 }, { "epoch": 0.1802729848055627, "grad_norm": 0.28702679234521244, "learning_rate": 9.923568892600578e-05, "loss": 0.9075, "num_input_tokens_seen": 4093688, "step": 350 }, { "epoch": 0.1802729848055627, "eval_loss": 0.89204341173172, "eval_runtime": 16.5819, "eval_samples_per_second": 3.618, "eval_steps_per_second": 0.905, "num_input_tokens_seen": 4093688, "step": 350 }, { "epoch": 0.1828483131599279, "grad_norm": 0.32233149132200706, "learning_rate": 9.91927536630402e-05, "loss": 0.8812, "num_input_tokens_seen": 4152160, "step": 355 }, { "epoch": 0.18542364151429308, "grad_norm": 0.5071871697326992, "learning_rate": 9.91486549841951e-05, "loss": 0.9109, "num_input_tokens_seen": 4210648, "step": 360 }, { "epoch": 0.18799896986865824, "grad_norm": 0.4532792519849944, "learning_rate": 9.91033939324107e-05, "loss": 0.9176, "num_input_tokens_seen": 4269136, "step": 365 }, { "epoch": 0.19057429822302344, "grad_norm": 0.5409761562534501, "learning_rate": 9.905697157811761e-05, "loss": 0.9077, "num_input_tokens_seen": 4327664, "step": 370 }, { "epoch": 0.19314962657738863, "grad_norm": 0.3432361562809093, "learning_rate": 9.900938901921131e-05, "loss": 0.893, "num_input_tokens_seen": 4386120, "step": 375 }, { "epoch": 0.1957249549317538, "grad_norm": 0.4756530294720616, "learning_rate": 9.896064738102635e-05, "loss": 0.9094, "num_input_tokens_seen": 4444560, "step": 380 }, { "epoch": 0.19830028328611898, "grad_norm": 0.424836974193983, "learning_rate": 9.891074781630966e-05, "loss": 0.9091, "num_input_tokens_seen": 4503016, "step": 385 }, { "epoch": 0.20087561164048418, "grad_norm": 0.31316926977469683, "learning_rate": 9.885969150519331e-05, "loss": 0.9033, "num_input_tokens_seen": 4561496, "step": 390 }, { "epoch": 0.20345093999484934, "grad_norm": 0.6108378682480797, "learning_rate": 9.88074796551666e-05, "loss": 0.8851, "num_input_tokens_seen": 4619944, "step": 395 }, { "epoch": 0.20602626834921453, "grad_norm": 0.38294566619219206, "learning_rate": 9.875411350104744e-05, "loss": 0.9004, "num_input_tokens_seen": 4678384, "step": 400 }, { "epoch": 0.20602626834921453, "eval_loss": 0.9086406826972961, "eval_runtime": 16.7827, "eval_samples_per_second": 3.575, "eval_steps_per_second": 0.894, "num_input_tokens_seen": 4678384, "step": 400 }, { "epoch": 0.2086015967035797, "grad_norm": 0.4283475401297436, "learning_rate": 9.86995943049533e-05, "loss": 0.8976, "num_input_tokens_seen": 4736904, "step": 405 }, { "epoch": 0.2111769250579449, "grad_norm": 0.40329738287583206, "learning_rate": 9.864392335627117e-05, "loss": 0.9134, "num_input_tokens_seen": 4795376, "step": 410 }, { "epoch": 0.21375225341231008, "grad_norm": 0.37890634863656475, "learning_rate": 9.858710197162721e-05, "loss": 0.8955, "num_input_tokens_seen": 4853880, "step": 415 }, { "epoch": 0.21632758176667524, "grad_norm": 0.32402245835420784, "learning_rate": 9.852913149485556e-05, "loss": 0.9014, "num_input_tokens_seen": 4912360, "step": 420 }, { "epoch": 0.21890291012104043, "grad_norm": 0.49572499508345125, "learning_rate": 9.847001329696653e-05, "loss": 0.9065, "num_input_tokens_seen": 4970872, "step": 425 }, { "epoch": 0.22147823847540563, "grad_norm": 0.11883567118448765, "learning_rate": 9.840974877611422e-05, "loss": 0.8952, "num_input_tokens_seen": 5029304, "step": 430 }, { "epoch": 0.2240535668297708, "grad_norm": 0.7105724703149633, "learning_rate": 9.834833935756344e-05, "loss": 0.9106, "num_input_tokens_seen": 5087800, "step": 435 }, { "epoch": 0.22662889518413598, "grad_norm": 0.708953365388227, "learning_rate": 9.828578649365601e-05, "loss": 0.8996, "num_input_tokens_seen": 5146312, "step": 440 }, { "epoch": 0.22920422353850115, "grad_norm": 0.4503080730364326, "learning_rate": 9.822209166377635e-05, "loss": 0.8999, "num_input_tokens_seen": 5204800, "step": 445 }, { "epoch": 0.23177955189286634, "grad_norm": 0.20754132336834788, "learning_rate": 9.815725637431662e-05, "loss": 0.9076, "num_input_tokens_seen": 5263304, "step": 450 }, { "epoch": 0.23177955189286634, "eval_loss": 0.8962157368659973, "eval_runtime": 17.2029, "eval_samples_per_second": 3.488, "eval_steps_per_second": 0.872, "num_input_tokens_seen": 5263304, "step": 450 }, { "epoch": 0.23435488024723153, "grad_norm": 0.5906403377099594, "learning_rate": 9.809128215864097e-05, "loss": 0.8942, "num_input_tokens_seen": 5321760, "step": 455 }, { "epoch": 0.2369302086015967, "grad_norm": 0.5706805631290568, "learning_rate": 9.802417057704931e-05, "loss": 0.9099, "num_input_tokens_seen": 5380224, "step": 460 }, { "epoch": 0.23950553695596188, "grad_norm": 0.164631948732384, "learning_rate": 9.795592321674045e-05, "loss": 0.8981, "num_input_tokens_seen": 5438704, "step": 465 }, { "epoch": 0.24208086531032708, "grad_norm": 0.32986780285522194, "learning_rate": 9.788654169177453e-05, "loss": 0.8952, "num_input_tokens_seen": 5497208, "step": 470 }, { "epoch": 0.24465619366469224, "grad_norm": 0.40551569446674784, "learning_rate": 9.781602764303487e-05, "loss": 0.8959, "num_input_tokens_seen": 5555704, "step": 475 }, { "epoch": 0.24723152201905743, "grad_norm": 0.20928586231326682, "learning_rate": 9.774438273818911e-05, "loss": 0.901, "num_input_tokens_seen": 5614160, "step": 480 }, { "epoch": 0.24980685037342262, "grad_norm": 0.34365307116824517, "learning_rate": 9.767160867164979e-05, "loss": 0.9008, "num_input_tokens_seen": 5672640, "step": 485 }, { "epoch": 0.2523821787277878, "grad_norm": 0.4212274243028996, "learning_rate": 9.759770716453436e-05, "loss": 0.9016, "num_input_tokens_seen": 5731072, "step": 490 }, { "epoch": 0.254957507082153, "grad_norm": 0.39823625576558597, "learning_rate": 9.752267996462434e-05, "loss": 0.9132, "num_input_tokens_seen": 5789544, "step": 495 }, { "epoch": 0.25753283543651817, "grad_norm": 0.24856324117583653, "learning_rate": 9.744652884632406e-05, "loss": 0.8962, "num_input_tokens_seen": 5848048, "step": 500 }, { "epoch": 0.25753283543651817, "eval_loss": 0.8987945914268494, "eval_runtime": 17.1622, "eval_samples_per_second": 3.496, "eval_steps_per_second": 0.874, "num_input_tokens_seen": 5848048, "step": 500 }, { "epoch": 0.26010816379088336, "grad_norm": 0.25461397268106634, "learning_rate": 9.736925561061871e-05, "loss": 0.8954, "num_input_tokens_seen": 5906512, "step": 505 }, { "epoch": 0.2626834921452485, "grad_norm": 0.38602603275675745, "learning_rate": 9.729086208503174e-05, "loss": 0.8927, "num_input_tokens_seen": 5965024, "step": 510 }, { "epoch": 0.2652588204996137, "grad_norm": 0.150082825225123, "learning_rate": 9.721135012358156e-05, "loss": 0.898, "num_input_tokens_seen": 6023496, "step": 515 }, { "epoch": 0.2678341488539789, "grad_norm": 0.26881662025899655, "learning_rate": 9.713072160673777e-05, "loss": 0.9016, "num_input_tokens_seen": 6082000, "step": 520 }, { "epoch": 0.2704094772083441, "grad_norm": 0.5039123575147229, "learning_rate": 9.704897844137673e-05, "loss": 0.8842, "num_input_tokens_seen": 6140480, "step": 525 }, { "epoch": 0.27298480556270927, "grad_norm": 0.27836945453098666, "learning_rate": 9.696612256073633e-05, "loss": 0.8921, "num_input_tokens_seen": 6198968, "step": 530 }, { "epoch": 0.2755601339170744, "grad_norm": 0.22936338891946384, "learning_rate": 9.688215592437039e-05, "loss": 0.8979, "num_input_tokens_seen": 6257464, "step": 535 }, { "epoch": 0.2781354622714396, "grad_norm": 0.396486857609105, "learning_rate": 9.679708051810221e-05, "loss": 0.8951, "num_input_tokens_seen": 6315944, "step": 540 }, { "epoch": 0.2807107906258048, "grad_norm": 0.4751226662261396, "learning_rate": 9.67108983539777e-05, "loss": 0.9149, "num_input_tokens_seen": 6374408, "step": 545 }, { "epoch": 0.28328611898017, "grad_norm": 0.26829103885131056, "learning_rate": 9.662361147021779e-05, "loss": 0.9013, "num_input_tokens_seen": 6432936, "step": 550 }, { "epoch": 0.28328611898017, "eval_loss": 0.9001271724700928, "eval_runtime": 16.9878, "eval_samples_per_second": 3.532, "eval_steps_per_second": 0.883, "num_input_tokens_seen": 6432936, "step": 550 }, { "epoch": 0.28586144733453517, "grad_norm": 0.5334970266367584, "learning_rate": 9.653522193117013e-05, "loss": 0.8981, "num_input_tokens_seen": 6491400, "step": 555 }, { "epoch": 0.28843677568890036, "grad_norm": 0.33261202813259866, "learning_rate": 9.644573182726035e-05, "loss": 0.9041, "num_input_tokens_seen": 6549872, "step": 560 }, { "epoch": 0.2910121040432655, "grad_norm": 0.19122862132727417, "learning_rate": 9.63551432749426e-05, "loss": 0.9024, "num_input_tokens_seen": 6608296, "step": 565 }, { "epoch": 0.2935874323976307, "grad_norm": 0.27778009425329764, "learning_rate": 9.626345841664953e-05, "loss": 0.9002, "num_input_tokens_seen": 6666768, "step": 570 }, { "epoch": 0.2961627607519959, "grad_norm": 0.3065314332046026, "learning_rate": 9.617067942074153e-05, "loss": 0.9035, "num_input_tokens_seen": 6725248, "step": 575 }, { "epoch": 0.29873808910636107, "grad_norm": 0.24431496415058412, "learning_rate": 9.607680848145558e-05, "loss": 0.9019, "num_input_tokens_seen": 6783680, "step": 580 }, { "epoch": 0.30131341746072626, "grad_norm": 0.27088193021301504, "learning_rate": 9.598184781885318e-05, "loss": 0.9001, "num_input_tokens_seen": 6842144, "step": 585 }, { "epoch": 0.3038887458150914, "grad_norm": 0.33893098113605125, "learning_rate": 9.588579967876806e-05, "loss": 0.8961, "num_input_tokens_seen": 6900656, "step": 590 }, { "epoch": 0.3064640741694566, "grad_norm": 0.3038921833221806, "learning_rate": 9.578866633275288e-05, "loss": 0.9, "num_input_tokens_seen": 6959128, "step": 595 }, { "epoch": 0.3090394025238218, "grad_norm": 0.48929637235055645, "learning_rate": 9.569045007802559e-05, "loss": 0.9046, "num_input_tokens_seen": 7017576, "step": 600 }, { "epoch": 0.3090394025238218, "eval_loss": 0.9053278565406799, "eval_runtime": 17.1218, "eval_samples_per_second": 3.504, "eval_steps_per_second": 0.876, "num_input_tokens_seen": 7017576, "step": 600 }, { "epoch": 0.311614730878187, "grad_norm": 0.3545950949033049, "learning_rate": 9.55911532374151e-05, "loss": 0.9019, "num_input_tokens_seen": 7076032, "step": 605 }, { "epoch": 0.31419005923255217, "grad_norm": 0.2355627006333952, "learning_rate": 9.549077815930636e-05, "loss": 0.8956, "num_input_tokens_seen": 7134536, "step": 610 }, { "epoch": 0.31676538758691736, "grad_norm": 0.17552483625655946, "learning_rate": 9.538932721758474e-05, "loss": 0.898, "num_input_tokens_seen": 7193032, "step": 615 }, { "epoch": 0.3193407159412825, "grad_norm": 0.1749010635522076, "learning_rate": 9.528680281157999e-05, "loss": 0.8991, "num_input_tokens_seen": 7251568, "step": 620 }, { "epoch": 0.3219160442956477, "grad_norm": 0.19885182954224315, "learning_rate": 9.518320736600943e-05, "loss": 0.8961, "num_input_tokens_seen": 7310072, "step": 625 }, { "epoch": 0.3244913726500129, "grad_norm": 0.4778756508206831, "learning_rate": 9.507854333092063e-05, "loss": 0.8994, "num_input_tokens_seen": 7368560, "step": 630 }, { "epoch": 0.32706670100437807, "grad_norm": 0.4123272743887767, "learning_rate": 9.497281318163346e-05, "loss": 0.8925, "num_input_tokens_seen": 7427040, "step": 635 }, { "epoch": 0.32964202935874326, "grad_norm": 0.34409942667705734, "learning_rate": 9.486601941868154e-05, "loss": 0.9087, "num_input_tokens_seen": 7485552, "step": 640 }, { "epoch": 0.3322173577131084, "grad_norm": 0.43327107411223276, "learning_rate": 9.475816456775313e-05, "loss": 0.8924, "num_input_tokens_seen": 7544040, "step": 645 }, { "epoch": 0.3347926860674736, "grad_norm": 0.6643023904352003, "learning_rate": 9.464925117963133e-05, "loss": 0.904, "num_input_tokens_seen": 7602512, "step": 650 }, { "epoch": 0.3347926860674736, "eval_loss": 0.90328449010849, "eval_runtime": 16.1444, "eval_samples_per_second": 3.716, "eval_steps_per_second": 0.929, "num_input_tokens_seen": 7602512, "step": 650 }, { "epoch": 0.3373680144218388, "grad_norm": 0.620349194493935, "learning_rate": 9.453928183013385e-05, "loss": 0.8929, "num_input_tokens_seen": 7660968, "step": 655 }, { "epoch": 0.33994334277620397, "grad_norm": 0.18611846349930314, "learning_rate": 9.442825912005202e-05, "loss": 0.9078, "num_input_tokens_seen": 7719448, "step": 660 }, { "epoch": 0.34251867113056916, "grad_norm": 0.4448289413172567, "learning_rate": 9.431618567508933e-05, "loss": 0.8963, "num_input_tokens_seen": 7777928, "step": 665 }, { "epoch": 0.34509399948493436, "grad_norm": 0.6187189362250411, "learning_rate": 9.420306414579925e-05, "loss": 0.9134, "num_input_tokens_seen": 7836424, "step": 670 }, { "epoch": 0.3476693278392995, "grad_norm": 0.35247743418537675, "learning_rate": 9.408889720752266e-05, "loss": 0.8984, "num_input_tokens_seen": 7894904, "step": 675 }, { "epoch": 0.3502446561936647, "grad_norm": 0.20652916455346712, "learning_rate": 9.397368756032445e-05, "loss": 0.8997, "num_input_tokens_seen": 7953432, "step": 680 }, { "epoch": 0.3528199845480299, "grad_norm": 0.4289996063998063, "learning_rate": 9.385743792892982e-05, "loss": 0.8926, "num_input_tokens_seen": 8011888, "step": 685 }, { "epoch": 0.35539531290239507, "grad_norm": 0.13764054506536547, "learning_rate": 9.374015106265968e-05, "loss": 0.9008, "num_input_tokens_seen": 8070344, "step": 690 }, { "epoch": 0.35797064125676026, "grad_norm": 0.22142459689499855, "learning_rate": 9.362182973536569e-05, "loss": 0.8986, "num_input_tokens_seen": 8128816, "step": 695 }, { "epoch": 0.3605459696111254, "grad_norm": 0.3234539650829873, "learning_rate": 9.35024767453647e-05, "loss": 0.8972, "num_input_tokens_seen": 8187320, "step": 700 }, { "epoch": 0.3605459696111254, "eval_loss": 0.9028835892677307, "eval_runtime": 16.1635, "eval_samples_per_second": 3.712, "eval_steps_per_second": 0.928, "num_input_tokens_seen": 8187320, "step": 700 }, { "epoch": 0.3631212979654906, "grad_norm": 0.3215674690491891, "learning_rate": 9.338209491537257e-05, "loss": 0.8998, "num_input_tokens_seen": 8245776, "step": 705 }, { "epoch": 0.3656966263198558, "grad_norm": 0.36428692362396536, "learning_rate": 9.326068709243727e-05, "loss": 0.8999, "num_input_tokens_seen": 8304280, "step": 710 }, { "epoch": 0.36827195467422097, "grad_norm": 0.280459809393624, "learning_rate": 9.313825614787177e-05, "loss": 0.8983, "num_input_tokens_seen": 8362728, "step": 715 }, { "epoch": 0.37084728302858616, "grad_norm": 0.1819339731162554, "learning_rate": 9.301480497718593e-05, "loss": 0.892, "num_input_tokens_seen": 8421224, "step": 720 }, { "epoch": 0.37342261138295135, "grad_norm": 0.23784840563699303, "learning_rate": 9.289033650001817e-05, "loss": 0.9034, "num_input_tokens_seen": 8479720, "step": 725 }, { "epoch": 0.3759979397373165, "grad_norm": 0.24070744588741375, "learning_rate": 9.276485366006634e-05, "loss": 0.895, "num_input_tokens_seen": 8538192, "step": 730 }, { "epoch": 0.3785732680916817, "grad_norm": 0.24846723619231478, "learning_rate": 9.263835942501807e-05, "loss": 0.8973, "num_input_tokens_seen": 8596664, "step": 735 }, { "epoch": 0.3811485964460469, "grad_norm": 0.2601614440419362, "learning_rate": 9.251085678648072e-05, "loss": 0.8972, "num_input_tokens_seen": 8655128, "step": 740 }, { "epoch": 0.38372392480041206, "grad_norm": 0.30194733839751087, "learning_rate": 9.238234875991046e-05, "loss": 0.8987, "num_input_tokens_seen": 8713624, "step": 745 }, { "epoch": 0.38629925315477726, "grad_norm": 0.3015609177439829, "learning_rate": 9.225283838454111e-05, "loss": 0.9005, "num_input_tokens_seen": 8772104, "step": 750 }, { "epoch": 0.38629925315477726, "eval_loss": 0.8981761336326599, "eval_runtime": 16.0177, "eval_samples_per_second": 3.746, "eval_steps_per_second": 0.936, "num_input_tokens_seen": 8772104, "step": 750 }, { "epoch": 0.3888745815091424, "grad_norm": 0.44991480631292463, "learning_rate": 9.21223287233121e-05, "loss": 0.8973, "num_input_tokens_seen": 8830568, "step": 755 }, { "epoch": 0.3914499098635076, "grad_norm": 0.22570310903133853, "learning_rate": 9.199082286279622e-05, "loss": 0.8974, "num_input_tokens_seen": 8889072, "step": 760 }, { "epoch": 0.3940252382178728, "grad_norm": 0.22090133233732026, "learning_rate": 9.185832391312644e-05, "loss": 0.8985, "num_input_tokens_seen": 8947568, "step": 765 }, { "epoch": 0.39660056657223797, "grad_norm": 0.23738058530347297, "learning_rate": 9.172483500792244e-05, "loss": 0.8935, "num_input_tokens_seen": 9006056, "step": 770 }, { "epoch": 0.39917589492660316, "grad_norm": 0.41232659301572594, "learning_rate": 9.159035930421658e-05, "loss": 0.8985, "num_input_tokens_seen": 9064592, "step": 775 }, { "epoch": 0.40175122328096835, "grad_norm": 0.2004855543001356, "learning_rate": 9.145489998237902e-05, "loss": 0.9105, "num_input_tokens_seen": 9123096, "step": 780 }, { "epoch": 0.4043265516353335, "grad_norm": 0.16209487510237375, "learning_rate": 9.131846024604274e-05, "loss": 0.8925, "num_input_tokens_seen": 9181576, "step": 785 }, { "epoch": 0.4069018799896987, "grad_norm": 0.24319930530142153, "learning_rate": 9.11810433220276e-05, "loss": 0.8955, "num_input_tokens_seen": 9240048, "step": 790 }, { "epoch": 0.40947720834406387, "grad_norm": 0.24311562892750557, "learning_rate": 9.104265246026415e-05, "loss": 0.8986, "num_input_tokens_seen": 9298528, "step": 795 }, { "epoch": 0.41205253669842906, "grad_norm": 0.2891177185942039, "learning_rate": 9.090329093371666e-05, "loss": 0.8881, "num_input_tokens_seen": 9357016, "step": 800 }, { "epoch": 0.41205253669842906, "eval_loss": 0.8973079919815063, "eval_runtime": 16.1396, "eval_samples_per_second": 3.718, "eval_steps_per_second": 0.929, "num_input_tokens_seen": 9357016, "step": 800 }, { "epoch": 0.41462786505279425, "grad_norm": 0.4728970278357675, "learning_rate": 9.076296203830579e-05, "loss": 0.8798, "num_input_tokens_seen": 9415480, "step": 805 }, { "epoch": 0.4172031934071594, "grad_norm": 0.2420351489416807, "learning_rate": 9.062166909283062e-05, "loss": 0.9104, "num_input_tokens_seen": 9473928, "step": 810 }, { "epoch": 0.4197785217615246, "grad_norm": 0.2262623911682871, "learning_rate": 9.047941543889014e-05, "loss": 0.9007, "num_input_tokens_seen": 9532408, "step": 815 }, { "epoch": 0.4223538501158898, "grad_norm": 0.18258980329217392, "learning_rate": 9.033620444080428e-05, "loss": 0.8974, "num_input_tokens_seen": 9590920, "step": 820 }, { "epoch": 0.42492917847025496, "grad_norm": 0.2898762949979446, "learning_rate": 9.019203948553422e-05, "loss": 0.8992, "num_input_tokens_seen": 9649400, "step": 825 }, { "epoch": 0.42750450682462016, "grad_norm": 0.3884592601874919, "learning_rate": 9.004692398260244e-05, "loss": 0.8991, "num_input_tokens_seen": 9707888, "step": 830 }, { "epoch": 0.43007983517898535, "grad_norm": 0.24055719869667014, "learning_rate": 8.9900861364012e-05, "loss": 0.8964, "num_input_tokens_seen": 9766384, "step": 835 }, { "epoch": 0.4326551635333505, "grad_norm": 0.4482774361285702, "learning_rate": 8.975385508416532e-05, "loss": 0.8723, "num_input_tokens_seen": 9824896, "step": 840 }, { "epoch": 0.4352304918877157, "grad_norm": 0.4612030185875055, "learning_rate": 8.960590861978265e-05, "loss": 0.874, "num_input_tokens_seen": 9883408, "step": 845 }, { "epoch": 0.43780582024208087, "grad_norm": 0.44197834194509644, "learning_rate": 8.945702546981969e-05, "loss": 0.9035, "num_input_tokens_seen": 9941896, "step": 850 }, { "epoch": 0.43780582024208087, "eval_loss": 0.8779178261756897, "eval_runtime": 16.159, "eval_samples_per_second": 3.713, "eval_steps_per_second": 0.928, "num_input_tokens_seen": 9941896, "step": 850 }, { "epoch": 0.44038114859644606, "grad_norm": 0.8207188524660312, "learning_rate": 8.930720915538487e-05, "loss": 0.8516, "num_input_tokens_seen": 10000336, "step": 855 }, { "epoch": 0.44295647695081125, "grad_norm": 1.5881804699369033, "learning_rate": 8.915646321965614e-05, "loss": 0.9206, "num_input_tokens_seen": 10058816, "step": 860 }, { "epoch": 0.4455318053051764, "grad_norm": 0.3364043503653687, "learning_rate": 8.900479122779712e-05, "loss": 0.9028, "num_input_tokens_seen": 10117320, "step": 865 }, { "epoch": 0.4481071336595416, "grad_norm": 0.2888069815557639, "learning_rate": 8.885219676687277e-05, "loss": 0.8991, "num_input_tokens_seen": 10175824, "step": 870 }, { "epoch": 0.45068246201390677, "grad_norm": 0.26081919755231314, "learning_rate": 8.869868344576459e-05, "loss": 0.8934, "num_input_tokens_seen": 10234288, "step": 875 }, { "epoch": 0.45325779036827196, "grad_norm": 0.1672074260476841, "learning_rate": 8.854425489508532e-05, "loss": 0.8908, "num_input_tokens_seen": 10292736, "step": 880 }, { "epoch": 0.45583311872263715, "grad_norm": 0.3141498425127344, "learning_rate": 8.838891476709288e-05, "loss": 0.8988, "num_input_tokens_seen": 10351224, "step": 885 }, { "epoch": 0.4584084470770023, "grad_norm": 0.28442383194638554, "learning_rate": 8.823266673560426e-05, "loss": 0.8965, "num_input_tokens_seen": 10409736, "step": 890 }, { "epoch": 0.4609837754313675, "grad_norm": 0.24793143025843287, "learning_rate": 8.807551449590846e-05, "loss": 0.8989, "num_input_tokens_seen": 10468240, "step": 895 }, { "epoch": 0.4635591037857327, "grad_norm": 0.18173090045802157, "learning_rate": 8.791746176467907e-05, "loss": 0.8961, "num_input_tokens_seen": 10526712, "step": 900 }, { "epoch": 0.4635591037857327, "eval_loss": 0.891426146030426, "eval_runtime": 16.0357, "eval_samples_per_second": 3.742, "eval_steps_per_second": 0.935, "num_input_tokens_seen": 10526712, "step": 900 }, { "epoch": 0.46613443214009787, "grad_norm": 0.18755280770432675, "learning_rate": 8.775851227988656e-05, "loss": 0.8955, "num_input_tokens_seen": 10585232, "step": 905 }, { "epoch": 0.46870976049446306, "grad_norm": 0.16684040416821233, "learning_rate": 8.759866980070963e-05, "loss": 0.8951, "num_input_tokens_seen": 10643728, "step": 910 }, { "epoch": 0.47128508884882825, "grad_norm": 0.33346521793095785, "learning_rate": 8.743793810744654e-05, "loss": 0.8951, "num_input_tokens_seen": 10702240, "step": 915 }, { "epoch": 0.4738604172031934, "grad_norm": 0.23650054707790025, "learning_rate": 8.727632100142551e-05, "loss": 0.9066, "num_input_tokens_seen": 10760656, "step": 920 }, { "epoch": 0.4764357455575586, "grad_norm": 0.20217442955339224, "learning_rate": 8.711382230491493e-05, "loss": 0.8953, "num_input_tokens_seen": 10819128, "step": 925 }, { "epoch": 0.47901107391192377, "grad_norm": 0.1648307621403396, "learning_rate": 8.695044586103296e-05, "loss": 0.8961, "num_input_tokens_seen": 10877600, "step": 930 }, { "epoch": 0.48158640226628896, "grad_norm": 0.25983065938238986, "learning_rate": 8.678619553365659e-05, "loss": 0.8965, "num_input_tokens_seen": 10936088, "step": 935 }, { "epoch": 0.48416173062065415, "grad_norm": 0.17882463002474594, "learning_rate": 8.662107520733027e-05, "loss": 0.9018, "num_input_tokens_seen": 10994560, "step": 940 }, { "epoch": 0.4867370589750193, "grad_norm": 0.14644012846994445, "learning_rate": 8.64550887871741e-05, "loss": 0.8944, "num_input_tokens_seen": 11053016, "step": 945 }, { "epoch": 0.4893123873293845, "grad_norm": 0.23751630760966444, "learning_rate": 8.628824019879137e-05, "loss": 0.8852, "num_input_tokens_seen": 11111520, "step": 950 }, { "epoch": 0.4893123873293845, "eval_loss": 0.8915690183639526, "eval_runtime": 16.2589, "eval_samples_per_second": 3.69, "eval_steps_per_second": 0.923, "num_input_tokens_seen": 11111520, "step": 950 }, { "epoch": 0.49188771568374967, "grad_norm": 0.3904846319143667, "learning_rate": 8.612053338817581e-05, "loss": 0.9087, "num_input_tokens_seen": 11170016, "step": 955 }, { "epoch": 0.49446304403811486, "grad_norm": 0.44920450892911645, "learning_rate": 8.595197232161824e-05, "loss": 0.8915, "num_input_tokens_seen": 11228496, "step": 960 }, { "epoch": 0.49703837239248005, "grad_norm": 0.6093857047738649, "learning_rate": 8.578256098561275e-05, "loss": 0.8836, "num_input_tokens_seen": 11286928, "step": 965 }, { "epoch": 0.49961370074684525, "grad_norm": 0.6282945106836194, "learning_rate": 8.561230338676239e-05, "loss": 0.9116, "num_input_tokens_seen": 11345400, "step": 970 }, { "epoch": 0.5021890291012104, "grad_norm": 0.3187294296147391, "learning_rate": 8.544120355168451e-05, "loss": 0.8809, "num_input_tokens_seen": 11403912, "step": 975 }, { "epoch": 0.5047643574555756, "grad_norm": 0.4019889420836467, "learning_rate": 8.526926552691544e-05, "loss": 0.8895, "num_input_tokens_seen": 11462344, "step": 980 }, { "epoch": 0.5073396858099408, "grad_norm": 0.4762279449607594, "learning_rate": 8.509649337881483e-05, "loss": 0.8674, "num_input_tokens_seen": 11520808, "step": 985 }, { "epoch": 0.509915014164306, "grad_norm": 1.7062273050040726, "learning_rate": 8.492289119346943e-05, "loss": 0.8832, "num_input_tokens_seen": 11579248, "step": 990 }, { "epoch": 0.5124903425186711, "grad_norm": 0.7896696939552226, "learning_rate": 8.474846307659658e-05, "loss": 0.8581, "num_input_tokens_seen": 11637712, "step": 995 }, { "epoch": 0.5150656708730363, "grad_norm": 0.9287129351980297, "learning_rate": 8.457321315344694e-05, "loss": 0.8635, "num_input_tokens_seen": 11696200, "step": 1000 }, { "epoch": 0.5150656708730363, "eval_loss": 0.860200047492981, "eval_runtime": 16.1196, "eval_samples_per_second": 3.722, "eval_steps_per_second": 0.931, "num_input_tokens_seen": 11696200, "step": 1000 }, { "epoch": 0.5176409992274015, "grad_norm": 0.9492829276877938, "learning_rate": 8.439714556870704e-05, "loss": 0.8499, "num_input_tokens_seen": 11754720, "step": 1005 }, { "epoch": 0.5202163275817667, "grad_norm": 1.57473364910246, "learning_rate": 8.422026448640124e-05, "loss": 0.8556, "num_input_tokens_seen": 11813216, "step": 1010 }, { "epoch": 0.5227916559361319, "grad_norm": 0.6562994819534732, "learning_rate": 8.40425740897932e-05, "loss": 0.8533, "num_input_tokens_seen": 11871712, "step": 1015 }, { "epoch": 0.525366984290497, "grad_norm": 0.5420643724864006, "learning_rate": 8.386407858128706e-05, "loss": 0.8921, "num_input_tokens_seen": 11930200, "step": 1020 }, { "epoch": 0.5279423126448622, "grad_norm": 0.4900953324933905, "learning_rate": 8.368478218232787e-05, "loss": 0.8815, "num_input_tokens_seen": 11988704, "step": 1025 }, { "epoch": 0.5305176409992274, "grad_norm": 0.46534021808416004, "learning_rate": 8.350468913330192e-05, "loss": 0.854, "num_input_tokens_seen": 12047176, "step": 1030 }, { "epoch": 0.5330929693535926, "grad_norm": 0.6739669998528043, "learning_rate": 8.33238036934364e-05, "loss": 0.8642, "num_input_tokens_seen": 12105680, "step": 1035 }, { "epoch": 0.5356682977079578, "grad_norm": 1.100337259258234, "learning_rate": 8.31421301406986e-05, "loss": 0.8072, "num_input_tokens_seen": 12164208, "step": 1040 }, { "epoch": 0.5382436260623229, "grad_norm": 1.2731858488127639, "learning_rate": 8.29596727716949e-05, "loss": 0.8532, "num_input_tokens_seen": 12222672, "step": 1045 }, { "epoch": 0.5408189544166881, "grad_norm": 0.8686963016555517, "learning_rate": 8.277643590156894e-05, "loss": 0.8844, "num_input_tokens_seen": 12281072, "step": 1050 }, { "epoch": 0.5408189544166881, "eval_loss": 0.8446129560470581, "eval_runtime": 16.0508, "eval_samples_per_second": 3.738, "eval_steps_per_second": 0.935, "num_input_tokens_seen": 12281072, "step": 1050 }, { "epoch": 0.5433942827710533, "grad_norm": 0.5518554447099218, "learning_rate": 8.259242386389973e-05, "loss": 0.8602, "num_input_tokens_seen": 12339544, "step": 1055 }, { "epoch": 0.5459696111254185, "grad_norm": 0.7300911438509382, "learning_rate": 8.240764101059912e-05, "loss": 0.8615, "num_input_tokens_seen": 12397992, "step": 1060 }, { "epoch": 0.5485449394797837, "grad_norm": 0.7364983085887583, "learning_rate": 8.222209171180883e-05, "loss": 0.8732, "num_input_tokens_seen": 12456480, "step": 1065 }, { "epoch": 0.5511202678341488, "grad_norm": 0.4840408774949972, "learning_rate": 8.203578035579715e-05, "loss": 0.8691, "num_input_tokens_seen": 12515000, "step": 1070 }, { "epoch": 0.553695596188514, "grad_norm": 0.516278691776577, "learning_rate": 8.184871134885513e-05, "loss": 0.8544, "num_input_tokens_seen": 12573504, "step": 1075 }, { "epoch": 0.5562709245428792, "grad_norm": 0.8626943002609527, "learning_rate": 8.166088911519235e-05, "loss": 0.8501, "num_input_tokens_seen": 12632008, "step": 1080 }, { "epoch": 0.5588462528972444, "grad_norm": 0.7409465187036862, "learning_rate": 8.147231809683236e-05, "loss": 0.8646, "num_input_tokens_seen": 12690520, "step": 1085 }, { "epoch": 0.5614215812516096, "grad_norm": 0.5736639247313171, "learning_rate": 8.128300275350756e-05, "loss": 0.8327, "num_input_tokens_seen": 12749032, "step": 1090 }, { "epoch": 0.5639969096059748, "grad_norm": 0.7720514157947642, "learning_rate": 8.109294756255375e-05, "loss": 0.8218, "num_input_tokens_seen": 12807504, "step": 1095 }, { "epoch": 0.56657223796034, "grad_norm": 0.9129011996506371, "learning_rate": 8.090215701880419e-05, "loss": 0.8427, "num_input_tokens_seen": 12865992, "step": 1100 }, { "epoch": 0.56657223796034, "eval_loss": 0.7743102312088013, "eval_runtime": 16.1034, "eval_samples_per_second": 3.726, "eval_steps_per_second": 0.931, "num_input_tokens_seen": 12865992, "step": 1100 }, { "epoch": 0.5691475663147051, "grad_norm": 1.6435842633079423, "learning_rate": 8.07106356344834e-05, "loss": 0.8335, "num_input_tokens_seen": 12924448, "step": 1105 }, { "epoch": 0.5717228946690703, "grad_norm": 1.2281943545237959, "learning_rate": 8.051838793910038e-05, "loss": 0.8267, "num_input_tokens_seen": 12982912, "step": 1110 }, { "epoch": 0.5742982230234355, "grad_norm": 1.4138823100284208, "learning_rate": 8.032541847934146e-05, "loss": 0.8866, "num_input_tokens_seen": 13041424, "step": 1115 }, { "epoch": 0.5768735513778007, "grad_norm": 0.6515311059204204, "learning_rate": 8.013173181896283e-05, "loss": 0.8446, "num_input_tokens_seen": 13099888, "step": 1120 }, { "epoch": 0.5794488797321659, "grad_norm": 0.7537544303655812, "learning_rate": 7.993733253868256e-05, "loss": 0.8176, "num_input_tokens_seen": 13158344, "step": 1125 }, { "epoch": 0.582024208086531, "grad_norm": 1.3613777296967222, "learning_rate": 7.974222523607236e-05, "loss": 0.8138, "num_input_tokens_seen": 13216840, "step": 1130 }, { "epoch": 0.5845995364408962, "grad_norm": 0.6640843445520798, "learning_rate": 7.954641452544865e-05, "loss": 0.8204, "num_input_tokens_seen": 13275328, "step": 1135 }, { "epoch": 0.5871748647952614, "grad_norm": 0.6917895597906035, "learning_rate": 7.934990503776363e-05, "loss": 0.8485, "num_input_tokens_seen": 13333784, "step": 1140 }, { "epoch": 0.5897501931496266, "grad_norm": 0.45542718993625547, "learning_rate": 7.915270142049566e-05, "loss": 0.8191, "num_input_tokens_seen": 13392280, "step": 1145 }, { "epoch": 0.5923255215039918, "grad_norm": 0.618954778582039, "learning_rate": 7.89548083375394e-05, "loss": 0.8185, "num_input_tokens_seen": 13450720, "step": 1150 }, { "epoch": 0.5923255215039918, "eval_loss": 0.7827339768409729, "eval_runtime": 16.0127, "eval_samples_per_second": 3.747, "eval_steps_per_second": 0.937, "num_input_tokens_seen": 13450720, "step": 1150 }, { "epoch": 0.5949008498583569, "grad_norm": 1.5827740829243289, "learning_rate": 7.875623046909544e-05, "loss": 0.8168, "num_input_tokens_seen": 13509200, "step": 1155 }, { "epoch": 0.5974761782127221, "grad_norm": 2.344942216339615, "learning_rate": 7.855697251155967e-05, "loss": 0.7749, "num_input_tokens_seen": 13567656, "step": 1160 }, { "epoch": 0.6000515065670873, "grad_norm": 2.7313469239045305, "learning_rate": 7.835703917741212e-05, "loss": 0.9132, "num_input_tokens_seen": 13626136, "step": 1165 }, { "epoch": 0.6026268349214525, "grad_norm": 0.7410043911446527, "learning_rate": 7.81564351951057e-05, "loss": 0.8308, "num_input_tokens_seen": 13684608, "step": 1170 }, { "epoch": 0.6052021632758177, "grad_norm": 0.5628590604115411, "learning_rate": 7.795516530895414e-05, "loss": 0.8011, "num_input_tokens_seen": 13743080, "step": 1175 }, { "epoch": 0.6077774916301828, "grad_norm": 1.2008934424824649, "learning_rate": 7.775323427901993e-05, "loss": 0.8309, "num_input_tokens_seen": 13801552, "step": 1180 }, { "epoch": 0.610352819984548, "grad_norm": 1.2914156288367256, "learning_rate": 7.755064688100171e-05, "loss": 0.8089, "num_input_tokens_seen": 13860064, "step": 1185 }, { "epoch": 0.6129281483389132, "grad_norm": 1.420806774436513, "learning_rate": 7.734740790612136e-05, "loss": 0.8089, "num_input_tokens_seen": 13918552, "step": 1190 }, { "epoch": 0.6155034766932784, "grad_norm": 0.8352922832465102, "learning_rate": 7.714352216101055e-05, "loss": 0.8511, "num_input_tokens_seen": 13977056, "step": 1195 }, { "epoch": 0.6180788050476436, "grad_norm": 0.6321587989106885, "learning_rate": 7.693899446759727e-05, "loss": 0.8061, "num_input_tokens_seen": 14035544, "step": 1200 }, { "epoch": 0.6180788050476436, "eval_loss": 0.7593821287155151, "eval_runtime": 16.1368, "eval_samples_per_second": 3.718, "eval_steps_per_second": 0.93, "num_input_tokens_seen": 14035544, "step": 1200 }, { "epoch": 0.6206541334020087, "grad_norm": 1.0526811295206564, "learning_rate": 7.673382966299163e-05, "loss": 0.7871, "num_input_tokens_seen": 14094024, "step": 1205 }, { "epoch": 0.623229461756374, "grad_norm": 1.832697637344859, "learning_rate": 7.65280325993715e-05, "loss": 0.7594, "num_input_tokens_seen": 14152504, "step": 1210 }, { "epoch": 0.6258047901107391, "grad_norm": 1.6875031192331054, "learning_rate": 7.63216081438678e-05, "loss": 0.7833, "num_input_tokens_seen": 14210992, "step": 1215 }, { "epoch": 0.6283801184651043, "grad_norm": 1.867117238207419, "learning_rate": 7.611456117844934e-05, "loss": 0.8445, "num_input_tokens_seen": 14269488, "step": 1220 }, { "epoch": 0.6309554468194695, "grad_norm": 0.9089614634143406, "learning_rate": 7.59068965998074e-05, "loss": 0.7857, "num_input_tokens_seen": 14327968, "step": 1225 }, { "epoch": 0.6335307751738347, "grad_norm": 2.3911537408111214, "learning_rate": 7.569861931923989e-05, "loss": 0.8064, "num_input_tokens_seen": 14386448, "step": 1230 }, { "epoch": 0.6361061035281999, "grad_norm": 1.6500224851295993, "learning_rate": 7.548973426253521e-05, "loss": 0.7117, "num_input_tokens_seen": 14444912, "step": 1235 }, { "epoch": 0.638681431882565, "grad_norm": 1.508924461189316, "learning_rate": 7.528024636985575e-05, "loss": 0.7449, "num_input_tokens_seen": 14503392, "step": 1240 }, { "epoch": 0.6412567602369302, "grad_norm": 1.3801142620835953, "learning_rate": 7.507016059562107e-05, "loss": 0.7507, "num_input_tokens_seen": 14561872, "step": 1245 }, { "epoch": 0.6438320885912954, "grad_norm": 1.2994701535106117, "learning_rate": 7.485948190839077e-05, "loss": 0.7917, "num_input_tokens_seen": 14620336, "step": 1250 }, { "epoch": 0.6438320885912954, "eval_loss": 0.7407085299491882, "eval_runtime": 16.1168, "eval_samples_per_second": 3.723, "eval_steps_per_second": 0.931, "num_input_tokens_seen": 14620336, "step": 1250 }, { "epoch": 0.6464074169456606, "grad_norm": 0.9491399909407985, "learning_rate": 7.464821529074679e-05, "loss": 0.7763, "num_input_tokens_seen": 14678792, "step": 1255 }, { "epoch": 0.6489827453000258, "grad_norm": 1.1671149163333951, "learning_rate": 7.443636573917585e-05, "loss": 0.7979, "num_input_tokens_seen": 14737272, "step": 1260 }, { "epoch": 0.6515580736543909, "grad_norm": 1.4992002601057717, "learning_rate": 7.422393826395108e-05, "loss": 0.7883, "num_input_tokens_seen": 14795784, "step": 1265 }, { "epoch": 0.6541334020087561, "grad_norm": 1.2009664113851044, "learning_rate": 7.40109378890136e-05, "loss": 0.7183, "num_input_tokens_seen": 14854272, "step": 1270 }, { "epoch": 0.6567087303631213, "grad_norm": 1.5312778776593978, "learning_rate": 7.379736965185368e-05, "loss": 0.762, "num_input_tokens_seen": 14912720, "step": 1275 }, { "epoch": 0.6592840587174865, "grad_norm": 1.443384734396678, "learning_rate": 7.358323860339165e-05, "loss": 0.7951, "num_input_tokens_seen": 14971192, "step": 1280 }, { "epoch": 0.6618593870718517, "grad_norm": 1.3546652337943146, "learning_rate": 7.336854980785839e-05, "loss": 0.7528, "num_input_tokens_seen": 15029656, "step": 1285 }, { "epoch": 0.6644347154262168, "grad_norm": 1.4256460615881865, "learning_rate": 7.315330834267553e-05, "loss": 0.7633, "num_input_tokens_seen": 15088144, "step": 1290 }, { "epoch": 0.667010043780582, "grad_norm": 1.325772407306303, "learning_rate": 7.293751929833553e-05, "loss": 0.7443, "num_input_tokens_seen": 15146600, "step": 1295 }, { "epoch": 0.6695853721349472, "grad_norm": 2.727997344637842, "learning_rate": 7.272118777828108e-05, "loss": 0.7724, "num_input_tokens_seen": 15205064, "step": 1300 }, { "epoch": 0.6695853721349472, "eval_loss": 0.7189856171607971, "eval_runtime": 16.0307, "eval_samples_per_second": 3.743, "eval_steps_per_second": 0.936, "num_input_tokens_seen": 15205064, "step": 1300 }, { "epoch": 0.6721607004893124, "grad_norm": 2.6154468701895066, "learning_rate": 7.250431889878455e-05, "loss": 0.7524, "num_input_tokens_seen": 15263560, "step": 1305 }, { "epoch": 0.6747360288436776, "grad_norm": 1.9549500311782502, "learning_rate": 7.228691778882693e-05, "loss": 0.6748, "num_input_tokens_seen": 15322016, "step": 1310 }, { "epoch": 0.6773113571980427, "grad_norm": 2.991178206089954, "learning_rate": 7.20689895899765e-05, "loss": 0.7571, "num_input_tokens_seen": 15380504, "step": 1315 }, { "epoch": 0.6798866855524079, "grad_norm": 1.7022848080804835, "learning_rate": 7.185053945626733e-05, "loss": 0.6615, "num_input_tokens_seen": 15438944, "step": 1320 }, { "epoch": 0.6824620139067731, "grad_norm": 1.739259284519112, "learning_rate": 7.163157255407732e-05, "loss": 0.7421, "num_input_tokens_seen": 15497384, "step": 1325 }, { "epoch": 0.6850373422611383, "grad_norm": 1.9142982939434143, "learning_rate": 7.141209406200599e-05, "loss": 0.7886, "num_input_tokens_seen": 15555856, "step": 1330 }, { "epoch": 0.6876126706155035, "grad_norm": 1.7562659805497576, "learning_rate": 7.1192109170752e-05, "loss": 0.7484, "num_input_tokens_seen": 15614368, "step": 1335 }, { "epoch": 0.6901879989698687, "grad_norm": 1.7590122465257017, "learning_rate": 7.097162308299054e-05, "loss": 0.7086, "num_input_tokens_seen": 15672864, "step": 1340 }, { "epoch": 0.6927633273242338, "grad_norm": 2.1211445265818845, "learning_rate": 7.07506410132501e-05, "loss": 0.7494, "num_input_tokens_seen": 15731376, "step": 1345 }, { "epoch": 0.695338655678599, "grad_norm": 2.683073565523052, "learning_rate": 7.052916818778918e-05, "loss": 0.7278, "num_input_tokens_seen": 15789848, "step": 1350 }, { "epoch": 0.695338655678599, "eval_loss": 0.712917685508728, "eval_runtime": 16.0726, "eval_samples_per_second": 3.733, "eval_steps_per_second": 0.933, "num_input_tokens_seen": 15789848, "step": 1350 }, { "epoch": 0.6979139840329642, "grad_norm": 2.128495144345323, "learning_rate": 7.030720984447279e-05, "loss": 0.7005, "num_input_tokens_seen": 15848328, "step": 1355 }, { "epoch": 0.7004893123873294, "grad_norm": 1.9954206386005497, "learning_rate": 7.008477123264848e-05, "loss": 0.7406, "num_input_tokens_seen": 15906824, "step": 1360 }, { "epoch": 0.7030646407416946, "grad_norm": 2.2104679425901397, "learning_rate": 6.986185761302224e-05, "loss": 0.73, "num_input_tokens_seen": 15965312, "step": 1365 }, { "epoch": 0.7056399690960597, "grad_norm": 1.4881688553415275, "learning_rate": 6.963847425753403e-05, "loss": 0.7069, "num_input_tokens_seen": 16023824, "step": 1370 }, { "epoch": 0.7082152974504249, "grad_norm": 1.7307886623214839, "learning_rate": 6.941462644923318e-05, "loss": 0.6859, "num_input_tokens_seen": 16082280, "step": 1375 }, { "epoch": 0.7107906258047901, "grad_norm": 1.996363722225207, "learning_rate": 6.919031948215335e-05, "loss": 0.7254, "num_input_tokens_seen": 16140800, "step": 1380 }, { "epoch": 0.7133659541591553, "grad_norm": 1.9723274395570518, "learning_rate": 6.896555866118741e-05, "loss": 0.717, "num_input_tokens_seen": 16199320, "step": 1385 }, { "epoch": 0.7159412825135205, "grad_norm": 1.741253496639104, "learning_rate": 6.87403493019619e-05, "loss": 0.7094, "num_input_tokens_seen": 16257768, "step": 1390 }, { "epoch": 0.7185166108678857, "grad_norm": 1.6218002074106608, "learning_rate": 6.851469673071143e-05, "loss": 0.7862, "num_input_tokens_seen": 16316264, "step": 1395 }, { "epoch": 0.7210919392222508, "grad_norm": 1.7586707307941614, "learning_rate": 6.828860628415253e-05, "loss": 0.7359, "num_input_tokens_seen": 16374784, "step": 1400 }, { "epoch": 0.7210919392222508, "eval_loss": 0.6643603444099426, "eval_runtime": 16.1894, "eval_samples_per_second": 3.706, "eval_steps_per_second": 0.927, "num_input_tokens_seen": 16374784, "step": 1400 }, { "epoch": 0.723667267576616, "grad_norm": 2.665622720042704, "learning_rate": 6.806208330935766e-05, "loss": 0.706, "num_input_tokens_seen": 16433288, "step": 1405 }, { "epoch": 0.7262425959309812, "grad_norm": 2.123869663010538, "learning_rate": 6.783513316362855e-05, "loss": 0.6714, "num_input_tokens_seen": 16491784, "step": 1410 }, { "epoch": 0.7288179242853464, "grad_norm": 1.584213945279146, "learning_rate": 6.760776121436962e-05, "loss": 0.693, "num_input_tokens_seen": 16550272, "step": 1415 }, { "epoch": 0.7313932526397116, "grad_norm": 2.2481839233017764, "learning_rate": 6.737997283896103e-05, "loss": 0.7005, "num_input_tokens_seen": 16608704, "step": 1420 }, { "epoch": 0.7339685809940767, "grad_norm": 2.4818230151927643, "learning_rate": 6.715177342463145e-05, "loss": 0.6573, "num_input_tokens_seen": 16667200, "step": 1425 }, { "epoch": 0.7365439093484419, "grad_norm": 2.5398594354263486, "learning_rate": 6.692316836833065e-05, "loss": 0.6751, "num_input_tokens_seen": 16725704, "step": 1430 }, { "epoch": 0.7391192377028071, "grad_norm": 2.7486055345229343, "learning_rate": 6.6694163076602e-05, "loss": 0.6173, "num_input_tokens_seen": 16784192, "step": 1435 }, { "epoch": 0.7416945660571723, "grad_norm": 5.356237563459472, "learning_rate": 6.646476296545434e-05, "loss": 0.728, "num_input_tokens_seen": 16842704, "step": 1440 }, { "epoch": 0.7442698944115375, "grad_norm": 2.088505948846248, "learning_rate": 6.623497346023418e-05, "loss": 0.743, "num_input_tokens_seen": 16901176, "step": 1445 }, { "epoch": 0.7468452227659027, "grad_norm": 2.2198436340262, "learning_rate": 6.60047999954972e-05, "loss": 0.6291, "num_input_tokens_seen": 16959632, "step": 1450 }, { "epoch": 0.7468452227659027, "eval_loss": 0.753077507019043, "eval_runtime": 16.0383, "eval_samples_per_second": 3.741, "eval_steps_per_second": 0.935, "num_input_tokens_seen": 16959632, "step": 1450 }, { "epoch": 0.7494205511202678, "grad_norm": 1.9571252974715032, "learning_rate": 6.57742480148798e-05, "loss": 0.6533, "num_input_tokens_seen": 17018072, "step": 1455 }, { "epoch": 0.751995879474633, "grad_norm": 3.2075825448529542, "learning_rate": 6.554332297097031e-05, "loss": 0.7114, "num_input_tokens_seen": 17076560, "step": 1460 }, { "epoch": 0.7545712078289982, "grad_norm": 2.0030816579741266, "learning_rate": 6.53120303251801e-05, "loss": 0.6568, "num_input_tokens_seen": 17135016, "step": 1465 }, { "epoch": 0.7571465361833634, "grad_norm": 2.65056436638165, "learning_rate": 6.508037554761432e-05, "loss": 0.7016, "num_input_tokens_seen": 17193496, "step": 1470 }, { "epoch": 0.7597218645377286, "grad_norm": 1.9541651871708403, "learning_rate": 6.484836411694267e-05, "loss": 0.6612, "num_input_tokens_seen": 17251944, "step": 1475 }, { "epoch": 0.7622971928920937, "grad_norm": 3.0540242692558577, "learning_rate": 6.461600152026965e-05, "loss": 0.6115, "num_input_tokens_seen": 17310456, "step": 1480 }, { "epoch": 0.7648725212464589, "grad_norm": 2.796196437541352, "learning_rate": 6.438329325300499e-05, "loss": 0.6458, "num_input_tokens_seen": 17368968, "step": 1485 }, { "epoch": 0.7674478496008241, "grad_norm": 3.1979427976381207, "learning_rate": 6.415024481873352e-05, "loss": 0.6434, "num_input_tokens_seen": 17427424, "step": 1490 }, { "epoch": 0.7700231779551893, "grad_norm": 3.8375601078700203, "learning_rate": 6.391686172908506e-05, "loss": 0.5973, "num_input_tokens_seen": 17485936, "step": 1495 }, { "epoch": 0.7725985063095545, "grad_norm": 2.405705749864128, "learning_rate": 6.368314950360415e-05, "loss": 0.6021, "num_input_tokens_seen": 17544440, "step": 1500 }, { "epoch": 0.7725985063095545, "eval_loss": 0.632923424243927, "eval_runtime": 16.1038, "eval_samples_per_second": 3.726, "eval_steps_per_second": 0.931, "num_input_tokens_seen": 17544440, "step": 1500 }, { "epoch": 0.7751738346639196, "grad_norm": 2.8519087211521734, "learning_rate": 6.344911366961934e-05, "loss": 0.5779, "num_input_tokens_seen": 17602952, "step": 1505 }, { "epoch": 0.7777491630182848, "grad_norm": 2.861290579940173, "learning_rate": 6.321475976211266e-05, "loss": 0.6707, "num_input_tokens_seen": 17661440, "step": 1510 }, { "epoch": 0.78032449137265, "grad_norm": 3.541365161144121, "learning_rate": 6.298009332358856e-05, "loss": 0.6326, "num_input_tokens_seen": 17719928, "step": 1515 }, { "epoch": 0.7828998197270152, "grad_norm": 2.969962641272996, "learning_rate": 6.274511990394294e-05, "loss": 0.6472, "num_input_tokens_seen": 17778424, "step": 1520 }, { "epoch": 0.7854751480813804, "grad_norm": 2.762063548864621, "learning_rate": 6.250984506033183e-05, "loss": 0.6215, "num_input_tokens_seen": 17836936, "step": 1525 }, { "epoch": 0.7880504764357456, "grad_norm": 3.2198855545004097, "learning_rate": 6.227427435703997e-05, "loss": 0.6102, "num_input_tokens_seen": 17895392, "step": 1530 }, { "epoch": 0.7906258047901107, "grad_norm": 3.846544371420393, "learning_rate": 6.203841336534924e-05, "loss": 0.6161, "num_input_tokens_seen": 17953872, "step": 1535 }, { "epoch": 0.7932011331444759, "grad_norm": 3.811248686105134, "learning_rate": 6.180226766340688e-05, "loss": 0.6103, "num_input_tokens_seen": 18012320, "step": 1540 }, { "epoch": 0.7957764614988411, "grad_norm": 2.9539705466919703, "learning_rate": 6.156584283609359e-05, "loss": 0.5791, "num_input_tokens_seen": 18070792, "step": 1545 }, { "epoch": 0.7983517898532063, "grad_norm": 3.0546686267383283, "learning_rate": 6.132914447489137e-05, "loss": 0.667, "num_input_tokens_seen": 18129304, "step": 1550 }, { "epoch": 0.7983517898532063, "eval_loss": 0.6617516279220581, "eval_runtime": 16.0333, "eval_samples_per_second": 3.742, "eval_steps_per_second": 0.936, "num_input_tokens_seen": 18129304, "step": 1550 }, { "epoch": 0.8009271182075715, "grad_norm": 2.9735507158511987, "learning_rate": 6.109217817775139e-05, "loss": 0.5681, "num_input_tokens_seen": 18187728, "step": 1555 }, { "epoch": 0.8035024465619367, "grad_norm": 3.6620315644598778, "learning_rate": 6.085494954896156e-05, "loss": 0.6292, "num_input_tokens_seen": 18246192, "step": 1560 }, { "epoch": 0.8060777749163018, "grad_norm": 4.03631122919402, "learning_rate": 6.061746419901388e-05, "loss": 0.6512, "num_input_tokens_seen": 18304632, "step": 1565 }, { "epoch": 0.808653103270667, "grad_norm": 4.0040288177360805, "learning_rate": 6.0379727744471936e-05, "loss": 0.5476, "num_input_tokens_seen": 18363136, "step": 1570 }, { "epoch": 0.8112284316250322, "grad_norm": 3.9448861517599996, "learning_rate": 6.014174580783794e-05, "loss": 0.5632, "num_input_tokens_seen": 18421592, "step": 1575 }, { "epoch": 0.8138037599793974, "grad_norm": 3.8400680048739435, "learning_rate": 5.990352401741981e-05, "loss": 0.6225, "num_input_tokens_seen": 18480104, "step": 1580 }, { "epoch": 0.8163790883337626, "grad_norm": 2.7981339113543284, "learning_rate": 5.9665068007197976e-05, "loss": 0.5801, "num_input_tokens_seen": 18538600, "step": 1585 }, { "epoch": 0.8189544166881277, "grad_norm": 4.290843515697908, "learning_rate": 5.94263834166923e-05, "loss": 0.6364, "num_input_tokens_seen": 18597104, "step": 1590 }, { "epoch": 0.8215297450424929, "grad_norm": 3.9001572117535566, "learning_rate": 5.918747589082853e-05, "loss": 0.6088, "num_input_tokens_seen": 18655584, "step": 1595 }, { "epoch": 0.8241050733968581, "grad_norm": 3.5623412341260363, "learning_rate": 5.8948351079804875e-05, "loss": 0.6564, "num_input_tokens_seen": 18714072, "step": 1600 }, { "epoch": 0.8241050733968581, "eval_loss": 0.6319106221199036, "eval_runtime": 16.0199, "eval_samples_per_second": 3.745, "eval_steps_per_second": 0.936, "num_input_tokens_seen": 18714072, "step": 1600 }, { "epoch": 0.8266804017512233, "grad_norm": 3.4115030121534953, "learning_rate": 5.8709014638958404e-05, "loss": 0.6095, "num_input_tokens_seen": 18772552, "step": 1605 }, { "epoch": 0.8292557301055885, "grad_norm": 2.8584050529867895, "learning_rate": 5.846947222863123e-05, "loss": 0.5896, "num_input_tokens_seen": 18830992, "step": 1610 }, { "epoch": 0.8318310584599536, "grad_norm": 3.083134826868609, "learning_rate": 5.8229729514036705e-05, "loss": 0.545, "num_input_tokens_seen": 18889480, "step": 1615 }, { "epoch": 0.8344063868143188, "grad_norm": 3.5650772646006703, "learning_rate": 5.7989792165125356e-05, "loss": 0.6021, "num_input_tokens_seen": 18947936, "step": 1620 }, { "epoch": 0.836981715168684, "grad_norm": 3.1787537764025737, "learning_rate": 5.774966585645092e-05, "loss": 0.5741, "num_input_tokens_seen": 19006432, "step": 1625 }, { "epoch": 0.8395570435230492, "grad_norm": 4.505205596087594, "learning_rate": 5.7509356267035975e-05, "loss": 0.5796, "num_input_tokens_seen": 19064920, "step": 1630 }, { "epoch": 0.8421323718774144, "grad_norm": 3.854433226263906, "learning_rate": 5.726886908023776e-05, "loss": 0.5088, "num_input_tokens_seen": 19123376, "step": 1635 }, { "epoch": 0.8447077002317795, "grad_norm": 3.5910960304247643, "learning_rate": 5.702820998361373e-05, "loss": 0.5431, "num_input_tokens_seen": 19181864, "step": 1640 }, { "epoch": 0.8472830285861447, "grad_norm": 4.55639282269759, "learning_rate": 5.6787384668786994e-05, "loss": 0.5849, "num_input_tokens_seen": 19240352, "step": 1645 }, { "epoch": 0.8498583569405099, "grad_norm": 4.031478721616991, "learning_rate": 5.654639883131178e-05, "loss": 0.5668, "num_input_tokens_seen": 19298848, "step": 1650 }, { "epoch": 0.8498583569405099, "eval_loss": 0.6634677648544312, "eval_runtime": 16.0267, "eval_samples_per_second": 3.744, "eval_steps_per_second": 0.936, "num_input_tokens_seen": 19298848, "step": 1650 }, { "epoch": 0.8524336852948751, "grad_norm": 3.2430676664218496, "learning_rate": 5.6305258170538676e-05, "loss": 0.584, "num_input_tokens_seen": 19357304, "step": 1655 }, { "epoch": 0.8550090136492403, "grad_norm": 3.140559424454581, "learning_rate": 5.606396838947988e-05, "loss": 0.5544, "num_input_tokens_seen": 19415800, "step": 1660 }, { "epoch": 0.8575843420036054, "grad_norm": 3.993528386539066, "learning_rate": 5.582253519467432e-05, "loss": 0.6269, "num_input_tokens_seen": 19474256, "step": 1665 }, { "epoch": 0.8601596703579707, "grad_norm": 2.202747116085024, "learning_rate": 5.558096429605263e-05, "loss": 0.5073, "num_input_tokens_seen": 19532736, "step": 1670 }, { "epoch": 0.8627349987123358, "grad_norm": 4.4094334133851625, "learning_rate": 5.533926140680221e-05, "loss": 0.5319, "num_input_tokens_seen": 19591184, "step": 1675 }, { "epoch": 0.865310327066701, "grad_norm": 4.01821546567579, "learning_rate": 5.509743224323203e-05, "loss": 0.4525, "num_input_tokens_seen": 19649656, "step": 1680 }, { "epoch": 0.8678856554210662, "grad_norm": 5.3033277992950385, "learning_rate": 5.485548252463749e-05, "loss": 0.5276, "num_input_tokens_seen": 19708144, "step": 1685 }, { "epoch": 0.8704609837754314, "grad_norm": 5.124737819396939, "learning_rate": 5.4613417973165106e-05, "loss": 0.5482, "num_input_tokens_seen": 19766592, "step": 1690 }, { "epoch": 0.8730363121297966, "grad_norm": 3.47304956996904, "learning_rate": 5.4371244313677225e-05, "loss": 0.4656, "num_input_tokens_seen": 19825064, "step": 1695 }, { "epoch": 0.8756116404841617, "grad_norm": 6.394279811127835, "learning_rate": 5.4128967273616625e-05, "loss": 0.5701, "num_input_tokens_seen": 19883504, "step": 1700 }, { "epoch": 0.8756116404841617, "eval_loss": 0.7144017815589905, "eval_runtime": 16.1358, "eval_samples_per_second": 3.718, "eval_steps_per_second": 0.93, "num_input_tokens_seen": 19883504, "step": 1700 }, { "epoch": 0.8781869688385269, "grad_norm": 4.527262723362309, "learning_rate": 5.388659258287102e-05, "loss": 0.5823, "num_input_tokens_seen": 19942000, "step": 1705 }, { "epoch": 0.8807622971928921, "grad_norm": 4.628112845411063, "learning_rate": 5.364412597363759e-05, "loss": 0.5446, "num_input_tokens_seen": 20000440, "step": 1710 }, { "epoch": 0.8833376255472573, "grad_norm": 6.077375809046342, "learning_rate": 5.3401573180287426e-05, "loss": 0.5769, "num_input_tokens_seen": 20058920, "step": 1715 }, { "epoch": 0.8859129539016225, "grad_norm": 6.492863688878202, "learning_rate": 5.315893993922986e-05, "loss": 0.5614, "num_input_tokens_seen": 20117416, "step": 1720 }, { "epoch": 0.8884882822559876, "grad_norm": 5.332057542240503, "learning_rate": 5.29162319887768e-05, "loss": 0.5215, "num_input_tokens_seen": 20175936, "step": 1725 }, { "epoch": 0.8910636106103528, "grad_norm": 3.8772752615113077, "learning_rate": 5.26734550690071e-05, "loss": 0.4968, "num_input_tokens_seen": 20234368, "step": 1730 }, { "epoch": 0.893638938964718, "grad_norm": 4.886426418731965, "learning_rate": 5.243061492163073e-05, "loss": 0.5029, "num_input_tokens_seen": 20292856, "step": 1735 }, { "epoch": 0.8962142673190832, "grad_norm": 4.031774194047053, "learning_rate": 5.2187717289852955e-05, "loss": 0.5249, "num_input_tokens_seen": 20351272, "step": 1740 }, { "epoch": 0.8987895956734484, "grad_norm": 5.344580011428224, "learning_rate": 5.1944767918238624e-05, "loss": 0.5801, "num_input_tokens_seen": 20409744, "step": 1745 }, { "epoch": 0.9013649240278135, "grad_norm": 3.923379435953565, "learning_rate": 5.170177255257618e-05, "loss": 0.546, "num_input_tokens_seen": 20468200, "step": 1750 }, { "epoch": 0.9013649240278135, "eval_loss": 0.672294020652771, "eval_runtime": 16.0203, "eval_samples_per_second": 3.745, "eval_steps_per_second": 0.936, "num_input_tokens_seen": 20468200, "step": 1750 }, { "epoch": 0.9039402523821787, "grad_norm": 4.616122198129487, "learning_rate": 5.145873693974188e-05, "loss": 0.5248, "num_input_tokens_seen": 20526696, "step": 1755 }, { "epoch": 0.9065155807365439, "grad_norm": 5.322590172525407, "learning_rate": 5.12156668275638e-05, "loss": 0.4756, "num_input_tokens_seen": 20585160, "step": 1760 }, { "epoch": 0.9090909090909091, "grad_norm": 4.002252878507737, "learning_rate": 5.097256796468598e-05, "loss": 0.4405, "num_input_tokens_seen": 20643672, "step": 1765 }, { "epoch": 0.9116662374452743, "grad_norm": 5.58017966349683, "learning_rate": 5.072944610043232e-05, "loss": 0.5201, "num_input_tokens_seen": 20702152, "step": 1770 }, { "epoch": 0.9142415657996394, "grad_norm": 4.688576373892097, "learning_rate": 5.048630698467081e-05, "loss": 0.4662, "num_input_tokens_seen": 20760664, "step": 1775 }, { "epoch": 0.9168168941540046, "grad_norm": 4.984086874604376, "learning_rate": 5.024315636767738e-05, "loss": 0.5376, "num_input_tokens_seen": 20819144, "step": 1780 }, { "epoch": 0.9193922225083698, "grad_norm": 4.470690620190923, "learning_rate": 5e-05, "loss": 0.5174, "num_input_tokens_seen": 20877624, "step": 1785 }, { "epoch": 0.921967550862735, "grad_norm": 4.1127649145734795, "learning_rate": 4.9756843632322626e-05, "loss": 0.4273, "num_input_tokens_seen": 20936112, "step": 1790 }, { "epoch": 0.9245428792171002, "grad_norm": 5.1892527739805185, "learning_rate": 4.9513693015329197e-05, "loss": 0.4646, "num_input_tokens_seen": 20994608, "step": 1795 }, { "epoch": 0.9271182075714653, "grad_norm": 6.8574703914708985, "learning_rate": 4.9270553899567686e-05, "loss": 0.412, "num_input_tokens_seen": 21053080, "step": 1800 }, { "epoch": 0.9271182075714653, "eval_loss": 0.6768696904182434, "eval_runtime": 15.9758, "eval_samples_per_second": 3.756, "eval_steps_per_second": 0.939, "num_input_tokens_seen": 21053080, "step": 1800 }, { "epoch": 0.9296935359258306, "grad_norm": 6.328873193178562, "learning_rate": 4.902743203531405e-05, "loss": 0.4845, "num_input_tokens_seen": 21111592, "step": 1805 }, { "epoch": 0.9322688642801957, "grad_norm": 4.7019594666508215, "learning_rate": 4.8784333172436206e-05, "loss": 0.441, "num_input_tokens_seen": 21170024, "step": 1810 }, { "epoch": 0.9348441926345609, "grad_norm": 4.545287749618146, "learning_rate": 4.854126306025812e-05, "loss": 0.545, "num_input_tokens_seen": 21228480, "step": 1815 }, { "epoch": 0.9374195209889261, "grad_norm": 7.047942469299444, "learning_rate": 4.829822744742383e-05, "loss": 0.4697, "num_input_tokens_seen": 21286944, "step": 1820 }, { "epoch": 0.9399948493432912, "grad_norm": 3.917758669787159, "learning_rate": 4.8055232081761395e-05, "loss": 0.423, "num_input_tokens_seen": 21345456, "step": 1825 }, { "epoch": 0.9425701776976565, "grad_norm": 3.442911876713947, "learning_rate": 4.781228271014704e-05, "loss": 0.4715, "num_input_tokens_seen": 21403896, "step": 1830 }, { "epoch": 0.9451455060520216, "grad_norm": 4.755237925353789, "learning_rate": 4.756938507836929e-05, "loss": 0.5149, "num_input_tokens_seen": 21462360, "step": 1835 }, { "epoch": 0.9477208344063868, "grad_norm": 5.3552741805060275, "learning_rate": 4.732654493099291e-05, "loss": 0.5403, "num_input_tokens_seen": 21520864, "step": 1840 }, { "epoch": 0.950296162760752, "grad_norm": 3.417134377266731, "learning_rate": 4.708376801122321e-05, "loss": 0.4757, "num_input_tokens_seen": 21579376, "step": 1845 }, { "epoch": 0.9528714911151172, "grad_norm": 4.6802756294331855, "learning_rate": 4.6841060060770154e-05, "loss": 0.4347, "num_input_tokens_seen": 21637848, "step": 1850 }, { "epoch": 0.9528714911151172, "eval_loss": 0.6808218359947205, "eval_runtime": 16.1166, "eval_samples_per_second": 3.723, "eval_steps_per_second": 0.931, "num_input_tokens_seen": 21637848, "step": 1850 }, { "epoch": 0.9554468194694824, "grad_norm": 5.573192417675986, "learning_rate": 4.659842681971258e-05, "loss": 0.5132, "num_input_tokens_seen": 21696328, "step": 1855 }, { "epoch": 0.9580221478238475, "grad_norm": 7.109977536510439, "learning_rate": 4.635587402636241e-05, "loss": 0.4347, "num_input_tokens_seen": 21754816, "step": 1860 }, { "epoch": 0.9605974761782127, "grad_norm": 7.143552890986281, "learning_rate": 4.611340741712901e-05, "loss": 0.4015, "num_input_tokens_seen": 21813296, "step": 1865 }, { "epoch": 0.9631728045325779, "grad_norm": 6.289734219426663, "learning_rate": 4.5871032726383386e-05, "loss": 0.5023, "num_input_tokens_seen": 21871800, "step": 1870 }, { "epoch": 0.9657481328869431, "grad_norm": 5.981747103855226, "learning_rate": 4.562875568632278e-05, "loss": 0.5334, "num_input_tokens_seen": 21930272, "step": 1875 }, { "epoch": 0.9683234612413083, "grad_norm": 5.6559760588122545, "learning_rate": 4.5386582026834906e-05, "loss": 0.4386, "num_input_tokens_seen": 21988736, "step": 1880 }, { "epoch": 0.9708987895956734, "grad_norm": 5.861060155419055, "learning_rate": 4.5144517475362514e-05, "loss": 0.3807, "num_input_tokens_seen": 22047200, "step": 1885 }, { "epoch": 0.9734741179500386, "grad_norm": 7.801226281593827, "learning_rate": 4.490256775676797e-05, "loss": 0.4177, "num_input_tokens_seen": 22105664, "step": 1890 }, { "epoch": 0.9760494463044038, "grad_norm": 6.1755894964345135, "learning_rate": 4.466073859319781e-05, "loss": 0.5239, "num_input_tokens_seen": 22164184, "step": 1895 }, { "epoch": 0.978624774658769, "grad_norm": 5.397307732194541, "learning_rate": 4.441903570394739e-05, "loss": 0.3737, "num_input_tokens_seen": 22222632, "step": 1900 }, { "epoch": 0.978624774658769, "eval_loss": 0.773033082485199, "eval_runtime": 15.9975, "eval_samples_per_second": 3.751, "eval_steps_per_second": 0.938, "num_input_tokens_seen": 22222632, "step": 1900 }, { "epoch": 0.9812001030131342, "grad_norm": 6.997624273550619, "learning_rate": 4.41774648053257e-05, "loss": 0.4437, "num_input_tokens_seen": 22281080, "step": 1905 }, { "epoch": 0.9837754313674993, "grad_norm": 5.030616381143982, "learning_rate": 4.3936031610520124e-05, "loss": 0.465, "num_input_tokens_seen": 22339552, "step": 1910 }, { "epoch": 0.9863507597218646, "grad_norm": 5.025845260709186, "learning_rate": 4.3694741829461336e-05, "loss": 0.4975, "num_input_tokens_seen": 22398056, "step": 1915 }, { "epoch": 0.9889260880762297, "grad_norm": 6.43843242330618, "learning_rate": 4.345360116868823e-05, "loss": 0.4504, "num_input_tokens_seen": 22456520, "step": 1920 }, { "epoch": 0.9915014164305949, "grad_norm": 5.281203851622467, "learning_rate": 4.321261533121303e-05, "loss": 0.4528, "num_input_tokens_seen": 22515024, "step": 1925 }, { "epoch": 0.9940767447849601, "grad_norm": 6.158304256456398, "learning_rate": 4.2971790016386286e-05, "loss": 0.441, "num_input_tokens_seen": 22573480, "step": 1930 }, { "epoch": 0.9966520731393252, "grad_norm": 3.898263595049965, "learning_rate": 4.273113091976225e-05, "loss": 0.4678, "num_input_tokens_seen": 22631960, "step": 1935 }, { "epoch": 0.9992274014936905, "grad_norm": 6.266433889699235, "learning_rate": 4.249064373296403e-05, "loss": 0.4352, "num_input_tokens_seen": 22690432, "step": 1940 }, { "epoch": 1.001545197012619, "grad_norm": 2.4601530377865695, "learning_rate": 4.225033414354908e-05, "loss": 0.3792, "num_input_tokens_seen": 22743048, "step": 1945 }, { "epoch": 1.0041205253669843, "grad_norm": 4.761740260797231, "learning_rate": 4.201020783487464e-05, "loss": 0.3783, "num_input_tokens_seen": 22801512, "step": 1950 }, { "epoch": 1.0041205253669843, "eval_loss": 0.6983156204223633, "eval_runtime": 16.3172, "eval_samples_per_second": 3.677, "eval_steps_per_second": 0.919, "num_input_tokens_seen": 22801512, "step": 1950 }, { "epoch": 1.0066958537213495, "grad_norm": 6.506183969602581, "learning_rate": 4.17702704859633e-05, "loss": 0.3784, "num_input_tokens_seen": 22859952, "step": 1955 }, { "epoch": 1.0092711820757148, "grad_norm": 7.31299798110374, "learning_rate": 4.153052777136879e-05, "loss": 0.5587, "num_input_tokens_seen": 22918440, "step": 1960 }, { "epoch": 1.0118465104300798, "grad_norm": 4.338872323547646, "learning_rate": 4.1290985361041614e-05, "loss": 0.3803, "num_input_tokens_seen": 22976944, "step": 1965 }, { "epoch": 1.014421838784445, "grad_norm": 6.798827966152428, "learning_rate": 4.105164892019514e-05, "loss": 0.4038, "num_input_tokens_seen": 23035408, "step": 1970 }, { "epoch": 1.0169971671388103, "grad_norm": 5.018683403937771, "learning_rate": 4.0812524109171476e-05, "loss": 0.3226, "num_input_tokens_seen": 23093912, "step": 1975 }, { "epoch": 1.0195724954931753, "grad_norm": 4.594775856201265, "learning_rate": 4.0573616583307705e-05, "loss": 0.4026, "num_input_tokens_seen": 23152344, "step": 1980 }, { "epoch": 1.0221478238475405, "grad_norm": 7.5346230342964695, "learning_rate": 4.033493199280202e-05, "loss": 0.4225, "num_input_tokens_seen": 23210800, "step": 1985 }, { "epoch": 1.0247231522019058, "grad_norm": 8.213657673441388, "learning_rate": 4.009647598258022e-05, "loss": 0.3058, "num_input_tokens_seen": 23269304, "step": 1990 }, { "epoch": 1.0272984805562708, "grad_norm": 6.881744374075897, "learning_rate": 3.985825419216207e-05, "loss": 0.3821, "num_input_tokens_seen": 23327800, "step": 1995 }, { "epoch": 1.029873808910636, "grad_norm": 3.916989546123924, "learning_rate": 3.962027225552807e-05, "loss": 0.3328, "num_input_tokens_seen": 23386232, "step": 2000 }, { "epoch": 1.029873808910636, "eval_loss": 0.7484827041625977, "eval_runtime": 16.091, "eval_samples_per_second": 3.729, "eval_steps_per_second": 0.932, "num_input_tokens_seen": 23386232, "step": 2000 }, { "epoch": 1.0324491372650013, "grad_norm": 5.8532055715340245, "learning_rate": 3.938253580098613e-05, "loss": 0.362, "num_input_tokens_seen": 23444712, "step": 2005 }, { "epoch": 1.0350244656193666, "grad_norm": 7.087739461357715, "learning_rate": 3.914505045103845e-05, "loss": 0.3903, "num_input_tokens_seen": 23503192, "step": 2010 }, { "epoch": 1.0375997939737316, "grad_norm": 6.061997147134047, "learning_rate": 3.8907821822248605e-05, "loss": 0.3341, "num_input_tokens_seen": 23561688, "step": 2015 }, { "epoch": 1.0401751223280968, "grad_norm": 6.783069419644998, "learning_rate": 3.867085552510864e-05, "loss": 0.4794, "num_input_tokens_seen": 23620160, "step": 2020 }, { "epoch": 1.042750450682462, "grad_norm": 4.11088291372727, "learning_rate": 3.843415716390644e-05, "loss": 0.4104, "num_input_tokens_seen": 23678624, "step": 2025 }, { "epoch": 1.045325779036827, "grad_norm": 5.727855298190317, "learning_rate": 3.819773233659314e-05, "loss": 0.3639, "num_input_tokens_seen": 23737064, "step": 2030 }, { "epoch": 1.0479011073911924, "grad_norm": 6.936114108935384, "learning_rate": 3.7961586634650767e-05, "loss": 0.4294, "num_input_tokens_seen": 23795568, "step": 2035 }, { "epoch": 1.0504764357455576, "grad_norm": 5.577801320854008, "learning_rate": 3.772572564296005e-05, "loss": 0.4713, "num_input_tokens_seen": 23854040, "step": 2040 }, { "epoch": 1.0530517640999228, "grad_norm": 7.466883391944433, "learning_rate": 3.749015493966817e-05, "loss": 0.3864, "num_input_tokens_seen": 23912520, "step": 2045 }, { "epoch": 1.0556270924542879, "grad_norm": 4.120909561971508, "learning_rate": 3.7254880096057073e-05, "loss": 0.3602, "num_input_tokens_seen": 23971048, "step": 2050 }, { "epoch": 1.0556270924542879, "eval_loss": 0.7190810441970825, "eval_runtime": 16.0858, "eval_samples_per_second": 3.73, "eval_steps_per_second": 0.932, "num_input_tokens_seen": 23971048, "step": 2050 }, { "epoch": 1.0582024208086531, "grad_norm": 3.701758619566102, "learning_rate": 3.7019906676411446e-05, "loss": 0.3203, "num_input_tokens_seen": 24029544, "step": 2055 }, { "epoch": 1.0607777491630184, "grad_norm": 7.855789285552562, "learning_rate": 3.678524023788735e-05, "loss": 0.3906, "num_input_tokens_seen": 24088008, "step": 2060 }, { "epoch": 1.0633530775173834, "grad_norm": 6.682460948737117, "learning_rate": 3.6550886330380665e-05, "loss": 0.3604, "num_input_tokens_seen": 24146480, "step": 2065 }, { "epoch": 1.0659284058717486, "grad_norm": 3.587156705730744, "learning_rate": 3.631685049639586e-05, "loss": 0.3271, "num_input_tokens_seen": 24204984, "step": 2070 }, { "epoch": 1.0685037342261139, "grad_norm": 4.621273077841867, "learning_rate": 3.608313827091493e-05, "loss": 0.2996, "num_input_tokens_seen": 24263456, "step": 2075 }, { "epoch": 1.071079062580479, "grad_norm": 6.565390196167412, "learning_rate": 3.5849755181266474e-05, "loss": 0.3767, "num_input_tokens_seen": 24321960, "step": 2080 }, { "epoch": 1.0736543909348442, "grad_norm": 6.589833421708817, "learning_rate": 3.5616706746995026e-05, "loss": 0.4208, "num_input_tokens_seen": 24380464, "step": 2085 }, { "epoch": 1.0762297192892094, "grad_norm": 4.95070197991303, "learning_rate": 3.538399847973036e-05, "loss": 0.3479, "num_input_tokens_seen": 24438976, "step": 2090 }, { "epoch": 1.0788050476435747, "grad_norm": 5.124820683013397, "learning_rate": 3.515163588305735e-05, "loss": 0.3654, "num_input_tokens_seen": 24497448, "step": 2095 }, { "epoch": 1.0813803759979397, "grad_norm": 6.444785878585679, "learning_rate": 3.491962445238569e-05, "loss": 0.3351, "num_input_tokens_seen": 24555904, "step": 2100 }, { "epoch": 1.0813803759979397, "eval_loss": 0.8075026869773865, "eval_runtime": 16.096, "eval_samples_per_second": 3.728, "eval_steps_per_second": 0.932, "num_input_tokens_seen": 24555904, "step": 2100 }, { "epoch": 1.083955704352305, "grad_norm": 5.259882631403194, "learning_rate": 3.4687969674819906e-05, "loss": 0.3827, "num_input_tokens_seen": 24614392, "step": 2105 }, { "epoch": 1.0865310327066702, "grad_norm": 4.276410371848581, "learning_rate": 3.445667702902969e-05, "loss": 0.3676, "num_input_tokens_seen": 24672848, "step": 2110 }, { "epoch": 1.0891063610610352, "grad_norm": 10.209040215860048, "learning_rate": 3.4225751985120215e-05, "loss": 0.3253, "num_input_tokens_seen": 24731344, "step": 2115 }, { "epoch": 1.0916816894154004, "grad_norm": 6.169752493978822, "learning_rate": 3.3995200004502816e-05, "loss": 0.4297, "num_input_tokens_seen": 24789832, "step": 2120 }, { "epoch": 1.0942570177697657, "grad_norm": 4.238650399680663, "learning_rate": 3.3765026539765834e-05, "loss": 0.3536, "num_input_tokens_seen": 24848264, "step": 2125 }, { "epoch": 1.0968323461241307, "grad_norm": 5.445173229006411, "learning_rate": 3.3535237034545675e-05, "loss": 0.3588, "num_input_tokens_seen": 24906744, "step": 2130 }, { "epoch": 1.099407674478496, "grad_norm": 4.508587102151408, "learning_rate": 3.330583692339802e-05, "loss": 0.3666, "num_input_tokens_seen": 24965256, "step": 2135 }, { "epoch": 1.1019830028328612, "grad_norm": 5.836654544282574, "learning_rate": 3.307683163166934e-05, "loss": 0.3334, "num_input_tokens_seen": 25023768, "step": 2140 }, { "epoch": 1.1045583311872265, "grad_norm": 6.855334175793522, "learning_rate": 3.284822657536856e-05, "loss": 0.3848, "num_input_tokens_seen": 25082248, "step": 2145 }, { "epoch": 1.1071336595415915, "grad_norm": 5.3006438448712565, "learning_rate": 3.262002716103897e-05, "loss": 0.3699, "num_input_tokens_seen": 25140752, "step": 2150 }, { "epoch": 1.1071336595415915, "eval_loss": 0.8523861169815063, "eval_runtime": 16.0023, "eval_samples_per_second": 3.749, "eval_steps_per_second": 0.937, "num_input_tokens_seen": 25140752, "step": 2150 }, { "epoch": 1.1097089878959567, "grad_norm": 3.943124296473041, "learning_rate": 3.2392238785630386e-05, "loss": 0.3154, "num_input_tokens_seen": 25199208, "step": 2155 }, { "epoch": 1.112284316250322, "grad_norm": 8.398532132538953, "learning_rate": 3.216486683637146e-05, "loss": 0.3915, "num_input_tokens_seen": 25257680, "step": 2160 }, { "epoch": 1.114859644604687, "grad_norm": 4.081633194377614, "learning_rate": 3.1937916690642356e-05, "loss": 0.3675, "num_input_tokens_seen": 25316200, "step": 2165 }, { "epoch": 1.1174349729590523, "grad_norm": 6.920842495491902, "learning_rate": 3.1711393715847476e-05, "loss": 0.4047, "num_input_tokens_seen": 25374656, "step": 2170 }, { "epoch": 1.1200103013134175, "grad_norm": 8.460113153700512, "learning_rate": 3.14853032692886e-05, "loss": 0.4155, "num_input_tokens_seen": 25433168, "step": 2175 }, { "epoch": 1.1225856296677827, "grad_norm": 9.825074199159944, "learning_rate": 3.125965069803811e-05, "loss": 0.3966, "num_input_tokens_seen": 25491664, "step": 2180 }, { "epoch": 1.1251609580221478, "grad_norm": 5.732206927543506, "learning_rate": 3.103444133881261e-05, "loss": 0.3068, "num_input_tokens_seen": 25550128, "step": 2185 }, { "epoch": 1.127736286376513, "grad_norm": 6.135036052058211, "learning_rate": 3.080968051784666e-05, "loss": 0.386, "num_input_tokens_seen": 25608624, "step": 2190 }, { "epoch": 1.1303116147308783, "grad_norm": 3.31420885852192, "learning_rate": 3.058537355076683e-05, "loss": 0.3898, "num_input_tokens_seen": 25667128, "step": 2195 }, { "epoch": 1.1328869430852433, "grad_norm": 8.182546413863832, "learning_rate": 3.0361525742465973e-05, "loss": 0.4016, "num_input_tokens_seen": 25725560, "step": 2200 }, { "epoch": 1.1328869430852433, "eval_loss": 0.7534744143486023, "eval_runtime": 15.969, "eval_samples_per_second": 3.757, "eval_steps_per_second": 0.939, "num_input_tokens_seen": 25725560, "step": 2200 }, { "epoch": 1.1354622714396085, "grad_norm": 4.616007617470174, "learning_rate": 3.0138142386977787e-05, "loss": 0.3465, "num_input_tokens_seen": 25784048, "step": 2205 }, { "epoch": 1.1380375997939738, "grad_norm": 4.752551024155875, "learning_rate": 2.991522876735154e-05, "loss": 0.3077, "num_input_tokens_seen": 25842512, "step": 2210 }, { "epoch": 1.140612928148339, "grad_norm": 6.021213921198953, "learning_rate": 2.9692790155527227e-05, "loss": 0.4497, "num_input_tokens_seen": 25900992, "step": 2215 }, { "epoch": 1.143188256502704, "grad_norm": 8.098592782255322, "learning_rate": 2.9470831812210837e-05, "loss": 0.3811, "num_input_tokens_seen": 25959448, "step": 2220 }, { "epoch": 1.1457635848570693, "grad_norm": 6.108837560432838, "learning_rate": 2.924935898674992e-05, "loss": 0.4053, "num_input_tokens_seen": 26017936, "step": 2225 }, { "epoch": 1.1483389132114346, "grad_norm": 7.709937017464705, "learning_rate": 2.902837691700945e-05, "loss": 0.3421, "num_input_tokens_seen": 26076440, "step": 2230 }, { "epoch": 1.1509142415657996, "grad_norm": 3.840146275079161, "learning_rate": 2.880789082924798e-05, "loss": 0.3228, "num_input_tokens_seen": 26134896, "step": 2235 }, { "epoch": 1.1534895699201648, "grad_norm": 6.088757703790803, "learning_rate": 2.858790593799405e-05, "loss": 0.3695, "num_input_tokens_seen": 26193368, "step": 2240 }, { "epoch": 1.15606489827453, "grad_norm": 3.8647543120940844, "learning_rate": 2.8368427445922696e-05, "loss": 0.3463, "num_input_tokens_seen": 26251848, "step": 2245 }, { "epoch": 1.158640226628895, "grad_norm": 4.425454601086007, "learning_rate": 2.8149460543732664e-05, "loss": 0.3442, "num_input_tokens_seen": 26310336, "step": 2250 }, { "epoch": 1.158640226628895, "eval_loss": 0.7066138386726379, "eval_runtime": 15.9558, "eval_samples_per_second": 3.76, "eval_steps_per_second": 0.94, "num_input_tokens_seen": 26310336, "step": 2250 }, { "epoch": 1.1612155549832603, "grad_norm": 6.312367706992343, "learning_rate": 2.7931010410023518e-05, "loss": 0.3547, "num_input_tokens_seen": 26368840, "step": 2255 }, { "epoch": 1.1637908833376256, "grad_norm": 6.429493717694784, "learning_rate": 2.771308221117309e-05, "loss": 0.3125, "num_input_tokens_seen": 26427280, "step": 2260 }, { "epoch": 1.1663662116919906, "grad_norm": 6.993677707266103, "learning_rate": 2.749568110121545e-05, "loss": 0.3521, "num_input_tokens_seen": 26485760, "step": 2265 }, { "epoch": 1.1689415400463559, "grad_norm": 5.03743116566882, "learning_rate": 2.7278812221718924e-05, "loss": 0.281, "num_input_tokens_seen": 26544224, "step": 2270 }, { "epoch": 1.1715168684007211, "grad_norm": 5.828198718501714, "learning_rate": 2.7062480701664488e-05, "loss": 0.3653, "num_input_tokens_seen": 26602712, "step": 2275 }, { "epoch": 1.1740921967550864, "grad_norm": 6.1247491578050655, "learning_rate": 2.6846691657324473e-05, "loss": 0.3964, "num_input_tokens_seen": 26661160, "step": 2280 }, { "epoch": 1.1766675251094514, "grad_norm": 6.231155247277189, "learning_rate": 2.663145019214163e-05, "loss": 0.3119, "num_input_tokens_seen": 26719648, "step": 2285 }, { "epoch": 1.1792428534638166, "grad_norm": 6.501604840456734, "learning_rate": 2.6416761396608362e-05, "loss": 0.3832, "num_input_tokens_seen": 26778112, "step": 2290 }, { "epoch": 1.1818181818181819, "grad_norm": 5.377003761278013, "learning_rate": 2.6202630348146324e-05, "loss": 0.3277, "num_input_tokens_seen": 26836592, "step": 2295 }, { "epoch": 1.184393510172547, "grad_norm": 4.826044073542379, "learning_rate": 2.598906211098643e-05, "loss": 0.3877, "num_input_tokens_seen": 26895096, "step": 2300 }, { "epoch": 1.184393510172547, "eval_loss": 0.727741539478302, "eval_runtime": 15.9289, "eval_samples_per_second": 3.767, "eval_steps_per_second": 0.942, "num_input_tokens_seen": 26895096, "step": 2300 }, { "epoch": 1.1869688385269122, "grad_norm": 6.370847827905799, "learning_rate": 2.577606173604894e-05, "loss": 0.3033, "num_input_tokens_seen": 26953560, "step": 2305 }, { "epoch": 1.1895441668812774, "grad_norm": 11.746077197029585, "learning_rate": 2.5563634260824175e-05, "loss": 0.4104, "num_input_tokens_seen": 27012024, "step": 2310 }, { "epoch": 1.1921194952356426, "grad_norm": 3.9544988689102762, "learning_rate": 2.535178470925323e-05, "loss": 0.3447, "num_input_tokens_seen": 27070520, "step": 2315 }, { "epoch": 1.1946948235900077, "grad_norm": 4.72491689052158, "learning_rate": 2.5140518091609256e-05, "loss": 0.2882, "num_input_tokens_seen": 27128984, "step": 2320 }, { "epoch": 1.197270151944373, "grad_norm": 2.1806068747411245, "learning_rate": 2.4929839404378936e-05, "loss": 0.2817, "num_input_tokens_seen": 27187432, "step": 2325 }, { "epoch": 1.1998454802987382, "grad_norm": 3.2798105115490745, "learning_rate": 2.471975363014428e-05, "loss": 0.3693, "num_input_tokens_seen": 27245920, "step": 2330 }, { "epoch": 1.2024208086531032, "grad_norm": 7.472396523773262, "learning_rate": 2.451026573746482e-05, "loss": 0.3587, "num_input_tokens_seen": 27304384, "step": 2335 }, { "epoch": 1.2049961370074684, "grad_norm": 6.7073623181550275, "learning_rate": 2.430138068076013e-05, "loss": 0.354, "num_input_tokens_seen": 27362864, "step": 2340 }, { "epoch": 1.2075714653618337, "grad_norm": 6.2693798293878515, "learning_rate": 2.4093103400192625e-05, "loss": 0.3209, "num_input_tokens_seen": 27421360, "step": 2345 }, { "epoch": 1.210146793716199, "grad_norm": 6.606866726236357, "learning_rate": 2.388543882155067e-05, "loss": 0.3871, "num_input_tokens_seen": 27479840, "step": 2350 }, { "epoch": 1.210146793716199, "eval_loss": 0.7659633755683899, "eval_runtime": 16.0101, "eval_samples_per_second": 3.748, "eval_steps_per_second": 0.937, "num_input_tokens_seen": 27479840, "step": 2350 }, { "epoch": 1.212722122070564, "grad_norm": 8.004400275953609, "learning_rate": 2.3678391856132204e-05, "loss": 0.352, "num_input_tokens_seen": 27538344, "step": 2355 }, { "epoch": 1.2152974504249292, "grad_norm": 8.385547193425513, "learning_rate": 2.3471967400628513e-05, "loss": 0.347, "num_input_tokens_seen": 27596808, "step": 2360 }, { "epoch": 1.2178727787792945, "grad_norm": 3.9234442237475435, "learning_rate": 2.3266170337008398e-05, "loss": 0.3667, "num_input_tokens_seen": 27655272, "step": 2365 }, { "epoch": 1.2204481071336595, "grad_norm": 6.584480429736488, "learning_rate": 2.306100553240274e-05, "loss": 0.3311, "num_input_tokens_seen": 27713784, "step": 2370 }, { "epoch": 1.2230234354880247, "grad_norm": 5.791637874835276, "learning_rate": 2.2856477838989456e-05, "loss": 0.2964, "num_input_tokens_seen": 27772248, "step": 2375 }, { "epoch": 1.22559876384239, "grad_norm": 5.663503226529594, "learning_rate": 2.2652592093878666e-05, "loss": 0.3683, "num_input_tokens_seen": 27830704, "step": 2380 }, { "epoch": 1.228174092196755, "grad_norm": 9.657080260273457, "learning_rate": 2.244935311899829e-05, "loss": 0.3819, "num_input_tokens_seen": 27889160, "step": 2385 }, { "epoch": 1.2307494205511202, "grad_norm": 4.757552901440964, "learning_rate": 2.224676572098007e-05, "loss": 0.3084, "num_input_tokens_seen": 27947608, "step": 2390 }, { "epoch": 1.2333247489054855, "grad_norm": 5.188072586185411, "learning_rate": 2.2044834691045873e-05, "loss": 0.4267, "num_input_tokens_seen": 28006112, "step": 2395 }, { "epoch": 1.2359000772598505, "grad_norm": 7.221389028269126, "learning_rate": 2.184356480489432e-05, "loss": 0.3486, "num_input_tokens_seen": 28064552, "step": 2400 }, { "epoch": 1.2359000772598505, "eval_loss": 0.7410638928413391, "eval_runtime": 15.945, "eval_samples_per_second": 3.763, "eval_steps_per_second": 0.941, "num_input_tokens_seen": 28064552, "step": 2400 }, { "epoch": 1.2384754056142158, "grad_norm": 4.430659190759614, "learning_rate": 2.1642960822587878e-05, "loss": 0.2416, "num_input_tokens_seen": 28123016, "step": 2405 }, { "epoch": 1.241050733968581, "grad_norm": 4.985077238748084, "learning_rate": 2.1443027488440338e-05, "loss": 0.3007, "num_input_tokens_seen": 28181464, "step": 2410 }, { "epoch": 1.2436260623229463, "grad_norm": 11.21074775906945, "learning_rate": 2.124376953090456e-05, "loss": 0.2655, "num_input_tokens_seen": 28239920, "step": 2415 }, { "epoch": 1.2462013906773113, "grad_norm": 6.8116545197169724, "learning_rate": 2.104519166246059e-05, "loss": 0.3075, "num_input_tokens_seen": 28298432, "step": 2420 }, { "epoch": 1.2487767190316765, "grad_norm": 10.87615610006345, "learning_rate": 2.0847298579504344e-05, "loss": 0.3537, "num_input_tokens_seen": 28356904, "step": 2425 }, { "epoch": 1.2513520473860418, "grad_norm": 3.9413743825159133, "learning_rate": 2.065009496223638e-05, "loss": 0.2993, "num_input_tokens_seen": 28415384, "step": 2430 }, { "epoch": 1.2539273757404068, "grad_norm": 3.3043013555966407, "learning_rate": 2.045358547455138e-05, "loss": 0.2752, "num_input_tokens_seen": 28473848, "step": 2435 }, { "epoch": 1.256502704094772, "grad_norm": 3.6641007142438338, "learning_rate": 2.0257774763927655e-05, "loss": 0.2975, "num_input_tokens_seen": 28532312, "step": 2440 }, { "epoch": 1.2590780324491373, "grad_norm": 6.306122720573227, "learning_rate": 2.0062667461317426e-05, "loss": 0.4051, "num_input_tokens_seen": 28590784, "step": 2445 }, { "epoch": 1.2616533608035025, "grad_norm": 4.823015256168698, "learning_rate": 1.9868268181037185e-05, "loss": 0.2966, "num_input_tokens_seen": 28649256, "step": 2450 }, { "epoch": 1.2616533608035025, "eval_loss": 0.7485548853874207, "eval_runtime": 16.0437, "eval_samples_per_second": 3.74, "eval_steps_per_second": 0.935, "num_input_tokens_seen": 28649256, "step": 2450 }, { "epoch": 1.2642286891578676, "grad_norm": 10.005201788297592, "learning_rate": 1.967458152065857e-05, "loss": 0.2664, "num_input_tokens_seen": 28707736, "step": 2455 }, { "epoch": 1.2668040175122328, "grad_norm": 4.744134155404128, "learning_rate": 1.9481612060899646e-05, "loss": 0.3692, "num_input_tokens_seen": 28766232, "step": 2460 }, { "epoch": 1.269379345866598, "grad_norm": 8.49200897563331, "learning_rate": 1.928936436551661e-05, "loss": 0.315, "num_input_tokens_seen": 28824688, "step": 2465 }, { "epoch": 1.271954674220963, "grad_norm": 5.112500789477909, "learning_rate": 1.9097842981195834e-05, "loss": 0.3536, "num_input_tokens_seen": 28883176, "step": 2470 }, { "epoch": 1.2745300025753283, "grad_norm": 4.93472430343828, "learning_rate": 1.8907052437446272e-05, "loss": 0.3143, "num_input_tokens_seen": 28941592, "step": 2475 }, { "epoch": 1.2771053309296936, "grad_norm": 4.6754631245280365, "learning_rate": 1.871699724649244e-05, "loss": 0.3114, "num_input_tokens_seen": 29000064, "step": 2480 }, { "epoch": 1.2796806592840588, "grad_norm": 7.198381813960669, "learning_rate": 1.8527681903167644e-05, "loss": 0.3327, "num_input_tokens_seen": 29058496, "step": 2485 }, { "epoch": 1.2822559876384239, "grad_norm": 9.221713217692685, "learning_rate": 1.833911088480767e-05, "loss": 0.2543, "num_input_tokens_seen": 29116992, "step": 2490 }, { "epoch": 1.284831315992789, "grad_norm": 8.499870267936974, "learning_rate": 1.8151288651144893e-05, "loss": 0.2854, "num_input_tokens_seen": 29175496, "step": 2495 }, { "epoch": 1.2874066443471541, "grad_norm": 4.289294450742717, "learning_rate": 1.796421964420285e-05, "loss": 0.3221, "num_input_tokens_seen": 29233968, "step": 2500 }, { "epoch": 1.2874066443471541, "eval_loss": 0.7222262620925903, "eval_runtime": 16.106, "eval_samples_per_second": 3.725, "eval_steps_per_second": 0.931, "num_input_tokens_seen": 29233968, "step": 2500 }, { "epoch": 1.2899819727015194, "grad_norm": 3.3788238852269035, "learning_rate": 1.7777908288191176e-05, "loss": 0.2344, "num_input_tokens_seen": 29292464, "step": 2505 }, { "epoch": 1.2925573010558846, "grad_norm": 9.201457612553746, "learning_rate": 1.7592358989400883e-05, "loss": 0.2727, "num_input_tokens_seen": 29350952, "step": 2510 }, { "epoch": 1.2951326294102499, "grad_norm": 4.626370050462018, "learning_rate": 1.740757613610028e-05, "loss": 0.2687, "num_input_tokens_seen": 29409432, "step": 2515 }, { "epoch": 1.2977079577646151, "grad_norm": 5.784936514951468, "learning_rate": 1.7223564098431067e-05, "loss": 0.2632, "num_input_tokens_seen": 29467880, "step": 2520 }, { "epoch": 1.3002832861189801, "grad_norm": 4.405244480948001, "learning_rate": 1.704032722830512e-05, "loss": 0.3057, "num_input_tokens_seen": 29526384, "step": 2525 }, { "epoch": 1.3028586144733454, "grad_norm": 7.8069578913798825, "learning_rate": 1.68578698593014e-05, "loss": 0.3054, "num_input_tokens_seen": 29584880, "step": 2530 }, { "epoch": 1.3054339428277104, "grad_norm": 6.957468356582848, "learning_rate": 1.6676196306563613e-05, "loss": 0.28, "num_input_tokens_seen": 29643344, "step": 2535 }, { "epoch": 1.3080092711820757, "grad_norm": 9.353535349996537, "learning_rate": 1.6495310866698093e-05, "loss": 0.3169, "num_input_tokens_seen": 29701864, "step": 2540 }, { "epoch": 1.310584599536441, "grad_norm": 5.246799138683368, "learning_rate": 1.631521781767214e-05, "loss": 0.2985, "num_input_tokens_seen": 29760376, "step": 2545 }, { "epoch": 1.3131599278908062, "grad_norm": 10.51357763616516, "learning_rate": 1.6135921418712956e-05, "loss": 0.3231, "num_input_tokens_seen": 29818856, "step": 2550 }, { "epoch": 1.3131599278908062, "eval_loss": 0.7146337628364563, "eval_runtime": 16.171, "eval_samples_per_second": 3.71, "eval_steps_per_second": 0.928, "num_input_tokens_seen": 29818856, "step": 2550 }, { "epoch": 1.3157352562451712, "grad_norm": 5.568529968511631, "learning_rate": 1.5957425910206785e-05, "loss": 0.2689, "num_input_tokens_seen": 29877288, "step": 2555 }, { "epoch": 1.3183105845995364, "grad_norm": 4.860244764698273, "learning_rate": 1.577973551359877e-05, "loss": 0.3889, "num_input_tokens_seen": 29935776, "step": 2560 }, { "epoch": 1.3208859129539017, "grad_norm": 4.938342083847672, "learning_rate": 1.560285443129296e-05, "loss": 0.2489, "num_input_tokens_seen": 29994232, "step": 2565 }, { "epoch": 1.3234612413082667, "grad_norm": 7.223451539163128, "learning_rate": 1.542678684655306e-05, "loss": 0.3016, "num_input_tokens_seen": 30052760, "step": 2570 }, { "epoch": 1.326036569662632, "grad_norm": 8.03849051806361, "learning_rate": 1.5251536923403426e-05, "loss": 0.3063, "num_input_tokens_seen": 30111200, "step": 2575 }, { "epoch": 1.3286118980169972, "grad_norm": 6.631117517846943, "learning_rate": 1.5077108806530581e-05, "loss": 0.3159, "num_input_tokens_seen": 30169680, "step": 2580 }, { "epoch": 1.3311872263713624, "grad_norm": 4.171513219192368, "learning_rate": 1.4903506621185192e-05, "loss": 0.3752, "num_input_tokens_seen": 30228176, "step": 2585 }, { "epoch": 1.3337625547257275, "grad_norm": 4.3829742543964985, "learning_rate": 1.4730734473084568e-05, "loss": 0.3207, "num_input_tokens_seen": 30286656, "step": 2590 }, { "epoch": 1.3363378830800927, "grad_norm": 7.160007281376411, "learning_rate": 1.4558796448315504e-05, "loss": 0.2928, "num_input_tokens_seen": 30345160, "step": 2595 }, { "epoch": 1.338913211434458, "grad_norm": 7.316812087176357, "learning_rate": 1.4387696613237612e-05, "loss": 0.2779, "num_input_tokens_seen": 30403640, "step": 2600 }, { "epoch": 1.338913211434458, "eval_loss": 0.695651650428772, "eval_runtime": 16.2569, "eval_samples_per_second": 3.691, "eval_steps_per_second": 0.923, "num_input_tokens_seen": 30403640, "step": 2600 }, { "epoch": 1.341488539788823, "grad_norm": 6.900087606750275, "learning_rate": 1.4217439014387251e-05, "loss": 0.3037, "num_input_tokens_seen": 30462128, "step": 2605 }, { "epoch": 1.3440638681431882, "grad_norm": 9.361737062462586, "learning_rate": 1.404802767838176e-05, "loss": 0.2905, "num_input_tokens_seen": 30520616, "step": 2610 }, { "epoch": 1.3466391964975535, "grad_norm": 11.101564672040755, "learning_rate": 1.3879466611824199e-05, "loss": 0.317, "num_input_tokens_seen": 30579024, "step": 2615 }, { "epoch": 1.3492145248519187, "grad_norm": 5.213355428878847, "learning_rate": 1.371175980120864e-05, "loss": 0.2794, "num_input_tokens_seen": 30637464, "step": 2620 }, { "epoch": 1.3517898532062838, "grad_norm": 4.8688198861459915, "learning_rate": 1.3544911212825906e-05, "loss": 0.3056, "num_input_tokens_seen": 30695936, "step": 2625 }, { "epoch": 1.354365181560649, "grad_norm": 9.002025840794365, "learning_rate": 1.337892479266974e-05, "loss": 0.2712, "num_input_tokens_seen": 30754408, "step": 2630 }, { "epoch": 1.356940509915014, "grad_norm": 4.793656741683869, "learning_rate": 1.3213804466343421e-05, "loss": 0.2615, "num_input_tokens_seen": 30812848, "step": 2635 }, { "epoch": 1.3595158382693793, "grad_norm": 5.128300113893045, "learning_rate": 1.3049554138967051e-05, "loss": 0.2661, "num_input_tokens_seen": 30871344, "step": 2640 }, { "epoch": 1.3620911666237445, "grad_norm": 6.038434247454305, "learning_rate": 1.2886177695085078e-05, "loss": 0.3272, "num_input_tokens_seen": 30929824, "step": 2645 }, { "epoch": 1.3646664949781098, "grad_norm": 5.501317116522042, "learning_rate": 1.2723678998574512e-05, "loss": 0.2962, "num_input_tokens_seen": 30988344, "step": 2650 }, { "epoch": 1.3646664949781098, "eval_loss": 0.7657458186149597, "eval_runtime": 16.0821, "eval_samples_per_second": 3.731, "eval_steps_per_second": 0.933, "num_input_tokens_seen": 30988344, "step": 2650 }, { "epoch": 1.367241823332475, "grad_norm": 5.445887797084714, "learning_rate": 1.2562061892553473e-05, "loss": 0.3207, "num_input_tokens_seen": 31046848, "step": 2655 }, { "epoch": 1.36981715168684, "grad_norm": 8.28343197617098, "learning_rate": 1.2401330199290367e-05, "loss": 0.3001, "num_input_tokens_seen": 31105352, "step": 2660 }, { "epoch": 1.3723924800412053, "grad_norm": 6.0349779847885054, "learning_rate": 1.224148772011346e-05, "loss": 0.2858, "num_input_tokens_seen": 31163848, "step": 2665 }, { "epoch": 1.3749678083955703, "grad_norm": 6.430225669948217, "learning_rate": 1.2082538235320929e-05, "loss": 0.2338, "num_input_tokens_seen": 31222360, "step": 2670 }, { "epoch": 1.3775431367499356, "grad_norm": 7.550675916086161, "learning_rate": 1.1924485504091565e-05, "loss": 0.2212, "num_input_tokens_seen": 31280840, "step": 2675 }, { "epoch": 1.3801184651043008, "grad_norm": 9.927835245980713, "learning_rate": 1.1767333264395736e-05, "loss": 0.3131, "num_input_tokens_seen": 31339264, "step": 2680 }, { "epoch": 1.382693793458666, "grad_norm": 6.940248775417007, "learning_rate": 1.1611085232907132e-05, "loss": 0.3616, "num_input_tokens_seen": 31397744, "step": 2685 }, { "epoch": 1.385269121813031, "grad_norm": 13.50108715364713, "learning_rate": 1.14557451049147e-05, "loss": 0.3153, "num_input_tokens_seen": 31456240, "step": 2690 }, { "epoch": 1.3878444501673963, "grad_norm": 5.379761157260886, "learning_rate": 1.1301316554235397e-05, "loss": 0.3044, "num_input_tokens_seen": 31514744, "step": 2695 }, { "epoch": 1.3904197785217616, "grad_norm": 6.480605347127299, "learning_rate": 1.114780323312724e-05, "loss": 0.3163, "num_input_tokens_seen": 31573240, "step": 2700 }, { "epoch": 1.3904197785217616, "eval_loss": 0.7473158240318298, "eval_runtime": 16.166, "eval_samples_per_second": 3.711, "eval_steps_per_second": 0.928, "num_input_tokens_seen": 31573240, "step": 2700 }, { "epoch": 1.3929951068761266, "grad_norm": 4.579483859059419, "learning_rate": 1.0995208772202897e-05, "loss": 0.2798, "num_input_tokens_seen": 31631688, "step": 2705 }, { "epoch": 1.3955704352304918, "grad_norm": 6.098482033036635, "learning_rate": 1.0843536780343865e-05, "loss": 0.289, "num_input_tokens_seen": 31690200, "step": 2710 }, { "epoch": 1.398145763584857, "grad_norm": 9.834029857293697, "learning_rate": 1.069279084461513e-05, "loss": 0.2844, "num_input_tokens_seen": 31748664, "step": 2715 }, { "epoch": 1.4007210919392223, "grad_norm": 9.387518267357049, "learning_rate": 1.0542974530180327e-05, "loss": 0.3254, "num_input_tokens_seen": 31807176, "step": 2720 }, { "epoch": 1.4032964202935874, "grad_norm": 5.648695214602192, "learning_rate": 1.0394091380217352e-05, "loss": 0.3683, "num_input_tokens_seen": 31865696, "step": 2725 }, { "epoch": 1.4058717486479526, "grad_norm": 5.202858729177478, "learning_rate": 1.0246144915834683e-05, "loss": 0.2968, "num_input_tokens_seen": 31924200, "step": 2730 }, { "epoch": 1.4084470770023179, "grad_norm": 4.808429946385537, "learning_rate": 1.0099138635988026e-05, "loss": 0.2943, "num_input_tokens_seen": 31982712, "step": 2735 }, { "epoch": 1.4110224053566829, "grad_norm": 5.094039780174813, "learning_rate": 9.953076017397578e-06, "loss": 0.3037, "num_input_tokens_seen": 32041176, "step": 2740 }, { "epoch": 1.4135977337110481, "grad_norm": 5.807237736394797, "learning_rate": 9.807960514465792e-06, "loss": 0.3019, "num_input_tokens_seen": 32099656, "step": 2745 }, { "epoch": 1.4161730620654134, "grad_norm": 6.27488451409393, "learning_rate": 9.663795559195733e-06, "loss": 0.164, "num_input_tokens_seen": 32158144, "step": 2750 }, { "epoch": 1.4161730620654134, "eval_loss": 0.7807286381721497, "eval_runtime": 16.139, "eval_samples_per_second": 3.718, "eval_steps_per_second": 0.929, "num_input_tokens_seen": 32158144, "step": 2750 }, { "epoch": 1.4187483904197786, "grad_norm": 6.584628814510667, "learning_rate": 9.520584561109864e-06, "loss": 0.3333, "num_input_tokens_seen": 32216656, "step": 2755 }, { "epoch": 1.4213237187741437, "grad_norm": 7.509676086247465, "learning_rate": 9.378330907169386e-06, "loss": 0.2993, "num_input_tokens_seen": 32275168, "step": 2760 }, { "epoch": 1.423899047128509, "grad_norm": 5.1775193353141535, "learning_rate": 9.237037961694223e-06, "loss": 0.2683, "num_input_tokens_seen": 32333664, "step": 2765 }, { "epoch": 1.4264743754828741, "grad_norm": 7.856433365965151, "learning_rate": 9.096709066283354e-06, "loss": 0.3145, "num_input_tokens_seen": 32392088, "step": 2770 }, { "epoch": 1.4290497038372392, "grad_norm": 8.252870521534577, "learning_rate": 8.957347539735872e-06, "loss": 0.3092, "num_input_tokens_seen": 32450584, "step": 2775 }, { "epoch": 1.4316250321916044, "grad_norm": 9.74883489294415, "learning_rate": 8.818956677972406e-06, "loss": 0.2993, "num_input_tokens_seen": 32509096, "step": 2780 }, { "epoch": 1.4342003605459697, "grad_norm": 4.008158818829899, "learning_rate": 8.681539753957269e-06, "loss": 0.326, "num_input_tokens_seen": 32567560, "step": 2785 }, { "epoch": 1.436775688900335, "grad_norm": 3.4229494980881174, "learning_rate": 8.545100017620988e-06, "loss": 0.2494, "num_input_tokens_seen": 32626056, "step": 2790 }, { "epoch": 1.4393510172547, "grad_norm": 4.425295787830864, "learning_rate": 8.409640695783443e-06, "loss": 0.2691, "num_input_tokens_seen": 32684520, "step": 2795 }, { "epoch": 1.4419263456090652, "grad_norm": 5.132559476583136, "learning_rate": 8.275164992077556e-06, "loss": 0.2939, "num_input_tokens_seen": 32743032, "step": 2800 }, { "epoch": 1.4419263456090652, "eval_loss": 0.791334331035614, "eval_runtime": 16.1142, "eval_samples_per_second": 3.723, "eval_steps_per_second": 0.931, "num_input_tokens_seen": 32743032, "step": 2800 }, { "epoch": 1.4445016739634302, "grad_norm": 4.932628514942533, "learning_rate": 8.141676086873572e-06, "loss": 0.2974, "num_input_tokens_seen": 32801504, "step": 2805 }, { "epoch": 1.4470770023177955, "grad_norm": 8.764444587690557, "learning_rate": 8.009177137203794e-06, "loss": 0.2849, "num_input_tokens_seen": 32860032, "step": 2810 }, { "epoch": 1.4496523306721607, "grad_norm": 5.502098759051231, "learning_rate": 7.877671276687898e-06, "loss": 0.3024, "num_input_tokens_seen": 32918472, "step": 2815 }, { "epoch": 1.452227659026526, "grad_norm": 3.2634043608450183, "learning_rate": 7.747161615458902e-06, "loss": 0.2565, "num_input_tokens_seen": 32976944, "step": 2820 }, { "epoch": 1.4548029873808912, "grad_norm": 4.852977750360098, "learning_rate": 7.617651240089546e-06, "loss": 0.2473, "num_input_tokens_seen": 33035424, "step": 2825 }, { "epoch": 1.4573783157352562, "grad_norm": 8.667293936674204, "learning_rate": 7.489143213519301e-06, "loss": 0.3118, "num_input_tokens_seen": 33093880, "step": 2830 }, { "epoch": 1.4599536440896215, "grad_norm": 9.253351843058615, "learning_rate": 7.361640574981937e-06, "loss": 0.2593, "num_input_tokens_seen": 33152328, "step": 2835 }, { "epoch": 1.4625289724439865, "grad_norm": 6.811131820051524, "learning_rate": 7.2351463399336735e-06, "loss": 0.284, "num_input_tokens_seen": 33210816, "step": 2840 }, { "epoch": 1.4651043007983517, "grad_norm": 4.086720732934785, "learning_rate": 7.109663499981834e-06, "loss": 0.2671, "num_input_tokens_seen": 33269320, "step": 2845 }, { "epoch": 1.467679629152717, "grad_norm": 9.463519299706055, "learning_rate": 6.985195022814067e-06, "loss": 0.2848, "num_input_tokens_seen": 33327720, "step": 2850 }, { "epoch": 1.467679629152717, "eval_loss": 0.8045337796211243, "eval_runtime": 15.9996, "eval_samples_per_second": 3.75, "eval_steps_per_second": 0.938, "num_input_tokens_seen": 33327720, "step": 2850 }, { "epoch": 1.4702549575070822, "grad_norm": 6.856320486947826, "learning_rate": 6.861743852128233e-06, "loss": 0.2811, "num_input_tokens_seen": 33386160, "step": 2855 }, { "epoch": 1.4728302858614473, "grad_norm": 8.133776634702407, "learning_rate": 6.7393129075627335e-06, "loss": 0.2394, "num_input_tokens_seen": 33444648, "step": 2860 }, { "epoch": 1.4754056142158125, "grad_norm": 5.884612144672532, "learning_rate": 6.6179050846274515e-06, "loss": 0.243, "num_input_tokens_seen": 33503144, "step": 2865 }, { "epoch": 1.4779809425701778, "grad_norm": 7.133095118516192, "learning_rate": 6.497523254635296e-06, "loss": 0.242, "num_input_tokens_seen": 33561600, "step": 2870 }, { "epoch": 1.4805562709245428, "grad_norm": 3.725193081900286, "learning_rate": 6.37817026463432e-06, "loss": 0.1864, "num_input_tokens_seen": 33620056, "step": 2875 }, { "epoch": 1.483131599278908, "grad_norm": 5.26408055314188, "learning_rate": 6.25984893734034e-06, "loss": 0.2406, "num_input_tokens_seen": 33678512, "step": 2880 }, { "epoch": 1.4857069276332733, "grad_norm": 5.139938399894378, "learning_rate": 6.142562071070179e-06, "loss": 0.2287, "num_input_tokens_seen": 33736960, "step": 2885 }, { "epoch": 1.4882822559876385, "grad_norm": 5.551633292498772, "learning_rate": 6.026312439675552e-06, "loss": 0.2643, "num_input_tokens_seen": 33795416, "step": 2890 }, { "epoch": 1.4908575843420036, "grad_norm": 5.974549504189433, "learning_rate": 5.911102792477357e-06, "loss": 0.2956, "num_input_tokens_seen": 33853936, "step": 2895 }, { "epoch": 1.4934329126963688, "grad_norm": 5.786971041370645, "learning_rate": 5.796935854200763e-06, "loss": 0.29, "num_input_tokens_seen": 33912440, "step": 2900 }, { "epoch": 1.4934329126963688, "eval_loss": 0.8113046884536743, "eval_runtime": 16.0025, "eval_samples_per_second": 3.749, "eval_steps_per_second": 0.937, "num_input_tokens_seen": 33912440, "step": 2900 }, { "epoch": 1.496008241050734, "grad_norm": 5.559213288581127, "learning_rate": 5.683814324910685e-06, "loss": 0.2815, "num_input_tokens_seen": 33970888, "step": 2905 }, { "epoch": 1.498583569405099, "grad_norm": 4.093818675769417, "learning_rate": 5.571740879947979e-06, "loss": 0.2737, "num_input_tokens_seen": 34029376, "step": 2910 }, { "epoch": 1.5011588977594643, "grad_norm": 3.092699650877493, "learning_rate": 5.4607181698661634e-06, "loss": 0.2445, "num_input_tokens_seen": 34087864, "step": 2915 }, { "epoch": 1.5037342261138296, "grad_norm": 11.010380823046683, "learning_rate": 5.35074882036869e-06, "loss": 0.2802, "num_input_tokens_seen": 34146296, "step": 2920 }, { "epoch": 1.5063095544681948, "grad_norm": 6.09904123406433, "learning_rate": 5.241835432246889e-06, "loss": 0.2379, "num_input_tokens_seen": 34204800, "step": 2925 }, { "epoch": 1.5088848828225598, "grad_norm": 6.205588168386299, "learning_rate": 5.133980581318459e-06, "loss": 0.2783, "num_input_tokens_seen": 34263296, "step": 2930 }, { "epoch": 1.511460211176925, "grad_norm": 5.945749064464075, "learning_rate": 5.027186818366542e-06, "loss": 0.2609, "num_input_tokens_seen": 34321792, "step": 2935 }, { "epoch": 1.51403553953129, "grad_norm": 6.50829738633896, "learning_rate": 4.921456669079366e-06, "loss": 0.2367, "num_input_tokens_seen": 34380264, "step": 2940 }, { "epoch": 1.5166108678856554, "grad_norm": 8.02525724539128, "learning_rate": 4.816792633990569e-06, "loss": 0.3644, "num_input_tokens_seen": 34438752, "step": 2945 }, { "epoch": 1.5191861962400206, "grad_norm": 8.28398511184134, "learning_rate": 4.713197188420026e-06, "loss": 0.2494, "num_input_tokens_seen": 34497216, "step": 2950 }, { "epoch": 1.5191861962400206, "eval_loss": 0.8177086710929871, "eval_runtime": 16.0851, "eval_samples_per_second": 3.73, "eval_steps_per_second": 0.933, "num_input_tokens_seen": 34497216, "step": 2950 }, { "epoch": 1.5217615245943859, "grad_norm": 7.1653439027229, "learning_rate": 4.610672782415276e-06, "loss": 0.2892, "num_input_tokens_seen": 34555704, "step": 2955 }, { "epoch": 1.524336852948751, "grad_norm": 5.9872264088640295, "learning_rate": 4.509221840693656e-06, "loss": 0.3006, "num_input_tokens_seen": 34614168, "step": 2960 }, { "epoch": 1.5269121813031161, "grad_norm": 3.47728801697101, "learning_rate": 4.408846762584901e-06, "loss": 0.2931, "num_input_tokens_seen": 34672624, "step": 2965 }, { "epoch": 1.5294875096574814, "grad_norm": 5.342563435045045, "learning_rate": 4.309549921974421e-06, "loss": 0.2255, "num_input_tokens_seen": 34731056, "step": 2970 }, { "epoch": 1.5320628380118464, "grad_norm": 8.130368656554953, "learning_rate": 4.2113336672471245e-06, "loss": 0.2725, "num_input_tokens_seen": 34789552, "step": 2975 }, { "epoch": 1.5346381663662116, "grad_norm": 6.656792231449799, "learning_rate": 4.114200321231937e-06, "loss": 0.3158, "num_input_tokens_seen": 34848064, "step": 2980 }, { "epoch": 1.537213494720577, "grad_norm": 16.361277885783338, "learning_rate": 4.018152181146823e-06, "loss": 0.2562, "num_input_tokens_seen": 34906592, "step": 2985 }, { "epoch": 1.5397888230749421, "grad_norm": 5.885778380254227, "learning_rate": 3.923191518544434e-06, "loss": 0.2814, "num_input_tokens_seen": 34965064, "step": 2990 }, { "epoch": 1.5423641514293074, "grad_norm": 7.567800102342742, "learning_rate": 3.829320579258466e-06, "loss": 0.2555, "num_input_tokens_seen": 35023552, "step": 2995 }, { "epoch": 1.5449394797836724, "grad_norm": 6.846236051634878, "learning_rate": 3.7365415833504725e-06, "loss": 0.2259, "num_input_tokens_seen": 35082056, "step": 3000 }, { "epoch": 1.5449394797836724, "eval_loss": 0.8405727744102478, "eval_runtime": 16.2083, "eval_samples_per_second": 3.702, "eval_steps_per_second": 0.925, "num_input_tokens_seen": 35082056, "step": 3000 }, { "epoch": 1.5475148081380374, "grad_norm": 5.643348291984009, "learning_rate": 3.644856725057405e-06, "loss": 0.2157, "num_input_tokens_seen": 35140568, "step": 3005 }, { "epoch": 1.5500901364924027, "grad_norm": 6.225693907549098, "learning_rate": 3.554268172739661e-06, "loss": 0.2233, "num_input_tokens_seen": 35199064, "step": 3010 }, { "epoch": 1.552665464846768, "grad_norm": 5.080945994557626, "learning_rate": 3.4647780688298826e-06, "loss": 0.2951, "num_input_tokens_seen": 35257576, "step": 3015 }, { "epoch": 1.5552407932011332, "grad_norm": 5.263879934995459, "learning_rate": 3.376388529782215e-06, "loss": 0.2274, "num_input_tokens_seen": 35316064, "step": 3020 }, { "epoch": 1.5578161215554984, "grad_norm": 5.655349471422181, "learning_rate": 3.2891016460222967e-06, "loss": 0.2479, "num_input_tokens_seen": 35374504, "step": 3025 }, { "epoch": 1.5603914499098637, "grad_norm": 7.871895425892081, "learning_rate": 3.2029194818977983e-06, "loss": 0.292, "num_input_tokens_seen": 35432984, "step": 3030 }, { "epoch": 1.5629667782642287, "grad_norm": 6.441418084723481, "learning_rate": 3.117844075629617e-06, "loss": 0.241, "num_input_tokens_seen": 35491488, "step": 3035 }, { "epoch": 1.5655421066185937, "grad_norm": 5.268339109046189, "learning_rate": 3.033877439263666e-06, "loss": 0.228, "num_input_tokens_seen": 35549984, "step": 3040 }, { "epoch": 1.568117434972959, "grad_norm": 7.110464304213341, "learning_rate": 2.951021558623274e-06, "loss": 0.2485, "num_input_tokens_seen": 35608488, "step": 3045 }, { "epoch": 1.5706927633273242, "grad_norm": 12.567694093056492, "learning_rate": 2.869278393262226e-06, "loss": 0.2851, "num_input_tokens_seen": 35666976, "step": 3050 }, { "epoch": 1.5706927633273242, "eval_loss": 0.8473746180534363, "eval_runtime": 16.0314, "eval_samples_per_second": 3.743, "eval_steps_per_second": 0.936, "num_input_tokens_seen": 35666976, "step": 3050 }, { "epoch": 1.5732680916816895, "grad_norm": 5.787936921221981, "learning_rate": 2.7886498764184588e-06, "loss": 0.2514, "num_input_tokens_seen": 35725456, "step": 3055 }, { "epoch": 1.5758434200360547, "grad_norm": 7.052716790363759, "learning_rate": 2.7091379149682685e-06, "loss": 0.3091, "num_input_tokens_seen": 35783912, "step": 3060 }, { "epoch": 1.5784187483904197, "grad_norm": 4.6737853290480915, "learning_rate": 2.6307443893812843e-06, "loss": 0.2629, "num_input_tokens_seen": 35842376, "step": 3065 }, { "epoch": 1.580994076744785, "grad_norm": 8.400296818269052, "learning_rate": 2.5534711536759404e-06, "loss": 0.3065, "num_input_tokens_seen": 35900824, "step": 3070 }, { "epoch": 1.58356940509915, "grad_norm": 3.909241159865706, "learning_rate": 2.4773200353756798e-06, "loss": 0.2577, "num_input_tokens_seen": 35959264, "step": 3075 }, { "epoch": 1.5861447334535153, "grad_norm": 5.227660314173737, "learning_rate": 2.4022928354656473e-06, "loss": 0.2359, "num_input_tokens_seen": 36017760, "step": 3080 }, { "epoch": 1.5887200618078805, "grad_norm": 5.407491053931616, "learning_rate": 2.3283913283502044e-06, "loss": 0.1897, "num_input_tokens_seen": 36076280, "step": 3085 }, { "epoch": 1.5912953901622457, "grad_norm": 5.771594174948701, "learning_rate": 2.2556172618108997e-06, "loss": 0.286, "num_input_tokens_seen": 36134784, "step": 3090 }, { "epoch": 1.593870718516611, "grad_norm": 5.508770087080472, "learning_rate": 2.183972356965125e-06, "loss": 0.2733, "num_input_tokens_seen": 36193288, "step": 3095 }, { "epoch": 1.596446046870976, "grad_norm": 6.343942326218544, "learning_rate": 2.113458308225458e-06, "loss": 0.2351, "num_input_tokens_seen": 36251744, "step": 3100 }, { "epoch": 1.596446046870976, "eval_loss": 0.8650907874107361, "eval_runtime": 16.0989, "eval_samples_per_second": 3.727, "eval_steps_per_second": 0.932, "num_input_tokens_seen": 36251744, "step": 3100 }, { "epoch": 1.5990213752253413, "grad_norm": 4.00048030481465, "learning_rate": 2.0440767832595574e-06, "loss": 0.2454, "num_input_tokens_seen": 36310200, "step": 3105 }, { "epoch": 1.6015967035797063, "grad_norm": 5.230064679031373, "learning_rate": 1.975829422950709e-06, "loss": 0.2629, "num_input_tokens_seen": 36368688, "step": 3110 }, { "epoch": 1.6041720319340715, "grad_norm": 12.271894553598498, "learning_rate": 1.908717841359048e-06, "loss": 0.2848, "num_input_tokens_seen": 36427192, "step": 3115 }, { "epoch": 1.6067473602884368, "grad_norm": 8.178213306290619, "learning_rate": 1.8427436256833852e-06, "loss": 0.228, "num_input_tokens_seen": 36485656, "step": 3120 }, { "epoch": 1.609322688642802, "grad_norm": 4.853366085377887, "learning_rate": 1.7779083362236547e-06, "loss": 0.2239, "num_input_tokens_seen": 36544128, "step": 3125 }, { "epoch": 1.6118980169971673, "grad_norm": 10.968162741068843, "learning_rate": 1.7142135063440035e-06, "loss": 0.2585, "num_input_tokens_seen": 36602568, "step": 3130 }, { "epoch": 1.6144733453515323, "grad_norm": 5.564416348243761, "learning_rate": 1.6516606424365643e-06, "loss": 0.2887, "num_input_tokens_seen": 36661064, "step": 3135 }, { "epoch": 1.6170486737058976, "grad_norm": 8.095832161946442, "learning_rate": 1.5902512238857858e-06, "loss": 0.2446, "num_input_tokens_seen": 36719544, "step": 3140 }, { "epoch": 1.6196240020602626, "grad_norm": 8.906257390618395, "learning_rate": 1.5299867030334814e-06, "loss": 0.2673, "num_input_tokens_seen": 36778064, "step": 3145 }, { "epoch": 1.6221993304146278, "grad_norm": 6.864070166407251, "learning_rate": 1.4708685051444515e-06, "loss": 0.2638, "num_input_tokens_seen": 36836560, "step": 3150 }, { "epoch": 1.6221993304146278, "eval_loss": 0.8633677363395691, "eval_runtime": 16.2031, "eval_samples_per_second": 3.703, "eval_steps_per_second": 0.926, "num_input_tokens_seen": 36836560, "step": 3150 }, { "epoch": 1.624774658768993, "grad_norm": 8.026607293073416, "learning_rate": 1.4128980283727943e-06, "loss": 0.2793, "num_input_tokens_seen": 36895016, "step": 3155 }, { "epoch": 1.6273499871233583, "grad_norm": 11.669862098293653, "learning_rate": 1.356076643728843e-06, "loss": 0.2887, "num_input_tokens_seen": 36953528, "step": 3160 }, { "epoch": 1.6299253154777236, "grad_norm": 5.580791837684188, "learning_rate": 1.3004056950467135e-06, "loss": 0.317, "num_input_tokens_seen": 37012056, "step": 3165 }, { "epoch": 1.6325006438320886, "grad_norm": 4.650356589287389, "learning_rate": 1.2458864989525698e-06, "loss": 0.2095, "num_input_tokens_seen": 37070528, "step": 3170 }, { "epoch": 1.6350759721864536, "grad_norm": 6.089813437162075, "learning_rate": 1.19252034483342e-06, "loss": 0.237, "num_input_tokens_seen": 37129008, "step": 3175 }, { "epoch": 1.6376513005408189, "grad_norm": 5.287668578489162, "learning_rate": 1.1403084948067021e-06, "loss": 0.2448, "num_input_tokens_seen": 37187472, "step": 3180 }, { "epoch": 1.6402266288951841, "grad_norm": 6.982602482070445, "learning_rate": 1.089252183690348e-06, "loss": 0.2563, "num_input_tokens_seen": 37245936, "step": 3185 }, { "epoch": 1.6428019572495494, "grad_norm": 2.9242653665827647, "learning_rate": 1.0393526189736602e-06, "loss": 0.2538, "num_input_tokens_seen": 37304424, "step": 3190 }, { "epoch": 1.6453772856039146, "grad_norm": 6.894723044936381, "learning_rate": 9.906109807887032e-07, "loss": 0.1768, "num_input_tokens_seen": 37362888, "step": 3195 }, { "epoch": 1.6479526139582796, "grad_norm": 6.796664957587956, "learning_rate": 9.430284218824026e-07, "loss": 0.312, "num_input_tokens_seen": 37421416, "step": 3200 }, { "epoch": 1.6479526139582796, "eval_loss": 0.8679988980293274, "eval_runtime": 16.1678, "eval_samples_per_second": 3.711, "eval_steps_per_second": 0.928, "num_input_tokens_seen": 37421416, "step": 3200 }, { "epoch": 1.6505279423126449, "grad_norm": 12.027460444161642, "learning_rate": 8.966060675892951e-07, "loss": 0.2865, "num_input_tokens_seen": 37479848, "step": 3205 }, { "epoch": 1.65310327066701, "grad_norm": 6.851221931248735, "learning_rate": 8.513450158049108e-07, "loss": 0.3299, "num_input_tokens_seen": 37538312, "step": 3210 }, { "epoch": 1.6556785990213752, "grad_norm": 6.971651790450948, "learning_rate": 8.072463369597993e-07, "loss": 0.3218, "num_input_tokens_seen": 37596800, "step": 3215 }, { "epoch": 1.6582539273757404, "grad_norm": 10.994527310957624, "learning_rate": 7.643110739942172e-07, "loss": 0.2593, "num_input_tokens_seen": 37655312, "step": 3220 }, { "epoch": 1.6608292557301056, "grad_norm": 13.542379224085927, "learning_rate": 7.225402423334693e-07, "loss": 0.3072, "num_input_tokens_seen": 37713800, "step": 3225 }, { "epoch": 1.663404584084471, "grad_norm": 5.442561929450427, "learning_rate": 6.819348298638839e-07, "loss": 0.2276, "num_input_tokens_seen": 37772280, "step": 3230 }, { "epoch": 1.665979912438836, "grad_norm": 8.128386248398428, "learning_rate": 6.424957969094536e-07, "loss": 0.2489, "num_input_tokens_seen": 37830800, "step": 3235 }, { "epoch": 1.6685552407932012, "grad_norm": 3.9766881915113266, "learning_rate": 6.0422407620912e-07, "loss": 0.2552, "num_input_tokens_seen": 37889280, "step": 3240 }, { "epoch": 1.6711305691475662, "grad_norm": 5.555365927504982, "learning_rate": 5.671205728947305e-07, "loss": 0.226, "num_input_tokens_seen": 37947728, "step": 3245 }, { "epoch": 1.6737058975019314, "grad_norm": 5.733028191926084, "learning_rate": 5.311861644696048e-07, "loss": 0.2785, "num_input_tokens_seen": 38006200, "step": 3250 }, { "epoch": 1.6737058975019314, "eval_loss": 0.8640011548995972, "eval_runtime": 16.0965, "eval_samples_per_second": 3.728, "eval_steps_per_second": 0.932, "num_input_tokens_seen": 38006200, "step": 3250 }, { "epoch": 1.6762812258562967, "grad_norm": 4.778342712582032, "learning_rate": 4.964217007878081e-07, "loss": 0.2291, "num_input_tokens_seen": 38064672, "step": 3255 }, { "epoch": 1.678856554210662, "grad_norm": 4.4902131141962, "learning_rate": 4.6282800403402715e-07, "loss": 0.3101, "num_input_tokens_seen": 38123192, "step": 3260 }, { "epoch": 1.6814318825650272, "grad_norm": 7.687294001046122, "learning_rate": 4.3040586870415346e-07, "loss": 0.3196, "num_input_tokens_seen": 38181696, "step": 3265 }, { "epoch": 1.6840072109193922, "grad_norm": 7.392271519909896, "learning_rate": 3.991560615864587e-07, "loss": 0.2587, "num_input_tokens_seen": 38240216, "step": 3270 }, { "epoch": 1.6865825392737575, "grad_norm": 6.335589264461425, "learning_rate": 3.6907932174349846e-07, "loss": 0.2093, "num_input_tokens_seen": 38298688, "step": 3275 }, { "epoch": 1.6891578676281225, "grad_norm": 7.268228162683875, "learning_rate": 3.40176360494604e-07, "loss": 0.2282, "num_input_tokens_seen": 38357128, "step": 3280 }, { "epoch": 1.6917331959824877, "grad_norm": 4.776419874246786, "learning_rate": 3.124478613990733e-07, "loss": 0.2092, "num_input_tokens_seen": 38415600, "step": 3285 }, { "epoch": 1.694308524336853, "grad_norm": 8.522894464657169, "learning_rate": 2.8589448023998987e-07, "loss": 0.2861, "num_input_tokens_seen": 38474112, "step": 3290 }, { "epoch": 1.6968838526912182, "grad_norm": 5.304805044526707, "learning_rate": 2.605168450087514e-07, "loss": 0.2494, "num_input_tokens_seen": 38532624, "step": 3295 }, { "epoch": 1.6994591810455835, "grad_norm": 7.112591931914542, "learning_rate": 2.363155558901542e-07, "loss": 0.2752, "num_input_tokens_seen": 38591128, "step": 3300 }, { "epoch": 1.6994591810455835, "eval_loss": 0.8644178509712219, "eval_runtime": 16.1497, "eval_samples_per_second": 3.715, "eval_steps_per_second": 0.929, "num_input_tokens_seen": 38591128, "step": 3300 }, { "epoch": 1.7020345093999485, "grad_norm": 4.935833215525081, "learning_rate": 2.1329118524827662e-07, "loss": 0.2337, "num_input_tokens_seen": 38649640, "step": 3305 }, { "epoch": 1.7046098377543135, "grad_norm": 5.746920185244728, "learning_rate": 1.9144427761286222e-07, "loss": 0.215, "num_input_tokens_seen": 38708112, "step": 3310 }, { "epoch": 1.7071851661086788, "grad_norm": 6.501004359690972, "learning_rate": 1.7077534966650766e-07, "loss": 0.2871, "num_input_tokens_seen": 38766624, "step": 3315 }, { "epoch": 1.709760494463044, "grad_norm": 6.996403813160393, "learning_rate": 1.51284890232406e-07, "loss": 0.3478, "num_input_tokens_seen": 38825104, "step": 3320 }, { "epoch": 1.7123358228174093, "grad_norm": 5.178545190033401, "learning_rate": 1.3297336026280027e-07, "loss": 0.2055, "num_input_tokens_seen": 38883560, "step": 3325 }, { "epoch": 1.7149111511717745, "grad_norm": 6.686144266429449, "learning_rate": 1.158411928280645e-07, "loss": 0.2992, "num_input_tokens_seen": 38942040, "step": 3330 }, { "epoch": 1.7174864795261395, "grad_norm": 4.337439288142164, "learning_rate": 9.988879310649513e-08, "loss": 0.2302, "num_input_tokens_seen": 39000488, "step": 3335 }, { "epoch": 1.7200618078805048, "grad_norm": 6.5240260149211755, "learning_rate": 8.511653837470212e-08, "loss": 0.265, "num_input_tokens_seen": 39058960, "step": 3340 }, { "epoch": 1.7226371362348698, "grad_norm": 7.592689596688837, "learning_rate": 7.152477799867719e-08, "loss": 0.3147, "num_input_tokens_seen": 39117416, "step": 3345 }, { "epoch": 1.725212464589235, "grad_norm": 6.429413076205037, "learning_rate": 5.911383342556143e-08, "loss": 0.2674, "num_input_tokens_seen": 39175888, "step": 3350 }, { "epoch": 1.725212464589235, "eval_loss": 0.8666485548019409, "eval_runtime": 16.1238, "eval_samples_per_second": 3.721, "eval_steps_per_second": 0.93, "num_input_tokens_seen": 39175888, "step": 3350 }, { "epoch": 1.7277877929436003, "grad_norm": 10.968051828666288, "learning_rate": 4.788399817602929e-08, "loss": 0.2565, "num_input_tokens_seen": 39234336, "step": 3355 }, { "epoch": 1.7303631212979655, "grad_norm": 5.1159559645491335, "learning_rate": 3.7835537837338506e-08, "loss": 0.2762, "num_input_tokens_seen": 39292800, "step": 3360 }, { "epoch": 1.7329384496523308, "grad_norm": 6.735859744015271, "learning_rate": 2.8968690057051828e-08, "loss": 0.2196, "num_input_tokens_seen": 39351272, "step": 3365 }, { "epoch": 1.7355137780066958, "grad_norm": 3.989003741597172, "learning_rate": 2.128366453743591e-08, "loss": 0.2482, "num_input_tokens_seen": 39409736, "step": 3370 }, { "epoch": 1.738089106361061, "grad_norm": 5.083412307953648, "learning_rate": 1.4780643030476438e-08, "loss": 0.2778, "num_input_tokens_seen": 39468176, "step": 3375 }, { "epoch": 1.740664434715426, "grad_norm": 7.4306605849577565, "learning_rate": 9.459779333587104e-09, "loss": 0.2048, "num_input_tokens_seen": 39526688, "step": 3380 }, { "epoch": 1.7432397630697913, "grad_norm": 4.202839419581782, "learning_rate": 5.3211992859791835e-09, "loss": 0.2296, "num_input_tokens_seen": 39585152, "step": 3385 }, { "epoch": 1.7458150914241566, "grad_norm": 7.909317855624412, "learning_rate": 2.3650007656805806e-09, "loss": 0.2713, "num_input_tokens_seen": 39643640, "step": 3390 }, { "epoch": 1.7483904197785218, "grad_norm": 7.880795429819755, "learning_rate": 5.912536872321184e-10, "loss": 0.2964, "num_input_tokens_seen": 39702144, "step": 3395 }, { "epoch": 1.750965748132887, "grad_norm": 4.00234080349809, "learning_rate": 0.0, "loss": 0.1797, "num_input_tokens_seen": 39760664, "step": 3400 }, { "epoch": 1.750965748132887, "eval_loss": 0.8603056073188782, "eval_runtime": 16.2474, "eval_samples_per_second": 3.693, "eval_steps_per_second": 0.923, "num_input_tokens_seen": 39760664, "step": 3400 }, { "epoch": 1.750965748132887, "num_input_tokens_seen": 39760664, "step": 3400, "total_flos": 2232757993603072.0, "train_loss": 0.5904174627801951, "train_runtime": 45337.3565, "train_samples_per_second": 1.8, "train_steps_per_second": 0.075 } ], "logging_steps": 5, "max_steps": 3400, "num_input_tokens_seen": 39760664, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2232757993603072.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }