{
  "best_metric": 0.6319106221199036,
  "best_model_checkpoint": "saves/CADICA_qwenvl_stenosis_classily_scale4_frozenVision/lora/sft/checkpoint-1600",
  "epoch": 1.750965748132887,
  "eval_steps": 50,
  "global_step": 3400,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0025753283543651817,
      "grad_norm": 21.336819681898895,
      "learning_rate": 2.9411764705882355e-06,
      "loss": 3.0444,
      "num_input_tokens_seen": 58496,
      "step": 5
    },
    {
      "epoch": 0.0051506567087303634,
      "grad_norm": 20.576623155848594,
      "learning_rate": 5.882352941176471e-06,
      "loss": 2.9824,
      "num_input_tokens_seen": 116960,
      "step": 10
    },
    {
      "epoch": 0.007725985063095545,
      "grad_norm": 22.989873871108518,
      "learning_rate": 8.823529411764707e-06,
      "loss": 2.8371,
      "num_input_tokens_seen": 175448,
      "step": 15
    },
    {
      "epoch": 0.010301313417460727,
      "grad_norm": 19.533434089690918,
      "learning_rate": 1.1764705882352942e-05,
      "loss": 2.5198,
      "num_input_tokens_seen": 233944,
      "step": 20
    },
    {
      "epoch": 0.012876641771825908,
      "grad_norm": 12.509494197145006,
      "learning_rate": 1.4705882352941177e-05,
      "loss": 1.772,
      "num_input_tokens_seen": 292416,
      "step": 25
    },
    {
      "epoch": 0.01545197012619109,
      "grad_norm": 3.6901887027066667,
      "learning_rate": 1.7647058823529414e-05,
      "loss": 1.2263,
      "num_input_tokens_seen": 350904,
      "step": 30
    },
    {
      "epoch": 0.018027298480556272,
      "grad_norm": 2.3996076770849744,
      "learning_rate": 2.058823529411765e-05,
      "loss": 1.0102,
      "num_input_tokens_seen": 409384,
      "step": 35
    },
    {
      "epoch": 0.020602626834921454,
      "grad_norm": 0.9253415848864577,
      "learning_rate": 2.3529411764705884e-05,
      "loss": 0.9378,
      "num_input_tokens_seen": 467864,
      "step": 40
    },
    {
      "epoch": 0.023177955189286635,
      "grad_norm": 1.1966244115097795,
      "learning_rate": 2.647058823529412e-05,
      "loss": 0.9265,
      "num_input_tokens_seen": 526384,
      "step": 45
    },
    {
      "epoch": 0.025753283543651816,
      "grad_norm": 1.853648349752417,
      "learning_rate": 2.9411764705882354e-05,
      "loss": 0.9157,
      "num_input_tokens_seen": 584856,
      "step": 50
    },
    {
      "epoch": 0.025753283543651816,
      "eval_loss": 0.9191630482673645,
      "eval_runtime": 36.6123,
      "eval_samples_per_second": 1.639,
      "eval_steps_per_second": 0.41,
      "num_input_tokens_seen": 584856,
      "step": 50
    },
    {
      "epoch": 0.028328611898016998,
      "grad_norm": 0.8294990584587586,
      "learning_rate": 3.235294117647059e-05,
      "loss": 0.9009,
      "num_input_tokens_seen": 643344,
      "step": 55
    },
    {
      "epoch": 0.03090394025238218,
      "grad_norm": 0.8278765532866457,
      "learning_rate": 3.529411764705883e-05,
      "loss": 0.9063,
      "num_input_tokens_seen": 701808,
      "step": 60
    },
    {
      "epoch": 0.03347926860674736,
      "grad_norm": 0.7285901101792476,
      "learning_rate": 3.8235294117647055e-05,
      "loss": 0.9031,
      "num_input_tokens_seen": 760304,
      "step": 65
    },
    {
      "epoch": 0.036054596961112545,
      "grad_norm": 0.5341783688819233,
      "learning_rate": 4.11764705882353e-05,
      "loss": 0.8991,
      "num_input_tokens_seen": 818760,
      "step": 70
    },
    {
      "epoch": 0.03862992531547772,
      "grad_norm": 0.46059313680988906,
      "learning_rate": 4.411764705882353e-05,
      "loss": 0.9055,
      "num_input_tokens_seen": 877256,
      "step": 75
    },
    {
      "epoch": 0.04120525366984291,
      "grad_norm": 0.8194379237293679,
      "learning_rate": 4.705882352941177e-05,
      "loss": 0.9092,
      "num_input_tokens_seen": 935752,
      "step": 80
    },
    {
      "epoch": 0.043780582024208085,
      "grad_norm": 0.6745093544830881,
      "learning_rate": 5e-05,
      "loss": 0.9069,
      "num_input_tokens_seen": 994216,
      "step": 85
    },
    {
      "epoch": 0.04635591037857327,
      "grad_norm": 0.2894672897884604,
      "learning_rate": 5.294117647058824e-05,
      "loss": 0.8924,
      "num_input_tokens_seen": 1052704,
      "step": 90
    },
    {
      "epoch": 0.04893123873293845,
      "grad_norm": 0.5108489024576455,
      "learning_rate": 5.588235294117647e-05,
      "loss": 0.9059,
      "num_input_tokens_seen": 1111176,
      "step": 95
    },
    {
      "epoch": 0.05150656708730363,
      "grad_norm": 0.40317180386305224,
      "learning_rate": 5.882352941176471e-05,
      "loss": 0.901,
      "num_input_tokens_seen": 1169664,
      "step": 100
    },
    {
      "epoch": 0.05150656708730363,
      "eval_loss": 0.9077914953231812,
      "eval_runtime": 16.8879,
      "eval_samples_per_second": 3.553,
      "eval_steps_per_second": 0.888,
      "num_input_tokens_seen": 1169664,
      "step": 100
    },
    {
      "epoch": 0.05408189544166881,
      "grad_norm": 0.412918917979438,
      "learning_rate": 6.176470588235295e-05,
      "loss": 0.9159,
      "num_input_tokens_seen": 1228112,
      "step": 105
    },
    {
      "epoch": 0.056657223796033995,
      "grad_norm": 0.34797408069968117,
      "learning_rate": 6.470588235294118e-05,
      "loss": 0.91,
      "num_input_tokens_seen": 1286608,
      "step": 110
    },
    {
      "epoch": 0.05923255215039917,
      "grad_norm": 0.27558494796967653,
      "learning_rate": 6.764705882352942e-05,
      "loss": 0.9047,
      "num_input_tokens_seen": 1345072,
      "step": 115
    },
    {
      "epoch": 0.06180788050476436,
      "grad_norm": 0.5422134023513459,
      "learning_rate": 7.058823529411765e-05,
      "loss": 0.9022,
      "num_input_tokens_seen": 1403544,
      "step": 120
    },
    {
      "epoch": 0.06438320885912954,
      "grad_norm": 0.4452796218739235,
      "learning_rate": 7.352941176470589e-05,
      "loss": 0.9081,
      "num_input_tokens_seen": 1462024,
      "step": 125
    },
    {
      "epoch": 0.06695853721349472,
      "grad_norm": 0.5632558160730559,
      "learning_rate": 7.647058823529411e-05,
      "loss": 0.8939,
      "num_input_tokens_seen": 1520528,
      "step": 130
    },
    {
      "epoch": 0.0695338655678599,
      "grad_norm": 0.3383115884436812,
      "learning_rate": 7.941176470588235e-05,
      "loss": 0.9029,
      "num_input_tokens_seen": 1579024,
      "step": 135
    },
    {
      "epoch": 0.07210919392222509,
      "grad_norm": 0.3506611095466577,
      "learning_rate": 8.23529411764706e-05,
      "loss": 0.9014,
      "num_input_tokens_seen": 1637504,
      "step": 140
    },
    {
      "epoch": 0.07468452227659027,
      "grad_norm": 0.6328034405712752,
      "learning_rate": 8.529411764705883e-05,
      "loss": 0.9053,
      "num_input_tokens_seen": 1696024,
      "step": 145
    },
    {
      "epoch": 0.07725985063095545,
      "grad_norm": 0.3511657661506363,
      "learning_rate": 8.823529411764706e-05,
      "loss": 0.9032,
      "num_input_tokens_seen": 1754512,
      "step": 150
    },
    {
      "epoch": 0.07725985063095545,
      "eval_loss": 0.8962129950523376,
      "eval_runtime": 17.0673,
      "eval_samples_per_second": 3.515,
      "eval_steps_per_second": 0.879,
      "num_input_tokens_seen": 1754512,
      "step": 150
    },
    {
      "epoch": 0.07983517898532062,
      "grad_norm": 0.4047681172482029,
      "learning_rate": 9.11764705882353e-05,
      "loss": 0.8985,
      "num_input_tokens_seen": 1812976,
      "step": 155
    },
    {
      "epoch": 0.08241050733968582,
      "grad_norm": 0.37729033726569733,
      "learning_rate": 9.411764705882353e-05,
      "loss": 0.8949,
      "num_input_tokens_seen": 1871464,
      "step": 160
    },
    {
      "epoch": 0.08498583569405099,
      "grad_norm": 0.4655744785034158,
      "learning_rate": 9.705882352941177e-05,
      "loss": 0.9069,
      "num_input_tokens_seen": 1929928,
      "step": 165
    },
    {
      "epoch": 0.08756116404841617,
      "grad_norm": 0.30643056878817176,
      "learning_rate": 0.0001,
      "loss": 0.9049,
      "num_input_tokens_seen": 1988432,
      "step": 170
    },
    {
      "epoch": 0.09013649240278135,
      "grad_norm": 0.39944696269496754,
      "learning_rate": 9.999940874631277e-05,
      "loss": 0.9026,
      "num_input_tokens_seen": 2046920,
      "step": 175
    },
    {
      "epoch": 0.09271182075714654,
      "grad_norm": 0.31301259106593154,
      "learning_rate": 9.999763499923432e-05,
      "loss": 0.8984,
      "num_input_tokens_seen": 2105392,
      "step": 180
    },
    {
      "epoch": 0.09528714911151172,
      "grad_norm": 0.4309753054454554,
      "learning_rate": 9.999467880071402e-05,
      "loss": 0.9057,
      "num_input_tokens_seen": 2163872,
      "step": 185
    },
    {
      "epoch": 0.0978624774658769,
      "grad_norm": 0.262930252305763,
      "learning_rate": 9.999054022066641e-05,
      "loss": 0.9078,
      "num_input_tokens_seen": 2222352,
      "step": 190
    },
    {
      "epoch": 0.10043780582024209,
      "grad_norm": 0.22073598270887426,
      "learning_rate": 9.998521935696953e-05,
      "loss": 0.9028,
      "num_input_tokens_seen": 2280800,
      "step": 195
    },
    {
      "epoch": 0.10301313417460727,
      "grad_norm": 0.23764668792524696,
      "learning_rate": 9.997871633546257e-05,
      "loss": 0.9053,
      "num_input_tokens_seen": 2339304,
      "step": 200
    },
    {
      "epoch": 0.10301313417460727,
      "eval_loss": 0.8982028961181641,
      "eval_runtime": 16.9118,
      "eval_samples_per_second": 3.548,
      "eval_steps_per_second": 0.887,
      "num_input_tokens_seen": 2339304,
      "step": 200
    },
    {
      "epoch": 0.10558846252897244,
      "grad_norm": 0.6222576114383499,
      "learning_rate": 9.997103130994296e-05,
      "loss": 0.9003,
      "num_input_tokens_seen": 2397808,
      "step": 205
    },
    {
      "epoch": 0.10816379088333762,
      "grad_norm": 0.2983149992592585,
      "learning_rate": 9.996216446216267e-05,
      "loss": 0.8969,
      "num_input_tokens_seen": 2456288,
      "step": 210
    },
    {
      "epoch": 0.11073911923770281,
      "grad_norm": 0.3505370510576513,
      "learning_rate": 9.995211600182397e-05,
      "loss": 0.9114,
      "num_input_tokens_seen": 2514784,
      "step": 215
    },
    {
      "epoch": 0.11331444759206799,
      "grad_norm": 0.3683806652106065,
      "learning_rate": 9.994088616657444e-05,
      "loss": 0.899,
      "num_input_tokens_seen": 2573240,
      "step": 220
    },
    {
      "epoch": 0.11588977594643317,
      "grad_norm": 0.21111769827155855,
      "learning_rate": 9.992847522200133e-05,
      "loss": 0.898,
      "num_input_tokens_seen": 2631672,
      "step": 225
    },
    {
      "epoch": 0.11846510430079835,
      "grad_norm": 0.3426987181783304,
      "learning_rate": 9.99148834616253e-05,
      "loss": 0.9006,
      "num_input_tokens_seen": 2690112,
      "step": 230
    },
    {
      "epoch": 0.12104043265516354,
      "grad_norm": 0.236983209071443,
      "learning_rate": 9.990011120689351e-05,
      "loss": 0.8973,
      "num_input_tokens_seen": 2748608,
      "step": 235
    },
    {
      "epoch": 0.12361576100952872,
      "grad_norm": 0.4575208248826409,
      "learning_rate": 9.988415880717194e-05,
      "loss": 0.8885,
      "num_input_tokens_seen": 2807080,
      "step": 240
    },
    {
      "epoch": 0.1261910893638939,
      "grad_norm": 0.5470317919414993,
      "learning_rate": 9.986702663973722e-05,
      "loss": 0.9066,
      "num_input_tokens_seen": 2865520,
      "step": 245
    },
    {
      "epoch": 0.12876641771825909,
      "grad_norm": 0.4992479706331095,
      "learning_rate": 9.98487151097676e-05,
      "loss": 0.9098,
      "num_input_tokens_seen": 2924016,
      "step": 250
    },
    {
      "epoch": 0.12876641771825909,
      "eval_loss": 0.8956434726715088,
      "eval_runtime": 17.4804,
      "eval_samples_per_second": 3.432,
      "eval_steps_per_second": 0.858,
      "num_input_tokens_seen": 2924016,
      "step": 250
    },
    {
      "epoch": 0.13134174607262425,
      "grad_norm": 0.3762164361984238,
      "learning_rate": 9.98292246503335e-05,
      "loss": 0.8987,
      "num_input_tokens_seen": 2982520,
      "step": 255
    },
    {
      "epoch": 0.13391707442698944,
      "grad_norm": 0.6447043002410199,
      "learning_rate": 9.980855572238714e-05,
      "loss": 0.9036,
      "num_input_tokens_seen": 3041008,
      "step": 260
    },
    {
      "epoch": 0.13649240278135463,
      "grad_norm": 0.5308092769971742,
      "learning_rate": 9.978670881475172e-05,
      "loss": 0.8961,
      "num_input_tokens_seen": 3099464,
      "step": 265
    },
    {
      "epoch": 0.1390677311357198,
      "grad_norm": 0.508333330469703,
      "learning_rate": 9.976368444410985e-05,
      "loss": 0.9012,
      "num_input_tokens_seen": 3157944,
      "step": 270
    },
    {
      "epoch": 0.141643059490085,
      "grad_norm": 0.6801788563719119,
      "learning_rate": 9.973948315499126e-05,
      "loss": 0.8985,
      "num_input_tokens_seen": 3216448,
      "step": 275
    },
    {
      "epoch": 0.14421838784445018,
      "grad_norm": 0.6933074703933572,
      "learning_rate": 9.971410551976002e-05,
      "loss": 0.9114,
      "num_input_tokens_seen": 3274928,
      "step": 280
    },
    {
      "epoch": 0.14679371619881534,
      "grad_norm": 0.21208820897494882,
      "learning_rate": 9.968755213860094e-05,
      "loss": 0.8886,
      "num_input_tokens_seen": 3333408,
      "step": 285
    },
    {
      "epoch": 0.14936904455318054,
      "grad_norm": 0.5791422669000065,
      "learning_rate": 9.96598236395054e-05,
      "loss": 0.8929,
      "num_input_tokens_seen": 3391896,
      "step": 290
    },
    {
      "epoch": 0.1519443729075457,
      "grad_norm": 0.3460368893191152,
      "learning_rate": 9.96309206782565e-05,
      "loss": 0.9091,
      "num_input_tokens_seen": 3450392,
      "step": 295
    },
    {
      "epoch": 0.1545197012619109,
      "grad_norm": 0.22425222135997747,
      "learning_rate": 9.960084393841355e-05,
      "loss": 0.8893,
      "num_input_tokens_seen": 3508888,
      "step": 300
    },
    {
      "epoch": 0.1545197012619109,
      "eval_loss": 0.8908902406692505,
      "eval_runtime": 16.9521,
      "eval_samples_per_second": 3.539,
      "eval_steps_per_second": 0.885,
      "num_input_tokens_seen": 3508888,
      "step": 300
    },
    {
      "epoch": 0.15709502961627608,
      "grad_norm": 0.23111596622064604,
      "learning_rate": 9.956959413129585e-05,
      "loss": 0.9056,
      "num_input_tokens_seen": 3567368,
      "step": 305
    },
    {
      "epoch": 0.15967035797064125,
      "grad_norm": 0.3918406894807393,
      "learning_rate": 9.953717199596598e-05,
      "loss": 0.8982,
      "num_input_tokens_seen": 3625848,
      "step": 310
    },
    {
      "epoch": 0.16224568632500644,
      "grad_norm": 0.22081666860189372,
      "learning_rate": 9.95035782992122e-05,
      "loss": 0.8968,
      "num_input_tokens_seen": 3684336,
      "step": 315
    },
    {
      "epoch": 0.16482101467937163,
      "grad_norm": 0.18024383676398176,
      "learning_rate": 9.94688138355304e-05,
      "loss": 0.8975,
      "num_input_tokens_seen": 3742800,
      "step": 320
    },
    {
      "epoch": 0.1673963430337368,
      "grad_norm": 0.3866897344302321,
      "learning_rate": 9.943287942710527e-05,
      "loss": 0.9061,
      "num_input_tokens_seen": 3801280,
      "step": 325
    },
    {
      "epoch": 0.16997167138810199,
      "grad_norm": 0.4804151381712559,
      "learning_rate": 9.939577592379088e-05,
      "loss": 0.8948,
      "num_input_tokens_seen": 3859792,
      "step": 330
    },
    {
      "epoch": 0.17254699974246718,
      "grad_norm": 0.35878231707669056,
      "learning_rate": 9.935750420309055e-05,
      "loss": 0.9063,
      "num_input_tokens_seen": 3918272,
      "step": 335
    },
    {
      "epoch": 0.17512232809683234,
      "grad_norm": 0.8713957774909928,
      "learning_rate": 9.931806517013612e-05,
      "loss": 0.8952,
      "num_input_tokens_seen": 3976760,
      "step": 340
    },
    {
      "epoch": 0.17769765645119753,
      "grad_norm": 0.6671526212854116,
      "learning_rate": 9.927745975766654e-05,
      "loss": 0.9136,
      "num_input_tokens_seen": 4035240,
      "step": 345
    },
    {
      "epoch": 0.1802729848055627,
      "grad_norm": 0.28702679234521244,
      "learning_rate": 9.923568892600578e-05,
      "loss": 0.9075,
      "num_input_tokens_seen": 4093688,
      "step": 350
    },
    {
      "epoch": 0.1802729848055627,
      "eval_loss": 0.89204341173172,
      "eval_runtime": 16.5819,
      "eval_samples_per_second": 3.618,
      "eval_steps_per_second": 0.905,
      "num_input_tokens_seen": 4093688,
      "step": 350
    },
    {
      "epoch": 0.1828483131599279,
      "grad_norm": 0.32233149132200706,
      "learning_rate": 9.91927536630402e-05,
      "loss": 0.8812,
      "num_input_tokens_seen": 4152160,
      "step": 355
    },
    {
      "epoch": 0.18542364151429308,
      "grad_norm": 0.5071871697326992,
      "learning_rate": 9.91486549841951e-05,
      "loss": 0.9109,
      "num_input_tokens_seen": 4210648,
      "step": 360
    },
    {
      "epoch": 0.18799896986865824,
      "grad_norm": 0.4532792519849944,
      "learning_rate": 9.91033939324107e-05,
      "loss": 0.9176,
      "num_input_tokens_seen": 4269136,
      "step": 365
    },
    {
      "epoch": 0.19057429822302344,
      "grad_norm": 0.5409761562534501,
      "learning_rate": 9.905697157811761e-05,
      "loss": 0.9077,
      "num_input_tokens_seen": 4327664,
      "step": 370
    },
    {
      "epoch": 0.19314962657738863,
      "grad_norm": 0.3432361562809093,
      "learning_rate": 9.900938901921131e-05,
      "loss": 0.893,
      "num_input_tokens_seen": 4386120,
      "step": 375
    },
    {
      "epoch": 0.1957249549317538,
      "grad_norm": 0.4756530294720616,
      "learning_rate": 9.896064738102635e-05,
      "loss": 0.9094,
      "num_input_tokens_seen": 4444560,
      "step": 380
    },
    {
      "epoch": 0.19830028328611898,
      "grad_norm": 0.424836974193983,
      "learning_rate": 9.891074781630966e-05,
      "loss": 0.9091,
      "num_input_tokens_seen": 4503016,
      "step": 385
    },
    {
      "epoch": 0.20087561164048418,
      "grad_norm": 0.31316926977469683,
      "learning_rate": 9.885969150519331e-05,
      "loss": 0.9033,
      "num_input_tokens_seen": 4561496,
      "step": 390
    },
    {
      "epoch": 0.20345093999484934,
      "grad_norm": 0.6108378682480797,
      "learning_rate": 9.88074796551666e-05,
      "loss": 0.8851,
      "num_input_tokens_seen": 4619944,
      "step": 395
    },
    {
      "epoch": 0.20602626834921453,
      "grad_norm": 0.38294566619219206,
      "learning_rate": 9.875411350104744e-05,
      "loss": 0.9004,
      "num_input_tokens_seen": 4678384,
      "step": 400
    },
    {
      "epoch": 0.20602626834921453,
      "eval_loss": 0.9086406826972961,
      "eval_runtime": 16.7827,
      "eval_samples_per_second": 3.575,
      "eval_steps_per_second": 0.894,
      "num_input_tokens_seen": 4678384,
      "step": 400
    },
    {
      "epoch": 0.2086015967035797,
      "grad_norm": 0.4283475401297436,
      "learning_rate": 9.86995943049533e-05,
      "loss": 0.8976,
      "num_input_tokens_seen": 4736904,
      "step": 405
    },
    {
      "epoch": 0.2111769250579449,
      "grad_norm": 0.40329738287583206,
      "learning_rate": 9.864392335627117e-05,
      "loss": 0.9134,
      "num_input_tokens_seen": 4795376,
      "step": 410
    },
    {
      "epoch": 0.21375225341231008,
      "grad_norm": 0.37890634863656475,
      "learning_rate": 9.858710197162721e-05,
      "loss": 0.8955,
      "num_input_tokens_seen": 4853880,
      "step": 415
    },
    {
      "epoch": 0.21632758176667524,
      "grad_norm": 0.32402245835420784,
      "learning_rate": 9.852913149485556e-05,
      "loss": 0.9014,
      "num_input_tokens_seen": 4912360,
      "step": 420
    },
    {
      "epoch": 0.21890291012104043,
      "grad_norm": 0.49572499508345125,
      "learning_rate": 9.847001329696653e-05,
      "loss": 0.9065,
      "num_input_tokens_seen": 4970872,
      "step": 425
    },
    {
      "epoch": 0.22147823847540563,
      "grad_norm": 0.11883567118448765,
      "learning_rate": 9.840974877611422e-05,
      "loss": 0.8952,
      "num_input_tokens_seen": 5029304,
      "step": 430
    },
    {
      "epoch": 0.2240535668297708,
      "grad_norm": 0.7105724703149633,
      "learning_rate": 9.834833935756344e-05,
      "loss": 0.9106,
      "num_input_tokens_seen": 5087800,
      "step": 435
    },
    {
      "epoch": 0.22662889518413598,
      "grad_norm": 0.708953365388227,
      "learning_rate": 9.828578649365601e-05,
      "loss": 0.8996,
      "num_input_tokens_seen": 5146312,
      "step": 440
    },
    {
      "epoch": 0.22920422353850115,
      "grad_norm": 0.4503080730364326,
      "learning_rate": 9.822209166377635e-05,
      "loss": 0.8999,
      "num_input_tokens_seen": 5204800,
      "step": 445
    },
    {
      "epoch": 0.23177955189286634,
      "grad_norm": 0.20754132336834788,
      "learning_rate": 9.815725637431662e-05,
      "loss": 0.9076,
      "num_input_tokens_seen": 5263304,
      "step": 450
    },
    {
      "epoch": 0.23177955189286634,
      "eval_loss": 0.8962157368659973,
      "eval_runtime": 17.2029,
      "eval_samples_per_second": 3.488,
      "eval_steps_per_second": 0.872,
      "num_input_tokens_seen": 5263304,
      "step": 450
    },
    {
      "epoch": 0.23435488024723153,
      "grad_norm": 0.5906403377099594,
      "learning_rate": 9.809128215864097e-05,
      "loss": 0.8942,
      "num_input_tokens_seen": 5321760,
      "step": 455
    },
    {
      "epoch": 0.2369302086015967,
      "grad_norm": 0.5706805631290568,
      "learning_rate": 9.802417057704931e-05,
      "loss": 0.9099,
      "num_input_tokens_seen": 5380224,
      "step": 460
    },
    {
      "epoch": 0.23950553695596188,
      "grad_norm": 0.164631948732384,
      "learning_rate": 9.795592321674045e-05,
      "loss": 0.8981,
      "num_input_tokens_seen": 5438704,
      "step": 465
    },
    {
      "epoch": 0.24208086531032708,
      "grad_norm": 0.32986780285522194,
      "learning_rate": 9.788654169177453e-05,
      "loss": 0.8952,
      "num_input_tokens_seen": 5497208,
      "step": 470
    },
    {
      "epoch": 0.24465619366469224,
      "grad_norm": 0.40551569446674784,
      "learning_rate": 9.781602764303487e-05,
      "loss": 0.8959,
      "num_input_tokens_seen": 5555704,
      "step": 475
    },
    {
      "epoch": 0.24723152201905743,
      "grad_norm": 0.20928586231326682,
      "learning_rate": 9.774438273818911e-05,
      "loss": 0.901,
      "num_input_tokens_seen": 5614160,
      "step": 480
    },
    {
      "epoch": 0.24980685037342262,
      "grad_norm": 0.34365307116824517,
      "learning_rate": 9.767160867164979e-05,
      "loss": 0.9008,
      "num_input_tokens_seen": 5672640,
      "step": 485
    },
    {
      "epoch": 0.2523821787277878,
      "grad_norm": 0.4212274243028996,
      "learning_rate": 9.759770716453436e-05,
      "loss": 0.9016,
      "num_input_tokens_seen": 5731072,
      "step": 490
    },
    {
      "epoch": 0.254957507082153,
      "grad_norm": 0.39823625576558597,
      "learning_rate": 9.752267996462434e-05,
      "loss": 0.9132,
      "num_input_tokens_seen": 5789544,
      "step": 495
    },
    {
      "epoch": 0.25753283543651817,
      "grad_norm": 0.24856324117583653,
      "learning_rate": 9.744652884632406e-05,
      "loss": 0.8962,
      "num_input_tokens_seen": 5848048,
      "step": 500
    },
    {
      "epoch": 0.25753283543651817,
      "eval_loss": 0.8987945914268494,
      "eval_runtime": 17.1622,
      "eval_samples_per_second": 3.496,
      "eval_steps_per_second": 0.874,
      "num_input_tokens_seen": 5848048,
      "step": 500
    },
    {
      "epoch": 0.26010816379088336,
      "grad_norm": 0.25461397268106634,
      "learning_rate": 9.736925561061871e-05,
      "loss": 0.8954,
      "num_input_tokens_seen": 5906512,
      "step": 505
    },
    {
      "epoch": 0.2626834921452485,
      "grad_norm": 0.38602603275675745,
      "learning_rate": 9.729086208503174e-05,
      "loss": 0.8927,
      "num_input_tokens_seen": 5965024,
      "step": 510
    },
    {
      "epoch": 0.2652588204996137,
      "grad_norm": 0.150082825225123,
      "learning_rate": 9.721135012358156e-05,
      "loss": 0.898,
      "num_input_tokens_seen": 6023496,
      "step": 515
    },
    {
      "epoch": 0.2678341488539789,
      "grad_norm": 0.26881662025899655,
      "learning_rate": 9.713072160673777e-05,
      "loss": 0.9016,
      "num_input_tokens_seen": 6082000,
      "step": 520
    },
    {
      "epoch": 0.2704094772083441,
      "grad_norm": 0.5039123575147229,
      "learning_rate": 9.704897844137673e-05,
      "loss": 0.8842,
      "num_input_tokens_seen": 6140480,
      "step": 525
    },
    {
      "epoch": 0.27298480556270927,
      "grad_norm": 0.27836945453098666,
      "learning_rate": 9.696612256073633e-05,
      "loss": 0.8921,
      "num_input_tokens_seen": 6198968,
      "step": 530
    },
    {
      "epoch": 0.2755601339170744,
      "grad_norm": 0.22936338891946384,
      "learning_rate": 9.688215592437039e-05,
      "loss": 0.8979,
      "num_input_tokens_seen": 6257464,
      "step": 535
    },
    {
      "epoch": 0.2781354622714396,
      "grad_norm": 0.396486857609105,
      "learning_rate": 9.679708051810221e-05,
      "loss": 0.8951,
      "num_input_tokens_seen": 6315944,
      "step": 540
    },
    {
      "epoch": 0.2807107906258048,
      "grad_norm": 0.4751226662261396,
      "learning_rate": 9.67108983539777e-05,
      "loss": 0.9149,
      "num_input_tokens_seen": 6374408,
      "step": 545
    },
    {
      "epoch": 0.28328611898017,
      "grad_norm": 0.26829103885131056,
      "learning_rate": 9.662361147021779e-05,
      "loss": 0.9013,
      "num_input_tokens_seen": 6432936,
      "step": 550
    },
    {
      "epoch": 0.28328611898017,
      "eval_loss": 0.9001271724700928,
      "eval_runtime": 16.9878,
      "eval_samples_per_second": 3.532,
      "eval_steps_per_second": 0.883,
      "num_input_tokens_seen": 6432936,
      "step": 550
    },
    {
      "epoch": 0.28586144733453517,
      "grad_norm": 0.5334970266367584,
      "learning_rate": 9.653522193117013e-05,
      "loss": 0.8981,
      "num_input_tokens_seen": 6491400,
      "step": 555
    },
    {
      "epoch": 0.28843677568890036,
      "grad_norm": 0.33261202813259866,
      "learning_rate": 9.644573182726035e-05,
      "loss": 0.9041,
      "num_input_tokens_seen": 6549872,
      "step": 560
    },
    {
      "epoch": 0.2910121040432655,
      "grad_norm": 0.19122862132727417,
      "learning_rate": 9.63551432749426e-05,
      "loss": 0.9024,
      "num_input_tokens_seen": 6608296,
      "step": 565
    },
    {
      "epoch": 0.2935874323976307,
      "grad_norm": 0.27778009425329764,
      "learning_rate": 9.626345841664953e-05,
      "loss": 0.9002,
      "num_input_tokens_seen": 6666768,
      "step": 570
    },
    {
      "epoch": 0.2961627607519959,
      "grad_norm": 0.3065314332046026,
      "learning_rate": 9.617067942074153e-05,
      "loss": 0.9035,
      "num_input_tokens_seen": 6725248,
      "step": 575
    },
    {
      "epoch": 0.29873808910636107,
      "grad_norm": 0.24431496415058412,
      "learning_rate": 9.607680848145558e-05,
      "loss": 0.9019,
      "num_input_tokens_seen": 6783680,
      "step": 580
    },
    {
      "epoch": 0.30131341746072626,
      "grad_norm": 0.27088193021301504,
      "learning_rate": 9.598184781885318e-05,
      "loss": 0.9001,
      "num_input_tokens_seen": 6842144,
      "step": 585
    },
    {
      "epoch": 0.3038887458150914,
      "grad_norm": 0.33893098113605125,
      "learning_rate": 9.588579967876806e-05,
      "loss": 0.8961,
      "num_input_tokens_seen": 6900656,
      "step": 590
    },
    {
      "epoch": 0.3064640741694566,
      "grad_norm": 0.3038921833221806,
      "learning_rate": 9.578866633275288e-05,
      "loss": 0.9,
      "num_input_tokens_seen": 6959128,
      "step": 595
    },
    {
      "epoch": 0.3090394025238218,
      "grad_norm": 0.48929637235055645,
      "learning_rate": 9.569045007802559e-05,
      "loss": 0.9046,
      "num_input_tokens_seen": 7017576,
      "step": 600
    },
    {
      "epoch": 0.3090394025238218,
      "eval_loss": 0.9053278565406799,
      "eval_runtime": 17.1218,
      "eval_samples_per_second": 3.504,
      "eval_steps_per_second": 0.876,
      "num_input_tokens_seen": 7017576,
      "step": 600
    },
    {
      "epoch": 0.311614730878187,
      "grad_norm": 0.3545950949033049,
      "learning_rate": 9.55911532374151e-05,
      "loss": 0.9019,
      "num_input_tokens_seen": 7076032,
      "step": 605
    },
    {
      "epoch": 0.31419005923255217,
      "grad_norm": 0.2355627006333952,
      "learning_rate": 9.549077815930636e-05,
      "loss": 0.8956,
      "num_input_tokens_seen": 7134536,
      "step": 610
    },
    {
      "epoch": 0.31676538758691736,
      "grad_norm": 0.17552483625655946,
      "learning_rate": 9.538932721758474e-05,
      "loss": 0.898,
      "num_input_tokens_seen": 7193032,
      "step": 615
    },
    {
      "epoch": 0.3193407159412825,
      "grad_norm": 0.1749010635522076,
      "learning_rate": 9.528680281157999e-05,
      "loss": 0.8991,
      "num_input_tokens_seen": 7251568,
      "step": 620
    },
    {
      "epoch": 0.3219160442956477,
      "grad_norm": 0.19885182954224315,
      "learning_rate": 9.518320736600943e-05,
      "loss": 0.8961,
      "num_input_tokens_seen": 7310072,
      "step": 625
    },
    {
      "epoch": 0.3244913726500129,
      "grad_norm": 0.4778756508206831,
      "learning_rate": 9.507854333092063e-05,
      "loss": 0.8994,
      "num_input_tokens_seen": 7368560,
      "step": 630
    },
    {
      "epoch": 0.32706670100437807,
      "grad_norm": 0.4123272743887767,
      "learning_rate": 9.497281318163346e-05,
      "loss": 0.8925,
      "num_input_tokens_seen": 7427040,
      "step": 635
    },
    {
      "epoch": 0.32964202935874326,
      "grad_norm": 0.34409942667705734,
      "learning_rate": 9.486601941868154e-05,
      "loss": 0.9087,
      "num_input_tokens_seen": 7485552,
      "step": 640
    },
    {
      "epoch": 0.3322173577131084,
      "grad_norm": 0.43327107411223276,
      "learning_rate": 9.475816456775313e-05,
      "loss": 0.8924,
      "num_input_tokens_seen": 7544040,
      "step": 645
    },
    {
      "epoch": 0.3347926860674736,
      "grad_norm": 0.6643023904352003,
      "learning_rate": 9.464925117963133e-05,
      "loss": 0.904,
      "num_input_tokens_seen": 7602512,
      "step": 650
    },
    {
      "epoch": 0.3347926860674736,
      "eval_loss": 0.90328449010849,
      "eval_runtime": 16.1444,
      "eval_samples_per_second": 3.716,
      "eval_steps_per_second": 0.929,
      "num_input_tokens_seen": 7602512,
      "step": 650
    },
    {
      "epoch": 0.3373680144218388,
      "grad_norm": 0.620349194493935,
      "learning_rate": 9.453928183013385e-05,
      "loss": 0.8929,
      "num_input_tokens_seen": 7660968,
      "step": 655
    },
    {
      "epoch": 0.33994334277620397,
      "grad_norm": 0.18611846349930314,
      "learning_rate": 9.442825912005202e-05,
      "loss": 0.9078,
      "num_input_tokens_seen": 7719448,
      "step": 660
    },
    {
      "epoch": 0.34251867113056916,
      "grad_norm": 0.4448289413172567,
      "learning_rate": 9.431618567508933e-05,
      "loss": 0.8963,
      "num_input_tokens_seen": 7777928,
      "step": 665
    },
    {
      "epoch": 0.34509399948493436,
      "grad_norm": 0.6187189362250411,
      "learning_rate": 9.420306414579925e-05,
      "loss": 0.9134,
      "num_input_tokens_seen": 7836424,
      "step": 670
    },
    {
      "epoch": 0.3476693278392995,
      "grad_norm": 0.35247743418537675,
      "learning_rate": 9.408889720752266e-05,
      "loss": 0.8984,
      "num_input_tokens_seen": 7894904,
      "step": 675
    },
    {
      "epoch": 0.3502446561936647,
      "grad_norm": 0.20652916455346712,
      "learning_rate": 9.397368756032445e-05,
      "loss": 0.8997,
      "num_input_tokens_seen": 7953432,
      "step": 680
    },
    {
      "epoch": 0.3528199845480299,
      "grad_norm": 0.4289996063998063,
      "learning_rate": 9.385743792892982e-05,
      "loss": 0.8926,
      "num_input_tokens_seen": 8011888,
      "step": 685
    },
    {
      "epoch": 0.35539531290239507,
      "grad_norm": 0.13764054506536547,
      "learning_rate": 9.374015106265968e-05,
      "loss": 0.9008,
      "num_input_tokens_seen": 8070344,
      "step": 690
    },
    {
      "epoch": 0.35797064125676026,
      "grad_norm": 0.22142459689499855,
      "learning_rate": 9.362182973536569e-05,
      "loss": 0.8986,
      "num_input_tokens_seen": 8128816,
      "step": 695
    },
    {
      "epoch": 0.3605459696111254,
      "grad_norm": 0.3234539650829873,
      "learning_rate": 9.35024767453647e-05,
      "loss": 0.8972,
      "num_input_tokens_seen": 8187320,
      "step": 700
    },
    {
      "epoch": 0.3605459696111254,
      "eval_loss": 0.9028835892677307,
      "eval_runtime": 16.1635,
      "eval_samples_per_second": 3.712,
      "eval_steps_per_second": 0.928,
      "num_input_tokens_seen": 8187320,
      "step": 700
    },
    {
      "epoch": 0.3631212979654906,
      "grad_norm": 0.3215674690491891,
      "learning_rate": 9.338209491537257e-05,
      "loss": 0.8998,
      "num_input_tokens_seen": 8245776,
      "step": 705
    },
    {
      "epoch": 0.3656966263198558,
      "grad_norm": 0.36428692362396536,
      "learning_rate": 9.326068709243727e-05,
      "loss": 0.8999,
      "num_input_tokens_seen": 8304280,
      "step": 710
    },
    {
      "epoch": 0.36827195467422097,
      "grad_norm": 0.280459809393624,
      "learning_rate": 9.313825614787177e-05,
      "loss": 0.8983,
      "num_input_tokens_seen": 8362728,
      "step": 715
    },
    {
      "epoch": 0.37084728302858616,
      "grad_norm": 0.1819339731162554,
      "learning_rate": 9.301480497718593e-05,
      "loss": 0.892,
      "num_input_tokens_seen": 8421224,
      "step": 720
    },
    {
      "epoch": 0.37342261138295135,
      "grad_norm": 0.23784840563699303,
      "learning_rate": 9.289033650001817e-05,
      "loss": 0.9034,
      "num_input_tokens_seen": 8479720,
      "step": 725
    },
    {
      "epoch": 0.3759979397373165,
      "grad_norm": 0.24070744588741375,
      "learning_rate": 9.276485366006634e-05,
      "loss": 0.895,
      "num_input_tokens_seen": 8538192,
      "step": 730
    },
    {
      "epoch": 0.3785732680916817,
      "grad_norm": 0.24846723619231478,
      "learning_rate": 9.263835942501807e-05,
      "loss": 0.8973,
      "num_input_tokens_seen": 8596664,
      "step": 735
    },
    {
      "epoch": 0.3811485964460469,
      "grad_norm": 0.2601614440419362,
      "learning_rate": 9.251085678648072e-05,
      "loss": 0.8972,
      "num_input_tokens_seen": 8655128,
      "step": 740
    },
    {
      "epoch": 0.38372392480041206,
      "grad_norm": 0.30194733839751087,
      "learning_rate": 9.238234875991046e-05,
      "loss": 0.8987,
      "num_input_tokens_seen": 8713624,
      "step": 745
    },
    {
      "epoch": 0.38629925315477726,
      "grad_norm": 0.3015609177439829,
      "learning_rate": 9.225283838454111e-05,
      "loss": 0.9005,
      "num_input_tokens_seen": 8772104,
      "step": 750
    },
    {
      "epoch": 0.38629925315477726,
      "eval_loss": 0.8981761336326599,
      "eval_runtime": 16.0177,
      "eval_samples_per_second": 3.746,
      "eval_steps_per_second": 0.936,
      "num_input_tokens_seen": 8772104,
      "step": 750
    },
    {
      "epoch": 0.3888745815091424,
      "grad_norm": 0.44991480631292463,
      "learning_rate": 9.21223287233121e-05,
      "loss": 0.8973,
      "num_input_tokens_seen": 8830568,
      "step": 755
    },
    {
      "epoch": 0.3914499098635076,
      "grad_norm": 0.22570310903133853,
      "learning_rate": 9.199082286279622e-05,
      "loss": 0.8974,
      "num_input_tokens_seen": 8889072,
      "step": 760
    },
    {
      "epoch": 0.3940252382178728,
      "grad_norm": 0.22090133233732026,
      "learning_rate": 9.185832391312644e-05,
      "loss": 0.8985,
      "num_input_tokens_seen": 8947568,
      "step": 765
    },
    {
      "epoch": 0.39660056657223797,
      "grad_norm": 0.23738058530347297,
      "learning_rate": 9.172483500792244e-05,
      "loss": 0.8935,
      "num_input_tokens_seen": 9006056,
      "step": 770
    },
    {
      "epoch": 0.39917589492660316,
      "grad_norm": 0.41232659301572594,
      "learning_rate": 9.159035930421658e-05,
      "loss": 0.8985,
      "num_input_tokens_seen": 9064592,
      "step": 775
    },
    {
      "epoch": 0.40175122328096835,
      "grad_norm": 0.2004855543001356,
      "learning_rate": 9.145489998237902e-05,
      "loss": 0.9105,
      "num_input_tokens_seen": 9123096,
      "step": 780
    },
    {
      "epoch": 0.4043265516353335,
      "grad_norm": 0.16209487510237375,
      "learning_rate": 9.131846024604274e-05,
      "loss": 0.8925,
      "num_input_tokens_seen": 9181576,
      "step": 785
    },
    {
      "epoch": 0.4069018799896987,
      "grad_norm": 0.24319930530142153,
      "learning_rate": 9.11810433220276e-05,
      "loss": 0.8955,
      "num_input_tokens_seen": 9240048,
      "step": 790
    },
    {
      "epoch": 0.40947720834406387,
      "grad_norm": 0.24311562892750557,
      "learning_rate": 9.104265246026415e-05,
      "loss": 0.8986,
      "num_input_tokens_seen": 9298528,
      "step": 795
    },
    {
      "epoch": 0.41205253669842906,
      "grad_norm": 0.2891177185942039,
      "learning_rate": 9.090329093371666e-05,
      "loss": 0.8881,
      "num_input_tokens_seen": 9357016,
      "step": 800
    },
    {
      "epoch": 0.41205253669842906,
      "eval_loss": 0.8973079919815063,
      "eval_runtime": 16.1396,
      "eval_samples_per_second": 3.718,
      "eval_steps_per_second": 0.929,
      "num_input_tokens_seen": 9357016,
      "step": 800
    },
    {
      "epoch": 0.41462786505279425,
      "grad_norm": 0.4728970278357675,
      "learning_rate": 9.076296203830579e-05,
      "loss": 0.8798,
      "num_input_tokens_seen": 9415480,
      "step": 805
    },
    {
      "epoch": 0.4172031934071594,
      "grad_norm": 0.2420351489416807,
      "learning_rate": 9.062166909283062e-05,
      "loss": 0.9104,
      "num_input_tokens_seen": 9473928,
      "step": 810
    },
    {
      "epoch": 0.4197785217615246,
      "grad_norm": 0.2262623911682871,
      "learning_rate": 9.047941543889014e-05,
      "loss": 0.9007,
      "num_input_tokens_seen": 9532408,
      "step": 815
    },
    {
      "epoch": 0.4223538501158898,
      "grad_norm": 0.18258980329217392,
      "learning_rate": 9.033620444080428e-05,
      "loss": 0.8974,
      "num_input_tokens_seen": 9590920,
      "step": 820
    },
    {
      "epoch": 0.42492917847025496,
      "grad_norm": 0.2898762949979446,
      "learning_rate": 9.019203948553422e-05,
      "loss": 0.8992,
      "num_input_tokens_seen": 9649400,
      "step": 825
    },
    {
      "epoch": 0.42750450682462016,
      "grad_norm": 0.3884592601874919,
      "learning_rate": 9.004692398260244e-05,
      "loss": 0.8991,
      "num_input_tokens_seen": 9707888,
      "step": 830
    },
    {
      "epoch": 0.43007983517898535,
      "grad_norm": 0.24055719869667014,
      "learning_rate": 8.9900861364012e-05,
      "loss": 0.8964,
      "num_input_tokens_seen": 9766384,
      "step": 835
    },
    {
      "epoch": 0.4326551635333505,
      "grad_norm": 0.4482774361285702,
      "learning_rate": 8.975385508416532e-05,
      "loss": 0.8723,
      "num_input_tokens_seen": 9824896,
      "step": 840
    },
    {
      "epoch": 0.4352304918877157,
      "grad_norm": 0.4612030185875055,
      "learning_rate": 8.960590861978265e-05,
      "loss": 0.874,
      "num_input_tokens_seen": 9883408,
      "step": 845
    },
    {
      "epoch": 0.43780582024208087,
      "grad_norm": 0.44197834194509644,
      "learning_rate": 8.945702546981969e-05,
      "loss": 0.9035,
      "num_input_tokens_seen": 9941896,
      "step": 850
    },
    {
      "epoch": 0.43780582024208087,
      "eval_loss": 0.8779178261756897,
      "eval_runtime": 16.159,
      "eval_samples_per_second": 3.713,
      "eval_steps_per_second": 0.928,
      "num_input_tokens_seen": 9941896,
      "step": 850
    },
    {
      "epoch": 0.44038114859644606,
      "grad_norm": 0.8207188524660312,
      "learning_rate": 8.930720915538487e-05,
      "loss": 0.8516,
      "num_input_tokens_seen": 10000336,
      "step": 855
    },
    {
      "epoch": 0.44295647695081125,
      "grad_norm": 1.5881804699369033,
      "learning_rate": 8.915646321965614e-05,
      "loss": 0.9206,
      "num_input_tokens_seen": 10058816,
      "step": 860
    },
    {
      "epoch": 0.4455318053051764,
      "grad_norm": 0.3364043503653687,
      "learning_rate": 8.900479122779712e-05,
      "loss": 0.9028,
      "num_input_tokens_seen": 10117320,
      "step": 865
    },
    {
      "epoch": 0.4481071336595416,
      "grad_norm": 0.2888069815557639,
      "learning_rate": 8.885219676687277e-05,
      "loss": 0.8991,
      "num_input_tokens_seen": 10175824,
      "step": 870
    },
    {
      "epoch": 0.45068246201390677,
      "grad_norm": 0.26081919755231314,
      "learning_rate": 8.869868344576459e-05,
      "loss": 0.8934,
      "num_input_tokens_seen": 10234288,
      "step": 875
    },
    {
      "epoch": 0.45325779036827196,
      "grad_norm": 0.1672074260476841,
      "learning_rate": 8.854425489508532e-05,
      "loss": 0.8908,
      "num_input_tokens_seen": 10292736,
      "step": 880
    },
    {
      "epoch": 0.45583311872263715,
      "grad_norm": 0.3141498425127344,
      "learning_rate": 8.838891476709288e-05,
      "loss": 0.8988,
      "num_input_tokens_seen": 10351224,
      "step": 885
    },
    {
      "epoch": 0.4584084470770023,
      "grad_norm": 0.28442383194638554,
      "learning_rate": 8.823266673560426e-05,
      "loss": 0.8965,
      "num_input_tokens_seen": 10409736,
      "step": 890
    },
    {
      "epoch": 0.4609837754313675,
      "grad_norm": 0.24793143025843287,
      "learning_rate": 8.807551449590846e-05,
      "loss": 0.8989,
      "num_input_tokens_seen": 10468240,
      "step": 895
    },
    {
      "epoch": 0.4635591037857327,
      "grad_norm": 0.18173090045802157,
      "learning_rate": 8.791746176467907e-05,
      "loss": 0.8961,
      "num_input_tokens_seen": 10526712,
      "step": 900
    },
    {
      "epoch": 0.4635591037857327,
      "eval_loss": 0.891426146030426,
      "eval_runtime": 16.0357,
      "eval_samples_per_second": 3.742,
      "eval_steps_per_second": 0.935,
      "num_input_tokens_seen": 10526712,
      "step": 900
    },
    {
      "epoch": 0.46613443214009787,
      "grad_norm": 0.18755280770432675,
      "learning_rate": 8.775851227988656e-05,
      "loss": 0.8955,
      "num_input_tokens_seen": 10585232,
      "step": 905
    },
    {
      "epoch": 0.46870976049446306,
      "grad_norm": 0.16684040416821233,
      "learning_rate": 8.759866980070963e-05,
      "loss": 0.8951,
      "num_input_tokens_seen": 10643728,
      "step": 910
    },
    {
      "epoch": 0.47128508884882825,
      "grad_norm": 0.33346521793095785,
      "learning_rate": 8.743793810744654e-05,
      "loss": 0.8951,
      "num_input_tokens_seen": 10702240,
      "step": 915
    },
    {
      "epoch": 0.4738604172031934,
      "grad_norm": 0.23650054707790025,
      "learning_rate": 8.727632100142551e-05,
      "loss": 0.9066,
      "num_input_tokens_seen": 10760656,
      "step": 920
    },
    {
      "epoch": 0.4764357455575586,
      "grad_norm": 0.20217442955339224,
      "learning_rate": 8.711382230491493e-05,
      "loss": 0.8953,
      "num_input_tokens_seen": 10819128,
      "step": 925
    },
    {
      "epoch": 0.47901107391192377,
      "grad_norm": 0.1648307621403396,
      "learning_rate": 8.695044586103296e-05,
      "loss": 0.8961,
      "num_input_tokens_seen": 10877600,
      "step": 930
    },
    {
      "epoch": 0.48158640226628896,
      "grad_norm": 0.25983065938238986,
      "learning_rate": 8.678619553365659e-05,
      "loss": 0.8965,
      "num_input_tokens_seen": 10936088,
      "step": 935
    },
    {
      "epoch": 0.48416173062065415,
      "grad_norm": 0.17882463002474594,
      "learning_rate": 8.662107520733027e-05,
      "loss": 0.9018,
      "num_input_tokens_seen": 10994560,
      "step": 940
    },
    {
      "epoch": 0.4867370589750193,
      "grad_norm": 0.14644012846994445,
      "learning_rate": 8.64550887871741e-05,
      "loss": 0.8944,
      "num_input_tokens_seen": 11053016,
      "step": 945
    },
    {
      "epoch": 0.4893123873293845,
      "grad_norm": 0.23751630760966444,
      "learning_rate": 8.628824019879137e-05,
      "loss": 0.8852,
      "num_input_tokens_seen": 11111520,
      "step": 950
    },
    {
      "epoch": 0.4893123873293845,
      "eval_loss": 0.8915690183639526,
      "eval_runtime": 16.2589,
      "eval_samples_per_second": 3.69,
      "eval_steps_per_second": 0.923,
      "num_input_tokens_seen": 11111520,
      "step": 950
    },
    {
      "epoch": 0.49188771568374967,
      "grad_norm": 0.3904846319143667,
      "learning_rate": 8.612053338817581e-05,
      "loss": 0.9087,
      "num_input_tokens_seen": 11170016,
      "step": 955
    },
    {
      "epoch": 0.49446304403811486,
      "grad_norm": 0.44920450892911645,
      "learning_rate": 8.595197232161824e-05,
      "loss": 0.8915,
      "num_input_tokens_seen": 11228496,
      "step": 960
    },
    {
      "epoch": 0.49703837239248005,
      "grad_norm": 0.6093857047738649,
      "learning_rate": 8.578256098561275e-05,
      "loss": 0.8836,
      "num_input_tokens_seen": 11286928,
      "step": 965
    },
    {
      "epoch": 0.49961370074684525,
      "grad_norm": 0.6282945106836194,
      "learning_rate": 8.561230338676239e-05,
      "loss": 0.9116,
      "num_input_tokens_seen": 11345400,
      "step": 970
    },
    {
      "epoch": 0.5021890291012104,
      "grad_norm": 0.3187294296147391,
      "learning_rate": 8.544120355168451e-05,
      "loss": 0.8809,
      "num_input_tokens_seen": 11403912,
      "step": 975
    },
    {
      "epoch": 0.5047643574555756,
      "grad_norm": 0.4019889420836467,
      "learning_rate": 8.526926552691544e-05,
      "loss": 0.8895,
      "num_input_tokens_seen": 11462344,
      "step": 980
    },
    {
      "epoch": 0.5073396858099408,
      "grad_norm": 0.4762279449607594,
      "learning_rate": 8.509649337881483e-05,
      "loss": 0.8674,
      "num_input_tokens_seen": 11520808,
      "step": 985
    },
    {
      "epoch": 0.509915014164306,
      "grad_norm": 1.7062273050040726,
      "learning_rate": 8.492289119346943e-05,
      "loss": 0.8832,
      "num_input_tokens_seen": 11579248,
      "step": 990
    },
    {
      "epoch": 0.5124903425186711,
      "grad_norm": 0.7896696939552226,
      "learning_rate": 8.474846307659658e-05,
      "loss": 0.8581,
      "num_input_tokens_seen": 11637712,
      "step": 995
    },
    {
      "epoch": 0.5150656708730363,
      "grad_norm": 0.9287129351980297,
      "learning_rate": 8.457321315344694e-05,
      "loss": 0.8635,
      "num_input_tokens_seen": 11696200,
      "step": 1000
    },
    {
      "epoch": 0.5150656708730363,
      "eval_loss": 0.860200047492981,
      "eval_runtime": 16.1196,
      "eval_samples_per_second": 3.722,
      "eval_steps_per_second": 0.931,
      "num_input_tokens_seen": 11696200,
      "step": 1000
    },
    {
      "epoch": 0.5176409992274015,
      "grad_norm": 0.9492829276877938,
      "learning_rate": 8.439714556870704e-05,
      "loss": 0.8499,
      "num_input_tokens_seen": 11754720,
      "step": 1005
    },
    {
      "epoch": 0.5202163275817667,
      "grad_norm": 1.57473364910246,
      "learning_rate": 8.422026448640124e-05,
      "loss": 0.8556,
      "num_input_tokens_seen": 11813216,
      "step": 1010
    },
    {
      "epoch": 0.5227916559361319,
      "grad_norm": 0.6562994819534732,
      "learning_rate": 8.40425740897932e-05,
      "loss": 0.8533,
      "num_input_tokens_seen": 11871712,
      "step": 1015
    },
    {
      "epoch": 0.525366984290497,
      "grad_norm": 0.5420643724864006,
      "learning_rate": 8.386407858128706e-05,
      "loss": 0.8921,
      "num_input_tokens_seen": 11930200,
      "step": 1020
    },
    {
      "epoch": 0.5279423126448622,
      "grad_norm": 0.4900953324933905,
      "learning_rate": 8.368478218232787e-05,
      "loss": 0.8815,
      "num_input_tokens_seen": 11988704,
      "step": 1025
    },
    {
      "epoch": 0.5305176409992274,
      "grad_norm": 0.46534021808416004,
      "learning_rate": 8.350468913330192e-05,
      "loss": 0.854,
      "num_input_tokens_seen": 12047176,
      "step": 1030
    },
    {
      "epoch": 0.5330929693535926,
      "grad_norm": 0.6739669998528043,
      "learning_rate": 8.33238036934364e-05,
      "loss": 0.8642,
      "num_input_tokens_seen": 12105680,
      "step": 1035
    },
    {
      "epoch": 0.5356682977079578,
      "grad_norm": 1.100337259258234,
      "learning_rate": 8.31421301406986e-05,
      "loss": 0.8072,
      "num_input_tokens_seen": 12164208,
      "step": 1040
    },
    {
      "epoch": 0.5382436260623229,
      "grad_norm": 1.2731858488127639,
      "learning_rate": 8.29596727716949e-05,
      "loss": 0.8532,
      "num_input_tokens_seen": 12222672,
      "step": 1045
    },
    {
      "epoch": 0.5408189544166881,
      "grad_norm": 0.8686963016555517,
      "learning_rate": 8.277643590156894e-05,
      "loss": 0.8844,
      "num_input_tokens_seen": 12281072,
      "step": 1050
    },
    {
      "epoch": 0.5408189544166881,
      "eval_loss": 0.8446129560470581,
      "eval_runtime": 16.0508,
      "eval_samples_per_second": 3.738,
      "eval_steps_per_second": 0.935,
      "num_input_tokens_seen": 12281072,
      "step": 1050
    },
    {
      "epoch": 0.5433942827710533,
      "grad_norm": 0.5518554447099218,
      "learning_rate": 8.259242386389973e-05,
      "loss": 0.8602,
      "num_input_tokens_seen": 12339544,
      "step": 1055
    },
    {
      "epoch": 0.5459696111254185,
      "grad_norm": 0.7300911438509382,
      "learning_rate": 8.240764101059912e-05,
      "loss": 0.8615,
      "num_input_tokens_seen": 12397992,
      "step": 1060
    },
    {
      "epoch": 0.5485449394797837,
      "grad_norm": 0.7364983085887583,
      "learning_rate": 8.222209171180883e-05,
      "loss": 0.8732,
      "num_input_tokens_seen": 12456480,
      "step": 1065
    },
    {
      "epoch": 0.5511202678341488,
      "grad_norm": 0.4840408774949972,
      "learning_rate": 8.203578035579715e-05,
      "loss": 0.8691,
      "num_input_tokens_seen": 12515000,
      "step": 1070
    },
    {
      "epoch": 0.553695596188514,
      "grad_norm": 0.516278691776577,
      "learning_rate": 8.184871134885513e-05,
      "loss": 0.8544,
      "num_input_tokens_seen": 12573504,
      "step": 1075
    },
    {
      "epoch": 0.5562709245428792,
      "grad_norm": 0.8626943002609527,
      "learning_rate": 8.166088911519235e-05,
      "loss": 0.8501,
      "num_input_tokens_seen": 12632008,
      "step": 1080
    },
    {
      "epoch": 0.5588462528972444,
      "grad_norm": 0.7409465187036862,
      "learning_rate": 8.147231809683236e-05,
      "loss": 0.8646,
      "num_input_tokens_seen": 12690520,
      "step": 1085
    },
    {
      "epoch": 0.5614215812516096,
      "grad_norm": 0.5736639247313171,
      "learning_rate": 8.128300275350756e-05,
      "loss": 0.8327,
      "num_input_tokens_seen": 12749032,
      "step": 1090
    },
    {
      "epoch": 0.5639969096059748,
      "grad_norm": 0.7720514157947642,
      "learning_rate": 8.109294756255375e-05,
      "loss": 0.8218,
      "num_input_tokens_seen": 12807504,
      "step": 1095
    },
    {
      "epoch": 0.56657223796034,
      "grad_norm": 0.9129011996506371,
      "learning_rate": 8.090215701880419e-05,
      "loss": 0.8427,
      "num_input_tokens_seen": 12865992,
      "step": 1100
    },
    {
      "epoch": 0.56657223796034,
      "eval_loss": 0.7743102312088013,
      "eval_runtime": 16.1034,
      "eval_samples_per_second": 3.726,
      "eval_steps_per_second": 0.931,
      "num_input_tokens_seen": 12865992,
      "step": 1100
    },
    {
      "epoch": 0.5691475663147051,
      "grad_norm": 1.6435842633079423,
      "learning_rate": 8.07106356344834e-05,
      "loss": 0.8335,
      "num_input_tokens_seen": 12924448,
      "step": 1105
    },
    {
      "epoch": 0.5717228946690703,
      "grad_norm": 1.2281943545237959,
      "learning_rate": 8.051838793910038e-05,
      "loss": 0.8267,
      "num_input_tokens_seen": 12982912,
      "step": 1110
    },
    {
      "epoch": 0.5742982230234355,
      "grad_norm": 1.4138823100284208,
      "learning_rate": 8.032541847934146e-05,
      "loss": 0.8866,
      "num_input_tokens_seen": 13041424,
      "step": 1115
    },
    {
      "epoch": 0.5768735513778007,
      "grad_norm": 0.6515311059204204,
      "learning_rate": 8.013173181896283e-05,
      "loss": 0.8446,
      "num_input_tokens_seen": 13099888,
      "step": 1120
    },
    {
      "epoch": 0.5794488797321659,
      "grad_norm": 0.7537544303655812,
      "learning_rate": 7.993733253868256e-05,
      "loss": 0.8176,
      "num_input_tokens_seen": 13158344,
      "step": 1125
    },
    {
      "epoch": 0.582024208086531,
      "grad_norm": 1.3613777296967222,
      "learning_rate": 7.974222523607236e-05,
      "loss": 0.8138,
      "num_input_tokens_seen": 13216840,
      "step": 1130
    },
    {
      "epoch": 0.5845995364408962,
      "grad_norm": 0.6640843445520798,
      "learning_rate": 7.954641452544865e-05,
      "loss": 0.8204,
      "num_input_tokens_seen": 13275328,
      "step": 1135
    },
    {
      "epoch": 0.5871748647952614,
      "grad_norm": 0.6917895597906035,
      "learning_rate": 7.934990503776363e-05,
      "loss": 0.8485,
      "num_input_tokens_seen": 13333784,
      "step": 1140
    },
    {
      "epoch": 0.5897501931496266,
      "grad_norm": 0.45542718993625547,
      "learning_rate": 7.915270142049566e-05,
      "loss": 0.8191,
      "num_input_tokens_seen": 13392280,
      "step": 1145
    },
    {
      "epoch": 0.5923255215039918,
      "grad_norm": 0.618954778582039,
      "learning_rate": 7.89548083375394e-05,
      "loss": 0.8185,
      "num_input_tokens_seen": 13450720,
      "step": 1150
    },
    {
      "epoch": 0.5923255215039918,
      "eval_loss": 0.7827339768409729,
      "eval_runtime": 16.0127,
      "eval_samples_per_second": 3.747,
      "eval_steps_per_second": 0.937,
      "num_input_tokens_seen": 13450720,
      "step": 1150
    },
    {
      "epoch": 0.5949008498583569,
      "grad_norm": 1.5827740829243289,
      "learning_rate": 7.875623046909544e-05,
      "loss": 0.8168,
      "num_input_tokens_seen": 13509200,
      "step": 1155
    },
    {
      "epoch": 0.5974761782127221,
      "grad_norm": 2.344942216339615,
      "learning_rate": 7.855697251155967e-05,
      "loss": 0.7749,
      "num_input_tokens_seen": 13567656,
      "step": 1160
    },
    {
      "epoch": 0.6000515065670873,
      "grad_norm": 2.7313469239045305,
      "learning_rate": 7.835703917741212e-05,
      "loss": 0.9132,
      "num_input_tokens_seen": 13626136,
      "step": 1165
    },
    {
      "epoch": 0.6026268349214525,
      "grad_norm": 0.7410043911446527,
      "learning_rate": 7.81564351951057e-05,
      "loss": 0.8308,
      "num_input_tokens_seen": 13684608,
      "step": 1170
    },
    {
      "epoch": 0.6052021632758177,
      "grad_norm": 0.5628590604115411,
      "learning_rate": 7.795516530895414e-05,
      "loss": 0.8011,
      "num_input_tokens_seen": 13743080,
      "step": 1175
    },
    {
      "epoch": 0.6077774916301828,
      "grad_norm": 1.2008934424824649,
      "learning_rate": 7.775323427901993e-05,
      "loss": 0.8309,
      "num_input_tokens_seen": 13801552,
      "step": 1180
    },
    {
      "epoch": 0.610352819984548,
      "grad_norm": 1.2914156288367256,
      "learning_rate": 7.755064688100171e-05,
      "loss": 0.8089,
      "num_input_tokens_seen": 13860064,
      "step": 1185
    },
    {
      "epoch": 0.6129281483389132,
      "grad_norm": 1.420806774436513,
      "learning_rate": 7.734740790612136e-05,
      "loss": 0.8089,
      "num_input_tokens_seen": 13918552,
      "step": 1190
    },
    {
      "epoch": 0.6155034766932784,
      "grad_norm": 0.8352922832465102,
      "learning_rate": 7.714352216101055e-05,
      "loss": 0.8511,
      "num_input_tokens_seen": 13977056,
      "step": 1195
    },
    {
      "epoch": 0.6180788050476436,
      "grad_norm": 0.6321587989106885,
      "learning_rate": 7.693899446759727e-05,
      "loss": 0.8061,
      "num_input_tokens_seen": 14035544,
      "step": 1200
    },
    {
      "epoch": 0.6180788050476436,
      "eval_loss": 0.7593821287155151,
      "eval_runtime": 16.1368,
      "eval_samples_per_second": 3.718,
      "eval_steps_per_second": 0.93,
      "num_input_tokens_seen": 14035544,
      "step": 1200
    },
    {
      "epoch": 0.6206541334020087,
      "grad_norm": 1.0526811295206564,
      "learning_rate": 7.673382966299163e-05,
      "loss": 0.7871,
      "num_input_tokens_seen": 14094024,
      "step": 1205
    },
    {
      "epoch": 0.623229461756374,
      "grad_norm": 1.832697637344859,
      "learning_rate": 7.65280325993715e-05,
      "loss": 0.7594,
      "num_input_tokens_seen": 14152504,
      "step": 1210
    },
    {
      "epoch": 0.6258047901107391,
      "grad_norm": 1.6875031192331054,
      "learning_rate": 7.63216081438678e-05,
      "loss": 0.7833,
      "num_input_tokens_seen": 14210992,
      "step": 1215
    },
    {
      "epoch": 0.6283801184651043,
      "grad_norm": 1.867117238207419,
      "learning_rate": 7.611456117844934e-05,
      "loss": 0.8445,
      "num_input_tokens_seen": 14269488,
      "step": 1220
    },
    {
      "epoch": 0.6309554468194695,
      "grad_norm": 0.9089614634143406,
      "learning_rate": 7.59068965998074e-05,
      "loss": 0.7857,
      "num_input_tokens_seen": 14327968,
      "step": 1225
    },
    {
      "epoch": 0.6335307751738347,
      "grad_norm": 2.3911537408111214,
      "learning_rate": 7.569861931923989e-05,
      "loss": 0.8064,
      "num_input_tokens_seen": 14386448,
      "step": 1230
    },
    {
      "epoch": 0.6361061035281999,
      "grad_norm": 1.6500224851295993,
      "learning_rate": 7.548973426253521e-05,
      "loss": 0.7117,
      "num_input_tokens_seen": 14444912,
      "step": 1235
    },
    {
      "epoch": 0.638681431882565,
      "grad_norm": 1.508924461189316,
      "learning_rate": 7.528024636985575e-05,
      "loss": 0.7449,
      "num_input_tokens_seen": 14503392,
      "step": 1240
    },
    {
      "epoch": 0.6412567602369302,
      "grad_norm": 1.3801142620835953,
      "learning_rate": 7.507016059562107e-05,
      "loss": 0.7507,
      "num_input_tokens_seen": 14561872,
      "step": 1245
    },
    {
      "epoch": 0.6438320885912954,
      "grad_norm": 1.2994701535106117,
      "learning_rate": 7.485948190839077e-05,
      "loss": 0.7917,
      "num_input_tokens_seen": 14620336,
      "step": 1250
    },
    {
      "epoch": 0.6438320885912954,
      "eval_loss": 0.7407085299491882,
      "eval_runtime": 16.1168,
      "eval_samples_per_second": 3.723,
      "eval_steps_per_second": 0.931,
      "num_input_tokens_seen": 14620336,
      "step": 1250
    },
    {
      "epoch": 0.6464074169456606,
      "grad_norm": 0.9491399909407985,
      "learning_rate": 7.464821529074679e-05,
      "loss": 0.7763,
      "num_input_tokens_seen": 14678792,
      "step": 1255
    },
    {
      "epoch": 0.6489827453000258,
      "grad_norm": 1.1671149163333951,
      "learning_rate": 7.443636573917585e-05,
      "loss": 0.7979,
      "num_input_tokens_seen": 14737272,
      "step": 1260
    },
    {
      "epoch": 0.6515580736543909,
      "grad_norm": 1.4992002601057717,
      "learning_rate": 7.422393826395108e-05,
      "loss": 0.7883,
      "num_input_tokens_seen": 14795784,
      "step": 1265
    },
    {
      "epoch": 0.6541334020087561,
      "grad_norm": 1.2009664113851044,
      "learning_rate": 7.40109378890136e-05,
      "loss": 0.7183,
      "num_input_tokens_seen": 14854272,
      "step": 1270
    },
    {
      "epoch": 0.6567087303631213,
      "grad_norm": 1.5312778776593978,
      "learning_rate": 7.379736965185368e-05,
      "loss": 0.762,
      "num_input_tokens_seen": 14912720,
      "step": 1275
    },
    {
      "epoch": 0.6592840587174865,
      "grad_norm": 1.443384734396678,
      "learning_rate": 7.358323860339165e-05,
      "loss": 0.7951,
      "num_input_tokens_seen": 14971192,
      "step": 1280
    },
    {
      "epoch": 0.6618593870718517,
      "grad_norm": 1.3546652337943146,
      "learning_rate": 7.336854980785839e-05,
      "loss": 0.7528,
      "num_input_tokens_seen": 15029656,
      "step": 1285
    },
    {
      "epoch": 0.6644347154262168,
      "grad_norm": 1.4256460615881865,
      "learning_rate": 7.315330834267553e-05,
      "loss": 0.7633,
      "num_input_tokens_seen": 15088144,
      "step": 1290
    },
    {
      "epoch": 0.667010043780582,
      "grad_norm": 1.325772407306303,
      "learning_rate": 7.293751929833553e-05,
      "loss": 0.7443,
      "num_input_tokens_seen": 15146600,
      "step": 1295
    },
    {
      "epoch": 0.6695853721349472,
      "grad_norm": 2.727997344637842,
      "learning_rate": 7.272118777828108e-05,
      "loss": 0.7724,
      "num_input_tokens_seen": 15205064,
      "step": 1300
    },
    {
      "epoch": 0.6695853721349472,
      "eval_loss": 0.7189856171607971,
      "eval_runtime": 16.0307,
      "eval_samples_per_second": 3.743,
      "eval_steps_per_second": 0.936,
      "num_input_tokens_seen": 15205064,
      "step": 1300
    },
    {
      "epoch": 0.6721607004893124,
      "grad_norm": 2.6154468701895066,
      "learning_rate": 7.250431889878455e-05,
      "loss": 0.7524,
      "num_input_tokens_seen": 15263560,
      "step": 1305
    },
    {
      "epoch": 0.6747360288436776,
      "grad_norm": 1.9549500311782502,
      "learning_rate": 7.228691778882693e-05,
      "loss": 0.6748,
      "num_input_tokens_seen": 15322016,
      "step": 1310
    },
    {
      "epoch": 0.6773113571980427,
      "grad_norm": 2.991178206089954,
      "learning_rate": 7.20689895899765e-05,
      "loss": 0.7571,
      "num_input_tokens_seen": 15380504,
      "step": 1315
    },
    {
      "epoch": 0.6798866855524079,
      "grad_norm": 1.7022848080804835,
      "learning_rate": 7.185053945626733e-05,
      "loss": 0.6615,
      "num_input_tokens_seen": 15438944,
      "step": 1320
    },
    {
      "epoch": 0.6824620139067731,
      "grad_norm": 1.739259284519112,
      "learning_rate": 7.163157255407732e-05,
      "loss": 0.7421,
      "num_input_tokens_seen": 15497384,
      "step": 1325
    },
    {
      "epoch": 0.6850373422611383,
      "grad_norm": 1.9142982939434143,
      "learning_rate": 7.141209406200599e-05,
      "loss": 0.7886,
      "num_input_tokens_seen": 15555856,
      "step": 1330
    },
    {
      "epoch": 0.6876126706155035,
      "grad_norm": 1.7562659805497576,
      "learning_rate": 7.1192109170752e-05,
      "loss": 0.7484,
      "num_input_tokens_seen": 15614368,
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 0.6901879989698687, |
|
"grad_norm": 1.7590122465257017, |
|
"learning_rate": 7.097162308299054e-05, |
|
"loss": 0.7086, |
|
"num_input_tokens_seen": 15672864, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.6927633273242338, |
|
"grad_norm": 2.1211445265818845, |
|
"learning_rate": 7.07506410132501e-05, |
|
"loss": 0.7494, |
|
"num_input_tokens_seen": 15731376, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 0.695338655678599, |
|
"grad_norm": 2.683073565523052, |
|
"learning_rate": 7.052916818778918e-05, |
|
"loss": 0.7278, |
|
"num_input_tokens_seen": 15789848, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.695338655678599, |
|
"eval_loss": 0.712917685508728, |
|
"eval_runtime": 16.0726, |
|
"eval_samples_per_second": 3.733, |
|
"eval_steps_per_second": 0.933, |
|
"num_input_tokens_seen": 15789848, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.6979139840329642, |
|
"grad_norm": 2.128495144345323, |
|
"learning_rate": 7.030720984447279e-05, |
|
"loss": 0.7005, |
|
"num_input_tokens_seen": 15848328, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 0.7004893123873294, |
|
"grad_norm": 1.9954206386005497, |
|
"learning_rate": 7.008477123264848e-05, |
|
"loss": 0.7406, |
|
"num_input_tokens_seen": 15906824, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.7030646407416946, |
|
"grad_norm": 2.2104679425901397, |
|
"learning_rate": 6.986185761302224e-05, |
|
"loss": 0.73, |
|
"num_input_tokens_seen": 15965312, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 0.7056399690960597, |
|
"grad_norm": 1.4881688553415275, |
|
"learning_rate": 6.963847425753403e-05, |
|
"loss": 0.7069, |
|
"num_input_tokens_seen": 16023824, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.7082152974504249, |
|
"grad_norm": 1.7307886623214839, |
|
"learning_rate": 6.941462644923318e-05, |
|
"loss": 0.6859, |
|
"num_input_tokens_seen": 16082280, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.7107906258047901, |
|
"grad_norm": 1.996363722225207, |
|
"learning_rate": 6.919031948215335e-05, |
|
"loss": 0.7254, |
|
"num_input_tokens_seen": 16140800, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.7133659541591553, |
|
"grad_norm": 1.9723274395570518, |
|
"learning_rate": 6.896555866118741e-05, |
|
"loss": 0.717, |
|
"num_input_tokens_seen": 16199320, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 0.7159412825135205, |
|
"grad_norm": 1.741253496639104, |
|
"learning_rate": 6.87403493019619e-05, |
|
"loss": 0.7094, |
|
"num_input_tokens_seen": 16257768, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.7185166108678857, |
|
"grad_norm": 1.6218002074106608, |
|
"learning_rate": 6.851469673071143e-05, |
|
"loss": 0.7862, |
|
"num_input_tokens_seen": 16316264, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 0.7210919392222508, |
|
"grad_norm": 1.7586707307941614, |
|
"learning_rate": 6.828860628415253e-05, |
|
"loss": 0.7359, |
|
"num_input_tokens_seen": 16374784, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.7210919392222508, |
|
"eval_loss": 0.6643603444099426, |
|
"eval_runtime": 16.1894, |
|
"eval_samples_per_second": 3.706, |
|
"eval_steps_per_second": 0.927, |
|
"num_input_tokens_seen": 16374784, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.723667267576616, |
|
"grad_norm": 2.665622720042704, |
|
"learning_rate": 6.806208330935766e-05, |
|
"loss": 0.706, |
|
"num_input_tokens_seen": 16433288, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 0.7262425959309812, |
|
"grad_norm": 2.123869663010538, |
|
"learning_rate": 6.783513316362855e-05, |
|
"loss": 0.6714, |
|
"num_input_tokens_seen": 16491784, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.7288179242853464, |
|
"grad_norm": 1.584213945279146, |
|
"learning_rate": 6.760776121436962e-05, |
|
"loss": 0.693, |
|
"num_input_tokens_seen": 16550272, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 0.7313932526397116, |
|
"grad_norm": 2.2481839233017764, |
|
"learning_rate": 6.737997283896103e-05, |
|
"loss": 0.7005, |
|
"num_input_tokens_seen": 16608704, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.7339685809940767, |
|
"grad_norm": 2.4818230151927643, |
|
"learning_rate": 6.715177342463145e-05, |
|
"loss": 0.6573, |
|
"num_input_tokens_seen": 16667200, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.7365439093484419, |
|
"grad_norm": 2.5398594354263486, |
|
"learning_rate": 6.692316836833065e-05, |
|
"loss": 0.6751, |
|
"num_input_tokens_seen": 16725704, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.7391192377028071, |
|
"grad_norm": 2.7486055345229343, |
|
"learning_rate": 6.6694163076602e-05, |
|
"loss": 0.6173, |
|
"num_input_tokens_seen": 16784192, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 0.7416945660571723, |
|
"grad_norm": 5.356237563459472, |
|
"learning_rate": 6.646476296545434e-05, |
|
"loss": 0.728, |
|
"num_input_tokens_seen": 16842704, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.7442698944115375, |
|
"grad_norm": 2.088505948846248, |
|
"learning_rate": 6.623497346023418e-05, |
|
"loss": 0.743, |
|
"num_input_tokens_seen": 16901176, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 0.7468452227659027, |
|
"grad_norm": 2.2198436340262, |
|
"learning_rate": 6.60047999954972e-05, |
|
"loss": 0.6291, |
|
"num_input_tokens_seen": 16959632, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.7468452227659027, |
|
"eval_loss": 0.753077507019043, |
|
"eval_runtime": 16.0383, |
|
"eval_samples_per_second": 3.741, |
|
"eval_steps_per_second": 0.935, |
|
"num_input_tokens_seen": 16959632, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.7494205511202678, |
|
"grad_norm": 1.9571252974715032, |
|
"learning_rate": 6.57742480148798e-05, |
|
"loss": 0.6533, |
|
"num_input_tokens_seen": 17018072, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 0.751995879474633, |
|
"grad_norm": 3.2075825448529542, |
|
"learning_rate": 6.554332297097031e-05, |
|
"loss": 0.7114, |
|
"num_input_tokens_seen": 17076560, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.7545712078289982, |
|
"grad_norm": 2.0030816579741266, |
|
"learning_rate": 6.53120303251801e-05, |
|
"loss": 0.6568, |
|
"num_input_tokens_seen": 17135016, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 0.7571465361833634, |
|
"grad_norm": 2.65056436638165, |
|
"learning_rate": 6.508037554761432e-05, |
|
"loss": 0.7016, |
|
"num_input_tokens_seen": 17193496, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.7597218645377286, |
|
"grad_norm": 1.9541651871708403, |
|
"learning_rate": 6.484836411694267e-05, |
|
"loss": 0.6612, |
|
"num_input_tokens_seen": 17251944, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.7622971928920937, |
|
"grad_norm": 3.0540242692558577, |
|
"learning_rate": 6.461600152026965e-05, |
|
"loss": 0.6115, |
|
"num_input_tokens_seen": 17310456, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.7648725212464589, |
|
"grad_norm": 2.796196437541352, |
|
"learning_rate": 6.438329325300499e-05, |
|
"loss": 0.6458, |
|
"num_input_tokens_seen": 17368968, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 0.7674478496008241, |
|
"grad_norm": 3.1979427976381207, |
|
"learning_rate": 6.415024481873352e-05, |
|
"loss": 0.6434, |
|
"num_input_tokens_seen": 17427424, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.7700231779551893, |
|
"grad_norm": 3.8375601078700203, |
|
"learning_rate": 6.391686172908506e-05, |
|
"loss": 0.5973, |
|
"num_input_tokens_seen": 17485936, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 0.7725985063095545, |
|
"grad_norm": 2.405705749864128, |
|
"learning_rate": 6.368314950360415e-05, |
|
"loss": 0.6021, |
|
"num_input_tokens_seen": 17544440, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.7725985063095545, |
|
"eval_loss": 0.632923424243927, |
|
"eval_runtime": 16.1038, |
|
"eval_samples_per_second": 3.726, |
|
"eval_steps_per_second": 0.931, |
|
"num_input_tokens_seen": 17544440, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.7751738346639196, |
|
"grad_norm": 2.8519087211521734, |
|
"learning_rate": 6.344911366961934e-05, |
|
"loss": 0.5779, |
|
"num_input_tokens_seen": 17602952, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 0.7777491630182848, |
|
"grad_norm": 2.861290579940173, |
|
"learning_rate": 6.321475976211266e-05, |
|
"loss": 0.6707, |
|
"num_input_tokens_seen": 17661440, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.78032449137265, |
|
"grad_norm": 3.541365161144121, |
|
"learning_rate": 6.298009332358856e-05, |
|
"loss": 0.6326, |
|
"num_input_tokens_seen": 17719928, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 0.7828998197270152, |
|
"grad_norm": 2.969962641272996, |
|
"learning_rate": 6.274511990394294e-05, |
|
"loss": 0.6472, |
|
"num_input_tokens_seen": 17778424, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.7854751480813804, |
|
"grad_norm": 2.762063548864621, |
|
"learning_rate": 6.250984506033183e-05, |
|
"loss": 0.6215, |
|
"num_input_tokens_seen": 17836936, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.7880504764357456, |
|
"grad_norm": 3.2198855545004097, |
|
"learning_rate": 6.227427435703997e-05, |
|
"loss": 0.6102, |
|
"num_input_tokens_seen": 17895392, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.7906258047901107, |
|
"grad_norm": 3.846544371420393, |
|
"learning_rate": 6.203841336534924e-05, |
|
"loss": 0.6161, |
|
"num_input_tokens_seen": 17953872, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 0.7932011331444759, |
|
"grad_norm": 3.811248686105134, |
|
"learning_rate": 6.180226766340688e-05, |
|
"loss": 0.6103, |
|
"num_input_tokens_seen": 18012320, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.7957764614988411, |
|
"grad_norm": 2.9539705466919703, |
|
"learning_rate": 6.156584283609359e-05, |
|
"loss": 0.5791, |
|
"num_input_tokens_seen": 18070792, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 0.7983517898532063, |
|
"grad_norm": 3.0546686267383283, |
|
"learning_rate": 6.132914447489137e-05, |
|
"loss": 0.667, |
|
"num_input_tokens_seen": 18129304, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.7983517898532063, |
|
"eval_loss": 0.6617516279220581, |
|
"eval_runtime": 16.0333, |
|
"eval_samples_per_second": 3.742, |
|
"eval_steps_per_second": 0.936, |
|
"num_input_tokens_seen": 18129304, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.8009271182075715, |
|
"grad_norm": 2.9735507158511987, |
|
"learning_rate": 6.109217817775139e-05, |
|
"loss": 0.5681, |
|
"num_input_tokens_seen": 18187728, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 0.8035024465619367, |
|
"grad_norm": 3.6620315644598778, |
|
"learning_rate": 6.085494954896156e-05, |
|
"loss": 0.6292, |
|
"num_input_tokens_seen": 18246192, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.8060777749163018, |
|
"grad_norm": 4.03631122919402, |
|
"learning_rate": 6.061746419901388e-05, |
|
"loss": 0.6512, |
|
"num_input_tokens_seen": 18304632, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 0.808653103270667, |
|
"grad_norm": 4.0040288177360805, |
|
"learning_rate": 6.0379727744471936e-05, |
|
"loss": 0.5476, |
|
"num_input_tokens_seen": 18363136, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.8112284316250322, |
|
"grad_norm": 3.9448861517599996, |
|
"learning_rate": 6.014174580783794e-05, |
|
"loss": 0.5632, |
|
"num_input_tokens_seen": 18421592, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.8138037599793974, |
|
"grad_norm": 3.8400680048739435, |
|
"learning_rate": 5.990352401741981e-05, |
|
"loss": 0.6225, |
|
"num_input_tokens_seen": 18480104, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.8163790883337626, |
|
"grad_norm": 2.7981339113543284, |
|
"learning_rate": 5.9665068007197976e-05, |
|
"loss": 0.5801, |
|
"num_input_tokens_seen": 18538600, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 0.8189544166881277, |
|
"grad_norm": 4.290843515697908, |
|
"learning_rate": 5.94263834166923e-05, |
|
"loss": 0.6364, |
|
"num_input_tokens_seen": 18597104, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.8215297450424929, |
|
"grad_norm": 3.9001572117535566, |
|
"learning_rate": 5.918747589082853e-05, |
|
"loss": 0.6088, |
|
"num_input_tokens_seen": 18655584, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 0.8241050733968581, |
|
"grad_norm": 3.5623412341260363, |
|
"learning_rate": 5.8948351079804875e-05, |
|
"loss": 0.6564, |
|
"num_input_tokens_seen": 18714072, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.8241050733968581, |
|
"eval_loss": 0.6319106221199036, |
|
"eval_runtime": 16.0199, |
|
"eval_samples_per_second": 3.745, |
|
"eval_steps_per_second": 0.936, |
|
"num_input_tokens_seen": 18714072, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.8266804017512233, |
|
"grad_norm": 3.4115030121534953, |
|
"learning_rate": 5.8709014638958404e-05, |
|
"loss": 0.6095, |
|
"num_input_tokens_seen": 18772552, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 0.8292557301055885, |
|
"grad_norm": 2.8584050529867895, |
|
"learning_rate": 5.846947222863123e-05, |
|
"loss": 0.5896, |
|
"num_input_tokens_seen": 18830992, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.8318310584599536, |
|
"grad_norm": 3.083134826868609, |
|
"learning_rate": 5.8229729514036705e-05, |
|
"loss": 0.545, |
|
"num_input_tokens_seen": 18889480, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 0.8344063868143188, |
|
"grad_norm": 3.5650772646006703, |
|
"learning_rate": 5.7989792165125356e-05, |
|
"loss": 0.6021, |
|
"num_input_tokens_seen": 18947936, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.836981715168684, |
|
"grad_norm": 3.1787537764025737, |
|
"learning_rate": 5.774966585645092e-05, |
|
"loss": 0.5741, |
|
"num_input_tokens_seen": 19006432, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 0.8395570435230492, |
|
"grad_norm": 4.505205596087594, |
|
"learning_rate": 5.7509356267035975e-05, |
|
"loss": 0.5796, |
|
"num_input_tokens_seen": 19064920, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.8421323718774144, |
|
"grad_norm": 3.854433226263906, |
|
"learning_rate": 5.726886908023776e-05, |
|
"loss": 0.5088, |
|
"num_input_tokens_seen": 19123376, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 0.8447077002317795, |
|
"grad_norm": 3.5910960304247643, |
|
"learning_rate": 5.702820998361373e-05, |
|
"loss": 0.5431, |
|
"num_input_tokens_seen": 19181864, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.8472830285861447, |
|
"grad_norm": 4.55639282269759, |
|
"learning_rate": 5.6787384668786994e-05, |
|
"loss": 0.5849, |
|
"num_input_tokens_seen": 19240352, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 0.8498583569405099, |
|
"grad_norm": 4.031478721616991, |
|
"learning_rate": 5.654639883131178e-05, |
|
"loss": 0.5668, |
|
"num_input_tokens_seen": 19298848, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.8498583569405099, |
|
"eval_loss": 0.6634677648544312, |
|
"eval_runtime": 16.0267, |
|
"eval_samples_per_second": 3.744, |
|
"eval_steps_per_second": 0.936, |
|
"num_input_tokens_seen": 19298848, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.8524336852948751, |
|
"grad_norm": 3.2430676664218496, |
|
"learning_rate": 5.6305258170538676e-05, |
|
"loss": 0.584, |
|
"num_input_tokens_seen": 19357304, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 0.8550090136492403, |
|
"grad_norm": 3.140559424454581, |
|
"learning_rate": 5.606396838947988e-05, |
|
"loss": 0.5544, |
|
"num_input_tokens_seen": 19415800, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.8575843420036054, |
|
"grad_norm": 3.993528386539066, |
|
"learning_rate": 5.582253519467432e-05, |
|
"loss": 0.6269, |
|
"num_input_tokens_seen": 19474256, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 0.8601596703579707, |
|
"grad_norm": 2.202747116085024, |
|
"learning_rate": 5.558096429605263e-05, |
|
"loss": 0.5073, |
|
"num_input_tokens_seen": 19532736, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.8627349987123358, |
|
"grad_norm": 4.4094334133851625, |
|
"learning_rate": 5.533926140680221e-05, |
|
"loss": 0.5319, |
|
"num_input_tokens_seen": 19591184, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 0.865310327066701, |
|
"grad_norm": 4.01821546567579, |
|
"learning_rate": 5.509743224323203e-05, |
|
"loss": 0.4525, |
|
"num_input_tokens_seen": 19649656, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.8678856554210662, |
|
"grad_norm": 5.3033277992950385, |
|
"learning_rate": 5.485548252463749e-05, |
|
"loss": 0.5276, |
|
"num_input_tokens_seen": 19708144, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 0.8704609837754314, |
|
"grad_norm": 5.124737819396939, |
|
"learning_rate": 5.4613417973165106e-05, |
|
"loss": 0.5482, |
|
"num_input_tokens_seen": 19766592, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.8730363121297966, |
|
"grad_norm": 3.47304956996904, |
|
"learning_rate": 5.4371244313677225e-05, |
|
"loss": 0.4656, |
|
"num_input_tokens_seen": 19825064, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 0.8756116404841617, |
|
"grad_norm": 6.394279811127835, |
|
"learning_rate": 5.4128967273616625e-05, |
|
"loss": 0.5701, |
|
"num_input_tokens_seen": 19883504, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.8756116404841617, |
|
"eval_loss": 0.7144017815589905, |
|
"eval_runtime": 16.1358, |
|
"eval_samples_per_second": 3.718, |
|
"eval_steps_per_second": 0.93, |
|
"num_input_tokens_seen": 19883504, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.8781869688385269, |
|
"grad_norm": 4.527262723362309, |
|
"learning_rate": 5.388659258287102e-05, |
|
"loss": 0.5823, |
|
"num_input_tokens_seen": 19942000, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 0.8807622971928921, |
|
"grad_norm": 4.628112845411063, |
|
"learning_rate": 5.364412597363759e-05, |
|
"loss": 0.5446, |
|
"num_input_tokens_seen": 20000440, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.8833376255472573, |
|
"grad_norm": 6.077375809046342, |
|
"learning_rate": 5.3401573180287426e-05, |
|
"loss": 0.5769, |
|
"num_input_tokens_seen": 20058920, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 0.8859129539016225, |
|
"grad_norm": 6.492863688878202, |
|
"learning_rate": 5.315893993922986e-05, |
|
"loss": 0.5614, |
|
"num_input_tokens_seen": 20117416, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.8884882822559876, |
|
"grad_norm": 5.332057542240503, |
|
"learning_rate": 5.29162319887768e-05, |
|
"loss": 0.5215, |
|
"num_input_tokens_seen": 20175936, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 0.8910636106103528, |
|
"grad_norm": 3.8772752615113077, |
|
"learning_rate": 5.26734550690071e-05, |
|
"loss": 0.4968, |
|
"num_input_tokens_seen": 20234368, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.893638938964718, |
|
"grad_norm": 4.886426418731965, |
|
"learning_rate": 5.243061492163073e-05, |
|
"loss": 0.5029, |
|
"num_input_tokens_seen": 20292856, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 0.8962142673190832, |
|
"grad_norm": 4.031774194047053, |
|
"learning_rate": 5.2187717289852955e-05, |
|
"loss": 0.5249, |
|
"num_input_tokens_seen": 20351272, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.8987895956734484, |
|
"grad_norm": 5.344580011428224, |
|
"learning_rate": 5.1944767918238624e-05, |
|
"loss": 0.5801, |
|
"num_input_tokens_seen": 20409744, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 0.9013649240278135, |
|
"grad_norm": 3.923379435953565, |
|
"learning_rate": 5.170177255257618e-05, |
|
"loss": 0.546, |
|
"num_input_tokens_seen": 20468200, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.9013649240278135, |
|
"eval_loss": 0.672294020652771, |
|
"eval_runtime": 16.0203, |
|
"eval_samples_per_second": 3.745, |
|
"eval_steps_per_second": 0.936, |
|
"num_input_tokens_seen": 20468200, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.9039402523821787, |
|
"grad_norm": 4.616122198129487, |
|
"learning_rate": 5.145873693974188e-05, |
|
"loss": 0.5248, |
|
"num_input_tokens_seen": 20526696, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 0.9065155807365439, |
|
"grad_norm": 5.322590172525407, |
|
"learning_rate": 5.12156668275638e-05, |
|
"loss": 0.4756, |
|
"num_input_tokens_seen": 20585160, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.9090909090909091, |
|
"grad_norm": 4.002252878507737, |
|
"learning_rate": 5.097256796468598e-05, |
|
"loss": 0.4405, |
|
"num_input_tokens_seen": 20643672, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 0.9116662374452743, |
|
"grad_norm": 5.58017966349683, |
|
"learning_rate": 5.072944610043232e-05, |
|
"loss": 0.5201, |
|
"num_input_tokens_seen": 20702152, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.9142415657996394, |
|
"grad_norm": 4.688576373892097, |
|
"learning_rate": 5.048630698467081e-05, |
|
"loss": 0.4662, |
|
"num_input_tokens_seen": 20760664, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 0.9168168941540046, |
|
"grad_norm": 4.984086874604376, |
|
"learning_rate": 5.024315636767738e-05, |
|
"loss": 0.5376, |
|
"num_input_tokens_seen": 20819144, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.9193922225083698, |
|
"grad_norm": 4.470690620190923, |
|
"learning_rate": 5e-05, |
|
"loss": 0.5174, |
|
"num_input_tokens_seen": 20877624, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 0.921967550862735, |
|
"grad_norm": 4.1127649145734795, |
|
"learning_rate": 4.9756843632322626e-05, |
|
"loss": 0.4273, |
|
"num_input_tokens_seen": 20936112, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.9245428792171002, |
|
"grad_norm": 5.1892527739805185, |
|
"learning_rate": 4.9513693015329197e-05, |
|
"loss": 0.4646, |
|
"num_input_tokens_seen": 20994608, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 0.9271182075714653, |
|
"grad_norm": 6.8574703914708985, |
|
"learning_rate": 4.9270553899567686e-05, |
|
"loss": 0.412, |
|
"num_input_tokens_seen": 21053080, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.9271182075714653, |
|
"eval_loss": 0.6768696904182434, |
|
"eval_runtime": 15.9758, |
|
"eval_samples_per_second": 3.756, |
|
"eval_steps_per_second": 0.939, |
|
"num_input_tokens_seen": 21053080, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.9296935359258306, |
|
"grad_norm": 6.328873193178562, |
|
"learning_rate": 4.902743203531405e-05, |
|
"loss": 0.4845, |
|
"num_input_tokens_seen": 21111592, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 0.9322688642801957, |
|
"grad_norm": 4.7019594666508215, |
|
"learning_rate": 4.8784333172436206e-05, |
|
"loss": 0.441, |
|
"num_input_tokens_seen": 21170024, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.9348441926345609, |
|
"grad_norm": 4.545287749618146, |
|
"learning_rate": 4.854126306025812e-05, |
|
"loss": 0.545, |
|
"num_input_tokens_seen": 21228480, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 0.9374195209889261, |
|
"grad_norm": 7.047942469299444, |
|
"learning_rate": 4.829822744742383e-05, |
|
"loss": 0.4697, |
|
"num_input_tokens_seen": 21286944, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.9399948493432912, |
|
"grad_norm": 3.917758669787159, |
|
"learning_rate": 4.8055232081761395e-05, |
|
"loss": 0.423, |
|
"num_input_tokens_seen": 21345456, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 0.9425701776976565, |
|
"grad_norm": 3.442911876713947, |
|
"learning_rate": 4.781228271014704e-05, |
|
"loss": 0.4715, |
|
"num_input_tokens_seen": 21403896, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.9451455060520216, |
|
"grad_norm": 4.755237925353789, |
|
"learning_rate": 4.756938507836929e-05, |
|
"loss": 0.5149, |
|
"num_input_tokens_seen": 21462360, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 0.9477208344063868, |
|
"grad_norm": 5.3552741805060275, |
|
"learning_rate": 4.732654493099291e-05, |
|
"loss": 0.5403, |
|
"num_input_tokens_seen": 21520864, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.950296162760752, |
|
"grad_norm": 3.417134377266731, |
|
"learning_rate": 4.708376801122321e-05, |
|
"loss": 0.4757, |
|
"num_input_tokens_seen": 21579376, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 0.9528714911151172, |
|
"grad_norm": 4.6802756294331855, |
|
"learning_rate": 4.6841060060770154e-05, |
|
"loss": 0.4347, |
|
"num_input_tokens_seen": 21637848, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.9528714911151172, |
|
"eval_loss": 0.6808218359947205, |
|
"eval_runtime": 16.1166, |
|
"eval_samples_per_second": 3.723, |
|
"eval_steps_per_second": 0.931, |
|
"num_input_tokens_seen": 21637848, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.9554468194694824, |
|
"grad_norm": 5.573192417675986, |
|
"learning_rate": 4.659842681971258e-05, |
|
"loss": 0.5132, |
|
"num_input_tokens_seen": 21696328, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 0.9580221478238475, |
|
"grad_norm": 7.109977536510439, |
|
"learning_rate": 4.635587402636241e-05, |
|
"loss": 0.4347, |
|
"num_input_tokens_seen": 21754816, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.9605974761782127, |
|
"grad_norm": 7.143552890986281, |
|
"learning_rate": 4.611340741712901e-05, |
|
"loss": 0.4015, |
|
"num_input_tokens_seen": 21813296, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 0.9631728045325779, |
|
"grad_norm": 6.289734219426663, |
|
"learning_rate": 4.5871032726383386e-05, |
|
"loss": 0.5023, |
|
"num_input_tokens_seen": 21871800, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.9657481328869431, |
|
"grad_norm": 5.981747103855226, |
|
"learning_rate": 4.562875568632278e-05, |
|
"loss": 0.5334, |
|
"num_input_tokens_seen": 21930272, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 0.9683234612413083, |
|
"grad_norm": 5.6559760588122545, |
|
"learning_rate": 4.5386582026834906e-05, |
|
"loss": 0.4386, |
|
"num_input_tokens_seen": 21988736, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.9708987895956734, |
|
"grad_norm": 5.861060155419055, |
|
"learning_rate": 4.5144517475362514e-05, |
|
"loss": 0.3807, |
|
"num_input_tokens_seen": 22047200, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 0.9734741179500386, |
|
"grad_norm": 7.801226281593827, |
|
"learning_rate": 4.490256775676797e-05, |
|
"loss": 0.4177, |
|
"num_input_tokens_seen": 22105664, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.9760494463044038, |
|
"grad_norm": 6.1755894964345135, |
|
"learning_rate": 4.466073859319781e-05, |
|
"loss": 0.5239, |
|
"num_input_tokens_seen": 22164184, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 0.978624774658769, |
|
"grad_norm": 5.397307732194541, |
|
"learning_rate": 4.441903570394739e-05, |
|
"loss": 0.3737, |
|
"num_input_tokens_seen": 22222632, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.978624774658769, |
|
"eval_loss": 0.773033082485199, |
|
"eval_runtime": 15.9975, |
|
"eval_samples_per_second": 3.751, |
|
"eval_steps_per_second": 0.938, |
|
"num_input_tokens_seen": 22222632, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.9812001030131342, |
|
"grad_norm": 6.997624273550619, |
|
"learning_rate": 4.41774648053257e-05, |
|
"loss": 0.4437, |
|
"num_input_tokens_seen": 22281080, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 0.9837754313674993, |
|
"grad_norm": 5.030616381143982, |
|
"learning_rate": 4.3936031610520124e-05, |
|
"loss": 0.465, |
|
"num_input_tokens_seen": 22339552, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.9863507597218646, |
|
"grad_norm": 5.025845260709186, |
|
"learning_rate": 4.3694741829461336e-05, |
|
"loss": 0.4975, |
|
"num_input_tokens_seen": 22398056, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 0.9889260880762297, |
|
"grad_norm": 6.43843242330618, |
|
"learning_rate": 4.345360116868823e-05, |
|
"loss": 0.4504, |
|
"num_input_tokens_seen": 22456520, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.9915014164305949, |
|
"grad_norm": 5.281203851622467, |
|
"learning_rate": 4.321261533121303e-05, |
|
"loss": 0.4528, |
|
"num_input_tokens_seen": 22515024, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.9940767447849601, |
|
"grad_norm": 6.158304256456398, |
|
"learning_rate": 4.2971790016386286e-05, |
|
"loss": 0.441, |
|
"num_input_tokens_seen": 22573480, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.9966520731393252, |
|
"grad_norm": 3.898263595049965, |
|
"learning_rate": 4.273113091976225e-05, |
|
"loss": 0.4678, |
|
"num_input_tokens_seen": 22631960, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 0.9992274014936905, |
|
"grad_norm": 6.266433889699235, |
|
"learning_rate": 4.249064373296403e-05, |
|
"loss": 0.4352, |
|
"num_input_tokens_seen": 22690432, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 1.001545197012619, |
|
"grad_norm": 2.4601530377865695, |
|
"learning_rate": 4.225033414354908e-05, |
|
"loss": 0.3792, |
|
"num_input_tokens_seen": 22743048, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 1.0041205253669843, |
|
"grad_norm": 4.761740260797231, |
|
"learning_rate": 4.201020783487464e-05, |
|
"loss": 0.3783, |
|
"num_input_tokens_seen": 22801512, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.0041205253669843, |
|
"eval_loss": 0.6983156204223633, |
|
"eval_runtime": 16.3172, |
|
"eval_samples_per_second": 3.677, |
|
"eval_steps_per_second": 0.919, |
|
"num_input_tokens_seen": 22801512, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.0066958537213495, |
|
"grad_norm": 6.506183969602581, |
|
"learning_rate": 4.17702704859633e-05, |
|
"loss": 0.3784, |
|
"num_input_tokens_seen": 22859952, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 1.0092711820757148, |
|
"grad_norm": 7.31299798110374, |
|
"learning_rate": 4.153052777136879e-05, |
|
"loss": 0.5587, |
|
"num_input_tokens_seen": 22918440, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 1.0118465104300798, |
|
"grad_norm": 4.338872323547646, |
|
"learning_rate": 4.1290985361041614e-05, |
|
"loss": 0.3803, |
|
"num_input_tokens_seen": 22976944, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 1.014421838784445, |
|
"grad_norm": 6.798827966152428, |
|
"learning_rate": 4.105164892019514e-05, |
|
"loss": 0.4038, |
|
"num_input_tokens_seen": 23035408, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 1.0169971671388103, |
|
"grad_norm": 5.018683403937771, |
|
"learning_rate": 4.0812524109171476e-05, |
|
"loss": 0.3226, |
|
"num_input_tokens_seen": 23093912, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 1.0195724954931753, |
|
"grad_norm": 4.594775856201265, |
|
"learning_rate": 4.0573616583307705e-05, |
|
"loss": 0.4026, |
|
"num_input_tokens_seen": 23152344, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 1.0221478238475405, |
|
"grad_norm": 7.5346230342964695, |
|
"learning_rate": 4.033493199280202e-05, |
|
"loss": 0.4225, |
|
"num_input_tokens_seen": 23210800, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 1.0247231522019058, |
|
"grad_norm": 8.213657673441388, |
|
"learning_rate": 4.009647598258022e-05, |
|
"loss": 0.3058, |
|
"num_input_tokens_seen": 23269304, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 1.0272984805562708, |
|
"grad_norm": 6.881744374075897, |
|
"learning_rate": 3.985825419216207e-05, |
|
"loss": 0.3821, |
|
"num_input_tokens_seen": 23327800, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 1.029873808910636, |
|
"grad_norm": 3.916989546123924, |
|
"learning_rate": 3.962027225552807e-05, |
|
"loss": 0.3328, |
|
"num_input_tokens_seen": 23386232, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.029873808910636, |
|
"eval_loss": 0.7484827041625977, |
|
"eval_runtime": 16.091, |
|
"eval_samples_per_second": 3.729, |
|
"eval_steps_per_second": 0.932, |
|
"num_input_tokens_seen": 23386232, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.0324491372650013, |
|
"grad_norm": 5.8532055715340245, |
|
"learning_rate": 3.938253580098613e-05, |
|
"loss": 0.362, |
|
"num_input_tokens_seen": 23444712, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 1.0350244656193666, |
|
"grad_norm": 7.087739461357715, |
|
"learning_rate": 3.914505045103845e-05, |
|
"loss": 0.3903, |
|
"num_input_tokens_seen": 23503192, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 1.0375997939737316, |
|
"grad_norm": 6.061997147134047, |
|
"learning_rate": 3.8907821822248605e-05, |
|
"loss": 0.3341, |
|
"num_input_tokens_seen": 23561688, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 1.0401751223280968, |
|
"grad_norm": 6.783069419644998, |
|
"learning_rate": 3.867085552510864e-05, |
|
"loss": 0.4794, |
|
"num_input_tokens_seen": 23620160, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 1.042750450682462, |
|
"grad_norm": 4.11088291372727, |
|
"learning_rate": 3.843415716390644e-05, |
|
"loss": 0.4104, |
|
"num_input_tokens_seen": 23678624, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 1.045325779036827, |
|
"grad_norm": 5.727855298190317, |
|
"learning_rate": 3.819773233659314e-05, |
|
"loss": 0.3639, |
|
"num_input_tokens_seen": 23737064, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 1.0479011073911924, |
|
"grad_norm": 6.936114108935384, |
|
"learning_rate": 3.7961586634650767e-05, |
|
"loss": 0.4294, |
|
"num_input_tokens_seen": 23795568, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 1.0504764357455576, |
|
"grad_norm": 5.577801320854008, |
|
"learning_rate": 3.772572564296005e-05, |
|
"loss": 0.4713, |
|
"num_input_tokens_seen": 23854040, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 1.0530517640999228, |
|
"grad_norm": 7.466883391944433, |
|
"learning_rate": 3.749015493966817e-05, |
|
"loss": 0.3864, |
|
"num_input_tokens_seen": 23912520, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 1.0556270924542879, |
|
"grad_norm": 4.120909561971508, |
|
"learning_rate": 3.7254880096057073e-05, |
|
"loss": 0.3602, |
|
"num_input_tokens_seen": 23971048, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 1.0556270924542879, |
|
"eval_loss": 0.7190810441970825, |
|
"eval_runtime": 16.0858, |
|
"eval_samples_per_second": 3.73, |
|
"eval_steps_per_second": 0.932, |
|
"num_input_tokens_seen": 23971048, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 1.0582024208086531, |
|
"grad_norm": 3.701758619566102, |
|
"learning_rate": 3.7019906676411446e-05, |
|
"loss": 0.3203, |
|
"num_input_tokens_seen": 24029544, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 1.0607777491630184, |
|
"grad_norm": 7.855789285552562, |
|
"learning_rate": 3.678524023788735e-05, |
|
"loss": 0.3906, |
|
"num_input_tokens_seen": 24088008, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 1.0633530775173834, |
|
"grad_norm": 6.682460948737117, |
|
"learning_rate": 3.6550886330380665e-05, |
|
"loss": 0.3604, |
|
"num_input_tokens_seen": 24146480, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 1.0659284058717486, |
|
"grad_norm": 3.587156705730744, |
|
"learning_rate": 3.631685049639586e-05, |
|
"loss": 0.3271, |
|
"num_input_tokens_seen": 24204984, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 1.0685037342261139, |
|
"grad_norm": 4.621273077841867, |
|
"learning_rate": 3.608313827091493e-05, |
|
"loss": 0.2996, |
|
"num_input_tokens_seen": 24263456, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 1.071079062580479, |
|
"grad_norm": 6.565390196167412, |
|
"learning_rate": 3.5849755181266474e-05, |
|
"loss": 0.3767, |
|
"num_input_tokens_seen": 24321960, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 1.0736543909348442, |
|
"grad_norm": 6.589833421708817, |
|
"learning_rate": 3.5616706746995026e-05, |
|
"loss": 0.4208, |
|
"num_input_tokens_seen": 24380464, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 1.0762297192892094, |
|
"grad_norm": 4.95070197991303, |
|
"learning_rate": 3.538399847973036e-05, |
|
"loss": 0.3479, |
|
"num_input_tokens_seen": 24438976, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 1.0788050476435747, |
|
"grad_norm": 5.124820683013397, |
|
"learning_rate": 3.515163588305735e-05, |
|
"loss": 0.3654, |
|
"num_input_tokens_seen": 24497448, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 1.0813803759979397, |
|
"grad_norm": 6.444785878585679, |
|
"learning_rate": 3.491962445238569e-05, |
|
"loss": 0.3351, |
|
"num_input_tokens_seen": 24555904, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.0813803759979397, |
|
"eval_loss": 0.8075026869773865, |
|
"eval_runtime": 16.096, |
|
"eval_samples_per_second": 3.728, |
|
"eval_steps_per_second": 0.932, |
|
"num_input_tokens_seen": 24555904, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.083955704352305, |
|
"grad_norm": 5.259882631403194, |
|
"learning_rate": 3.4687969674819906e-05, |
|
"loss": 0.3827, |
|
"num_input_tokens_seen": 24614392, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 1.0865310327066702, |
|
"grad_norm": 4.276410371848581, |
|
"learning_rate": 3.445667702902969e-05, |
|
"loss": 0.3676, |
|
"num_input_tokens_seen": 24672848, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 1.0891063610610352, |
|
"grad_norm": 10.209040215860048, |
|
"learning_rate": 3.4225751985120215e-05, |
|
"loss": 0.3253, |
|
"num_input_tokens_seen": 24731344, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 1.0916816894154004, |
|
"grad_norm": 6.169752493978822, |
|
"learning_rate": 3.3995200004502816e-05, |
|
"loss": 0.4297, |
|
"num_input_tokens_seen": 24789832, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 1.0942570177697657, |
|
"grad_norm": 4.238650399680663, |
|
"learning_rate": 3.3765026539765834e-05, |
|
"loss": 0.3536, |
|
"num_input_tokens_seen": 24848264, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 1.0968323461241307, |
|
"grad_norm": 5.445173229006411, |
|
"learning_rate": 3.3535237034545675e-05, |
|
"loss": 0.3588, |
|
"num_input_tokens_seen": 24906744, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 1.099407674478496, |
|
"grad_norm": 4.508587102151408, |
|
"learning_rate": 3.330583692339802e-05, |
|
"loss": 0.3666, |
|
"num_input_tokens_seen": 24965256, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 1.1019830028328612, |
|
"grad_norm": 5.836654544282574, |
|
"learning_rate": 3.307683163166934e-05, |
|
"loss": 0.3334, |
|
"num_input_tokens_seen": 25023768, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 1.1045583311872265, |
|
"grad_norm": 6.855334175793522, |
|
"learning_rate": 3.284822657536856e-05, |
|
"loss": 0.3848, |
|
"num_input_tokens_seen": 25082248, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 1.1071336595415915, |
|
"grad_norm": 5.3006438448712565, |
|
"learning_rate": 3.262002716103897e-05, |
|
"loss": 0.3699, |
|
"num_input_tokens_seen": 25140752, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 1.1071336595415915, |
|
"eval_loss": 0.8523861169815063, |
|
"eval_runtime": 16.0023, |
|
"eval_samples_per_second": 3.749, |
|
"eval_steps_per_second": 0.937, |
|
"num_input_tokens_seen": 25140752, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 1.1097089878959567, |
|
"grad_norm": 3.943124296473041, |
|
"learning_rate": 3.2392238785630386e-05, |
|
"loss": 0.3154, |
|
"num_input_tokens_seen": 25199208, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 1.112284316250322, |
|
"grad_norm": 8.398532132538953, |
|
"learning_rate": 3.216486683637146e-05, |
|
"loss": 0.3915, |
|
"num_input_tokens_seen": 25257680, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 1.114859644604687, |
|
"grad_norm": 4.081633194377614, |
|
"learning_rate": 3.1937916690642356e-05, |
|
"loss": 0.3675, |
|
"num_input_tokens_seen": 25316200, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 1.1174349729590523, |
|
"grad_norm": 6.920842495491902, |
|
"learning_rate": 3.1711393715847476e-05, |
|
"loss": 0.4047, |
|
"num_input_tokens_seen": 25374656, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 1.1200103013134175, |
|
"grad_norm": 8.460113153700512, |
|
"learning_rate": 3.14853032692886e-05, |
|
"loss": 0.4155, |
|
"num_input_tokens_seen": 25433168, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 1.1225856296677827, |
|
"grad_norm": 9.825074199159944, |
|
"learning_rate": 3.125965069803811e-05, |
|
"loss": 0.3966, |
|
"num_input_tokens_seen": 25491664, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 1.1251609580221478, |
|
"grad_norm": 5.732206927543506, |
|
"learning_rate": 3.103444133881261e-05, |
|
"loss": 0.3068, |
|
"num_input_tokens_seen": 25550128, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 1.127736286376513, |
|
"grad_norm": 6.135036052058211, |
|
"learning_rate": 3.080968051784666e-05, |
|
"loss": 0.386, |
|
"num_input_tokens_seen": 25608624, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 1.1303116147308783, |
|
"grad_norm": 3.31420885852192, |
|
"learning_rate": 3.058537355076683e-05, |
|
"loss": 0.3898, |
|
"num_input_tokens_seen": 25667128, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 1.1328869430852433, |
|
"grad_norm": 8.182546413863832, |
|
"learning_rate": 3.0361525742465973e-05, |
|
"loss": 0.4016, |
|
"num_input_tokens_seen": 25725560, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.1328869430852433, |
|
"eval_loss": 0.7534744143486023, |
|
"eval_runtime": 15.969, |
|
"eval_samples_per_second": 3.757, |
|
"eval_steps_per_second": 0.939, |
|
"num_input_tokens_seen": 25725560, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.1354622714396085, |
|
"grad_norm": 4.616007617470174, |
|
"learning_rate": 3.0138142386977787e-05, |
|
"loss": 0.3465, |
|
"num_input_tokens_seen": 25784048, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 1.1380375997939738, |
|
"grad_norm": 4.752551024155875, |
|
"learning_rate": 2.991522876735154e-05, |
|
"loss": 0.3077, |
|
"num_input_tokens_seen": 25842512, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 1.140612928148339, |
|
"grad_norm": 6.021213921198953, |
|
"learning_rate": 2.9692790155527227e-05, |
|
"loss": 0.4497, |
|
"num_input_tokens_seen": 25900992, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 1.143188256502704, |
|
"grad_norm": 8.098592782255322, |
|
"learning_rate": 2.9470831812210837e-05, |
|
"loss": 0.3811, |
|
"num_input_tokens_seen": 25959448, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 1.1457635848570693, |
|
"grad_norm": 6.108837560432838, |
|
"learning_rate": 2.924935898674992e-05, |
|
"loss": 0.4053, |
|
"num_input_tokens_seen": 26017936, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 1.1483389132114346, |
|
"grad_norm": 7.709937017464705, |
|
"learning_rate": 2.902837691700945e-05, |
|
"loss": 0.3421, |
|
"num_input_tokens_seen": 26076440, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 1.1509142415657996, |
|
"grad_norm": 3.840146275079161, |
|
"learning_rate": 2.880789082924798e-05, |
|
"loss": 0.3228, |
|
"num_input_tokens_seen": 26134896, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 1.1534895699201648, |
|
"grad_norm": 6.088757703790803, |
|
"learning_rate": 2.858790593799405e-05, |
|
"loss": 0.3695, |
|
"num_input_tokens_seen": 26193368, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 1.15606489827453, |
|
"grad_norm": 3.8647543120940844, |
|
"learning_rate": 2.8368427445922696e-05, |
|
"loss": 0.3463, |
|
"num_input_tokens_seen": 26251848, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 1.158640226628895, |
|
"grad_norm": 4.425454601086007, |
|
"learning_rate": 2.8149460543732664e-05, |
|
"loss": 0.3442, |
|
"num_input_tokens_seen": 26310336, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 1.158640226628895, |
|
"eval_loss": 0.7066138386726379, |
|
"eval_runtime": 15.9558, |
|
"eval_samples_per_second": 3.76, |
|
"eval_steps_per_second": 0.94, |
|
"num_input_tokens_seen": 26310336, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 1.1612155549832603, |
|
"grad_norm": 6.312367706992343, |
|
"learning_rate": 2.7931010410023518e-05, |
|
"loss": 0.3547, |
|
"num_input_tokens_seen": 26368840, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 1.1637908833376256, |
|
"grad_norm": 6.429493717694784, |
|
"learning_rate": 2.771308221117309e-05, |
|
"loss": 0.3125, |
|
"num_input_tokens_seen": 26427280, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 1.1663662116919906, |
|
"grad_norm": 6.993677707266103, |
|
"learning_rate": 2.749568110121545e-05, |
|
"loss": 0.3521, |
|
"num_input_tokens_seen": 26485760, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 1.1689415400463559, |
|
"grad_norm": 5.03743116566882, |
|
"learning_rate": 2.7278812221718924e-05, |
|
"loss": 0.281, |
|
"num_input_tokens_seen": 26544224, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 1.1715168684007211, |
|
"grad_norm": 5.828198718501714, |
|
"learning_rate": 2.7062480701664488e-05, |
|
"loss": 0.3653, |
|
"num_input_tokens_seen": 26602712, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 1.1740921967550864, |
|
"grad_norm": 6.1247491578050655, |
|
"learning_rate": 2.6846691657324473e-05, |
|
"loss": 0.3964, |
|
"num_input_tokens_seen": 26661160, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 1.1766675251094514, |
|
"grad_norm": 6.231155247277189, |
|
"learning_rate": 2.663145019214163e-05, |
|
"loss": 0.3119, |
|
"num_input_tokens_seen": 26719648, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 1.1792428534638166, |
|
"grad_norm": 6.501604840456734, |
|
"learning_rate": 2.6416761396608362e-05, |
|
"loss": 0.3832, |
|
"num_input_tokens_seen": 26778112, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 1.1818181818181819, |
|
"grad_norm": 5.377003761278013, |
|
"learning_rate": 2.6202630348146324e-05, |
|
"loss": 0.3277, |
|
"num_input_tokens_seen": 26836592, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 1.184393510172547, |
|
"grad_norm": 4.826044073542379, |
|
"learning_rate": 2.598906211098643e-05, |
|
"loss": 0.3877, |
|
"num_input_tokens_seen": 26895096, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.184393510172547, |
|
"eval_loss": 0.727741539478302, |
|
"eval_runtime": 15.9289, |
|
"eval_samples_per_second": 3.767, |
|
"eval_steps_per_second": 0.942, |
|
"num_input_tokens_seen": 26895096, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.1869688385269122, |
|
"grad_norm": 6.370847827905799, |
|
"learning_rate": 2.577606173604894e-05, |
|
"loss": 0.3033, |
|
"num_input_tokens_seen": 26953560, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 1.1895441668812774, |
|
"grad_norm": 11.746077197029585, |
|
"learning_rate": 2.5563634260824175e-05, |
|
"loss": 0.4104, |
|
"num_input_tokens_seen": 27012024, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 1.1921194952356426, |
|
"grad_norm": 3.9544988689102762, |
|
"learning_rate": 2.535178470925323e-05, |
|
"loss": 0.3447, |
|
"num_input_tokens_seen": 27070520, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 1.1946948235900077, |
|
"grad_norm": 4.72491689052158, |
|
"learning_rate": 2.5140518091609256e-05, |
|
"loss": 0.2882, |
|
"num_input_tokens_seen": 27128984, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 1.197270151944373, |
|
"grad_norm": 2.1806068747411245, |
|
"learning_rate": 2.4929839404378936e-05, |
|
"loss": 0.2817, |
|
"num_input_tokens_seen": 27187432, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 1.1998454802987382, |
|
"grad_norm": 3.2798105115490745, |
|
"learning_rate": 2.471975363014428e-05, |
|
"loss": 0.3693, |
|
"num_input_tokens_seen": 27245920, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 1.2024208086531032, |
|
"grad_norm": 7.472396523773262, |
|
"learning_rate": 2.451026573746482e-05, |
|
"loss": 0.3587, |
|
"num_input_tokens_seen": 27304384, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 1.2049961370074684, |
|
"grad_norm": 6.7073623181550275, |
|
"learning_rate": 2.430138068076013e-05, |
|
"loss": 0.354, |
|
"num_input_tokens_seen": 27362864, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 1.2075714653618337, |
|
"grad_norm": 6.2693798293878515, |
|
"learning_rate": 2.4093103400192625e-05, |
|
"loss": 0.3209, |
|
"num_input_tokens_seen": 27421360, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 1.210146793716199, |
|
"grad_norm": 6.606866726236357, |
|
"learning_rate": 2.388543882155067e-05, |
|
"loss": 0.3871, |
|
"num_input_tokens_seen": 27479840, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 1.210146793716199, |
|
"eval_loss": 0.7659633755683899, |
|
"eval_runtime": 16.0101, |
|
"eval_samples_per_second": 3.748, |
|
"eval_steps_per_second": 0.937, |
|
"num_input_tokens_seen": 27479840, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 1.212722122070564, |
|
"grad_norm": 8.004400275953609, |
|
"learning_rate": 2.3678391856132204e-05, |
|
"loss": 0.352, |
|
"num_input_tokens_seen": 27538344, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 1.2152974504249292, |
|
"grad_norm": 8.385547193425513, |
|
"learning_rate": 2.3471967400628513e-05, |
|
"loss": 0.347, |
|
"num_input_tokens_seen": 27596808, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 1.2178727787792945, |
|
"grad_norm": 3.9234442237475435, |
|
"learning_rate": 2.3266170337008398e-05, |
|
"loss": 0.3667, |
|
"num_input_tokens_seen": 27655272, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 1.2204481071336595, |
|
"grad_norm": 6.584480429736488, |
|
"learning_rate": 2.306100553240274e-05, |
|
"loss": 0.3311, |
|
"num_input_tokens_seen": 27713784, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 1.2230234354880247, |
|
"grad_norm": 5.791637874835276, |
|
"learning_rate": 2.2856477838989456e-05, |
|
"loss": 0.2964, |
|
"num_input_tokens_seen": 27772248, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 1.22559876384239, |
|
"grad_norm": 5.663503226529594, |
|
"learning_rate": 2.2652592093878666e-05, |
|
"loss": 0.3683, |
|
"num_input_tokens_seen": 27830704, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 1.228174092196755, |
|
"grad_norm": 9.657080260273457, |
|
"learning_rate": 2.244935311899829e-05, |
|
"loss": 0.3819, |
|
"num_input_tokens_seen": 27889160, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 1.2307494205511202, |
|
"grad_norm": 4.757552901440964, |
|
"learning_rate": 2.224676572098007e-05, |
|
"loss": 0.3084, |
|
"num_input_tokens_seen": 27947608, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 1.2333247489054855, |
|
"grad_norm": 5.188072586185411, |
|
"learning_rate": 2.2044834691045873e-05, |
|
"loss": 0.4267, |
|
"num_input_tokens_seen": 28006112, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 1.2359000772598505, |
|
"grad_norm": 7.221389028269126, |
|
"learning_rate": 2.184356480489432e-05, |
|
"loss": 0.3486, |
|
"num_input_tokens_seen": 28064552, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.2359000772598505, |
|
"eval_loss": 0.7410638928413391, |
|
"eval_runtime": 15.945, |
|
"eval_samples_per_second": 3.763, |
|
"eval_steps_per_second": 0.941, |
|
"num_input_tokens_seen": 28064552, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.2384754056142158, |
|
"grad_norm": 4.430659190759614, |
|
"learning_rate": 2.1642960822587878e-05, |
|
"loss": 0.2416, |
|
"num_input_tokens_seen": 28123016, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 1.241050733968581, |
|
"grad_norm": 4.985077238748084, |
|
"learning_rate": 2.1443027488440338e-05, |
|
"loss": 0.3007, |
|
"num_input_tokens_seen": 28181464, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 1.2436260623229463, |
|
"grad_norm": 11.21074775906945, |
|
"learning_rate": 2.124376953090456e-05, |
|
"loss": 0.2655, |
|
"num_input_tokens_seen": 28239920, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 1.2462013906773113, |
|
"grad_norm": 6.8116545197169724, |
|
"learning_rate": 2.104519166246059e-05, |
|
"loss": 0.3075, |
|
"num_input_tokens_seen": 28298432, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 1.2487767190316765, |
|
"grad_norm": 10.87615610006345, |
|
"learning_rate": 2.0847298579504344e-05, |
|
"loss": 0.3537, |
|
"num_input_tokens_seen": 28356904, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 1.2513520473860418, |
|
"grad_norm": 3.9413743825159133, |
|
"learning_rate": 2.065009496223638e-05, |
|
"loss": 0.2993, |
|
"num_input_tokens_seen": 28415384, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 1.2539273757404068, |
|
"grad_norm": 3.3043013555966407, |
|
"learning_rate": 2.045358547455138e-05, |
|
"loss": 0.2752, |
|
"num_input_tokens_seen": 28473848, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 1.256502704094772, |
|
"grad_norm": 3.6641007142438338, |
|
"learning_rate": 2.0257774763927655e-05, |
|
"loss": 0.2975, |
|
"num_input_tokens_seen": 28532312, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 1.2590780324491373, |
|
"grad_norm": 6.306122720573227, |
|
"learning_rate": 2.0062667461317426e-05, |
|
"loss": 0.4051, |
|
"num_input_tokens_seen": 28590784, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 1.2616533608035025, |
|
"grad_norm": 4.823015256168698, |
|
"learning_rate": 1.9868268181037185e-05, |
|
"loss": 0.2966, |
|
"num_input_tokens_seen": 28649256, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 1.2616533608035025, |
|
"eval_loss": 0.7485548853874207, |
|
"eval_runtime": 16.0437, |
|
"eval_samples_per_second": 3.74, |
|
"eval_steps_per_second": 0.935, |
|
"num_input_tokens_seen": 28649256, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 1.2642286891578676, |
|
"grad_norm": 10.005201788297592, |
|
"learning_rate": 1.967458152065857e-05, |
|
"loss": 0.2664, |
|
"num_input_tokens_seen": 28707736, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 1.2668040175122328, |
|
"grad_norm": 4.744134155404128, |
|
"learning_rate": 1.9481612060899646e-05, |
|
"loss": 0.3692, |
|
"num_input_tokens_seen": 28766232, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 1.269379345866598, |
|
"grad_norm": 8.49200897563331, |
|
"learning_rate": 1.928936436551661e-05, |
|
"loss": 0.315, |
|
"num_input_tokens_seen": 28824688, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 1.271954674220963, |
|
"grad_norm": 5.112500789477909, |
|
"learning_rate": 1.9097842981195834e-05, |
|
"loss": 0.3536, |
|
"num_input_tokens_seen": 28883176, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 1.2745300025753283, |
|
"grad_norm": 4.93472430343828, |
|
"learning_rate": 1.8907052437446272e-05, |
|
"loss": 0.3143, |
|
"num_input_tokens_seen": 28941592, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 1.2771053309296936, |
|
"grad_norm": 4.6754631245280365, |
|
"learning_rate": 1.871699724649244e-05, |
|
"loss": 0.3114, |
|
"num_input_tokens_seen": 29000064, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 1.2796806592840588, |
|
"grad_norm": 7.198381813960669, |
|
"learning_rate": 1.8527681903167644e-05, |
|
"loss": 0.3327, |
|
"num_input_tokens_seen": 29058496, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 1.2822559876384239, |
|
"grad_norm": 9.221713217692685, |
|
"learning_rate": 1.833911088480767e-05, |
|
"loss": 0.2543, |
|
"num_input_tokens_seen": 29116992, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 1.284831315992789, |
|
"grad_norm": 8.499870267936974, |
|
"learning_rate": 1.8151288651144893e-05, |
|
"loss": 0.2854, |
|
"num_input_tokens_seen": 29175496, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 1.2874066443471541, |
|
"grad_norm": 4.289294450742717, |
|
"learning_rate": 1.796421964420285e-05, |
|
"loss": 0.3221, |
|
"num_input_tokens_seen": 29233968, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.2874066443471541, |
|
"eval_loss": 0.7222262620925903, |
|
"eval_runtime": 16.106, |
|
"eval_samples_per_second": 3.725, |
|
"eval_steps_per_second": 0.931, |
|
"num_input_tokens_seen": 29233968, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.2899819727015194, |
|
"grad_norm": 3.3788238852269035, |
|
"learning_rate": 1.7777908288191176e-05, |
|
"loss": 0.2344, |
|
"num_input_tokens_seen": 29292464, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 1.2925573010558846, |
|
"grad_norm": 9.201457612553746, |
|
"learning_rate": 1.7592358989400883e-05, |
|
"loss": 0.2727, |
|
"num_input_tokens_seen": 29350952, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 1.2951326294102499, |
|
"grad_norm": 4.626370050462018, |
|
"learning_rate": 1.740757613610028e-05, |
|
"loss": 0.2687, |
|
"num_input_tokens_seen": 29409432, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 1.2977079577646151, |
|
"grad_norm": 5.784936514951468, |
|
"learning_rate": 1.7223564098431067e-05, |
|
"loss": 0.2632, |
|
"num_input_tokens_seen": 29467880, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 1.3002832861189801, |
|
"grad_norm": 4.405244480948001, |
|
"learning_rate": 1.704032722830512e-05, |
|
"loss": 0.3057, |
|
"num_input_tokens_seen": 29526384, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 1.3028586144733454, |
|
"grad_norm": 7.8069578913798825, |
|
"learning_rate": 1.68578698593014e-05, |
|
"loss": 0.3054, |
|
"num_input_tokens_seen": 29584880, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 1.3054339428277104, |
|
"grad_norm": 6.957468356582848, |
|
"learning_rate": 1.6676196306563613e-05, |
|
"loss": 0.28, |
|
"num_input_tokens_seen": 29643344, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 1.3080092711820757, |
|
"grad_norm": 9.353535349996537, |
|
"learning_rate": 1.6495310866698093e-05, |
|
"loss": 0.3169, |
|
"num_input_tokens_seen": 29701864, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 1.310584599536441, |
|
"grad_norm": 5.246799138683368, |
|
"learning_rate": 1.631521781767214e-05, |
|
"loss": 0.2985, |
|
"num_input_tokens_seen": 29760376, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 1.3131599278908062, |
|
"grad_norm": 10.51357763616516, |
|
"learning_rate": 1.6135921418712956e-05, |
|
"loss": 0.3231, |
|
"num_input_tokens_seen": 29818856, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 1.3131599278908062, |
|
"eval_loss": 0.7146337628364563, |
|
"eval_runtime": 16.171, |
|
"eval_samples_per_second": 3.71, |
|
"eval_steps_per_second": 0.928, |
|
"num_input_tokens_seen": 29818856, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 1.3157352562451712, |
|
"grad_norm": 5.568529968511631, |
|
"learning_rate": 1.5957425910206785e-05, |
|
"loss": 0.2689, |
|
"num_input_tokens_seen": 29877288, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 1.3183105845995364, |
|
"grad_norm": 4.860244764698273, |
|
"learning_rate": 1.577973551359877e-05, |
|
"loss": 0.3889, |
|
"num_input_tokens_seen": 29935776, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 1.3208859129539017, |
|
"grad_norm": 4.938342083847672, |
|
"learning_rate": 1.560285443129296e-05, |
|
"loss": 0.2489, |
|
"num_input_tokens_seen": 29994232, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 1.3234612413082667, |
|
"grad_norm": 7.223451539163128, |
|
"learning_rate": 1.542678684655306e-05, |
|
"loss": 0.3016, |
|
"num_input_tokens_seen": 30052760, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 1.326036569662632, |
|
"grad_norm": 8.03849051806361, |
|
"learning_rate": 1.5251536923403426e-05, |
|
"loss": 0.3063, |
|
"num_input_tokens_seen": 30111200, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 1.3286118980169972, |
|
"grad_norm": 6.631117517846943, |
|
"learning_rate": 1.5077108806530581e-05, |
|
"loss": 0.3159, |
|
"num_input_tokens_seen": 30169680, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 1.3311872263713624, |
|
"grad_norm": 4.171513219192368, |
|
"learning_rate": 1.4903506621185192e-05, |
|
"loss": 0.3752, |
|
"num_input_tokens_seen": 30228176, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 1.3337625547257275, |
|
"grad_norm": 4.3829742543964985, |
|
"learning_rate": 1.4730734473084568e-05, |
|
"loss": 0.3207, |
|
"num_input_tokens_seen": 30286656, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 1.3363378830800927, |
|
"grad_norm": 7.160007281376411, |
|
"learning_rate": 1.4558796448315504e-05, |
|
"loss": 0.2928, |
|
"num_input_tokens_seen": 30345160, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 1.338913211434458, |
|
"grad_norm": 7.316812087176357, |
|
"learning_rate": 1.4387696613237612e-05, |
|
"loss": 0.2779, |
|
"num_input_tokens_seen": 30403640, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.338913211434458, |
|
"eval_loss": 0.695651650428772, |
|
"eval_runtime": 16.2569, |
|
"eval_samples_per_second": 3.691, |
|
"eval_steps_per_second": 0.923, |
|
"num_input_tokens_seen": 30403640, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.341488539788823, |
|
"grad_norm": 6.900087606750275, |
|
"learning_rate": 1.4217439014387251e-05, |
|
"loss": 0.3037, |
|
"num_input_tokens_seen": 30462128, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 1.3440638681431882, |
|
"grad_norm": 9.361737062462586, |
|
"learning_rate": 1.404802767838176e-05, |
|
"loss": 0.2905, |
|
"num_input_tokens_seen": 30520616, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 1.3466391964975535, |
|
"grad_norm": 11.101564672040755, |
|
"learning_rate": 1.3879466611824199e-05, |
|
"loss": 0.317, |
|
"num_input_tokens_seen": 30579024, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 1.3492145248519187, |
|
"grad_norm": 5.213355428878847, |
|
"learning_rate": 1.371175980120864e-05, |
|
"loss": 0.2794, |
|
"num_input_tokens_seen": 30637464, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 1.3517898532062838, |
|
"grad_norm": 4.8688198861459915, |
|
"learning_rate": 1.3544911212825906e-05, |
|
"loss": 0.3056, |
|
"num_input_tokens_seen": 30695936, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 1.354365181560649, |
|
"grad_norm": 9.002025840794365, |
|
"learning_rate": 1.337892479266974e-05, |
|
"loss": 0.2712, |
|
"num_input_tokens_seen": 30754408, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 1.356940509915014, |
|
"grad_norm": 4.793656741683869, |
|
"learning_rate": 1.3213804466343421e-05, |
|
"loss": 0.2615, |
|
"num_input_tokens_seen": 30812848, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 1.3595158382693793, |
|
"grad_norm": 5.128300113893045, |
|
"learning_rate": 1.3049554138967051e-05, |
|
"loss": 0.2661, |
|
"num_input_tokens_seen": 30871344, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 1.3620911666237445, |
|
"grad_norm": 6.038434247454305, |
|
"learning_rate": 1.2886177695085078e-05, |
|
"loss": 0.3272, |
|
"num_input_tokens_seen": 30929824, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 1.3646664949781098, |
|
"grad_norm": 5.501317116522042, |
|
"learning_rate": 1.2723678998574512e-05, |
|
"loss": 0.2962, |
|
"num_input_tokens_seen": 30988344, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 1.3646664949781098, |
|
"eval_loss": 0.7657458186149597, |
|
"eval_runtime": 16.0821, |
|
"eval_samples_per_second": 3.731, |
|
"eval_steps_per_second": 0.933, |
|
"num_input_tokens_seen": 30988344, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 1.367241823332475, |
|
"grad_norm": 5.445887797084714, |
|
"learning_rate": 1.2562061892553473e-05, |
|
"loss": 0.3207, |
|
"num_input_tokens_seen": 31046848, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 1.36981715168684, |
|
"grad_norm": 8.28343197617098, |
|
"learning_rate": 1.2401330199290367e-05, |
|
"loss": 0.3001, |
|
"num_input_tokens_seen": 31105352, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 1.3723924800412053, |
|
"grad_norm": 6.0349779847885054, |
|
"learning_rate": 1.224148772011346e-05, |
|
"loss": 0.2858, |
|
"num_input_tokens_seen": 31163848, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 1.3749678083955703, |
|
"grad_norm": 6.430225669948217, |
|
"learning_rate": 1.2082538235320929e-05, |
|
"loss": 0.2338, |
|
"num_input_tokens_seen": 31222360, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 1.3775431367499356, |
|
"grad_norm": 7.550675916086161, |
|
"learning_rate": 1.1924485504091565e-05, |
|
"loss": 0.2212, |
|
"num_input_tokens_seen": 31280840, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 1.3801184651043008, |
|
"grad_norm": 9.927835245980713, |
|
"learning_rate": 1.1767333264395736e-05, |
|
"loss": 0.3131, |
|
"num_input_tokens_seen": 31339264, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 1.382693793458666, |
|
"grad_norm": 6.940248775417007, |
|
"learning_rate": 1.1611085232907132e-05, |
|
"loss": 0.3616, |
|
"num_input_tokens_seen": 31397744, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 1.385269121813031, |
|
"grad_norm": 13.50108715364713, |
|
"learning_rate": 1.14557451049147e-05, |
|
"loss": 0.3153, |
|
"num_input_tokens_seen": 31456240, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 1.3878444501673963, |
|
"grad_norm": 5.379761157260886, |
|
"learning_rate": 1.1301316554235397e-05, |
|
"loss": 0.3044, |
|
"num_input_tokens_seen": 31514744, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 1.3904197785217616, |
|
"grad_norm": 6.480605347127299, |
|
"learning_rate": 1.114780323312724e-05, |
|
"loss": 0.3163, |
|
"num_input_tokens_seen": 31573240, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.3904197785217616, |
|
"eval_loss": 0.7473158240318298, |
|
"eval_runtime": 16.166, |
|
"eval_samples_per_second": 3.711, |
|
"eval_steps_per_second": 0.928, |
|
"num_input_tokens_seen": 31573240, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.3929951068761266, |
|
"grad_norm": 4.579483859059419, |
|
"learning_rate": 1.0995208772202897e-05, |
|
"loss": 0.2798, |
|
"num_input_tokens_seen": 31631688, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 1.3955704352304918, |
|
"grad_norm": 6.098482033036635, |
|
"learning_rate": 1.0843536780343865e-05, |
|
"loss": 0.289, |
|
"num_input_tokens_seen": 31690200, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 1.398145763584857, |
|
"grad_norm": 9.834029857293697, |
|
"learning_rate": 1.069279084461513e-05, |
|
"loss": 0.2844, |
|
"num_input_tokens_seen": 31748664, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 1.4007210919392223, |
|
"grad_norm": 9.387518267357049, |
|
"learning_rate": 1.0542974530180327e-05, |
|
"loss": 0.3254, |
|
"num_input_tokens_seen": 31807176, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 1.4032964202935874, |
|
"grad_norm": 5.648695214602192, |
|
"learning_rate": 1.0394091380217352e-05, |
|
"loss": 0.3683, |
|
"num_input_tokens_seen": 31865696, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 1.4058717486479526, |
|
"grad_norm": 5.202858729177478, |
|
"learning_rate": 1.0246144915834683e-05, |
|
"loss": 0.2968, |
|
"num_input_tokens_seen": 31924200, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 1.4084470770023179, |
|
"grad_norm": 4.808429946385537, |
|
"learning_rate": 1.0099138635988026e-05, |
|
"loss": 0.2943, |
|
"num_input_tokens_seen": 31982712, |
|
"step": 2735 |
|
}, |
|
{ |
|
"epoch": 1.4110224053566829, |
|
"grad_norm": 5.094039780174813, |
|
"learning_rate": 9.953076017397578e-06, |
|
"loss": 0.3037, |
|
"num_input_tokens_seen": 32041176, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 1.4135977337110481, |
|
"grad_norm": 5.807237736394797, |
|
"learning_rate": 9.807960514465792e-06, |
|
"loss": 0.3019, |
|
"num_input_tokens_seen": 32099656, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 1.4161730620654134, |
|
"grad_norm": 6.27488451409393, |
|
"learning_rate": 9.663795559195733e-06, |
|
"loss": 0.164, |
|
"num_input_tokens_seen": 32158144, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 1.4161730620654134, |
|
"eval_loss": 0.7807286381721497, |
|
"eval_runtime": 16.139, |
|
"eval_samples_per_second": 3.718, |
|
"eval_steps_per_second": 0.929, |
|
"num_input_tokens_seen": 32158144, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 1.4187483904197786, |
|
"grad_norm": 6.584628814510667, |
|
"learning_rate": 9.520584561109864e-06, |
|
"loss": 0.3333, |
|
"num_input_tokens_seen": 32216656, |
|
"step": 2755 |
|
}, |
|
{ |
|
"epoch": 1.4213237187741437, |
|
"grad_norm": 7.509676086247465, |
|
"learning_rate": 9.378330907169386e-06, |
|
"loss": 0.2993, |
|
"num_input_tokens_seen": 32275168, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 1.423899047128509, |
|
"grad_norm": 5.1775193353141535, |
|
"learning_rate": 9.237037961694223e-06, |
|
"loss": 0.2683, |
|
"num_input_tokens_seen": 32333664, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 1.4264743754828741, |
|
"grad_norm": 7.856433365965151, |
|
"learning_rate": 9.096709066283354e-06, |
|
"loss": 0.3145, |
|
"num_input_tokens_seen": 32392088, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 1.4290497038372392, |
|
"grad_norm": 8.252870521534577, |
|
"learning_rate": 8.957347539735872e-06, |
|
"loss": 0.3092, |
|
"num_input_tokens_seen": 32450584, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 1.4316250321916044, |
|
"grad_norm": 9.74883489294415, |
|
"learning_rate": 8.818956677972406e-06, |
|
"loss": 0.2993, |
|
"num_input_tokens_seen": 32509096, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 1.4342003605459697, |
|
"grad_norm": 4.008158818829899, |
|
"learning_rate": 8.681539753957269e-06, |
|
"loss": 0.326, |
|
"num_input_tokens_seen": 32567560, |
|
"step": 2785 |
|
}, |
|
{ |
|
"epoch": 1.436775688900335, |
|
"grad_norm": 3.4229494980881174, |
|
"learning_rate": 8.545100017620988e-06, |
|
"loss": 0.2494, |
|
"num_input_tokens_seen": 32626056, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 1.4393510172547, |
|
"grad_norm": 4.425295787830864, |
|
"learning_rate": 8.409640695783443e-06, |
|
"loss": 0.2691, |
|
"num_input_tokens_seen": 32684520, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 1.4419263456090652, |
|
"grad_norm": 5.132559476583136, |
|
"learning_rate": 8.275164992077556e-06, |
|
"loss": 0.2939, |
|
"num_input_tokens_seen": 32743032, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.4419263456090652, |
|
"eval_loss": 0.791334331035614, |
|
"eval_runtime": 16.1142, |
|
"eval_samples_per_second": 3.723, |
|
"eval_steps_per_second": 0.931, |
|
"num_input_tokens_seen": 32743032, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.4445016739634302, |
|
"grad_norm": 4.932628514942533, |
|
"learning_rate": 8.141676086873572e-06, |
|
"loss": 0.2974, |
|
"num_input_tokens_seen": 32801504, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 1.4470770023177955, |
|
"grad_norm": 8.764444587690557, |
|
"learning_rate": 8.009177137203794e-06, |
|
"loss": 0.2849, |
|
"num_input_tokens_seen": 32860032, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 1.4496523306721607, |
|
"grad_norm": 5.502098759051231, |
|
"learning_rate": 7.877671276687898e-06, |
|
"loss": 0.3024, |
|
"num_input_tokens_seen": 32918472, |
|
"step": 2815 |
|
}, |
|
{ |
|
"epoch": 1.452227659026526, |
|
"grad_norm": 3.2634043608450183, |
|
"learning_rate": 7.747161615458902e-06, |
|
"loss": 0.2565, |
|
"num_input_tokens_seen": 32976944, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 1.4548029873808912, |
|
"grad_norm": 4.852977750360098, |
|
"learning_rate": 7.617651240089546e-06, |
|
"loss": 0.2473, |
|
"num_input_tokens_seen": 33035424, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 1.4573783157352562, |
|
"grad_norm": 8.667293936674204, |
|
"learning_rate": 7.489143213519301e-06, |
|
"loss": 0.3118, |
|
"num_input_tokens_seen": 33093880, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 1.4599536440896215, |
|
"grad_norm": 9.253351843058615, |
|
"learning_rate": 7.361640574981937e-06, |
|
"loss": 0.2593, |
|
"num_input_tokens_seen": 33152328, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 1.4625289724439865, |
|
"grad_norm": 6.811131820051524, |
|
"learning_rate": 7.2351463399336735e-06, |
|
"loss": 0.284, |
|
"num_input_tokens_seen": 33210816, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 1.4651043007983517, |
|
"grad_norm": 4.086720732934785, |
|
"learning_rate": 7.109663499981834e-06, |
|
"loss": 0.2671, |
|
"num_input_tokens_seen": 33269320, |
|
"step": 2845 |
|
}, |
|
{ |
|
"epoch": 1.467679629152717, |
|
"grad_norm": 9.463519299706055, |
|
"learning_rate": 6.985195022814067e-06, |
|
"loss": 0.2848, |
|
"num_input_tokens_seen": 33327720, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 1.467679629152717, |
|
"eval_loss": 0.8045337796211243, |
|
"eval_runtime": 15.9996, |
|
"eval_samples_per_second": 3.75, |
|
"eval_steps_per_second": 0.938, |
|
"num_input_tokens_seen": 33327720, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 1.4702549575070822, |
|
"grad_norm": 6.856320486947826, |
|
"learning_rate": 6.861743852128233e-06, |
|
"loss": 0.2811, |
|
"num_input_tokens_seen": 33386160, |
|
"step": 2855 |
|
}, |
|
{ |
|
"epoch": 1.4728302858614473, |
|
"grad_norm": 8.133776634702407, |
|
"learning_rate": 6.7393129075627335e-06, |
|
"loss": 0.2394, |
|
"num_input_tokens_seen": 33444648, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 1.4754056142158125, |
|
"grad_norm": 5.884612144672532, |
|
"learning_rate": 6.6179050846274515e-06, |
|
"loss": 0.243, |
|
"num_input_tokens_seen": 33503144, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 1.4779809425701778, |
|
"grad_norm": 7.133095118516192, |
|
"learning_rate": 6.497523254635296e-06, |
|
"loss": 0.242, |
|
"num_input_tokens_seen": 33561600, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 1.4805562709245428, |
|
"grad_norm": 3.725193081900286, |
|
"learning_rate": 6.37817026463432e-06, |
|
"loss": 0.1864, |
|
"num_input_tokens_seen": 33620056, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 1.483131599278908, |
|
"grad_norm": 5.26408055314188, |
|
"learning_rate": 6.25984893734034e-06, |
|
"loss": 0.2406, |
|
"num_input_tokens_seen": 33678512, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 1.4857069276332733, |
|
"grad_norm": 5.139938399894378, |
|
"learning_rate": 6.142562071070179e-06, |
|
"loss": 0.2287, |
|
"num_input_tokens_seen": 33736960, |
|
"step": 2885 |
|
}, |
|
{ |
|
"epoch": 1.4882822559876385, |
|
"grad_norm": 5.551633292498772, |
|
"learning_rate": 6.026312439675552e-06, |
|
"loss": 0.2643, |
|
"num_input_tokens_seen": 33795416, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 1.4908575843420036, |
|
"grad_norm": 5.974549504189433, |
|
"learning_rate": 5.911102792477357e-06, |
|
"loss": 0.2956, |
|
"num_input_tokens_seen": 33853936, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 1.4934329126963688, |
|
"grad_norm": 5.786971041370645, |
|
"learning_rate": 5.796935854200763e-06, |
|
"loss": 0.29, |
|
"num_input_tokens_seen": 33912440, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.4934329126963688, |
|
"eval_loss": 0.8113046884536743, |
|
"eval_runtime": 16.0025, |
|
"eval_samples_per_second": 3.749, |
|
"eval_steps_per_second": 0.937, |
|
"num_input_tokens_seen": 33912440, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.496008241050734, |
|
"grad_norm": 5.559213288581127, |
|
"learning_rate": 5.683814324910685e-06, |
|
"loss": 0.2815, |
|
"num_input_tokens_seen": 33970888, |
|
"step": 2905 |
|
}, |
|
{ |
|
"epoch": 1.498583569405099, |
|
"grad_norm": 4.093818675769417, |
|
"learning_rate": 5.571740879947979e-06, |
|
"loss": 0.2737, |
|
"num_input_tokens_seen": 34029376, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 1.5011588977594643, |
|
"grad_norm": 3.092699650877493, |
|
"learning_rate": 5.4607181698661634e-06, |
|
"loss": 0.2445, |
|
"num_input_tokens_seen": 34087864, |
|
"step": 2915 |
|
}, |
|
{ |
|
"epoch": 1.5037342261138296, |
|
"grad_norm": 11.010380823046683, |
|
"learning_rate": 5.35074882036869e-06, |
|
"loss": 0.2802, |
|
"num_input_tokens_seen": 34146296, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 1.5063095544681948, |
|
"grad_norm": 6.09904123406433, |
|
"learning_rate": 5.241835432246889e-06, |
|
"loss": 0.2379, |
|
"num_input_tokens_seen": 34204800, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 1.5088848828225598, |
|
"grad_norm": 6.205588168386299, |
|
"learning_rate": 5.133980581318459e-06, |
|
"loss": 0.2783, |
|
"num_input_tokens_seen": 34263296, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 1.511460211176925, |
|
"grad_norm": 5.945749064464075, |
|
"learning_rate": 5.027186818366542e-06, |
|
"loss": 0.2609, |
|
"num_input_tokens_seen": 34321792, |
|
"step": 2935 |
|
}, |
|
{ |
|
"epoch": 1.51403553953129, |
|
"grad_norm": 6.50829738633896, |
|
"learning_rate": 4.921456669079366e-06, |
|
"loss": 0.2367, |
|
"num_input_tokens_seen": 34380264, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 1.5166108678856554, |
|
"grad_norm": 8.02525724539128, |
|
"learning_rate": 4.816792633990569e-06, |
|
"loss": 0.3644, |
|
"num_input_tokens_seen": 34438752, |
|
"step": 2945 |
|
}, |
|
{ |
|
"epoch": 1.5191861962400206, |
|
"grad_norm": 8.28398511184134, |
|
"learning_rate": 4.713197188420026e-06, |
|
"loss": 0.2494, |
|
"num_input_tokens_seen": 34497216, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 1.5191861962400206, |
|
"eval_loss": 0.8177086710929871, |
|
"eval_runtime": 16.0851, |
|
"eval_samples_per_second": 3.73, |
|
"eval_steps_per_second": 0.933, |
|
"num_input_tokens_seen": 34497216, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 1.5217615245943859, |
|
"grad_norm": 7.1653439027229, |
|
"learning_rate": 4.610672782415276e-06, |
|
"loss": 0.2892, |
|
"num_input_tokens_seen": 34555704, |
|
"step": 2955 |
|
}, |
|
{ |
|
"epoch": 1.524336852948751, |
|
"grad_norm": 5.9872264088640295, |
|
"learning_rate": 4.509221840693656e-06, |
|
"loss": 0.3006, |
|
"num_input_tokens_seen": 34614168, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 1.5269121813031161, |
|
"grad_norm": 3.47728801697101, |
|
"learning_rate": 4.408846762584901e-06, |
|
"loss": 0.2931, |
|
"num_input_tokens_seen": 34672624, |
|
"step": 2965 |
|
}, |
|
{ |
|
"epoch": 1.5294875096574814, |
|
"grad_norm": 5.342563435045045, |
|
"learning_rate": 4.309549921974421e-06, |
|
"loss": 0.2255, |
|
"num_input_tokens_seen": 34731056, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 1.5320628380118464, |
|
"grad_norm": 8.130368656554953, |
|
"learning_rate": 4.2113336672471245e-06, |
|
"loss": 0.2725, |
|
"num_input_tokens_seen": 34789552, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 1.5346381663662116, |
|
"grad_norm": 6.656792231449799, |
|
"learning_rate": 4.114200321231937e-06, |
|
"loss": 0.3158, |
|
"num_input_tokens_seen": 34848064, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 1.537213494720577, |
|
"grad_norm": 16.361277885783338, |
|
"learning_rate": 4.018152181146823e-06, |
|
"loss": 0.2562, |
|
"num_input_tokens_seen": 34906592, |
|
"step": 2985 |
|
}, |
|
{ |
|
"epoch": 1.5397888230749421, |
|
"grad_norm": 5.885778380254227, |
|
"learning_rate": 3.923191518544434e-06, |
|
"loss": 0.2814, |
|
"num_input_tokens_seen": 34965064, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 1.5423641514293074, |
|
"grad_norm": 7.567800102342742, |
|
"learning_rate": 3.829320579258466e-06, |
|
"loss": 0.2555, |
|
"num_input_tokens_seen": 35023552, |
|
"step": 2995 |
|
}, |
|
{ |
|
"epoch": 1.5449394797836724, |
|
"grad_norm": 6.846236051634878, |
|
"learning_rate": 3.7365415833504725e-06, |
|
"loss": 0.2259, |
|
"num_input_tokens_seen": 35082056, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.5449394797836724, |
|
"eval_loss": 0.8405727744102478, |
|
"eval_runtime": 16.2083, |
|
"eval_samples_per_second": 3.702, |
|
"eval_steps_per_second": 0.925, |
|
"num_input_tokens_seen": 35082056, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.5475148081380374, |
|
"grad_norm": 5.643348291984009, |
|
"learning_rate": 3.644856725057405e-06, |
|
"loss": 0.2157, |
|
"num_input_tokens_seen": 35140568, |
|
"step": 3005 |
|
}, |
|
{ |
|
"epoch": 1.5500901364924027, |
|
"grad_norm": 6.225693907549098, |
|
"learning_rate": 3.554268172739661e-06, |
|
"loss": 0.2233, |
|
"num_input_tokens_seen": 35199064, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 1.552665464846768, |
|
"grad_norm": 5.080945994557626, |
|
"learning_rate": 3.4647780688298826e-06, |
|
"loss": 0.2951, |
|
"num_input_tokens_seen": 35257576, |
|
"step": 3015 |
|
}, |
|
{ |
|
"epoch": 1.5552407932011332, |
|
"grad_norm": 5.263879934995459, |
|
"learning_rate": 3.376388529782215e-06, |
|
"loss": 0.2274, |
|
"num_input_tokens_seen": 35316064, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 1.5578161215554984, |
|
"grad_norm": 5.655349471422181, |
|
"learning_rate": 3.2891016460222967e-06, |
|
"loss": 0.2479, |
|
"num_input_tokens_seen": 35374504, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 1.5603914499098637, |
|
"grad_norm": 7.871895425892081, |
|
"learning_rate": 3.2029194818977983e-06, |
|
"loss": 0.292, |
|
"num_input_tokens_seen": 35432984, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 1.5629667782642287, |
|
"grad_norm": 6.441418084723481, |
|
"learning_rate": 3.117844075629617e-06, |
|
"loss": 0.241, |
|
"num_input_tokens_seen": 35491488, |
|
"step": 3035 |
|
}, |
|
{ |
|
"epoch": 1.5655421066185937, |
|
"grad_norm": 5.268339109046189, |
|
"learning_rate": 3.033877439263666e-06, |
|
"loss": 0.228, |
|
"num_input_tokens_seen": 35549984, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 1.568117434972959, |
|
"grad_norm": 7.110464304213341, |
|
"learning_rate": 2.951021558623274e-06, |
|
"loss": 0.2485, |
|
"num_input_tokens_seen": 35608488, |
|
"step": 3045 |
|
}, |
|
{ |
|
"epoch": 1.5706927633273242, |
|
"grad_norm": 12.567694093056492, |
|
"learning_rate": 2.869278393262226e-06, |
|
"loss": 0.2851, |
|
"num_input_tokens_seen": 35666976, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 1.5706927633273242, |
|
"eval_loss": 0.8473746180534363, |
|
"eval_runtime": 16.0314, |
|
"eval_samples_per_second": 3.743, |
|
"eval_steps_per_second": 0.936, |
|
"num_input_tokens_seen": 35666976, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 1.5732680916816895, |
|
"grad_norm": 5.787936921221981, |
|
"learning_rate": 2.7886498764184588e-06, |
|
"loss": 0.2514, |
|
"num_input_tokens_seen": 35725456, |
|
"step": 3055 |
|
}, |
|
{ |
|
"epoch": 1.5758434200360547, |
|
"grad_norm": 7.052716790363759, |
|
"learning_rate": 2.7091379149682685e-06, |
|
"loss": 0.3091, |
|
"num_input_tokens_seen": 35783912, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 1.5784187483904197, |
|
"grad_norm": 4.6737853290480915, |
|
"learning_rate": 2.6307443893812843e-06, |
|
"loss": 0.2629, |
|
"num_input_tokens_seen": 35842376, |
|
"step": 3065 |
|
}, |
|
{ |
|
"epoch": 1.580994076744785, |
|
"grad_norm": 8.400296818269052, |
|
"learning_rate": 2.5534711536759404e-06, |
|
"loss": 0.3065, |
|
"num_input_tokens_seen": 35900824, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 1.58356940509915, |
|
"grad_norm": 3.909241159865706, |
|
"learning_rate": 2.4773200353756798e-06, |
|
"loss": 0.2577, |
|
"num_input_tokens_seen": 35959264, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 1.5861447334535153, |
|
"grad_norm": 5.227660314173737, |
|
"learning_rate": 2.4022928354656473e-06, |
|
"loss": 0.2359, |
|
"num_input_tokens_seen": 36017760, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 1.5887200618078805, |
|
"grad_norm": 5.407491053931616, |
|
"learning_rate": 2.3283913283502044e-06, |
|
"loss": 0.1897, |
|
"num_input_tokens_seen": 36076280, |
|
"step": 3085 |
|
}, |
|
{ |
|
"epoch": 1.5912953901622457, |
|
"grad_norm": 5.771594174948701, |
|
"learning_rate": 2.2556172618108997e-06, |
|
"loss": 0.286, |
|
"num_input_tokens_seen": 36134784, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 1.593870718516611, |
|
"grad_norm": 5.508770087080472, |
|
"learning_rate": 2.183972356965125e-06, |
|
"loss": 0.2733, |
|
"num_input_tokens_seen": 36193288, |
|
"step": 3095 |
|
}, |
|
{ |
|
"epoch": 1.596446046870976, |
|
"grad_norm": 6.343942326218544, |
|
"learning_rate": 2.113458308225458e-06, |
|
"loss": 0.2351, |
|
"num_input_tokens_seen": 36251744, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 1.596446046870976, |
|
"eval_loss": 0.8650907874107361, |
|
"eval_runtime": 16.0989, |
|
"eval_samples_per_second": 3.727, |
|
"eval_steps_per_second": 0.932, |
|
"num_input_tokens_seen": 36251744, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 1.5990213752253413, |
|
"grad_norm": 4.00048030481465, |
|
"learning_rate": 2.0440767832595574e-06, |
|
"loss": 0.2454, |
|
"num_input_tokens_seen": 36310200, |
|
"step": 3105 |
|
}, |
|
{ |
|
"epoch": 1.6015967035797063, |
|
"grad_norm": 5.230064679031373, |
|
"learning_rate": 1.975829422950709e-06, |
|
"loss": 0.2629, |
|
"num_input_tokens_seen": 36368688, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 1.6041720319340715, |
|
"grad_norm": 12.271894553598498, |
|
"learning_rate": 1.908717841359048e-06, |
|
"loss": 0.2848, |
|
"num_input_tokens_seen": 36427192, |
|
"step": 3115 |
|
}, |
|
{ |
|
"epoch": 1.6067473602884368, |
|
"grad_norm": 8.178213306290619, |
|
"learning_rate": 1.8427436256833852e-06, |
|
"loss": 0.228, |
|
"num_input_tokens_seen": 36485656, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 1.609322688642802, |
|
"grad_norm": 4.853366085377887, |
|
"learning_rate": 1.7779083362236547e-06, |
|
"loss": 0.2239, |
|
"num_input_tokens_seen": 36544128, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 1.6118980169971673, |
|
"grad_norm": 10.968162741068843, |
|
"learning_rate": 1.7142135063440035e-06, |
|
"loss": 0.2585, |
|
"num_input_tokens_seen": 36602568, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 1.6144733453515323, |
|
"grad_norm": 5.564416348243761, |
|
"learning_rate": 1.6516606424365643e-06, |
|
"loss": 0.2887, |
|
"num_input_tokens_seen": 36661064, |
|
"step": 3135 |
|
}, |
|
{ |
|
"epoch": 1.6170486737058976, |
|
"grad_norm": 8.095832161946442, |
|
"learning_rate": 1.5902512238857858e-06, |
|
"loss": 0.2446, |
|
"num_input_tokens_seen": 36719544, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 1.6196240020602626, |
|
"grad_norm": 8.906257390618395, |
|
"learning_rate": 1.5299867030334814e-06, |
|
"loss": 0.2673, |
|
"num_input_tokens_seen": 36778064, |
|
"step": 3145 |
|
}, |
|
{ |
|
"epoch": 1.6221993304146278, |
|
"grad_norm": 6.864070166407251, |
|
"learning_rate": 1.4708685051444515e-06, |
|
"loss": 0.2638, |
|
"num_input_tokens_seen": 36836560, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 1.6221993304146278, |
|
"eval_loss": 0.8633677363395691, |
|
"eval_runtime": 16.2031, |
|
"eval_samples_per_second": 3.703, |
|
"eval_steps_per_second": 0.926, |
|
"num_input_tokens_seen": 36836560, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 1.624774658768993, |
|
"grad_norm": 8.026607293073416, |
|
"learning_rate": 1.4128980283727943e-06, |
|
"loss": 0.2793, |
|
"num_input_tokens_seen": 36895016, |
|
"step": 3155 |
|
}, |
|
{ |
|
"epoch": 1.6273499871233583, |
|
"grad_norm": 11.669862098293653, |
|
"learning_rate": 1.356076643728843e-06, |
|
"loss": 0.2887, |
|
"num_input_tokens_seen": 36953528, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 1.6299253154777236, |
|
"grad_norm": 5.580791837684188, |
|
"learning_rate": 1.3004056950467135e-06, |
|
"loss": 0.317, |
|
"num_input_tokens_seen": 37012056, |
|
"step": 3165 |
|
}, |
|
{ |
|
"epoch": 1.6325006438320886, |
|
"grad_norm": 4.650356589287389, |
|
"learning_rate": 1.2458864989525698e-06, |
|
"loss": 0.2095, |
|
"num_input_tokens_seen": 37070528, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 1.6350759721864536, |
|
"grad_norm": 6.089813437162075, |
|
"learning_rate": 1.19252034483342e-06, |
|
"loss": 0.237, |
|
"num_input_tokens_seen": 37129008, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 1.6376513005408189, |
|
"grad_norm": 5.287668578489162, |
|
"learning_rate": 1.1403084948067021e-06, |
|
"loss": 0.2448, |
|
"num_input_tokens_seen": 37187472, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 1.6402266288951841, |
|
"grad_norm": 6.982602482070445, |
|
"learning_rate": 1.089252183690348e-06, |
|
"loss": 0.2563, |
|
"num_input_tokens_seen": 37245936, |
|
"step": 3185 |
|
}, |
|
{ |
|
"epoch": 1.6428019572495494, |
|
"grad_norm": 2.9242653665827647, |
|
"learning_rate": 1.0393526189736602e-06, |
|
"loss": 0.2538, |
|
"num_input_tokens_seen": 37304424, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 1.6453772856039146, |
|
"grad_norm": 6.894723044936381, |
|
"learning_rate": 9.906109807887032e-07, |
|
"loss": 0.1768, |
|
"num_input_tokens_seen": 37362888, |
|
"step": 3195 |
|
}, |
|
{ |
|
"epoch": 1.6479526139582796, |
|
"grad_norm": 6.796664957587956, |
|
"learning_rate": 9.430284218824026e-07, |
|
"loss": 0.312, |
|
"num_input_tokens_seen": 37421416, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.6479526139582796, |
|
"eval_loss": 0.8679988980293274, |
|
"eval_runtime": 16.1678, |
|
"eval_samples_per_second": 3.711, |
|
"eval_steps_per_second": 0.928, |
|
"num_input_tokens_seen": 37421416, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.6505279423126449, |
|
"grad_norm": 12.027460444161642, |
|
"learning_rate": 8.966060675892951e-07, |
|
"loss": 0.2865, |
|
"num_input_tokens_seen": 37479848, |
|
"step": 3205 |
|
}, |
|
{ |
|
"epoch": 1.65310327066701, |
|
"grad_norm": 6.851221931248735, |
|
"learning_rate": 8.513450158049108e-07, |
|
"loss": 0.3299, |
|
"num_input_tokens_seen": 37538312, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 1.6556785990213752, |
|
"grad_norm": 6.971651790450948, |
|
"learning_rate": 8.072463369597993e-07, |
|
"loss": 0.3218, |
|
"num_input_tokens_seen": 37596800, |
|
"step": 3215 |
|
}, |
|
{ |
|
"epoch": 1.6582539273757404, |
|
"grad_norm": 10.994527310957624, |
|
"learning_rate": 7.643110739942172e-07, |
|
"loss": 0.2593, |
|
"num_input_tokens_seen": 37655312, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 1.6608292557301056, |
|
"grad_norm": 13.542379224085927, |
|
"learning_rate": 7.225402423334693e-07, |
|
"loss": 0.3072, |
|
"num_input_tokens_seen": 37713800, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 1.663404584084471, |
|
"grad_norm": 5.442561929450427, |
|
"learning_rate": 6.819348298638839e-07, |
|
"loss": 0.2276, |
|
"num_input_tokens_seen": 37772280, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 1.665979912438836, |
|
"grad_norm": 8.128386248398428, |
|
"learning_rate": 6.424957969094536e-07, |
|
"loss": 0.2489, |
|
"num_input_tokens_seen": 37830800, |
|
"step": 3235 |
|
}, |
|
{ |
|
"epoch": 1.6685552407932012, |
|
"grad_norm": 3.9766881915113266, |
|
"learning_rate": 6.0422407620912e-07, |
|
"loss": 0.2552, |
|
"num_input_tokens_seen": 37889280, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 1.6711305691475662, |
|
"grad_norm": 5.555365927504982, |
|
"learning_rate": 5.671205728947305e-07, |
|
"loss": 0.226, |
|
"num_input_tokens_seen": 37947728, |
|
"step": 3245 |
|
}, |
|
{ |
|
"epoch": 1.6737058975019314, |
|
"grad_norm": 5.733028191926084, |
|
"learning_rate": 5.311861644696048e-07, |
|
"loss": 0.2785, |
|
"num_input_tokens_seen": 38006200, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 1.6737058975019314, |
|
"eval_loss": 0.8640011548995972, |
|
"eval_runtime": 16.0965, |
|
"eval_samples_per_second": 3.728, |
|
"eval_steps_per_second": 0.932, |
|
"num_input_tokens_seen": 38006200, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 1.6762812258562967, |
|
"grad_norm": 4.778342712582032, |
|
"learning_rate": 4.964217007878081e-07, |
|
"loss": 0.2291, |
|
"num_input_tokens_seen": 38064672, |
|
"step": 3255 |
|
}, |
|
{ |
|
"epoch": 1.678856554210662, |
|
"grad_norm": 4.4902131141962, |
|
"learning_rate": 4.6282800403402715e-07, |
|
"loss": 0.3101, |
|
"num_input_tokens_seen": 38123192, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 1.6814318825650272, |
|
"grad_norm": 7.687294001046122, |
|
"learning_rate": 4.3040586870415346e-07, |
|
"loss": 0.3196, |
|
"num_input_tokens_seen": 38181696, |
|
"step": 3265 |
|
}, |
|
{ |
|
"epoch": 1.6840072109193922, |
|
"grad_norm": 7.392271519909896, |
|
"learning_rate": 3.991560615864587e-07, |
|
"loss": 0.2587, |
|
"num_input_tokens_seen": 38240216, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 1.6865825392737575, |
|
"grad_norm": 6.335589264461425, |
|
"learning_rate": 3.6907932174349846e-07, |
|
"loss": 0.2093, |
|
"num_input_tokens_seen": 38298688, |
|
"step": 3275 |
|
}, |
|
{ |
|
"epoch": 1.6891578676281225, |
|
"grad_norm": 7.268228162683875, |
|
"learning_rate": 3.40176360494604e-07, |
|
"loss": 0.2282, |
|
"num_input_tokens_seen": 38357128, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 1.6917331959824877, |
|
"grad_norm": 4.776419874246786, |
|
"learning_rate": 3.124478613990733e-07, |
|
"loss": 0.2092, |
|
"num_input_tokens_seen": 38415600, |
|
"step": 3285 |
|
}, |
|
{ |
|
"epoch": 1.694308524336853, |
|
"grad_norm": 8.522894464657169, |
|
"learning_rate": 2.8589448023998987e-07, |
|
"loss": 0.2861, |
|
"num_input_tokens_seen": 38474112, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 1.6968838526912182, |
|
"grad_norm": 5.304805044526707, |
|
"learning_rate": 2.605168450087514e-07, |
|
"loss": 0.2494, |
|
"num_input_tokens_seen": 38532624, |
|
"step": 3295 |
|
}, |
|
{ |
|
"epoch": 1.6994591810455835, |
|
"grad_norm": 7.112591931914542, |
|
"learning_rate": 2.363155558901542e-07, |
|
"loss": 0.2752, |
|
"num_input_tokens_seen": 38591128, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 1.6994591810455835, |
|
"eval_loss": 0.8644178509712219, |
|
"eval_runtime": 16.1497, |
|
"eval_samples_per_second": 3.715, |
|
"eval_steps_per_second": 0.929, |
|
"num_input_tokens_seen": 38591128, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 1.7020345093999485, |
|
"grad_norm": 4.935833215525081, |
|
"learning_rate": 2.1329118524827662e-07, |
|
"loss": 0.2337, |
|
"num_input_tokens_seen": 38649640, |
|
"step": 3305 |
|
}, |
|
{ |
|
"epoch": 1.7046098377543135, |
|
"grad_norm": 5.746920185244728, |
|
"learning_rate": 1.9144427761286222e-07, |
|
"loss": 0.215, |
|
"num_input_tokens_seen": 38708112, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 1.7071851661086788, |
|
"grad_norm": 6.501004359690972, |
|
"learning_rate": 1.7077534966650766e-07, |
|
"loss": 0.2871, |
|
"num_input_tokens_seen": 38766624, |
|
"step": 3315 |
|
}, |
|
{ |
|
"epoch": 1.709760494463044, |
|
"grad_norm": 6.996403813160393, |
|
"learning_rate": 1.51284890232406e-07, |
|
"loss": 0.3478, |
|
"num_input_tokens_seen": 38825104, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 1.7123358228174093, |
|
"grad_norm": 5.178545190033401, |
|
"learning_rate": 1.3297336026280027e-07, |
|
"loss": 0.2055, |
|
"num_input_tokens_seen": 38883560, |
|
"step": 3325 |
|
}, |
|
{ |
|
"epoch": 1.7149111511717745, |
|
"grad_norm": 6.686144266429449, |
|
"learning_rate": 1.158411928280645e-07, |
|
"loss": 0.2992, |
|
"num_input_tokens_seen": 38942040, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 1.7174864795261395, |
|
"grad_norm": 4.337439288142164, |
|
"learning_rate": 9.988879310649513e-08, |
|
"loss": 0.2302, |
|
"num_input_tokens_seen": 39000488, |
|
"step": 3335 |
|
}, |
|
{ |
|
"epoch": 1.7200618078805048, |
|
"grad_norm": 6.5240260149211755, |
|
"learning_rate": 8.511653837470212e-08, |
|
"loss": 0.265, |
|
"num_input_tokens_seen": 39058960, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 1.7226371362348698, |
|
"grad_norm": 7.592689596688837, |
|
"learning_rate": 7.152477799867719e-08, |
|
"loss": 0.3147, |
|
"num_input_tokens_seen": 39117416, |
|
"step": 3345 |
|
}, |
|
{ |
|
"epoch": 1.725212464589235, |
|
"grad_norm": 6.429413076205037, |
|
"learning_rate": 5.911383342556143e-08, |
|
"loss": 0.2674, |
|
"num_input_tokens_seen": 39175888, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 1.725212464589235, |
|
"eval_loss": 0.8666485548019409, |
|
"eval_runtime": 16.1238, |
|
"eval_samples_per_second": 3.721, |
|
"eval_steps_per_second": 0.93, |
|
"num_input_tokens_seen": 39175888, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 1.7277877929436003, |
|
"grad_norm": 10.968051828666288, |
|
"learning_rate": 4.788399817602929e-08, |
|
"loss": 0.2565, |
|
"num_input_tokens_seen": 39234336, |
|
"step": 3355 |
|
}, |
|
{ |
|
"epoch": 1.7303631212979655, |
|
"grad_norm": 5.1159559645491335, |
|
"learning_rate": 3.7835537837338506e-08, |
|
"loss": 0.2762, |
|
"num_input_tokens_seen": 39292800, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 1.7329384496523308, |
|
"grad_norm": 6.735859744015271, |
|
"learning_rate": 2.8968690057051828e-08, |
|
"loss": 0.2196, |
|
"num_input_tokens_seen": 39351272, |
|
"step": 3365 |
|
}, |
|
{ |
|
"epoch": 1.7355137780066958, |
|
"grad_norm": 3.989003741597172, |
|
"learning_rate": 2.128366453743591e-08, |
|
"loss": 0.2482, |
|
"num_input_tokens_seen": 39409736, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 1.738089106361061, |
|
"grad_norm": 5.083412307953648, |
|
"learning_rate": 1.4780643030476438e-08, |
|
"loss": 0.2778, |
|
"num_input_tokens_seen": 39468176, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 1.740664434715426, |
|
"grad_norm": 7.4306605849577565, |
|
"learning_rate": 9.459779333587104e-09, |
|
"loss": 0.2048, |
|
"num_input_tokens_seen": 39526688, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 1.7432397630697913, |
|
"grad_norm": 4.202839419581782, |
|
"learning_rate": 5.3211992859791835e-09, |
|
"loss": 0.2296, |
|
"num_input_tokens_seen": 39585152, |
|
"step": 3385 |
|
}, |
|
{ |
|
"epoch": 1.7458150914241566, |
|
"grad_norm": 7.909317855624412, |
|
"learning_rate": 2.3650007656805806e-09, |
|
"loss": 0.2713, |
|
"num_input_tokens_seen": 39643640, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 1.7483904197785218, |
|
"grad_norm": 7.880795429819755, |
|
"learning_rate": 5.912536872321184e-10, |
|
"loss": 0.2964, |
|
"num_input_tokens_seen": 39702144, |
|
"step": 3395 |
|
}, |
|
{ |
|
"epoch": 1.750965748132887, |
|
"grad_norm": 4.00234080349809, |
|
"learning_rate": 0.0, |
|
"loss": 0.1797, |
|
"num_input_tokens_seen": 39760664, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.750965748132887, |
|
"eval_loss": 0.8603056073188782, |
|
"eval_runtime": 16.2474, |
|
"eval_samples_per_second": 3.693, |
|
"eval_steps_per_second": 0.923, |
|
"num_input_tokens_seen": 39760664, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.750965748132887, |
|
"num_input_tokens_seen": 39760664, |
|
"step": 3400, |
|
"total_flos": 2232757993603072.0, |
|
"train_loss": 0.5904174627801951, |
|
"train_runtime": 45337.3565, |
|
"train_samples_per_second": 1.8, |
|
"train_steps_per_second": 0.075 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 3400, |
|
"num_input_tokens_seen": 39760664, |
|
"num_train_epochs": 2, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2232757993603072.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |