{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.995634549423137,
  "eval_steps": 100,
  "global_step": 800,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.012472715933894606,
      "grad_norm": 6.119478225708008,
      "learning_rate": 3.75e-05,
      "loss": 36.8721,
      "step": 5
    },
    {
      "epoch": 0.024945431867789213,
      "grad_norm": 2.9732778072357178,
      "learning_rate": 7.5e-05,
      "loss": 33.0439,
      "step": 10
    },
    {
      "epoch": 0.037418147801683815,
      "grad_norm": 1.5332609415054321,
      "learning_rate": 0.0001125,
      "loss": 30.1165,
      "step": 15
    },
    {
      "epoch": 0.049890863735578425,
      "grad_norm": 1.2270578145980835,
      "learning_rate": 0.00015,
      "loss": 28.647,
      "step": 20
    },
    {
      "epoch": 0.06236357966947303,
      "grad_norm": 1.053142786026001,
      "learning_rate": 0.00018749999999999998,
      "loss": 26.0629,
      "step": 25
    },
    {
      "epoch": 0.07483629560336763,
      "grad_norm": 1.0131248235702515,
      "learning_rate": 0.000225,
      "loss": 23.8703,
      "step": 30
    },
    {
      "epoch": 0.08730901153726224,
      "grad_norm": 0.9197985529899597,
      "learning_rate": 0.0002625,
      "loss": 21.521,
      "step": 35
    },
    {
      "epoch": 0.09978172747115685,
      "grad_norm": 1.0926002264022827,
      "learning_rate": 0.0003,
      "loss": 19.8433,
      "step": 40
    },
    {
      "epoch": 0.11225444340505145,
      "grad_norm": 0.7152827382087708,
      "learning_rate": 0.0003,
      "loss": 18.618,
      "step": 45
    },
    {
      "epoch": 0.12472715933894606,
      "grad_norm": 0.6178381443023682,
      "learning_rate": 0.0003,
      "loss": 17.3644,
      "step": 50
    },
    {
      "epoch": 0.13719987527284067,
      "grad_norm": 0.48063215613365173,
      "learning_rate": 0.0003,
      "loss": 16.6105,
      "step": 55
    },
    {
      "epoch": 0.14967259120673526,
      "grad_norm": 0.46090102195739746,
      "learning_rate": 0.0003,
      "loss": 16.2326,
      "step": 60
    },
    {
      "epoch": 0.16214530714062989,
      "grad_norm": 0.4266461730003357,
      "learning_rate": 0.0003,
      "loss": 15.8385,
      "step": 65
    },
    {
      "epoch": 0.17461802307452448,
      "grad_norm": 0.3876805901527405,
      "learning_rate": 0.0003,
      "loss": 15.3119,
      "step": 70
    },
    {
      "epoch": 0.18709073900841908,
      "grad_norm": 0.3796117603778839,
      "learning_rate": 0.0003,
      "loss": 15.2481,
      "step": 75
    },
    {
      "epoch": 0.1995634549423137,
      "grad_norm": 0.37646082043647766,
      "learning_rate": 0.0003,
      "loss": 14.7319,
      "step": 80
    },
    {
      "epoch": 0.2120361708762083,
      "grad_norm": 0.3688748776912689,
      "learning_rate": 0.0003,
      "loss": 14.6364,
      "step": 85
    },
    {
      "epoch": 0.2245088868101029,
      "grad_norm": 0.37435677647590637,
      "learning_rate": 0.0003,
      "loss": 14.2134,
      "step": 90
    },
    {
      "epoch": 0.23698160274399752,
      "grad_norm": 0.36440223455429077,
      "learning_rate": 0.0003,
      "loss": 13.9198,
      "step": 95
    },
    {
      "epoch": 0.2494543186778921,
      "grad_norm": 0.33530500531196594,
      "learning_rate": 0.0003,
      "loss": 13.6044,
      "step": 100
    },
    {
      "epoch": 0.2494543186778921,
      "eval_accuracy": 0.007913978494623657,
      "eval_loss": 12.544113159179688,
      "eval_runtime": 18.5829,
      "eval_samples_per_second": 13.453,
      "eval_steps_per_second": 3.39,
      "step": 100
    },
    {
      "epoch": 0.26192703461178674,
      "grad_norm": 0.3251523971557617,
      "learning_rate": 0.0003,
      "loss": 13.3181,
      "step": 105
    },
    {
      "epoch": 0.27439975054568133,
      "grad_norm": 0.3473041355609894,
      "learning_rate": 0.0003,
      "loss": 12.9976,
      "step": 110
    },
    {
      "epoch": 0.2868724664795759,
      "grad_norm": 0.3266255557537079,
      "learning_rate": 0.0003,
      "loss": 12.7667,
      "step": 115
    },
    {
      "epoch": 0.2993451824134705,
      "grad_norm": 0.35194671154022217,
      "learning_rate": 0.0003,
      "loss": 12.7544,
      "step": 120
    },
    {
      "epoch": 0.3118178983473651,
      "grad_norm": 0.34635770320892334,
      "learning_rate": 0.0003,
      "loss": 12.2756,
      "step": 125
    },
    {
      "epoch": 0.32429061428125977,
      "grad_norm": 0.3480404019355774,
      "learning_rate": 0.0003,
      "loss": 12.1192,
      "step": 130
    },
    {
      "epoch": 0.33676333021515437,
      "grad_norm": 0.3309994339942932,
      "learning_rate": 0.0003,
      "loss": 11.8339,
      "step": 135
    },
    {
      "epoch": 0.34923604614904896,
      "grad_norm": 0.33558282256126404,
      "learning_rate": 0.0003,
      "loss": 11.6745,
      "step": 140
    },
    {
      "epoch": 0.36170876208294356,
      "grad_norm": 0.3359847664833069,
      "learning_rate": 0.0003,
      "loss": 11.3363,
      "step": 145
    },
    {
      "epoch": 0.37418147801683815,
      "grad_norm": 0.33947232365608215,
      "learning_rate": 0.0003,
      "loss": 11.0303,
      "step": 150
    },
    {
      "epoch": 0.38665419395073275,
      "grad_norm": 0.32984089851379395,
      "learning_rate": 0.0003,
      "loss": 10.9271,
      "step": 155
    },
    {
      "epoch": 0.3991269098846274,
      "grad_norm": 0.3498048782348633,
      "learning_rate": 0.0003,
      "loss": 10.6215,
      "step": 160
    },
    {
      "epoch": 0.411599625818522,
      "grad_norm": 0.354889839887619,
      "learning_rate": 0.0003,
      "loss": 10.5165,
      "step": 165
    },
    {
      "epoch": 0.4240723417524166,
      "grad_norm": 0.34426406025886536,
      "learning_rate": 0.0003,
      "loss": 10.0716,
      "step": 170
    },
    {
      "epoch": 0.4365450576863112,
      "grad_norm": 0.34653356671333313,
      "learning_rate": 0.0003,
      "loss": 10.0709,
      "step": 175
    },
    {
      "epoch": 0.4490177736202058,
      "grad_norm": 0.3454643189907074,
      "learning_rate": 0.0003,
      "loss": 9.7226,
      "step": 180
    },
    {
      "epoch": 0.4614904895541004,
      "grad_norm": 0.3724479377269745,
      "learning_rate": 0.0003,
      "loss": 9.5827,
      "step": 185
    },
    {
      "epoch": 0.47396320548799503,
      "grad_norm": 0.37687671184539795,
      "learning_rate": 0.0003,
      "loss": 9.3702,
      "step": 190
    },
    {
      "epoch": 0.4864359214218896,
      "grad_norm": 0.3670942187309265,
      "learning_rate": 0.0003,
      "loss": 9.2377,
      "step": 195
    },
    {
      "epoch": 0.4989086373557842,
      "grad_norm": 0.3864516019821167,
      "learning_rate": 0.0003,
      "loss": 8.9524,
      "step": 200
    },
    {
      "epoch": 0.4989086373557842,
      "eval_accuracy": 0.04734701857282502,
      "eval_loss": 8.425415992736816,
      "eval_runtime": 17.9427,
      "eval_samples_per_second": 13.933,
      "eval_steps_per_second": 3.511,
      "step": 200
    },
    {
      "epoch": 0.5113813532896788,
      "grad_norm": 0.3540992736816406,
      "learning_rate": 0.0003,
      "loss": 8.9811,
      "step": 205
    },
    {
      "epoch": 0.5238540692235735,
      "grad_norm": 0.35756129026412964,
      "learning_rate": 0.0003,
      "loss": 8.6522,
      "step": 210
    },
    {
      "epoch": 0.536326785157468,
      "grad_norm": 0.38473081588745117,
      "learning_rate": 0.0003,
      "loss": 8.6516,
      "step": 215
    },
    {
      "epoch": 0.5487995010913627,
      "grad_norm": 0.3616325259208679,
      "learning_rate": 0.0003,
      "loss": 8.5213,
      "step": 220
    },
    {
      "epoch": 0.5612722170252572,
      "grad_norm": 0.375959187746048,
      "learning_rate": 0.0003,
      "loss": 8.3109,
      "step": 225
    },
    {
      "epoch": 0.5737449329591519,
      "grad_norm": 0.38421833515167236,
      "learning_rate": 0.0003,
      "loss": 8.2747,
      "step": 230
    },
    {
      "epoch": 0.5862176488930465,
      "grad_norm": 0.379168301820755,
      "learning_rate": 0.0003,
      "loss": 8.197,
      "step": 235
    },
    {
      "epoch": 0.598690364826941,
      "grad_norm": 0.39803043007850647,
      "learning_rate": 0.0003,
      "loss": 8.0836,
      "step": 240
    },
    {
      "epoch": 0.6111630807608357,
      "grad_norm": 0.41287195682525635,
      "learning_rate": 0.0003,
      "loss": 7.9406,
      "step": 245
    },
    {
      "epoch": 0.6236357966947302,
      "grad_norm": 0.3857806324958801,
      "learning_rate": 0.0003,
      "loss": 7.9488,
      "step": 250
    },
    {
      "epoch": 0.6361085126286249,
      "grad_norm": 0.3808286488056183,
      "learning_rate": 0.0003,
      "loss": 7.7673,
      "step": 255
    },
    {
      "epoch": 0.6485812285625195,
      "grad_norm": 0.4393250048160553,
      "learning_rate": 0.0003,
      "loss": 7.707,
      "step": 260
    },
    {
      "epoch": 0.6610539444964141,
      "grad_norm": 0.4232034981250763,
      "learning_rate": 0.0003,
      "loss": 7.7852,
      "step": 265
    },
    {
      "epoch": 0.6735266604303087,
      "grad_norm": 0.42222586274147034,
      "learning_rate": 0.0003,
      "loss": 7.6145,
      "step": 270
    },
    {
      "epoch": 0.6859993763642033,
      "grad_norm": 0.35792261362075806,
      "learning_rate": 0.0003,
      "loss": 7.5498,
      "step": 275
    },
    {
      "epoch": 0.6984720922980979,
      "grad_norm": 0.343427449464798,
      "learning_rate": 0.0003,
      "loss": 7.4698,
      "step": 280
    },
    {
      "epoch": 0.7109448082319925,
      "grad_norm": 0.4176105856895447,
      "learning_rate": 0.0003,
      "loss": 7.3752,
      "step": 285
    },
    {
      "epoch": 0.7234175241658871,
      "grad_norm": 0.40987178683280945,
      "learning_rate": 0.0003,
      "loss": 7.342,
      "step": 290
    },
    {
      "epoch": 0.7358902400997818,
      "grad_norm": 0.4014261066913605,
      "learning_rate": 0.0003,
      "loss": 7.1609,
      "step": 295
    },
    {
      "epoch": 0.7483629560336763,
      "grad_norm": 0.4236806035041809,
      "learning_rate": 0.0003,
      "loss": 7.1721,
      "step": 300
    },
    {
      "epoch": 0.7483629560336763,
      "eval_accuracy": 0.03885043988269795,
      "eval_loss": 6.619859218597412,
      "eval_runtime": 18.2015,
      "eval_samples_per_second": 13.735,
      "eval_steps_per_second": 3.461,
      "step": 300
    },
    {
      "epoch": 0.760835671967571,
      "grad_norm": 0.4133549630641937,
      "learning_rate": 0.0003,
      "loss": 7.1892,
      "step": 305
    },
    {
      "epoch": 0.7733083879014655,
      "grad_norm": 0.44653546810150146,
      "learning_rate": 0.0003,
      "loss": 7.0446,
      "step": 310
    },
    {
      "epoch": 0.7857811038353602,
      "grad_norm": 0.41286739706993103,
      "learning_rate": 0.0003,
      "loss": 6.9656,
      "step": 315
    },
    {
      "epoch": 0.7982538197692548,
      "grad_norm": 0.3720580041408539,
      "learning_rate": 0.0003,
      "loss": 6.907,
      "step": 320
    },
    {
      "epoch": 0.8107265357031493,
      "grad_norm": 0.39917078614234924,
      "learning_rate": 0.0003,
      "loss": 6.9853,
      "step": 325
    },
    {
      "epoch": 0.823199251637044,
      "grad_norm": 0.4373719096183777,
      "learning_rate": 0.0003,
      "loss": 6.8592,
      "step": 330
    },
    {
      "epoch": 0.8356719675709385,
      "grad_norm": 0.4183291792869568,
      "learning_rate": 0.0003,
      "loss": 6.7432,
      "step": 335
    },
    {
      "epoch": 0.8481446835048332,
      "grad_norm": 0.40696659684181213,
      "learning_rate": 0.0003,
      "loss": 6.7505,
      "step": 340
    },
    {
      "epoch": 0.8606173994387278,
      "grad_norm": 0.36887314915657043,
      "learning_rate": 0.0003,
      "loss": 6.7657,
      "step": 345
    },
    {
      "epoch": 0.8730901153726224,
      "grad_norm": 0.4768717885017395,
      "learning_rate": 0.0003,
      "loss": 6.7173,
      "step": 350
    },
    {
      "epoch": 0.885562831306517,
      "grad_norm": 0.43819448351860046,
      "learning_rate": 0.0003,
      "loss": 6.5465,
      "step": 355
    },
    {
      "epoch": 0.8980355472404116,
      "grad_norm": 0.40145763754844666,
      "learning_rate": 0.0003,
      "loss": 6.512,
      "step": 360
    },
    {
      "epoch": 0.9105082631743062,
      "grad_norm": 0.49852269887924194,
      "learning_rate": 0.0003,
      "loss": 6.5335,
      "step": 365
    },
    {
      "epoch": 0.9229809791082008,
      "grad_norm": 0.454698771238327,
      "learning_rate": 0.0003,
      "loss": 6.4527,
      "step": 370
    },
    {
      "epoch": 0.9354536950420954,
      "grad_norm": 0.4860341250896454,
      "learning_rate": 0.0003,
      "loss": 6.4102,
      "step": 375
    },
    {
      "epoch": 0.9479264109759901,
      "grad_norm": 0.39718613028526306,
      "learning_rate": 0.0003,
      "loss": 6.4694,
      "step": 380
    },
    {
      "epoch": 0.9603991269098846,
      "grad_norm": 0.4210009276866913,
      "learning_rate": 0.0003,
      "loss": 6.4807,
      "step": 385
    },
    {
      "epoch": 0.9728718428437793,
      "grad_norm": 0.4482674300670624,
      "learning_rate": 0.0003,
      "loss": 6.414,
      "step": 390
    },
    {
      "epoch": 0.9853445587776738,
      "grad_norm": 0.42889419198036194,
      "learning_rate": 0.0003,
      "loss": 6.3543,
      "step": 395
    },
    {
      "epoch": 0.9978172747115684,
      "grad_norm": 0.5144391059875488,
      "learning_rate": 0.0003,
      "loss": 6.2087,
      "step": 400
    },
    {
      "epoch": 0.9978172747115684,
      "eval_accuracy": 0.22513000977517106,
      "eval_loss": 5.719752311706543,
      "eval_runtime": 17.8865,
      "eval_samples_per_second": 13.977,
      "eval_steps_per_second": 3.522,
      "step": 400
    },
    {
      "epoch": 1.010289990645463,
      "grad_norm": 0.6417849063873291,
      "learning_rate": 0.0003,
      "loss": 6.048,
      "step": 405
    },
    {
      "epoch": 1.0227627065793576,
      "grad_norm": 0.5739749073982239,
      "learning_rate": 0.0003,
      "loss": 5.9866,
      "step": 410
    },
    {
      "epoch": 1.0352354225132523,
      "grad_norm": 0.49603304266929626,
      "learning_rate": 0.0003,
      "loss": 5.9419,
      "step": 415
    },
    {
      "epoch": 1.047708138447147,
      "grad_norm": 0.5403385162353516,
      "learning_rate": 0.0003,
      "loss": 5.8366,
      "step": 420
    },
    {
      "epoch": 1.0601808543810414,
      "grad_norm": 0.6306777000427246,
      "learning_rate": 0.0003,
      "loss": 5.7657,
      "step": 425
    },
    {
      "epoch": 1.072653570314936,
      "grad_norm": 0.7016925811767578,
      "learning_rate": 0.0003,
      "loss": 5.6619,
      "step": 430
    },
    {
      "epoch": 1.0851262862488307,
      "grad_norm": 0.6606624722480774,
      "learning_rate": 0.0003,
      "loss": 5.6094,
      "step": 435
    },
    {
      "epoch": 1.0975990021827253,
      "grad_norm": 0.7023086547851562,
      "learning_rate": 0.0003,
      "loss": 5.6074,
      "step": 440
    },
    {
      "epoch": 1.11007171811662,
      "grad_norm": 0.8505487442016602,
      "learning_rate": 0.0003,
      "loss": 5.6959,
      "step": 445
    },
    {
      "epoch": 1.1225444340505144,
      "grad_norm": 0.6713190674781799,
      "learning_rate": 0.0003,
      "loss": 5.6344,
      "step": 450
    },
    {
      "epoch": 1.135017149984409,
      "grad_norm": 0.5908814668655396,
      "learning_rate": 0.0003,
      "loss": 5.4591,
      "step": 455
    },
    {
      "epoch": 1.1474898659183037,
      "grad_norm": 0.7601476311683655,
      "learning_rate": 0.0003,
      "loss": 5.5622,
      "step": 460
    },
    {
      "epoch": 1.1599625818521984,
      "grad_norm": 0.5737589001655579,
      "learning_rate": 0.0003,
      "loss": 5.4541,
      "step": 465
    },
    {
      "epoch": 1.172435297786093,
      "grad_norm": 0.8831024169921875,
      "learning_rate": 0.0003,
      "loss": 5.4784,
      "step": 470
    },
    {
      "epoch": 1.1849080137199874,
      "grad_norm": 0.8297187089920044,
      "learning_rate": 0.0003,
      "loss": 5.4252,
      "step": 475
    },
    {
      "epoch": 1.197380729653882,
      "grad_norm": 0.857667863368988,
      "learning_rate": 0.0003,
      "loss": 5.3268,
      "step": 480
    },
    {
      "epoch": 1.2098534455877767,
      "grad_norm": 0.8937066793441772,
      "learning_rate": 0.0003,
      "loss": 5.279,
      "step": 485
    },
    {
      "epoch": 1.2223261615216714,
      "grad_norm": 0.784275472164154,
      "learning_rate": 0.0003,
      "loss": 5.3079,
      "step": 490
    },
    {
      "epoch": 1.234798877455566,
      "grad_norm": 0.7549949884414673,
      "learning_rate": 0.0003,
      "loss": 5.3977,
      "step": 495
    },
    {
      "epoch": 1.2472715933894605,
      "grad_norm": 0.7452312111854553,
      "learning_rate": 0.0003,
      "loss": 5.4917,
      "step": 500
    },
    {
      "epoch": 1.2472715933894605,
      "eval_accuracy": 0.32684261974584555,
      "eval_loss": 4.947990894317627,
      "eval_runtime": 19.5683,
      "eval_samples_per_second": 12.776,
      "eval_steps_per_second": 3.219,
      "step": 500
    },
    {
      "epoch": 1.2597443093233551,
      "grad_norm": 0.6744974255561829,
      "learning_rate": 0.0003,
      "loss": 5.1679,
      "step": 505
    },
    {
      "epoch": 1.2722170252572498,
      "grad_norm": 1.0095832347869873,
      "learning_rate": 0.0003,
      "loss": 5.3918,
      "step": 510
    },
    {
      "epoch": 1.2846897411911444,
      "grad_norm": 0.7461665272712708,
      "learning_rate": 0.0003,
      "loss": 5.2346,
      "step": 515
    },
    {
      "epoch": 1.2971624571250389,
      "grad_norm": 0.88801109790802,
      "learning_rate": 0.0003,
      "loss": 5.2033,
      "step": 520
    },
    {
      "epoch": 1.3096351730589335,
      "grad_norm": 0.7549375891685486,
      "learning_rate": 0.0003,
      "loss": 5.098,
      "step": 525
    },
    {
      "epoch": 1.3221078889928282,
      "grad_norm": 1.1236454248428345,
      "learning_rate": 0.0003,
      "loss": 5.2069,
      "step": 530
    },
    {
      "epoch": 1.3345806049267228,
      "grad_norm": 0.9261302947998047,
      "learning_rate": 0.0003,
      "loss": 5.1925,
      "step": 535
    },
    {
      "epoch": 1.3470533208606175,
      "grad_norm": 0.7248057126998901,
      "learning_rate": 0.0003,
      "loss": 5.109,
      "step": 540
    },
    {
      "epoch": 1.3595260367945121,
      "grad_norm": 0.941017210483551,
      "learning_rate": 0.0003,
      "loss": 5.0975,
      "step": 545
    },
    {
      "epoch": 1.3719987527284065,
      "grad_norm": 0.9451349973678589,
      "learning_rate": 0.0003,
      "loss": 5.1825,
      "step": 550
    },
    {
      "epoch": 1.3844714686623012,
      "grad_norm": 0.9956802725791931,
      "learning_rate": 0.0003,
      "loss": 5.1017,
      "step": 555
    },
    {
      "epoch": 1.3969441845961958,
      "grad_norm": 1.0484583377838135,
      "learning_rate": 0.0003,
      "loss": 5.1371,
      "step": 560
    },
    {
      "epoch": 1.4094169005300905,
      "grad_norm": 1.1080021858215332,
      "learning_rate": 0.0003,
      "loss": 5.0146,
      "step": 565
    },
    {
      "epoch": 1.421889616463985,
      "grad_norm": 0.9495016932487488,
      "learning_rate": 0.0003,
      "loss": 5.0971,
      "step": 570
    },
    {
      "epoch": 1.4343623323978796,
      "grad_norm": 0.7586097717285156,
      "learning_rate": 0.0003,
      "loss": 5.0336,
      "step": 575
    },
    {
      "epoch": 1.4468350483317742,
      "grad_norm": 0.647396981716156,
      "learning_rate": 0.0003,
      "loss": 5.0119,
      "step": 580
    },
    {
      "epoch": 1.4593077642656689,
      "grad_norm": 0.7189023494720459,
      "learning_rate": 0.0003,
      "loss": 5.0908,
      "step": 585
    },
    {
      "epoch": 1.4717804801995635,
      "grad_norm": 0.9973328113555908,
      "learning_rate": 0.0003,
      "loss": 4.7903,
      "step": 590
    },
    {
      "epoch": 1.4842531961334582,
      "grad_norm": 0.8094688057899475,
      "learning_rate": 0.0003,
      "loss": 5.0103,
      "step": 595
    },
    {
      "epoch": 1.4967259120673526,
      "grad_norm": 1.0308438539505005,
      "learning_rate": 0.0003,
      "loss": 4.9408,
      "step": 600
    },
    {
      "epoch": 1.4967259120673526,
      "eval_accuracy": 0.35667253176930597,
      "eval_loss": 4.673036575317383,
      "eval_runtime": 19.5514,
      "eval_samples_per_second": 12.787,
      "eval_steps_per_second": 3.222,
      "step": 600
    },
    {
      "epoch": 1.5091986280012473,
      "grad_norm": 0.7587366104125977,
      "learning_rate": 0.0003,
      "loss": 4.9818,
      "step": 605
    },
    {
      "epoch": 1.521671343935142,
      "grad_norm": 1.0271868705749512,
      "learning_rate": 0.0003,
      "loss": 4.9614,
      "step": 610
    },
    {
      "epoch": 1.5341440598690363,
      "grad_norm": 1.061369776725769,
      "learning_rate": 0.0003,
      "loss": 4.8608,
      "step": 615
    },
    {
      "epoch": 1.546616775802931,
      "grad_norm": 0.9442321062088013,
      "learning_rate": 0.0003,
      "loss": 4.9478,
      "step": 620
    },
    {
      "epoch": 1.5590894917368257,
      "grad_norm": 0.8110609650611877,
      "learning_rate": 0.0003,
      "loss": 5.0979,
      "step": 625
    },
    {
      "epoch": 1.5715622076707203,
      "grad_norm": 0.6862745881080627,
      "learning_rate": 0.0003,
      "loss": 4.8345,
      "step": 630
    },
    {
      "epoch": 1.584034923604615,
      "grad_norm": 0.8737391233444214,
      "learning_rate": 0.0003,
      "loss": 4.8572,
      "step": 635
    },
    {
      "epoch": 1.5965076395385096,
      "grad_norm": 0.8002131581306458,
      "learning_rate": 0.0003,
      "loss": 4.8072,
      "step": 640
    },
    {
      "epoch": 1.6089803554724043,
      "grad_norm": 0.7860103845596313,
      "learning_rate": 0.0003,
      "loss": 4.8922,
      "step": 645
    },
    {
      "epoch": 1.6214530714062987,
      "grad_norm": 0.9875708222389221,
      "learning_rate": 0.0003,
      "loss": 4.9247,
      "step": 650
    },
    {
      "epoch": 1.6339257873401933,
      "grad_norm": 0.8873936533927917,
      "learning_rate": 0.0003,
      "loss": 4.8795,
      "step": 655
    },
    {
      "epoch": 1.646398503274088,
      "grad_norm": 0.7963967323303223,
      "learning_rate": 0.0003,
      "loss": 4.835,
      "step": 660
    },
    {
      "epoch": 1.6588712192079824,
      "grad_norm": 0.8068607449531555,
      "learning_rate": 0.0003,
      "loss": 4.8713,
      "step": 665
    },
    {
      "epoch": 1.671343935141877,
      "grad_norm": 0.9093911647796631,
      "learning_rate": 0.0003,
      "loss": 4.7725,
      "step": 670
    },
    {
      "epoch": 1.6838166510757717,
      "grad_norm": 0.7699265480041504,
      "learning_rate": 0.0003,
      "loss": 4.7502,
      "step": 675
    },
    {
      "epoch": 1.6962893670096664,
      "grad_norm": 0.7545697689056396,
      "learning_rate": 0.0003,
      "loss": 4.9555,
      "step": 680
    },
    {
      "epoch": 1.708762082943561,
      "grad_norm": 0.7571801543235779,
      "learning_rate": 0.0003,
      "loss": 4.7616,
      "step": 685
    },
    {
      "epoch": 1.7212347988774557,
      "grad_norm": 0.7757474184036255,
      "learning_rate": 0.0003,
      "loss": 4.6462,
      "step": 690
    },
    {
      "epoch": 1.7337075148113503,
      "grad_norm": 0.7473092079162598,
      "learning_rate": 0.0003,
      "loss": 4.6699,
      "step": 695
    },
    {
      "epoch": 1.7461802307452448,
      "grad_norm": 1.2531319856643677,
      "learning_rate": 0.0003,
      "loss": 4.8347,
      "step": 700
    },
    {
      "epoch": 1.7461802307452448,
      "eval_accuracy": 0.37069794721407623,
      "eval_loss": 4.498379707336426,
      "eval_runtime": 20.0355,
      "eval_samples_per_second": 12.478,
      "eval_steps_per_second": 3.144,
      "step": 700
    },
    {
      "epoch": 1.7586529466791394,
      "grad_norm": 1.3069407939910889,
      "learning_rate": 0.0003,
      "loss": 4.7338,
      "step": 705
    },
    {
      "epoch": 1.7711256626130338,
      "grad_norm": 1.1146960258483887,
      "learning_rate": 0.0003,
      "loss": 4.8758,
      "step": 710
    },
    {
      "epoch": 1.7835983785469285,
      "grad_norm": 1.0376973152160645,
      "learning_rate": 0.0003,
      "loss": 4.7604,
      "step": 715
    },
    {
      "epoch": 1.7960710944808231,
      "grad_norm": 1.2044090032577515,
      "learning_rate": 0.0003,
      "loss": 4.7472,
      "step": 720
    },
    {
      "epoch": 1.8085438104147178,
      "grad_norm": 1.0660207271575928,
      "learning_rate": 0.0003,
      "loss": 4.79,
      "step": 725
    },
    {
      "epoch": 1.8210165263486124,
      "grad_norm": 0.7932606935501099,
      "learning_rate": 0.0003,
      "loss": 4.7476,
      "step": 730
    },
    {
      "epoch": 1.833489242282507,
      "grad_norm": 0.8554738759994507,
      "learning_rate": 0.0003,
      "loss": 4.7839,
      "step": 735
    },
    {
      "epoch": 1.8459619582164017,
      "grad_norm": 1.015703797340393,
      "learning_rate": 0.0003,
      "loss": 4.7935,
      "step": 740
    },
    {
      "epoch": 1.8584346741502962,
      "grad_norm": 1.1005243062973022,
      "learning_rate": 0.0003,
      "loss": 4.7913,
      "step": 745
    },
    {
      "epoch": 1.8709073900841908,
      "grad_norm": 0.8775972127914429,
      "learning_rate": 0.0003,
      "loss": 4.5128,
      "step": 750
    },
    {
      "epoch": 1.8833801060180855,
      "grad_norm": 0.8116542100906372,
      "learning_rate": 0.0003,
      "loss": 4.6496,
      "step": 755
    },
    {
      "epoch": 1.89585282195198,
      "grad_norm": 0.7614642381668091,
      "learning_rate": 0.0003,
      "loss": 4.7695,
      "step": 760
    },
    {
      "epoch": 1.9083255378858746,
      "grad_norm": 1.0064287185668945,
      "learning_rate": 0.0003,
      "loss": 4.7929,
      "step": 765
    },
    {
      "epoch": 1.9207982538197692,
      "grad_norm": 0.7342740297317505,
      "learning_rate": 0.0003,
      "loss": 4.6711,
      "step": 770
    },
    {
      "epoch": 1.9332709697536639,
      "grad_norm": 0.9723834991455078,
      "learning_rate": 0.0003,
      "loss": 4.6212,
      "step": 775
    },
    {
      "epoch": 1.9457436856875585,
      "grad_norm": 1.20729398727417,
      "learning_rate": 0.0003,
      "loss": 4.6513,
      "step": 780
    },
    {
      "epoch": 1.9582164016214532,
      "grad_norm": 0.7920907735824585,
      "learning_rate": 0.0003,
      "loss": 4.6264,
      "step": 785
    },
    {
      "epoch": 1.9706891175553478,
      "grad_norm": 0.6307650804519653,
      "learning_rate": 0.0003,
      "loss": 4.6481,
      "step": 790
    },
    {
      "epoch": 1.9831618334892422,
      "grad_norm": 0.8942980766296387,
      "learning_rate": 0.0003,
      "loss": 4.6598,
      "step": 795
    },
    {
      "epoch": 1.995634549423137,
      "grad_norm": 0.7046281099319458,
      "learning_rate": 0.0003,
      "loss": 4.7023,
      "step": 800
    },
    {
      "epoch": 1.995634549423137,
      "eval_accuracy": 0.3789325513196481,
      "eval_loss": 4.358436584472656,
      "eval_runtime": 20.1663,
      "eval_samples_per_second": 12.397,
      "eval_steps_per_second": 3.124,
      "step": 800
    },
    {
      "epoch": 1.995634549423137,
      "step": 800,
      "total_flos": 6.441101073108173e+16,
      "train_loss": 8.280340445041656,
      "train_runtime": 18888.2342,
      "train_samples_per_second": 5.433,
      "train_steps_per_second": 0.042
    }
  ],
  "logging_steps": 5,
  "max_steps": 800,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 100,
  "total_flos": 6.441101073108173e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}