|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 14.962593516209477, |
|
"eval_steps": 500, |
|
"global_step": 3000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.004987531172069825, |
|
"grad_norm": 3.65625, |
|
"learning_rate": 6.666666666666667e-07, |
|
"loss": 2.7726, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.02493765586034913, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 2.8032, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.04987531172069826, |
|
"grad_norm": 1.625, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 2.8059, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.07481296758104738, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 1e-05, |
|
"loss": 2.8169, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.09975062344139651, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 1.3333333333333333e-05, |
|
"loss": 2.7666, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.12468827930174564, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 2.7631, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.14962593516209477, |
|
"grad_norm": 5.46875, |
|
"learning_rate": 2e-05, |
|
"loss": 2.6985, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.1745635910224439, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 2.3333333333333336e-05, |
|
"loss": 2.6688, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.19950124688279303, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 2.6666666666666667e-05, |
|
"loss": 2.558, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.22443890274314215, |
|
"grad_norm": 3.84375, |
|
"learning_rate": 3e-05, |
|
"loss": 2.4229, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.24937655860349128, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 2.3437, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.2743142144638404, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 3.6666666666666666e-05, |
|
"loss": 2.237, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.29925187032418954, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 4e-05, |
|
"loss": 2.1707, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.32418952618453867, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 4.3333333333333334e-05, |
|
"loss": 2.0941, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.3491271820448878, |
|
"grad_norm": 13.125, |
|
"learning_rate": 4.666666666666667e-05, |
|
"loss": 1.9843, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.3740648379052369, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 5e-05, |
|
"loss": 1.8983, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.39900249376558605, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 5.333333333333333e-05, |
|
"loss": 1.8418, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.4239401496259352, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 5.666666666666667e-05, |
|
"loss": 1.7522, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.4488778054862843, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 6e-05, |
|
"loss": 1.7016, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.47381546134663344, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 6.333333333333333e-05, |
|
"loss": 1.6093, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.49875311720698257, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 1.5596, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.5236907730673317, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 7e-05, |
|
"loss": 1.5197, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.5486284289276808, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 7.333333333333333e-05, |
|
"loss": 1.4761, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.57356608478803, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 7.666666666666667e-05, |
|
"loss": 1.4268, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.5985037406483791, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 8e-05, |
|
"loss": 1.3845, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.6234413965087282, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 8.333333333333334e-05, |
|
"loss": 1.3622, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.6483790523690773, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 8.666666666666667e-05, |
|
"loss": 1.3215, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.6733167082294265, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 9e-05, |
|
"loss": 1.2959, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.6982543640897756, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 9.333333333333334e-05, |
|
"loss": 1.2876, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.7231920199501247, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 9.666666666666667e-05, |
|
"loss": 1.28, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.7481296758104738, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 0.0001, |
|
"loss": 1.2628, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.773067331670823, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 0.00010333333333333334, |
|
"loss": 1.2393, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.7980049875311721, |
|
"grad_norm": 0.375, |
|
"learning_rate": 0.00010666666666666667, |
|
"loss": 1.2335, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.8229426433915212, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 0.00011000000000000002, |
|
"loss": 1.2125, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.8478802992518704, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.00011333333333333334, |
|
"loss": 1.2182, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.8728179551122195, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 0.00011666666666666668, |
|
"loss": 1.1958, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.8977556109725686, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 0.00012, |
|
"loss": 1.2011, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.9226932668329177, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.00012333333333333334, |
|
"loss": 1.1778, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.9476309226932669, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 0.00012666666666666666, |
|
"loss": 1.1628, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.972568578553616, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 0.00013000000000000002, |
|
"loss": 1.1709, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.9975062344139651, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 0.00013333333333333334, |
|
"loss": 1.163, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.9975062344139651, |
|
"eval_loss": 2.5127053260803223, |
|
"eval_runtime": 0.5084, |
|
"eval_samples_per_second": 19.67, |
|
"eval_steps_per_second": 1.967, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.0224438902743143, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.00013666666666666666, |
|
"loss": 1.1675, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.0473815461346634, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 0.00014, |
|
"loss": 1.1397, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.0723192019950125, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 0.00014333333333333334, |
|
"loss": 1.143, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.0972568578553616, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 0.00014666666666666666, |
|
"loss": 1.1466, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.1221945137157108, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 1.1303, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.14713216957606, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 0.00015333333333333334, |
|
"loss": 1.1195, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.172069825436409, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.00015666666666666666, |
|
"loss": 1.1156, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.1970074812967582, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 0.00016, |
|
"loss": 1.1262, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.2219451371571073, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 0.00016333333333333334, |
|
"loss": 1.1202, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.2468827930174564, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 0.0001666666666666667, |
|
"loss": 1.1114, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.2718204488778055, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.00017, |
|
"loss": 1.1186, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.2967581047381547, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.00017333333333333334, |
|
"loss": 1.1142, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.3216957605985038, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.00017666666666666666, |
|
"loss": 1.1062, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.346633416458853, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 0.00018, |
|
"loss": 1.0976, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.371571072319202, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.00018333333333333334, |
|
"loss": 1.1058, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.3965087281795512, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.0001866666666666667, |
|
"loss": 1.0984, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.4214463840399003, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 0.00019, |
|
"loss": 1.0941, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.4463840399002494, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.00019333333333333333, |
|
"loss": 1.0903, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.4713216957605986, |
|
"grad_norm": 0.875, |
|
"learning_rate": 0.00019666666666666666, |
|
"loss": 1.0919, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.4962593516209477, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0744, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.5211970074812968, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.00019999830768577443, |
|
"loss": 1.0881, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.546134663341646, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.00019999323080037624, |
|
"loss": 1.0791, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.571072319201995, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.00019998476951563915, |
|
"loss": 1.0633, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.5960099750623442, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 0.00019997292411794618, |
|
"loss": 1.0775, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.6209476309226933, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 0.0001999576950082201, |
|
"loss": 1.0701, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.6458852867830425, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.0001999390827019096, |
|
"loss": 1.0794, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.6708229426433916, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.00019991708782897213, |
|
"loss": 1.0662, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.6957605985037407, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 0.0001998917111338525, |
|
"loss": 1.0781, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.7206982543640899, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.0001998629534754574, |
|
"loss": 1.0677, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.745635910224439, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.00019983081582712685, |
|
"loss": 1.072, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.770573566084788, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 0.00019979529927660074, |
|
"loss": 1.0668, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.7955112219451372, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 0.00019975640502598244, |
|
"loss": 1.0688, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.8204488778054864, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.00019971413439169775, |
|
"loss": 1.0561, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.8453865336658355, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.00019966848880445062, |
|
"loss": 1.0585, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.8703241895261846, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.00019961946980917456, |
|
"loss": 1.0642, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.8952618453865338, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 0.00019956707906498044, |
|
"loss": 1.0538, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.9201995012468829, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 0.00019951131834510032, |
|
"loss": 1.0605, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.945137157107232, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.00019945218953682734, |
|
"loss": 1.0631, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.9700748129675811, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 0.000199389694641452, |
|
"loss": 1.0573, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.9950124688279303, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.00019932383577419432, |
|
"loss": 1.0647, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 2.4642767906188965, |
|
"eval_runtime": 0.528, |
|
"eval_samples_per_second": 18.94, |
|
"eval_steps_per_second": 1.894, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 2.0199501246882794, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 0.00019925461516413223, |
|
"loss": 1.0445, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 2.0448877805486285, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.00019918203515412617, |
|
"loss": 1.0286, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.0698254364089776, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.00019910609820073986, |
|
"loss": 1.0285, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 2.0947630922693268, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.00019902680687415705, |
|
"loss": 1.0393, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.119700748129676, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 0.00019894416385809444, |
|
"loss": 1.0295, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 2.144638403990025, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.00019885817194971117, |
|
"loss": 1.0329, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.169576059850374, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 0.00019876883405951377, |
|
"loss": 1.0332, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 2.1945137157107233, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.00019867615321125795, |
|
"loss": 1.0357, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.2194513715710724, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.00019858013254184597, |
|
"loss": 1.0275, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 2.2443890274314215, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 0.00019848077530122083, |
|
"loss": 1.0239, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.2693266832917707, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 0.0001983780848522559, |
|
"loss": 1.0292, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 2.29426433915212, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.00019827206467064133, |
|
"loss": 1.0222, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.319201995012469, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.00019816271834476642, |
|
"loss": 1.0256, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 2.344139650872818, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.00019805004957559793, |
|
"loss": 1.0329, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.369077306733167, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.00019793406217655517, |
|
"loss": 1.0253, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 2.3940149625935163, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.00019781476007338058, |
|
"loss": 1.0264, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.4189526184538654, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.00019769214730400712, |
|
"loss": 1.0241, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 2.4438902743142146, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 0.00019756622801842143, |
|
"loss": 1.0198, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.4688279301745637, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.00019743700647852354, |
|
"loss": 1.0253, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 2.493765586034913, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.00019730448705798239, |
|
"loss": 1.0231, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.518703241895262, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 0.00019716867424208806, |
|
"loss": 1.033, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 2.543640897755611, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.00019702957262759965, |
|
"loss": 1.0258, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 2.56857855361596, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.00019688718692259006, |
|
"loss": 1.003, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 2.5935162094763093, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.00019674152194628638, |
|
"loss": 1.0126, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 2.6184538653366585, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.00019659258262890683, |
|
"loss": 1.012, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 2.6433915211970076, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 0.0001964403740114939, |
|
"loss": 1.0175, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 2.6683291770573567, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.00019628490124574377, |
|
"loss": 1.0117, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 2.693266832917706, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.0001961261695938319, |
|
"loss": 1.0187, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 2.718204488778055, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 0.00019596418442823494, |
|
"loss": 1.0184, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 2.743142144638404, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.0001957989512315489, |
|
"loss": 1.0146, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 2.7680798004987532, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 0.00019563047559630357, |
|
"loss": 1.0141, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 2.7930174563591024, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 0.0001954587632247732, |
|
"loss": 1.0209, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 2.8179551122194515, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.00019528381992878362, |
|
"loss": 1.0165, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 2.8428927680798006, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 0.00019510565162951537, |
|
"loss": 1.0161, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 2.8678304239401498, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 0.0001949242643573034, |
|
"loss": 1.0158, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 2.892768079800499, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.00019473966425143292, |
|
"loss": 1.0127, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 2.917705735660848, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0001945518575599317, |
|
"loss": 1.0037, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 2.942643391521197, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.00019436085063935835, |
|
"loss": 1.0068, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 2.9675810473815463, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 0.00019416664995458756, |
|
"loss": 1.0061, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 2.9925187032418954, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 0.00019396926207859084, |
|
"loss": 1.0051, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.997506234413965, |
|
"eval_loss": 2.4609696865081787, |
|
"eval_runtime": 0.6338, |
|
"eval_samples_per_second": 15.777, |
|
"eval_steps_per_second": 1.578, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 3.0174563591022445, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 0.00019376869369221452, |
|
"loss": 0.9992, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 3.0423940149625937, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.00019356495158395315, |
|
"loss": 0.9824, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 3.067331670822943, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.00019335804264972018, |
|
"loss": 0.9911, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 3.092269326683292, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 0.00019314797389261424, |
|
"loss": 0.9921, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 3.117206982543641, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.00019293475242268223, |
|
"loss": 0.9925, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 3.14214463840399, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.00019271838545667876, |
|
"loss": 0.9767, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 3.1670822942643393, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0001924988803178216, |
|
"loss": 0.9821, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 3.1920199501246884, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 0.00019227624443554425, |
|
"loss": 0.9839, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 3.2169576059850375, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.00019205048534524406, |
|
"loss": 0.9904, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 3.2418952618453867, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.00019182161068802741, |
|
"loss": 0.987, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 3.266832917705736, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.00019158962821045112, |
|
"loss": 0.9912, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 3.291770573566085, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.0001913545457642601, |
|
"loss": 0.9848, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 3.316708229426434, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.0001911163713061217, |
|
"loss": 0.9872, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 3.341645885286783, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.00019087511289735644, |
|
"loss": 0.989, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 3.3665835411471323, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.000190630778703665, |
|
"loss": 0.9968, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 3.3915211970074814, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.00019038337699485208, |
|
"loss": 0.9839, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 3.4164588528678306, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.00019013291614454621, |
|
"loss": 0.9919, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 3.4413965087281797, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 0.0001898794046299167, |
|
"loss": 0.9905, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 3.466334164588529, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.00018962285103138636, |
|
"loss": 0.9932, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 3.491271820448878, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.00018936326403234125, |
|
"loss": 0.982, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 3.516209476309227, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.0001891006524188368, |
|
"loss": 0.9914, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 3.541147132169576, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.00018883502507930042, |
|
"loss": 0.9809, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 3.5660847880299253, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.0001885663910042306, |
|
"loss": 0.9818, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 3.5910224438902745, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.00018829475928589271, |
|
"loss": 0.9841, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 3.6159600997506236, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.00018802013911801112, |
|
"loss": 0.9796, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 3.6408977556109727, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.0001877425397954582, |
|
"loss": 0.9817, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 3.665835411471322, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.00018746197071393958, |
|
"loss": 0.9848, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 3.690773067331671, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.00018717844136967624, |
|
"loss": 0.9814, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 3.71571072319202, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.00018689196135908304, |
|
"loss": 0.9831, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 3.7406483790523692, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 0.00018660254037844388, |
|
"loss": 0.9829, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 3.765586034912718, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.00018631018822358363, |
|
"loss": 0.9651, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 3.7905236907730675, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.00018601491478953657, |
|
"loss": 0.9748, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 3.815461346633416, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.00018571673007021123, |
|
"loss": 0.9896, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 3.8403990024937658, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.00018541564415805258, |
|
"loss": 0.9782, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 3.8653366583541144, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.00018511166724369997, |
|
"loss": 0.9824, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 3.890274314214464, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.0001848048096156426, |
|
"loss": 0.9884, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 3.9152119700748127, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.00018449508165987105, |
|
"loss": 0.9872, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 3.9401496259351623, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 0.00018418249385952575, |
|
"loss": 0.9833, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 3.965087281795511, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.00018386705679454242, |
|
"loss": 0.9718, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 3.9900249376558605, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 0.00018354878114129367, |
|
"loss": 0.9807, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 2.4767043590545654, |
|
"eval_runtime": 0.5377, |
|
"eval_samples_per_second": 18.597, |
|
"eval_steps_per_second": 1.86, |
|
"step": 802 |
|
}, |
|
{ |
|
"epoch": 4.014962593516209, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.0001832276776722278, |
|
"loss": 0.9718, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 4.039900249376559, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 0.00018290375725550417, |
|
"loss": 0.9596, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 4.0648379052369075, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.00018257703085462542, |
|
"loss": 0.9552, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 4.089775561097257, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 0.00018224750952806624, |
|
"loss": 0.9614, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 4.114713216957606, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.0001819152044288992, |
|
"loss": 0.9702, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 4.139650872817955, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.00018158012680441723, |
|
"loss": 0.9534, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 4.164588528678304, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 0.00018124228799575295, |
|
"loss": 0.9643, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 4.1895261845386536, |
|
"grad_norm": 0.494140625, |
|
"learning_rate": 0.00018090169943749476, |
|
"loss": 0.9464, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 4.214463840399002, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.00018055837265729994, |
|
"loss": 0.9624, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 4.239401496259352, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 0.0001802123192755044, |
|
"loss": 0.9606, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 4.2643391521197005, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.00017986355100472928, |
|
"loss": 0.9548, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 4.28927680798005, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.0001795120796494848, |
|
"loss": 0.9541, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 4.314214463840399, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 0.00017915791710577033, |
|
"loss": 0.9537, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 4.339152119700748, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.00017880107536067218, |
|
"loss": 0.9691, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 4.364089775561097, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.00017844156649195759, |
|
"loss": 0.9597, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 4.389027431421447, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.00017807940266766593, |
|
"loss": 0.9517, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 4.413965087281795, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 0.0001777145961456971, |
|
"loss": 0.9598, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 4.438902743142145, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.0001773471592733964, |
|
"loss": 0.9617, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 4.4638403990024935, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.00017697710448713678, |
|
"loss": 0.9513, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 4.488778054862843, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.0001766044443118978, |
|
"loss": 0.9637, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 4.513715710723192, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 0.00017622919136084183, |
|
"loss": 0.9562, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 4.538653366583541, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.00017585135833488692, |
|
"loss": 0.9632, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 4.56359102244389, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 0.00017547095802227723, |
|
"loss": 0.9448, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 4.58852867830424, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.00017508800329814995, |
|
"loss": 0.963, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 4.613466334164588, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.0001747025071240996, |
|
"loss": 0.962, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 4.638403990024938, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 0.00017431448254773944, |
|
"loss": 0.9627, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 4.6633416458852865, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.0001739239427022596, |
|
"loss": 0.9574, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 4.688279301745636, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 0.0001735309008059829, |
|
"loss": 0.9675, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 4.713216957605985, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.00017313537016191706, |
|
"loss": 0.9546, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 4.738154613466334, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.00017273736415730488, |
|
"loss": 0.9472, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 4.763092269326683, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.0001723368962631708, |
|
"loss": 0.9528, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 4.788029925187033, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 0.0001719339800338651, |
|
"loss": 0.962, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 4.812967581047381, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.00017152862910660516, |
|
"loss": 0.9602, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 4.837905236907731, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.00017112085720101373, |
|
"loss": 0.9529, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 4.86284289276808, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.00017071067811865476, |
|
"loss": 0.957, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 4.887780548628429, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.0001702981057425662, |
|
"loss": 0.9597, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 4.912718204488778, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 0.00016988315403679, |
|
"loss": 0.9653, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 4.937655860349127, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.00016946583704589973, |
|
"loss": 0.9486, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 4.962593516209476, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.00016904616889452497, |
|
"loss": 0.9587, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 4.987531172069826, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 0.0001686241637868734, |
|
"loss": 0.9508, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 4.997506234413965, |
|
"eval_loss": 2.4788339138031006, |
|
"eval_runtime": 0.6532, |
|
"eval_samples_per_second": 15.309, |
|
"eval_steps_per_second": 1.531, |
|
"step": 1002 |
|
}, |
|
{ |
|
"epoch": 5.012468827930174, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 0.00016819983600624986, |
|
"loss": 0.9493, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 5.037406483790524, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.00016777319991457325, |
|
"loss": 0.941, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 5.062344139650873, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.00016734426995189004, |
|
"loss": 0.9364, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 5.087281795511222, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.00016691306063588583, |
|
"loss": 0.9424, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 5.112219451371571, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.00016647958656139378, |
|
"loss": 0.9365, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 5.13715710723192, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.00016604386239990078, |
|
"loss": 0.9405, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 5.162094763092269, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.00016560590289905073, |
|
"loss": 0.9274, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 5.187032418952619, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.00016516572288214552, |
|
"loss": 0.9433, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 5.211970074812967, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.00016472333724764325, |
|
"loss": 0.9372, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 5.236907730673317, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.00016427876096865394, |
|
"loss": 0.948, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 5.261845386533666, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.00016383200909243285, |
|
"loss": 0.9342, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 5.286783042394015, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.00016338309673987101, |
|
"loss": 0.944, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 5.311720698254364, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.00016293203910498376, |
|
"loss": 0.9438, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 5.3366583541147135, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.000162478851454396, |
|
"loss": 0.9319, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 5.361596009975062, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.000162023549126826, |
|
"loss": 0.9411, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 5.386533665835412, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.0001615661475325658, |
|
"loss": 0.9316, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 5.41147132169576, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.00016110666215295998, |
|
"loss": 0.9386, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 5.43640897755611, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.00016064510853988138, |
|
"loss": 0.9429, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 5.461346633416459, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.00016018150231520486, |
|
"loss": 0.9378, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 5.486284289276808, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 0.00015971585917027862, |
|
"loss": 0.9416, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 5.511221945137157, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 0.00015924819486539307, |
|
"loss": 0.9239, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 5.5361596009975065, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.00015877852522924732, |
|
"loss": 0.9264, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 5.561097256857855, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 0.00015830686615841348, |
|
"loss": 0.9382, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 5.586034912718205, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.00015783323361679864, |
|
"loss": 0.9405, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 5.610972568578553, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.0001573576436351046, |
|
"loss": 0.9466, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 5.635910224438903, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.00015688011231028518, |
|
"loss": 0.9366, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 5.660847880299252, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.00015640065580500148, |
|
"loss": 0.9419, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 5.685785536159601, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.0001559192903470747, |
|
"loss": 0.9283, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 5.71072319201995, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.00015543603222893716, |
|
"loss": 0.9384, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 5.7356608478802995, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 0.0001549508978070806, |
|
"loss": 0.9373, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 5.760598503740648, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.00015446390350150273, |
|
"loss": 0.9268, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 5.785536159600998, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 0.0001539750657951513, |
|
"loss": 0.9381, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 5.8104738154613464, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 0.00015348440123336645, |
|
"loss": 0.9321, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 5.835411471321696, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 0.0001529919264233205, |
|
"loss": 0.9304, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 5.860349127182045, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.000152497658033456, |
|
"loss": 0.934, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 5.885286783042394, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.00015200161279292155, |
|
"loss": 0.9493, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 5.910224438902743, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 0.00015150380749100545, |
|
"loss": 0.9362, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 5.9351620947630925, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 0.00015100425897656753, |
|
"loss": 0.9348, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 5.960099750623441, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.000150502984157469, |
|
"loss": 0.9406, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 5.985037406483791, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 0.9256, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 2.491248607635498, |
|
"eval_runtime": 0.5413, |
|
"eval_samples_per_second": 18.475, |
|
"eval_steps_per_second": 1.848, |
|
"step": 1203 |
|
}, |
|
{ |
|
"epoch": 6.0099750623441395, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.00014949532352830541, |
|
"loss": 0.9255, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 6.034912718204489, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.0001489889718238087, |
|
"loss": 0.9186, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 6.059850374064838, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.00014848096202463372, |
|
"loss": 0.9201, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 6.084788029925187, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 0.00014797131132502465, |
|
"loss": 0.9192, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 6.109725685785536, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.00014746003697476404, |
|
"loss": 0.9178, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 6.134663341645886, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.00014694715627858908, |
|
"loss": 0.9112, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 6.159600997506234, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.00014643268659560572, |
|
"loss": 0.9181, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 6.184538653366584, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.00014591664533870118, |
|
"loss": 0.9302, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 6.2094763092269325, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.00014539904997395468, |
|
"loss": 0.9257, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 6.234413965087282, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.00014487991802004623, |
|
"loss": 0.9188, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 6.259351620947631, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.00014435926704766362, |
|
"loss": 0.9218, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 6.28428927680798, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.00014383711467890774, |
|
"loss": 0.9132, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 6.309226932668329, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.00014331347858669632, |
|
"loss": 0.931, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 6.334164588528679, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.00014278837649416544, |
|
"loss": 0.9187, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 6.359102244389027, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.00014226182617406996, |
|
"loss": 0.9258, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 6.384039900249377, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.0001417338454481818, |
|
"loss": 0.9125, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 6.4089775561097255, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 0.00014120445218668686, |
|
"loss": 0.9148, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 6.433915211970075, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.00014067366430758004, |
|
"loss": 0.9149, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 6.458852867830424, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.00014014149977605893, |
|
"loss": 0.9119, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 6.483790523690773, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.0001396079766039157, |
|
"loss": 0.9313, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 6.508728179551122, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 0.00013907311284892736, |
|
"loss": 0.9148, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 6.533665835411472, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 0.00013853692661424484, |
|
"loss": 0.9253, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 6.55860349127182, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.00013799943604777992, |
|
"loss": 0.9204, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 6.58354114713217, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.00013746065934159123, |
|
"loss": 0.9219, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 6.6084788029925186, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 0.00013692061473126845, |
|
"loss": 0.9162, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 6.633416458852868, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 0.00013637932049531516, |
|
"loss": 0.9239, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 6.658354114713217, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.00013583679495453, |
|
"loss": 0.9217, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 6.683291770573566, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 0.00013529305647138687, |
|
"loss": 0.9268, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 6.708229426433915, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 0.00013474812344941315, |
|
"loss": 0.9238, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 6.733167082294265, |
|
"grad_norm": 0.494140625, |
|
"learning_rate": 0.00013420201433256689, |
|
"loss": 0.9141, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 6.758104738154613, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.00013365474760461266, |
|
"loss": 0.9219, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 6.783042394014963, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 0.0001331063417884958, |
|
"loss": 0.9177, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 6.807980049875312, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 0.00013255681544571568, |
|
"loss": 0.9215, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 6.832917705735661, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.00013200618717569714, |
|
"loss": 0.9087, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 6.85785536159601, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 0.00013145447561516138, |
|
"loss": 0.9276, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 6.882793017456359, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 0.00013090169943749476, |
|
"loss": 0.9147, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 6.907730673316708, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 0.0001303478773521171, |
|
"loss": 0.9209, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 6.932668329177058, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.0001297930281038482, |
|
"loss": 0.9248, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 6.957605985037406, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.00012923717047227368, |
|
"loss": 0.9197, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 6.982543640897756, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.00012868032327110904, |
|
"loss": 0.9216, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 6.997506234413965, |
|
"eval_loss": 2.5038018226623535, |
|
"eval_runtime": 0.6591, |
|
"eval_samples_per_second": 15.172, |
|
"eval_steps_per_second": 1.517, |
|
"step": 1403 |
|
}, |
|
{ |
|
"epoch": 7.007481296758105, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.00012812250534756308, |
|
"loss": 0.9115, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 7.032418952618454, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 0.0001275637355816999, |
|
"loss": 0.9038, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 7.057356608478803, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 0.0001270040328858001, |
|
"loss": 0.9088, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 7.082294264339152, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 0.00012644341620372023, |
|
"loss": 0.9016, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 7.107231920199501, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.00012588190451025207, |
|
"loss": 0.9112, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 7.132169576059851, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 0.0001253195168104802, |
|
"loss": 0.9081, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 7.157107231920199, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.0001247562721391386, |
|
"loss": 0.9094, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 7.182044887780549, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 0.00012419218955996676, |
|
"loss": 0.8956, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 7.206982543640898, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.00012362728816506417, |
|
"loss": 0.9044, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 7.231920199501247, |
|
"grad_norm": 0.494140625, |
|
"learning_rate": 0.00012306158707424403, |
|
"loss": 0.9069, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 7.256857855361596, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.0001224951054343865, |
|
"loss": 0.8949, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 7.2817955112219455, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.00012192786241879033, |
|
"loss": 0.901, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 7.306733167082294, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.00012135987722652402, |
|
"loss": 0.8983, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 7.331670822942644, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.00012079116908177593, |
|
"loss": 0.9113, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 7.356608478802992, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 0.00012022175723320381, |
|
"loss": 0.9026, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 7.381546134663342, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.00011965166095328301, |
|
"loss": 0.9075, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 7.406483790523691, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.00011908089953765449, |
|
"loss": 0.9045, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 7.43142144638404, |
|
"grad_norm": 0.494140625, |
|
"learning_rate": 0.00011850949230447145, |
|
"loss": 0.9099, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 7.456359102244389, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 0.00011793745859374575, |
|
"loss": 0.9035, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 7.4812967581047385, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.00011736481776669306, |
|
"loss": 0.9093, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 7.506234413965087, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.00011679158920507774, |
|
"loss": 0.8925, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 7.531172069825437, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.00011621779231055676, |
|
"loss": 0.9054, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 7.556109725685785, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.0001156434465040231, |
|
"loss": 0.9017, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 7.581047381546135, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.00011506857122494831, |
|
"loss": 0.9125, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 7.605985037406484, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.00011449318593072466, |
|
"loss": 0.9051, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 7.630922693266833, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.00011391731009600654, |
|
"loss": 0.894, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 7.655860349127182, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 0.00011334096321205128, |
|
"loss": 0.9096, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 7.6807980049875315, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.00011276416478605949, |
|
"loss": 0.908, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 7.70573566084788, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 0.00011218693434051475, |
|
"loss": 0.9118, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 7.73067331670823, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 0.00011160929141252303, |
|
"loss": 0.8973, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 7.7556109725685785, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.00011103125555315119, |
|
"loss": 0.9025, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 7.780548628428928, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.00011045284632676536, |
|
"loss": 0.8963, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 7.805486284289277, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.00010987408331036879, |
|
"loss": 0.9171, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 7.830423940149626, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.00010929498609293924, |
|
"loss": 0.9225, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 7.855361596009975, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 0.00010871557427476583, |
|
"loss": 0.9087, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 7.8802992518703245, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.00010813586746678583, |
|
"loss": 0.9119, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 7.905236907730673, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.00010755588528992082, |
|
"loss": 0.8963, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 7.930174563591023, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.00010697564737441252, |
|
"loss": 0.9115, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 7.9551122194513715, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.00010639517335915856, |
|
"loss": 0.903, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 7.980049875311721, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.00010581448289104758, |
|
"loss": 0.9094, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 2.512359380722046, |
|
"eval_runtime": 0.5399, |
|
"eval_samples_per_second": 18.52, |
|
"eval_steps_per_second": 1.852, |
|
"step": 1604 |
|
}, |
|
{ |
|
"epoch": 8.00498753117207, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.0001052335956242944, |
|
"loss": 0.9066, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 8.029925187032418, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.0001046525312197747, |
|
"loss": 0.8932, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 8.054862842892769, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 0.0001040713093443596, |
|
"loss": 0.8877, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 8.079800498753118, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.00010348994967025012, |
|
"loss": 0.9041, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 8.104738154613466, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.00010290847187431113, |
|
"loss": 0.8898, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 8.129675810473815, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 0.00010232689563740563, |
|
"loss": 0.8814, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 8.154613466334165, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.00010174524064372837, |
|
"loss": 0.8814, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 8.179551122194514, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.00010116352658013973, |
|
"loss": 0.8934, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 8.204488778054863, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 0.00010058177313549939, |
|
"loss": 0.8979, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 8.229426433915211, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 0.0001, |
|
"loss": 0.9019, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 8.254364089775562, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 9.94182268645006e-05, |
|
"loss": 0.8859, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 8.27930174563591, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 9.883647341986032e-05, |
|
"loss": 0.8938, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 8.30423940149626, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 9.825475935627165e-05, |
|
"loss": 0.892, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 8.329177057356608, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 9.767310436259438e-05, |
|
"loss": 0.8955, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 8.354114713216958, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 9.709152812568886e-05, |
|
"loss": 0.8929, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 8.379052369077307, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 9.651005032974994e-05, |
|
"loss": 0.8945, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 8.403990024937656, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 9.592869065564043e-05, |
|
"loss": 0.888, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 8.428927680798004, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 9.534746878022534e-05, |
|
"loss": 0.8907, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 8.453865336658355, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 9.476640437570562e-05, |
|
"loss": 0.8885, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 8.478802992518704, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 9.418551710895243e-05, |
|
"loss": 0.8867, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 8.503740648379052, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 9.360482664084145e-05, |
|
"loss": 0.8881, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 8.528678304239401, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 9.302435262558747e-05, |
|
"loss": 0.8964, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 8.553615960099751, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 9.244411471007922e-05, |
|
"loss": 0.888, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 8.5785536159601, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 9.186413253321418e-05, |
|
"loss": 0.8806, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 8.603491271820449, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 9.128442572523417e-05, |
|
"loss": 0.8849, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 8.628428927680797, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 9.070501390706079e-05, |
|
"loss": 0.8924, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 8.653366583541148, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 9.012591668963122e-05, |
|
"loss": 0.8911, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 8.678304239401497, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 8.954715367323468e-05, |
|
"loss": 0.8956, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 8.703241895261845, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 8.896874444684883e-05, |
|
"loss": 0.8981, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 8.728179551122194, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 8.839070858747697e-05, |
|
"loss": 0.8989, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 8.753117206982544, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 8.781306565948528e-05, |
|
"loss": 0.9039, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 8.778054862842893, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 8.723583521394054e-05, |
|
"loss": 0.9023, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 8.802992518703242, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 8.665903678794873e-05, |
|
"loss": 0.9084, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 8.82793017456359, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 8.608268990399349e-05, |
|
"loss": 0.8976, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 8.85286783042394, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 8.550681406927535e-05, |
|
"loss": 0.8931, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 8.87780548628429, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 8.49314287750517e-05, |
|
"loss": 0.8893, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 8.902743142144638, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 8.435655349597689e-05, |
|
"loss": 0.9013, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 8.927680798004987, |
|
"grad_norm": 0.494140625, |
|
"learning_rate": 8.378220768944327e-05, |
|
"loss": 0.9058, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 8.952618453865338, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 8.32084107949223e-05, |
|
"loss": 0.9019, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 8.977556109725686, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 8.263518223330697e-05, |
|
"loss": 0.8961, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 8.997506234413965, |
|
"eval_loss": 2.5246224403381348, |
|
"eval_runtime": 0.6529, |
|
"eval_samples_per_second": 15.316, |
|
"eval_steps_per_second": 1.532, |
|
"step": 1804 |
|
}, |
|
{ |
|
"epoch": 9.002493765586035, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 8.206254140625426e-05, |
|
"loss": 0.8949, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 9.027431421446384, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 8.149050769552856e-05, |
|
"loss": 0.8901, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 9.052369077306734, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 8.091910046234552e-05, |
|
"loss": 0.8822, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 9.077306733167083, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 8.034833904671698e-05, |
|
"loss": 0.8846, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 9.102244389027431, |
|
"grad_norm": 0.5, |
|
"learning_rate": 7.977824276679623e-05, |
|
"loss": 0.8819, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 9.12718204488778, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 7.920883091822408e-05, |
|
"loss": 0.888, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 9.15211970074813, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 7.864012277347602e-05, |
|
"loss": 0.8682, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 9.17705735660848, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 7.807213758120966e-05, |
|
"loss": 0.8772, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 9.201995012468828, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 7.750489456561352e-05, |
|
"loss": 0.8871, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 9.226932668329177, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 7.693841292575598e-05, |
|
"loss": 0.8856, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 9.251870324189527, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 7.637271183493586e-05, |
|
"loss": 0.884, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 9.276807980049876, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 7.580781044003324e-05, |
|
"loss": 0.8885, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 9.301745635910224, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 7.524372786086142e-05, |
|
"loss": 0.8825, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 9.326683291770573, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 7.468048318951983e-05, |
|
"loss": 0.8863, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 9.351620947630924, |
|
"grad_norm": 0.5, |
|
"learning_rate": 7.411809548974792e-05, |
|
"loss": 0.8877, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 9.376558603491272, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 7.35565837962798e-05, |
|
"loss": 0.8749, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 9.401496259351621, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 7.299596711419994e-05, |
|
"loss": 0.8851, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 9.42643391521197, |
|
"grad_norm": 0.5, |
|
"learning_rate": 7.243626441830009e-05, |
|
"loss": 0.8957, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 9.451371571072318, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 7.187749465243693e-05, |
|
"loss": 0.8904, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 9.476309226932669, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 7.131967672889101e-05, |
|
"loss": 0.8822, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 9.501246882793017, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 7.076282952772633e-05, |
|
"loss": 0.8771, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 9.526184538653366, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 7.02069718961518e-05, |
|
"loss": 0.8841, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 9.551122194513717, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 6.965212264788297e-05, |
|
"loss": 0.8857, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 9.576059850374065, |
|
"grad_norm": 0.494140625, |
|
"learning_rate": 6.909830056250527e-05, |
|
"loss": 0.8808, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 9.600997506234414, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 6.854552438483865e-05, |
|
"loss": 0.8872, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 9.625935162094763, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 6.799381282430284e-05, |
|
"loss": 0.893, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 9.650872817955111, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 6.744318455428436e-05, |
|
"loss": 0.87, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 9.675810473815462, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 6.68936582115042e-05, |
|
"loss": 0.9005, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 9.70074812967581, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 6.634525239538736e-05, |
|
"loss": 0.8936, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 9.72568578553616, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 6.579798566743314e-05, |
|
"loss": 0.898, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 9.75062344139651, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 6.525187655058686e-05, |
|
"loss": 0.8844, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 9.775561097256858, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 6.470694352861312e-05, |
|
"loss": 0.8849, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 9.800498753117207, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 6.416320504546997e-05, |
|
"loss": 0.881, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 9.825436408977556, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 6.362067950468489e-05, |
|
"loss": 0.8804, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 9.850374064837904, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 6.307938526873157e-05, |
|
"loss": 0.8794, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 9.875311720698255, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 6.25393406584088e-05, |
|
"loss": 0.8784, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 9.900249376558603, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 6.200056395222012e-05, |
|
"loss": 0.8875, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 9.925187032418952, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 6.146307338575519e-05, |
|
"loss": 0.8774, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 9.950124688279303, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 6.092688715107264e-05, |
|
"loss": 0.8789, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 9.975062344139651, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 6.039202339608432e-05, |
|
"loss": 0.8922, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 5.985850022394106e-05, |
|
"loss": 0.8816, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 2.5341641902923584, |
|
"eval_runtime": 0.5369, |
|
"eval_samples_per_second": 18.625, |
|
"eval_steps_per_second": 1.863, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 10.024937655860349, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 5.9326335692419995e-05, |
|
"loss": 0.8711, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 10.049875311720697, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 5.879554781331317e-05, |
|
"loss": 0.8889, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 10.074812967581048, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 5.8266154551818216e-05, |
|
"loss": 0.8789, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 10.099750623441397, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 5.773817382593008e-05, |
|
"loss": 0.8638, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 10.124688279301745, |
|
"grad_norm": 0.5, |
|
"learning_rate": 5.72116235058346e-05, |
|
"loss": 0.874, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 10.149625935162096, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 5.668652141330373e-05, |
|
"loss": 0.8764, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 10.174563591022444, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 5.616288532109225e-05, |
|
"loss": 0.8855, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 10.199501246882793, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 5.564073295233645e-05, |
|
"loss": 0.8802, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 10.224438902743142, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 5.5120081979953785e-05, |
|
"loss": 0.8738, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 10.24937655860349, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 5.4600950026045326e-05, |
|
"loss": 0.8715, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 10.27431421446384, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 5.4083354661298814e-05, |
|
"loss": 0.878, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 10.29925187032419, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 5.356731340439431e-05, |
|
"loss": 0.8851, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 10.324189526184538, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 5.305284372141095e-05, |
|
"loss": 0.8741, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 10.349127182044889, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 5.253996302523596e-05, |
|
"loss": 0.8801, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 10.374064837905237, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 5.2028688674975415e-05, |
|
"loss": 0.8729, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 10.399002493765586, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 5.15190379753663e-05, |
|
"loss": 0.8726, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 10.423940149625935, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 5.101102817619131e-05, |
|
"loss": 0.8908, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 10.448877805486283, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 5.05046764716946e-05, |
|
"loss": 0.8868, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 10.473815461346634, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 5.000000000000002e-05, |
|
"loss": 0.8743, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 10.498753117206983, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 4.9497015842531026e-05, |
|
"loss": 0.8756, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 10.523690773067331, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 4.899574102343247e-05, |
|
"loss": 0.881, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 10.548628428927682, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 4.8496192508994576e-05, |
|
"loss": 0.8769, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 10.57356608478803, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 4.799838720707846e-05, |
|
"loss": 0.88, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 10.598503740648379, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 4.7502341966544e-05, |
|
"loss": 0.8787, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 10.623441396508728, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 4.700807357667952e-05, |
|
"loss": 0.8774, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 10.648379052369076, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 4.6515598766633597e-05, |
|
"loss": 0.8807, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 10.673316708229427, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 4.6024934204848745e-05, |
|
"loss": 0.8702, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 10.698254364089776, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 4.5536096498497295e-05, |
|
"loss": 0.873, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 10.723192019950124, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 4.50491021929194e-05, |
|
"loss": 0.8785, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 10.748129675810475, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 4.456396777106285e-05, |
|
"loss": 0.8851, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 10.773067331670823, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 4.4080709652925336e-05, |
|
"loss": 0.8827, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 10.798004987531172, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 4.359934419499858e-05, |
|
"loss": 0.8717, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 10.82294264339152, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 4.3119887689714844e-05, |
|
"loss": 0.8737, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 10.84788029925187, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 4.264235636489542e-05, |
|
"loss": 0.8703, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 10.87281795511222, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 4.216676638320135e-05, |
|
"loss": 0.8657, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 10.897755610972569, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 4.169313384158653e-05, |
|
"loss": 0.8798, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 10.922693266832917, |
|
"grad_norm": 0.5, |
|
"learning_rate": 4.12214747707527e-05, |
|
"loss": 0.873, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 10.947630922693268, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 4.0751805134606944e-05, |
|
"loss": 0.877, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 10.972568578553616, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 4.028414082972141e-05, |
|
"loss": 0.8892, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 10.997506234413965, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 3.981849768479517e-05, |
|
"loss": 0.8722, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 10.997506234413965, |
|
"eval_loss": 2.534595012664795, |
|
"eval_runtime": 0.6862, |
|
"eval_samples_per_second": 14.573, |
|
"eval_steps_per_second": 1.457, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 11.022443890274314, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 3.935489146011869e-05, |
|
"loss": 0.8622, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 11.047381546134662, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 3.8893337847040025e-05, |
|
"loss": 0.8802, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 11.072319201995013, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 3.843385246743417e-05, |
|
"loss": 0.8698, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 11.097256857855362, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 3.7976450873174005e-05, |
|
"loss": 0.8726, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 11.12219451371571, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 3.7521148545604e-05, |
|
"loss": 0.8788, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 11.147132169576059, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 3.7067960895016275e-05, |
|
"loss": 0.8712, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 11.17206982543641, |
|
"grad_norm": 0.494140625, |
|
"learning_rate": 3.661690326012897e-05, |
|
"loss": 0.8716, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 11.197007481296758, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 3.61679909075672e-05, |
|
"loss": 0.8845, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 11.221945137157107, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 3.5721239031346066e-05, |
|
"loss": 0.874, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 11.246882793017456, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 3.527666275235677e-05, |
|
"loss": 0.8713, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 11.271820448877806, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 3.483427711785449e-05, |
|
"loss": 0.8764, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 11.296758104738155, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 3.439409710094929e-05, |
|
"loss": 0.8751, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 11.321695760598503, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 3.395613760009925e-05, |
|
"loss": 0.8752, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 11.346633416458852, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 3.352041343860621e-05, |
|
"loss": 0.8738, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 11.371571072319203, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 3.308693936411421e-05, |
|
"loss": 0.869, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 11.396508728179551, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 3.265573004810997e-05, |
|
"loss": 0.8755, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 11.4214463840399, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 3.222680008542678e-05, |
|
"loss": 0.8732, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 11.446384039900249, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 3.1800163993750166e-05, |
|
"loss": 0.8741, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 11.471321695760599, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 3.137583621312665e-05, |
|
"loss": 0.8636, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 11.496259351620948, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 3.095383110547506e-05, |
|
"loss": 0.8758, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 11.521197007481296, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 3.053416295410026e-05, |
|
"loss": 0.8626, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 11.546134663341645, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 3.0116845963209993e-05, |
|
"loss": 0.8797, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 11.571072319201996, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 2.9701894257433826e-05, |
|
"loss": 0.8623, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 11.596009975062344, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 2.9289321881345254e-05, |
|
"loss": 0.8736, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 11.620947630922693, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 2.8879142798986292e-05, |
|
"loss": 0.8656, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 11.645885286783042, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 2.8471370893394866e-05, |
|
"loss": 0.8637, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 11.670822942643392, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 2.8066019966134904e-05, |
|
"loss": 0.875, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 11.69576059850374, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 2.7663103736829198e-05, |
|
"loss": 0.8727, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 11.72069825436409, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 2.7262635842695127e-05, |
|
"loss": 0.8767, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 11.745635910224438, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 2.6864629838082956e-05, |
|
"loss": 0.8718, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 11.770573566084789, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 2.6469099194017143e-05, |
|
"loss": 0.8805, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 11.795511221945137, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 2.6076057297740407e-05, |
|
"loss": 0.8701, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 11.820448877805486, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 2.5685517452260567e-05, |
|
"loss": 0.8661, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 11.845386533665835, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 2.529749287590042e-05, |
|
"loss": 0.8684, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 11.870324189526185, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 2.491199670185008e-05, |
|
"loss": 0.8673, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 11.895261845386534, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 2.45290419777228e-05, |
|
"loss": 0.8707, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 11.920199501246882, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 2.4148641665113113e-05, |
|
"loss": 0.8744, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 11.945137157107231, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 2.3770808639158216e-05, |
|
"loss": 0.8767, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 11.970074812967582, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 2.339555568810221e-05, |
|
"loss": 0.8761, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 11.99501246882793, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 2.302289551286321e-05, |
|
"loss": 0.8768, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_loss": 2.5409655570983887, |
|
"eval_runtime": 0.5391, |
|
"eval_samples_per_second": 18.549, |
|
"eval_steps_per_second": 1.855, |
|
"step": 2406 |
|
}, |
|
{ |
|
"epoch": 12.019950124688279, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 2.265284072660362e-05, |
|
"loss": 0.8646, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 12.044887780548628, |
|
"grad_norm": 0.494140625, |
|
"learning_rate": 2.2285403854302912e-05, |
|
"loss": 0.8762, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 12.069825436408978, |
|
"grad_norm": 0.439453125, |
|
"learning_rate": 2.192059733233408e-05, |
|
"loss": 0.8769, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 12.094763092269327, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 2.155843350804243e-05, |
|
"loss": 0.865, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 12.119700748129675, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 2.119892463932781e-05, |
|
"loss": 0.8751, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 12.144638403990024, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 2.0842082894229676e-05, |
|
"loss": 0.8704, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 12.169576059850375, |
|
"grad_norm": 0.439453125, |
|
"learning_rate": 2.0487920350515212e-05, |
|
"loss": 0.8698, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 12.194513715710723, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 2.013644899527074e-05, |
|
"loss": 0.8773, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 12.219451371571072, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 1.9787680724495617e-05, |
|
"loss": 0.869, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 12.24438902743142, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 1.9441627342700065e-05, |
|
"loss": 0.8817, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 12.269326683291771, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 1.9098300562505266e-05, |
|
"loss": 0.8671, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 12.29426433915212, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 1.8757712004247096e-05, |
|
"loss": 0.8663, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 12.319201995012468, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 1.8419873195582814e-05, |
|
"loss": 0.8712, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 12.344139650872817, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 1.808479557110081e-05, |
|
"loss": 0.8737, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 12.369077306733168, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 1.775249047193377e-05, |
|
"loss": 0.866, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 12.394014962593516, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 1.7422969145374592e-05, |
|
"loss": 0.8699, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 12.418952618453865, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 1.7096242744495837e-05, |
|
"loss": 0.875, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 12.443890274314214, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 1.677232232777224e-05, |
|
"loss": 0.8704, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 12.468827930174564, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 1.6451218858706374e-05, |
|
"loss": 0.869, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 12.493765586034913, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 1.6132943205457606e-05, |
|
"loss": 0.8732, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 12.518703241895262, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 1.5817506140474247e-05, |
|
"loss": 0.8706, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 12.54364089775561, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 1.550491834012898e-05, |
|
"loss": 0.8665, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 12.56857855361596, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 1.5195190384357404e-05, |
|
"loss": 0.8715, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 12.59351620947631, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 1.4888332756300027e-05, |
|
"loss": 0.873, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 12.618453865336658, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 1.458435584194745e-05, |
|
"loss": 0.8663, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 12.643391521197007, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 1.4283269929788779e-05, |
|
"loss": 0.8715, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 12.668329177057357, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 1.3985085210463477e-05, |
|
"loss": 0.8625, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 12.693266832917706, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 1.3689811776416362e-05, |
|
"loss": 0.872, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 12.718204488778055, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 1.339745962155613e-05, |
|
"loss": 0.8621, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 12.743142144638403, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 1.3108038640916986e-05, |
|
"loss": 0.8709, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 12.768079800498754, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 1.2821558630323772e-05, |
|
"loss": 0.8675, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 12.793017456359102, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 1.2538029286060426e-05, |
|
"loss": 0.8683, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 12.817955112219451, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 1.2257460204541794e-05, |
|
"loss": 0.8646, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 12.8428927680798, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 1.1979860881988902e-05, |
|
"loss": 0.8703, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 12.86783042394015, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 1.1705240714107302e-05, |
|
"loss": 0.8732, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 12.892768079800499, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 1.1433608995769395e-05, |
|
"loss": 0.8722, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 12.917705735660848, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 1.116497492069961e-05, |
|
"loss": 0.8724, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 12.942643391521196, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 1.0899347581163221e-05, |
|
"loss": 0.867, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 12.967581047381547, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 1.0636735967658784e-05, |
|
"loss": 0.8667, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 12.992518703241895, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 1.0377148968613658e-05, |
|
"loss": 0.8694, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 12.997506234413965, |
|
"eval_loss": 2.5414979457855225, |
|
"eval_runtime": 0.6532, |
|
"eval_samples_per_second": 15.31, |
|
"eval_steps_per_second": 1.531, |
|
"step": 2606 |
|
}, |
|
{ |
|
"epoch": 13.017456359102244, |
|
"grad_norm": 0.439453125, |
|
"learning_rate": 1.0120595370083318e-05, |
|
"loss": 0.8778, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 13.042394014962593, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 9.867083855453774e-06, |
|
"loss": 0.8707, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 13.067331670822943, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 9.616623005147951e-06, |
|
"loss": 0.8675, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 13.092269326683292, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 9.369221296335006e-06, |
|
"loss": 0.8655, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 13.11720698254364, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 9.124887102643575e-06, |
|
"loss": 0.8662, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 13.14214463840399, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 8.883628693878298e-06, |
|
"loss": 0.8656, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 13.16708229426434, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 8.645454235739903e-06, |
|
"loss": 0.8677, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 13.192019950124688, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 8.41037178954891e-06, |
|
"loss": 0.8673, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 13.216957605985037, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 8.178389311972612e-06, |
|
"loss": 0.8726, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 13.241895261845386, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 7.949514654755962e-06, |
|
"loss": 0.8622, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 13.266832917705736, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 7.72375556445577e-06, |
|
"loss": 0.8703, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 13.291770573566085, |
|
"grad_norm": 0.439453125, |
|
"learning_rate": 7.501119682178393e-06, |
|
"loss": 0.8723, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 13.316708229426434, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 7.281614543321269e-06, |
|
"loss": 0.8673, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 13.341645885286782, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 7.0652475773177464e-06, |
|
"loss": 0.867, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 13.366583541147133, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 6.852026107385756e-06, |
|
"loss": 0.8627, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 13.391521197007481, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 6.6419573502798374e-06, |
|
"loss": 0.8693, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 13.41645885286783, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 6.435048416046863e-06, |
|
"loss": 0.8745, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 13.441396508728179, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 6.231306307785522e-06, |
|
"loss": 0.8654, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 13.46633416458853, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 6.030737921409169e-06, |
|
"loss": 0.8763, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 13.491271820448878, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 5.833350045412478e-06, |
|
"loss": 0.8773, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 13.516209476309227, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 5.639149360641649e-06, |
|
"loss": 0.8763, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 13.541147132169575, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 5.448142440068316e-06, |
|
"loss": 0.8732, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 13.566084788029926, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 5.26033574856708e-06, |
|
"loss": 0.8718, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 13.591022443890274, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 5.075735642696611e-06, |
|
"loss": 0.8776, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 13.615960099750623, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 4.8943483704846475e-06, |
|
"loss": 0.8643, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 13.640897755610972, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 4.7161800712163806e-06, |
|
"loss": 0.8767, |
|
"step": 2735 |
|
}, |
|
{ |
|
"epoch": 13.665835411471322, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 4.541236775226809e-06, |
|
"loss": 0.8656, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 13.690773067331671, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 4.369524403696457e-06, |
|
"loss": 0.8735, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 13.71571072319202, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 4.20104876845111e-06, |
|
"loss": 0.8678, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 13.740648379052368, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 4.0358155717650895e-06, |
|
"loss": 0.8591, |
|
"step": 2755 |
|
}, |
|
{ |
|
"epoch": 13.765586034912719, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 3.873830406168111e-06, |
|
"loss": 0.8646, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 13.790523690773068, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 3.715098754256241e-06, |
|
"loss": 0.8643, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 13.815461346633416, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 3.5596259885061102e-06, |
|
"loss": 0.8702, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 13.840399002493765, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 3.40741737109318e-06, |
|
"loss": 0.8669, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 13.865336658354115, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 3.2584780537136207e-06, |
|
"loss": 0.8818, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 13.890274314214464, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 3.1128130774099262e-06, |
|
"loss": 0.8771, |
|
"step": 2785 |
|
}, |
|
{ |
|
"epoch": 13.915211970074813, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 2.970427372400353e-06, |
|
"loss": 0.8643, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 13.940149625935161, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 2.8313257579119843e-06, |
|
"loss": 0.8569, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 13.965087281795512, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 2.6955129420176196e-06, |
|
"loss": 0.8664, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 13.99002493765586, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 2.5629935214764865e-06, |
|
"loss": 0.8709, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_loss": 2.5418429374694824, |
|
"eval_runtime": 0.5336, |
|
"eval_samples_per_second": 18.742, |
|
"eval_steps_per_second": 1.874, |
|
"step": 2807 |
|
}, |
|
{ |
|
"epoch": 14.01496259351621, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 2.433771981578581e-06, |
|
"loss": 0.8642, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 14.039900249376558, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 2.307852695992907e-06, |
|
"loss": 0.8614, |
|
"step": 2815 |
|
}, |
|
{ |
|
"epoch": 14.064837905236908, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 2.1852399266194314e-06, |
|
"loss": 0.8539, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 14.089775561097257, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 2.0659378234448525e-06, |
|
"loss": 0.867, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 14.114713216957606, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 1.9499504244020693e-06, |
|
"loss": 0.8612, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 14.139650872817954, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 1.8372816552336026e-06, |
|
"loss": 0.8633, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 14.164588528678305, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 1.7279353293586765e-06, |
|
"loss": 0.8609, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 14.189526184538654, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 1.621915147744124e-06, |
|
"loss": 0.8658, |
|
"step": 2845 |
|
}, |
|
{ |
|
"epoch": 14.214463840399002, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 1.5192246987791981e-06, |
|
"loss": 0.8608, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 14.239401496259351, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 1.4198674581540339e-06, |
|
"loss": 0.8689, |
|
"step": 2855 |
|
}, |
|
{ |
|
"epoch": 14.264339152119701, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 1.323846788742078e-06, |
|
"loss": 0.8778, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 14.28927680798005, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 1.231165940486234e-06, |
|
"loss": 0.8685, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 14.314214463840399, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 1.14182805028884e-06, |
|
"loss": 0.8658, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 14.339152119700747, |
|
"grad_norm": 0.439453125, |
|
"learning_rate": 1.055836141905553e-06, |
|
"loss": 0.8771, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 14.364089775561098, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 9.731931258429638e-07, |
|
"loss": 0.8816, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 14.389027431421447, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 8.939017992601328e-07, |
|
"loss": 0.8699, |
|
"step": 2885 |
|
}, |
|
{ |
|
"epoch": 14.413965087281795, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 8.17964845873831e-07, |
|
"loss": 0.8734, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 14.438902743142144, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 7.453848358678017e-07, |
|
"loss": 0.8724, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 14.463840399002494, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 6.761642258056978e-07, |
|
"loss": 0.8784, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 14.488778054862843, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 6.103053585480023e-07, |
|
"loss": 0.8644, |
|
"step": 2905 |
|
}, |
|
{ |
|
"epoch": 14.513715710723192, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 5.478104631726711e-07, |
|
"loss": 0.8615, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 14.53865336658354, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 4.88681654899692e-07, |
|
"loss": 0.8662, |
|
"step": 2915 |
|
}, |
|
{ |
|
"epoch": 14.563591022443891, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 4.329209350195651e-07, |
|
"loss": 0.8694, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 14.58852867830424, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 3.805301908254455e-07, |
|
"loss": 0.8729, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 14.613466334164588, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 3.315111955493944e-07, |
|
"loss": 0.8758, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 14.638403990024937, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 2.858656083022604e-07, |
|
"loss": 0.8569, |
|
"step": 2935 |
|
}, |
|
{ |
|
"epoch": 14.663341645885287, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 2.4359497401758024e-07, |
|
"loss": 0.8734, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 14.688279301745636, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 2.0470072339926484e-07, |
|
"loss": 0.868, |
|
"step": 2945 |
|
}, |
|
{ |
|
"epoch": 14.713216957605985, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 1.6918417287318245e-07, |
|
"loss": 0.8797, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 14.738154613466333, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 1.3704652454261668e-07, |
|
"loss": 0.8734, |
|
"step": 2955 |
|
}, |
|
{ |
|
"epoch": 14.763092269326684, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 1.0828886614754341e-07, |
|
"loss": 0.88, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 14.788029925187033, |
|
"grad_norm": 0.494140625, |
|
"learning_rate": 8.29121710278713e-08, |
|
"loss": 0.8667, |
|
"step": 2965 |
|
}, |
|
{ |
|
"epoch": 14.812967581047381, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 6.09172980904238e-08, |
|
"loss": 0.8642, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 14.83790523690773, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 4.230499177994007e-08, |
|
"loss": 0.8733, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 14.86284289276808, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 2.7075882053828605e-08, |
|
"loss": 0.8704, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 14.88778054862843, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 1.5230484360873044e-08, |
|
"loss": 0.8678, |
|
"step": 2985 |
|
}, |
|
{ |
|
"epoch": 14.912718204488778, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 6.769199623779532e-09, |
|
"loss": 0.8684, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 14.937655860349127, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 1.6923142255764746e-09, |
|
"loss": 0.8586, |
|
"step": 2995 |
|
}, |
|
{ |
|
"epoch": 14.962593516209477, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 0.0, |
|
"loss": 0.8781, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 14.962593516209477, |
|
"eval_loss": 2.542728900909424, |
|
"eval_runtime": 0.4926, |
|
"eval_samples_per_second": 20.301, |
|
"eval_steps_per_second": 2.03, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 14.962593516209477, |
|
"step": 3000, |
|
"total_flos": 1.7621200159784305e+18, |
|
"train_loss": 0.9822600702444713, |
|
"train_runtime": 10311.9233, |
|
"train_samples_per_second": 13.994, |
|
"train_steps_per_second": 0.291 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 3000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 15, |
|
"save_steps": 100, |
|
"total_flos": 1.7621200159784305e+18, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|