|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 28.0, |
|
"eval_steps": 500, |
|
"global_step": 9562, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.29282576866764276, |
|
"grad_norm": 0.26115021109580994, |
|
"learning_rate": 3e-05, |
|
"loss": 2.0719, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.5856515373352855, |
|
"grad_norm": 0.2679530680179596, |
|
"learning_rate": 3e-05, |
|
"loss": 1.969, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.8784773060029283, |
|
"grad_norm": 0.2916224002838135, |
|
"learning_rate": 3e-05, |
|
"loss": 1.9686, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.9985358711566618, |
|
"eval_accuracy": 0.43743589743589745, |
|
"eval_loss": 3.6918885707855225, |
|
"eval_runtime": 5.2974, |
|
"eval_samples_per_second": 94.385, |
|
"eval_steps_per_second": 11.893, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.9985358711566618, |
|
"eval_bleu": 0.1527908371131025, |
|
"eval_exact_match": 0.012, |
|
"eval_prefix_exact_match": 0.026, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 1.171303074670571, |
|
"grad_norm": 0.3534347712993622, |
|
"learning_rate": 3e-05, |
|
"loss": 1.9603, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.4641288433382138, |
|
"grad_norm": 0.3338424563407898, |
|
"learning_rate": 3e-05, |
|
"loss": 1.9235, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.7569546120058566, |
|
"grad_norm": 0.4033390283584595, |
|
"learning_rate": 3e-05, |
|
"loss": 1.9337, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.44764102564102565, |
|
"eval_loss": 3.7484843730926514, |
|
"eval_runtime": 5.0649, |
|
"eval_samples_per_second": 98.718, |
|
"eval_steps_per_second": 12.438, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_bleu": 0.21311912714932216, |
|
"eval_exact_match": 0.02, |
|
"eval_prefix_exact_match": 0.032, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 2.049780380673499, |
|
"grad_norm": 0.38882532715797424, |
|
"learning_rate": 3e-05, |
|
"loss": 1.9267, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.342606149341142, |
|
"grad_norm": 0.44310709834098816, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8823, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.6354319180087846, |
|
"grad_norm": 0.5196680426597595, |
|
"learning_rate": 3e-05, |
|
"loss": 1.901, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.9282576866764276, |
|
"grad_norm": 0.5293437242507935, |
|
"learning_rate": 3e-05, |
|
"loss": 1.9033, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.998535871156662, |
|
"eval_accuracy": 0.4495897435897436, |
|
"eval_loss": 3.882551670074463, |
|
"eval_runtime": 5.1314, |
|
"eval_samples_per_second": 97.439, |
|
"eval_steps_per_second": 12.277, |
|
"step": 1024 |
|
}, |
|
{ |
|
"epoch": 2.998535871156662, |
|
"eval_bleu": 0.24307488023362148, |
|
"eval_exact_match": 0.016, |
|
"eval_prefix_exact_match": 0.032, |
|
"step": 1024 |
|
}, |
|
{ |
|
"epoch": 3.22108345534407, |
|
"grad_norm": 0.552943229675293, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8529, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 3.513909224011713, |
|
"grad_norm": 0.6421984434127808, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8342, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 3.8067349926793557, |
|
"grad_norm": 0.7155198454856873, |
|
"learning_rate": 3e-05, |
|
"loss": 1.857, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.4481025641025641, |
|
"eval_loss": 3.970107316970825, |
|
"eval_runtime": 5.1563, |
|
"eval_samples_per_second": 96.969, |
|
"eval_steps_per_second": 12.218, |
|
"step": 1366 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_bleu": 0.25309406645138827, |
|
"eval_exact_match": 0.016, |
|
"eval_prefix_exact_match": 0.044, |
|
"step": 1366 |
|
}, |
|
{ |
|
"epoch": 4.099560761346998, |
|
"grad_norm": 0.7837833166122437, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8284, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 4.392386530014641, |
|
"grad_norm": 0.784609854221344, |
|
"learning_rate": 3e-05, |
|
"loss": 1.7884, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 4.685212298682284, |
|
"grad_norm": 0.834195613861084, |
|
"learning_rate": 3e-05, |
|
"loss": 1.7841, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 4.978038067349927, |
|
"grad_norm": 0.8902062177658081, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8042, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 4.998535871156662, |
|
"eval_accuracy": 0.4472820512820513, |
|
"eval_loss": 4.117065906524658, |
|
"eval_runtime": 5.2376, |
|
"eval_samples_per_second": 95.464, |
|
"eval_steps_per_second": 12.028, |
|
"step": 1707 |
|
}, |
|
{ |
|
"epoch": 4.998535871156662, |
|
"eval_bleu": 0.2531049955279983, |
|
"eval_exact_match": 0.014, |
|
"eval_prefix_exact_match": 0.042, |
|
"step": 1707 |
|
}, |
|
{ |
|
"epoch": 5.270863836017569, |
|
"grad_norm": 0.9495383501052856, |
|
"learning_rate": 3e-05, |
|
"loss": 1.7493, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 5.563689604685212, |
|
"grad_norm": 0.9647482633590698, |
|
"learning_rate": 3e-05, |
|
"loss": 1.7334, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 5.856515373352855, |
|
"grad_norm": 0.9825767874717712, |
|
"learning_rate": 3e-05, |
|
"loss": 1.7443, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.446974358974359, |
|
"eval_loss": 4.183700084686279, |
|
"eval_runtime": 5.2674, |
|
"eval_samples_per_second": 94.923, |
|
"eval_steps_per_second": 11.96, |
|
"step": 2049 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_bleu": 0.23587359955369594, |
|
"eval_exact_match": 0.012, |
|
"eval_prefix_exact_match": 0.04, |
|
"step": 2049 |
|
}, |
|
{ |
|
"epoch": 6.149341142020498, |
|
"grad_norm": 1.0430296659469604, |
|
"learning_rate": 3e-05, |
|
"loss": 1.6959, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 6.44216691068814, |
|
"grad_norm": 1.0777732133865356, |
|
"learning_rate": 3e-05, |
|
"loss": 1.6795, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 6.734992679355783, |
|
"grad_norm": 1.0959677696228027, |
|
"learning_rate": 3e-05, |
|
"loss": 1.7019, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 6.998535871156662, |
|
"eval_accuracy": 0.4461538461538462, |
|
"eval_loss": 4.260408401489258, |
|
"eval_runtime": 5.178, |
|
"eval_samples_per_second": 96.563, |
|
"eval_steps_per_second": 12.167, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 6.998535871156662, |
|
"eval_bleu": 0.2008158466316808, |
|
"eval_exact_match": 0.006, |
|
"eval_prefix_exact_match": 0.042, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 7.027818448023426, |
|
"grad_norm": 1.0649549961090088, |
|
"learning_rate": 3e-05, |
|
"loss": 1.6812, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 7.320644216691068, |
|
"grad_norm": 1.294122576713562, |
|
"learning_rate": 3e-05, |
|
"loss": 1.6226, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 7.613469985358711, |
|
"grad_norm": 1.2657020092010498, |
|
"learning_rate": 3e-05, |
|
"loss": 1.6404, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 7.906295754026354, |
|
"grad_norm": 1.1963133811950684, |
|
"learning_rate": 3e-05, |
|
"loss": 1.6305, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.44148717948717947, |
|
"eval_loss": 4.406544208526611, |
|
"eval_runtime": 5.3355, |
|
"eval_samples_per_second": 93.711, |
|
"eval_steps_per_second": 11.808, |
|
"step": 2732 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_bleu": 0.24435500637461924, |
|
"eval_exact_match": 0.008, |
|
"eval_prefix_exact_match": 0.038, |
|
"step": 2732 |
|
}, |
|
{ |
|
"epoch": 8.199121522693996, |
|
"grad_norm": 1.3801360130310059, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5734, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 8.49194729136164, |
|
"grad_norm": 1.3306772708892822, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5651, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 8.784773060029282, |
|
"grad_norm": 1.5165746212005615, |
|
"learning_rate": 3e-05, |
|
"loss": 1.6056, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 8.998535871156662, |
|
"eval_accuracy": 0.4397948717948718, |
|
"eval_loss": 4.448728561401367, |
|
"eval_runtime": 5.187, |
|
"eval_samples_per_second": 96.395, |
|
"eval_steps_per_second": 12.146, |
|
"step": 3073 |
|
}, |
|
{ |
|
"epoch": 8.998535871156662, |
|
"eval_bleu": 0.20120283162176952, |
|
"eval_exact_match": 0.008, |
|
"eval_prefix_exact_match": 0.036, |
|
"step": 3073 |
|
}, |
|
{ |
|
"epoch": 9.077598828696924, |
|
"grad_norm": 1.4077060222625732, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5612, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 9.370424597364568, |
|
"grad_norm": 1.475984811782837, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5158, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 9.66325036603221, |
|
"grad_norm": 1.586300015449524, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5145, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 9.956076134699854, |
|
"grad_norm": 1.510048270225525, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5521, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.4388717948717949, |
|
"eval_loss": 4.5474138259887695, |
|
"eval_runtime": 5.169, |
|
"eval_samples_per_second": 96.731, |
|
"eval_steps_per_second": 12.188, |
|
"step": 3415 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_bleu": 0.2258317707850195, |
|
"eval_exact_match": 0.004, |
|
"eval_prefix_exact_match": 0.032, |
|
"step": 3415 |
|
}, |
|
{ |
|
"epoch": 10.248901903367496, |
|
"grad_norm": 1.6384536027908325, |
|
"learning_rate": 3e-05, |
|
"loss": 1.4631, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 10.541727672035138, |
|
"grad_norm": 1.7626007795333862, |
|
"learning_rate": 3e-05, |
|
"loss": 1.4623, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 10.834553440702782, |
|
"grad_norm": 1.6483694314956665, |
|
"learning_rate": 3e-05, |
|
"loss": 1.4934, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 10.998535871156662, |
|
"eval_accuracy": 0.4367179487179487, |
|
"eval_loss": 4.589761734008789, |
|
"eval_runtime": 5.853, |
|
"eval_samples_per_second": 85.426, |
|
"eval_steps_per_second": 10.764, |
|
"step": 3756 |
|
}, |
|
{ |
|
"epoch": 10.998535871156662, |
|
"eval_bleu": 0.20792247275869374, |
|
"eval_exact_match": 0.004, |
|
"eval_prefix_exact_match": 0.026, |
|
"step": 3756 |
|
}, |
|
{ |
|
"epoch": 11.127379209370424, |
|
"grad_norm": 1.8367844820022583, |
|
"learning_rate": 3e-05, |
|
"loss": 1.4521, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 11.420204978038067, |
|
"grad_norm": 1.8123779296875, |
|
"learning_rate": 3e-05, |
|
"loss": 1.4085, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 11.71303074670571, |
|
"grad_norm": 1.9437752962112427, |
|
"learning_rate": 3e-05, |
|
"loss": 1.4287, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.43553846153846154, |
|
"eval_loss": 4.691086769104004, |
|
"eval_runtime": 5.3738, |
|
"eval_samples_per_second": 93.043, |
|
"eval_steps_per_second": 11.723, |
|
"step": 4098 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_bleu": 0.21672573731424186, |
|
"eval_exact_match": 0.006, |
|
"eval_prefix_exact_match": 0.028, |
|
"step": 4098 |
|
}, |
|
{ |
|
"epoch": 12.005856515373353, |
|
"grad_norm": 1.5838289260864258, |
|
"learning_rate": 3e-05, |
|
"loss": 1.4361, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 12.298682284040996, |
|
"grad_norm": 1.8575290441513062, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3433, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 12.591508052708638, |
|
"grad_norm": 1.9598041772842407, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3661, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 12.88433382137628, |
|
"grad_norm": 1.972550868988037, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3846, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 12.998535871156662, |
|
"eval_accuracy": 0.43553846153846154, |
|
"eval_loss": 4.762892246246338, |
|
"eval_runtime": 5.106, |
|
"eval_samples_per_second": 97.925, |
|
"eval_steps_per_second": 12.339, |
|
"step": 4439 |
|
}, |
|
{ |
|
"epoch": 12.998535871156662, |
|
"eval_bleu": 0.2416046504676966, |
|
"eval_exact_match": 0.006, |
|
"eval_prefix_exact_match": 0.028, |
|
"step": 4439 |
|
}, |
|
{ |
|
"epoch": 13.177159590043924, |
|
"grad_norm": 1.9469107389450073, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3229, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 13.469985358711567, |
|
"grad_norm": 2.1680469512939453, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3172, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 13.762811127379209, |
|
"grad_norm": 2.1960484981536865, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3185, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.4327692307692308, |
|
"eval_loss": 4.758542537689209, |
|
"eval_runtime": 5.778, |
|
"eval_samples_per_second": 86.535, |
|
"eval_steps_per_second": 10.903, |
|
"step": 4781 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_bleu": 0.19085834097154403, |
|
"eval_exact_match": 0.004, |
|
"eval_prefix_exact_match": 0.022, |
|
"step": 4781 |
|
}, |
|
{ |
|
"epoch": 14.055636896046853, |
|
"grad_norm": 2.4733147621154785, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3027, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 14.348462664714495, |
|
"grad_norm": 2.2826030254364014, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2513, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 14.641288433382137, |
|
"grad_norm": 2.041346788406372, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2637, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 14.93411420204978, |
|
"grad_norm": 2.3494467735290527, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2667, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 14.998535871156662, |
|
"eval_accuracy": 0.4308717948717949, |
|
"eval_loss": 4.938868522644043, |
|
"eval_runtime": 5.9024, |
|
"eval_samples_per_second": 84.711, |
|
"eval_steps_per_second": 10.674, |
|
"step": 5122 |
|
}, |
|
{ |
|
"epoch": 14.998535871156662, |
|
"eval_bleu": 0.18549760176612626, |
|
"eval_exact_match": 0.002, |
|
"eval_prefix_exact_match": 0.024, |
|
"step": 5122 |
|
}, |
|
{ |
|
"epoch": 15.226939970717423, |
|
"grad_norm": 2.411022663116455, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2158, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 15.519765739385067, |
|
"grad_norm": 2.3778491020202637, |
|
"learning_rate": 3e-05, |
|
"loss": 1.193, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 15.812591508052709, |
|
"grad_norm": 2.2003743648529053, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2144, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.43025641025641026, |
|
"eval_loss": 4.898688316345215, |
|
"eval_runtime": 5.1834, |
|
"eval_samples_per_second": 96.463, |
|
"eval_steps_per_second": 12.154, |
|
"step": 5464 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_bleu": 0.20104450766148693, |
|
"eval_exact_match": 0.004, |
|
"eval_prefix_exact_match": 0.026, |
|
"step": 5464 |
|
}, |
|
{ |
|
"epoch": 16.105417276720353, |
|
"grad_norm": 2.1213016510009766, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1842, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 16.398243045387993, |
|
"grad_norm": 2.4457600116729736, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1527, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 16.691068814055637, |
|
"grad_norm": 2.5844104290008545, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1536, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 16.98389458272328, |
|
"grad_norm": 2.7657299041748047, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1708, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 16.998535871156662, |
|
"eval_accuracy": 0.4297435897435897, |
|
"eval_loss": 5.00166654586792, |
|
"eval_runtime": 5.4251, |
|
"eval_samples_per_second": 92.164, |
|
"eval_steps_per_second": 11.613, |
|
"step": 5805 |
|
}, |
|
{ |
|
"epoch": 16.998535871156662, |
|
"eval_bleu": 0.2059810081312132, |
|
"eval_exact_match": 0.004, |
|
"eval_prefix_exact_match": 0.028, |
|
"step": 5805 |
|
}, |
|
{ |
|
"epoch": 17.27672035139092, |
|
"grad_norm": 2.4050705432891846, |
|
"learning_rate": 3e-05, |
|
"loss": 1.083, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 17.569546120058565, |
|
"grad_norm": 2.584331750869751, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0993, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 17.86237188872621, |
|
"grad_norm": 3.257659912109375, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1146, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.43066666666666664, |
|
"eval_loss": 4.977816581726074, |
|
"eval_runtime": 5.2952, |
|
"eval_samples_per_second": 94.425, |
|
"eval_steps_per_second": 11.898, |
|
"step": 6147 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_bleu": 0.1948234970665932, |
|
"eval_exact_match": 0.004, |
|
"eval_prefix_exact_match": 0.032, |
|
"step": 6147 |
|
}, |
|
{ |
|
"epoch": 18.15519765739385, |
|
"grad_norm": 3.068880319595337, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0673, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 18.448023426061493, |
|
"grad_norm": 2.737912893295288, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0381, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 18.740849194729137, |
|
"grad_norm": 2.878462076187134, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0531, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 18.998535871156662, |
|
"eval_accuracy": 0.42866666666666664, |
|
"eval_loss": 5.121572494506836, |
|
"eval_runtime": 5.3996, |
|
"eval_samples_per_second": 92.599, |
|
"eval_steps_per_second": 11.667, |
|
"step": 6488 |
|
}, |
|
{ |
|
"epoch": 18.998535871156662, |
|
"eval_bleu": 0.19788734504730915, |
|
"eval_exact_match": 0.0, |
|
"eval_prefix_exact_match": 0.02, |
|
"step": 6488 |
|
}, |
|
{ |
|
"epoch": 19.03367496339678, |
|
"grad_norm": 2.6934640407562256, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0564, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 19.32650073206442, |
|
"grad_norm": 2.822169542312622, |
|
"learning_rate": 3e-05, |
|
"loss": 0.99, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 19.619326500732065, |
|
"grad_norm": 2.520814895629883, |
|
"learning_rate": 3e-05, |
|
"loss": 0.991, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 19.91215226939971, |
|
"grad_norm": 3.1566810607910156, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0158, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.42728205128205127, |
|
"eval_loss": 5.120981216430664, |
|
"eval_runtime": 5.394, |
|
"eval_samples_per_second": 92.696, |
|
"eval_steps_per_second": 11.68, |
|
"step": 6830 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_bleu": 0.15806516253752823, |
|
"eval_exact_match": 0.006, |
|
"eval_prefix_exact_match": 0.028, |
|
"step": 6830 |
|
}, |
|
{ |
|
"epoch": 20.20497803806735, |
|
"grad_norm": 2.937910556793213, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9615, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 20.497803806734993, |
|
"grad_norm": 3.2016685009002686, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9404, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 20.790629575402637, |
|
"grad_norm": 3.2996864318847656, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9555, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 20.998535871156662, |
|
"eval_accuracy": 0.42928205128205127, |
|
"eval_loss": 5.198832988739014, |
|
"eval_runtime": 5.8905, |
|
"eval_samples_per_second": 84.882, |
|
"eval_steps_per_second": 10.695, |
|
"step": 7171 |
|
}, |
|
{ |
|
"epoch": 20.998535871156662, |
|
"eval_bleu": 0.18303907416923465, |
|
"eval_exact_match": 0.004, |
|
"eval_prefix_exact_match": 0.02, |
|
"step": 7171 |
|
}, |
|
{ |
|
"epoch": 21.083455344070277, |
|
"grad_norm": 3.4803571701049805, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9454, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 21.37628111273792, |
|
"grad_norm": 2.6315698623657227, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8835, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 21.669106881405565, |
|
"grad_norm": 3.0926690101623535, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9004, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 21.961932650073205, |
|
"grad_norm": 2.9990713596343994, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9205, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"eval_accuracy": 0.427025641025641, |
|
"eval_loss": 5.2239508628845215, |
|
"eval_runtime": 5.4025, |
|
"eval_samples_per_second": 92.55, |
|
"eval_steps_per_second": 11.661, |
|
"step": 7513 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"eval_bleu": 0.1854279974336752, |
|
"eval_exact_match": 0.004, |
|
"eval_prefix_exact_match": 0.02, |
|
"step": 7513 |
|
}, |
|
{ |
|
"epoch": 22.25475841874085, |
|
"grad_norm": 3.2822601795196533, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8461, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 22.547584187408493, |
|
"grad_norm": 3.4992525577545166, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8485, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 22.840409956076133, |
|
"grad_norm": 3.7167255878448486, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8711, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 22.998535871156662, |
|
"eval_accuracy": 0.42512820512820515, |
|
"eval_loss": 5.346656322479248, |
|
"eval_runtime": 5.4264, |
|
"eval_samples_per_second": 92.143, |
|
"eval_steps_per_second": 11.61, |
|
"step": 7854 |
|
}, |
|
{ |
|
"epoch": 22.998535871156662, |
|
"eval_bleu": 0.18735744635859433, |
|
"eval_exact_match": 0.002, |
|
"eval_prefix_exact_match": 0.016, |
|
"step": 7854 |
|
}, |
|
{ |
|
"epoch": 23.133235724743777, |
|
"grad_norm": 3.3037803173065186, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8378, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 23.42606149341142, |
|
"grad_norm": 3.446135997772217, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8102, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 23.718887262079065, |
|
"grad_norm": 3.0248115062713623, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8082, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_accuracy": 0.42430769230769233, |
|
"eval_loss": 5.35550594329834, |
|
"eval_runtime": 5.1152, |
|
"eval_samples_per_second": 97.747, |
|
"eval_steps_per_second": 12.316, |
|
"step": 8196 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_bleu": 0.1404162407909808, |
|
"eval_exact_match": 0.004, |
|
"eval_prefix_exact_match": 0.016, |
|
"step": 8196 |
|
}, |
|
{ |
|
"epoch": 24.011713030746705, |
|
"grad_norm": 3.7103264331817627, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8279, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 24.30453879941435, |
|
"grad_norm": 4.033411026000977, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7449, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 24.597364568081993, |
|
"grad_norm": 3.5919322967529297, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7654, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 24.890190336749633, |
|
"grad_norm": 3.9135005474090576, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7854, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 24.998535871156662, |
|
"eval_accuracy": 0.4241025641025641, |
|
"eval_loss": 5.4628777503967285, |
|
"eval_runtime": 5.4188, |
|
"eval_samples_per_second": 92.272, |
|
"eval_steps_per_second": 11.626, |
|
"step": 8537 |
|
}, |
|
{ |
|
"epoch": 24.998535871156662, |
|
"eval_bleu": 0.16704717168781627, |
|
"eval_exact_match": 0.002, |
|
"eval_prefix_exact_match": 0.018, |
|
"step": 8537 |
|
}, |
|
{ |
|
"epoch": 25.183016105417277, |
|
"grad_norm": 3.4372332096099854, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7377, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 25.47584187408492, |
|
"grad_norm": 3.6895363330841064, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7141, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 25.76866764275256, |
|
"grad_norm": 3.793881893157959, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7359, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"eval_accuracy": 0.4230769230769231, |
|
"eval_loss": 5.469931602478027, |
|
"eval_runtime": 5.4593, |
|
"eval_samples_per_second": 91.587, |
|
"eval_steps_per_second": 11.54, |
|
"step": 8879 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"eval_bleu": 0.20994054385069763, |
|
"eval_exact_match": 0.0, |
|
"eval_prefix_exact_match": 0.018, |
|
"step": 8879 |
|
}, |
|
{ |
|
"epoch": 26.061493411420205, |
|
"grad_norm": 3.7599456310272217, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7299, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 26.35431918008785, |
|
"grad_norm": 3.9128973484039307, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6829, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 26.64714494875549, |
|
"grad_norm": 3.68682861328125, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6835, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 26.939970717423133, |
|
"grad_norm": 3.563685894012451, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7002, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 26.998535871156662, |
|
"eval_accuracy": 0.42107692307692307, |
|
"eval_loss": 5.505324363708496, |
|
"eval_runtime": 5.119, |
|
"eval_samples_per_second": 97.676, |
|
"eval_steps_per_second": 12.307, |
|
"step": 9220 |
|
}, |
|
{ |
|
"epoch": 26.998535871156662, |
|
"eval_bleu": 0.18409646236051158, |
|
"eval_exact_match": 0.004, |
|
"eval_prefix_exact_match": 0.02, |
|
"step": 9220 |
|
}, |
|
{ |
|
"epoch": 27.232796486090777, |
|
"grad_norm": 3.794928550720215, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6346, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 27.525622254758417, |
|
"grad_norm": 3.7905471324920654, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6439, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 27.81844802342606, |
|
"grad_norm": 4.2111711502075195, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6684, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"eval_accuracy": 0.42138461538461536, |
|
"eval_loss": 5.558403491973877, |
|
"eval_runtime": 5.3841, |
|
"eval_samples_per_second": 92.866, |
|
"eval_steps_per_second": 11.701, |
|
"step": 9562 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"eval_bleu": 0.15544965656402868, |
|
"eval_exact_match": 0.004, |
|
"eval_prefix_exact_match": 0.016, |
|
"step": 9562 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 17050, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 50, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8.211035953226056e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|