|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 4.608089082298733, |
|
"eval_steps": 500, |
|
"global_step": 18000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004957330602491893, |
|
"loss": 2.0441, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.0004914661204983786, |
|
"loss": 1.713, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.00048719918074756785, |
|
"loss": 1.6007, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.00048293224099675715, |
|
"loss": 1.5181, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.00047866530124594645, |
|
"loss": 1.4493, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"eval_bleu": 3.9997520454971136, |
|
"eval_loss": 1.3540712594985962, |
|
"eval_runtime": 10.3322, |
|
"eval_samples_per_second": 193.569, |
|
"eval_steps_per_second": 1.549, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.0004743983614951357, |
|
"loss": 1.4096, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.000470131421744325, |
|
"loss": 1.3729, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.0004658644819935143, |
|
"loss": 1.3389, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"learning_rate": 0.00046159754224270354, |
|
"loss": 1.3166, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"learning_rate": 0.00045733060249189284, |
|
"loss": 1.2915, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"eval_bleu": 5.393565315342523, |
|
"eval_loss": 1.2113043069839478, |
|
"eval_runtime": 8.5631, |
|
"eval_samples_per_second": 233.562, |
|
"eval_steps_per_second": 1.868, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"learning_rate": 0.00045306366274108214, |
|
"loss": 1.2669, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"learning_rate": 0.0004487967229902714, |
|
"loss": 1.2492, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"learning_rate": 0.0004445297832394607, |
|
"loss": 1.2371, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"learning_rate": 0.00044026284348864993, |
|
"loss": 1.2231, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"learning_rate": 0.00043599590373783923, |
|
"loss": 1.2059, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"eval_bleu": 5.838067918496751, |
|
"eval_loss": 1.1367273330688477, |
|
"eval_runtime": 9.2404, |
|
"eval_samples_per_second": 216.442, |
|
"eval_steps_per_second": 1.732, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"learning_rate": 0.0004317289639870285, |
|
"loss": 1.1931, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"learning_rate": 0.0004274620242362178, |
|
"loss": 1.1761, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"learning_rate": 0.0004231950844854071, |
|
"loss": 1.1792, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"learning_rate": 0.0004189281447345963, |
|
"loss": 1.1614, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"learning_rate": 0.0004146612049837856, |
|
"loss": 1.1573, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"eval_bleu": 6.242227124133382, |
|
"eval_loss": 1.0900604724884033, |
|
"eval_runtime": 8.3953, |
|
"eval_samples_per_second": 238.228, |
|
"eval_steps_per_second": 1.906, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"learning_rate": 0.0004103942652329749, |
|
"loss": 1.1449, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"learning_rate": 0.00040612732548216417, |
|
"loss": 1.1379, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"learning_rate": 0.00040186038573135347, |
|
"loss": 1.1281, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"learning_rate": 0.00039759344598054277, |
|
"loss": 1.1226, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"learning_rate": 0.00039332650622973206, |
|
"loss": 1.1121, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"eval_bleu": 6.627148578471305, |
|
"eval_loss": 1.0542418956756592, |
|
"eval_runtime": 9.2835, |
|
"eval_samples_per_second": 215.436, |
|
"eval_steps_per_second": 1.723, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"learning_rate": 0.0003890595664789213, |
|
"loss": 1.1076, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"learning_rate": 0.0003847926267281106, |
|
"loss": 1.1001, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"learning_rate": 0.0003805256869772999, |
|
"loss": 1.0996, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"learning_rate": 0.00037625874722648915, |
|
"loss": 1.088, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"learning_rate": 0.00037199180747567845, |
|
"loss": 1.0867, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"eval_bleu": 6.879580310573366, |
|
"eval_loss": 1.025155782699585, |
|
"eval_runtime": 8.5706, |
|
"eval_samples_per_second": 233.355, |
|
"eval_steps_per_second": 1.867, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"learning_rate": 0.00036772486772486775, |
|
"loss": 1.0784, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"learning_rate": 0.000363457927974057, |
|
"loss": 1.0794, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"learning_rate": 0.0003591909882232463, |
|
"loss": 1.0736, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"learning_rate": 0.0003549240484724356, |
|
"loss": 1.0684, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"learning_rate": 0.0003506571087216249, |
|
"loss": 1.0623, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"eval_bleu": 7.039305128716119, |
|
"eval_loss": 1.0067821741104126, |
|
"eval_runtime": 9.3223, |
|
"eval_samples_per_second": 214.539, |
|
"eval_steps_per_second": 1.716, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"learning_rate": 0.00034639016897081414, |
|
"loss": 1.0609, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"learning_rate": 0.00034212322922000344, |
|
"loss": 1.0579, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"learning_rate": 0.00033785628946919274, |
|
"loss": 1.0517, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"learning_rate": 0.000333589349718382, |
|
"loss": 1.054, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"learning_rate": 0.0003293224099675713, |
|
"loss": 1.0408, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"eval_bleu": 7.2660174031875915, |
|
"eval_loss": 0.9882246255874634, |
|
"eval_runtime": 8.4135, |
|
"eval_samples_per_second": 237.713, |
|
"eval_steps_per_second": 1.902, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"learning_rate": 0.0003250554702167606, |
|
"loss": 1.0317, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"learning_rate": 0.0003207885304659498, |
|
"loss": 1.0344, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"learning_rate": 0.0003165215907151391, |
|
"loss": 1.0274, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"learning_rate": 0.0003122546509643284, |
|
"loss": 1.0216, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"learning_rate": 0.0003079877112135177, |
|
"loss": 1.0203, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"eval_bleu": 7.055297387993883, |
|
"eval_loss": 0.9723305702209473, |
|
"eval_runtime": 9.3412, |
|
"eval_samples_per_second": 214.105, |
|
"eval_steps_per_second": 1.713, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"learning_rate": 0.0003037207714627069, |
|
"loss": 1.0217, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"learning_rate": 0.0002994538317118962, |
|
"loss": 1.0184, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"learning_rate": 0.0002951868919610855, |
|
"loss": 1.015, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"learning_rate": 0.00029091995221027477, |
|
"loss": 1.011, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"learning_rate": 0.00028665301245946406, |
|
"loss": 1.0054, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"eval_bleu": 7.455525040183996, |
|
"eval_loss": 0.9624391198158264, |
|
"eval_runtime": 8.4826, |
|
"eval_samples_per_second": 235.778, |
|
"eval_steps_per_second": 1.886, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"learning_rate": 0.00028238607270865336, |
|
"loss": 1.0062, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"learning_rate": 0.0002781191329578426, |
|
"loss": 1.0081, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"learning_rate": 0.0002738521932070319, |
|
"loss": 1.0007, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"learning_rate": 0.0002695852534562212, |
|
"loss": 0.9973, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"learning_rate": 0.0002653183137054105, |
|
"loss": 0.9977, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"eval_bleu": 7.426000467701662, |
|
"eval_loss": 0.9526358842849731, |
|
"eval_runtime": 9.3659, |
|
"eval_samples_per_second": 213.542, |
|
"eval_steps_per_second": 1.708, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"learning_rate": 0.00026105137395459975, |
|
"loss": 0.9982, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"learning_rate": 0.00025678443420378905, |
|
"loss": 0.9956, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"learning_rate": 0.00025251749445297835, |
|
"loss": 0.9941, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"learning_rate": 0.0002482505547021676, |
|
"loss": 0.9881, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"learning_rate": 0.0002439836149513569, |
|
"loss": 0.9931, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"eval_bleu": 7.523072732517067, |
|
"eval_loss": 0.9395684003829956, |
|
"eval_runtime": 9.0823, |
|
"eval_samples_per_second": 220.209, |
|
"eval_steps_per_second": 1.762, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"learning_rate": 0.0002397166752005462, |
|
"loss": 0.9834, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"learning_rate": 0.00023544973544973544, |
|
"loss": 0.9821, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"learning_rate": 0.00023118279569892471, |
|
"loss": 0.9859, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"learning_rate": 0.00022691585594811401, |
|
"loss": 0.9762, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"learning_rate": 0.0002226489161973033, |
|
"loss": 0.9804, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"eval_bleu": 7.43758431482021, |
|
"eval_loss": 0.9323887825012207, |
|
"eval_runtime": 9.0344, |
|
"eval_samples_per_second": 221.376, |
|
"eval_steps_per_second": 1.771, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"learning_rate": 0.00021838197644649259, |
|
"loss": 0.9794, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"learning_rate": 0.00021411503669568186, |
|
"loss": 0.9737, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"learning_rate": 0.00020984809694487113, |
|
"loss": 0.9696, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"learning_rate": 0.00020558115719406043, |
|
"loss": 0.9735, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"learning_rate": 0.0002013142174432497, |
|
"loss": 0.9691, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"eval_bleu": 7.52270860836675, |
|
"eval_loss": 0.9264442324638367, |
|
"eval_runtime": 9.3377, |
|
"eval_samples_per_second": 214.186, |
|
"eval_steps_per_second": 1.713, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"learning_rate": 0.000197047277692439, |
|
"loss": 0.9754, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"learning_rate": 0.00019278033794162827, |
|
"loss": 0.9688, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"learning_rate": 0.00018851339819081755, |
|
"loss": 0.968, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"learning_rate": 0.00018424645844000685, |
|
"loss": 0.9628, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"learning_rate": 0.00017997951868919612, |
|
"loss": 0.9645, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"eval_bleu": 7.685904345266889, |
|
"eval_loss": 0.9192501306533813, |
|
"eval_runtime": 8.7414, |
|
"eval_samples_per_second": 228.796, |
|
"eval_steps_per_second": 1.83, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"learning_rate": 0.0001757125789383854, |
|
"loss": 0.9674, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"learning_rate": 0.00017144563918757466, |
|
"loss": 0.9659, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"learning_rate": 0.00016717869943676394, |
|
"loss": 0.9646, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"learning_rate": 0.00016291175968595324, |
|
"loss": 0.9477, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"learning_rate": 0.0001586448199351425, |
|
"loss": 0.9509, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"eval_bleu": 7.647325378973634, |
|
"eval_loss": 0.9144095778465271, |
|
"eval_runtime": 9.1538, |
|
"eval_samples_per_second": 218.489, |
|
"eval_steps_per_second": 1.748, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"learning_rate": 0.00024078341013824886, |
|
"loss": 0.9532, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"learning_rate": 0.00023758320532514082, |
|
"loss": 0.949, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"learning_rate": 0.00023438300051203275, |
|
"loss": 0.9506, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"learning_rate": 0.00023118279569892471, |
|
"loss": 0.9539, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"learning_rate": 0.00022798259088581668, |
|
"loss": 0.9485, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"eval_bleu": 7.654834398763605, |
|
"eval_loss": 0.9117684364318848, |
|
"eval_runtime": 11.8365, |
|
"eval_samples_per_second": 168.969, |
|
"eval_steps_per_second": 1.352, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"learning_rate": 0.00022478238607270864, |
|
"loss": 0.9495, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"learning_rate": 0.00022158218125960063, |
|
"loss": 0.9484, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"learning_rate": 0.00021838197644649259, |
|
"loss": 0.9486, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 2.28, |
|
"learning_rate": 0.00021518177163338455, |
|
"loss": 0.9445, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"learning_rate": 0.0002119815668202765, |
|
"loss": 0.9437, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"eval_bleu": 7.606584348844775, |
|
"eval_loss": 0.9072746634483337, |
|
"eval_runtime": 9.3821, |
|
"eval_samples_per_second": 213.171, |
|
"eval_steps_per_second": 1.705, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"learning_rate": 0.00020878136200716847, |
|
"loss": 0.9406, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"learning_rate": 0.00020558115719406043, |
|
"loss": 0.9418, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"learning_rate": 0.0002023809523809524, |
|
"loss": 0.9402, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"learning_rate": 0.00019918074756784435, |
|
"loss": 0.94, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"learning_rate": 0.00019598054275473631, |
|
"loss": 0.9393, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"eval_bleu": 7.714033503089272, |
|
"eval_loss": 0.9019351601600647, |
|
"eval_runtime": 9.2531, |
|
"eval_samples_per_second": 216.144, |
|
"eval_steps_per_second": 1.729, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"learning_rate": 0.00019278033794162827, |
|
"loss": 0.9378, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"learning_rate": 0.00018958013312852024, |
|
"loss": 0.9345, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"learning_rate": 0.0001863799283154122, |
|
"loss": 0.936, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"learning_rate": 0.00018317972350230416, |
|
"loss": 0.9389, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"learning_rate": 0.00017997951868919612, |
|
"loss": 0.9336, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"eval_bleu": 7.80952091204729, |
|
"eval_loss": 0.897022545337677, |
|
"eval_runtime": 9.0616, |
|
"eval_samples_per_second": 220.711, |
|
"eval_steps_per_second": 1.766, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"learning_rate": 0.00017677931387608805, |
|
"loss": 0.9376, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"learning_rate": 0.00017357910906298001, |
|
"loss": 0.9344, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"learning_rate": 0.00017037890424987198, |
|
"loss": 0.9358, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"learning_rate": 0.00016717869943676394, |
|
"loss": 0.9309, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"learning_rate": 0.0001639784946236559, |
|
"loss": 0.9368, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"eval_bleu": 7.937668751101428, |
|
"eval_loss": 0.8936744928359985, |
|
"eval_runtime": 9.5691, |
|
"eval_samples_per_second": 209.006, |
|
"eval_steps_per_second": 1.672, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"learning_rate": 0.00016077828981054789, |
|
"loss": 0.9337, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"learning_rate": 0.00015757808499743985, |
|
"loss": 0.9266, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"learning_rate": 0.0001543778801843318, |
|
"loss": 0.9276, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"learning_rate": 0.00015117767537122377, |
|
"loss": 0.9266, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"learning_rate": 0.00014797747055811573, |
|
"loss": 0.925, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"eval_bleu": 7.842526661794731, |
|
"eval_loss": 0.8898113965988159, |
|
"eval_runtime": 9.4133, |
|
"eval_samples_per_second": 212.466, |
|
"eval_steps_per_second": 1.7, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"learning_rate": 0.0001447772657450077, |
|
"loss": 0.9161, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"learning_rate": 0.00014157706093189965, |
|
"loss": 0.9263, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"learning_rate": 0.0001383768561187916, |
|
"loss": 0.9293, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"learning_rate": 0.00013517665130568357, |
|
"loss": 0.924, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"learning_rate": 0.00013197644649257554, |
|
"loss": 0.921, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"eval_bleu": 7.900750479934787, |
|
"eval_loss": 0.8863839507102966, |
|
"eval_runtime": 9.7323, |
|
"eval_samples_per_second": 205.501, |
|
"eval_steps_per_second": 1.644, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"learning_rate": 0.0001287762416794675, |
|
"loss": 0.9213, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"learning_rate": 0.00012557603686635946, |
|
"loss": 0.9219, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 3.02, |
|
"learning_rate": 0.00012237583205325142, |
|
"loss": 0.9141, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 3.05, |
|
"learning_rate": 0.00011917562724014337, |
|
"loss": 0.9125, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 3.07, |
|
"learning_rate": 0.00011597542242703534, |
|
"loss": 0.9177, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 3.07, |
|
"eval_bleu": 7.913356299834283, |
|
"eval_loss": 0.8835927844047546, |
|
"eval_runtime": 9.3736, |
|
"eval_samples_per_second": 213.364, |
|
"eval_steps_per_second": 1.707, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"learning_rate": 0.00019022017409114184, |
|
"loss": 0.9129, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"learning_rate": 0.0001876600102406554, |
|
"loss": 0.9121, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"learning_rate": 0.00018509984639016898, |
|
"loss": 0.9123, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"learning_rate": 0.00018253968253968252, |
|
"loss": 0.9168, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"learning_rate": 0.00017997951868919612, |
|
"loss": 0.9151, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"eval_bleu": 7.864673119868297, |
|
"eval_loss": 0.8821397423744202, |
|
"eval_runtime": 11.1353, |
|
"eval_samples_per_second": 179.609, |
|
"eval_steps_per_second": 1.437, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 3.23, |
|
"learning_rate": 0.0001774193548387097, |
|
"loss": 0.9112, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"learning_rate": 0.00017485919098822326, |
|
"loss": 0.9175, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"learning_rate": 0.00017229902713773683, |
|
"loss": 0.9131, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 3.3, |
|
"learning_rate": 0.0001697388632872504, |
|
"loss": 0.9116, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"learning_rate": 0.00016717869943676394, |
|
"loss": 0.9104, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"eval_bleu": 8.083046367100428, |
|
"eval_loss": 0.8790320158004761, |
|
"eval_runtime": 9.2669, |
|
"eval_samples_per_second": 215.821, |
|
"eval_steps_per_second": 1.727, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 3.35, |
|
"learning_rate": 0.0001646185355862775, |
|
"loss": 0.9083, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 3.38, |
|
"learning_rate": 0.0001620583717357911, |
|
"loss": 0.9084, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"learning_rate": 0.00015949820788530467, |
|
"loss": 0.9155, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 3.43, |
|
"learning_rate": 0.00015693804403481824, |
|
"loss": 0.9059, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"learning_rate": 0.0001543778801843318, |
|
"loss": 0.9035, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"eval_bleu": 8.095903306480848, |
|
"eval_loss": 0.8766345381736755, |
|
"eval_runtime": 9.4138, |
|
"eval_samples_per_second": 212.454, |
|
"eval_steps_per_second": 1.7, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"learning_rate": 0.00015181771633384535, |
|
"loss": 0.9053, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 3.51, |
|
"learning_rate": 0.00014925755248335892, |
|
"loss": 0.9047, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 3.53, |
|
"learning_rate": 0.00014669738863287251, |
|
"loss": 0.9017, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"learning_rate": 0.00014413722478238608, |
|
"loss": 0.9073, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 3.58, |
|
"learning_rate": 0.00014157706093189965, |
|
"loss": 0.8992, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 3.58, |
|
"eval_bleu": 8.01784311685486, |
|
"eval_loss": 0.8740718960762024, |
|
"eval_runtime": 9.3262, |
|
"eval_samples_per_second": 214.449, |
|
"eval_steps_per_second": 1.716, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 3.61, |
|
"learning_rate": 0.00013901689708141322, |
|
"loss": 0.904, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"learning_rate": 0.0001364567332309268, |
|
"loss": 0.9043, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 3.66, |
|
"learning_rate": 0.00013389656938044033, |
|
"loss": 0.9038, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 3.69, |
|
"learning_rate": 0.0001313364055299539, |
|
"loss": 0.9061, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"learning_rate": 0.0001287762416794675, |
|
"loss": 0.8986, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"eval_bleu": 8.038360794483163, |
|
"eval_loss": 0.8720167875289917, |
|
"eval_runtime": 9.7469, |
|
"eval_samples_per_second": 205.193, |
|
"eval_steps_per_second": 1.642, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 3.74, |
|
"learning_rate": 0.00012621607782898107, |
|
"loss": 0.8972, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"learning_rate": 0.00012365591397849463, |
|
"loss": 0.9032, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 3.79, |
|
"learning_rate": 0.00012109575012800819, |
|
"loss": 0.9009, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 3.81, |
|
"learning_rate": 0.00011853558627752177, |
|
"loss": 0.9021, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 3.84, |
|
"learning_rate": 0.00011597542242703534, |
|
"loss": 0.894, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 3.84, |
|
"eval_bleu": 8.091339671473314, |
|
"eval_loss": 0.8682767152786255, |
|
"eval_runtime": 9.833, |
|
"eval_samples_per_second": 203.397, |
|
"eval_steps_per_second": 1.627, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"learning_rate": 0.0001134152585765489, |
|
"loss": 0.9011, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 3.89, |
|
"learning_rate": 0.00011085509472606247, |
|
"loss": 0.9016, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"learning_rate": 0.00010829493087557605, |
|
"loss": 0.8951, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 3.94, |
|
"learning_rate": 0.0001057347670250896, |
|
"loss": 0.8981, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 3.97, |
|
"learning_rate": 0.00010317460317460317, |
|
"loss": 0.8932, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 3.97, |
|
"eval_bleu": 8.099679129372298, |
|
"eval_loss": 0.8663304448127747, |
|
"eval_runtime": 9.7444, |
|
"eval_samples_per_second": 205.246, |
|
"eval_steps_per_second": 1.642, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 3.99, |
|
"learning_rate": 0.00010061443932411675, |
|
"loss": 0.8991, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 4.02, |
|
"learning_rate": 9.805427547363031e-05, |
|
"loss": 0.8937, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 4.04, |
|
"learning_rate": 9.549411162314388e-05, |
|
"loss": 0.8877, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 4.07, |
|
"learning_rate": 9.293394777265746e-05, |
|
"loss": 0.8916, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 4.1, |
|
"learning_rate": 9.037378392217102e-05, |
|
"loss": 0.8889, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 4.1, |
|
"eval_bleu": 8.108828400885466, |
|
"eval_loss": 0.8641292452812195, |
|
"eval_runtime": 9.0966, |
|
"eval_samples_per_second": 219.862, |
|
"eval_steps_per_second": 1.759, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 4.12, |
|
"learning_rate": 8.781362007168459e-05, |
|
"loss": 0.8868, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 4.15, |
|
"learning_rate": 8.525345622119817e-05, |
|
"loss": 0.8887, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 4.17, |
|
"learning_rate": 8.269329237071172e-05, |
|
"loss": 0.8893, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 4.2, |
|
"learning_rate": 8.013312852022529e-05, |
|
"loss": 0.8953, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 4.22, |
|
"learning_rate": 7.757296466973887e-05, |
|
"loss": 0.8888, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 4.22, |
|
"eval_bleu": 8.066510672738694, |
|
"eval_loss": 0.8628500699996948, |
|
"eval_runtime": 9.0524, |
|
"eval_samples_per_second": 220.936, |
|
"eval_steps_per_second": 1.767, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"learning_rate": 7.501280081925243e-05, |
|
"loss": 0.8867, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 4.28, |
|
"learning_rate": 7.2452636968766e-05, |
|
"loss": 0.8869, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 4.3, |
|
"learning_rate": 6.989247311827957e-05, |
|
"loss": 0.8931, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 4.33, |
|
"learning_rate": 6.733230926779315e-05, |
|
"loss": 0.8887, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 4.35, |
|
"learning_rate": 6.47721454173067e-05, |
|
"loss": 0.8856, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 4.35, |
|
"eval_bleu": 8.283643999074862, |
|
"eval_loss": 0.8606928586959839, |
|
"eval_runtime": 9.8187, |
|
"eval_samples_per_second": 203.693, |
|
"eval_steps_per_second": 1.63, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 4.38, |
|
"learning_rate": 6.221198156682027e-05, |
|
"loss": 0.8885, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"learning_rate": 5.965181771633384e-05, |
|
"loss": 0.8853, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 4.43, |
|
"learning_rate": 5.709165386584742e-05, |
|
"loss": 0.8842, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 4.45, |
|
"learning_rate": 5.453149001536099e-05, |
|
"loss": 0.8845, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 4.48, |
|
"learning_rate": 5.197132616487455e-05, |
|
"loss": 0.8826, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 4.48, |
|
"eval_bleu": 8.235425004794472, |
|
"eval_loss": 0.8612557649612427, |
|
"eval_runtime": 9.8717, |
|
"eval_samples_per_second": 202.6, |
|
"eval_steps_per_second": 1.621, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 4.51, |
|
"learning_rate": 4.9411162314388125e-05, |
|
"loss": 0.8846, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 4.53, |
|
"learning_rate": 4.6850998463901694e-05, |
|
"loss": 0.8908, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 4.56, |
|
"learning_rate": 4.4290834613415256e-05, |
|
"loss": 0.8861, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 4.58, |
|
"learning_rate": 4.173067076292883e-05, |
|
"loss": 0.8881, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 4.61, |
|
"learning_rate": 3.9170506912442394e-05, |
|
"loss": 0.8862, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 4.61, |
|
"eval_bleu": 8.116630685408094, |
|
"eval_loss": 0.8578370213508606, |
|
"eval_runtime": 9.776, |
|
"eval_samples_per_second": 204.583, |
|
"eval_steps_per_second": 1.637, |
|
"step": 18000 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 19530, |
|
"num_train_epochs": 5, |
|
"save_steps": 2000, |
|
"total_flos": 5.25714080661504e+16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|