|
{ |
|
"best_metric": 7.9168, |
|
"best_model_checkpoint": "/content/tst-translation/checkpoint-1600", |
|
"epoch": 20.0, |
|
"eval_steps": 200, |
|
"global_step": 2540, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 1.574803149606299, |
|
"grad_norm": 2.453336238861084, |
|
"learning_rate": 0.00046062992125984255, |
|
"loss": 4.5126, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.574803149606299, |
|
"eval_bleu": 1.0891, |
|
"eval_gen_len": 120.8315, |
|
"eval_loss": 2.747382402420044, |
|
"eval_runtime": 505.8332, |
|
"eval_samples_per_second": 1.971, |
|
"eval_steps_per_second": 0.125, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 3.1496062992125986, |
|
"grad_norm": 2.0367627143859863, |
|
"learning_rate": 0.00042125984251968504, |
|
"loss": 2.4414, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 3.1496062992125986, |
|
"eval_bleu": 5.0172, |
|
"eval_gen_len": 54.7623, |
|
"eval_loss": 2.5120186805725098, |
|
"eval_runtime": 340.8894, |
|
"eval_samples_per_second": 2.925, |
|
"eval_steps_per_second": 0.185, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 4.724409448818898, |
|
"grad_norm": 2.0367233753204346, |
|
"learning_rate": 0.00038188976377952753, |
|
"loss": 1.724, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 4.724409448818898, |
|
"eval_bleu": 5.2115, |
|
"eval_gen_len": 61.985, |
|
"eval_loss": 2.4089620113372803, |
|
"eval_runtime": 368.5588, |
|
"eval_samples_per_second": 2.705, |
|
"eval_steps_per_second": 0.171, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 6.299212598425197, |
|
"grad_norm": 1.8066316843032837, |
|
"learning_rate": 0.00034251968503937007, |
|
"loss": 1.2755, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 6.299212598425197, |
|
"eval_bleu": 7.0203, |
|
"eval_gen_len": 46.1414, |
|
"eval_loss": 2.551859140396118, |
|
"eval_runtime": 215.1683, |
|
"eval_samples_per_second": 4.634, |
|
"eval_steps_per_second": 0.293, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 7.874015748031496, |
|
"grad_norm": 1.8381917476654053, |
|
"learning_rate": 0.0003031496062992126, |
|
"loss": 0.97, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 7.874015748031496, |
|
"eval_bleu": 7.005, |
|
"eval_gen_len": 56.5266, |
|
"eval_loss": 2.5974559783935547, |
|
"eval_runtime": 308.9884, |
|
"eval_samples_per_second": 3.227, |
|
"eval_steps_per_second": 0.204, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 9.448818897637794, |
|
"grad_norm": 1.8475762605667114, |
|
"learning_rate": 0.0002637795275590551, |
|
"loss": 0.7251, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 9.448818897637794, |
|
"eval_bleu": 7.6235, |
|
"eval_gen_len": 52.6841, |
|
"eval_loss": 2.791808605194092, |
|
"eval_runtime": 280.2999, |
|
"eval_samples_per_second": 3.557, |
|
"eval_steps_per_second": 0.225, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 11.023622047244094, |
|
"grad_norm": 1.7481825351715088, |
|
"learning_rate": 0.00022440944881889764, |
|
"loss": 0.584, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 11.023622047244094, |
|
"eval_bleu": 7.3273, |
|
"eval_gen_len": 49.9659, |
|
"eval_loss": 2.8952395915985107, |
|
"eval_runtime": 205.6302, |
|
"eval_samples_per_second": 4.849, |
|
"eval_steps_per_second": 0.306, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 12.598425196850394, |
|
"grad_norm": 1.5020047426223755, |
|
"learning_rate": 0.00018503937007874016, |
|
"loss": 0.4358, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 12.598425196850394, |
|
"eval_bleu": 7.9168, |
|
"eval_gen_len": 51.4945, |
|
"eval_loss": 3.120616912841797, |
|
"eval_runtime": 243.389, |
|
"eval_samples_per_second": 4.096, |
|
"eval_steps_per_second": 0.259, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 14.173228346456693, |
|
"grad_norm": 1.6936888694763184, |
|
"learning_rate": 0.00014566929133858267, |
|
"loss": 0.3619, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 14.173228346456693, |
|
"eval_bleu": 7.9096, |
|
"eval_gen_len": 50.5517, |
|
"eval_loss": 3.264512538909912, |
|
"eval_runtime": 220.31, |
|
"eval_samples_per_second": 4.525, |
|
"eval_steps_per_second": 0.286, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 15.748031496062993, |
|
"grad_norm": 1.4287927150726318, |
|
"learning_rate": 0.0001062992125984252, |
|
"loss": 0.2933, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 15.748031496062993, |
|
"eval_bleu": 7.9015, |
|
"eval_gen_len": 49.6169, |
|
"eval_loss": 3.3970730304718018, |
|
"eval_runtime": 195.9873, |
|
"eval_samples_per_second": 5.087, |
|
"eval_steps_per_second": 0.321, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 17.322834645669293, |
|
"grad_norm": 1.6358137130737305, |
|
"learning_rate": 6.692913385826773e-05, |
|
"loss": 0.2447, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 17.322834645669293, |
|
"eval_bleu": 7.8441, |
|
"eval_gen_len": 49.6911, |
|
"eval_loss": 3.5039305686950684, |
|
"eval_runtime": 201.4971, |
|
"eval_samples_per_second": 4.948, |
|
"eval_steps_per_second": 0.313, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 18.89763779527559, |
|
"grad_norm": 1.3624520301818848, |
|
"learning_rate": 2.7559055118110236e-05, |
|
"loss": 0.2151, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 18.89763779527559, |
|
"eval_bleu": 7.8198, |
|
"eval_gen_len": 50.1153, |
|
"eval_loss": 3.5556399822235107, |
|
"eval_runtime": 195.3596, |
|
"eval_samples_per_second": 5.103, |
|
"eval_steps_per_second": 0.322, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"step": 2540, |
|
"total_flos": 2811538357714944.0, |
|
"train_loss": 1.096170819650485, |
|
"train_runtime": 5288.9286, |
|
"train_samples_per_second": 3.827, |
|
"train_steps_per_second": 0.48 |
|
} |
|
], |
|
"logging_steps": 200, |
|
"max_steps": 2540, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 200, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2811538357714944.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|