{
  "best_metric": 0.19640253484249115,
  "best_model_checkpoint": "/kaggle/working/hubert-amharic/checkpoint-1500",
  "epoch": 12.121212121212121,
  "eval_steps": 500,
  "global_step": 3000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.40404040404040403,
      "grad_norm": 2.795124053955078,
      "learning_rate": 9.730094466936572e-06,
      "loss": 1.5155,
      "step": 100
    },
    {
      "epoch": 0.8080808080808081,
      "grad_norm": 2.900432825088501,
      "learning_rate": 9.460188933873145e-06,
      "loss": 1.175,
      "step": 200
    },
    {
      "epoch": 1.2121212121212122,
      "grad_norm": 3.376126766204834,
      "learning_rate": 9.195681511470986e-06,
      "loss": 0.7957,
      "step": 300
    },
    {
      "epoch": 1.6161616161616161,
      "grad_norm": 6.262876987457275,
      "learning_rate": 8.925775978407558e-06,
      "loss": 0.6227,
      "step": 400
    },
    {
      "epoch": 2.0202020202020203,
      "grad_norm": 13.498932838439941,
      "learning_rate": 8.65587044534413e-06,
      "loss": 0.5372,
      "step": 500
    },
    {
      "epoch": 2.0202020202020203,
      "eval_accuracy": 0.8929293155670166,
      "eval_loss": 0.3763836622238159,
      "eval_runtime": 22.9508,
      "eval_samples_per_second": 21.568,
      "eval_steps_per_second": 5.403,
      "step": 500
    },
    {
      "epoch": 2.4242424242424243,
      "grad_norm": 7.660800457000732,
      "learning_rate": 8.388663967611337e-06,
      "loss": 0.4195,
      "step": 600
    },
    {
      "epoch": 2.8282828282828283,
      "grad_norm": 10.333969116210938,
      "learning_rate": 8.118758434547908e-06,
      "loss": 0.4207,
      "step": 700
    },
    {
      "epoch": 3.2323232323232323,
      "grad_norm": 25.161924362182617,
      "learning_rate": 7.848852901484481e-06,
      "loss": 0.3231,
      "step": 800
    },
    {
      "epoch": 3.6363636363636362,
      "grad_norm": 0.32242974638938904,
      "learning_rate": 7.578947368421054e-06,
      "loss": 0.236,
      "step": 900
    },
    {
      "epoch": 4.040404040404041,
      "grad_norm": 4.83636474609375,
      "learning_rate": 7.309041835357625e-06,
      "loss": 0.2254,
      "step": 1000
    },
    {
      "epoch": 4.040404040404041,
      "eval_accuracy": 0.9111111164093018,
      "eval_loss": 0.3798251748085022,
      "eval_runtime": 23.0284,
      "eval_samples_per_second": 21.495,
      "eval_steps_per_second": 5.385,
      "step": 1000
    },
    {
      "epoch": 4.444444444444445,
      "grad_norm": 10.075101852416992,
      "learning_rate": 7.039136302294197e-06,
      "loss": 0.2038,
      "step": 1100
    },
    {
      "epoch": 4.848484848484849,
      "grad_norm": 5.713657379150391,
      "learning_rate": 6.76923076923077e-06,
      "loss": 0.2233,
      "step": 1200
    },
    {
      "epoch": 5.252525252525253,
      "grad_norm": 64.90503692626953,
      "learning_rate": 6.499325236167342e-06,
      "loss": 0.2077,
      "step": 1300
    },
    {
      "epoch": 5.656565656565657,
      "grad_norm": 33.007598876953125,
      "learning_rate": 6.229419703103914e-06,
      "loss": 0.1811,
      "step": 1400
    },
    {
      "epoch": 6.0606060606060606,
      "grad_norm": 2.3471884727478027,
      "learning_rate": 5.959514170040487e-06,
      "loss": 0.1699,
      "step": 1500
    },
    {
      "epoch": 6.0606060606060606,
      "eval_accuracy": 0.9535353779792786,
      "eval_loss": 0.19640253484249115,
      "eval_runtime": 22.503,
      "eval_samples_per_second": 21.997,
      "eval_steps_per_second": 5.51,
      "step": 1500
    },
    {
      "epoch": 6.4646464646464645,
      "grad_norm": 46.472137451171875,
      "learning_rate": 5.692307692307692e-06,
      "loss": 0.1676,
      "step": 1600
    },
    {
      "epoch": 6.8686868686868685,
      "grad_norm": 0.10037334263324738,
      "learning_rate": 5.4224021592442655e-06,
      "loss": 0.1468,
      "step": 1700
    },
    {
      "epoch": 7.2727272727272725,
      "grad_norm": 2.2828967571258545,
      "learning_rate": 5.152496626180837e-06,
      "loss": 0.1122,
      "step": 1800
    },
    {
      "epoch": 7.6767676767676765,
      "grad_norm": 2.2225501537323,
      "learning_rate": 4.882591093117409e-06,
      "loss": 0.1162,
      "step": 1900
    },
    {
      "epoch": 8.080808080808081,
      "grad_norm": 3.6428568363189697,
      "learning_rate": 4.6126855600539814e-06,
      "loss": 0.1245,
      "step": 2000
    },
    {
      "epoch": 8.080808080808081,
      "eval_accuracy": 0.9595959782600403,
      "eval_loss": 0.22902674973011017,
      "eval_runtime": 22.8105,
      "eval_samples_per_second": 21.701,
      "eval_steps_per_second": 5.436,
      "step": 2000
    },
    {
      "epoch": 8.484848484848484,
      "grad_norm": 0.09281215816736221,
      "learning_rate": 4.342780026990554e-06,
      "loss": 0.1027,
      "step": 2100
    },
    {
      "epoch": 8.88888888888889,
      "grad_norm": 30.872848510742188,
      "learning_rate": 4.072874493927126e-06,
      "loss": 0.105,
      "step": 2200
    },
    {
      "epoch": 9.292929292929292,
      "grad_norm": 0.3525031507015228,
      "learning_rate": 3.8029689608636982e-06,
      "loss": 0.1184,
      "step": 2300
    },
    {
      "epoch": 9.696969696969697,
      "grad_norm": 0.40138208866119385,
      "learning_rate": 3.53306342780027e-06,
      "loss": 0.1048,
      "step": 2400
    },
    {
      "epoch": 10.1010101010101,
      "grad_norm": 0.025619197636842728,
      "learning_rate": 3.2631578947368423e-06,
      "loss": 0.0597,
      "step": 2500
    },
    {
      "epoch": 10.1010101010101,
      "eval_accuracy": 0.9636363387107849,
      "eval_loss": 0.22432678937911987,
      "eval_runtime": 22.7183,
      "eval_samples_per_second": 21.789,
      "eval_steps_per_second": 5.458,
      "step": 2500
    },
    {
      "epoch": 10.505050505050505,
      "grad_norm": 0.028187109157443047,
      "learning_rate": 2.9932523616734146e-06,
      "loss": 0.0525,
      "step": 2600
    },
    {
      "epoch": 10.909090909090908,
      "grad_norm": 0.020364606752991676,
      "learning_rate": 2.723346828609987e-06,
      "loss": 0.1105,
      "step": 2700
    },
    {
      "epoch": 11.313131313131313,
      "grad_norm": 18.78951644897461,
      "learning_rate": 2.453441295546559e-06,
      "loss": 0.078,
      "step": 2800
    },
    {
      "epoch": 11.717171717171716,
      "grad_norm": 48.03165817260742,
      "learning_rate": 2.183535762483131e-06,
      "loss": 0.1166,
      "step": 2900
    },
    {
      "epoch": 12.121212121212121,
      "grad_norm": 0.21182887256145477,
      "learning_rate": 1.913630229419703e-06,
      "loss": 0.0816,
      "step": 3000
    },
    {
      "epoch": 12.121212121212121,
      "eval_accuracy": 0.9555555582046509,
      "eval_loss": 0.27169349789619446,
      "eval_runtime": 22.618,
      "eval_samples_per_second": 21.885,
      "eval_steps_per_second": 5.482,
      "step": 3000
    },
    {
      "epoch": 12.121212121212121,
      "step": 3000,
      "total_flos": 7.685447856522912e+17,
      "train_loss": 0.2951181084314982,
      "train_runtime": 1630.5884,
      "train_samples_per_second": 18.205,
      "train_steps_per_second": 2.272
    }
  ],
  "logging_steps": 100,
  "max_steps": 3705,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 15,
  "save_steps": 500,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 3,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 7.685447856522912e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}