{
  "best_metric": 3.158884286880493,
  "best_model_checkpoint": "./snap_diff_llama/diff_llama_410m_diff_attn_nh8/checkpoint-13000",
  "epoch": 0.19492857142857142,
  "eval_steps": 1000,
  "global_step": 14000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0,
      "learning_rate": 2.1428571428571426e-07,
      "loss": 11.0209,
      "step": 1
    },
    {
      "epoch": 0.07,
      "learning_rate": 0.00021428571428571427,
      "loss": 6.3835,
      "step": 1000
    },
    {
      "epoch": 0.07,
      "eval_loss": 4.906383991241455,
      "eval_ppl": 135.14982680795362,
      "eval_runtime": 31.9248,
      "eval_samples_per_second": 15.662,
      "eval_steps_per_second": 0.125,
      "step": 1000
    },
    {
      "epoch": 0.14,
      "learning_rate": 0.0002983246239337692,
      "loss": 4.3861,
      "step": 2000
    },
    {
      "epoch": 0.14,
      "eval_loss": 4.004458427429199,
      "eval_ppl": 54.84211536958963,
      "eval_runtime": 31.4229,
      "eval_samples_per_second": 15.912,
      "eval_steps_per_second": 0.127,
      "step": 2000
    },
    {
      "epoch": 0.21,
      "learning_rate": 0.00028822143178056114,
      "loss": 3.7829,
      "step": 3000
    },
    {
      "epoch": 0.21,
      "eval_loss": 3.621983051300049,
      "eval_ppl": 37.41168359786832,
      "eval_runtime": 31.3363,
      "eval_samples_per_second": 15.956,
      "eval_steps_per_second": 0.128,
      "step": 3000
    },
    {
      "epoch": 0.01,
      "learning_rate": 0.0002695698760834384,
      "loss": 3.4948,
      "step": 4000
    },
    {
      "epoch": 0.01,
      "eval_loss": 3.4599356651306152,
      "eval_ppl": 31.81492963948283,
      "eval_runtime": 32.1788,
      "eval_samples_per_second": 15.538,
      "eval_steps_per_second": 0.124,
      "step": 4000
    },
    {
      "epoch": 0.09,
      "learning_rate": 0.00024352347027881003,
      "loss": 3.4801,
      "step": 5000
    },
    {
      "epoch": 0.09,
      "eval_loss": 3.4120349884033203,
      "eval_ppl": 30.326896389900888,
      "eval_runtime": 32.3388,
      "eval_samples_per_second": 15.461,
      "eval_steps_per_second": 0.124,
      "step": 5000
    },
    {
      "epoch": 0.16,
      "learning_rate": 0.00021169306546959174,
      "loss": 3.3808,
      "step": 6000
    },
    {
      "epoch": 0.16,
      "eval_loss": 3.3343753814697266,
      "eval_ppl": 28.060850421565963,
      "eval_runtime": 31.722,
      "eval_samples_per_second": 15.762,
      "eval_steps_per_second": 0.126,
      "step": 6000
    },
    {
      "epoch": 0.23,
      "learning_rate": 0.00017604722665003956,
      "loss": 3.3217,
      "step": 7000
    },
    {
      "epoch": 0.23,
      "eval_loss": 3.264651298522949,
      "eval_ppl": 26.17098353892157,
      "eval_runtime": 31.6861,
      "eval_samples_per_second": 15.78,
      "eval_steps_per_second": 0.126,
      "step": 7000
    },
    {
      "epoch": 0.03,
      "learning_rate": 0.00013879048596203636,
      "loss": 3.224,
      "step": 8000
    },
    {
      "epoch": 0.03,
      "eval_loss": 3.2256884574890137,
      "eval_ppl": 25.170897284756094,
      "eval_runtime": 32.2116,
      "eval_samples_per_second": 15.522,
      "eval_steps_per_second": 0.124,
      "step": 8000
    },
    {
      "epoch": 0.1,
      "learning_rate": 0.00010222700246224735,
      "loss": 3.2587,
      "step": 9000
    },
    {
      "epoch": 0.1,
      "eval_loss": 3.2039520740509033,
      "eval_ppl": 24.629676411177265,
      "eval_runtime": 32.2421,
      "eval_samples_per_second": 15.508,
      "eval_steps_per_second": 0.124,
      "step": 9000
    },
    {
      "epoch": 0.18,
      "learning_rate": 6.86180604201361e-05,
      "loss": 3.2347,
      "step": 10000
    },
    {
      "epoch": 0.18,
      "eval_loss": 3.177614450454712,
      "eval_ppl": 23.989457198879418,
      "eval_runtime": 32.2121,
      "eval_samples_per_second": 15.522,
      "eval_steps_per_second": 0.124,
      "step": 10000
    },
    {
      "epoch": 0.25,
      "learning_rate": 4.004221922552608e-05,
      "loss": 3.2281,
      "step": 11000
    },
    {
      "epoch": 0.25,
      "eval_loss": 3.1658647060394287,
      "eval_ppl": 23.70923669164879,
      "eval_runtime": 31.8013,
      "eval_samples_per_second": 15.723,
      "eval_steps_per_second": 0.126,
      "step": 11000
    },
    {
      "epoch": 0.05,
      "learning_rate": 1.82667639944657e-05,
      "loss": 3.219,
      "step": 12000
    },
    {
      "epoch": 0.05,
      "eval_loss": 3.1636743545532227,
      "eval_ppl": 23.65736196250915,
      "eval_runtime": 32.1149,
      "eval_samples_per_second": 15.569,
      "eval_steps_per_second": 0.125,
      "step": 12000
    },
    {
      "epoch": 0.12,
      "learning_rate": 1e-05,
      "loss": 3.2048,
      "step": 13000
    },
    {
      "epoch": 0.12,
      "eval_loss": 3.158884286880493,
      "eval_ppl": 23.54431257102566,
      "eval_runtime": 31.5514,
      "eval_samples_per_second": 15.847,
      "eval_steps_per_second": 0.127,
      "step": 13000
    },
    {
      "epoch": 0.19,
      "learning_rate": 1e-05,
      "loss": 3.2405,
      "step": 14000
    },
    {
      "epoch": 0.19,
      "eval_loss": 3.159707546234131,
      "eval_ppl": 23.5637036274138,
      "eval_runtime": 31.7181,
      "eval_samples_per_second": 15.764,
      "eval_steps_per_second": 0.126,
      "step": 14000
    },
    {
      "epoch": 0.19,
      "step": 14000,
      "total_flos": 2.0002889491529335e+19,
      "train_loss": 0.6279961286272322,
      "train_runtime": 10197.8291,
      "train_samples_per_second": 351.447,
      "train_steps_per_second": 1.373
    }
  ],
  "logging_steps": 1000,
  "max_steps": 14000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 9223372036854775807,
  "save_steps": 1000,
  "total_flos": 2.0002889491529335e+19,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}