|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 29,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.034482758620689655,
      "grad_norm": 4.296210765838623,
      "learning_rate": 3.333333333333333e-07,
      "loss": 1.5533,
      "mean_token_accuracy": 0.6920064091682434,
      "step": 1
    },
    {
      "epoch": 0.06896551724137931,
      "grad_norm": 5.163259029388428,
      "learning_rate": 6.666666666666666e-07,
      "loss": 1.7853,
      "mean_token_accuracy": 0.6263327598571777,
      "step": 2
    },
    {
      "epoch": 0.10344827586206896,
      "grad_norm": 4.452831745147705,
      "learning_rate": 1e-06,
      "loss": 1.5075,
      "mean_token_accuracy": 0.7126692533493042,
      "step": 3
    },
    {
      "epoch": 0.13793103448275862,
      "grad_norm": 6.790956497192383,
      "learning_rate": 9.963544370490268e-07,
      "loss": 1.9253,
      "mean_token_accuracy": 0.6712393760681152,
      "step": 4
    },
    {
      "epoch": 0.1724137931034483,
      "grad_norm": 3.9182674884796143,
      "learning_rate": 9.85470908713026e-07,
      "loss": 1.6858,
      "mean_token_accuracy": 0.559170126914978,
      "step": 5
    },
    {
      "epoch": 0.20689655172413793,
      "grad_norm": 3.839620351791382,
      "learning_rate": 9.675081213427074e-07,
      "loss": 1.5418,
      "mean_token_accuracy": 0.6354226469993591,
      "step": 6
    },
    {
      "epoch": 0.2413793103448276,
      "grad_norm": 4.464385986328125,
      "learning_rate": 9.427280128266049e-07,
      "loss": 1.5432,
      "mean_token_accuracy": 0.6771755814552307,
      "step": 7
    },
    {
      "epoch": 0.27586206896551724,
      "grad_norm": 3.300748348236084,
      "learning_rate": 9.114919329468282e-07,
      "loss": 1.3202,
      "mean_token_accuracy": 0.6744688153266907,
      "step": 8
    },
    {
      "epoch": 0.3103448275862069,
      "grad_norm": 4.529242515563965,
      "learning_rate": 8.742553740855505e-07,
      "loss": 1.5632,
      "mean_token_accuracy": 0.7091418504714966,
      "step": 9
    },
    {
      "epoch": 0.3448275862068966,
      "grad_norm": 4.9364800453186035,
      "learning_rate": 8.315613291203976e-07,
      "loss": 1.3939,
      "mean_token_accuracy": 0.676058828830719,
      "step": 10
    },
    {
      "epoch": 0.3793103448275862,
      "grad_norm": 4.520140171051025,
      "learning_rate": 7.840323733655778e-07,
      "loss": 1.6434,
      "mean_token_accuracy": 0.6085848808288574,
      "step": 11
    },
    {
      "epoch": 0.41379310344827586,
      "grad_norm": 3.786013126373291,
      "learning_rate": 7.323615860218842e-07,
      "loss": 1.4203,
      "mean_token_accuracy": 0.6495858430862427,
      "step": 12
    },
    {
      "epoch": 0.4482758620689655,
      "grad_norm": 4.0102105140686035,
      "learning_rate": 6.773024435212677e-07,
      "loss": 1.4744,
      "mean_token_accuracy": 0.715476393699646,
      "step": 13
    },
    {
      "epoch": 0.4827586206896552,
      "grad_norm": 6.1422576904296875,
      "learning_rate": 6.196578321437789e-07,
      "loss": 1.7096,
      "mean_token_accuracy": 0.6734651327133179,
      "step": 14
    },
    {
      "epoch": 0.5172413793103449,
      "grad_norm": 4.225828647613525,
      "learning_rate": 5.602683401276614e-07,
      "loss": 1.6184,
      "mean_token_accuracy": 0.6401326656341553,
      "step": 15
    },
    {
      "epoch": 0.5517241379310345,
      "grad_norm": 2.499864101409912,
      "learning_rate": 5e-07,
      "loss": 1.0916,
      "mean_token_accuracy": 0.712737500667572,
      "step": 16
    },
    {
      "epoch": 0.5862068965517241,
      "grad_norm": 3.797278881072998,
      "learning_rate": 4.397316598723385e-07,
      "loss": 1.5532,
      "mean_token_accuracy": 0.6137330532073975,
      "step": 17
    },
    {
      "epoch": 0.6206896551724138,
      "grad_norm": 3.9196736812591553,
      "learning_rate": 3.8034216785622125e-07,
      "loss": 1.5172,
      "mean_token_accuracy": 0.6513392925262451,
      "step": 18
    },
    {
      "epoch": 0.6551724137931034,
      "grad_norm": 3.312458038330078,
      "learning_rate": 3.2269755647873214e-07,
      "loss": 1.4259,
      "mean_token_accuracy": 0.6606206893920898,
      "step": 19
    },
    {
      "epoch": 0.6896551724137931,
      "grad_norm": 4.863149642944336,
      "learning_rate": 2.676384139781157e-07,
      "loss": 1.801,
      "mean_token_accuracy": 0.6028177738189697,
      "step": 20
    },
    {
      "epoch": 0.7241379310344828,
      "grad_norm": 6.340369701385498,
      "learning_rate": 2.1596762663442213e-07,
      "loss": 1.8937,
      "mean_token_accuracy": 0.6635770797729492,
      "step": 21
    },
    {
      "epoch": 0.7586206896551724,
      "grad_norm": 2.400047540664673,
      "learning_rate": 1.6843867087960251e-07,
      "loss": 1.1017,
      "mean_token_accuracy": 0.7170366048812866,
      "step": 22
    },
    {
      "epoch": 0.7931034482758621,
      "grad_norm": 6.618093490600586,
      "learning_rate": 1.257446259144494e-07,
      "loss": 1.8901,
      "mean_token_accuracy": 0.6319121718406677,
      "step": 23
    },
    {
      "epoch": 0.8275862068965517,
      "grad_norm": 3.391517400741577,
      "learning_rate": 8.850806705317182e-08,
      "loss": 1.3663,
      "mean_token_accuracy": 0.6898515224456787,
      "step": 24
    },
    {
      "epoch": 0.8620689655172413,
      "grad_norm": 4.413179874420166,
      "learning_rate": 5.72719871733951e-08,
      "loss": 1.5641,
      "mean_token_accuracy": 0.6674157381057739,
      "step": 25
    },
    {
      "epoch": 0.896551724137931,
      "grad_norm": 4.75150728225708,
      "learning_rate": 3.2491878657292635e-08,
      "loss": 1.6575,
      "mean_token_accuracy": 0.6414722204208374,
      "step": 26
    },
    {
      "epoch": 0.9310344827586207,
      "grad_norm": 3.682435989379883,
      "learning_rate": 1.4529091286973993e-08,
      "loss": 1.3724,
      "mean_token_accuracy": 0.674655020236969,
      "step": 27
    },
    {
      "epoch": 0.9655172413793104,
      "grad_norm": 3.129405975341797,
      "learning_rate": 3.6455629509730135e-09,
      "loss": 1.5536,
      "mean_token_accuracy": 0.5809841752052307,
      "step": 28
    },
    {
      "epoch": 1.0,
      "grad_norm": 2.9102606773376465,
      "learning_rate": 0.0,
      "loss": 1.2491,
      "mean_token_accuracy": 0.6780526041984558,
      "step": 29
    },
    {
      "epoch": 1.0,
      "step": 29,
      "total_flos": 955996483584.0,
      "train_loss": 1.542164264054134,
      "train_runtime": 1339.7897,
      "train_samples_per_second": 0.085,
      "train_steps_per_second": 0.022
    }
  ],
  "logging_steps": 1,
  "max_steps": 29,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 955996483584.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}