|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.13971358714635, |
|
"eval_steps": 13, |
|
"global_step": 50, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0027942717429269995, |
|
"grad_norm": 81.70710754394531, |
|
"learning_rate": 5e-05, |
|
"loss": 43.3812, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0027942717429269995, |
|
"eval_loss": 1.3799916505813599, |
|
"eval_runtime": 212.1241, |
|
"eval_samples_per_second": 11.366, |
|
"eval_steps_per_second": 2.843, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.005588543485853999, |
|
"grad_norm": 80.07818603515625, |
|
"learning_rate": 0.0001, |
|
"loss": 41.728, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.008382815228781, |
|
"grad_norm": 77.7547836303711, |
|
"learning_rate": 9.989294616193017e-05, |
|
"loss": 41.7219, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.011177086971707998, |
|
"grad_norm": 76.19801330566406, |
|
"learning_rate": 9.957224306869053e-05, |
|
"loss": 42.2267, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.013971358714634998, |
|
"grad_norm": 37.9150390625, |
|
"learning_rate": 9.903926402016153e-05, |
|
"loss": 37.8819, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.016765630457562, |
|
"grad_norm": 38.34889602661133, |
|
"learning_rate": 9.829629131445342e-05, |
|
"loss": 35.4385, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.019559902200488997, |
|
"grad_norm": 39.905494689941406, |
|
"learning_rate": 9.73465064747553e-05, |
|
"loss": 36.405, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.022354173943415996, |
|
"grad_norm": 37.06133270263672, |
|
"learning_rate": 9.619397662556435e-05, |
|
"loss": 36.5912, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.025148445686342998, |
|
"grad_norm": 28.63506317138672, |
|
"learning_rate": 9.484363707663442e-05, |
|
"loss": 35.7375, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.027942717429269997, |
|
"grad_norm": 29.515090942382812, |
|
"learning_rate": 9.330127018922194e-05, |
|
"loss": 33.1971, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.030736989172196996, |
|
"grad_norm": 27.13974380493164, |
|
"learning_rate": 9.157348061512727e-05, |
|
"loss": 33.0697, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.033531260915124, |
|
"grad_norm": 24.511131286621094, |
|
"learning_rate": 8.966766701456177e-05, |
|
"loss": 33.8159, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.036325532658050996, |
|
"grad_norm": 24.15830421447754, |
|
"learning_rate": 8.759199037394887e-05, |
|
"loss": 33.9119, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.036325532658050996, |
|
"eval_loss": 1.0672633647918701, |
|
"eval_runtime": 212.5444, |
|
"eval_samples_per_second": 11.344, |
|
"eval_steps_per_second": 2.837, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.039119804400977995, |
|
"grad_norm": 28.11492347717285, |
|
"learning_rate": 8.535533905932738e-05, |
|
"loss": 35.8233, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.041914076143904994, |
|
"grad_norm": 19.98900604248047, |
|
"learning_rate": 8.296729075500344e-05, |
|
"loss": 31.6706, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.04470834788683199, |
|
"grad_norm": 24.489959716796875, |
|
"learning_rate": 8.043807145043604e-05, |
|
"loss": 35.4651, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.04750261962975899, |
|
"grad_norm": 25.26296615600586, |
|
"learning_rate": 7.777851165098012e-05, |
|
"loss": 34.7415, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.050296891372685996, |
|
"grad_norm": 22.68831443786621, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 34.8815, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.053091163115612995, |
|
"grad_norm": 23.189281463623047, |
|
"learning_rate": 7.211443451095007e-05, |
|
"loss": 32.5742, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.055885434858539994, |
|
"grad_norm": 21.66847038269043, |
|
"learning_rate": 6.91341716182545e-05, |
|
"loss": 34.393, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.05867970660146699, |
|
"grad_norm": 21.759904861450195, |
|
"learning_rate": 6.607197326515808e-05, |
|
"loss": 35.1236, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.06147397834439399, |
|
"grad_norm": 24.153629302978516, |
|
"learning_rate": 6.294095225512603e-05, |
|
"loss": 34.0533, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.064268250087321, |
|
"grad_norm": 20.246461868286133, |
|
"learning_rate": 5.9754516100806423e-05, |
|
"loss": 33.2113, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.067062521830248, |
|
"grad_norm": 24.22486114501953, |
|
"learning_rate": 5.6526309611002594e-05, |
|
"loss": 31.9, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.069856793573175, |
|
"grad_norm": 20.538053512573242, |
|
"learning_rate": 5.327015646150716e-05, |
|
"loss": 34.0572, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.07265106531610199, |
|
"grad_norm": 21.71095848083496, |
|
"learning_rate": 5e-05, |
|
"loss": 31.9274, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.07265106531610199, |
|
"eval_loss": 1.0352919101715088, |
|
"eval_runtime": 216.7154, |
|
"eval_samples_per_second": 11.125, |
|
"eval_steps_per_second": 2.782, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.07544533705902899, |
|
"grad_norm": 19.745807647705078, |
|
"learning_rate": 4.6729843538492847e-05, |
|
"loss": 31.4099, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.07823960880195599, |
|
"grad_norm": 20.407533645629883, |
|
"learning_rate": 4.347369038899744e-05, |
|
"loss": 33.3792, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.08103388054488299, |
|
"grad_norm": 18.46002197265625, |
|
"learning_rate": 4.0245483899193595e-05, |
|
"loss": 32.646, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.08382815228780999, |
|
"grad_norm": 21.341697692871094, |
|
"learning_rate": 3.705904774487396e-05, |
|
"loss": 32.8125, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.08662242403073699, |
|
"grad_norm": 19.30036735534668, |
|
"learning_rate": 3.392802673484193e-05, |
|
"loss": 32.4647, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.08941669577366398, |
|
"grad_norm": 17.527427673339844, |
|
"learning_rate": 3.086582838174551e-05, |
|
"loss": 32.1851, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.09221096751659098, |
|
"grad_norm": 22.025373458862305, |
|
"learning_rate": 2.7885565489049946e-05, |
|
"loss": 31.5842, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.09500523925951798, |
|
"grad_norm": 23.377222061157227, |
|
"learning_rate": 2.500000000000001e-05, |
|
"loss": 31.3084, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.097799511002445, |
|
"grad_norm": 19.32001304626465, |
|
"learning_rate": 2.2221488349019903e-05, |
|
"loss": 34.4906, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.10059378274537199, |
|
"grad_norm": 19.37845230102539, |
|
"learning_rate": 1.9561928549563968e-05, |
|
"loss": 33.1804, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.10338805448829899, |
|
"grad_norm": 19.92717742919922, |
|
"learning_rate": 1.703270924499656e-05, |
|
"loss": 34.5119, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.10618232623122599, |
|
"grad_norm": 19.28239631652832, |
|
"learning_rate": 1.4644660940672627e-05, |
|
"loss": 34.3349, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.10897659797415299, |
|
"grad_norm": 25.358318328857422, |
|
"learning_rate": 1.2408009626051137e-05, |
|
"loss": 34.2337, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.10897659797415299, |
|
"eval_loss": 1.0242209434509277, |
|
"eval_runtime": 215.991, |
|
"eval_samples_per_second": 11.163, |
|
"eval_steps_per_second": 2.792, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.11177086971707999, |
|
"grad_norm": 20.202943801879883, |
|
"learning_rate": 1.0332332985438248e-05, |
|
"loss": 32.6489, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.11456514146000699, |
|
"grad_norm": 19.53352928161621, |
|
"learning_rate": 8.426519384872733e-06, |
|
"loss": 33.9152, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.11735941320293398, |
|
"grad_norm": 18.473039627075195, |
|
"learning_rate": 6.698729810778065e-06, |
|
"loss": 32.9272, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.12015368494586098, |
|
"grad_norm": 21.347410202026367, |
|
"learning_rate": 5.156362923365588e-06, |
|
"loss": 32.0126, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.12294795668878798, |
|
"grad_norm": 18.974212646484375, |
|
"learning_rate": 3.8060233744356633e-06, |
|
"loss": 32.8613, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.12574222843171498, |
|
"grad_norm": 20.034618377685547, |
|
"learning_rate": 2.653493525244721e-06, |
|
"loss": 33.6573, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.128536500174642, |
|
"grad_norm": 19.2609920501709, |
|
"learning_rate": 1.70370868554659e-06, |
|
"loss": 33.1644, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.13133077191756898, |
|
"grad_norm": 18.932796478271484, |
|
"learning_rate": 9.607359798384785e-07, |
|
"loss": 33.0339, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.134125043660496, |
|
"grad_norm": 18.432373046875, |
|
"learning_rate": 4.277569313094809e-07, |
|
"loss": 32.077, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.13691931540342298, |
|
"grad_norm": 19.448955535888672, |
|
"learning_rate": 1.0705383806982606e-07, |
|
"loss": 35.2186, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.13971358714635, |
|
"grad_norm": 22.028099060058594, |
|
"learning_rate": 0.0, |
|
"loss": 31.1742, |
|
"step": 50 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 50, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 13, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.1250278137331712e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|