{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.934306569343065,
  "eval_steps": 500,
  "global_step": 340,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.145985401459854,
      "grad_norm": 0.1969512701034546,
      "learning_rate": 0.0002,
      "loss": 1.6345,
      "step": 10
    },
    {
      "epoch": 0.291970802919708,
      "grad_norm": 0.17513811588287354,
      "learning_rate": 0.0002,
      "loss": 1.4433,
      "step": 20
    },
    {
      "epoch": 0.43795620437956206,
      "grad_norm": 0.17957448959350586,
      "learning_rate": 0.0002,
      "loss": 1.4281,
      "step": 30
    },
    {
      "epoch": 0.583941605839416,
      "grad_norm": 0.16534289717674255,
      "learning_rate": 0.0002,
      "loss": 1.3108,
      "step": 40
    },
    {
      "epoch": 0.7299270072992701,
      "grad_norm": 0.1696271151304245,
      "learning_rate": 0.0002,
      "loss": 1.3387,
      "step": 50
    },
    {
      "epoch": 0.8759124087591241,
      "grad_norm": 0.14049474895000458,
      "learning_rate": 0.0002,
      "loss": 1.3062,
      "step": 60
    },
    {
      "epoch": 1.0145985401459854,
      "grad_norm": 0.14177313446998596,
      "learning_rate": 0.0002,
      "loss": 1.3354,
      "step": 70
    },
    {
      "epoch": 1.1605839416058394,
      "grad_norm": 0.18519847095012665,
      "learning_rate": 0.0002,
      "loss": 1.1708,
      "step": 80
    },
    {
      "epoch": 1.3065693430656935,
      "grad_norm": 0.1618432253599167,
      "learning_rate": 0.0002,
      "loss": 1.09,
      "step": 90
    },
    {
      "epoch": 1.4525547445255476,
      "grad_norm": 0.22861599922180176,
      "learning_rate": 0.0002,
      "loss": 1.1921,
      "step": 100
    },
    {
      "epoch": 1.5985401459854014,
      "grad_norm": 0.18863703310489655,
      "learning_rate": 0.0002,
      "loss": 1.1883,
      "step": 110
    },
    {
      "epoch": 1.7445255474452555,
      "grad_norm": 0.21330659091472626,
      "learning_rate": 0.0002,
      "loss": 1.1134,
      "step": 120
    },
    {
      "epoch": 1.8905109489051095,
      "grad_norm": 0.16281643509864807,
      "learning_rate": 0.0002,
      "loss": 1.1218,
      "step": 130
    },
    {
      "epoch": 2.0291970802919708,
      "grad_norm": 0.1913510113954544,
      "learning_rate": 0.0002,
      "loss": 1.0937,
      "step": 140
    },
    {
      "epoch": 2.1751824817518246,
      "grad_norm": 0.258828729391098,
      "learning_rate": 0.0002,
      "loss": 0.844,
      "step": 150
    },
    {
      "epoch": 2.321167883211679,
      "grad_norm": 0.2650935649871826,
      "learning_rate": 0.0002,
      "loss": 0.9038,
      "step": 160
    },
    {
      "epoch": 2.4671532846715327,
      "grad_norm": 0.21534548699855804,
      "learning_rate": 0.0002,
      "loss": 0.8955,
      "step": 170
    },
    {
      "epoch": 2.613138686131387,
      "grad_norm": 0.28936412930488586,
      "learning_rate": 0.0002,
      "loss": 0.9623,
      "step": 180
    },
    {
      "epoch": 2.759124087591241,
      "grad_norm": 0.22461599111557007,
      "learning_rate": 0.0002,
      "loss": 0.9206,
      "step": 190
    },
    {
      "epoch": 2.905109489051095,
      "grad_norm": 0.2333170771598816,
      "learning_rate": 0.0002,
      "loss": 0.9143,
      "step": 200
    },
    {
      "epoch": 3.0437956204379564,
      "grad_norm": 0.23609858751296997,
      "learning_rate": 0.0002,
      "loss": 0.8417,
      "step": 210
    },
    {
      "epoch": 3.18978102189781,
      "grad_norm": 0.2533261477947235,
      "learning_rate": 0.0002,
      "loss": 0.6803,
      "step": 220
    },
    {
      "epoch": 3.335766423357664,
      "grad_norm": 0.28708019852638245,
      "learning_rate": 0.0002,
      "loss": 0.6947,
      "step": 230
    },
    {
      "epoch": 3.4817518248175183,
      "grad_norm": 0.2757578194141388,
      "learning_rate": 0.0002,
      "loss": 0.6416,
      "step": 240
    },
    {
      "epoch": 3.627737226277372,
      "grad_norm": 0.29800841212272644,
      "learning_rate": 0.0002,
      "loss": 0.6616,
      "step": 250
    },
    {
      "epoch": 3.7737226277372264,
      "grad_norm": 0.28990235924720764,
      "learning_rate": 0.0002,
      "loss": 0.6788,
      "step": 260
    },
    {
      "epoch": 3.9197080291970803,
      "grad_norm": 0.2645922303199768,
      "learning_rate": 0.0002,
      "loss": 0.6891,
      "step": 270
    },
    {
      "epoch": 4.0583941605839415,
      "grad_norm": 0.2718052864074707,
      "learning_rate": 0.0002,
      "loss": 0.6122,
      "step": 280
    },
    {
      "epoch": 4.204379562043796,
      "grad_norm": 0.2716304063796997,
      "learning_rate": 0.0002,
      "loss": 0.4449,
      "step": 290
    },
    {
      "epoch": 4.350364963503649,
      "grad_norm": 0.30169153213500977,
      "learning_rate": 0.0002,
      "loss": 0.4474,
      "step": 300
    },
    {
      "epoch": 4.4963503649635035,
      "grad_norm": 0.29635927081108093,
      "learning_rate": 0.0002,
      "loss": 0.4637,
      "step": 310
    },
    {
      "epoch": 4.642335766423358,
      "grad_norm": 0.3702072501182556,
      "learning_rate": 0.0002,
      "loss": 0.4961,
      "step": 320
    },
    {
      "epoch": 4.788321167883212,
      "grad_norm": 0.3127199709415436,
      "learning_rate": 0.0002,
      "loss": 0.4497,
      "step": 330
    },
    {
      "epoch": 4.934306569343065,
      "grad_norm": 0.2875048816204071,
      "learning_rate": 0.0002,
      "loss": 0.4823,
      "step": 340
    }
  ],
  "logging_steps": 10,
  "max_steps": 340,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.364208997702451e+16,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}