|
{ |
|
"best_metric": 5.346867561340332, |
|
"best_model_checkpoint": "./results/models/mistral-dna/checkpoint-22675", |
|
"epoch": 5.0, |
|
"eval_steps": 500, |
|
"global_step": 22675, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.11025358324145534, |
|
"grad_norm": 0.142578125, |
|
"learning_rate": 0.0003991179713340684, |
|
"loss": 6.6131, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.2205071664829107, |
|
"grad_norm": 0.1298828125, |
|
"learning_rate": 0.00039823594266813673, |
|
"loss": 6.0212, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.33076074972436603, |
|
"grad_norm": 0.12255859375, |
|
"learning_rate": 0.00039735391400220506, |
|
"loss": 5.8931, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.4410143329658214, |
|
"grad_norm": 0.11572265625, |
|
"learning_rate": 0.00039647188533627344, |
|
"loss": 5.822, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.5512679162072768, |
|
"grad_norm": 0.125, |
|
"learning_rate": 0.0003955898566703418, |
|
"loss": 5.7678, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.6615214994487321, |
|
"grad_norm": 0.119140625, |
|
"learning_rate": 0.00039470782800441015, |
|
"loss": 5.7204, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.7717750826901875, |
|
"grad_norm": 0.12255859375, |
|
"learning_rate": 0.00039382579933847854, |
|
"loss": 5.7025, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.8820286659316428, |
|
"grad_norm": 0.1376953125, |
|
"learning_rate": 0.00039294377067254687, |
|
"loss": 5.6733, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.9922822491730982, |
|
"grad_norm": 0.1328125, |
|
"learning_rate": 0.00039206174200661525, |
|
"loss": 5.6491, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 5.551197528839111, |
|
"eval_runtime": 3.3539, |
|
"eval_samples_per_second": 86.765, |
|
"eval_steps_per_second": 1.491, |
|
"step": 4535 |
|
}, |
|
{ |
|
"epoch": 1.1025358324145536, |
|
"grad_norm": 0.126953125, |
|
"learning_rate": 0.0003911797133406836, |
|
"loss": 5.6279, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.2127894156560088, |
|
"grad_norm": 0.1435546875, |
|
"learning_rate": 0.00039029768467475196, |
|
"loss": 5.6068, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.3230429988974641, |
|
"grad_norm": 0.1455078125, |
|
"learning_rate": 0.0003894156560088203, |
|
"loss": 5.6068, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.4332965821389196, |
|
"grad_norm": 0.130859375, |
|
"learning_rate": 0.00038853362734288867, |
|
"loss": 5.5658, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 1.543550165380375, |
|
"grad_norm": 0.1318359375, |
|
"learning_rate": 0.000387651598676957, |
|
"loss": 5.5665, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.6538037486218302, |
|
"grad_norm": 0.138671875, |
|
"learning_rate": 0.0003867695700110254, |
|
"loss": 5.5565, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 1.7640573318632855, |
|
"grad_norm": 0.1376953125, |
|
"learning_rate": 0.00038588754134509377, |
|
"loss": 5.5477, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.8743109151047408, |
|
"grad_norm": 0.1298828125, |
|
"learning_rate": 0.0003850055126791621, |
|
"loss": 5.5295, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 1.9845644983461963, |
|
"grad_norm": 0.140625, |
|
"learning_rate": 0.0003841234840132304, |
|
"loss": 5.5208, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 5.451062202453613, |
|
"eval_runtime": 3.2969, |
|
"eval_samples_per_second": 88.264, |
|
"eval_steps_per_second": 1.517, |
|
"step": 9070 |
|
}, |
|
{ |
|
"epoch": 2.0948180815876514, |
|
"grad_norm": 0.138671875, |
|
"learning_rate": 0.0003832414553472988, |
|
"loss": 5.5017, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 2.205071664829107, |
|
"grad_norm": 0.142578125, |
|
"learning_rate": 0.0003823594266813672, |
|
"loss": 5.4918, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 2.3153252480705624, |
|
"grad_norm": 0.1357421875, |
|
"learning_rate": 0.0003814773980154355, |
|
"loss": 5.4806, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 2.4255788313120177, |
|
"grad_norm": 0.13671875, |
|
"learning_rate": 0.00038059536934950385, |
|
"loss": 5.4877, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 2.535832414553473, |
|
"grad_norm": 0.138671875, |
|
"learning_rate": 0.00037971334068357223, |
|
"loss": 5.4696, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 2.6460859977949283, |
|
"grad_norm": 0.15625, |
|
"learning_rate": 0.0003788313120176406, |
|
"loss": 5.4781, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 2.7563395810363835, |
|
"grad_norm": 0.140625, |
|
"learning_rate": 0.00037794928335170894, |
|
"loss": 5.4748, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 2.8665931642778393, |
|
"grad_norm": 0.1591796875, |
|
"learning_rate": 0.0003770672546857773, |
|
"loss": 5.4608, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 2.9768467475192946, |
|
"grad_norm": 0.1923828125, |
|
"learning_rate": 0.00037618522601984565, |
|
"loss": 5.453, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 5.399576663970947, |
|
"eval_runtime": 3.4053, |
|
"eval_samples_per_second": 85.455, |
|
"eval_steps_per_second": 1.468, |
|
"step": 13605 |
|
}, |
|
{ |
|
"epoch": 3.08710033076075, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 0.00037530319735391404, |
|
"loss": 5.4278, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 3.197353914002205, |
|
"grad_norm": 0.158203125, |
|
"learning_rate": 0.00037442116868798236, |
|
"loss": 5.4226, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 3.3076074972436604, |
|
"grad_norm": 0.1484375, |
|
"learning_rate": 0.00037353914002205075, |
|
"loss": 5.4334, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 3.4178610804851157, |
|
"grad_norm": 0.162109375, |
|
"learning_rate": 0.0003726571113561191, |
|
"loss": 5.4321, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 3.528114663726571, |
|
"grad_norm": 0.1572265625, |
|
"learning_rate": 0.00037177508269018746, |
|
"loss": 5.4277, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 3.6383682469680263, |
|
"grad_norm": 0.1591796875, |
|
"learning_rate": 0.0003708930540242558, |
|
"loss": 5.4193, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 3.7486218302094816, |
|
"grad_norm": 0.1611328125, |
|
"learning_rate": 0.00037001102535832417, |
|
"loss": 5.4226, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 3.8588754134509373, |
|
"grad_norm": 0.154296875, |
|
"learning_rate": 0.00036912899669239255, |
|
"loss": 5.4214, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 3.9691289966923926, |
|
"grad_norm": 0.1533203125, |
|
"learning_rate": 0.0003682469680264609, |
|
"loss": 5.4218, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 5.367885112762451, |
|
"eval_runtime": 3.6662, |
|
"eval_samples_per_second": 79.373, |
|
"eval_steps_per_second": 1.364, |
|
"step": 18140 |
|
}, |
|
{ |
|
"epoch": 4.0793825799338475, |
|
"grad_norm": 0.1748046875, |
|
"learning_rate": 0.0003673649393605292, |
|
"loss": 5.4067, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 4.189636163175303, |
|
"grad_norm": 0.1572265625, |
|
"learning_rate": 0.0003664829106945976, |
|
"loss": 5.3963, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 4.299889746416759, |
|
"grad_norm": 0.158203125, |
|
"learning_rate": 0.000365600882028666, |
|
"loss": 5.3929, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 4.410143329658214, |
|
"grad_norm": 0.1630859375, |
|
"learning_rate": 0.0003647188533627343, |
|
"loss": 5.3888, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 4.5203969128996695, |
|
"grad_norm": 0.16015625, |
|
"learning_rate": 0.00036383682469680263, |
|
"loss": 5.3982, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 4.630650496141125, |
|
"grad_norm": 0.173828125, |
|
"learning_rate": 0.000362954796030871, |
|
"loss": 5.3754, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 4.74090407938258, |
|
"grad_norm": 0.1591796875, |
|
"learning_rate": 0.0003620727673649394, |
|
"loss": 5.3854, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 4.851157662624035, |
|
"grad_norm": 0.1748046875, |
|
"learning_rate": 0.00036119073869900773, |
|
"loss": 5.3876, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 4.961411245865491, |
|
"grad_norm": 0.1767578125, |
|
"learning_rate": 0.0003603087100330761, |
|
"loss": 5.3899, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 5.346867561340332, |
|
"eval_runtime": 3.7974, |
|
"eval_samples_per_second": 76.63, |
|
"eval_steps_per_second": 1.317, |
|
"step": 22675 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 226750, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 50, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7.976338069294799e+18, |
|
"train_batch_size": 64, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|