|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 500, |
|
"global_step": 147, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.10204081632653061, |
|
"grad_norm": 4.106373310089111, |
|
"learning_rate": 2e-05, |
|
"loss": 1.2854, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.20408163265306123, |
|
"grad_norm": 1.1787455081939697, |
|
"learning_rate": 1.9938879040295508e-05, |
|
"loss": 0.5714, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.30612244897959184, |
|
"grad_norm": 1.0353399515151978, |
|
"learning_rate": 1.975626331552507e-05, |
|
"loss": 0.424, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.40816326530612246, |
|
"grad_norm": 0.9334375262260437, |
|
"learning_rate": 1.9454385155359704e-05, |
|
"loss": 0.3665, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.5102040816326531, |
|
"grad_norm": 0.8077430725097656, |
|
"learning_rate": 1.903693477637204e-05, |
|
"loss": 0.3533, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.6122448979591837, |
|
"grad_norm": 0.786541759967804, |
|
"learning_rate": 1.850901517212062e-05, |
|
"loss": 0.3211, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.7142857142857143, |
|
"grad_norm": 0.6843534708023071, |
|
"learning_rate": 1.7877079733177185e-05, |
|
"loss": 0.3147, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.8163265306122449, |
|
"grad_norm": 0.6673664450645447, |
|
"learning_rate": 1.7148853359641627e-05, |
|
"loss": 0.2972, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.9183673469387755, |
|
"grad_norm": 0.6665471792221069, |
|
"learning_rate": 1.6333238030480473e-05, |
|
"loss": 0.2947, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 1.0204081632653061, |
|
"grad_norm": 0.6398372650146484, |
|
"learning_rate": 1.5440203984027323e-05, |
|
"loss": 0.2689, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.1224489795918366, |
|
"grad_norm": 0.5915564894676208, |
|
"learning_rate": 1.4480667839875786e-05, |
|
"loss": 0.2341, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.2244897959183674, |
|
"grad_norm": 0.5560858845710754, |
|
"learning_rate": 1.3466359152026197e-05, |
|
"loss": 0.23, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.3265306122448979, |
|
"grad_norm": 0.6363187432289124, |
|
"learning_rate": 1.2409677024566145e-05, |
|
"loss": 0.2209, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 1.4285714285714286, |
|
"grad_norm": 0.5852870941162109, |
|
"learning_rate": 1.1323538542642227e-05, |
|
"loss": 0.2096, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.5306122448979593, |
|
"grad_norm": 0.5593776702880859, |
|
"learning_rate": 1.022122087153187e-05, |
|
"loss": 0.2155, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 1.6326530612244898, |
|
"grad_norm": 0.5509619116783142, |
|
"learning_rate": 9.116198954026577e-06, |
|
"loss": 0.2104, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.7346938775510203, |
|
"grad_norm": 0.5514574646949768, |
|
"learning_rate": 8.021980790144828e-06, |
|
"loss": 0.2097, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 1.836734693877551, |
|
"grad_norm": 0.6069679856300354, |
|
"learning_rate": 6.951942312747135e-06, |
|
"loss": 0.2062, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.9387755102040818, |
|
"grad_norm": 0.6014193296432495, |
|
"learning_rate": 5.919163877565351e-06, |
|
"loss": 0.2075, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 2.0408163265306123, |
|
"grad_norm": 0.5056990385055542, |
|
"learning_rate": 4.936270366423563e-06, |
|
"loss": 0.1778, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 2.142857142857143, |
|
"grad_norm": 0.6073999404907227, |
|
"learning_rate": 4.015276858259427e-06, |
|
"loss": 0.1457, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 2.2448979591836733, |
|
"grad_norm": 0.5745469331741333, |
|
"learning_rate": 3.167441754493066e-06, |
|
"loss": 0.1441, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 2.3469387755102042, |
|
"grad_norm": 0.5980499386787415, |
|
"learning_rate": 2.403129154167153e-06, |
|
"loss": 0.1426, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 2.4489795918367347, |
|
"grad_norm": 0.5385629534721375, |
|
"learning_rate": 1.7316821612109136e-06, |
|
"loss": 0.1341, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.5510204081632653, |
|
"grad_norm": 0.5529075860977173, |
|
"learning_rate": 1.161308672544389e-06, |
|
"loss": 0.1367, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 2.6530612244897958, |
|
"grad_norm": 0.5598864555358887, |
|
"learning_rate": 6.989810431710375e-07, |
|
"loss": 0.1356, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.7551020408163263, |
|
"grad_norm": 0.5336365699768066, |
|
"learning_rate": 3.5035085477190143e-07, |
|
"loss": 0.1352, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 2.857142857142857, |
|
"grad_norm": 0.516101598739624, |
|
"learning_rate": 1.1967982968635994e-07, |
|
"loss": 0.1373, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.9591836734693877, |
|
"grad_norm": 0.5510626435279846, |
|
"learning_rate": 9.78773480026396e-09, |
|
"loss": 0.1373, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 147, |
|
"total_flos": 1.9096132999184384e+17, |
|
"train_loss": 0.26934566870838605, |
|
"train_runtime": 589.2508, |
|
"train_samples_per_second": 15.879, |
|
"train_steps_per_second": 0.249 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 147, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.9096132999184384e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|