{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 10.0,
  "eval_steps": 500,
  "global_step": 110,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.09,
      "learning_rate": 0.0,
      "loss": 6.3097,
      "step": 1
    },
    {
      "epoch": 0.18,
      "learning_rate": 1e-06,
      "loss": 6.2197,
      "step": 2
    },
    {
      "epoch": 0.27,
      "learning_rate": 1.5849625007211562e-06,
      "loss": 6.235,
      "step": 3
    },
    {
      "epoch": 0.36,
      "learning_rate": 2e-06,
      "loss": 5.9965,
      "step": 4
    },
    {
      "epoch": 0.45,
      "learning_rate": 2e-06,
      "loss": 4.9695,
      "step": 5
    },
    {
      "epoch": 0.55,
      "learning_rate": 2e-06,
      "loss": 4.3271,
      "step": 6
    },
    {
      "epoch": 0.64,
      "learning_rate": 2e-06,
      "loss": 1.8078,
      "step": 7
    },
    {
      "epoch": 0.73,
      "learning_rate": 2e-06,
      "loss": 1.0825,
      "step": 8
    },
    {
      "epoch": 0.82,
      "learning_rate": 2e-06,
      "loss": 0.6882,
      "step": 9
    },
    {
      "epoch": 0.91,
      "learning_rate": 2e-06,
      "loss": 0.5505,
      "step": 10
    },
    {
      "epoch": 1.0,
      "learning_rate": 2e-06,
      "loss": 0.3077,
      "step": 11
    },
    {
      "epoch": 1.09,
      "learning_rate": 2e-06,
      "loss": 0.3388,
      "step": 12
    },
    {
      "epoch": 1.18,
      "learning_rate": 2e-06,
      "loss": 0.3057,
      "step": 13
    },
    {
      "epoch": 1.27,
      "learning_rate": 2e-06,
      "loss": 0.29,
      "step": 14
    },
    {
      "epoch": 1.36,
      "learning_rate": 2e-06,
      "loss": 0.2837,
      "step": 15
    },
    {
      "epoch": 1.45,
      "learning_rate": 2e-06,
      "loss": 0.2941,
      "step": 16
    },
    {
      "epoch": 1.55,
      "learning_rate": 2e-06,
      "loss": 0.2905,
      "step": 17
    },
    {
      "epoch": 1.64,
      "learning_rate": 2e-06,
      "loss": 0.3125,
      "step": 18
    },
    {
      "epoch": 1.73,
      "learning_rate": 2e-06,
      "loss": 0.2781,
      "step": 19
    },
    {
      "epoch": 1.82,
      "learning_rate": 2e-06,
      "loss": 0.2785,
      "step": 20
    },
    {
      "epoch": 1.91,
      "learning_rate": 2e-06,
      "loss": 0.2979,
      "step": 21
    },
    {
      "epoch": 2.0,
      "learning_rate": 2e-06,
      "loss": 0.2613,
      "step": 22
    },
    {
      "epoch": 2.09,
      "learning_rate": 2e-06,
      "loss": 0.2672,
      "step": 23
    },
    {
      "epoch": 2.18,
      "learning_rate": 2e-06,
      "loss": 0.2744,
      "step": 24
    },
    {
      "epoch": 2.27,
      "learning_rate": 2e-06,
      "loss": 0.2552,
      "step": 25
    },
    {
      "epoch": 2.36,
      "learning_rate": 2e-06,
      "loss": 0.2093,
      "step": 26
    },
    {
      "epoch": 2.45,
      "learning_rate": 2e-06,
      "loss": 0.2445,
      "step": 27
    },
    {
      "epoch": 2.55,
      "learning_rate": 2e-06,
      "loss": 0.2397,
      "step": 28
    },
    {
      "epoch": 2.64,
      "learning_rate": 2e-06,
      "loss": 0.2134,
      "step": 29
    },
    {
      "epoch": 2.73,
      "learning_rate": 2e-06,
      "loss": 0.1786,
      "step": 30
    },
    {
      "epoch": 2.82,
      "learning_rate": 2e-06,
      "loss": 0.2645,
      "step": 31
    },
    {
      "epoch": 2.91,
      "learning_rate": 2e-06,
      "loss": 0.2503,
      "step": 32
    },
    {
      "epoch": 3.0,
      "learning_rate": 2e-06,
      "loss": 0.1944,
      "step": 33
    },
    {
      "epoch": 3.09,
      "learning_rate": 2e-06,
      "loss": 0.155,
      "step": 34
    },
    {
      "epoch": 3.18,
      "learning_rate": 2e-06,
      "loss": 0.1626,
      "step": 35
    },
    {
      "epoch": 3.27,
      "learning_rate": 2e-06,
      "loss": 0.1659,
      "step": 36
    },
    {
      "epoch": 3.36,
      "learning_rate": 2e-06,
      "loss": 0.2184,
      "step": 37
    },
    {
      "epoch": 3.45,
      "learning_rate": 2e-06,
      "loss": 0.1362,
      "step": 38
    },
    {
      "epoch": 3.55,
      "learning_rate": 2e-06,
      "loss": 0.1553,
      "step": 39
    },
    {
      "epoch": 3.64,
      "learning_rate": 2e-06,
      "loss": 0.1637,
      "step": 40
    },
    {
      "epoch": 3.73,
      "learning_rate": 2e-06,
      "loss": 0.148,
      "step": 41
    },
    {
      "epoch": 3.82,
      "learning_rate": 2e-06,
      "loss": 0.146,
      "step": 42
    },
    {
      "epoch": 3.91,
      "learning_rate": 2e-06,
      "loss": 0.1595,
      "step": 43
    },
    {
      "epoch": 4.0,
      "learning_rate": 2e-06,
      "loss": 0.1222,
      "step": 44
    },
    {
      "epoch": 4.09,
      "learning_rate": 2e-06,
      "loss": 0.1031,
      "step": 45
    },
    {
      "epoch": 4.18,
      "learning_rate": 2e-06,
      "loss": 0.0905,
      "step": 46
    },
    {
      "epoch": 4.27,
      "learning_rate": 2e-06,
      "loss": 0.0814,
      "step": 47
    },
    {
      "epoch": 4.36,
      "learning_rate": 2e-06,
      "loss": 0.0868,
      "step": 48
    },
    {
      "epoch": 4.45,
      "learning_rate": 2e-06,
      "loss": 0.0742,
      "step": 49
    },
    {
      "epoch": 4.55,
      "learning_rate": 2e-06,
      "loss": 0.0694,
      "step": 50
    },
    {
      "epoch": 4.64,
      "learning_rate": 2e-06,
      "loss": 0.091,
      "step": 51
    },
    {
      "epoch": 4.73,
      "learning_rate": 2e-06,
      "loss": 0.1032,
      "step": 52
    },
    {
      "epoch": 4.82,
      "learning_rate": 2e-06,
      "loss": 0.097,
      "step": 53
    },
    {
      "epoch": 4.91,
      "learning_rate": 2e-06,
      "loss": 0.0609,
      "step": 54
    },
    {
      "epoch": 5.0,
      "learning_rate": 2e-06,
      "loss": 0.078,
      "step": 55
    },
    {
      "epoch": 5.09,
      "learning_rate": 2e-06,
      "loss": 0.0203,
      "step": 56
    },
    {
      "epoch": 5.18,
      "learning_rate": 2e-06,
      "loss": 0.0806,
      "step": 57
    },
    {
      "epoch": 5.27,
      "learning_rate": 2e-06,
      "loss": 0.0413,
      "step": 58
    },
    {
      "epoch": 5.36,
      "learning_rate": 2e-06,
      "loss": 0.0682,
      "step": 59
    },
    {
      "epoch": 5.45,
      "learning_rate": 2e-06,
      "loss": 0.0339,
      "step": 60
    },
    {
      "epoch": 5.55,
      "learning_rate": 2e-06,
      "loss": 0.0209,
      "step": 61
    },
    {
      "epoch": 5.64,
      "learning_rate": 2e-06,
      "loss": 0.035,
      "step": 62
    },
    {
      "epoch": 5.73,
      "learning_rate": 2e-06,
      "loss": 0.0646,
      "step": 63
    },
    {
      "epoch": 5.82,
      "learning_rate": 2e-06,
      "loss": 0.0421,
      "step": 64
    },
    {
      "epoch": 5.91,
      "learning_rate": 2e-06,
      "loss": 0.0404,
      "step": 65
    },
    {
      "epoch": 6.0,
      "learning_rate": 2e-06,
      "loss": 0.0243,
      "step": 66
    },
    {
      "epoch": 6.09,
      "learning_rate": 2e-06,
      "loss": 0.0266,
      "step": 67
    },
    {
      "epoch": 6.18,
      "learning_rate": 2e-06,
      "loss": 0.0221,
      "step": 68
    },
    {
      "epoch": 6.27,
      "learning_rate": 2e-06,
      "loss": 0.015,
      "step": 69
    },
    {
      "epoch": 6.36,
      "learning_rate": 2e-06,
      "loss": 0.0341,
      "step": 70
    },
    {
      "epoch": 6.45,
      "learning_rate": 2e-06,
      "loss": 0.0437,
      "step": 71
    },
    {
      "epoch": 6.55,
      "learning_rate": 2e-06,
      "loss": 0.0179,
      "step": 72
    },
    {
      "epoch": 6.64,
      "learning_rate": 2e-06,
      "loss": 0.0424,
      "step": 73
    },
    {
      "epoch": 6.73,
      "learning_rate": 2e-06,
      "loss": 0.043,
      "step": 74
    },
    {
      "epoch": 6.82,
      "learning_rate": 2e-06,
      "loss": 0.0203,
      "step": 75
    },
    {
      "epoch": 6.91,
      "learning_rate": 2e-06,
      "loss": 0.0204,
      "step": 76
    },
    {
      "epoch": 7.0,
      "learning_rate": 2e-06,
      "loss": 0.0286,
      "step": 77
    },
    {
      "epoch": 7.09,
      "learning_rate": 2e-06,
      "loss": 0.0094,
      "step": 78
    },
    {
      "epoch": 7.18,
      "learning_rate": 2e-06,
      "loss": 0.0245,
      "step": 79
    },
    {
      "epoch": 7.27,
      "learning_rate": 2e-06,
      "loss": 0.0267,
      "step": 80
    },
    {
      "epoch": 7.36,
      "learning_rate": 2e-06,
      "loss": 0.0034,
      "step": 81
    },
    {
      "epoch": 7.45,
      "learning_rate": 2e-06,
      "loss": 0.02,
      "step": 82
    },
    {
      "epoch": 7.55,
      "learning_rate": 2e-06,
      "loss": 0.0287,
      "step": 83
    },
    {
      "epoch": 7.64,
      "learning_rate": 2e-06,
      "loss": 0.016,
      "step": 84
    },
    {
      "epoch": 7.73,
      "learning_rate": 2e-06,
      "loss": 0.0204,
      "step": 85
    },
    {
      "epoch": 7.82,
      "learning_rate": 2e-06,
      "loss": 0.0084,
      "step": 86
    },
    {
      "epoch": 7.91,
      "learning_rate": 2e-06,
      "loss": 0.0328,
      "step": 87
    },
    {
      "epoch": 8.0,
      "learning_rate": 2e-06,
      "loss": 0.0038,
      "step": 88
    },
    {
      "epoch": 8.09,
      "learning_rate": 2e-06,
      "loss": 0.0028,
      "step": 89
    },
    {
      "epoch": 8.18,
      "learning_rate": 2e-06,
      "loss": 0.0007,
      "step": 90
    },
    {
      "epoch": 8.27,
      "learning_rate": 2e-06,
      "loss": 0.0025,
      "step": 91
    },
    {
      "epoch": 8.36,
      "learning_rate": 2e-06,
      "loss": 0.0021,
      "step": 92
    },
    {
      "epoch": 8.45,
      "learning_rate": 2e-06,
      "loss": 0.0227,
      "step": 93
    },
    {
      "epoch": 8.55,
      "learning_rate": 2e-06,
      "loss": 0.0049,
      "step": 94
    },
    {
      "epoch": 8.64,
      "learning_rate": 2e-06,
      "loss": 0.0144,
      "step": 95
    },
    {
      "epoch": 8.73,
      "learning_rate": 2e-06,
      "loss": 0.0057,
      "step": 96
    },
    {
      "epoch": 8.82,
      "learning_rate": 2e-06,
      "loss": 0.0002,
      "step": 97
    },
    {
      "epoch": 8.91,
      "learning_rate": 2e-06,
      "loss": 0.1342,
      "step": 98
    },
    {
      "epoch": 9.0,
      "learning_rate": 2e-06,
      "loss": 0.0287,
      "step": 99
    },
    {
      "epoch": 9.09,
      "learning_rate": 2e-06,
      "loss": 0.0003,
      "step": 100
    },
    {
      "epoch": 9.18,
      "learning_rate": 2e-06,
      "loss": 0.0073,
      "step": 101
    },
    {
      "epoch": 9.27,
      "learning_rate": 2e-06,
      "loss": 0.0001,
      "step": 102
    },
    {
      "epoch": 9.36,
      "learning_rate": 2e-06,
      "loss": 0.0052,
      "step": 103
    },
    {
      "epoch": 9.45,
      "learning_rate": 2e-06,
      "loss": 0.0031,
      "step": 104
    },
    {
      "epoch": 9.55,
      "learning_rate": 2e-06,
      "loss": 0.001,
      "step": 105
    },
    {
      "epoch": 9.64,
      "learning_rate": 2e-06,
      "loss": 0.035,
      "step": 106
    },
    {
      "epoch": 9.73,
      "learning_rate": 2e-06,
      "loss": 0.0291,
      "step": 107
    },
    {
      "epoch": 9.82,
      "learning_rate": 2e-06,
      "loss": 0.0303,
      "step": 108
    },
    {
      "epoch": 9.91,
      "learning_rate": 2e-06,
      "loss": 0.0034,
      "step": 109
    },
    {
      "epoch": 10.0,
      "learning_rate": 2e-06,
      "loss": 0.0051,
      "step": 110
    },
    {
      "epoch": 10.0,
      "step": 110,
      "total_flos": 3159825776640.0,
      "train_loss": 0.4391266070077439,
      "train_runtime": 3835.8947,
      "train_samples_per_second": 3.452,
      "train_steps_per_second": 0.029
    }
  ],
  "logging_steps": 1.0,
  "max_steps": 110,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 50000,
  "total_flos": 3159825776640.0,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}