|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 100, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 2.5629515647888184, |
|
"learning_rate": 0.0001, |
|
"loss": 2.2932, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.5660052299499512, |
|
"learning_rate": 0.0001, |
|
"loss": 1.1775, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.7450891733169556, |
|
"learning_rate": 0.0001, |
|
"loss": 1.048, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.785325050354004, |
|
"learning_rate": 0.0001, |
|
"loss": 1.0855, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 2.3252322673797607, |
|
"learning_rate": 0.0001, |
|
"loss": 0.9265, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.5729879140853882, |
|
"learning_rate": 0.0001, |
|
"loss": 0.9402, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.5131912231445312, |
|
"learning_rate": 0.0001, |
|
"loss": 0.9867, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.9407297372817993, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8494, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.085139513015747, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8612, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.0694725513458252, |
|
"learning_rate": 0.0001, |
|
"loss": 1.0641, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"eval_loss": 0.8292067050933838, |
|
"eval_runtime": 53.3995, |
|
"eval_samples_per_second": 9.363, |
|
"eval_steps_per_second": 4.682, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.7079308032989502, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8869, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.5450018644332886, |
|
"learning_rate": 0.0001, |
|
"loss": 0.9056, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.4224787950515747, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8426, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 2.317502737045288, |
|
"learning_rate": 0.0001, |
|
"loss": 1.1121, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.5484294891357422, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7657, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.1524784564971924, |
|
"learning_rate": 0.0001, |
|
"loss": 0.9395, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.3373467922210693, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8741, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.813785195350647, |
|
"learning_rate": 0.0001, |
|
"loss": 0.807, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.3544365167617798, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8619, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.4264659881591797, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7061, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_loss": 0.7746801972389221, |
|
"eval_runtime": 53.3696, |
|
"eval_samples_per_second": 9.369, |
|
"eval_steps_per_second": 4.684, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.9158445596694946, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8594, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.9574500322341919, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8221, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.2509143352508545, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7963, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.714758038520813, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7244, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.070537805557251, |
|
"learning_rate": 0.0001, |
|
"loss": 0.784, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 3.861132860183716, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8176, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.2554326057434082, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8537, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.9133288264274597, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7829, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.508233666419983, |
|
"learning_rate": 0.0001, |
|
"loss": 0.9645, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.0573885440826416, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7912, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"eval_loss": 0.7474251985549927, |
|
"eval_runtime": 53.3631, |
|
"eval_samples_per_second": 9.37, |
|
"eval_steps_per_second": 4.685, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.4665676355361938, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8004, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 2.0176584720611572, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8841, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 2.208796501159668, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8888, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.3332535028457642, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7775, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 2.814683675765991, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7112, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 1.540370225906372, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8344, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 1.5666872262954712, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7647, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.2998279333114624, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8868, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.2700724601745605, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8204, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.0913368463516235, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7203, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"eval_loss": 0.7208259105682373, |
|
"eval_runtime": 53.3297, |
|
"eval_samples_per_second": 9.376, |
|
"eval_steps_per_second": 4.688, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 1.0129300355911255, |
|
"learning_rate": 0.0001, |
|
"loss": 0.6732, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.3054537773132324, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7467, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.4525396823883057, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8085, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.1094130277633667, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7971, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 1.422092318534851, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7452, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.9026339054107666, |
|
"learning_rate": 0.0001, |
|
"loss": 0.9912, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.0568705797195435, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7249, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 1.3295648097991943, |
|
"learning_rate": 0.0001, |
|
"loss": 0.6573, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 1.215010404586792, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8908, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.9312228560447693, |
|
"learning_rate": 0.0001, |
|
"loss": 0.714, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.7121768593788147, |
|
"eval_runtime": 53.3521, |
|
"eval_samples_per_second": 9.372, |
|
"eval_steps_per_second": 4.686, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.294978471477248e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|