{
  "best_metric": 0.29386183619499207,
  "best_model_checkpoint": "./ryan_model314/checkpoint-200",
  "epoch": 4.0,
  "eval_steps": 100,
  "global_step": 252,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.16,
      "grad_norm": 1.1858803033828735,
      "learning_rate": 0.00019206349206349208,
      "loss": 0.5736,
      "step": 10
    },
    {
      "epoch": 0.32,
      "grad_norm": 1.41181480884552,
      "learning_rate": 0.00018412698412698412,
      "loss": 0.4142,
      "step": 20
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.6347964406013489,
      "learning_rate": 0.0001761904761904762,
      "loss": 0.3916,
      "step": 30
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.7893273234367371,
      "learning_rate": 0.00016825396825396826,
      "loss": 0.3628,
      "step": 40
    },
    {
      "epoch": 0.79,
      "grad_norm": 0.9887136816978455,
      "learning_rate": 0.00016031746031746033,
      "loss": 0.3852,
      "step": 50
    },
    {
      "epoch": 0.95,
      "grad_norm": 1.1596781015396118,
      "learning_rate": 0.00015238095238095237,
      "loss": 0.3962,
      "step": 60
    },
    {
      "epoch": 1.11,
      "grad_norm": 1.1897984743118286,
      "learning_rate": 0.00014444444444444444,
      "loss": 0.2923,
      "step": 70
    },
    {
      "epoch": 1.27,
      "grad_norm": 0.5242781639099121,
      "learning_rate": 0.0001365079365079365,
      "loss": 0.2335,
      "step": 80
    },
    {
      "epoch": 1.43,
      "grad_norm": 1.0704305171966553,
      "learning_rate": 0.00012857142857142858,
      "loss": 0.268,
      "step": 90
    },
    {
      "epoch": 1.59,
      "grad_norm": 0.852606475353241,
      "learning_rate": 0.00012063492063492063,
      "loss": 0.1969,
      "step": 100
    },
    {
      "epoch": 1.59,
      "eval_loss": 0.2954840064048767,
      "eval_na_accuracy": 0.945,
      "eval_ordinal_accuracy": 0.4785276073619632,
      "eval_runtime": 26.5841,
      "eval_samples_per_second": 7.523,
      "eval_steps_per_second": 0.94,
      "step": 100
    },
    {
      "epoch": 1.75,
      "grad_norm": 0.6733121871948242,
      "learning_rate": 0.0001126984126984127,
      "loss": 0.2328,
      "step": 110
    },
    {
      "epoch": 1.9,
      "grad_norm": 0.9004744291305542,
      "learning_rate": 0.00010476190476190477,
      "loss": 0.2248,
      "step": 120
    },
    {
      "epoch": 2.06,
      "grad_norm": 3.1687183380126953,
      "learning_rate": 9.682539682539682e-05,
      "loss": 0.1717,
      "step": 130
    },
    {
      "epoch": 2.22,
      "grad_norm": 0.6659616827964783,
      "learning_rate": 8.888888888888889e-05,
      "loss": 0.145,
      "step": 140
    },
    {
      "epoch": 2.38,
      "grad_norm": 0.6846858859062195,
      "learning_rate": 8.095238095238096e-05,
      "loss": 0.1157,
      "step": 150
    },
    {
      "epoch": 2.54,
      "grad_norm": 0.6155730485916138,
      "learning_rate": 7.301587301587302e-05,
      "loss": 0.1424,
      "step": 160
    },
    {
      "epoch": 2.7,
      "grad_norm": 0.6559838056564331,
      "learning_rate": 6.507936507936509e-05,
      "loss": 0.1041,
      "step": 170
    },
    {
      "epoch": 2.86,
      "grad_norm": 0.4849882423877716,
      "learning_rate": 5.714285714285714e-05,
      "loss": 0.1339,
      "step": 180
    },
    {
      "epoch": 3.02,
      "grad_norm": 0.6311644911766052,
      "learning_rate": 4.9206349206349204e-05,
      "loss": 0.103,
      "step": 190
    },
    {
      "epoch": 3.17,
      "grad_norm": 0.8323171138763428,
      "learning_rate": 4.126984126984127e-05,
      "loss": 0.0746,
      "step": 200
    },
    {
      "epoch": 3.17,
      "eval_loss": 0.29386183619499207,
      "eval_na_accuracy": 0.945,
      "eval_ordinal_accuracy": 0.5705521472392638,
      "eval_runtime": 8.3287,
      "eval_samples_per_second": 24.013,
      "eval_steps_per_second": 3.002,
      "step": 200
    },
    {
      "epoch": 3.33,
      "grad_norm": 0.8798254132270813,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 0.086,
      "step": 210
    },
    {
      "epoch": 3.49,
      "grad_norm": 0.37054240703582764,
      "learning_rate": 2.5396825396825397e-05,
      "loss": 0.0933,
      "step": 220
    },
    {
      "epoch": 3.65,
      "grad_norm": 0.46293869614601135,
      "learning_rate": 1.746031746031746e-05,
      "loss": 0.066,
      "step": 230
    },
    {
      "epoch": 3.81,
      "grad_norm": 0.3859086334705353,
      "learning_rate": 9.523809523809523e-06,
      "loss": 0.0649,
      "step": 240
    },
    {
      "epoch": 3.97,
      "grad_norm": 0.300207257270813,
      "learning_rate": 1.5873015873015873e-06,
      "loss": 0.0623,
      "step": 250
    },
    {
      "epoch": 4.0,
      "step": 252,
      "total_flos": 3.0997907103744e+17,
      "train_loss": 0.212149089468377,
      "train_runtime": 378.6413,
      "train_samples_per_second": 10.564,
      "train_steps_per_second": 0.666
    }
  ],
  "logging_steps": 10,
  "max_steps": 252,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 100,
  "total_flos": 3.0997907103744e+17,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}