|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 9.383378016085791, |
|
"eval_steps": 500, |
|
"global_step": 7000, |
|
"is_hyper_param_search": true, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.6702412868632708, |
|
"grad_norm": 1.8121609687805176, |
|
"learning_rate": 4.400558387583305e-05, |
|
"loss": 0.6264, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.5072075128555298, |
|
"eval_loss": 0.6931472420692444, |
|
"eval_runtime": 9.1548, |
|
"eval_samples_per_second": 325.839, |
|
"eval_steps_per_second": 20.426, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 1.3404825737265416, |
|
"grad_norm": 1.8386740684509277, |
|
"learning_rate": 4.0844263195097915e-05, |
|
"loss": 0.7039, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.49882668256759644, |
|
"eval_loss": 0.6931472420692444, |
|
"eval_runtime": 9.1028, |
|
"eval_samples_per_second": 327.702, |
|
"eval_steps_per_second": 20.543, |
|
"step": 1492 |
|
}, |
|
{ |
|
"epoch": 2.0107238605898123, |
|
"grad_norm": 1.775593876838684, |
|
"learning_rate": 3.768294251436278e-05, |
|
"loss": 0.7008, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.680965147453083, |
|
"grad_norm": 1.6979082822799683, |
|
"learning_rate": 3.452162183362765e-05, |
|
"loss": 0.7034, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.5015085339546204, |
|
"eval_loss": 0.6931472420692444, |
|
"eval_runtime": 9.1903, |
|
"eval_samples_per_second": 324.583, |
|
"eval_steps_per_second": 20.348, |
|
"step": 2238 |
|
}, |
|
{ |
|
"epoch": 3.351206434316354, |
|
"grad_norm": 1.7350631952285767, |
|
"learning_rate": 3.136030115289252e-05, |
|
"loss": 0.7017, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.5065370202064514, |
|
"eval_loss": 0.6931472420692444, |
|
"eval_runtime": 9.2185, |
|
"eval_samples_per_second": 323.589, |
|
"eval_steps_per_second": 20.285, |
|
"step": 2984 |
|
}, |
|
{ |
|
"epoch": 4.021447721179625, |
|
"grad_norm": 1.841038465499878, |
|
"learning_rate": 2.8198980472157382e-05, |
|
"loss": 0.7019, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 4.6916890080428955, |
|
"grad_norm": 1.6272307634353638, |
|
"learning_rate": 2.5037659791422253e-05, |
|
"loss": 0.6994, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.5095541477203369, |
|
"eval_loss": 0.6931472420692444, |
|
"eval_runtime": 9.0976, |
|
"eval_samples_per_second": 327.89, |
|
"eval_steps_per_second": 20.555, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 5.361930294906166, |
|
"grad_norm": 1.7355244159698486, |
|
"learning_rate": 2.1876339110687117e-05, |
|
"loss": 0.7006, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.49446865916252136, |
|
"eval_loss": 0.6931472420692444, |
|
"eval_runtime": 9.0977, |
|
"eval_samples_per_second": 327.885, |
|
"eval_steps_per_second": 20.555, |
|
"step": 4476 |
|
}, |
|
{ |
|
"epoch": 6.032171581769437, |
|
"grad_norm": 1.6792231798171997, |
|
"learning_rate": 1.8715018429951985e-05, |
|
"loss": 0.7005, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 6.702412868632708, |
|
"grad_norm": 1.7535431385040283, |
|
"learning_rate": 1.5553697749216852e-05, |
|
"loss": 0.701, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.4941334128379822, |
|
"eval_loss": 0.6931472420692444, |
|
"eval_runtime": 9.0431, |
|
"eval_samples_per_second": 329.865, |
|
"eval_steps_per_second": 20.679, |
|
"step": 5222 |
|
}, |
|
{ |
|
"epoch": 7.372654155495979, |
|
"grad_norm": 1.7443852424621582, |
|
"learning_rate": 1.239237706848172e-05, |
|
"loss": 0.7015, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.5122359991073608, |
|
"eval_loss": 0.6931472420692444, |
|
"eval_runtime": 9.0713, |
|
"eval_samples_per_second": 328.839, |
|
"eval_steps_per_second": 20.614, |
|
"step": 5968 |
|
}, |
|
{ |
|
"epoch": 8.04289544235925, |
|
"grad_norm": 1.6793931722640991, |
|
"learning_rate": 9.231056387746587e-06, |
|
"loss": 0.7003, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 8.71313672922252, |
|
"grad_norm": 1.6744327545166016, |
|
"learning_rate": 6.069735707011455e-06, |
|
"loss": 0.7013, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.48776400089263916, |
|
"eval_loss": 0.6931472420692444, |
|
"eval_runtime": 9.1943, |
|
"eval_samples_per_second": 324.441, |
|
"eval_steps_per_second": 20.339, |
|
"step": 6714 |
|
}, |
|
{ |
|
"epoch": 9.383378016085791, |
|
"grad_norm": 1.55774986743927, |
|
"learning_rate": 2.908415026276322e-06, |
|
"loss": 0.6984, |
|
"step": 7000 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 7460, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 500, |
|
"total_flos": 1.8226917764893548e+16, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": { |
|
"learning_rate": 4.716690455656818e-05, |
|
"per_device_train_batch_size": 16 |
|
} |
|
} |
|
|