two_agent_1_epoch_2_dpo_iter_4 / trainer_state.json
YYYYYYibo's picture
Model save
c4f1e44 verified
raw
history blame
8.93 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9984,
"eval_steps": 500,
"global_step": 156,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"grad_norm": 24.144679616293203,
"learning_rate": 6.25e-09,
"logits/chosen": -0.4835050106048584,
"logits/rejected": -0.45789963006973267,
"logps/chosen": -214.22390747070312,
"logps/rejected": -238.45899963378906,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.06,
"grad_norm": 27.157833222439553,
"learning_rate": 6.25e-08,
"logits/chosen": -0.5191683769226074,
"logits/rejected": -0.6054410338401794,
"logps/chosen": -233.09689331054688,
"logps/rejected": -247.0145721435547,
"loss": 0.6931,
"rewards/accuracies": 0.4097222089767456,
"rewards/chosen": -0.00034871429670602083,
"rewards/margins": -0.0003790514019783586,
"rewards/rejected": 3.03371598420199e-05,
"step": 10
},
{
"epoch": 0.13,
"grad_norm": 22.695219914328067,
"learning_rate": 9.979871469976195e-08,
"logits/chosen": -0.5110132098197937,
"logits/rejected": -0.5159324407577515,
"logps/chosen": -226.8810272216797,
"logps/rejected": -234.03579711914062,
"loss": 0.6927,
"rewards/accuracies": 0.484375,
"rewards/chosen": -0.012425726279616356,
"rewards/margins": 0.0005857773358002305,
"rewards/rejected": -0.013011504895985126,
"step": 20
},
{
"epoch": 0.19,
"grad_norm": 27.267303752253287,
"learning_rate": 9.755282581475768e-08,
"logits/chosen": -0.46132326126098633,
"logits/rejected": -0.5038495063781738,
"logps/chosen": -228.7631378173828,
"logps/rejected": -243.3440399169922,
"loss": 0.6895,
"rewards/accuracies": 0.5718749761581421,
"rewards/chosen": -0.05522267147898674,
"rewards/margins": 0.011394877918064594,
"rewards/rejected": -0.06661754846572876,
"step": 30
},
{
"epoch": 0.26,
"grad_norm": 25.80441755927495,
"learning_rate": 9.29224396800933e-08,
"logits/chosen": -0.5605762600898743,
"logits/rejected": -0.4145810008049011,
"logps/chosen": -235.9694366455078,
"logps/rejected": -245.53140258789062,
"loss": 0.6877,
"rewards/accuracies": 0.5406249761581421,
"rewards/chosen": -0.12484677881002426,
"rewards/margins": 0.01826881244778633,
"rewards/rejected": -0.1431155800819397,
"step": 40
},
{
"epoch": 0.32,
"grad_norm": 29.148756992059326,
"learning_rate": 8.613974319136957e-08,
"logits/chosen": -0.33988311886787415,
"logits/rejected": -0.4071916937828064,
"logps/chosen": -225.6988525390625,
"logps/rejected": -248.31716918945312,
"loss": 0.6858,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.11917146295309067,
"rewards/margins": 0.03840441256761551,
"rewards/rejected": -0.15757587552070618,
"step": 50
},
{
"epoch": 0.38,
"grad_norm": 31.691697738483036,
"learning_rate": 7.754484907260513e-08,
"logits/chosen": -0.32609015703201294,
"logits/rejected": -0.26230502128601074,
"logps/chosen": -227.6748504638672,
"logps/rejected": -247.0984649658203,
"loss": 0.6813,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.07656601071357727,
"rewards/margins": 0.052247386425733566,
"rewards/rejected": -0.12881340086460114,
"step": 60
},
{
"epoch": 0.45,
"grad_norm": 29.21753574540554,
"learning_rate": 6.756874120406714e-08,
"logits/chosen": -0.46613264083862305,
"logits/rejected": -0.40840277075767517,
"logps/chosen": -227.2427520751953,
"logps/rejected": -237.88174438476562,
"loss": 0.6836,
"rewards/accuracies": 0.5406249761581421,
"rewards/chosen": -0.011777873151004314,
"rewards/margins": 0.03103628195822239,
"rewards/rejected": -0.04281415045261383,
"step": 70
},
{
"epoch": 0.51,
"grad_norm": 27.295936995560456,
"learning_rate": 5.6711663290882774e-08,
"logits/chosen": -0.3539288640022278,
"logits/rejected": -0.4821072518825531,
"logps/chosen": -227.5625,
"logps/rejected": -246.0811767578125,
"loss": 0.6843,
"rewards/accuracies": 0.5093749761581421,
"rewards/chosen": -0.02285691350698471,
"rewards/margins": 0.04252464324235916,
"rewards/rejected": -0.06538156419992447,
"step": 80
},
{
"epoch": 0.58,
"grad_norm": 37.94611808985469,
"learning_rate": 4.551803455482833e-08,
"logits/chosen": -0.31488946080207825,
"logits/rejected": -0.2586648762226105,
"logps/chosen": -235.135009765625,
"logps/rejected": -250.8152618408203,
"loss": 0.6811,
"rewards/accuracies": 0.559374988079071,
"rewards/chosen": -0.05702267214655876,
"rewards/margins": 0.03373400494456291,
"rewards/rejected": -0.09075668454170227,
"step": 90
},
{
"epoch": 0.64,
"grad_norm": 31.89166999144619,
"learning_rate": 3.4549150281252633e-08,
"logits/chosen": -0.2342231571674347,
"logits/rejected": -0.11592201143503189,
"logps/chosen": -241.8380126953125,
"logps/rejected": -248.9578094482422,
"loss": 0.6851,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.10655052959918976,
"rewards/margins": 0.02655043639242649,
"rewards/rejected": -0.1331009566783905,
"step": 100
},
{
"epoch": 0.7,
"grad_norm": 33.31375799730263,
"learning_rate": 2.43550361297047e-08,
"logits/chosen": -0.18439307808876038,
"logits/rejected": -0.2467001974582672,
"logps/chosen": -231.55880737304688,
"logps/rejected": -247.73422241210938,
"loss": 0.6818,
"rewards/accuracies": 0.503125011920929,
"rewards/chosen": -0.1505533754825592,
"rewards/margins": 0.03467424958944321,
"rewards/rejected": -0.185227632522583,
"step": 110
},
{
"epoch": 0.77,
"grad_norm": 47.472733400866,
"learning_rate": 1.5446867550656767e-08,
"logits/chosen": -0.33943504095077515,
"logits/rejected": -0.27855488657951355,
"logps/chosen": -237.60281372070312,
"logps/rejected": -256.1316833496094,
"loss": 0.6833,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.1447935551404953,
"rewards/margins": 0.04320163279771805,
"rewards/rejected": -0.18799519538879395,
"step": 120
},
{
"epoch": 0.83,
"grad_norm": 33.56575623501931,
"learning_rate": 8.271337313934867e-09,
"logits/chosen": -0.20994436740875244,
"logits/rejected": -0.30067163705825806,
"logps/chosen": -242.28720092773438,
"logps/rejected": -260.9054260253906,
"loss": 0.6824,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.13135352730751038,
"rewards/margins": 0.033290181308984756,
"rewards/rejected": -0.16464371979236603,
"step": 130
},
{
"epoch": 0.9,
"grad_norm": 36.04452677999868,
"learning_rate": 3.1882564680131396e-09,
"logits/chosen": -0.2294171154499054,
"logits/rejected": -0.14980368316173553,
"logps/chosen": -226.87939453125,
"logps/rejected": -243.36831665039062,
"loss": 0.6884,
"rewards/accuracies": 0.503125011920929,
"rewards/chosen": -0.11503396183252335,
"rewards/margins": 0.014261065050959587,
"rewards/rejected": -0.12929502129554749,
"step": 140
},
{
"epoch": 0.96,
"grad_norm": 34.70896434984119,
"learning_rate": 4.52511911603265e-10,
"logits/chosen": -0.28860437870025635,
"logits/rejected": -0.18086276948451996,
"logps/chosen": -245.88034057617188,
"logps/rejected": -255.14749145507812,
"loss": 0.6783,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.12854784727096558,
"rewards/margins": 0.03599141910672188,
"rewards/rejected": -0.16453926265239716,
"step": 150
},
{
"epoch": 1.0,
"step": 156,
"total_flos": 0.0,
"train_loss": 0.6857114946230863,
"train_runtime": 19132.8262,
"train_samples_per_second": 1.045,
"train_steps_per_second": 0.008
}
],
"logging_steps": 10,
"max_steps": 156,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}