two_agent_1_epoch_2_dpo_iter_6 / trainer_state.json
YYYYYYibo's picture
Model save
f8e7726 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9992429977289932,
"eval_steps": 500,
"global_step": 165,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"grad_norm": 43.6239071005382,
"learning_rate": 2.941176470588235e-09,
"logits/chosen": -1.3522639274597168,
"logits/rejected": -1.3693311214447021,
"logps/chosen": -262.57476806640625,
"logps/rejected": -283.94244384765625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.06,
"grad_norm": 40.670158110610615,
"learning_rate": 2.941176470588235e-08,
"logits/chosen": -1.1757179498672485,
"logits/rejected": -1.2358938455581665,
"logps/chosen": -280.3355407714844,
"logps/rejected": -300.9811706542969,
"loss": 0.6928,
"rewards/accuracies": 0.4236111044883728,
"rewards/chosen": 0.0006423706654459238,
"rewards/margins": 0.00042565667536109686,
"rewards/rejected": 0.00021671393187716603,
"step": 10
},
{
"epoch": 0.12,
"grad_norm": 40.34301193149703,
"learning_rate": 4.994932636402031e-08,
"logits/chosen": -1.1265027523040771,
"logits/rejected": -1.3426095247268677,
"logps/chosen": -277.8979187011719,
"logps/rejected": -299.1261291503906,
"loss": 0.6925,
"rewards/accuracies": 0.5406249761581421,
"rewards/chosen": 0.0021728514693677425,
"rewards/margins": 0.000990995205938816,
"rewards/rejected": 0.0011818561470136046,
"step": 20
},
{
"epoch": 0.18,
"grad_norm": 38.47792188182457,
"learning_rate": 4.905416503522123e-08,
"logits/chosen": -1.0218889713287354,
"logits/rejected": -1.151049256324768,
"logps/chosen": -273.4291687011719,
"logps/rejected": -301.57781982421875,
"loss": 0.6924,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.016858745366334915,
"rewards/margins": 0.0027274340391159058,
"rewards/rejected": 0.014131310395896435,
"step": 30
},
{
"epoch": 0.24,
"grad_norm": 56.52700447561209,
"learning_rate": 4.707922373336523e-08,
"logits/chosen": -1.084263801574707,
"logits/rejected": -1.2900816202163696,
"logps/chosen": -292.4299011230469,
"logps/rejected": -308.45062255859375,
"loss": 0.6932,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.030295047909021378,
"rewards/margins": 0.000461754942080006,
"rewards/rejected": 0.029833292588591576,
"step": 40
},
{
"epoch": 0.3,
"grad_norm": 38.00149695178064,
"learning_rate": 4.4113156629677314e-08,
"logits/chosen": -1.167959451675415,
"logits/rejected": -1.299862265586853,
"logps/chosen": -296.1455383300781,
"logps/rejected": -305.6954040527344,
"loss": 0.6903,
"rewards/accuracies": 0.4906249940395355,
"rewards/chosen": 0.04823774844408035,
"rewards/margins": 0.00479243416339159,
"rewards/rejected": 0.043445318937301636,
"step": 50
},
{
"epoch": 0.36,
"grad_norm": 37.72422002493558,
"learning_rate": 4.028910905897228e-08,
"logits/chosen": -1.181056261062622,
"logits/rejected": -1.0861554145812988,
"logps/chosen": -292.48040771484375,
"logps/rejected": -304.0435485839844,
"loss": 0.6912,
"rewards/accuracies": 0.528124988079071,
"rewards/chosen": 0.06034231185913086,
"rewards/margins": 0.006084255874156952,
"rewards/rejected": 0.054258059710264206,
"step": 60
},
{
"epoch": 0.42,
"grad_norm": 42.79122392676645,
"learning_rate": 3.577874068920445e-08,
"logits/chosen": -1.210323691368103,
"logits/rejected": -1.065538763999939,
"logps/chosen": -286.93572998046875,
"logps/rejected": -306.0190124511719,
"loss": 0.6918,
"rewards/accuracies": 0.5093749761581421,
"rewards/chosen": 0.06624683737754822,
"rewards/margins": 0.004461642820388079,
"rewards/rejected": 0.06178520247340202,
"step": 70
},
{
"epoch": 0.48,
"grad_norm": 39.75881017842744,
"learning_rate": 3.078451980100854e-08,
"logits/chosen": -1.1516613960266113,
"logits/rejected": -1.3043029308319092,
"logps/chosen": -270.6875,
"logps/rejected": -290.72998046875,
"loss": 0.6911,
"rewards/accuracies": 0.515625,
"rewards/chosen": 0.04866773635149002,
"rewards/margins": 0.0037067097146064043,
"rewards/rejected": 0.04496103152632713,
"step": 80
},
{
"epoch": 0.55,
"grad_norm": 43.01085877492651,
"learning_rate": 2.5530634583340587e-08,
"logits/chosen": -1.2572039365768433,
"logits/rejected": -1.0870755910873413,
"logps/chosen": -273.9654235839844,
"logps/rejected": -290.50836181640625,
"loss": 0.6898,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 0.03104880452156067,
"rewards/margins": 0.00489948783069849,
"rewards/rejected": 0.026149321347475052,
"step": 90
},
{
"epoch": 0.61,
"grad_norm": 47.166985943498034,
"learning_rate": 2.0252929432814285e-08,
"logits/chosen": -1.1381770372390747,
"logits/rejected": -1.3748772144317627,
"logps/chosen": -282.6134338378906,
"logps/rejected": -304.66790771484375,
"loss": 0.6897,
"rewards/accuracies": 0.5843750238418579,
"rewards/chosen": 0.03918559476733208,
"rewards/margins": 0.01519505213946104,
"rewards/rejected": 0.023990539833903313,
"step": 100
},
{
"epoch": 0.67,
"grad_norm": 44.673514372021515,
"learning_rate": 1.5188318011445905e-08,
"logits/chosen": -1.065263271331787,
"logits/rejected": -1.2649091482162476,
"logps/chosen": -277.50775146484375,
"logps/rejected": -300.609619140625,
"loss": 0.6902,
"rewards/accuracies": 0.5531250238418579,
"rewards/chosen": 0.028311368077993393,
"rewards/margins": 0.008604733273386955,
"rewards/rejected": 0.01970663294196129,
"step": 110
},
{
"epoch": 0.73,
"grad_norm": 46.58773437073759,
"learning_rate": 1.0564148305586295e-08,
"logits/chosen": -1.1271841526031494,
"logits/rejected": -1.1778924465179443,
"logps/chosen": -279.55084228515625,
"logps/rejected": -298.75030517578125,
"loss": 0.6902,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 0.028242077678442,
"rewards/margins": 0.003035143483430147,
"rewards/rejected": 0.02520693466067314,
"step": 120
},
{
"epoch": 0.79,
"grad_norm": 41.9191117851892,
"learning_rate": 6.587997083462196e-09,
"logits/chosen": -1.0855623483657837,
"logits/rejected": -1.1804945468902588,
"logps/chosen": -283.80682373046875,
"logps/rejected": -294.71844482421875,
"loss": 0.6889,
"rewards/accuracies": 0.590624988079071,
"rewards/chosen": 0.033344708383083344,
"rewards/margins": 0.013705698773264885,
"rewards/rejected": 0.01963900588452816,
"step": 130
},
{
"epoch": 0.85,
"grad_norm": 44.355218289856595,
"learning_rate": 3.438351873250492e-09,
"logits/chosen": -1.092165470123291,
"logits/rejected": -1.280500054359436,
"logps/chosen": -278.0908508300781,
"logps/rejected": -305.513427734375,
"loss": 0.6901,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 0.033924926072359085,
"rewards/margins": 0.007848087698221207,
"rewards/rejected": 0.026076842099428177,
"step": 140
},
{
"epoch": 0.91,
"grad_norm": 44.46354128863562,
"learning_rate": 1.256598743236703e-09,
"logits/chosen": -1.0778554677963257,
"logits/rejected": -1.2542009353637695,
"logps/chosen": -265.0628967285156,
"logps/rejected": -297.0721130371094,
"loss": 0.6898,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 0.022747624665498734,
"rewards/margins": 0.007828270085155964,
"rewards/rejected": 0.014919353649020195,
"step": 150
},
{
"epoch": 0.97,
"grad_norm": 46.98138173894952,
"learning_rate": 1.4067554877743859e-10,
"logits/chosen": -1.170921802520752,
"logits/rejected": -1.1549434661865234,
"logps/chosen": -280.11676025390625,
"logps/rejected": -299.28729248046875,
"loss": 0.6895,
"rewards/accuracies": 0.528124988079071,
"rewards/chosen": 0.031538333743810654,
"rewards/margins": 0.006610988173633814,
"rewards/rejected": 0.024927344173192978,
"step": 160
},
{
"epoch": 1.0,
"step": 165,
"total_flos": 0.0,
"train_loss": 0.6908076347726764,
"train_runtime": 32496.9517,
"train_samples_per_second": 0.65,
"train_steps_per_second": 0.005
}
],
"logging_steps": 10,
"max_steps": 165,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}