llama-3.1-8b-instruct-armorm / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9995071463775259,
"eval_steps": 400,
"global_step": 507,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001971414489896501,
"grad_norm": 6.2392770862642,
"learning_rate": 9.803921568627451e-09,
"logits/chosen": -1.594488501548767,
"logits/rejected": -1.1860766410827637,
"logps/chosen": -198.3888397216797,
"logps/rejected": -269.352783203125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.009857072449482503,
"grad_norm": 5.49954498256661,
"learning_rate": 4.901960784313725e-08,
"logits/chosen": -1.645488977432251,
"logits/rejected": -1.0096673965454102,
"logps/chosen": -192.4307861328125,
"logps/rejected": -247.57391357421875,
"loss": 0.6931,
"rewards/accuracies": 0.390625,
"rewards/chosen": 0.00013264300650916994,
"rewards/margins": 0.0001808845845516771,
"rewards/rejected": -4.824160714633763e-05,
"step": 5
},
{
"epoch": 0.019714144898965006,
"grad_norm": 4.196436716438617,
"learning_rate": 9.80392156862745e-08,
"logits/chosen": -1.6045820713043213,
"logits/rejected": -1.0348637104034424,
"logps/chosen": -184.26632690429688,
"logps/rejected": -245.4076690673828,
"loss": 0.6931,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": 0.0013285436434671283,
"rewards/margins": -0.0003174581506755203,
"rewards/rejected": 0.001646001823246479,
"step": 10
},
{
"epoch": 0.02957121734844751,
"grad_norm": 5.768149867251834,
"learning_rate": 1.4705882352941175e-07,
"logits/chosen": -1.8137686252593994,
"logits/rejected": -1.135617971420288,
"logps/chosen": -199.5909881591797,
"logps/rejected": -266.2090759277344,
"loss": 0.6924,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.0006634569144807756,
"rewards/margins": 0.0016718091210350394,
"rewards/rejected": -0.002335265977308154,
"step": 15
},
{
"epoch": 0.03942828979793001,
"grad_norm": 5.9407046802470065,
"learning_rate": 1.96078431372549e-07,
"logits/chosen": -1.7376708984375,
"logits/rejected": -1.1297136545181274,
"logps/chosen": -189.01934814453125,
"logps/rejected": -255.4130859375,
"loss": 0.6901,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.006191645748913288,
"rewards/margins": 0.006456127855926752,
"rewards/rejected": -0.012647772207856178,
"step": 20
},
{
"epoch": 0.04928536224741252,
"grad_norm": 5.2689388633967456,
"learning_rate": 2.4509803921568627e-07,
"logits/chosen": -1.7063930034637451,
"logits/rejected": -1.1289308071136475,
"logps/chosen": -204.759765625,
"logps/rejected": -266.6024169921875,
"loss": 0.684,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.017932727932929993,
"rewards/margins": 0.019068485125899315,
"rewards/rejected": -0.03700121492147446,
"step": 25
},
{
"epoch": 0.05914243469689502,
"grad_norm": 6.582821307605904,
"learning_rate": 2.941176470588235e-07,
"logits/chosen": -1.5701669454574585,
"logits/rejected": -1.041677474975586,
"logps/chosen": -194.34347534179688,
"logps/rejected": -276.304443359375,
"loss": 0.673,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.02758154645562172,
"rewards/margins": 0.045433152467012405,
"rewards/rejected": -0.07301469147205353,
"step": 30
},
{
"epoch": 0.06899950714637752,
"grad_norm": 10.279144076298133,
"learning_rate": 3.431372549019608e-07,
"logits/chosen": -1.4824097156524658,
"logits/rejected": -0.9899765253067017,
"logps/chosen": -198.76766967773438,
"logps/rejected": -265.6862487792969,
"loss": 0.6359,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.068024180829525,
"rewards/margins": 0.12255563586950302,
"rewards/rejected": -0.1905798316001892,
"step": 35
},
{
"epoch": 0.07885657959586002,
"grad_norm": 9.399070867553222,
"learning_rate": 3.92156862745098e-07,
"logits/chosen": -1.7072070837020874,
"logits/rejected": -1.1361684799194336,
"logps/chosen": -204.9685516357422,
"logps/rejected": -303.8945617675781,
"loss": 0.5789,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.17202235758304596,
"rewards/margins": 0.32544782757759094,
"rewards/rejected": -0.4974702000617981,
"step": 40
},
{
"epoch": 0.08871365204534254,
"grad_norm": 13.971472653574747,
"learning_rate": 4.4117647058823526e-07,
"logits/chosen": -2.052572727203369,
"logits/rejected": -1.6851530075073242,
"logps/chosen": -326.47637939453125,
"logps/rejected": -529.0755004882812,
"loss": 0.5125,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.31343674659729,
"rewards/margins": 1.4102681875228882,
"rewards/rejected": -2.7237050533294678,
"step": 45
},
{
"epoch": 0.09857072449482504,
"grad_norm": 15.586119367213884,
"learning_rate": 4.901960784313725e-07,
"logits/chosen": -2.3275997638702393,
"logits/rejected": -1.8939182758331299,
"logps/chosen": -430.58563232421875,
"logps/rejected": -688.560302734375,
"loss": 0.491,
"rewards/accuracies": 0.8125,
"rewards/chosen": -2.3751883506774902,
"rewards/margins": 2.010326862335205,
"rewards/rejected": -4.385515213012695,
"step": 50
},
{
"epoch": 0.10842779694430754,
"grad_norm": 25.434888651559017,
"learning_rate": 4.999050767562379e-07,
"logits/chosen": -2.086081027984619,
"logits/rejected": -1.7880547046661377,
"logps/chosen": -363.26824951171875,
"logps/rejected": -565.7152099609375,
"loss": 0.4485,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.765494704246521,
"rewards/margins": 1.468611478805542,
"rewards/rejected": -3.2341067790985107,
"step": 55
},
{
"epoch": 0.11828486939379004,
"grad_norm": 33.28729029749558,
"learning_rate": 4.99519574616467e-07,
"logits/chosen": -2.2118542194366455,
"logits/rejected": -1.929535150527954,
"logps/chosen": -434.70794677734375,
"logps/rejected": -744.1588745117188,
"loss": 0.4177,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -2.4308719635009766,
"rewards/margins": 2.5395278930664062,
"rewards/rejected": -4.970398902893066,
"step": 60
},
{
"epoch": 0.12814194184327254,
"grad_norm": 18.906078399540743,
"learning_rate": 4.988380179235842e-07,
"logits/chosen": -2.071911334991455,
"logits/rejected": -1.7777721881866455,
"logps/chosen": -411.3829040527344,
"logps/rejected": -706.1156005859375,
"loss": 0.3931,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -2.1888458728790283,
"rewards/margins": 2.4369874000549316,
"rewards/rejected": -4.625833511352539,
"step": 65
},
{
"epoch": 0.13799901429275505,
"grad_norm": 29.037425921796533,
"learning_rate": 4.978612153434526e-07,
"logits/chosen": -2.3122410774230957,
"logits/rejected": -2.039794683456421,
"logps/chosen": -457.93646240234375,
"logps/rejected": -924.6302490234375,
"loss": 0.4394,
"rewards/accuracies": 0.875,
"rewards/chosen": -2.7360401153564453,
"rewards/margins": 3.9763436317443848,
"rewards/rejected": -6.712383270263672,
"step": 70
},
{
"epoch": 0.14785608674223755,
"grad_norm": 12.920105460240974,
"learning_rate": 4.965903258506806e-07,
"logits/chosen": -2.1728882789611816,
"logits/rejected": -1.9507039785385132,
"logps/chosen": -443.1044006347656,
"logps/rejected": -733.8536376953125,
"loss": 0.4033,
"rewards/accuracies": 0.8125,
"rewards/chosen": -2.428837537765503,
"rewards/margins": 2.4469170570373535,
"rewards/rejected": -4.8757548332214355,
"step": 75
},
{
"epoch": 0.15771315919172005,
"grad_norm": 13.726152345059775,
"learning_rate": 4.950268573535011e-07,
"logits/chosen": -2.0774412155151367,
"logits/rejected": -1.877873420715332,
"logps/chosen": -434.2333068847656,
"logps/rejected": -697.5498657226562,
"loss": 0.3896,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.339465618133545,
"rewards/margins": 2.073235034942627,
"rewards/rejected": -4.412700176239014,
"step": 80
},
{
"epoch": 0.16757023164120255,
"grad_norm": 22.96957352147464,
"learning_rate": 4.93172664904641e-07,
"logits/chosen": -2.578918695449829,
"logits/rejected": -2.313844680786133,
"logps/chosen": -714.9383544921875,
"logps/rejected": -1185.1837158203125,
"loss": 0.3544,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -5.246799468994141,
"rewards/margins": 4.040070533752441,
"rewards/rejected": -9.286870002746582,
"step": 85
},
{
"epoch": 0.17742730409068508,
"grad_norm": 16.12185087053667,
"learning_rate": 4.910299485003033e-07,
"logits/chosen": -2.3522980213165283,
"logits/rejected": -2.1312594413757324,
"logps/chosen": -548.0660400390625,
"logps/rejected": -941.7537231445312,
"loss": 0.326,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -3.534759044647217,
"rewards/margins": 3.3071117401123047,
"rewards/rejected": -6.841870307922363,
"step": 90
},
{
"epoch": 0.18728437654016758,
"grad_norm": 38.45908394327309,
"learning_rate": 4.886012504698769e-07,
"logits/chosen": -2.29638671875,
"logits/rejected": -2.0095603466033936,
"logps/chosen": -526.743408203125,
"logps/rejected": -906.7442626953125,
"loss": 0.3562,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -3.3386921882629395,
"rewards/margins": 3.1595466136932373,
"rewards/rejected": -6.498239040374756,
"step": 95
},
{
"epoch": 0.19714144898965008,
"grad_norm": 23.37026509407894,
"learning_rate": 4.858894524594652e-07,
"logits/chosen": -2.509087085723877,
"logits/rejected": -2.2394092082977295,
"logps/chosen": -597.5819091796875,
"logps/rejected": -1110.536376953125,
"loss": 0.3208,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -4.074545383453369,
"rewards/margins": 4.4778618812561035,
"rewards/rejected": -8.552406311035156,
"step": 100
},
{
"epoch": 0.20699852143913258,
"grad_norm": 17.538993246799073,
"learning_rate": 4.828977720128198e-07,
"logits/chosen": -2.368518114089966,
"logits/rejected": -2.0958077907562256,
"logps/chosen": -522.4010620117188,
"logps/rejected": -853.4436645507812,
"loss": 0.3199,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -3.3017165660858154,
"rewards/margins": 2.8141353130340576,
"rewards/rejected": -6.115852355957031,
"step": 105
},
{
"epoch": 0.21685559388861508,
"grad_norm": 14.205208079234838,
"learning_rate": 4.796297587537285e-07,
"logits/chosen": -2.4165451526641846,
"logits/rejected": -2.1057496070861816,
"logps/chosen": -577.1276245117188,
"logps/rejected": -963.6633911132812,
"loss": 0.2935,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -3.8097610473632812,
"rewards/margins": 3.394871950149536,
"rewards/rejected": -7.2046332359313965,
"step": 110
},
{
"epoch": 0.22671266633809758,
"grad_norm": 16.990565766105274,
"learning_rate": 4.760892901743944e-07,
"logits/chosen": -2.536337375640869,
"logits/rejected": -2.2590508460998535,
"logps/chosen": -760.9464111328125,
"logps/rejected": -1193.0130615234375,
"loss": 0.3468,
"rewards/accuracies": 0.84375,
"rewards/chosen": -5.601290702819824,
"rewards/margins": 3.6806259155273438,
"rewards/rejected": -9.281916618347168,
"step": 115
},
{
"epoch": 0.23656973878758009,
"grad_norm": 14.381039222874595,
"learning_rate": 4.7228056703479626e-07,
"logits/chosen": -2.490741014480591,
"logits/rejected": -2.1797027587890625,
"logps/chosen": -651.6326293945312,
"logps/rejected": -1045.991943359375,
"loss": 0.3,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -4.517138481140137,
"rewards/margins": 3.300055742263794,
"rewards/rejected": -7.817193508148193,
"step": 120
},
{
"epoch": 0.2464268112370626,
"grad_norm": 13.845809576206165,
"learning_rate": 4.6820810837849535e-07,
"logits/chosen": -2.4549553394317627,
"logits/rejected": -2.05999755859375,
"logps/chosen": -606.1448974609375,
"logps/rejected": -1030.6544189453125,
"loss": 0.2987,
"rewards/accuracies": 0.875,
"rewards/chosen": -4.098814487457275,
"rewards/margins": 3.521782636642456,
"rewards/rejected": -7.620597839355469,
"step": 125
},
{
"epoch": 0.2562838836865451,
"grad_norm": 30.897064038144002,
"learning_rate": 4.63876746170797e-07,
"logits/chosen": -2.3905959129333496,
"logits/rejected": -2.17751145362854,
"logps/chosen": -677.5357055664062,
"logps/rejected": -1075.299072265625,
"loss": 0.299,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -4.783341407775879,
"rewards/margins": 3.367708683013916,
"rewards/rejected": -8.151049613952637,
"step": 130
},
{
"epoch": 0.2661409561360276,
"grad_norm": 24.59610522580519,
"learning_rate": 4.592916195656321e-07,
"logits/chosen": -2.686401844024658,
"logits/rejected": -2.2882132530212402,
"logps/chosen": -798.7413330078125,
"logps/rejected": -1337.7080078125,
"loss": 0.2956,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -6.00932502746582,
"rewards/margins": 4.70266580581665,
"rewards/rejected": -10.711990356445312,
"step": 135
},
{
"epoch": 0.2759980285855101,
"grad_norm": 14.965471726804886,
"learning_rate": 4.544581688079602e-07,
"logits/chosen": -2.4349093437194824,
"logits/rejected": -2.14192533493042,
"logps/chosen": -705.6304321289062,
"logps/rejected": -1094.288330078125,
"loss": 0.2863,
"rewards/accuracies": 0.875,
"rewards/chosen": -5.049565315246582,
"rewards/margins": 3.3041865825653076,
"rewards/rejected": -8.353752136230469,
"step": 140
},
{
"epoch": 0.2858551010349926,
"grad_norm": 18.293122078134097,
"learning_rate": 4.493821287789272e-07,
"logits/chosen": -2.5565428733825684,
"logits/rejected": -2.1939361095428467,
"logps/chosen": -744.5687255859375,
"logps/rejected": -1154.7205810546875,
"loss": 0.2788,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -5.435603618621826,
"rewards/margins": 3.5441932678222656,
"rewards/rejected": -8.979796409606934,
"step": 145
},
{
"epoch": 0.2957121734844751,
"grad_norm": 20.91998686803295,
"learning_rate": 4.4406952219143934e-07,
"logits/chosen": -2.5498974323272705,
"logits/rejected": -2.22133731842041,
"logps/chosen": -842.6162109375,
"logps/rejected": -1307.8778076171875,
"loss": 0.295,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -6.563225746154785,
"rewards/margins": 4.067451477050781,
"rewards/rejected": -10.630678176879883,
"step": 150
},
{
"epoch": 0.3055692459339576,
"grad_norm": 16.16798121676358,
"learning_rate": 4.38526652444224e-07,
"logits/chosen": -2.5155484676361084,
"logits/rejected": -2.1966238021850586,
"logps/chosen": -806.795166015625,
"logps/rejected": -1259.8773193359375,
"loss": 0.2963,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -6.04946231842041,
"rewards/margins": 3.812061309814453,
"rewards/rejected": -9.861523628234863,
"step": 155
},
{
"epoch": 0.3154263183834401,
"grad_norm": 18.735794107249802,
"learning_rate": 4.3276009614285824e-07,
"logits/chosen": -2.464740037918091,
"logits/rejected": -2.1177756786346436,
"logps/chosen": -709.6548461914062,
"logps/rejected": -1163.3350830078125,
"loss": 0.2554,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -5.127840042114258,
"rewards/margins": 3.861670970916748,
"rewards/rejected": -8.989511489868164,
"step": 160
},
{
"epoch": 0.3252833908329226,
"grad_norm": 22.02574974928147,
"learning_rate": 4.2677669529663686e-07,
"logits/chosen": -2.545640707015991,
"logits/rejected": -2.2462990283966064,
"logps/chosen": -783.61669921875,
"logps/rejected": -1255.912353515625,
"loss": 0.2591,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -5.863072395324707,
"rewards/margins": 4.111878395080566,
"rewards/rejected": -9.974950790405273,
"step": 165
},
{
"epoch": 0.3351404632824051,
"grad_norm": 18.815167005575123,
"learning_rate": 4.2058354920054043e-07,
"logits/chosen": -2.5555951595306396,
"logits/rejected": -2.2355425357818604,
"logps/chosen": -801.5789184570312,
"logps/rejected": -1247.4630126953125,
"loss": 0.2675,
"rewards/accuracies": 0.84375,
"rewards/chosen": -6.109245777130127,
"rewards/margins": 3.8287367820739746,
"rewards/rejected": -9.937983512878418,
"step": 170
},
{
"epoch": 0.34499753573188763,
"grad_norm": 15.86237856217812,
"learning_rate": 4.141880060119336e-07,
"logits/chosen": -2.541696786880493,
"logits/rejected": -2.180537700653076,
"logps/chosen": -784.6647338867188,
"logps/rejected": -1234.05126953125,
"loss": 0.2502,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -5.834389686584473,
"rewards/margins": 3.9414896965026855,
"rewards/rejected": -9.77587890625,
"step": 175
},
{
"epoch": 0.35485460818137016,
"grad_norm": 16.77349624146522,
"learning_rate": 4.0759765403198877e-07,
"logits/chosen": -2.5138328075408936,
"logits/rejected": -2.1284890174865723,
"logps/chosen": -700.7459106445312,
"logps/rejected": -1123.9356689453125,
"loss": 0.2808,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -5.06411075592041,
"rewards/margins": 3.7050411701202393,
"rewards/rejected": -8.769152641296387,
"step": 180
},
{
"epoch": 0.36471168063085263,
"grad_norm": 17.550598446162923,
"learning_rate": 4.008203127021797e-07,
"logits/chosen": -2.5796236991882324,
"logits/rejected": -2.215527057647705,
"logps/chosen": -717.72119140625,
"logps/rejected": -1230.483642578125,
"loss": 0.2249,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": -5.206206798553467,
"rewards/margins": 4.433660507202148,
"rewards/rejected": -9.639867782592773,
"step": 185
},
{
"epoch": 0.37456875308033516,
"grad_norm": 15.558233723999136,
"learning_rate": 3.9386402332652754e-07,
"logits/chosen": -2.6024489402770996,
"logits/rejected": -2.3488709926605225,
"logps/chosen": -900.3855590820312,
"logps/rejected": -1392.37255859375,
"loss": 0.2267,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -7.0901055335998535,
"rewards/margins": 4.340862274169922,
"rewards/rejected": -11.430967330932617,
"step": 190
},
{
"epoch": 0.38442582552981763,
"grad_norm": 22.187048055147955,
"learning_rate": 3.867370395306068e-07,
"logits/chosen": -2.6506357192993164,
"logits/rejected": -2.2959604263305664,
"logps/chosen": -900.3836059570312,
"logps/rejected": -1402.96630859375,
"loss": 0.2693,
"rewards/accuracies": 0.875,
"rewards/chosen": -7.0995588302612305,
"rewards/margins": 4.546249866485596,
"rewards/rejected": -11.645808219909668,
"step": 195
},
{
"epoch": 0.39428289797930016,
"grad_norm": 15.588853964432303,
"learning_rate": 3.794478174686328e-07,
"logits/chosen": -2.5797057151794434,
"logits/rejected": -2.1939449310302734,
"logps/chosen": -769.4427490234375,
"logps/rejected": -1267.3035888671875,
"loss": 0.2491,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -5.717661380767822,
"rewards/margins": 4.36967658996582,
"rewards/rejected": -10.087339401245117,
"step": 200
},
{
"epoch": 0.40413997042878264,
"grad_norm": 15.604577624570162,
"learning_rate": 3.720050057902495e-07,
"logits/chosen": -2.4678874015808105,
"logits/rejected": -2.166454553604126,
"logps/chosen": -664.3575439453125,
"logps/rejected": -1184.0172119140625,
"loss": 0.2733,
"rewards/accuracies": 0.84375,
"rewards/chosen": -4.809238910675049,
"rewards/margins": 4.560500144958496,
"rewards/rejected": -9.369739532470703,
"step": 205
},
{
"epoch": 0.41399704287826516,
"grad_norm": 16.104577186140947,
"learning_rate": 3.644174353789204e-07,
"logits/chosen": -2.470492124557495,
"logits/rejected": -2.2408156394958496,
"logps/chosen": -702.6835327148438,
"logps/rejected": -1184.7685546875,
"loss": 0.24,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -5.020668983459473,
"rewards/margins": 4.07871150970459,
"rewards/rejected": -9.099380493164062,
"step": 210
},
{
"epoch": 0.42385411532774764,
"grad_norm": 42.39094832845099,
"learning_rate": 3.566941088741009e-07,
"logits/chosen": -2.465122699737549,
"logits/rejected": -2.202960968017578,
"logps/chosen": -784.384765625,
"logps/rejected": -1312.956298828125,
"loss": 0.2914,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -5.859043598175049,
"rewards/margins": 4.713334083557129,
"rewards/rejected": -10.572378158569336,
"step": 215
},
{
"epoch": 0.43371118777723017,
"grad_norm": 20.944458558373373,
"learning_rate": 3.488441899896217e-07,
"logits/chosen": -2.487208843231201,
"logits/rejected": -2.197640895843506,
"logps/chosen": -729.4404296875,
"logps/rejected": -1207.813232421875,
"loss": 0.2843,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -5.363978862762451,
"rewards/margins": 4.154356956481934,
"rewards/rejected": -9.518336296081543,
"step": 220
},
{
"epoch": 0.44356826022671264,
"grad_norm": 14.863684470935617,
"learning_rate": 3.408769926409574e-07,
"logits/chosen": -2.4418163299560547,
"logits/rejected": -2.1561474800109863,
"logps/chosen": -578.9898071289062,
"logps/rejected": -913.1700439453125,
"loss": 0.2547,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -3.821843385696411,
"rewards/margins": 2.9205775260925293,
"rewards/rejected": -6.742421627044678,
"step": 225
},
{
"epoch": 0.45342533267619517,
"grad_norm": 17.477623835490277,
"learning_rate": 3.3280196989428263e-07,
"logits/chosen": -2.4349989891052246,
"logits/rejected": -2.196359634399414,
"logps/chosen": -682.2156982421875,
"logps/rejected": -1149.534423828125,
"loss": 0.2754,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -4.879435062408447,
"rewards/margins": 3.9676411151885986,
"rewards/rejected": -8.847076416015625,
"step": 230
},
{
"epoch": 0.4632824051256777,
"grad_norm": 16.30612813668589,
"learning_rate": 3.2462870275042367e-07,
"logits/chosen": -2.5115764141082764,
"logits/rejected": -2.3011107444763184,
"logps/chosen": -744.306396484375,
"logps/rejected": -1183.502685546875,
"loss": 0.2276,
"rewards/accuracies": 0.90625,
"rewards/chosen": -5.445645332336426,
"rewards/margins": 3.8912956714630127,
"rewards/rejected": -9.33694076538086,
"step": 235
},
{
"epoch": 0.47313947757516017,
"grad_norm": 18.35946878564802,
"learning_rate": 3.1636688877701806e-07,
"logits/chosen": -2.5281643867492676,
"logits/rejected": -2.2399466037750244,
"logps/chosen": -777.9661865234375,
"logps/rejected": -1258.2623291015625,
"loss": 0.2537,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -5.761153221130371,
"rewards/margins": 4.262064456939697,
"rewards/rejected": -10.023218154907227,
"step": 240
},
{
"epoch": 0.4829965500246427,
"grad_norm": 22.711594265033526,
"learning_rate": 3.080263306023669e-07,
"logits/chosen": -2.43805193901062,
"logits/rejected": -2.136569023132324,
"logps/chosen": -738.892578125,
"logps/rejected": -1253.740966796875,
"loss": 0.2465,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -5.394611358642578,
"rewards/margins": 4.49846076965332,
"rewards/rejected": -9.893071174621582,
"step": 245
},
{
"epoch": 0.4928536224741252,
"grad_norm": 22.523084393015623,
"learning_rate": 2.996169242846328e-07,
"logits/chosen": -2.456860065460205,
"logits/rejected": -2.1488893032073975,
"logps/chosen": -664.892822265625,
"logps/rejected": -1094.06640625,
"loss": 0.2643,
"rewards/accuracies": 0.90625,
"rewards/chosen": -4.737056732177734,
"rewards/margins": 3.7822394371032715,
"rewards/rejected": -8.519296646118164,
"step": 250
},
{
"epoch": 0.5027106949236076,
"grad_norm": 21.85050975494629,
"learning_rate": 2.911486475701835e-07,
"logits/chosen": -2.3711659908294678,
"logits/rejected": -2.104147434234619,
"logps/chosen": -632.7847900390625,
"logps/rejected": -1041.4219970703125,
"loss": 0.2848,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -4.385097503662109,
"rewards/margins": 3.560230255126953,
"rewards/rejected": -7.9453277587890625,
"step": 255
},
{
"epoch": 0.5125677673730902,
"grad_norm": 17.27564046380349,
"learning_rate": 2.826315480550129e-07,
"logits/chosen": -2.326019763946533,
"logits/rejected": -2.0808887481689453,
"logps/chosen": -590.845458984375,
"logps/rejected": -1011.6871337890625,
"loss": 0.2489,
"rewards/accuracies": 0.90625,
"rewards/chosen": -3.986447811126709,
"rewards/margins": 3.542587995529175,
"rewards/rejected": -7.529036045074463,
"step": 260
},
{
"epoch": 0.5224248398225727,
"grad_norm": 19.212069446863243,
"learning_rate": 2.740757312632854e-07,
"logits/chosen": -2.414062261581421,
"logits/rejected": -2.197702646255493,
"logps/chosen": -744.2257080078125,
"logps/rejected": -1205.133544921875,
"loss": 0.2221,
"rewards/accuracies": 0.875,
"rewards/chosen": -5.464824676513672,
"rewards/margins": 4.082161903381348,
"rewards/rejected": -9.54698657989502,
"step": 265
},
{
"epoch": 0.5322819122720552,
"grad_norm": 16.242036970306053,
"learning_rate": 2.654913486571487e-07,
"logits/chosen": -2.5215845108032227,
"logits/rejected": -2.260974168777466,
"logps/chosen": -794.4285888671875,
"logps/rejected": -1301.2264404296875,
"loss": 0.3103,
"rewards/accuracies": 0.875,
"rewards/chosen": -5.997513294219971,
"rewards/margins": 4.493828773498535,
"rewards/rejected": -10.491341590881348,
"step": 270
},
{
"epoch": 0.5421389847215377,
"grad_norm": 14.856129841637888,
"learning_rate": 2.5688858559204053e-07,
"logits/chosen": -2.406275987625122,
"logits/rejected": -2.168721914291382,
"logps/chosen": -711.0574951171875,
"logps/rejected": -1197.390380859375,
"loss": 0.2365,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": -5.203823566436768,
"rewards/margins": 4.352889060974121,
"rewards/rejected": -9.55671215057373,
"step": 275
},
{
"epoch": 0.5519960571710202,
"grad_norm": 15.78620841198885,
"learning_rate": 2.4827764923178246e-07,
"logits/chosen": -2.5056891441345215,
"logits/rejected": -2.270139694213867,
"logps/chosen": -740.6078491210938,
"logps/rejected": -1241.2222900390625,
"loss": 0.2371,
"rewards/accuracies": 0.9375,
"rewards/chosen": -5.419187068939209,
"rewards/margins": 4.417618751525879,
"rewards/rejected": -9.83680534362793,
"step": 280
},
{
"epoch": 0.5618531296205027,
"grad_norm": 17.093071523621635,
"learning_rate": 2.3966875643779667e-07,
"logits/chosen": -2.443941593170166,
"logits/rejected": -2.2383456230163574,
"logps/chosen": -725.4220581054688,
"logps/rejected": -1253.807373046875,
"loss": 0.2179,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -5.367009162902832,
"rewards/margins": 4.663185119628906,
"rewards/rejected": -10.030195236206055,
"step": 285
},
{
"epoch": 0.5717102020699852,
"grad_norm": 25.373557062497504,
"learning_rate": 2.3107212164681774e-07,
"logits/chosen": -2.5970406532287598,
"logits/rejected": -2.2234134674072266,
"logps/chosen": -703.3094482421875,
"logps/rejected": -1330.679931640625,
"loss": 0.2351,
"rewards/accuracies": 0.90625,
"rewards/chosen": -5.1501851081848145,
"rewards/margins": 5.526017665863037,
"rewards/rejected": -10.676202774047852,
"step": 290
},
{
"epoch": 0.5815672745194678,
"grad_norm": 39.54586447558642,
"learning_rate": 2.2249794475148019e-07,
"logits/chosen": -2.508376359939575,
"logits/rejected": -2.293508768081665,
"logps/chosen": -826.1845703125,
"logps/rejected": -1299.66845703125,
"loss": 0.2529,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -6.255187034606934,
"rewards/margins": 4.241654872894287,
"rewards/rejected": -10.496840476989746,
"step": 295
},
{
"epoch": 0.5914243469689502,
"grad_norm": 21.452851323361823,
"learning_rate": 2.1395639899816332e-07,
"logits/chosen": -2.580679416656494,
"logits/rejected": -2.2998709678649902,
"logps/chosen": -733.3718872070312,
"logps/rejected": -1240.96533203125,
"loss": 0.229,
"rewards/accuracies": 0.875,
"rewards/chosen": -5.495171070098877,
"rewards/margins": 4.4129509925842285,
"rewards/rejected": -9.908121109008789,
"step": 300
},
{
"epoch": 0.6012814194184327,
"grad_norm": 16.276691413135083,
"learning_rate": 2.0545761891645177e-07,
"logits/chosen": -2.456111431121826,
"logits/rejected": -2.243847608566284,
"logps/chosen": -683.7113037109375,
"logps/rejected": -1161.4580078125,
"loss": 0.2226,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -4.874439239501953,
"rewards/margins": 4.157763957977295,
"rewards/rejected": -9.032203674316406,
"step": 305
},
{
"epoch": 0.6111384918679152,
"grad_norm": 24.661935948628066,
"learning_rate": 1.9701168829453305e-07,
"logits/chosen": -2.6442089080810547,
"logits/rejected": -2.186643600463867,
"logps/chosen": -696.6754150390625,
"logps/rejected": -1296.2703857421875,
"loss": 0.2543,
"rewards/accuracies": 0.875,
"rewards/chosen": -5.091025352478027,
"rewards/margins": 5.146512031555176,
"rewards/rejected": -10.237536430358887,
"step": 310
},
{
"epoch": 0.6209955643173978,
"grad_norm": 14.790443951524152,
"learning_rate": 1.886286282148002e-07,
"logits/chosen": -2.444152355194092,
"logits/rejected": -2.1477932929992676,
"logps/chosen": -734.7506713867188,
"logps/rejected": -1255.331787109375,
"loss": 0.2501,
"rewards/accuracies": 0.90625,
"rewards/chosen": -5.458280563354492,
"rewards/margins": 4.611725807189941,
"rewards/rejected": -10.070005416870117,
"step": 315
},
{
"epoch": 0.6308526367668802,
"grad_norm": 16.271316784779167,
"learning_rate": 1.8031838516385422e-07,
"logits/chosen": -2.369560718536377,
"logits/rejected": -2.1628785133361816,
"logps/chosen": -670.2017822265625,
"logps/rejected": -1158.4266357421875,
"loss": 0.2539,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -4.8326568603515625,
"rewards/margins": 4.299530982971191,
"rewards/rejected": -9.132187843322754,
"step": 320
},
{
"epoch": 0.6407097092163627,
"grad_norm": 17.744715641719303,
"learning_rate": 1.7209081923101472e-07,
"logits/chosen": -2.6445670127868652,
"logits/rejected": -2.266472578048706,
"logps/chosen": -690.3375854492188,
"logps/rejected": -1188.698974609375,
"loss": 0.205,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -4.986203193664551,
"rewards/margins": 4.211085319519043,
"rewards/rejected": -9.19728946685791,
"step": 325
},
{
"epoch": 0.6505667816658453,
"grad_norm": 21.850943779213573,
"learning_rate": 1.639556924093404e-07,
"logits/chosen": -2.358119249343872,
"logits/rejected": -2.1153066158294678,
"logps/chosen": -764.6770629882812,
"logps/rejected": -1240.2838134765625,
"loss": 0.2799,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -5.743839263916016,
"rewards/margins": 4.227473735809326,
"rewards/rejected": -9.971312522888184,
"step": 330
},
{
"epoch": 0.6604238541153278,
"grad_norm": 34.1719257406542,
"learning_rate": 1.5592265701304114e-07,
"logits/chosen": -2.375866413116455,
"logits/rejected": -2.240598678588867,
"logps/chosen": -763.527099609375,
"logps/rejected": -1258.715576171875,
"loss": 0.2564,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -5.7242560386657715,
"rewards/margins": 4.458041667938232,
"rewards/rejected": -10.18229866027832,
"step": 335
},
{
"epoch": 0.6702809265648102,
"grad_norm": 18.71419612814259,
"learning_rate": 1.4800124422502334e-07,
"logits/chosen": -2.519636631011963,
"logits/rejected": -2.2316250801086426,
"logps/chosen": -762.00439453125,
"logps/rejected": -1267.931396484375,
"loss": 0.2514,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -5.641887664794922,
"rewards/margins": 4.440293312072754,
"rewards/rejected": -10.08218002319336,
"step": 340
},
{
"epoch": 0.6801379990142927,
"grad_norm": 18.664999037354942,
"learning_rate": 1.4020085278815743e-07,
"logits/chosen": -2.458855628967285,
"logits/rejected": -2.2174124717712402,
"logps/chosen": -758.8146362304688,
"logps/rejected": -1192.955322265625,
"loss": 0.2308,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -5.576407432556152,
"rewards/margins": 3.762500762939453,
"rewards/rejected": -9.338907241821289,
"step": 345
},
{
"epoch": 0.6899950714637753,
"grad_norm": 19.00593669045522,
"learning_rate": 1.3253073785368545e-07,
"logits/chosen": -2.4038822650909424,
"logits/rejected": -2.114386796951294,
"logps/chosen": -628.0557250976562,
"logps/rejected": -1128.370849609375,
"loss": 0.2821,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -4.437934875488281,
"rewards/margins": 4.355624198913574,
"rewards/rejected": -8.793559074401855,
"step": 350
},
{
"epoch": 0.6998521439132578,
"grad_norm": 13.437480532125694,
"learning_rate": 1.2500000000000005e-07,
"logits/chosen": -2.367783308029175,
"logits/rejected": -2.1017680168151855,
"logps/chosen": -647.2433471679688,
"logps/rejected": -1109.231689453125,
"loss": 0.2264,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": -4.586766242980957,
"rewards/margins": 4.0332794189453125,
"rewards/rejected": -8.62004566192627,
"step": 355
},
{
"epoch": 0.7097092163627403,
"grad_norm": 23.214131611033924,
"learning_rate": 1.1761757443482285e-07,
"logits/chosen": -2.4149296283721924,
"logits/rejected": -2.0817036628723145,
"logps/chosen": -711.5889892578125,
"logps/rejected": -1216.048095703125,
"loss": 0.2471,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -5.251239776611328,
"rewards/margins": 4.419920921325684,
"rewards/rejected": -9.671161651611328,
"step": 360
},
{
"epoch": 0.7195662888122227,
"grad_norm": 21.14502188501099,
"learning_rate": 1.1039222039359644e-07,
"logits/chosen": -2.5779356956481934,
"logits/rejected": -2.228896141052246,
"logps/chosen": -739.5020751953125,
"logps/rejected": -1144.7041015625,
"loss": 0.2331,
"rewards/accuracies": 0.90625,
"rewards/chosen": -5.435536861419678,
"rewards/margins": 3.5643341541290283,
"rewards/rejected": -8.999870300292969,
"step": 365
},
{
"epoch": 0.7294233612617053,
"grad_norm": 24.352395974541345,
"learning_rate": 1.0333251074666608e-07,
"logits/chosen": -2.4502475261688232,
"logits/rejected": -2.300096035003662,
"logps/chosen": -781.7764282226562,
"logps/rejected": -1243.557373046875,
"loss": 0.2034,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -5.88522481918335,
"rewards/margins": 4.11319637298584,
"rewards/rejected": -9.998421669006348,
"step": 370
},
{
"epoch": 0.7392804337111878,
"grad_norm": 12.733858279084933,
"learning_rate": 9.644682182758304e-08,
"logits/chosen": -2.5493714809417725,
"logits/rejected": -2.2471814155578613,
"logps/chosen": -801.8941650390625,
"logps/rejected": -1274.67529296875,
"loss": 0.2314,
"rewards/accuracies": 0.9375,
"rewards/chosen": -6.090977668762207,
"rewards/margins": 4.2742109298706055,
"rewards/rejected": -10.365188598632812,
"step": 375
},
{
"epoch": 0.7491375061606703,
"grad_norm": 30.453462939771114,
"learning_rate": 8.974332349459992e-08,
"logits/chosen": -2.3520667552948,
"logits/rejected": -2.144470691680908,
"logps/chosen": -808.9397583007812,
"logps/rejected": -1302.123291015625,
"loss": 0.2251,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -6.170880317687988,
"rewards/margins": 4.420655727386475,
"rewards/rejected": -10.591535568237305,
"step": 380
},
{
"epoch": 0.7589945786101527,
"grad_norm": 25.800864953205974,
"learning_rate": 8.322996943714672e-08,
"logits/chosen": -2.4617538452148438,
"logits/rejected": -2.170855760574341,
"logps/chosen": -752.3043212890625,
"logps/rejected": -1303.364013671875,
"loss": 0.2474,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -5.609736919403076,
"rewards/margins": 4.878790378570557,
"rewards/rejected": -10.488527297973633,
"step": 385
},
{
"epoch": 0.7688516510596353,
"grad_norm": 20.367448051714003,
"learning_rate": 7.691448773879256e-08,
"logits/chosen": -2.631474018096924,
"logits/rejected": -2.1774039268493652,
"logps/chosen": -788.4654541015625,
"logps/rejected": -1410.991455078125,
"loss": 0.219,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": -5.944725036621094,
"rewards/margins": 5.383862495422363,
"rewards/rejected": -11.328587532043457,
"step": 390
},
{
"epoch": 0.7787087235091178,
"grad_norm": 22.316993245593054,
"learning_rate": 7.080437170788722e-08,
"logits/chosen": -2.5305416584014893,
"logits/rejected": -2.246816873550415,
"logps/chosen": -782.3768310546875,
"logps/rejected": -1279.210693359375,
"loss": 0.267,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -5.886050701141357,
"rewards/margins": 4.281658172607422,
"rewards/rejected": -10.167708396911621,
"step": 395
},
{
"epoch": 0.7885657959586003,
"grad_norm": 25.32693997023262,
"learning_rate": 6.490687098676332e-08,
"logits/chosen": -2.4314379692077637,
"logits/rejected": -2.1938157081604004,
"logps/chosen": -747.9923095703125,
"logps/rejected": -1171.9027099609375,
"loss": 0.2606,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -5.560776710510254,
"rewards/margins": 3.7204151153564453,
"rewards/rejected": -9.2811918258667,
"step": 400
},
{
"epoch": 0.7885657959586003,
"eval_logits/chosen": -2.784451484680176,
"eval_logits/rejected": -2.6733083724975586,
"eval_logps/chosen": -513.8394165039062,
"eval_logps/rejected": -600.927978515625,
"eval_loss": 0.5123496651649475,
"eval_rewards/accuracies": 0.7782257795333862,
"eval_rewards/chosen": -2.5094728469848633,
"eval_rewards/margins": 0.760833203792572,
"eval_rewards/rejected": -3.27030611038208,
"eval_runtime": 327.294,
"eval_samples_per_second": 6.037,
"eval_steps_per_second": 0.379,
"step": 400
},
{
"epoch": 0.7984228684080829,
"grad_norm": 16.407923464923826,
"learning_rate": 5.9228982950048414e-08,
"logits/chosen": -2.4307689666748047,
"logits/rejected": -2.029819965362549,
"logps/chosen": -701.4022827148438,
"logps/rejected": -1147.330322265625,
"loss": 0.227,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -5.128365993499756,
"rewards/margins": 3.8555781841278076,
"rewards/rejected": -8.9839448928833,
"step": 405
},
{
"epoch": 0.8082799408575653,
"grad_norm": 23.480190565228476,
"learning_rate": 5.3777444402291345e-08,
"logits/chosen": -2.4188990592956543,
"logits/rejected": -2.1691110134124756,
"logps/chosen": -730.129638671875,
"logps/rejected": -1147.34521484375,
"loss": 0.2563,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -5.437844753265381,
"rewards/margins": 3.608722686767578,
"rewards/rejected": -9.0465669631958,
"step": 410
},
{
"epoch": 0.8181370133070478,
"grad_norm": 14.849124963520776,
"learning_rate": 4.855872358475546e-08,
"logits/chosen": -2.4617886543273926,
"logits/rejected": -2.174734592437744,
"logps/chosen": -733.2481079101562,
"logps/rejected": -1173.011962890625,
"loss": 0.2203,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -5.399328708648682,
"rewards/margins": 3.8578476905822754,
"rewards/rejected": -9.257177352905273,
"step": 415
},
{
"epoch": 0.8279940857565303,
"grad_norm": 14.728191064922497,
"learning_rate": 4.357901250086107e-08,
"logits/chosen": -2.605170488357544,
"logits/rejected": -2.1935315132141113,
"logps/chosen": -696.6317749023438,
"logps/rejected": -1134.62890625,
"loss": 0.2169,
"rewards/accuracies": 0.875,
"rewards/chosen": -5.047616481781006,
"rewards/margins": 3.7440898418426514,
"rewards/rejected": -8.791706085205078,
"step": 420
},
{
"epoch": 0.8378511582060129,
"grad_norm": 21.656855077862126,
"learning_rate": 3.884421956938377e-08,
"logits/chosen": -2.443837881088257,
"logits/rejected": -2.016244649887085,
"logps/chosen": -734.0977783203125,
"logps/rejected": -1261.663818359375,
"loss": 0.2601,
"rewards/accuracies": 0.9375,
"rewards/chosen": -5.3997087478637695,
"rewards/margins": 4.594438552856445,
"rewards/rejected": -9.994146347045898,
"step": 425
},
{
"epoch": 0.8477082306554953,
"grad_norm": 22.361665031765803,
"learning_rate": 3.435996261412591e-08,
"logits/chosen": -2.4327638149261475,
"logits/rejected": -2.148250102996826,
"logps/chosen": -736.1185913085938,
"logps/rejected": -1234.297607421875,
"loss": 0.2572,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -5.458459854125977,
"rewards/margins": 4.413580894470215,
"rewards/rejected": -9.872041702270508,
"step": 430
},
{
"epoch": 0.8575653031049778,
"grad_norm": 34.61319327349877,
"learning_rate": 3.013156219837776e-08,
"logits/chosen": -2.418109655380249,
"logits/rejected": -2.1507232189178467,
"logps/chosen": -754.8319091796875,
"logps/rejected": -1258.600830078125,
"loss": 0.2446,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -5.640649795532227,
"rewards/margins": 4.45644998550415,
"rewards/rejected": -10.097099304199219,
"step": 435
},
{
"epoch": 0.8674223755544603,
"grad_norm": 17.275605835829435,
"learning_rate": 2.6164035312078447e-08,
"logits/chosen": -2.610421657562256,
"logits/rejected": -2.200122356414795,
"logps/chosen": -775.4208984375,
"logps/rejected": -1280.5159912109375,
"loss": 0.2162,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": -5.812338829040527,
"rewards/margins": 4.107577800750732,
"rewards/rejected": -9.919916152954102,
"step": 440
},
{
"epoch": 0.8772794480039429,
"grad_norm": 19.69767066811732,
"learning_rate": 2.2462089419165776e-08,
"logits/chosen": -2.454554319381714,
"logits/rejected": -2.129283905029297,
"logps/chosen": -772.8093872070312,
"logps/rejected": -1197.3636474609375,
"loss": 0.2623,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -5.787473201751709,
"rewards/margins": 3.7656428813934326,
"rewards/rejected": -9.553116798400879,
"step": 445
},
{
"epoch": 0.8871365204534253,
"grad_norm": 16.68762717345223,
"learning_rate": 1.9030116872178314e-08,
"logits/chosen": -2.3323419094085693,
"logits/rejected": -2.1572489738464355,
"logps/chosen": -730.7281494140625,
"logps/rejected": -1192.001220703125,
"loss": 0.2348,
"rewards/accuracies": 0.875,
"rewards/chosen": -5.397282600402832,
"rewards/margins": 4.056326866149902,
"rewards/rejected": -9.453609466552734,
"step": 450
},
{
"epoch": 0.8969935929029078,
"grad_norm": 20.379506675051896,
"learning_rate": 1.5872189700736337e-08,
"logits/chosen": -2.3889849185943604,
"logits/rejected": -2.237183094024658,
"logps/chosen": -754.2752075195312,
"logps/rejected": -1230.703125,
"loss": 0.1963,
"rewards/accuracies": 0.9375,
"rewards/chosen": -5.620961666107178,
"rewards/margins": 4.211625099182129,
"rewards/rejected": -9.832587242126465,
"step": 455
},
{
"epoch": 0.9068506653523903,
"grad_norm": 17.181830991927878,
"learning_rate": 1.2992054780085692e-08,
"logits/chosen": -2.495082139968872,
"logits/rejected": -2.1834959983825684,
"logps/chosen": -710.3396606445312,
"logps/rejected": -1224.693603515625,
"loss": 0.2459,
"rewards/accuracies": 0.90625,
"rewards/chosen": -5.253153324127197,
"rewards/margins": 4.523493766784668,
"rewards/rejected": -9.776647567749023,
"step": 460
},
{
"epoch": 0.9167077378018729,
"grad_norm": 17.458158491525015,
"learning_rate": 1.0393129385436823e-08,
"logits/chosen": -2.5279664993286133,
"logits/rejected": -2.2738842964172363,
"logps/chosen": -760.11962890625,
"logps/rejected": -1247.168212890625,
"loss": 0.2254,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": -5.765726566314697,
"rewards/margins": 4.349237442016602,
"rewards/rejected": -10.11496353149414,
"step": 465
},
{
"epoch": 0.9265648102513554,
"grad_norm": 22.242802721359375,
"learning_rate": 8.078497137373242e-09,
"logits/chosen": -2.6163723468780518,
"logits/rejected": -2.2263712882995605,
"logps/chosen": -774.3194580078125,
"logps/rejected": -1314.293212890625,
"loss": 0.2375,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -5.78206205368042,
"rewards/margins": 4.660614490509033,
"rewards/rejected": -10.442676544189453,
"step": 470
},
{
"epoch": 0.9364218827008378,
"grad_norm": 17.50981352291082,
"learning_rate": 6.0509043431410945e-09,
"logits/chosen": -2.4221930503845215,
"logits/rejected": -2.220930814743042,
"logps/chosen": -804.204345703125,
"logps/rejected": -1225.1212158203125,
"loss": 0.2554,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": -6.063734531402588,
"rewards/margins": 3.7845940589904785,
"rewards/rejected": -9.848328590393066,
"step": 475
},
{
"epoch": 0.9462789551503203,
"grad_norm": 19.77591605111257,
"learning_rate": 4.312756738160145e-09,
"logits/chosen": -2.5149528980255127,
"logits/rejected": -2.154731512069702,
"logps/chosen": -768.4055786132812,
"logps/rejected": -1274.7755126953125,
"loss": 0.2458,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -5.735711574554443,
"rewards/margins": 4.444643974304199,
"rewards/rejected": -10.180355072021484,
"step": 480
},
{
"epoch": 0.9561360275998029,
"grad_norm": 16.377470184235065,
"learning_rate": 2.8661166316229223e-09,
"logits/chosen": -2.3629002571105957,
"logits/rejected": -2.151808738708496,
"logps/chosen": -723.3502197265625,
"logps/rejected": -1148.914794921875,
"loss": 0.2446,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -5.385800361633301,
"rewards/margins": 3.7826755046844482,
"rewards/rejected": -9.168476104736328,
"step": 485
},
{
"epoch": 0.9659931000492854,
"grad_norm": 15.391953903269371,
"learning_rate": 1.7127004595681727e-09,
"logits/chosen": -2.5323967933654785,
"logits/rejected": -2.1350226402282715,
"logps/chosen": -750.980712890625,
"logps/rejected": -1350.3404541015625,
"loss": 0.2446,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": -5.573115348815918,
"rewards/margins": 5.145482063293457,
"rewards/rejected": -10.718597412109375,
"step": 490
},
{
"epoch": 0.9758501724987678,
"grad_norm": 16.723832751333937,
"learning_rate": 8.538767483325383e-10,
"logits/chosen": -2.532517433166504,
"logits/rejected": -2.180654525756836,
"logps/chosen": -762.9144287109375,
"logps/rejected": -1284.332763671875,
"loss": 0.2173,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": -5.670698165893555,
"rewards/margins": 4.554699897766113,
"rewards/rejected": -10.225398063659668,
"step": 495
},
{
"epoch": 0.9857072449482503,
"grad_norm": 26.735630621648028,
"learning_rate": 2.9066449079634404e-10,
"logits/chosen": -2.5005085468292236,
"logits/rejected": -2.2136847972869873,
"logps/chosen": -738.4940185546875,
"logps/rejected": -1214.4185791015625,
"loss": 0.2165,
"rewards/accuracies": 0.9375,
"rewards/chosen": -5.457156658172607,
"rewards/margins": 4.181014060974121,
"rewards/rejected": -9.63817024230957,
"step": 500
},
{
"epoch": 0.9955643173977329,
"grad_norm": 17.938477728126337,
"learning_rate": 2.3731937350224273e-11,
"logits/chosen": -2.449402093887329,
"logits/rejected": -2.1034648418426514,
"logps/chosen": -783.5701293945312,
"logps/rejected": -1268.8685302734375,
"loss": 0.2476,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -5.870804786682129,
"rewards/margins": 4.206555366516113,
"rewards/rejected": -10.077360153198242,
"step": 505
},
{
"epoch": 0.9995071463775259,
"step": 507,
"total_flos": 0.0,
"train_loss": 0.30356378627011527,
"train_runtime": 18867.8748,
"train_samples_per_second": 3.441,
"train_steps_per_second": 0.027
}
],
"logging_steps": 5,
"max_steps": 507,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}
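
The log above can be inspected programmatically. Below is a minimal sketch (not part of the original file), assuming the JSON is saved locally as trainer_state.json and matplotlib is installed; it plots the per-step training loss and reward margins from log_history.

```python
import json
import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Per-step training records carry a "loss" key; the eval record and the
# final summary use "eval_loss" / "train_loss" instead, so they are skipped.
train_log = [e for e in state["log_history"] if "loss" in e]

steps = [e["step"] for e in train_log]
loss = [e["loss"] for e in train_log]
margin = [e["rewards/margins"] for e in train_log]

fig, ax = plt.subplots()
ax.plot(steps, loss, label="train loss")
ax.plot(steps, margin, label="rewards/margins")
ax.set_xlabel("step")
ax.legend()
plt.show()
```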