phi3-dpo / trainer_state.json
ludekcizinsky's picture
Upload folder using huggingface_hub
013e8b5 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 1589,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0031466331025802393,
"grad_norm": 8.804184913635254,
"learning_rate": 1.257861635220126e-06,
"logits/chosen": -0.11013289541006088,
"logits/rejected": -0.5208367109298706,
"logps/chosen": -0.8537980914115906,
"logps/rejected": -1.0550096035003662,
"loss": 24.9985,
"rewards/accuracies": 0.3125,
"rewards/chosen": -5.359128408599645e-06,
"rewards/margins": 1.545622944831848e-05,
"rewards/rejected": -2.081535967590753e-05,
"step": 5
},
{
"epoch": 0.0062932662051604785,
"grad_norm": 18.678768157958984,
"learning_rate": 2.2641509433962266e-06,
"logits/chosen": -0.3030635714530945,
"logits/rejected": -0.5435053706169128,
"logps/chosen": -0.9865642786026001,
"logps/rejected": -1.107262372970581,
"loss": 24.9967,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.00010868474782910198,
"rewards/margins": 3.348257814650424e-05,
"rewards/rejected": -0.00014216733688954264,
"step": 10
},
{
"epoch": 0.009439899307740718,
"grad_norm": 11.281435012817383,
"learning_rate": 3.5220125786163524e-06,
"logits/chosen": -0.5111545324325562,
"logits/rejected": -0.8536307215690613,
"logps/chosen": -1.0305876731872559,
"logps/rejected": -1.2494089603424072,
"loss": 24.9847,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.00031705142464488745,
"rewards/margins": 0.000152838954818435,
"rewards/rejected": -0.00046989036491140723,
"step": 15
},
{
"epoch": 0.012586532410320957,
"grad_norm": 59.5455322265625,
"learning_rate": 4.528301886792453e-06,
"logits/chosen": -0.616014838218689,
"logits/rejected": -0.6851056218147278,
"logps/chosen": -1.130916953086853,
"logps/rejected": -1.46986985206604,
"loss": 24.9645,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.0012707245768979192,
"rewards/margins": 0.0003901523014064878,
"rewards/rejected": -0.0016608769074082375,
"step": 20
},
{
"epoch": 0.015733165512901194,
"grad_norm": 9.532500267028809,
"learning_rate": 5.786163522012579e-06,
"logits/chosen": -0.12423186004161835,
"logits/rejected": -0.4599896967411041,
"logps/chosen": -0.8485546112060547,
"logps/rejected": -1.0018525123596191,
"loss": 24.9267,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.001064571551978588,
"rewards/margins": 0.0007417487213388085,
"rewards/rejected": -0.0018063202733173966,
"step": 25
},
{
"epoch": 0.018879798615481436,
"grad_norm": 11.064372062683105,
"learning_rate": 7.044025157232705e-06,
"logits/chosen": -0.1580429971218109,
"logits/rejected": -0.38266992568969727,
"logps/chosen": -0.8662201166152954,
"logps/rejected": -1.0262982845306396,
"loss": 24.8872,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.0021042851731181145,
"rewards/margins": 0.0011668736115098,
"rewards/rejected": -0.0032711587846279144,
"step": 30
},
{
"epoch": 0.022026431718061675,
"grad_norm": 37.646690368652344,
"learning_rate": 8.301886792452832e-06,
"logits/chosen": 0.026471847668290138,
"logits/rejected": -0.49966034293174744,
"logps/chosen": -0.8883110880851746,
"logps/rejected": -1.199055790901184,
"loss": 24.6732,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.003758195089176297,
"rewards/margins": 0.0034271504264324903,
"rewards/rejected": -0.0071853450499475,
"step": 35
},
{
"epoch": 0.025173064820641914,
"grad_norm": 34.88091278076172,
"learning_rate": 9.559748427672956e-06,
"logits/chosen": -0.36181551218032837,
"logits/rejected": -0.6659843325614929,
"logps/chosen": -0.9565097689628601,
"logps/rejected": -1.183980941772461,
"loss": 24.6032,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.007488996721804142,
"rewards/margins": 0.004166100639849901,
"rewards/rejected": -0.011655096895992756,
"step": 40
},
{
"epoch": 0.028319697923222153,
"grad_norm": 20.635541915893555,
"learning_rate": 1.0817610062893083e-05,
"logits/chosen": -0.5469181537628174,
"logits/rejected": -0.7580572366714478,
"logps/chosen": -1.0930149555206299,
"logps/rejected": -1.2114075422286987,
"loss": 24.7536,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.0149534298107028,
"rewards/margins": 0.002928597154095769,
"rewards/rejected": -0.017882030457258224,
"step": 45
},
{
"epoch": 0.03146633102580239,
"grad_norm": 36.45863723754883,
"learning_rate": 1.2075471698113209e-05,
"logits/chosen": -0.5085287094116211,
"logits/rejected": -0.7208930253982544,
"logps/chosen": -1.083438515663147,
"logps/rejected": -1.3884985446929932,
"loss": 23.9964,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.018525470048189163,
"rewards/margins": 0.011683688499033451,
"rewards/rejected": -0.03020915761590004,
"step": 50
},
{
"epoch": 0.034612964128382634,
"grad_norm": 45.08958053588867,
"learning_rate": 1.3081761006289308e-05,
"logits/chosen": -0.6235328912734985,
"logits/rejected": -0.8463523983955383,
"logps/chosen": -1.1567853689193726,
"logps/rejected": -2.149567127227783,
"loss": 23.8291,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.028398022055625916,
"rewards/margins": 0.03046615794301033,
"rewards/rejected": -0.05886417627334595,
"step": 55
},
{
"epoch": 0.03775959723096287,
"grad_norm": 48.65830612182617,
"learning_rate": 1.408805031446541e-05,
"logits/chosen": -0.6318017244338989,
"logits/rejected": -0.9939996600151062,
"logps/chosen": -1.7119739055633545,
"logps/rejected": -2.3402199745178223,
"loss": 23.5592,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.060618169605731964,
"rewards/margins": 0.028203105553984642,
"rewards/rejected": -0.08882127702236176,
"step": 60
},
{
"epoch": 0.04090623033354311,
"grad_norm": 138.4451904296875,
"learning_rate": 1.5345911949685536e-05,
"logits/chosen": -0.9717090725898743,
"logits/rejected": -1.1959871053695679,
"logps/chosen": -1.9082473516464233,
"logps/rejected": -2.449486494064331,
"loss": 22.7524,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.08517814427614212,
"rewards/margins": 0.03464372828602791,
"rewards/rejected": -0.11982186883687973,
"step": 65
},
{
"epoch": 0.04405286343612335,
"grad_norm": 49.257568359375,
"learning_rate": 1.6603773584905664e-05,
"logits/chosen": -0.7433441281318665,
"logits/rejected": -1.0399134159088135,
"logps/chosen": -2.255545139312744,
"logps/rejected": -2.98321533203125,
"loss": 23.4113,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.11126575618982315,
"rewards/margins": 0.04789165034890175,
"rewards/rejected": -0.159157395362854,
"step": 70
},
{
"epoch": 0.04719949653870359,
"grad_norm": 56.168670654296875,
"learning_rate": 1.7861635220125788e-05,
"logits/chosen": -1.0234445333480835,
"logits/rejected": -1.288999080657959,
"logps/chosen": -1.653058648109436,
"logps/rejected": -2.370941162109375,
"loss": 22.181,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.08025868237018585,
"rewards/margins": 0.044546954333782196,
"rewards/rejected": -0.12480561435222626,
"step": 75
},
{
"epoch": 0.05034612964128383,
"grad_norm": NaN,
"learning_rate": 1.8867924528301888e-05,
"logits/chosen": -1.1835613250732422,
"logits/rejected": -1.4036767482757568,
"logps/chosen": -1.90883469581604,
"logps/rejected": -2.1450889110565186,
"loss": 25.8869,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.09280741214752197,
"rewards/margins": 0.021463319659233093,
"rewards/rejected": -0.11427073180675507,
"step": 80
},
{
"epoch": 0.05349276274386407,
"grad_norm": 103.32862091064453,
"learning_rate": 2.0125786163522016e-05,
"logits/chosen": -1.539898157119751,
"logits/rejected": -1.6518356800079346,
"logps/chosen": -2.0776684284210205,
"logps/rejected": -2.5599067211151123,
"loss": 24.1212,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.12567706406116486,
"rewards/margins": 0.026400262489914894,
"rewards/rejected": -0.152077317237854,
"step": 85
},
{
"epoch": 0.056639395846444306,
"grad_norm": 176.14784240722656,
"learning_rate": 2.138364779874214e-05,
"logits/chosen": -1.3818124532699585,
"logits/rejected": -1.5843524932861328,
"logps/chosen": -2.48514461517334,
"logps/rejected": -2.8115108013153076,
"loss": 26.3518,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.15766306221485138,
"rewards/margins": 0.025023411959409714,
"rewards/rejected": -0.1826864778995514,
"step": 90
},
{
"epoch": 0.059786028949024544,
"grad_norm": 106.15309143066406,
"learning_rate": 2.2641509433962265e-05,
"logits/chosen": -1.5876004695892334,
"logits/rejected": -1.7525005340576172,
"logps/chosen": -2.2529654502868652,
"logps/rejected": -3.2197937965393066,
"loss": 20.8074,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.14453086256980896,
"rewards/margins": 0.07421146333217621,
"rewards/rejected": -0.21874232590198517,
"step": 95
},
{
"epoch": 0.06293266205160478,
"grad_norm": 91.7872085571289,
"learning_rate": 2.3899371069182393e-05,
"logits/chosen": -1.6880552768707275,
"logits/rejected": -1.667741060256958,
"logps/chosen": -3.5453040599823,
"logps/rejected": -3.8808798789978027,
"loss": 24.6555,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.2320992648601532,
"rewards/margins": 0.020265836268663406,
"rewards/rejected": -0.2523651123046875,
"step": 100
},
{
"epoch": 0.06607929515418502,
"grad_norm": 778.8959350585938,
"learning_rate": 2.4905660377358492e-05,
"logits/chosen": -1.8318984508514404,
"logits/rejected": -1.8932411670684814,
"logps/chosen": -3.125164031982422,
"logps/rejected": -4.746774673461914,
"loss": 27.3293,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.19288301467895508,
"rewards/margins": 0.096275694668293,
"rewards/rejected": -0.28915873169898987,
"step": 105
},
{
"epoch": 0.06922592825676527,
"grad_norm": 132.40354919433594,
"learning_rate": 2.6163522012578617e-05,
"logits/chosen": -1.7445348501205444,
"logits/rejected": -1.902320146560669,
"logps/chosen": -1.9325546026229858,
"logps/rejected": -3.3019511699676514,
"loss": 21.7317,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.10555150359869003,
"rewards/margins": 0.07426220178604126,
"rewards/rejected": -0.1798136979341507,
"step": 110
},
{
"epoch": 0.0723725613593455,
"grad_norm": 98.48819732666016,
"learning_rate": 2.742138364779874e-05,
"logits/chosen": -1.7994951009750366,
"logits/rejected": -1.9057296514511108,
"logps/chosen": -2.1663613319396973,
"logps/rejected": -2.82452392578125,
"loss": 22.7429,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.1297454833984375,
"rewards/margins": 0.04077299311757088,
"rewards/rejected": -0.17051845788955688,
"step": 115
},
{
"epoch": 0.07551919446192575,
"grad_norm": 93.13198852539062,
"learning_rate": 2.867924528301887e-05,
"logits/chosen": -1.6606374979019165,
"logits/rejected": -1.7864787578582764,
"logps/chosen": -2.2936453819274902,
"logps/rejected": -2.5704400539398193,
"loss": 24.0989,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.12692785263061523,
"rewards/margins": 0.020071204751729965,
"rewards/rejected": -0.1469990611076355,
"step": 120
},
{
"epoch": 0.07866582756450598,
"grad_norm": 101.10535430908203,
"learning_rate": 2.968553459119497e-05,
"logits/chosen": -1.648816704750061,
"logits/rejected": -1.6658546924591064,
"logps/chosen": -2.0479884147644043,
"logps/rejected": -2.8278560638427734,
"loss": 27.9983,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.11854568868875504,
"rewards/margins": 0.0439017117023468,
"rewards/rejected": -0.16244739294052124,
"step": 125
},
{
"epoch": 0.08181246066708622,
"grad_norm": 92.93740844726562,
"learning_rate": 3.09433962264151e-05,
"logits/chosen": -1.7306410074234009,
"logits/rejected": -1.8349339962005615,
"logps/chosen": -2.082920551300049,
"logps/rejected": -3.115952253341675,
"loss": 23.5299,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.11979808658361435,
"rewards/margins": 0.06730449199676514,
"rewards/rejected": -0.18710258603096008,
"step": 130
},
{
"epoch": 0.08495909376966645,
"grad_norm": 123.31324768066406,
"learning_rate": 3.220125786163522e-05,
"logits/chosen": -1.8235572576522827,
"logits/rejected": -1.8541405200958252,
"logps/chosen": -1.9667946100234985,
"logps/rejected": -2.772089958190918,
"loss": 22.6137,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.10746946185827255,
"rewards/margins": 0.04866869002580643,
"rewards/rejected": -0.15613815188407898,
"step": 135
},
{
"epoch": 0.0881057268722467,
"grad_norm": 126.42218017578125,
"learning_rate": 3.345911949685535e-05,
"logits/chosen": -1.674515962600708,
"logits/rejected": -1.8894662857055664,
"logps/chosen": -2.245245933532715,
"logps/rejected": -3.0301966667175293,
"loss": 22.6984,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.11901465803384781,
"rewards/margins": 0.049767203629016876,
"rewards/rejected": -0.16878187656402588,
"step": 140
},
{
"epoch": 0.09125235997482693,
"grad_norm": 114.61023712158203,
"learning_rate": 3.471698113207548e-05,
"logits/chosen": -1.7905619144439697,
"logits/rejected": -1.8821656703948975,
"logps/chosen": -3.373708724975586,
"logps/rejected": -4.691153526306152,
"loss": 22.2353,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.18414758145809174,
"rewards/margins": 0.0776277631521225,
"rewards/rejected": -0.26177531480789185,
"step": 145
},
{
"epoch": 0.09439899307740718,
"grad_norm": 296.22955322265625,
"learning_rate": 3.59748427672956e-05,
"logits/chosen": -1.654166579246521,
"logits/rejected": -1.847495436668396,
"logps/chosen": -3.2497410774230957,
"logps/rejected": -4.303386688232422,
"loss": 20.9992,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.22493870556354523,
"rewards/margins": 0.07029641419649124,
"rewards/rejected": -0.2952350974082947,
"step": 150
},
{
"epoch": 0.09754562617998741,
"grad_norm": 579.211669921875,
"learning_rate": 3.7232704402515726e-05,
"logits/chosen": -1.6689754724502563,
"logits/rejected": -1.7173473834991455,
"logps/chosen": -3.7695910930633545,
"logps/rejected": -4.783900260925293,
"loss": 25.2195,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.2695242762565613,
"rewards/margins": 0.05760473012924194,
"rewards/rejected": -0.3271290063858032,
"step": 155
},
{
"epoch": 0.10069225928256766,
"grad_norm": 200.13609313964844,
"learning_rate": 3.8490566037735854e-05,
"logits/chosen": -1.7428325414657593,
"logits/rejected": -1.74752938747406,
"logps/chosen": -3.6156649589538574,
"logps/rejected": -4.805546760559082,
"loss": 22.7118,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.2704419791698456,
"rewards/margins": 0.06444540619850159,
"rewards/rejected": -0.33488741517066956,
"step": 160
},
{
"epoch": 0.10383889238514789,
"grad_norm": 172.51622009277344,
"learning_rate": 3.9748427672955975e-05,
"logits/chosen": -1.7474384307861328,
"logits/rejected": -1.7428706884384155,
"logps/chosen": -3.276729106903076,
"logps/rejected": -4.082120418548584,
"loss": 22.3077,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.2311684638261795,
"rewards/margins": 0.051828593015670776,
"rewards/rejected": -0.2829970717430115,
"step": 165
},
{
"epoch": 0.10698552548772813,
"grad_norm": 146.08352661132812,
"learning_rate": 3.9999227773220194e-05,
"logits/chosen": -1.6052366495132446,
"logits/rejected": -1.6235520839691162,
"logps/chosen": -3.030139207839966,
"logps/rejected": -4.707204818725586,
"loss": 20.0014,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.21569938957691193,
"rewards/margins": 0.12310032546520233,
"rewards/rejected": -0.33879974484443665,
"step": 170
},
{
"epoch": 0.11013215859030837,
"grad_norm": 133.93203735351562,
"learning_rate": 3.9996090704130684e-05,
"logits/chosen": -1.7021839618682861,
"logits/rejected": -1.7295335531234741,
"logps/chosen": -3.9147982597351074,
"logps/rejected": -5.332208633422852,
"loss": 20.047,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.3039882779121399,
"rewards/margins": 0.1180083155632019,
"rewards/rejected": -0.4219965934753418,
"step": 175
},
{
"epoch": 0.11327879169288861,
"grad_norm": 558.7332763671875,
"learning_rate": 3.999054090678532e-05,
"logits/chosen": -1.5368597507476807,
"logits/rejected": -1.592637300491333,
"logps/chosen": -6.026860237121582,
"logps/rejected": -6.550711631774902,
"loss": 29.6933,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.4695097804069519,
"rewards/margins": 0.02213056944310665,
"rewards/rejected": -0.4916403889656067,
"step": 180
},
{
"epoch": 0.11642542479546884,
"grad_norm": 212.05760192871094,
"learning_rate": 3.9982579050822615e-05,
"logits/chosen": -1.5933212041854858,
"logits/rejected": -1.5753694772720337,
"logps/chosen": -4.716382026672363,
"logps/rejected": -5.257371425628662,
"loss": 27.5815,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.3512773811817169,
"rewards/margins": 0.033867720514535904,
"rewards/rejected": -0.3851450979709625,
"step": 185
},
{
"epoch": 0.11957205789804909,
"grad_norm": 134.0122833251953,
"learning_rate": 3.997220609692011e-05,
"logits/chosen": -1.6495559215545654,
"logits/rejected": -1.6725133657455444,
"logps/chosen": -3.984989643096924,
"logps/rejected": -5.001562595367432,
"loss": 22.6766,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.2919956147670746,
"rewards/margins": 0.05422482639551163,
"rewards/rejected": -0.3462204337120056,
"step": 190
},
{
"epoch": 0.12271869100062932,
"grad_norm": 151.4617919921875,
"learning_rate": 3.9959423296678384e-05,
"logits/chosen": -1.7128961086273193,
"logits/rejected": -1.6318174600601196,
"logps/chosen": -3.3435721397399902,
"logps/rejected": -4.078289985656738,
"loss": 25.0994,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.23941469192504883,
"rewards/margins": 0.04007013887166977,
"rewards/rejected": -0.2794848084449768,
"step": 195
},
{
"epoch": 0.12586532410320955,
"grad_norm": 115.02042388916016,
"learning_rate": 3.9944232192470094e-05,
"logits/chosen": -1.7137172222137451,
"logits/rejected": -1.7910420894622803,
"logps/chosen": -3.106358051300049,
"logps/rejected": -3.97111439704895,
"loss": 21.6293,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.2061949521303177,
"rewards/margins": 0.04795767739415169,
"rewards/rejected": -0.2541525959968567,
"step": 200
},
{
"epoch": 0.1290119572057898,
"grad_norm": 81.87369537353516,
"learning_rate": 3.992663461725383e-05,
"logits/chosen": -1.5431886911392212,
"logits/rejected": -1.557018518447876,
"logps/chosen": -2.805392026901245,
"logps/rejected": -4.356006622314453,
"loss": 21.8817,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.1901206076145172,
"rewards/margins": 0.0818587988615036,
"rewards/rejected": -0.2719793915748596,
"step": 205
},
{
"epoch": 0.13215859030837004,
"grad_norm": 188.29173278808594,
"learning_rate": 3.990663269435298e-05,
"logits/chosen": -1.6920125484466553,
"logits/rejected": -1.6854931116104126,
"logps/chosen": -3.156735897064209,
"logps/rejected": -4.396471977233887,
"loss": 27.5638,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.20159177482128143,
"rewards/margins": 0.07304342836141586,
"rewards/rejected": -0.2746351957321167,
"step": 210
},
{
"epoch": 0.13530522341095028,
"grad_norm": 141.69142150878906,
"learning_rate": 3.98842288371995e-05,
"logits/chosen": -1.6487762928009033,
"logits/rejected": -1.7372210025787354,
"logps/chosen": -2.6156325340270996,
"logps/rejected": -3.677928924560547,
"loss": 21.5613,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.1552310734987259,
"rewards/margins": 0.0671745091676712,
"rewards/rejected": -0.2224055826663971,
"step": 215
},
{
"epoch": 0.13845185651353054,
"grad_norm": 92.31954193115234,
"learning_rate": 3.985942574904275e-05,
"logits/chosen": -1.677199363708496,
"logits/rejected": -1.6414434909820557,
"logps/chosen": -2.499932050704956,
"logps/rejected": -3.3010895252227783,
"loss": 22.2151,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.16245323419570923,
"rewards/margins": 0.05558500811457634,
"rewards/rejected": -0.21803824603557587,
"step": 220
},
{
"epoch": 0.14159848961611077,
"grad_norm": 106.32303619384766,
"learning_rate": 3.983222642262329e-05,
"logits/chosen": -1.6422779560089111,
"logits/rejected": -1.6500838994979858,
"logps/chosen": -2.66230845451355,
"logps/rejected": -3.7150185108184814,
"loss": 20.2102,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.18372972309589386,
"rewards/margins": 0.08325181156396866,
"rewards/rejected": -0.2669815421104431,
"step": 225
},
{
"epoch": 0.144745122718691,
"grad_norm": 113.5155029296875,
"learning_rate": 3.980263413981178e-05,
"logits/chosen": -1.5669496059417725,
"logits/rejected": -1.5747731924057007,
"logps/chosen": -3.1706671714782715,
"logps/rejected": -3.948005199432373,
"loss": 21.8852,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.2336561679840088,
"rewards/margins": 0.06708776950836182,
"rewards/rejected": -0.300743967294693,
"step": 230
},
{
"epoch": 0.14789175582127123,
"grad_norm": 99.03396606445312,
"learning_rate": 3.977065247121298e-05,
"logits/chosen": -1.639129400253296,
"logits/rejected": -1.6693006753921509,
"logps/chosen": -3.2495856285095215,
"logps/rejected": -4.634251594543457,
"loss": 22.4292,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.23980684578418732,
"rewards/margins": 0.10619230568408966,
"rewards/rejected": -0.345999151468277,
"step": 235
},
{
"epoch": 0.1510383889238515,
"grad_norm": 254.25714111328125,
"learning_rate": 3.973628527573495e-05,
"logits/chosen": -1.4451357126235962,
"logits/rejected": -1.415290355682373,
"logps/chosen": -4.496035575866699,
"logps/rejected": -5.4740800857543945,
"loss": 24.0697,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.3446175158023834,
"rewards/margins": 0.07305804640054703,
"rewards/rejected": -0.41767558455467224,
"step": 240
},
{
"epoch": 0.15418502202643172,
"grad_norm": 98.8416976928711,
"learning_rate": 3.969953670012342e-05,
"logits/chosen": -1.6127903461456299,
"logits/rejected": -1.529802918434143,
"logps/chosen": -3.744677782058716,
"logps/rejected": -5.76874303817749,
"loss": 20.2498,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.2814852297306061,
"rewards/margins": 0.12259259074926376,
"rewards/rejected": -0.40407782793045044,
"step": 245
},
{
"epoch": 0.15733165512901195,
"grad_norm": 174.65185546875,
"learning_rate": 3.9660411178461427e-05,
"logits/chosen": -1.6170070171356201,
"logits/rejected": -1.5994997024536133,
"logps/chosen": -3.390500545501709,
"logps/rejected": -4.377715587615967,
"loss": 22.3596,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.2286950647830963,
"rewards/margins": 0.07162559777498245,
"rewards/rejected": -0.30032065510749817,
"step": 250
},
{
"epoch": 0.1604782882315922,
"grad_norm": 98.30694580078125,
"learning_rate": 3.9618913431634326e-05,
"logits/chosen": -1.5248662233352661,
"logits/rejected": -1.570233702659607,
"logps/chosen": -2.914156436920166,
"logps/rejected": -3.4877963066101074,
"loss": 21.8392,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.17873355746269226,
"rewards/margins": 0.04700728505849838,
"rewards/rejected": -0.22574086487293243,
"step": 255
},
{
"epoch": 0.16362492133417245,
"grad_norm": 108.39894104003906,
"learning_rate": 3.957504846676015e-05,
"logits/chosen": -1.5246005058288574,
"logits/rejected": -1.6037238836288452,
"logps/chosen": -3.113523006439209,
"logps/rejected": -4.024534702301025,
"loss": 21.9178,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.21206626296043396,
"rewards/margins": 0.06214412301778793,
"rewards/rejected": -0.2742103934288025,
"step": 260
},
{
"epoch": 0.16677155443675268,
"grad_norm": 122.53215026855469,
"learning_rate": 3.952882157658545e-05,
"logits/chosen": -1.4534975290298462,
"logits/rejected": -1.4294064044952393,
"logps/chosen": -3.44130277633667,
"logps/rejected": -3.7570698261260986,
"loss": 25.6929,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.2504199147224426,
"rewards/margins": 0.021822316572070122,
"rewards/rejected": -0.2722422182559967,
"step": 265
},
{
"epoch": 0.1699181875393329,
"grad_norm": 117.72502899169922,
"learning_rate": 3.948023833884667e-05,
"logits/chosen": -1.596609354019165,
"logits/rejected": -1.6202917098999023,
"logps/chosen": -3.7515816688537598,
"logps/rejected": -3.9420647621154785,
"loss": 25.1709,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.23414090275764465,
"rewards/margins": 0.027978042140603065,
"rewards/rejected": -0.26211896538734436,
"step": 270
},
{
"epoch": 0.17306482064191314,
"grad_norm": 84.38936614990234,
"learning_rate": 3.942930461559718e-05,
"logits/chosen": -1.5714600086212158,
"logits/rejected": -1.683579683303833,
"logps/chosen": -3.3148865699768066,
"logps/rejected": -3.7648849487304688,
"loss": 24.1859,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.2219853699207306,
"rewards/margins": 0.02847103402018547,
"rewards/rejected": -0.25045639276504517,
"step": 275
},
{
"epoch": 0.1762114537444934,
"grad_norm": 122.990478515625,
"learning_rate": 3.9376026552499894e-05,
"logits/chosen": -1.5986852645874023,
"logits/rejected": -1.6811764240264893,
"logps/chosen": -3.261617660522461,
"logps/rejected": -4.3173418045043945,
"loss": 19.8872,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.22893181443214417,
"rewards/margins": 0.0762997642159462,
"rewards/rejected": -0.30523157119750977,
"step": 280
},
{
"epoch": 0.17935808684707363,
"grad_norm": 128.1126251220703,
"learning_rate": 3.9320410578085774e-05,
"logits/chosen": -1.5240422487258911,
"logits/rejected": -1.5410079956054688,
"logps/chosen": -3.7498767375946045,
"logps/rejected": -4.466190338134766,
"loss": 22.8035,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.2702712118625641,
"rewards/margins": 0.0467303991317749,
"rewards/rejected": -0.3170016407966614,
"step": 285
},
{
"epoch": 0.18250471994965387,
"grad_norm": 160.00189208984375,
"learning_rate": 3.9262463402978165e-05,
"logits/chosen": -1.413119912147522,
"logits/rejected": -1.3633155822753906,
"logps/chosen": -3.8721237182617188,
"logps/rejected": -5.0125298500061035,
"loss": 22.2056,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.3019005358219147,
"rewards/margins": 0.08287017047405243,
"rewards/rejected": -0.3847707211971283,
"step": 290
},
{
"epoch": 0.1856513530522341,
"grad_norm": 168.05908203125,
"learning_rate": 3.920219201908306e-05,
"logits/chosen": -1.2270746231079102,
"logits/rejected": -1.2809008359909058,
"logps/chosen": -4.052460670471191,
"logps/rejected": -5.228961944580078,
"loss": 21.1427,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.3124389052391052,
"rewards/margins": 0.0833948403596878,
"rewards/rejected": -0.3958337903022766,
"step": 295
},
{
"epoch": 0.18879798615481436,
"grad_norm": 94.47506713867188,
"learning_rate": 3.9139603698745514e-05,
"logits/chosen": -1.1681110858917236,
"logits/rejected": -1.2372829914093018,
"logps/chosen": -3.511944532394409,
"logps/rejected": -4.100220680236816,
"loss": 22.7025,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.24873590469360352,
"rewards/margins": 0.03639525547623634,
"rewards/rejected": -0.28513115644454956,
"step": 300
},
{
"epoch": 0.1919446192573946,
"grad_norm": 560.2835083007812,
"learning_rate": 3.907470599387209e-05,
"logits/chosen": -1.101466178894043,
"logits/rejected": -1.0982881784439087,
"logps/chosen": -3.0287392139434814,
"logps/rejected": -3.3412985801696777,
"loss": 25.2732,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.210398867726326,
"rewards/margins": 0.023965148255228996,
"rewards/rejected": -0.23436403274536133,
"step": 305
},
{
"epoch": 0.19509125235997482,
"grad_norm": 190.03529357910156,
"learning_rate": 3.900750673501971e-05,
"logits/chosen": -0.8078586459159851,
"logits/rejected": -1.0514795780181885,
"logps/chosen": -2.391371250152588,
"logps/rejected": -3.401437282562256,
"loss": 21.6721,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.14814777672290802,
"rewards/margins": 0.07396493852138519,
"rewards/rejected": -0.22211270034313202,
"step": 310
},
{
"epoch": 0.19823788546255505,
"grad_norm": 127.30278778076172,
"learning_rate": 3.893801403045078e-05,
"logits/chosen": -0.9948938488960266,
"logits/rejected": -1.1343729496002197,
"logps/chosen": -2.520848274230957,
"logps/rejected": -3.695737838745117,
"loss": 21.1395,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.16981211304664612,
"rewards/margins": 0.08695949614048004,
"rewards/rejected": -0.25677159428596497,
"step": 315
},
{
"epoch": 0.2013845185651353,
"grad_norm": 164.39279174804688,
"learning_rate": 3.8866236265154864e-05,
"logits/chosen": -1.059020757675171,
"logits/rejected": -1.1909369230270386,
"logps/chosen": -3.2958297729492188,
"logps/rejected": -4.60178279876709,
"loss": 23.1263,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.24349625408649445,
"rewards/margins": 0.08941353857517242,
"rewards/rejected": -0.33290979266166687,
"step": 320
},
{
"epoch": 0.20453115166771554,
"grad_norm": 317.5319519042969,
"learning_rate": 3.8792182099836956e-05,
"logits/chosen": -1.1690977811813354,
"logits/rejected": -1.221868872642517,
"logps/chosen": -3.4916579723358154,
"logps/rejected": -4.967286109924316,
"loss": 19.5685,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.2618243992328644,
"rewards/margins": 0.09256020933389664,
"rewards/rejected": -0.35438457131385803,
"step": 325
},
{
"epoch": 0.20767778477029578,
"grad_norm": 113.65757751464844,
"learning_rate": 3.8715860469872456e-05,
"logits/chosen": -1.230567216873169,
"logits/rejected": -1.2354533672332764,
"logps/chosen": -4.1219682693481445,
"logps/rejected": -5.140664577484131,
"loss": 24.1262,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.3101332485675812,
"rewards/margins": 0.07826542854309082,
"rewards/rejected": -0.3883987069129944,
"step": 330
},
{
"epoch": 0.21082441787287604,
"grad_norm": 103.66908264160156,
"learning_rate": 3.863728058422905e-05,
"logits/chosen": -1.1679656505584717,
"logits/rejected": -1.2492824792861938,
"logps/chosen": -4.176590442657471,
"logps/rejected": -5.121442794799805,
"loss": 21.9799,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.3107621371746063,
"rewards/margins": 0.07555123418569565,
"rewards/rejected": -0.38631340861320496,
"step": 335
},
{
"epoch": 0.21397105097545627,
"grad_norm": 187.34596252441406,
"learning_rate": 3.855645192435555e-05,
"logits/chosen": -1.4208840131759644,
"logits/rejected": -1.357755422592163,
"logps/chosen": -3.746802568435669,
"logps/rejected": -4.651678562164307,
"loss": 21.8739,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.2814106345176697,
"rewards/margins": 0.06742358207702637,
"rewards/rejected": -0.34883421659469604,
"step": 340
},
{
"epoch": 0.2171176840780365,
"grad_norm": 128.47970581054688,
"learning_rate": 3.847338424303787e-05,
"logits/chosen": -1.403939962387085,
"logits/rejected": -1.3926942348480225,
"logps/chosen": -3.540362596511841,
"logps/rejected": -4.463648796081543,
"loss": 22.9837,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.2591942250728607,
"rewards/margins": 0.06667342782020569,
"rewards/rejected": -0.3258676528930664,
"step": 345
},
{
"epoch": 0.22026431718061673,
"grad_norm": 91.00343322753906,
"learning_rate": 3.838808756322222e-05,
"logits/chosen": -1.4555909633636475,
"logits/rejected": -1.4179480075836182,
"logps/chosen": -3.3319029808044434,
"logps/rejected": -4.7188615798950195,
"loss": 22.182,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.24019880592823029,
"rewards/margins": 0.09150617569684982,
"rewards/rejected": -0.3317049741744995,
"step": 350
},
{
"epoch": 0.223410950283197,
"grad_norm": 89.21013641357422,
"learning_rate": 3.8300572176805796e-05,
"logits/chosen": -1.505953073501587,
"logits/rejected": -1.4713289737701416,
"logps/chosen": -3.2633144855499268,
"logps/rejected": -4.148341655731201,
"loss": 22.4622,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.23655016720294952,
"rewards/margins": 0.04711543396115303,
"rewards/rejected": -0.28366559743881226,
"step": 355
},
{
"epoch": 0.22655758338577722,
"grad_norm": 136.71780395507812,
"learning_rate": 3.82108486433949e-05,
"logits/chosen": -1.4959208965301514,
"logits/rejected": -1.4308115243911743,
"logps/chosen": -3.161681652069092,
"logps/rejected": -3.9897868633270264,
"loss": 23.3097,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.2291373759508133,
"rewards/margins": 0.045841820538043976,
"rewards/rejected": -0.2749791741371155,
"step": 360
},
{
"epoch": 0.22970421648835745,
"grad_norm": 233.93896484375,
"learning_rate": 3.8118927789030854e-05,
"logits/chosen": -1.5138304233551025,
"logits/rejected": -1.5346544981002808,
"logps/chosen": -4.37386417388916,
"logps/rejected": -5.469226837158203,
"loss": 20.9319,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.3008665442466736,
"rewards/margins": 0.07115120440721512,
"rewards/rejected": -0.3720177412033081,
"step": 365
},
{
"epoch": 0.2328508495909377,
"grad_norm": 100.57418060302734,
"learning_rate": 3.802482070488373e-05,
"logits/chosen": -1.3890790939331055,
"logits/rejected": -1.4434179067611694,
"logps/chosen": -3.4095160961151123,
"logps/rejected": -4.254734039306641,
"loss": 21.2175,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.24669814109802246,
"rewards/margins": 0.06480761617422104,
"rewards/rejected": -0.3115057349205017,
"step": 370
},
{
"epoch": 0.23599748269351795,
"grad_norm": 194.1370391845703,
"learning_rate": 3.792853874591408e-05,
"logits/chosen": -1.5562362670898438,
"logits/rejected": -1.4487522840499878,
"logps/chosen": -3.45831561088562,
"logps/rejected": -4.16960334777832,
"loss": 24.8363,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.23216786980628967,
"rewards/margins": 0.047336287796497345,
"rewards/rejected": -0.2795041799545288,
"step": 375
},
{
"epoch": 0.23914411579609818,
"grad_norm": 88.31356811523438,
"learning_rate": 3.783009352950282e-05,
"logits/chosen": -1.371385097503662,
"logits/rejected": -1.373175859451294,
"logps/chosen": -2.55993390083313,
"logps/rejected": -3.111349105834961,
"loss": 22.3814,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.17149809002876282,
"rewards/margins": 0.04337615519762039,
"rewards/rejected": -0.214874267578125,
"step": 380
},
{
"epoch": 0.2422907488986784,
"grad_norm": 126.74950408935547,
"learning_rate": 3.772949693404954e-05,
"logits/chosen": -1.33748459815979,
"logits/rejected": -1.3754979372024536,
"logps/chosen": -2.633439064025879,
"logps/rejected": -3.534024715423584,
"loss": 20.4661,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.17298361659049988,
"rewards/margins": 0.07067564874887466,
"rewards/rejected": -0.24365928769111633,
"step": 385
},
{
"epoch": 0.24543738200125864,
"grad_norm": 90.40318298339844,
"learning_rate": 3.762676109753919e-05,
"logits/chosen": -1.2709859609603882,
"logits/rejected": -1.294306755065918,
"logps/chosen": -3.954099655151367,
"logps/rejected": -5.9721527099609375,
"loss": 21.932,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.29533275961875916,
"rewards/margins": 0.12940457463264465,
"rewards/rejected": -0.4247373640537262,
"step": 390
},
{
"epoch": 0.2485840151038389,
"grad_norm": 84.59414672851562,
"learning_rate": 3.7521898416077565e-05,
"logits/chosen": -1.4984506368637085,
"logits/rejected": -1.5229644775390625,
"logps/chosen": -4.4091901779174805,
"logps/rejected": -5.3940815925598145,
"loss": 21.5859,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.3109613358974457,
"rewards/margins": 0.08055521547794342,
"rewards/rejected": -0.3915165364742279,
"step": 395
},
{
"epoch": 0.2517306482064191,
"grad_norm": 120.28202056884766,
"learning_rate": 3.7414921542395546e-05,
"logits/chosen": -1.5182693004608154,
"logits/rejected": -1.5193490982055664,
"logps/chosen": -4.545083045959473,
"logps/rejected": -5.492725372314453,
"loss": 21.539,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.36579376459121704,
"rewards/margins": 0.06641928851604462,
"rewards/rejected": -0.4322130084037781,
"step": 400
},
{
"epoch": 0.2548772813089994,
"grad_norm": 143.28396606445312,
"learning_rate": 3.7305843384322466e-05,
"logits/chosen": -1.5114035606384277,
"logits/rejected": -1.5092270374298096,
"logps/chosen": -5.28603982925415,
"logps/rejected": -6.232533931732178,
"loss": 21.4891,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.43935927748680115,
"rewards/margins": 0.08039890229701996,
"rewards/rejected": -0.5197581648826599,
"step": 405
},
{
"epoch": 0.2580239144115796,
"grad_norm": 129.09864807128906,
"learning_rate": 3.71946771032286e-05,
"logits/chosen": -1.6940416097640991,
"logits/rejected": -1.6389005184173584,
"logps/chosen": -5.122313022613525,
"logps/rejected": -6.010600566864014,
"loss": 21.8681,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.42212480306625366,
"rewards/margins": 0.076592817902565,
"rewards/rejected": -0.49871763586997986,
"step": 410
},
{
"epoch": 0.26117054751415986,
"grad_norm": 1118.02392578125,
"learning_rate": 3.708143611243716e-05,
"logits/chosen": -1.65127432346344,
"logits/rejected": -1.6758639812469482,
"logps/chosen": -5.203777313232422,
"logps/rejected": -6.3162078857421875,
"loss": 21.2512,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.37822961807250977,
"rewards/margins": 0.09629149734973907,
"rewards/rejected": -0.4745211601257324,
"step": 415
},
{
"epoch": 0.2643171806167401,
"grad_norm": 109.98821258544922,
"learning_rate": 3.696613407560582e-05,
"logits/chosen": -1.6237115859985352,
"logits/rejected": -1.5712984800338745,
"logps/chosen": -4.632975101470947,
"logps/rejected": -6.082078456878662,
"loss": 20.9477,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.3740273118019104,
"rewards/margins": 0.103847935795784,
"rewards/rejected": -0.4778752326965332,
"step": 420
},
{
"epoch": 0.2674638137193203,
"grad_norm": 95.2988052368164,
"learning_rate": 3.684878490507808e-05,
"logits/chosen": -1.5806386470794678,
"logits/rejected": -1.6192169189453125,
"logps/chosen": -4.849827766418457,
"logps/rejected": -5.800168037414551,
"loss": 23.5806,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.3749791085720062,
"rewards/margins": 0.07270670682191849,
"rewards/rejected": -0.4476858079433441,
"step": 425
},
{
"epoch": 0.27061044682190055,
"grad_norm": 111.99176788330078,
"learning_rate": 3.6729402760204535e-05,
"logits/chosen": -1.6522388458251953,
"logits/rejected": -1.6433773040771484,
"logps/chosen": -3.4129672050476074,
"logps/rejected": -4.362156867980957,
"loss": 21.9253,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.25616371631622314,
"rewards/margins": 0.07649270445108414,
"rewards/rejected": -0.3326564431190491,
"step": 430
},
{
"epoch": 0.2737570799244808,
"grad_norm": 219.88124084472656,
"learning_rate": 3.6608002045634535e-05,
"logits/chosen": -1.7825971841812134,
"logits/rejected": -1.6959110498428345,
"logps/chosen": -3.785250425338745,
"logps/rejected": -4.989777565002441,
"loss": 22.1928,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.26640018820762634,
"rewards/margins": 0.07046084105968475,
"rewards/rejected": -0.3368610143661499,
"step": 435
},
{
"epoch": 0.27690371302706107,
"grad_norm": 110.93528747558594,
"learning_rate": 3.6484597409577975e-05,
"logits/chosen": -1.8389028310775757,
"logits/rejected": -1.7533693313598633,
"logps/chosen": -3.4091110229492188,
"logps/rejected": -4.324118614196777,
"loss": 21.2394,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.25699272751808167,
"rewards/margins": 0.06507135927677155,
"rewards/rejected": -0.322064071893692,
"step": 440
},
{
"epoch": 0.2800503461296413,
"grad_norm": 128.312255859375,
"learning_rate": 3.6359203742037966e-05,
"logits/chosen": -1.8402115106582642,
"logits/rejected": -1.7344493865966797,
"logps/chosen": -4.041749000549316,
"logps/rejected": -4.417330741882324,
"loss": 22.7853,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.2921445965766907,
"rewards/margins": 0.04909106716513634,
"rewards/rejected": -0.3412356376647949,
"step": 445
},
{
"epoch": 0.28319697923222154,
"grad_norm": 121.12706756591797,
"learning_rate": 3.623183617301411e-05,
"logits/chosen": -1.7311460971832275,
"logits/rejected": -1.7096502780914307,
"logps/chosen": -3.8819706439971924,
"logps/rejected": -4.670237064361572,
"loss": 22.6275,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.30139902234077454,
"rewards/margins": 0.05851779133081436,
"rewards/rejected": -0.3599168360233307,
"step": 450
},
{
"epoch": 0.28634361233480177,
"grad_norm": 93.03048706054688,
"learning_rate": 3.610251007067699e-05,
"logits/chosen": -1.836363434791565,
"logits/rejected": -1.736104965209961,
"logps/chosen": -4.1447577476501465,
"logps/rejected": -4.325010299682617,
"loss": 26.2728,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.32724231481552124,
"rewards/margins": 0.010385597124695778,
"rewards/rejected": -0.33762794733047485,
"step": 455
},
{
"epoch": 0.289490245437382,
"grad_norm": 76.58390808105469,
"learning_rate": 3.597124103951379e-05,
"logits/chosen": -1.7278220653533936,
"logits/rejected": -1.7181174755096436,
"logps/chosen": -4.0262017250061035,
"logps/rejected": -4.855641841888428,
"loss": 22.3804,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.2886626124382019,
"rewards/margins": 0.06016182899475098,
"rewards/rejected": -0.3488244414329529,
"step": 460
},
{
"epoch": 0.29263687853996223,
"grad_norm": 80.33660888671875,
"learning_rate": 3.583804491844551e-05,
"logits/chosen": -1.8658571243286133,
"logits/rejected": -1.7413606643676758,
"logps/chosen": -3.758129835128784,
"logps/rejected": -4.306906223297119,
"loss": 26.088,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.2766272723674774,
"rewards/margins": 0.03810672461986542,
"rewards/rejected": -0.31473398208618164,
"step": 465
},
{
"epoch": 0.29578351164254246,
"grad_norm": 66.17215728759766,
"learning_rate": 3.5702937778915765e-05,
"logits/chosen": -1.8694692850112915,
"logits/rejected": -1.82939875125885,
"logps/chosen": -2.9322712421417236,
"logps/rejected": -3.7157013416290283,
"loss": 21.7852,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.2061152458190918,
"rewards/margins": 0.056372471153736115,
"rewards/rejected": -0.2624877095222473,
"step": 470
},
{
"epoch": 0.2989301447451227,
"grad_norm": 95.2267837524414,
"learning_rate": 3.556593592295171e-05,
"logits/chosen": -1.8632274866104126,
"logits/rejected": -1.8683363199234009,
"logps/chosen": -2.8304595947265625,
"logps/rejected": -3.464296817779541,
"loss": 22.1458,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.19707690179347992,
"rewards/margins": 0.04870922490954399,
"rewards/rejected": -0.24578611552715302,
"step": 475
},
{
"epoch": 0.302076777847703,
"grad_norm": 128.1005096435547,
"learning_rate": 3.5427055881196946e-05,
"logits/chosen": -1.7504918575286865,
"logits/rejected": -1.8846075534820557,
"logps/chosen": -2.7551674842834473,
"logps/rejected": -3.501314163208008,
"loss": 21.4037,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.192325159907341,
"rewards/margins": 0.05459358170628548,
"rewards/rejected": -0.2469187080860138,
"step": 480
},
{
"epoch": 0.3052234109502832,
"grad_norm": 64.81920623779297,
"learning_rate": 3.5286314410916967e-05,
"logits/chosen": -1.8015562295913696,
"logits/rejected": -1.9157085418701172,
"logps/chosen": -3.297150135040283,
"logps/rejected": -4.347265243530273,
"loss": 20.2599,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.24196556210517883,
"rewards/margins": 0.06687469035387039,
"rewards/rejected": -0.30884024500846863,
"step": 485
},
{
"epoch": 0.30837004405286345,
"grad_norm": 121.4966812133789,
"learning_rate": 3.5143728493977245e-05,
"logits/chosen": -1.7404873371124268,
"logits/rejected": -1.8498218059539795,
"logps/chosen": -3.553678035736084,
"logps/rejected": -4.084536075592041,
"loss": 24.4702,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.2725631594657898,
"rewards/margins": 0.037132084369659424,
"rewards/rejected": -0.3096952736377716,
"step": 490
},
{
"epoch": 0.3115166771554437,
"grad_norm": 102.46180725097656,
"learning_rate": 3.499931533479417e-05,
"logits/chosen": -1.7682313919067383,
"logits/rejected": -1.7660820484161377,
"logps/chosen": -3.595475435256958,
"logps/rejected": -4.801576137542725,
"loss": 20.9722,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.2746976315975189,
"rewards/margins": 0.10004003345966339,
"rewards/rejected": -0.3747376501560211,
"step": 495
},
{
"epoch": 0.3146633102580239,
"grad_norm": 100.82923889160156,
"learning_rate": 3.485309235825916e-05,
"logits/chosen": -1.7638380527496338,
"logits/rejected": -1.857962965965271,
"logps/chosen": -4.1785569190979,
"logps/rejected": -5.445678234100342,
"loss": 20.121,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.30823373794555664,
"rewards/margins": 0.09736496210098267,
"rewards/rejected": -0.4055987298488617,
"step": 500
},
{
"epoch": 0.31780994336060414,
"grad_norm": 299.635009765625,
"learning_rate": 3.470507720763625e-05,
"logits/chosen": -1.7603092193603516,
"logits/rejected": -1.8294856548309326,
"logps/chosen": -3.818953037261963,
"logps/rejected": -4.965951442718506,
"loss": 24.0421,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.2867090702056885,
"rewards/margins": 0.09908684343099594,
"rewards/rejected": -0.385795921087265,
"step": 505
},
{
"epoch": 0.3209565764631844,
"grad_norm": 121.77188110351562,
"learning_rate": 3.4555287742433115e-05,
"logits/chosen": -1.8968608379364014,
"logits/rejected": -1.863628625869751,
"logps/chosen": -3.3851046562194824,
"logps/rejected": -4.313992500305176,
"loss": 21.5651,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.2504531145095825,
"rewards/margins": 0.07505444437265396,
"rewards/rejected": -0.3255075514316559,
"step": 510
},
{
"epoch": 0.3241032095657646,
"grad_norm": 84.7723617553711,
"learning_rate": 3.440374203624628e-05,
"logits/chosen": -1.8949018716812134,
"logits/rejected": -2.03389573097229,
"logps/chosen": -3.739046573638916,
"logps/rejected": -4.937285423278809,
"loss": 22.0895,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.2827950417995453,
"rewards/margins": 0.07987246662378311,
"rewards/rejected": -0.3626675605773926,
"step": 515
},
{
"epoch": 0.3272498426683449,
"grad_norm": 96.02967071533203,
"learning_rate": 3.425045837458028e-05,
"logits/chosen": -1.9336235523223877,
"logits/rejected": -1.9811556339263916,
"logps/chosen": -3.5748794078826904,
"logps/rejected": -4.64247465133667,
"loss": 20.7454,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.2698992192745209,
"rewards/margins": 0.07278282940387726,
"rewards/rejected": -0.3426820635795593,
"step": 520
},
{
"epoch": 0.3303964757709251,
"grad_norm": 138.71051025390625,
"learning_rate": 3.4095455252641376e-05,
"logits/chosen": -1.938104271888733,
"logits/rejected": -2.024137020111084,
"logps/chosen": -4.332060813903809,
"logps/rejected": -5.391437530517578,
"loss": 23.3511,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.3168641924858093,
"rewards/margins": 0.049729883670806885,
"rewards/rejected": -0.3665940761566162,
"step": 525
},
{
"epoch": 0.33354310887350536,
"grad_norm": 93.8726577758789,
"learning_rate": 3.393875137310588e-05,
"logits/chosen": -1.8752260208129883,
"logits/rejected": -1.8945411443710327,
"logps/chosen": -4.053868770599365,
"logps/rejected": -5.044325828552246,
"loss": 21.8528,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.3227534890174866,
"rewards/margins": 0.0821223258972168,
"rewards/rejected": -0.4048757553100586,
"step": 530
},
{
"epoch": 0.3366897419760856,
"grad_norm": 261.39129638671875,
"learning_rate": 3.378036564386349e-05,
"logits/chosen": -1.770957589149475,
"logits/rejected": -1.8808790445327759,
"logps/chosen": -3.8808326721191406,
"logps/rejected": -4.960693836212158,
"loss": 23.7267,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.3041539788246155,
"rewards/margins": 0.08733677119016647,
"rewards/rejected": -0.39149072766304016,
"step": 535
},
{
"epoch": 0.3398363750786658,
"grad_norm": 141.79991149902344,
"learning_rate": 3.3620317175735945e-05,
"logits/chosen": -1.929517149925232,
"logits/rejected": -1.8599262237548828,
"logps/chosen": -4.427219867706299,
"logps/rejected": -5.757664680480957,
"loss": 20.8591,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.3481447994709015,
"rewards/margins": 0.0858476310968399,
"rewards/rejected": -0.4339924454689026,
"step": 540
},
{
"epoch": 0.34298300818124605,
"grad_norm": 76.495361328125,
"learning_rate": 3.345862528017101e-05,
"logits/chosen": -1.8648240566253662,
"logits/rejected": -1.899430513381958,
"logps/chosen": -4.430551528930664,
"logps/rejected": -5.134209156036377,
"loss": 21.6823,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.3582889139652252,
"rewards/margins": 0.05610053986310959,
"rewards/rejected": -0.4143894612789154,
"step": 545
},
{
"epoch": 0.3461296412838263,
"grad_norm": 65.95896911621094,
"learning_rate": 3.32953094669124e-05,
"logits/chosen": -1.6951459646224976,
"logits/rejected": -1.7398831844329834,
"logps/chosen": -5.35291051864624,
"logps/rejected": -6.347973823547363,
"loss": 24.8551,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.4343182146549225,
"rewards/margins": 0.085027314722538,
"rewards/rejected": -0.5193454623222351,
"step": 550
},
{
"epoch": 0.34927627438640657,
"grad_norm": 64.50738525390625,
"learning_rate": 3.313038944164577e-05,
"logits/chosen": -1.7779582738876343,
"logits/rejected": -1.8077032566070557,
"logps/chosen": -4.008457183837891,
"logps/rejected": -5.838412761688232,
"loss": 19.2472,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.3185553550720215,
"rewards/margins": 0.10776933282613754,
"rewards/rejected": -0.4263246953487396,
"step": 555
},
{
"epoch": 0.3524229074889868,
"grad_norm": 62.579227447509766,
"learning_rate": 3.296388510362095e-05,
"logits/chosen": -1.5932537317276,
"logits/rejected": -1.7019790410995483,
"logps/chosen": -4.049741268157959,
"logps/rejected": -4.859818935394287,
"loss": 21.4107,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.29325228929519653,
"rewards/margins": 0.06688085943460464,
"rewards/rejected": -0.36013317108154297,
"step": 560
},
{
"epoch": 0.35556954059156703,
"grad_norm": 105.9216079711914,
"learning_rate": 3.2795816543250977e-05,
"logits/chosen": -1.5411794185638428,
"logits/rejected": -1.5789968967437744,
"logps/chosen": -3.8824076652526855,
"logps/rejected": -4.560225486755371,
"loss": 23.1195,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.2929798662662506,
"rewards/margins": 0.05188722163438797,
"rewards/rejected": -0.34486711025238037,
"step": 565
},
{
"epoch": 0.35871617369414727,
"grad_norm": 55.46923065185547,
"learning_rate": 3.262620403968792e-05,
"logits/chosen": -1.5855820178985596,
"logits/rejected": -1.7370961904525757,
"logps/chosen": -3.6918272972106934,
"logps/rejected": -5.205948352813721,
"loss": 19.1367,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.27848196029663086,
"rewards/margins": 0.11322972923517227,
"rewards/rejected": -0.3917117416858673,
"step": 570
},
{
"epoch": 0.3618628067967275,
"grad_norm": 114.82603454589844,
"learning_rate": 3.245506805837605e-05,
"logits/chosen": -1.6395822763442993,
"logits/rejected": -1.8543764352798462,
"logps/chosen": -4.298351287841797,
"logps/rejected": -5.546226501464844,
"loss": 19.9406,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.30993199348449707,
"rewards/margins": 0.08511951565742493,
"rewards/rejected": -0.3950514793395996,
"step": 575
},
{
"epoch": 0.36500943989930773,
"grad_norm": 174.55496215820312,
"learning_rate": 3.228242924858248e-05,
"logits/chosen": -1.5872471332550049,
"logits/rejected": -1.688132882118225,
"logps/chosen": -4.568819999694824,
"logps/rejected": -5.411607265472412,
"loss": 22.4314,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.34597450494766235,
"rewards/margins": 0.07728902995586395,
"rewards/rejected": -0.4232635498046875,
"step": 580
},
{
"epoch": 0.36815607300188796,
"grad_norm": 70.5542221069336,
"learning_rate": 3.210830844090555e-05,
"logits/chosen": -1.6192104816436768,
"logits/rejected": -1.6785539388656616,
"logps/chosen": -5.1252007484436035,
"logps/rejected": -5.851187705993652,
"loss": 25.8619,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.40460100769996643,
"rewards/margins": 0.06072293594479561,
"rewards/rejected": -0.46532392501831055,
"step": 585
},
{
"epoch": 0.3713027061044682,
"grad_norm": 100.62268829345703,
"learning_rate": 3.193272664476152e-05,
"logits/chosen": -1.7602649927139282,
"logits/rejected": -1.9346716403961182,
"logps/chosen": -4.961272239685059,
"logps/rejected": -5.8130645751953125,
"loss": 22.8852,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.3982604444026947,
"rewards/margins": 0.059664536267519,
"rewards/rejected": -0.457925021648407,
"step": 590
},
{
"epoch": 0.3744493392070485,
"grad_norm": 411.0801696777344,
"learning_rate": 3.1755705045849465e-05,
"logits/chosen": -1.7633399963378906,
"logits/rejected": -1.818737268447876,
"logps/chosen": -5.510100364685059,
"logps/rejected": -6.382575035095215,
"loss": 23.8471,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.4236491620540619,
"rewards/margins": 0.06903719902038574,
"rewards/rejected": -0.49268636107444763,
"step": 595
},
{
"epoch": 0.3775959723096287,
"grad_norm": 98.035888671875,
"learning_rate": 3.157726500359509e-05,
"logits/chosen": -1.825554609298706,
"logits/rejected": -1.907472014427185,
"logps/chosen": -5.569567680358887,
"logps/rejected": -6.1025004386901855,
"loss": 24.087,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.4460601210594177,
"rewards/margins": 0.03472483158111572,
"rewards/rejected": -0.48078498244285583,
"step": 600
},
{
"epoch": 0.38074260541220895,
"grad_norm": 80.47187805175781,
"learning_rate": 3.1397428048573465e-05,
"logits/chosen": -1.798015832901001,
"logits/rejected": -1.9216489791870117,
"logps/chosen": -4.644695281982422,
"logps/rejected": -5.7896294593811035,
"loss": 19.835,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.380901575088501,
"rewards/margins": 0.08407244086265564,
"rewards/rejected": -0.4649740159511566,
"step": 605
},
{
"epoch": 0.3838892385147892,
"grad_norm": 65.88395690917969,
"learning_rate": 3.121621587991113e-05,
"logits/chosen": -1.9489303827285767,
"logits/rejected": -1.9782030582427979,
"logps/chosen": -4.736275672912598,
"logps/rejected": -5.893181800842285,
"loss": 21.2523,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.37491172552108765,
"rewards/margins": 0.09068160504102707,
"rewards/rejected": -0.46559327840805054,
"step": 610
},
{
"epoch": 0.3870358716173694,
"grad_norm": 126.57975769042969,
"learning_rate": 3.1033650362667935e-05,
"logits/chosen": -1.945927619934082,
"logits/rejected": -2.0246009826660156,
"logps/chosen": -4.42104434967041,
"logps/rejected": -5.623631000518799,
"loss": 20.477,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.3376965820789337,
"rewards/margins": 0.07996558398008347,
"rewards/rejected": -0.41766220331192017,
"step": 615
},
{
"epoch": 0.39018250471994964,
"grad_norm": 88.92438507080078,
"learning_rate": 3.084975352519874e-05,
"logits/chosen": -2.063378095626831,
"logits/rejected": -2.161208391189575,
"logps/chosen": -4.2682085037231445,
"logps/rejected": -5.291066646575928,
"loss": 22.2295,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.3386593759059906,
"rewards/margins": 0.07158732414245605,
"rewards/rejected": -0.41024675965309143,
"step": 620
},
{
"epoch": 0.3933291378225299,
"grad_norm": 53.47737503051758,
"learning_rate": 3.06645475564955e-05,
"logits/chosen": -1.9409205913543701,
"logits/rejected": -2.0371243953704834,
"logps/chosen": -3.6241352558135986,
"logps/rejected": -5.033164978027344,
"loss": 20.5698,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.27396219968795776,
"rewards/margins": 0.09085332602262497,
"rewards/rejected": -0.36481553316116333,
"step": 625
},
{
"epoch": 0.3964757709251101,
"grad_norm": 87.2447738647461,
"learning_rate": 3.0478054803509975e-05,
"logits/chosen": -1.9413238763809204,
"logits/rejected": -1.989638328552246,
"logps/chosen": -3.974926710128784,
"logps/rejected": -5.115756034851074,
"loss": 20.8679,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.3056657314300537,
"rewards/margins": 0.09486590325832367,
"rewards/rejected": -0.4005316197872162,
"step": 630
},
{
"epoch": 0.3996224040276904,
"grad_norm": 105.37754821777344,
"learning_rate": 3.029029776845726e-05,
"logits/chosen": -1.9769777059555054,
"logits/rejected": -2.0631349086761475,
"logps/chosen": -4.811491012573242,
"logps/rejected": -6.024916648864746,
"loss": 22.3949,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.36858421564102173,
"rewards/margins": 0.09452919661998749,
"rewards/rejected": -0.463113397359848,
"step": 635
},
{
"epoch": 0.4027690371302706,
"grad_norm": 107.63380432128906,
"learning_rate": 3.0101299106100766e-05,
"logits/chosen": -1.9259755611419678,
"logits/rejected": -2.0011420249938965,
"logps/chosen": -4.672276496887207,
"logps/rejected": -5.433979034423828,
"loss": 23.4548,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.34038934111595154,
"rewards/margins": 0.05264373868703842,
"rewards/rejected": -0.39303308725357056,
"step": 640
},
{
"epoch": 0.40591567023285086,
"grad_norm": 72.93191528320312,
"learning_rate": 2.991108162101862e-05,
"logits/chosen": -1.8639154434204102,
"logits/rejected": -2.00860333442688,
"logps/chosen": -4.0379438400268555,
"logps/rejected": -4.966481685638428,
"loss": 24.2063,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.3016512095928192,
"rewards/margins": 0.05989114195108414,
"rewards/rejected": -0.36154234409332275,
"step": 645
},
{
"epoch": 0.4090623033354311,
"grad_norm": 241.30491638183594,
"learning_rate": 2.971966826485212e-05,
"logits/chosen": -2.0276923179626465,
"logits/rejected": -2.075092077255249,
"logps/chosen": -3.9584078788757324,
"logps/rejected": -4.5398454666137695,
"loss": 22.3358,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.2686071991920471,
"rewards/margins": 0.05414595082402229,
"rewards/rejected": -0.3227531313896179,
"step": 650
},
{
"epoch": 0.4122089364380113,
"grad_norm": 72.65229797363281,
"learning_rate": 2.952708213353636e-05,
"logits/chosen": -2.087306499481201,
"logits/rejected": -2.120595932006836,
"logps/chosen": -2.7464280128479004,
"logps/rejected": -3.2665913105010986,
"loss": 23.396,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.19495923817157745,
"rewards/margins": 0.03470323234796524,
"rewards/rejected": -0.2296624630689621,
"step": 655
},
{
"epoch": 0.41535556954059155,
"grad_norm": 36.565982818603516,
"learning_rate": 2.9333346464513476e-05,
"logits/chosen": -2.0568580627441406,
"logits/rejected": -2.171510934829712,
"logps/chosen": -3.1527762413024902,
"logps/rejected": -3.5696024894714355,
"loss": 23.204,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.2180822342634201,
"rewards/margins": 0.029619824141263962,
"rewards/rejected": -0.24770204722881317,
"step": 660
},
{
"epoch": 0.4185022026431718,
"grad_norm": 57.84255599975586,
"learning_rate": 2.9138484633928818e-05,
"logits/chosen": -1.940320372581482,
"logits/rejected": -1.9845908880233765,
"logps/chosen": -3.0434772968292236,
"logps/rejected": -3.5398964881896973,
"loss": 24.3501,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.2063741683959961,
"rewards/margins": 0.023456847295165062,
"rewards/rejected": -0.2298310250043869,
"step": 665
},
{
"epoch": 0.42164883574575207,
"grad_norm": 56.995887756347656,
"learning_rate": 2.8942520153810396e-05,
"logits/chosen": -2.0002236366271973,
"logits/rejected": -2.08671498298645,
"logps/chosen": -2.834512710571289,
"logps/rejected": -3.5050129890441895,
"loss": 22.4039,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.18090704083442688,
"rewards/margins": 0.04532923549413681,
"rewards/rejected": -0.2262362688779831,
"step": 670
},
{
"epoch": 0.4247954688483323,
"grad_norm": 75.65125274658203,
"learning_rate": 2.8745476669231894e-05,
"logits/chosen": -2.020886182785034,
"logits/rejected": -2.111823558807373,
"logps/chosen": -3.5571112632751465,
"logps/rejected": -4.481097221374512,
"loss": 22.9676,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.22384686768054962,
"rewards/margins": 0.04108366742730141,
"rewards/rejected": -0.2649305462837219,
"step": 675
},
{
"epoch": 0.42794210195091253,
"grad_norm": 77.30415344238281,
"learning_rate": 2.8547377955459704e-05,
"logits/chosen": -1.9961265325546265,
"logits/rejected": -2.0482177734375,
"logps/chosen": -2.892690658569336,
"logps/rejected": -3.2253260612487793,
"loss": 25.6658,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.19483526051044464,
"rewards/margins": 0.01912742853164673,
"rewards/rejected": -0.21396267414093018,
"step": 680
},
{
"epoch": 0.43108873505349277,
"grad_norm": 49.21062088012695,
"learning_rate": 2.834824791508413e-05,
"logits/chosen": -1.930086374282837,
"logits/rejected": -2.131298542022705,
"logps/chosen": -2.739534854888916,
"logps/rejected": -3.5602822303771973,
"loss": 21.1908,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.17746233940124512,
"rewards/margins": 0.06554970890283585,
"rewards/rejected": -0.24301204085350037,
"step": 685
},
{
"epoch": 0.434235368156073,
"grad_norm": 64.88590240478516,
"learning_rate": 2.814811057513537e-05,
"logits/chosen": -2.0517029762268066,
"logits/rejected": -2.067883253097534,
"logps/chosen": -2.82458758354187,
"logps/rejected": -3.6670260429382324,
"loss": 21.8595,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.1833667755126953,
"rewards/margins": 0.0560932457447052,
"rewards/rejected": -0.2394600361585617,
"step": 690
},
{
"epoch": 0.43738200125865323,
"grad_norm": 48.841331481933594,
"learning_rate": 2.7946990084184383e-05,
"logits/chosen": -1.798683524131775,
"logits/rejected": -1.9806129932403564,
"logps/chosen": -3.2995662689208984,
"logps/rejected": -4.0815110206604,
"loss": 22.0918,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.2146444320678711,
"rewards/margins": 0.05965212732553482,
"rewards/rejected": -0.27429652214050293,
"step": 695
},
{
"epoch": 0.44052863436123346,
"grad_norm": 266.59381103515625,
"learning_rate": 2.7744910709429104e-05,
"logits/chosen": -1.800355315208435,
"logits/rejected": -1.9262745380401611,
"logps/chosen": -3.308371067047119,
"logps/rejected": -4.3786821365356445,
"loss": 22.6616,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.20500688254833221,
"rewards/margins": 0.07705695927143097,
"rewards/rejected": -0.2820638120174408,
"step": 700
},
{
"epoch": 0.4436752674638137,
"grad_norm": 45.74457550048828,
"learning_rate": 2.754189683376641e-05,
"logits/chosen": -1.8245214223861694,
"logits/rejected": -1.9188095331192017,
"logps/chosen": -2.6574292182922363,
"logps/rejected": -3.3347110748291016,
"loss": 21.6472,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.1792256087064743,
"rewards/margins": 0.054762959480285645,
"rewards/rejected": -0.23398856818675995,
"step": 705
},
{
"epoch": 0.446821900566394,
"grad_norm": 82.67216491699219,
"learning_rate": 2.7337972952850047e-05,
"logits/chosen": -1.764173150062561,
"logits/rejected": -1.9260650873184204,
"logps/chosen": -2.8055293560028076,
"logps/rejected": -3.9603447914123535,
"loss": 21.7022,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.19627173244953156,
"rewards/margins": 0.07794789969921112,
"rewards/rejected": -0.2742196321487427,
"step": 710
},
{
"epoch": 0.4499685336689742,
"grad_norm": 63.396240234375,
"learning_rate": 2.713316367213499e-05,
"logits/chosen": -1.6747219562530518,
"logits/rejected": -1.8347587585449219,
"logps/chosen": -2.9625911712646484,
"logps/rejected": -3.7656357288360596,
"loss": 22.6149,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.21037223935127258,
"rewards/margins": 0.05833571031689644,
"rewards/rejected": -0.26870793104171753,
"step": 715
},
{
"epoch": 0.45311516677155445,
"grad_norm": 118.00112915039062,
"learning_rate": 2.692749370390855e-05,
"logits/chosen": -1.7990179061889648,
"logits/rejected": -1.8915067911148071,
"logps/chosen": -3.0249316692352295,
"logps/rejected": -4.06134033203125,
"loss": 23.4425,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.21054935455322266,
"rewards/margins": 0.05246324464678764,
"rewards/rejected": -0.2630125880241394,
"step": 720
},
{
"epoch": 0.4562617998741347,
"grad_norm": 64.52631378173828,
"learning_rate": 2.6720987864308603e-05,
"logits/chosen": -1.695908546447754,
"logits/rejected": -1.7583353519439697,
"logps/chosen": -2.815432548522949,
"logps/rejected": -4.123710632324219,
"loss": 21.0095,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.1960502415895462,
"rewards/margins": 0.08241166174411774,
"rewards/rejected": -0.27846187353134155,
"step": 725
},
{
"epoch": 0.4594084329767149,
"grad_norm": 59.4410285949707,
"learning_rate": 2.6513671070329244e-05,
"logits/chosen": -1.7788522243499756,
"logits/rejected": -1.8245208263397217,
"logps/chosen": -3.012934446334839,
"logps/rejected": -4.003429412841797,
"loss": 21.1484,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.2150099277496338,
"rewards/margins": 0.07829871028661728,
"rewards/rejected": -0.2933086156845093,
"step": 730
},
{
"epoch": 0.46255506607929514,
"grad_norm": 84.89627075195312,
"learning_rate": 2.630556833681434e-05,
"logits/chosen": -1.738438606262207,
"logits/rejected": -1.8424345254898071,
"logps/chosen": -2.7983458042144775,
"logps/rejected": -4.087245941162109,
"loss": 19.2453,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.19751907885074615,
"rewards/margins": 0.09776587784290314,
"rewards/rejected": -0.2952849566936493,
"step": 735
},
{
"epoch": 0.4657016991818754,
"grad_norm": 101.38806915283203,
"learning_rate": 2.609670477343921e-05,
"logits/chosen": -1.6957628726959229,
"logits/rejected": -1.825757384300232,
"logps/chosen": -4.030215263366699,
"logps/rejected": -5.008100509643555,
"loss": 22.1478,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.30844345688819885,
"rewards/margins": 0.0614703968167305,
"rewards/rejected": -0.36991381645202637,
"step": 740
},
{
"epoch": 0.46884833228445566,
"grad_norm": 101.18181610107422,
"learning_rate": 2.5887105581680905e-05,
"logits/chosen": -1.7838348150253296,
"logits/rejected": -1.7674500942230225,
"logps/chosen": -4.438131809234619,
"logps/rejected": -5.542893886566162,
"loss": 23.806,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.36128634214401245,
"rewards/margins": 0.07241909205913544,
"rewards/rejected": -0.43370547890663147,
"step": 745
},
{
"epoch": 0.4719949653870359,
"grad_norm": 89.2279052734375,
"learning_rate": 2.567679605177739e-05,
"logits/chosen": -1.7873433828353882,
"logits/rejected": -1.831865906715393,
"logps/chosen": -4.315898895263672,
"logps/rejected": -5.43391227722168,
"loss": 20.4258,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.32439109683036804,
"rewards/margins": 0.09124849736690521,
"rewards/rejected": -0.41563957929611206,
"step": 750
},
{
"epoch": 0.4751415984896161,
"grad_norm": 68.27491760253906,
"learning_rate": 2.5465801559676033e-05,
"logits/chosen": -1.716103196144104,
"logits/rejected": -1.744837999343872,
"logps/chosen": -3.913160800933838,
"logps/rejected": -5.709442615509033,
"loss": 19.3215,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.30374833941459656,
"rewards/margins": 0.12692494690418243,
"rewards/rejected": -0.4306732714176178,
"step": 755
},
{
"epoch": 0.47828823159219636,
"grad_norm": 149.6294708251953,
"learning_rate": 2.525414756397174e-05,
"logits/chosen": -1.7440742254257202,
"logits/rejected": -1.8239097595214844,
"logps/chosen": -3.586292266845703,
"logps/rejected": -4.596356391906738,
"loss": 19.9662,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.2702713906764984,
"rewards/margins": 0.08218260109424591,
"rewards/rejected": -0.3524540364742279,
"step": 760
},
{
"epoch": 0.4814348646947766,
"grad_norm": 102.944580078125,
"learning_rate": 2.504185960283512e-05,
"logits/chosen": -1.7996543645858765,
"logits/rejected": -1.8109557628631592,
"logps/chosen": -4.447735786437988,
"logps/rejected": -5.870986461639404,
"loss": 20.4207,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.35062670707702637,
"rewards/margins": 0.09269314259290695,
"rewards/rejected": -0.4433198869228363,
"step": 765
},
{
"epoch": 0.4845814977973568,
"grad_norm": 128.53907775878906,
"learning_rate": 2.482896329093106e-05,
"logits/chosen": -1.9051790237426758,
"logits/rejected": -1.9270706176757812,
"logps/chosen": -5.1721906661987305,
"logps/rejected": -6.744166374206543,
"loss": 19.0615,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.4306749701499939,
"rewards/margins": 0.1142655462026596,
"rewards/rejected": -0.5449405312538147,
"step": 770
},
{
"epoch": 0.48772813089993705,
"grad_norm": 123.44400024414062,
"learning_rate": 2.4615484316328023e-05,
"logits/chosen": -1.8487358093261719,
"logits/rejected": -1.8219711780548096,
"logps/chosen": -5.741638660430908,
"logps/rejected": -7.048303127288818,
"loss": 22.6075,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.4748842120170593,
"rewards/margins": 0.09859482944011688,
"rewards/rejected": -0.5734790563583374,
"step": 775
},
{
"epoch": 0.4908747640025173,
"grad_norm": 97.28683471679688,
"learning_rate": 2.440144843739857e-05,
"logits/chosen": -1.8166711330413818,
"logits/rejected": -1.856359839439392,
"logps/chosen": -6.369978904724121,
"logps/rejected": -7.745943546295166,
"loss": 21.1624,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.5159797072410583,
"rewards/margins": 0.09467221796512604,
"rewards/rejected": -0.610651969909668,
"step": 780
},
{
"epoch": 0.49402139710509757,
"grad_norm": 94.76971435546875,
"learning_rate": 2.4186881479711338e-05,
"logits/chosen": -1.8901869058609009,
"logits/rejected": -1.996917724609375,
"logps/chosen": -5.151943206787109,
"logps/rejected": -6.655333518981934,
"loss": 17.5696,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.3987768888473511,
"rewards/margins": 0.11971308290958405,
"rewards/rejected": -0.5184900164604187,
"step": 785
},
{
"epoch": 0.4971680302076778,
"grad_norm": 362.07489013671875,
"learning_rate": 2.397180933291491e-05,
"logits/chosen": -1.6789305210113525,
"logits/rejected": -1.75827157497406,
"logps/chosen": -4.5332841873168945,
"logps/rejected": -5.266444206237793,
"loss": 22.7215,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.3525087535381317,
"rewards/margins": 0.07219593226909637,
"rewards/rejected": -0.42470473051071167,
"step": 790
},
{
"epoch": 0.500314663310258,
"grad_norm": 181.0984344482422,
"learning_rate": 2.375625794761401e-05,
"logits/chosen": -1.769201636314392,
"logits/rejected": -1.7219161987304688,
"logps/chosen": -4.633937358856201,
"logps/rejected": -5.043046474456787,
"loss": 26.0541,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.3703366816043854,
"rewards/margins": 0.028562629595398903,
"rewards/rejected": -0.3988993167877197,
"step": 795
},
{
"epoch": 0.5034612964128382,
"grad_norm": 120.9494857788086,
"learning_rate": 2.3540253332238266e-05,
"logits/chosen": -1.6151552200317383,
"logits/rejected": -1.646795630455017,
"logps/chosen": -4.029574394226074,
"logps/rejected": -5.215254783630371,
"loss": 20.2479,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.314275324344635,
"rewards/margins": 0.08437344431877136,
"rewards/rejected": -0.39864879846572876,
"step": 800
},
{
"epoch": 0.5066079295154186,
"grad_norm": 119.4858169555664,
"learning_rate": 2.3323821549904038e-05,
"logits/chosen": -1.670577049255371,
"logits/rejected": -1.5533939599990845,
"logps/chosen": -3.9187912940979004,
"logps/rejected": -4.743254661560059,
"loss": 23.6037,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.3001677095890045,
"rewards/margins": 0.06169123575091362,
"rewards/rejected": -0.36185896396636963,
"step": 805
},
{
"epoch": 0.5097545626179988,
"grad_norm": 316.2997741699219,
"learning_rate": 2.310698871526966e-05,
"logits/chosen": -1.5207440853118896,
"logits/rejected": -1.6267799139022827,
"logps/chosen": -3.097418785095215,
"logps/rejected": -4.804646015167236,
"loss": 21.8575,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.22499537467956543,
"rewards/margins": 0.11616162210702896,
"rewards/rejected": -0.3411570191383362,
"step": 810
},
{
"epoch": 0.512901195720579,
"grad_norm": 78.00189971923828,
"learning_rate": 2.288978099138443e-05,
"logits/chosen": -1.5745933055877686,
"logits/rejected": -1.5564606189727783,
"logps/chosen": -2.8804163932800293,
"logps/rejected": -3.5308539867401123,
"loss": 22.241,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.20254115760326385,
"rewards/margins": 0.05405501648783684,
"rewards/rejected": -0.2565961480140686,
"step": 815
},
{
"epoch": 0.5160478288231592,
"grad_norm": 118.51643371582031,
"learning_rate": 2.267222458653179e-05,
"logits/chosen": -1.5091989040374756,
"logits/rejected": -1.6645923852920532,
"logps/chosen": -3.255237579345703,
"logps/rejected": -4.126650333404541,
"loss": 22.0187,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.23314771056175232,
"rewards/margins": 0.06177164986729622,
"rewards/rejected": -0.29491934180259705,
"step": 820
},
{
"epoch": 0.5191944619257395,
"grad_norm": 68.80047607421875,
"learning_rate": 2.245434575106702e-05,
"logits/chosen": -1.525356411933899,
"logits/rejected": -1.701436996459961,
"logps/chosen": -3.166797161102295,
"logps/rejected": -4.742985248565674,
"loss": 20.3686,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.2261020839214325,
"rewards/margins": 0.08829782903194427,
"rewards/rejected": -0.3143998980522156,
"step": 825
},
{
"epoch": 0.5223410950283197,
"grad_norm": 73.1375503540039,
"learning_rate": 2.223617077424988e-05,
"logits/chosen": -1.6771663427352905,
"logits/rejected": -1.7121098041534424,
"logps/chosen": -3.020296573638916,
"logps/rejected": -4.426422119140625,
"loss": 20.0836,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.21738722920417786,
"rewards/margins": 0.09777109324932098,
"rewards/rejected": -0.31515830755233765,
"step": 830
},
{
"epoch": 0.5254877281309,
"grad_norm": 76.68984985351562,
"learning_rate": 2.2017725981072536e-05,
"logits/chosen": -1.4603363275527954,
"logits/rejected": -1.5595886707305908,
"logps/chosen": -3.6973624229431152,
"logps/rejected": -5.027807712554932,
"loss": 20.512,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.2720819115638733,
"rewards/margins": 0.08642515540122986,
"rewards/rejected": -0.35850709676742554,
"step": 835
},
{
"epoch": 0.5286343612334802,
"grad_norm": 122.99668884277344,
"learning_rate": 2.1799037729083213e-05,
"logits/chosen": -1.5949891805648804,
"logits/rejected": -1.7137962579727173,
"logps/chosen": -3.5109829902648926,
"logps/rejected": -4.95348596572876,
"loss": 21.517,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.27030450105667114,
"rewards/margins": 0.09910550713539124,
"rewards/rejected": -0.36940997838974,
"step": 840
},
{
"epoch": 0.5317809943360604,
"grad_norm": 65.23582458496094,
"learning_rate": 2.1580132405205862e-05,
"logits/chosen": -1.4871020317077637,
"logits/rejected": -1.5624678134918213,
"logps/chosen": -4.474881172180176,
"logps/rejected": -5.375269412994385,
"loss": 23.3138,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.3389451503753662,
"rewards/margins": 0.06582923233509064,
"rewards/rejected": -0.40477436780929565,
"step": 845
},
{
"epoch": 0.5349276274386406,
"grad_norm": 175.08432006835938,
"learning_rate": 2.1361036422556337e-05,
"logits/chosen": -1.5353832244873047,
"logits/rejected": -1.596407175064087,
"logps/chosen": -3.814873218536377,
"logps/rejected": -4.92036771774292,
"loss": 21.5442,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.2893931567668915,
"rewards/margins": 0.07075894623994827,
"rewards/rejected": -0.36015206575393677,
"step": 850
},
{
"epoch": 0.5380742605412209,
"grad_norm": 64.21197509765625,
"learning_rate": 2.1141776217255365e-05,
"logits/chosen": -1.567317247390747,
"logits/rejected": -1.5555747747421265,
"logps/chosen": -3.8906242847442627,
"logps/rejected": -4.897479057312012,
"loss": 21.8379,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.29526472091674805,
"rewards/margins": 0.06354343891143799,
"rewards/rejected": -0.35880815982818604,
"step": 855
},
{
"epoch": 0.5412208936438011,
"grad_norm": 104.57052612304688,
"learning_rate": 2.0922378245238787e-05,
"logits/chosen": -1.5869696140289307,
"logits/rejected": -1.6049997806549072,
"logps/chosen": -3.8140482902526855,
"logps/rejected": -4.755133628845215,
"loss": 23.1968,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.29255491495132446,
"rewards/margins": 0.052004069089889526,
"rewards/rejected": -0.3445590138435364,
"step": 860
},
{
"epoch": 0.5443675267463813,
"grad_norm": 92.2053451538086,
"learning_rate": 2.070286897906537e-05,
"logits/chosen": -1.602929711341858,
"logits/rejected": -1.6071062088012695,
"logps/chosen": -3.990319013595581,
"logps/rejected": -5.2248215675354,
"loss": 20.3706,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.3073904812335968,
"rewards/margins": 0.09087739139795303,
"rewards/rejected": -0.39826786518096924,
"step": 865
},
{
"epoch": 0.5475141598489616,
"grad_norm": 83.128662109375,
"learning_rate": 2.0483274904722647e-05,
"logits/chosen": -1.7051680088043213,
"logits/rejected": -1.6087182760238647,
"logps/chosen": -3.986027956008911,
"logps/rejected": -4.851881980895996,
"loss": 21.4848,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.30843600630760193,
"rewards/margins": 0.06898938864469528,
"rewards/rejected": -0.3774254322052002,
"step": 870
},
{
"epoch": 0.5506607929515418,
"grad_norm": 62.2298583984375,
"learning_rate": 2.026362251843109e-05,
"logits/chosen": -1.6034513711929321,
"logits/rejected": -1.699464201927185,
"logps/chosen": -3.4193336963653564,
"logps/rejected": -4.403960227966309,
"loss": 21.3108,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.2610613703727722,
"rewards/margins": 0.08181565254926682,
"rewards/rejected": -0.34287700057029724,
"step": 875
},
{
"epoch": 0.5538074260541221,
"grad_norm": 88.62437438964844,
"learning_rate": 2.004393832344711e-05,
"logits/chosen": -1.6719697713851929,
"logits/rejected": -1.5851457118988037,
"logps/chosen": -3.8325066566467285,
"logps/rejected": -5.3017473220825195,
"loss": 19.635,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.3032756447792053,
"rewards/margins": 0.09231220185756683,
"rewards/rejected": -0.39558783173561096,
"step": 880
},
{
"epoch": 0.5569540591567024,
"grad_norm": 64.06165313720703,
"learning_rate": 1.9824248826865124e-05,
"logits/chosen": -1.5828460454940796,
"logits/rejected": -1.6327168941497803,
"logps/chosen": -4.681789398193359,
"logps/rejected": -6.566616058349609,
"loss": 18.3853,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.3667379915714264,
"rewards/margins": 0.12741395831108093,
"rewards/rejected": -0.49415192008018494,
"step": 885
},
{
"epoch": 0.5601006922592826,
"grad_norm": 204.93890380859375,
"learning_rate": 1.9604580536419254e-05,
"logits/chosen": -1.572584867477417,
"logits/rejected": -1.6088756322860718,
"logps/chosen": -5.441628456115723,
"logps/rejected": -7.085760593414307,
"loss": 24.9097,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.45653265714645386,
"rewards/margins": 0.0925588458776474,
"rewards/rejected": -0.5490914583206177,
"step": 890
},
{
"epoch": 0.5632473253618628,
"grad_norm": 162.79714965820312,
"learning_rate": 1.93849599572849e-05,
"logits/chosen": -1.6288610696792603,
"logits/rejected": -1.6398794651031494,
"logps/chosen": -5.213116645812988,
"logps/rejected": -6.9830803871154785,
"loss": 20.22,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.42777156829833984,
"rewards/margins": 0.12980665266513824,
"rewards/rejected": -0.5575782060623169,
"step": 895
},
{
"epoch": 0.5663939584644431,
"grad_norm": 75.16659545898438,
"learning_rate": 1.916541358888062e-05,
"logits/chosen": -1.6041675806045532,
"logits/rejected": -1.6970984935760498,
"logps/chosen": -4.644831657409668,
"logps/rejected": -5.80092716217041,
"loss": 20.4964,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.37974128127098083,
"rewards/margins": 0.09219308942556381,
"rewards/rejected": -0.47193440794944763,
"step": 900
},
{
"epoch": 0.5695405915670233,
"grad_norm": 110.90229797363281,
"learning_rate": 1.8945967921670676e-05,
"logits/chosen": -1.619327187538147,
"logits/rejected": -1.6541610956192017,
"logps/chosen": -5.146854400634766,
"logps/rejected": -6.011466026306152,
"loss": 22.4066,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.41492849588394165,
"rewards/margins": 0.07109946012496948,
"rewards/rejected": -0.48602795600891113,
"step": 905
},
{
"epoch": 0.5726872246696035,
"grad_norm": 139.65293884277344,
"learning_rate": 1.872664943396875e-05,
"logits/chosen": -1.6764265298843384,
"logits/rejected": -1.6785293817520142,
"logps/chosen": -4.107344150543213,
"logps/rejected": -5.6308698654174805,
"loss": 20.0103,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.3305855095386505,
"rewards/margins": 0.11647170782089233,
"rewards/rejected": -0.44705715775489807,
"step": 910
},
{
"epoch": 0.5758338577721838,
"grad_norm": 147.52713012695312,
"learning_rate": 1.8507484588743025e-05,
"logits/chosen": -1.7002742290496826,
"logits/rejected": -1.7680556774139404,
"logps/chosen": -4.6784772872924805,
"logps/rejected": -5.973324775695801,
"loss": 21.0769,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.3756680190563202,
"rewards/margins": 0.09194694459438324,
"rewards/rejected": -0.4676149785518646,
"step": 915
},
{
"epoch": 0.578980490874764,
"grad_norm": 71.16407012939453,
"learning_rate": 1.828849983042321e-05,
"logits/chosen": -1.7075554132461548,
"logits/rejected": -1.6953094005584717,
"logps/chosen": -4.460357666015625,
"logps/rejected": -5.521221160888672,
"loss": 21.7677,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.35953736305236816,
"rewards/margins": 0.08199813961982727,
"rewards/rejected": -0.44153547286987305,
"step": 920
},
{
"epoch": 0.5821271239773442,
"grad_norm": 114.27317810058594,
"learning_rate": 1.8069721581709697e-05,
"logits/chosen": -1.6304935216903687,
"logits/rejected": -1.6967551708221436,
"logps/chosen": -4.526963233947754,
"logps/rejected": -5.7123494148254395,
"loss": 21.5069,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.35851508378982544,
"rewards/margins": 0.07467497885227203,
"rewards/rejected": -0.4331900477409363,
"step": 925
},
{
"epoch": 0.5852737570799245,
"grad_norm": 71.74990844726562,
"learning_rate": 1.785117624038546e-05,
"logits/chosen": -1.704414963722229,
"logits/rejected": -1.7506616115570068,
"logps/chosen": -5.388034820556641,
"logps/rejected": -6.3465657234191895,
"loss": 21.8977,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.4279704689979553,
"rewards/margins": 0.05819786712527275,
"rewards/rejected": -0.48616838455200195,
"step": 930
},
{
"epoch": 0.5884203901825047,
"grad_norm": 78.14295196533203,
"learning_rate": 1.763289017613085e-05,
"logits/chosen": -1.6152721643447876,
"logits/rejected": -1.640634536743164,
"logps/chosen": -4.3263750076293945,
"logps/rejected": -5.279467582702637,
"loss": 21.887,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.34328222274780273,
"rewards/margins": 0.07140573114156723,
"rewards/rejected": -0.41468796133995056,
"step": 935
},
{
"epoch": 0.5915670232850849,
"grad_norm": 219.88279724121094,
"learning_rate": 1.741488972734184e-05,
"logits/chosen": -1.5857679843902588,
"logits/rejected": -1.65940260887146,
"logps/chosen": -4.669988632202148,
"logps/rejected": -6.202586650848389,
"loss": 20.5667,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.3540535271167755,
"rewards/margins": 0.10686901956796646,
"rewards/rejected": -0.46092256903648376,
"step": 940
},
{
"epoch": 0.5947136563876652,
"grad_norm": 90.00337219238281,
"learning_rate": 1.7197201197952065e-05,
"logits/chosen": -1.5206947326660156,
"logits/rejected": -1.53545343875885,
"logps/chosen": -4.086690902709961,
"logps/rejected": -4.490893363952637,
"loss": 25.9453,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.30406898260116577,
"rewards/margins": 0.034761372953653336,
"rewards/rejected": -0.3388303220272064,
"step": 945
},
{
"epoch": 0.5978602894902454,
"grad_norm": 79.93099212646484,
"learning_rate": 1.6979850854258938e-05,
"logits/chosen": -1.3608052730560303,
"logits/rejected": -1.4760938882827759,
"logps/chosen": -3.6326985359191895,
"logps/rejected": -5.186118125915527,
"loss": 20.6064,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.2708989083766937,
"rewards/margins": 0.10907317698001862,
"rewards/rejected": -0.37997210025787354,
"step": 950
},
{
"epoch": 0.6010069225928257,
"grad_norm": 54.11685562133789,
"learning_rate": 1.6762864921754426e-05,
"logits/chosen": -1.3788961172103882,
"logits/rejected": -1.4954605102539062,
"logps/chosen": -3.189054250717163,
"logps/rejected": -4.365990161895752,
"loss": 20.0193,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.23121857643127441,
"rewards/margins": 0.09906688332557678,
"rewards/rejected": -0.3302854597568512,
"step": 955
},
{
"epoch": 0.604153555695406,
"grad_norm": 78.23949432373047,
"learning_rate": 1.654626958196059e-05,
"logits/chosen": -1.509225606918335,
"logits/rejected": -1.4755313396453857,
"logps/chosen": -4.190049648284912,
"logps/rejected": -5.553238391876221,
"loss": 18.6024,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.3084833025932312,
"rewards/margins": 0.10999338328838348,
"rewards/rejected": -0.4184766709804535,
"step": 960
},
{
"epoch": 0.6073001887979862,
"grad_norm": 46.66254806518555,
"learning_rate": 1.633009096927062e-05,
"logits/chosen": -1.5157467126846313,
"logits/rejected": -1.6129589080810547,
"logps/chosen": -3.3808016777038574,
"logps/rejected": -4.686802864074707,
"loss": 18.8156,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.26101940870285034,
"rewards/margins": 0.10853584110736847,
"rewards/rejected": -0.3695552349090576,
"step": 965
},
{
"epoch": 0.6104468219005664,
"grad_norm": 76.67229461669922,
"learning_rate": 1.6114355167795407e-05,
"logits/chosen": -1.507666826248169,
"logits/rejected": -1.642401099205017,
"logps/chosen": -4.4493513107299805,
"logps/rejected": -5.8435235023498535,
"loss": 20.6314,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.32613635063171387,
"rewards/margins": 0.10264672338962555,
"rewards/rejected": -0.42878302931785583,
"step": 970
},
{
"epoch": 0.6135934550031467,
"grad_norm": 97.02481842041016,
"learning_rate": 1.5899088208216215e-05,
"logits/chosen": -1.501697301864624,
"logits/rejected": -1.594618558883667,
"logps/chosen": -4.284520149230957,
"logps/rejected": -4.852963447570801,
"loss": 26.4688,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.33568352460861206,
"rewards/margins": 0.03864779695868492,
"rewards/rejected": -0.37433135509490967,
"step": 975
},
{
"epoch": 0.6167400881057269,
"grad_norm": 176.32850646972656,
"learning_rate": 1.568431606464388e-05,
"logits/chosen": -1.595866084098816,
"logits/rejected": -1.6668930053710938,
"logps/chosen": -4.345438480377197,
"logps/rejected": -5.242307662963867,
"loss": 21.0145,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.3100406527519226,
"rewards/margins": 0.0767713412642479,
"rewards/rejected": -0.3868120312690735,
"step": 980
},
{
"epoch": 0.6198867212083071,
"grad_norm": 76.86431884765625,
"learning_rate": 1.547006465148471e-05,
"logits/chosen": -1.5940501689910889,
"logits/rejected": -1.7789547443389893,
"logps/chosen": -4.4857177734375,
"logps/rejected": -5.875302314758301,
"loss": 21.8847,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.3244941830635071,
"rewards/margins": 0.08251677453517914,
"rewards/rejected": -0.4070109724998474,
"step": 985
},
{
"epoch": 0.6230333543108874,
"grad_norm": 49.81745147705078,
"learning_rate": 1.5256359820313718e-05,
"logits/chosen": -1.550085425376892,
"logits/rejected": -1.5959933996200562,
"logps/chosen": -3.699030637741089,
"logps/rejected": -4.6470842361450195,
"loss": 20.7306,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.2788650095462799,
"rewards/margins": 0.0799705758690834,
"rewards/rejected": -0.3588356077671051,
"step": 990
},
{
"epoch": 0.6261799874134676,
"grad_norm": 81.01653289794922,
"learning_rate": 1.5043227356755292e-05,
"logits/chosen": -1.58163321018219,
"logits/rejected": -1.663260817527771,
"logps/chosen": -4.869448661804199,
"logps/rejected": -5.365525245666504,
"loss": 24.1646,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.34130221605300903,
"rewards/margins": 0.04551910609006882,
"rewards/rejected": -0.38682132959365845,
"step": 995
},
{
"epoch": 0.6293266205160478,
"grad_norm": 101.5945053100586,
"learning_rate": 1.4830692977371985e-05,
"logits/chosen": -1.747009038925171,
"logits/rejected": -1.7761609554290771,
"logps/chosen": -4.585317134857178,
"logps/rejected": -5.033480644226074,
"loss": 23.2309,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.3538682162761688,
"rewards/margins": 0.037090349942445755,
"rewards/rejected": -0.39095860719680786,
"step": 1000
},
{
"epoch": 0.632473253618628,
"grad_norm": 55.57672882080078,
"learning_rate": 1.4618782326561483e-05,
"logits/chosen": -1.7331736087799072,
"logits/rejected": -1.771627426147461,
"logps/chosen": -3.9518864154815674,
"logps/rejected": -4.847538948059082,
"loss": 20.4833,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.2925838530063629,
"rewards/margins": 0.0719093531370163,
"rewards/rejected": -0.3644932210445404,
"step": 1005
},
{
"epoch": 0.6356198867212083,
"grad_norm": 75.53394317626953,
"learning_rate": 1.4407520973462408e-05,
"logits/chosen": -1.7358888387680054,
"logits/rejected": -1.7642987966537476,
"logps/chosen": -4.450674057006836,
"logps/rejected": -5.2704572677612305,
"loss": 22.8124,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.3556494116783142,
"rewards/margins": 0.04838743433356285,
"rewards/rejected": -0.40403684973716736,
"step": 1010
},
{
"epoch": 0.6387665198237885,
"grad_norm": 67.8470230102539,
"learning_rate": 1.4196934408869118e-05,
"logits/chosen": -1.8153152465820312,
"logits/rejected": -1.8065166473388672,
"logps/chosen": -5.316075325012207,
"logps/rejected": -6.770912170410156,
"loss": 21.5925,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.3945319950580597,
"rewards/margins": 0.06610045582056046,
"rewards/rejected": -0.46063241362571716,
"step": 1015
},
{
"epoch": 0.6419131529263687,
"grad_norm": 104.53321075439453,
"learning_rate": 1.3987048042155977e-05,
"logits/chosen": -1.6470744609832764,
"logits/rejected": -1.6989984512329102,
"logps/chosen": -4.787189960479736,
"logps/rejected": -5.5443525314331055,
"loss": 22.5867,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.3872155249118805,
"rewards/margins": 0.05858270451426506,
"rewards/rejected": -0.44579824805259705,
"step": 1020
},
{
"epoch": 0.645059786028949,
"grad_norm": 122.49982452392578,
"learning_rate": 1.377788719821149e-05,
"logits/chosen": -1.6421356201171875,
"logits/rejected": -1.702820062637329,
"logps/chosen": -4.435242652893066,
"logps/rejected": -4.579672336578369,
"loss": 25.1424,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.3478389382362366,
"rewards/margins": 0.0215731430798769,
"rewards/rejected": -0.3694121241569519,
"step": 1025
},
{
"epoch": 0.6482064191315292,
"grad_norm": 145.1405487060547,
"learning_rate": 1.3569477114382568e-05,
"logits/chosen": -1.6365470886230469,
"logits/rejected": -1.6954962015151978,
"logps/chosen": -4.985340595245361,
"logps/rejected": -5.898791313171387,
"loss": 21.7627,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.385195255279541,
"rewards/margins": 0.05228766053915024,
"rewards/rejected": -0.43748289346694946,
"step": 1030
},
{
"epoch": 0.6513530522341096,
"grad_norm": 82.04701232910156,
"learning_rate": 1.3361842937429436e-05,
"logits/chosen": -1.6654088497161865,
"logits/rejected": -1.732187032699585,
"logps/chosen": -4.262317180633545,
"logps/rejected": -5.410677909851074,
"loss": 20.2359,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.3416784703731537,
"rewards/margins": 0.08638517558574677,
"rewards/rejected": -0.42806363105773926,
"step": 1035
},
{
"epoch": 0.6544996853366898,
"grad_norm": 95.95136260986328,
"learning_rate": 1.3155009720491368e-05,
"logits/chosen": -1.5801721811294556,
"logits/rejected": -1.5603923797607422,
"logps/chosen": -5.278650760650635,
"logps/rejected": -6.190367698669434,
"loss": 22.4881,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.3998781740665436,
"rewards/margins": 0.06602592766284943,
"rewards/rejected": -0.4659040868282318,
"step": 1040
},
{
"epoch": 0.65764631843927,
"grad_norm": 60.0530891418457,
"learning_rate": 1.2949002420063828e-05,
"logits/chosen": -1.6326820850372314,
"logits/rejected": -1.720810890197754,
"logps/chosen": -4.082489967346191,
"logps/rejected": -5.006215572357178,
"loss": 21.0105,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.32317107915878296,
"rewards/margins": 0.07444654405117035,
"rewards/rejected": -0.3976176679134369,
"step": 1045
},
{
"epoch": 0.6607929515418502,
"grad_norm": 221.81906127929688,
"learning_rate": 1.2743845892987183e-05,
"logits/chosen": -1.6526765823364258,
"logits/rejected": -1.697488784790039,
"logps/chosen": -4.53380823135376,
"logps/rejected": -5.771850109100342,
"loss": 23.2634,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.34263211488723755,
"rewards/margins": 0.07287438213825226,
"rewards/rejected": -0.4155064523220062,
"step": 1050
},
{
"epoch": 0.6639395846444305,
"grad_norm": 137.2283172607422,
"learning_rate": 1.2539564893447489e-05,
"logits/chosen": -1.631956696510315,
"logits/rejected": -1.654306173324585,
"logps/chosen": -4.1559600830078125,
"logps/rejected": -5.033182621002197,
"loss": 22.6183,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.32424792647361755,
"rewards/margins": 0.06618380546569824,
"rewards/rejected": -0.3904317319393158,
"step": 1055
},
{
"epoch": 0.6670862177470107,
"grad_norm": 72.95520782470703,
"learning_rate": 1.2336184069989663e-05,
"logits/chosen": -1.670440435409546,
"logits/rejected": -1.6872297525405884,
"logps/chosen": -3.9552032947540283,
"logps/rejected": -5.303035259246826,
"loss": 19.5681,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.31223589181900024,
"rewards/margins": 0.09164074063301086,
"rewards/rejected": -0.4038766026496887,
"step": 1060
},
{
"epoch": 0.6702328508495909,
"grad_norm": 90.91898345947266,
"learning_rate": 1.2133727962543356e-05,
"logits/chosen": -1.6696465015411377,
"logits/rejected": -1.6963016986846924,
"logps/chosen": -4.434679985046387,
"logps/rejected": -5.158357620239258,
"loss": 21.8675,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.3489730954170227,
"rewards/margins": 0.05557180196046829,
"rewards/rejected": -0.4045449197292328,
"step": 1065
},
{
"epoch": 0.6733794839521712,
"grad_norm": 185.79261779785156,
"learning_rate": 1.193222099946202e-05,
"logits/chosen": -1.6571991443634033,
"logits/rejected": -1.7073132991790771,
"logps/chosen": -4.607517242431641,
"logps/rejected": -5.376668930053711,
"loss": 22.3462,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.35802438855171204,
"rewards/margins": 0.0643647164106369,
"rewards/rejected": -0.42238911986351013,
"step": 1070
},
{
"epoch": 0.6765261170547514,
"grad_norm": 71.50703430175781,
"learning_rate": 1.1731687494575319e-05,
"logits/chosen": -1.585889458656311,
"logits/rejected": -1.6507800817489624,
"logps/chosen": -4.845611572265625,
"logps/rejected": -6.422255516052246,
"loss": 18.5681,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.37992939352989197,
"rewards/margins": 0.10727685689926147,
"rewards/rejected": -0.48720628023147583,
"step": 1075
},
{
"epoch": 0.6796727501573316,
"grad_norm": 210.3772430419922,
"learning_rate": 1.153215164425547e-05,
"logits/chosen": -1.5637327432632446,
"logits/rejected": -1.628791093826294,
"logps/chosen": -4.643498420715332,
"logps/rejected": -5.90508508682251,
"loss": 22.429,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.3530879020690918,
"rewards/margins": 0.07370196282863617,
"rewards/rejected": -0.42678990960121155,
"step": 1080
},
{
"epoch": 0.6828193832599119,
"grad_norm": 324.6168212890625,
"learning_rate": 1.133363752449768e-05,
"logits/chosen": -1.6127498149871826,
"logits/rejected": -1.5895841121673584,
"logps/chosen": -3.8858344554901123,
"logps/rejected": -5.141265392303467,
"loss": 18.9867,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.31075209379196167,
"rewards/margins": 0.10046511888504028,
"rewards/rejected": -0.41121721267700195,
"step": 1085
},
{
"epoch": 0.6859660163624921,
"grad_norm": 269.12744140625,
"learning_rate": 1.1136169088015177e-05,
"logits/chosen": -1.5152666568756104,
"logits/rejected": -1.5772387981414795,
"logps/chosen": -4.37540864944458,
"logps/rejected": -5.073463439941406,
"loss": 22.4614,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.3488185405731201,
"rewards/margins": 0.05801800638437271,
"rewards/rejected": -0.40683650970458984,
"step": 1090
},
{
"epoch": 0.6891126494650723,
"grad_norm": 407.53985595703125,
"learning_rate": 1.0939770161349015e-05,
"logits/chosen": -1.604278802871704,
"logits/rejected": -1.6394538879394531,
"logps/chosen": -4.725668907165527,
"logps/rejected": -6.037966728210449,
"loss": 23.0495,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.3864028751850128,
"rewards/margins": 0.09246636927127838,
"rewards/rejected": -0.4788691997528076,
"step": 1095
},
{
"epoch": 0.6922592825676526,
"grad_norm": 65.52562713623047,
"learning_rate": 1.0744464441993205e-05,
"logits/chosen": -1.4906436204910278,
"logits/rejected": -1.570569634437561,
"logps/chosen": -4.404895782470703,
"logps/rejected": -5.454612731933594,
"loss": 21.9146,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.3443445563316345,
"rewards/margins": 0.07481996715068817,
"rewards/rejected": -0.4191645085811615,
"step": 1100
},
{
"epoch": 0.6954059156702328,
"grad_norm": 60.899654388427734,
"learning_rate": 1.0550275495535382e-05,
"logits/chosen": -1.5062484741210938,
"logits/rejected": -1.5998207330703735,
"logps/chosen": -5.046140193939209,
"logps/rejected": -6.212726593017578,
"loss": 22.0906,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.3926524817943573,
"rewards/margins": 0.08822645246982574,
"rewards/rejected": -0.48087891936302185,
"step": 1105
},
{
"epoch": 0.6985525487728131,
"grad_norm": 85.36582946777344,
"learning_rate": 1.0357226752813343e-05,
"logits/chosen": -1.48141348361969,
"logits/rejected": -1.532138705253601,
"logps/chosen": -4.955922603607178,
"logps/rejected": -6.1522979736328125,
"loss": 19.2663,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.3955458402633667,
"rewards/margins": 0.09683366119861603,
"rewards/rejected": -0.4923795163631439,
"step": 1110
},
{
"epoch": 0.7016991818753934,
"grad_norm": 92.5035171508789,
"learning_rate": 1.0165341507087922e-05,
"logits/chosen": -1.4898306131362915,
"logits/rejected": -1.589817762374878,
"logps/chosen": -4.877270221710205,
"logps/rejected": -6.326567649841309,
"loss": 21.0751,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.3771550953388214,
"rewards/margins": 0.10272278636693954,
"rewards/rejected": -0.47987785935401917,
"step": 1115
},
{
"epoch": 0.7048458149779736,
"grad_norm": 100.18026733398438,
"learning_rate": 9.974642911232413e-06,
"logits/chosen": -1.5176981687545776,
"logits/rejected": -1.5406978130340576,
"logps/chosen": -5.319207191467285,
"logps/rejected": -6.242737770080566,
"loss": 20.9524,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.4210137724876404,
"rewards/margins": 0.07255946844816208,
"rewards/rejected": -0.49357327818870544,
"step": 1120
},
{
"epoch": 0.7079924480805538,
"grad_norm": 176.3753662109375,
"learning_rate": 9.785153974938912e-06,
"logits/chosen": -1.5830824375152588,
"logits/rejected": -1.6101982593536377,
"logps/chosen": -5.879128456115723,
"logps/rejected": -6.807085990905762,
"loss": 22.111,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.44119253754615784,
"rewards/margins": 0.07570262253284454,
"rewards/rejected": -0.5168951749801636,
"step": 1125
},
{
"epoch": 0.7111390811831341,
"grad_norm": 67.40308380126953,
"learning_rate": 9.596897561942026e-06,
"logits/chosen": -1.463176965713501,
"logits/rejected": -1.4804832935333252,
"logps/chosen": -4.481048107147217,
"logps/rejected": -5.287797451019287,
"loss": 22.1994,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.3575854003429413,
"rewards/margins": 0.06364957243204117,
"rewards/rejected": -0.42123493552207947,
"step": 1130
},
{
"epoch": 0.7142857142857143,
"grad_norm": 93.39257049560547,
"learning_rate": 9.409896387260082e-06,
"logits/chosen": -1.4179964065551758,
"logits/rejected": -1.4655730724334717,
"logps/chosen": -4.708760738372803,
"logps/rejected": -6.217686653137207,
"loss": 21.4161,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.37350720167160034,
"rewards/margins": 0.10105752944946289,
"rewards/rejected": -0.47456473112106323,
"step": 1135
},
{
"epoch": 0.7174323473882945,
"grad_norm": 97.41583251953125,
"learning_rate": 9.224173014454372e-06,
"logits/chosen": -1.4397246837615967,
"logits/rejected": -1.4766523838043213,
"logps/chosen": -4.817109107971191,
"logps/rejected": -6.214907169342041,
"loss": 22.7104,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.384985089302063,
"rewards/margins": 0.0952010303735733,
"rewards/rejected": -0.4801861345767975,
"step": 1140
},
{
"epoch": 0.7205789804908748,
"grad_norm": 103.4198989868164,
"learning_rate": 9.039749852906606e-06,
"logits/chosen": -1.368666648864746,
"logits/rejected": -1.4239342212677002,
"logps/chosen": -4.382673740386963,
"logps/rejected": -5.262811183929443,
"loss": 20.8727,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.35242724418640137,
"rewards/margins": 0.075123131275177,
"rewards/rejected": -0.42755040526390076,
"step": 1145
},
{
"epoch": 0.723725613593455,
"grad_norm": 131.38589477539062,
"learning_rate": 8.856649155115002e-06,
"logits/chosen": -1.409711241722107,
"logits/rejected": -1.455235481262207,
"logps/chosen": -4.550191402435303,
"logps/rejected": -5.52540922164917,
"loss": 23.0103,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.3620757460594177,
"rewards/margins": 0.06903600692749023,
"rewards/rejected": -0.43111175298690796,
"step": 1150
},
{
"epoch": 0.7268722466960352,
"grad_norm": 60.0385627746582,
"learning_rate": 8.674893014009311e-06,
"logits/chosen": -1.3705095052719116,
"logits/rejected": -1.4764083623886108,
"logps/chosen": -4.423483848571777,
"logps/rejected": -5.486600875854492,
"loss": 21.3505,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.3566300570964813,
"rewards/margins": 0.07945708185434341,
"rewards/rejected": -0.4360871911048889,
"step": 1155
},
{
"epoch": 0.7300188797986155,
"grad_norm": 80.497802734375,
"learning_rate": 8.494503360285084e-06,
"logits/chosen": -1.406087875366211,
"logits/rejected": -1.5597848892211914,
"logps/chosen": -4.28043270111084,
"logps/rejected": -5.639766216278076,
"loss": 21.9094,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.3276395797729492,
"rewards/margins": 0.07139433920383453,
"rewards/rejected": -0.39903393387794495,
"step": 1160
},
{
"epoch": 0.7331655129011957,
"grad_norm": 106.78560638427734,
"learning_rate": 8.315501959757506e-06,
"logits/chosen": -1.4479920864105225,
"logits/rejected": -1.530386209487915,
"logps/chosen": -5.356269836425781,
"logps/rejected": -6.295357704162598,
"loss": 20.2622,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.39192715287208557,
"rewards/margins": 0.07843243330717087,
"rewards/rejected": -0.47035956382751465,
"step": 1165
},
{
"epoch": 0.7363121460037759,
"grad_norm": 70.2252426147461,
"learning_rate": 8.137910410735119e-06,
"logits/chosen": -1.3913201093673706,
"logits/rejected": -1.5211797952651978,
"logps/chosen": -4.186515808105469,
"logps/rejected": -5.630705833435059,
"loss": 19.5955,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.3086017966270447,
"rewards/margins": 0.1026659831404686,
"rewards/rejected": -0.4112677574157715,
"step": 1170
},
{
"epoch": 0.7394587791063562,
"grad_norm": 192.9811553955078,
"learning_rate": 7.961750141413811e-06,
"logits/chosen": -1.4113714694976807,
"logits/rejected": -1.4863709211349487,
"logps/chosen": -4.043957710266113,
"logps/rejected": -4.903926849365234,
"loss": 21.1766,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.30225270986557007,
"rewards/margins": 0.0713002160191536,
"rewards/rejected": -0.3735528886318207,
"step": 1175
},
{
"epoch": 0.7426054122089364,
"grad_norm": 120.66477966308594,
"learning_rate": 7.787042407291236e-06,
"logits/chosen": -1.4459470510482788,
"logits/rejected": -1.4732497930526733,
"logps/chosen": -4.194180488586426,
"logps/rejected": -5.103634834289551,
"loss": 21.7414,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.33040323853492737,
"rewards/margins": 0.07123108208179474,
"rewards/rejected": -0.4016343653202057,
"step": 1180
},
{
"epoch": 0.7457520453115167,
"grad_norm": 76.2223892211914,
"learning_rate": 7.613808288602185e-06,
"logits/chosen": -1.3101516962051392,
"logits/rejected": -1.410070776939392,
"logps/chosen": -3.897928237915039,
"logps/rejected": -4.853459358215332,
"loss": 20.4936,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.30166110396385193,
"rewards/margins": 0.07222743332386017,
"rewards/rejected": -0.3738885223865509,
"step": 1185
},
{
"epoch": 0.748898678414097,
"grad_norm": 71.2408676147461,
"learning_rate": 7.442068687774983e-06,
"logits/chosen": -1.3900350332260132,
"logits/rejected": -1.4306429624557495,
"logps/chosen": -4.03500509262085,
"logps/rejected": -4.971550941467285,
"loss": 20.8514,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.31287750601768494,
"rewards/margins": 0.07024455070495605,
"rewards/rejected": -0.383122056722641,
"step": 1190
},
{
"epoch": 0.7520453115166772,
"grad_norm": 174.92535400390625,
"learning_rate": 7.271844326909465e-06,
"logits/chosen": -1.3968006372451782,
"logits/rejected": -1.3997862339019775,
"logps/chosen": -4.94242000579834,
"logps/rejected": -5.543642520904541,
"loss": 23.6965,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.36712104082107544,
"rewards/margins": 0.041298940777778625,
"rewards/rejected": -0.40841999650001526,
"step": 1195
},
{
"epoch": 0.7551919446192574,
"grad_norm": 83.12405395507812,
"learning_rate": 7.1031557452765934e-06,
"logits/chosen": -1.4155142307281494,
"logits/rejected": -1.4555690288543701,
"logps/chosen": -3.987143039703369,
"logps/rejected": -5.240988731384277,
"loss": 20.4557,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.312110960483551,
"rewards/margins": 0.08850479125976562,
"rewards/rejected": -0.40061575174331665,
"step": 1200
},
{
"epoch": 0.7583385777218377,
"grad_norm": 82.25894165039062,
"learning_rate": 6.936023296840211e-06,
"logits/chosen": -1.3227570056915283,
"logits/rejected": -1.4542601108551025,
"logps/chosen": -4.520358562469482,
"logps/rejected": -5.628200531005859,
"loss": 21.0717,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.3516823947429657,
"rewards/margins": 0.07068557292222977,
"rewards/rejected": -0.42236796021461487,
"step": 1205
},
{
"epoch": 0.7614852108244179,
"grad_norm": 63.93009567260742,
"learning_rate": 6.770467147801152e-06,
"logits/chosen": -1.3352692127227783,
"logits/rejected": -1.4765124320983887,
"logps/chosen": -3.903353452682495,
"logps/rejected": -5.777923107147217,
"loss": 18.1176,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.30418699979782104,
"rewards/margins": 0.1330038160085678,
"rewards/rejected": -0.43719083070755005,
"step": 1210
},
{
"epoch": 0.7646318439269981,
"grad_norm": 123.6090087890625,
"learning_rate": 6.606507274163949e-06,
"logits/chosen": -1.4196144342422485,
"logits/rejected": -1.5160802602767944,
"logps/chosen": -4.3763604164123535,
"logps/rejected": -5.507956504821777,
"loss": 21.3593,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.3455001711845398,
"rewards/margins": 0.08908528089523315,
"rewards/rejected": -0.43458548188209534,
"step": 1215
},
{
"epoch": 0.7677784770295784,
"grad_norm": 79.51527404785156,
"learning_rate": 6.444163459326569e-06,
"logits/chosen": -1.3841816186904907,
"logits/rejected": -1.44673752784729,
"logps/chosen": -4.642246723175049,
"logps/rejected": -5.952216625213623,
"loss": 20.2826,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.37046322226524353,
"rewards/margins": 0.10095451772212982,
"rewards/rejected": -0.47141775488853455,
"step": 1220
},
{
"epoch": 0.7709251101321586,
"grad_norm": 115.33991241455078,
"learning_rate": 6.283455291693303e-06,
"logits/chosen": -1.2804498672485352,
"logits/rejected": -1.336126446723938,
"logps/chosen": -4.530810356140137,
"logps/rejected": -5.714901924133301,
"loss": 23.5811,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.3543975353240967,
"rewards/margins": 0.07916755974292755,
"rewards/rejected": -0.43356508016586304,
"step": 1225
},
{
"epoch": 0.7740717432347388,
"grad_norm": 102.68350219726562,
"learning_rate": 6.124402162311274e-06,
"logits/chosen": -1.3455007076263428,
"logits/rejected": -1.3819594383239746,
"logps/chosen": -4.560150146484375,
"logps/rejected": -5.909863471984863,
"loss": 21.4806,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.36466413736343384,
"rewards/margins": 0.07453545182943344,
"rewards/rejected": -0.4391995966434479,
"step": 1230
},
{
"epoch": 0.777218376337319,
"grad_norm": 78.0007553100586,
"learning_rate": 5.9670232625306955e-06,
"logits/chosen": -1.3267484903335571,
"logits/rejected": -1.3938989639282227,
"logps/chosen": -4.1908979415893555,
"logps/rejected": -4.819875240325928,
"loss": 24.9323,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.3211560845375061,
"rewards/margins": 0.054157011210918427,
"rewards/rejected": -0.3753131031990051,
"step": 1235
},
{
"epoch": 0.7803650094398993,
"grad_norm": 910.2023315429688,
"learning_rate": 5.81133758168922e-06,
"logits/chosen": -1.4007585048675537,
"logits/rejected": -1.4542076587677002,
"logps/chosen": -5.091724872589111,
"logps/rejected": -6.444447994232178,
"loss": 20.9318,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.36028310656547546,
"rewards/margins": 0.09358057379722595,
"rewards/rejected": -0.4538637101650238,
"step": 1240
},
{
"epoch": 0.7835116425424795,
"grad_norm": 68.25672912597656,
"learning_rate": 5.6573639048207315e-06,
"logits/chosen": -1.3604391813278198,
"logits/rejected": -1.3182973861694336,
"logps/chosen": -4.621526718139648,
"logps/rejected": -5.245944023132324,
"loss": 21.9955,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.32450687885284424,
"rewards/margins": 0.07323630154132843,
"rewards/rejected": -0.3977431654930115,
"step": 1245
},
{
"epoch": 0.7866582756450597,
"grad_norm": 106.51322937011719,
"learning_rate": 5.5051208103887025e-06,
"logits/chosen": -1.3608815670013428,
"logits/rejected": -1.4448637962341309,
"logps/chosen": -4.045924663543701,
"logps/rejected": -5.57630729675293,
"loss": 20.889,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.3083491623401642,
"rewards/margins": 0.09919731318950653,
"rewards/rejected": -0.40754643082618713,
"step": 1250
},
{
"epoch": 0.78980490874764,
"grad_norm": 70.59749603271484,
"learning_rate": 5.354626668044535e-06,
"logits/chosen": -1.3460859060287476,
"logits/rejected": -1.412706732749939,
"logps/chosen": -3.734891891479492,
"logps/rejected": -4.818475246429443,
"loss": 21.0468,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.2917167842388153,
"rewards/margins": 0.07943135499954224,
"rewards/rejected": -0.37114813923835754,
"step": 1255
},
{
"epoch": 0.7929515418502202,
"grad_norm": 83.2120361328125,
"learning_rate": 5.205899636411078e-06,
"logits/chosen": -1.3329652547836304,
"logits/rejected": -1.3952248096466064,
"logps/chosen": -4.460053443908691,
"logps/rejected": -4.993377685546875,
"loss": 25.4182,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.34508955478668213,
"rewards/margins": 0.03861779719591141,
"rewards/rejected": -0.38370734453201294,
"step": 1260
},
{
"epoch": 0.7960981749528006,
"grad_norm": 74.94086456298828,
"learning_rate": 5.058957660891613e-06,
"logits/chosen": -1.353829264640808,
"logits/rejected": -1.36537766456604,
"logps/chosen": -3.8537967205047607,
"logps/rejected": -4.86336612701416,
"loss": 21.0046,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.29704272747039795,
"rewards/margins": 0.07927833497524261,
"rewards/rejected": -0.376321017742157,
"step": 1265
},
{
"epoch": 0.7992448080553808,
"grad_norm": 68.53548431396484,
"learning_rate": 4.913818471504552e-06,
"logits/chosen": -1.3891483545303345,
"logits/rejected": -1.4956327676773071,
"logps/chosen": -3.83349609375,
"logps/rejected": -5.111277103424072,
"loss": 20.258,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.299319326877594,
"rewards/margins": 0.09995778650045395,
"rewards/rejected": -0.3992771506309509,
"step": 1270
},
{
"epoch": 0.802391441157961,
"grad_norm": 161.29922485351562,
"learning_rate": 4.770499580744125e-06,
"logits/chosen": -1.3398183584213257,
"logits/rejected": -1.3453642129898071,
"logps/chosen": -3.9315247535705566,
"logps/rejected": -4.841611862182617,
"loss": 22.4824,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.30572158098220825,
"rewards/margins": 0.06097061559557915,
"rewards/rejected": -0.3666921854019165,
"step": 1275
},
{
"epoch": 0.8055380742605412,
"grad_norm": 68.45879364013672,
"learning_rate": 4.629018281467357e-06,
"logits/chosen": -1.297154188156128,
"logits/rejected": -1.338921070098877,
"logps/chosen": -3.7794177532196045,
"logps/rejected": -4.509110927581787,
"loss": 21.658,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.2888321876525879,
"rewards/margins": 0.05916588753461838,
"rewards/rejected": -0.3479980528354645,
"step": 1280
},
{
"epoch": 0.8086847073631215,
"grad_norm": 74.77375793457031,
"learning_rate": 4.489391644807462e-06,
"logits/chosen": -1.4385647773742676,
"logits/rejected": -1.5144340991973877,
"logps/chosen": -3.69215726852417,
"logps/rejected": -4.667183876037598,
"loss": 21.0338,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.2842097282409668,
"rewards/margins": 0.07265909761190414,
"rewards/rejected": -0.35686883330345154,
"step": 1285
},
{
"epoch": 0.8118313404657017,
"grad_norm": 78.63387298583984,
"learning_rate": 4.351636518114091e-06,
"logits/chosen": -1.3093000650405884,
"logits/rejected": -1.3893928527832031,
"logps/chosen": -3.599902629852295,
"logps/rejected": -4.570587635040283,
"loss": 22.1635,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.2749050259590149,
"rewards/margins": 0.08025064319372177,
"rewards/rejected": -0.3551556468009949,
"step": 1290
},
{
"epoch": 0.8149779735682819,
"grad_norm": 78.53893280029297,
"learning_rate": 4.215769522920487e-06,
"logits/chosen": -1.2443653345108032,
"logits/rejected": -1.3605782985687256,
"logps/chosen": -3.2713770866394043,
"logps/rejected": -4.569630146026611,
"loss": 20.9369,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.24853453040122986,
"rewards/margins": 0.10017738491296768,
"rewards/rejected": -0.34871190786361694,
"step": 1295
},
{
"epoch": 0.8181246066708622,
"grad_norm": 82.4554672241211,
"learning_rate": 4.0818070529379715e-06,
"logits/chosen": -1.383690357208252,
"logits/rejected": -1.4704560041427612,
"logps/chosen": -4.524319171905518,
"logps/rejected": -5.7077460289001465,
"loss": 21.9118,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.316571444272995,
"rewards/margins": 0.0641409307718277,
"rewards/rejected": -0.3807123601436615,
"step": 1300
},
{
"epoch": 0.8212712397734424,
"grad_norm": 71.1880111694336,
"learning_rate": 3.949765272077843e-06,
"logits/chosen": -1.3107343912124634,
"logits/rejected": -1.3561115264892578,
"logps/chosen": -3.846195936203003,
"logps/rejected": -4.79428768157959,
"loss": 21.0994,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.27155357599258423,
"rewards/margins": 0.07163957506418228,
"rewards/rejected": -0.3431931734085083,
"step": 1305
},
{
"epoch": 0.8244178728760226,
"grad_norm": 50.073204040527344,
"learning_rate": 3.819660112501053e-06,
"logits/chosen": -1.2764497995376587,
"logits/rejected": -1.3517284393310547,
"logps/chosen": -3.5745315551757812,
"logps/rejected": -4.921723365783691,
"loss": 19.6469,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.27279648184776306,
"rewards/margins": 0.1019618958234787,
"rewards/rejected": -0.37475839257240295,
"step": 1310
},
{
"epoch": 0.8275645059786029,
"grad_norm": 83.62207794189453,
"learning_rate": 3.6915072726958514e-06,
"logits/chosen": -1.2466180324554443,
"logits/rejected": -1.2861813306808472,
"logps/chosen": -3.430490016937256,
"logps/rejected": -4.824821949005127,
"loss": 20.5161,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.2643739581108093,
"rewards/margins": 0.1028999462723732,
"rewards/rejected": -0.3672739565372467,
"step": 1315
},
{
"epoch": 0.8307111390811831,
"grad_norm": 76.6629638671875,
"learning_rate": 3.5653222155835686e-06,
"logits/chosen": -1.2766977548599243,
"logits/rejected": -1.3114259243011475,
"logps/chosen": -4.222517967224121,
"logps/rejected": -5.029845714569092,
"loss": 22.1218,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.3145274221897125,
"rewards/margins": 0.06165830045938492,
"rewards/rejected": -0.37618574500083923,
"step": 1320
},
{
"epoch": 0.8338577721837633,
"grad_norm": 159.4115447998047,
"learning_rate": 3.4411201666529003e-06,
"logits/chosen": -1.3758924007415771,
"logits/rejected": -1.4244683980941772,
"logps/chosen": -4.457423210144043,
"logps/rejected": -5.342848300933838,
"loss": 23.3834,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.31679028272628784,
"rewards/margins": 0.06267707049846649,
"rewards/rejected": -0.3794673979282379,
"step": 1325
},
{
"epoch": 0.8370044052863436,
"grad_norm": 56.71870803833008,
"learning_rate": 3.3189161121227564e-06,
"logits/chosen": -1.3166803121566772,
"logits/rejected": -1.385522723197937,
"logps/chosen": -3.8323776721954346,
"logps/rejected": -4.732277870178223,
"loss": 23.3384,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.2978932559490204,
"rewards/margins": 0.0644349679350853,
"rewards/rejected": -0.3623282313346863,
"step": 1330
},
{
"epoch": 0.8401510383889238,
"grad_norm": 66.62996673583984,
"learning_rate": 3.198724797134074e-06,
"logits/chosen": -1.2822662591934204,
"logits/rejected": -1.4124181270599365,
"logps/chosen": -3.9724369049072266,
"logps/rejected": -5.0466437339782715,
"loss": 22.4903,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.2994682192802429,
"rewards/margins": 0.0788046196103096,
"rewards/rejected": -0.3782728910446167,
"step": 1335
},
{
"epoch": 0.8432976714915041,
"grad_norm": 70.8177261352539,
"learning_rate": 3.080560723970616e-06,
"logits/chosen": -1.2813329696655273,
"logits/rejected": -1.3586981296539307,
"logps/chosen": -3.6214439868927,
"logps/rejected": -4.637081623077393,
"loss": 20.5515,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.28146275877952576,
"rewards/margins": 0.07804764062166214,
"rewards/rejected": -0.3595103919506073,
"step": 1340
},
{
"epoch": 0.8464443045940844,
"grad_norm": 64.40753173828125,
"learning_rate": 2.96443815030917e-06,
"logits/chosen": -1.3396605253219604,
"logits/rejected": -1.4255945682525635,
"logps/chosen": -3.604154586791992,
"logps/rejected": -4.95128059387207,
"loss": 20.7037,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.2776135206222534,
"rewards/margins": 0.09353432059288025,
"rewards/rejected": -0.37114784121513367,
"step": 1345
},
{
"epoch": 0.8495909376966646,
"grad_norm": 93.99842071533203,
"learning_rate": 2.850371087499195e-06,
"logits/chosen": -1.381260633468628,
"logits/rejected": -1.4631612300872803,
"logps/chosen": -4.883763790130615,
"logps/rejected": -6.07845401763916,
"loss": 21.0858,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.3591059148311615,
"rewards/margins": 0.09570769965648651,
"rewards/rejected": -0.4548136591911316,
"step": 1350
},
{
"epoch": 0.8527375707992448,
"grad_norm": 62.075279235839844,
"learning_rate": 2.7383732988722057e-06,
"logits/chosen": -1.3089946508407593,
"logits/rejected": -1.3634613752365112,
"logps/chosen": -3.7724010944366455,
"logps/rejected": -4.929832458496094,
"loss": 19.0202,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.273120641708374,
"rewards/margins": 0.09602681547403336,
"rewards/rejected": -0.36914747953414917,
"step": 1355
},
{
"epoch": 0.8558842039018251,
"grad_norm": 80.0210189819336,
"learning_rate": 2.6284582980811136e-06,
"logits/chosen": -1.4461333751678467,
"logits/rejected": -1.370339035987854,
"logps/chosen": -4.136780738830566,
"logps/rejected": -5.008397579193115,
"loss": 23.5672,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.3027392327785492,
"rewards/margins": 0.062295325100421906,
"rewards/rejected": -0.3650345206260681,
"step": 1360
},
{
"epoch": 0.8590308370044053,
"grad_norm": 169.91099548339844,
"learning_rate": 2.5206393474696422e-06,
"logits/chosen": -1.2922241687774658,
"logits/rejected": -1.3685882091522217,
"logps/chosen": -3.8860459327697754,
"logps/rejected": -4.820228099822998,
"loss": 20.1345,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.2878992557525635,
"rewards/margins": 0.07816118001937866,
"rewards/rejected": -0.36606043577194214,
"step": 1365
},
{
"epoch": 0.8621774701069855,
"grad_norm": 291.87542724609375,
"learning_rate": 2.4149294564721146e-06,
"logits/chosen": -1.390933632850647,
"logits/rejected": -1.477757215499878,
"logps/chosen": -4.5947346687316895,
"logps/rejected": -5.662859916687012,
"loss": 22.1173,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.32581329345703125,
"rewards/margins": 0.0882103443145752,
"rewards/rejected": -0.4140236973762512,
"step": 1370
},
{
"epoch": 0.8653241032095658,
"grad_norm": 50.774810791015625,
"learning_rate": 2.3113413800437145e-06,
"logits/chosen": -1.3678381443023682,
"logits/rejected": -1.4147788286209106,
"logps/chosen": -4.411424160003662,
"logps/rejected": -5.547976970672607,
"loss": 20.419,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.3136950135231018,
"rewards/margins": 0.08119923621416092,
"rewards/rejected": -0.3948942720890045,
"step": 1375
},
{
"epoch": 0.868470736312146,
"grad_norm": 75.1661605834961,
"learning_rate": 2.2098876171215e-06,
"logits/chosen": -1.2949163913726807,
"logits/rejected": -1.4591166973114014,
"logps/chosen": -3.913958787918091,
"logps/rejected": -4.945563316345215,
"loss": 20.5075,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.30175596475601196,
"rewards/margins": 0.09277000278234482,
"rewards/rejected": -0.394525945186615,
"step": 1380
},
{
"epoch": 0.8716173694147262,
"grad_norm": 116.18523406982422,
"learning_rate": 2.110580409116261e-06,
"logits/chosen": -1.3234283924102783,
"logits/rejected": -1.3651349544525146,
"logps/chosen": -4.782530307769775,
"logps/rejected": -5.800885200500488,
"loss": 22.8406,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.3511677384376526,
"rewards/margins": 0.07397367060184479,
"rewards/rejected": -0.4251413345336914,
"step": 1385
},
{
"epoch": 0.8747640025173065,
"grad_norm": 145.46861267089844,
"learning_rate": 2.013431738435465e-06,
"logits/chosen": -1.3332188129425049,
"logits/rejected": -1.4134724140167236,
"logps/chosen": -4.268718242645264,
"logps/rejected": -5.433601379394531,
"loss": 22.5056,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.3332800269126892,
"rewards/margins": 0.07072637230157852,
"rewards/rejected": -0.4040064215660095,
"step": 1390
},
{
"epoch": 0.8779106356198867,
"grad_norm": 117.83720397949219,
"learning_rate": 1.9184533270374928e-06,
"logits/chosen": -1.3927792310714722,
"logits/rejected": -1.4590123891830444,
"logps/chosen": -4.519114017486572,
"logps/rejected": -5.810807228088379,
"loss": 21.2018,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.34003710746765137,
"rewards/margins": 0.08822458237409592,
"rewards/rejected": -0.4282616972923279,
"step": 1395
},
{
"epoch": 0.8810572687224669,
"grad_norm": 128.75563049316406,
"learning_rate": 1.8256566350172211e-06,
"logits/chosen": -1.4642970561981201,
"logits/rejected": -1.56011962890625,
"logps/chosen": -5.124087810516357,
"logps/rejected": -6.271437168121338,
"loss": 20.9824,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.34806352853775024,
"rewards/margins": 0.0969148576259613,
"rewards/rejected": -0.44497838616371155,
"step": 1400
},
{
"epoch": 0.8842039018250472,
"grad_norm": 88.87577056884766,
"learning_rate": 1.7350528592232962e-06,
"logits/chosen": -1.3359493017196655,
"logits/rejected": -1.4811887741088867,
"logps/chosen": -4.525036811828613,
"logps/rejected": -5.623012542724609,
"loss": 22.1104,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.3581879138946533,
"rewards/margins": 0.07608196139335632,
"rewards/rejected": -0.43426984548568726,
"step": 1405
},
{
"epoch": 0.8873505349276274,
"grad_norm": 69.19255065917969,
"learning_rate": 1.6466529319070735e-06,
"logits/chosen": -1.2726246118545532,
"logits/rejected": -1.39580237865448,
"logps/chosen": -3.7457852363586426,
"logps/rejected": -5.324977397918701,
"loss": 18.2434,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.2871127724647522,
"rewards/margins": 0.11219409853219986,
"rewards/rejected": -0.39930686354637146,
"step": 1410
},
{
"epoch": 0.8904971680302077,
"grad_norm": 73.79737854003906,
"learning_rate": 1.560467519403579e-06,
"logits/chosen": -1.3266379833221436,
"logits/rejected": -1.3948261737823486,
"logps/chosen": -4.1067681312561035,
"logps/rejected": -4.673392295837402,
"loss": 22.1702,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.3159501254558563,
"rewards/margins": 0.04971395805478096,
"rewards/rejected": -0.3656640946865082,
"step": 1415
},
{
"epoch": 0.893643801132788,
"grad_norm": 106.870361328125,
"learning_rate": 1.4765070208444732e-06,
"logits/chosen": -1.3216549158096313,
"logits/rejected": -1.35343337059021,
"logps/chosen": -4.343778133392334,
"logps/rejected": -5.122066497802734,
"loss": 22.7187,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.33430585265159607,
"rewards/margins": 0.06294408440589905,
"rewards/rejected": -0.3972499370574951,
"step": 1420
},
{
"epoch": 0.8967904342353682,
"grad_norm": 62.6711311340332,
"learning_rate": 1.3947815669033026e-06,
"logits/chosen": -1.3594673871994019,
"logits/rejected": -1.4739999771118164,
"logps/chosen": -4.087611198425293,
"logps/rejected": -5.339770317077637,
"loss": 20.526,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.31806594133377075,
"rewards/margins": 0.08883042633533478,
"rewards/rejected": -0.40689635276794434,
"step": 1425
},
{
"epoch": 0.8999370673379484,
"grad_norm": 98.1043930053711,
"learning_rate": 1.3153010185731495e-06,
"logits/chosen": -1.2508734464645386,
"logits/rejected": -1.32900869846344,
"logps/chosen": -4.235801696777344,
"logps/rejected": -5.670529842376709,
"loss": 20.3076,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.3296756148338318,
"rewards/margins": 0.09636791795492172,
"rewards/rejected": -0.4260435700416565,
"step": 1430
},
{
"epoch": 0.9030837004405287,
"grad_norm": 87.73750305175781,
"learning_rate": 1.2380749659767766e-06,
"logits/chosen": -1.3343340158462524,
"logits/rejected": -1.3880221843719482,
"logps/chosen": -4.322578430175781,
"logps/rejected": -5.371191501617432,
"loss": 20.9961,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.33630794286727905,
"rewards/margins": 0.0794602781534195,
"rewards/rejected": -0.41576823592185974,
"step": 1435
},
{
"epoch": 0.9062303335431089,
"grad_norm": 72.0036392211914,
"learning_rate": 1.1631127272095077e-06,
"logits/chosen": -1.3422092199325562,
"logits/rejected": -1.4017739295959473,
"logps/chosen": -3.97587251663208,
"logps/rejected": -5.63102388381958,
"loss": 18.4484,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.302670419216156,
"rewards/margins": 0.1103433147072792,
"rewards/rejected": -0.413013756275177,
"step": 1440
},
{
"epoch": 0.9093769666456891,
"grad_norm": 55.72761917114258,
"learning_rate": 1.0904233472148862e-06,
"logits/chosen": -1.4325498342514038,
"logits/rejected": -1.5191594362258911,
"logps/chosen": -4.523946285247803,
"logps/rejected": -5.913887023925781,
"loss": 20.9945,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.34643903374671936,
"rewards/margins": 0.07747067511081696,
"rewards/rejected": -0.4239097237586975,
"step": 1445
},
{
"epoch": 0.9125235997482694,
"grad_norm": 74.03398132324219,
"learning_rate": 1.0200155966933333e-06,
"logits/chosen": -1.3860814571380615,
"logits/rejected": -1.4824600219726562,
"logps/chosen": -4.180668830871582,
"logps/rejected": -5.086295127868652,
"loss": 22.6256,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.31416332721710205,
"rewards/margins": 0.06807545572519302,
"rewards/rejected": -0.3822387754917145,
"step": 1450
},
{
"epoch": 0.9156702328508496,
"grad_norm": 55.17578887939453,
"learning_rate": 9.51897971043847e-07,
"logits/chosen": -1.277956485748291,
"logits/rejected": -1.4699045419692993,
"logps/chosen": -3.923815965652466,
"logps/rejected": -5.776226997375488,
"loss": 18.1837,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.30271369218826294,
"rewards/margins": 0.13357527554035187,
"rewards/rejected": -0.4362889230251312,
"step": 1455
},
{
"epoch": 0.9188168659534298,
"grad_norm": 67.42135620117188,
"learning_rate": 8.860786893389761e-07,
"logits/chosen": -1.3501498699188232,
"logits/rejected": -1.4162402153015137,
"logps/chosen": -4.456291198730469,
"logps/rejected": -4.891867637634277,
"loss": 23.4746,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.35184237360954285,
"rewards/margins": 0.03937570005655289,
"rewards/rejected": -0.3912180960178375,
"step": 1460
},
{
"epoch": 0.92196349905601,
"grad_norm": 86.8721923828125,
"learning_rate": 8.225656933330972e-07,
"logits/chosen": -1.396032691001892,
"logits/rejected": -1.3607252836227417,
"logps/chosen": -4.139504909515381,
"logps/rejected": -5.256811618804932,
"loss": 20.6197,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.31887346506118774,
"rewards/margins": 0.08756524324417114,
"rewards/rejected": -0.4064387381076813,
"step": 1465
},
{
"epoch": 0.9251101321585903,
"grad_norm": 63.26131057739258,
"learning_rate": 7.613666465041492e-07,
"logits/chosen": -1.296687364578247,
"logits/rejected": -1.338370442390442,
"logps/chosen": -4.0869526863098145,
"logps/rejected": -4.680004596710205,
"loss": 22.3496,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.30148619413375854,
"rewards/margins": 0.06435124576091766,
"rewards/rejected": -0.365837424993515,
"step": 1470
},
{
"epoch": 0.9282567652611705,
"grad_norm": 64.71456909179688,
"learning_rate": 7.024889331289731e-07,
"logits/chosen": -1.3576750755310059,
"logits/rejected": -1.4629138708114624,
"logps/chosen": -4.305732250213623,
"logps/rejected": -6.287524700164795,
"loss": 19.0147,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.3270648717880249,
"rewards/margins": 0.12565208971500397,
"rewards/rejected": -0.45271697640419006,
"step": 1475
},
{
"epoch": 0.9314033983637507,
"grad_norm": 79.55664825439453,
"learning_rate": 6.459396573923227e-07,
"logits/chosen": -1.2750294208526611,
"logits/rejected": -1.3182651996612549,
"logps/chosen": -3.8780131340026855,
"logps/rejected": -5.497721195220947,
"loss": 19.3141,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.29957860708236694,
"rewards/margins": 0.11124887317419052,
"rewards/rejected": -0.41082748770713806,
"step": 1480
},
{
"epoch": 0.934550031466331,
"grad_norm": 97.28962707519531,
"learning_rate": 5.917256425296725e-07,
"logits/chosen": -1.3326900005340576,
"logits/rejected": -1.3848145008087158,
"logps/chosen": -4.326709270477295,
"logps/rejected": -5.8570427894592285,
"loss": 17.956,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.32169783115386963,
"rewards/margins": 0.11987517029047012,
"rewards/rejected": -0.44157299399375916,
"step": 1485
},
{
"epoch": 0.9376966645689113,
"grad_norm": 104.4383773803711,
"learning_rate": 5.398534300039227e-07,
"logits/chosen": -1.3669896125793457,
"logits/rejected": -1.4102351665496826,
"logps/chosen": -4.2153167724609375,
"logps/rejected": -5.1999030113220215,
"loss": 20.9588,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.3310829997062683,
"rewards/margins": 0.07336001843214035,
"rewards/rejected": -0.40444302558898926,
"step": 1490
},
{
"epoch": 0.9408432976714916,
"grad_norm": 59.6121826171875,
"learning_rate": 4.903292787161129e-07,
"logits/chosen": -1.4228112697601318,
"logits/rejected": -1.528313159942627,
"logps/chosen": -4.338911533355713,
"logps/rejected": -5.048561096191406,
"loss": 22.4697,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.3458613455295563,
"rewards/margins": 0.05565253645181656,
"rewards/rejected": -0.40151387453079224,
"step": 1495
},
{
"epoch": 0.9439899307740718,
"grad_norm": 134.8368377685547,
"learning_rate": 4.4315916425021755e-07,
"logits/chosen": -1.4706683158874512,
"logits/rejected": -1.5189244747161865,
"logps/chosen": -4.430064678192139,
"logps/rejected": -4.881100177764893,
"loss": 24.7599,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.34278133511543274,
"rewards/margins": 0.03427756577730179,
"rewards/rejected": -0.37705889344215393,
"step": 1500
},
{
"epoch": 0.947136563876652,
"grad_norm": 75.44186401367188,
"learning_rate": 3.983487781521311e-07,
"logits/chosen": -1.3628993034362793,
"logits/rejected": -1.5227676630020142,
"logps/chosen": -4.508485317230225,
"logps/rejected": -5.836249351501465,
"loss": 21.4824,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.35081833600997925,
"rewards/margins": 0.0795225128531456,
"rewards/rejected": -0.43034085631370544,
"step": 1505
},
{
"epoch": 0.9502831969792322,
"grad_norm": 53.86139678955078,
"learning_rate": 3.5590352724293565e-07,
"logits/chosen": -1.2814509868621826,
"logits/rejected": -1.383336067199707,
"logps/chosen": -3.697767972946167,
"logps/rejected": -5.5374345779418945,
"loss": 18.3089,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.28700894117355347,
"rewards/margins": 0.12951508164405823,
"rewards/rejected": -0.4165240228176117,
"step": 1510
},
{
"epoch": 0.9534298300818125,
"grad_norm": 55.83627700805664,
"learning_rate": 3.1582853296649785e-07,
"logits/chosen": -1.3301982879638672,
"logits/rejected": -1.4231036901474,
"logps/chosen": -3.7521042823791504,
"logps/rejected": -4.861963748931885,
"loss": 19.3616,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.2891950309276581,
"rewards/margins": 0.09701049327850342,
"rewards/rejected": -0.3862055242061615,
"step": 1515
},
{
"epoch": 0.9565764631843927,
"grad_norm": 88.61446380615234,
"learning_rate": 2.7812863077153253e-07,
"logits/chosen": -1.2899259328842163,
"logits/rejected": -1.398050308227539,
"logps/chosen": -4.068936824798584,
"logps/rejected": -5.717960357666016,
"loss": 17.8938,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.31747734546661377,
"rewards/margins": 0.11797485500574112,
"rewards/rejected": -0.4354521632194519,
"step": 1520
},
{
"epoch": 0.9597230962869729,
"grad_norm": 58.96453857421875,
"learning_rate": 2.4280836952814913e-07,
"logits/chosen": -1.3611301183700562,
"logits/rejected": -1.4117127656936646,
"logps/chosen": -4.0526018142700195,
"logps/rejected": -5.437824249267578,
"loss": 21.3406,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.31340762972831726,
"rewards/margins": 0.07557393610477448,
"rewards/rejected": -0.38898158073425293,
"step": 1525
},
{
"epoch": 0.9628697293895532,
"grad_norm": 82.22030639648438,
"learning_rate": 2.0987201097897757e-07,
"logits/chosen": -1.290305256843567,
"logits/rejected": -1.3669493198394775,
"logps/chosen": -4.012240409851074,
"logps/rejected": -6.001503944396973,
"loss": 18.4697,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.3110642433166504,
"rewards/margins": 0.14227357506752014,
"rewards/rejected": -0.45333781838417053,
"step": 1530
},
{
"epoch": 0.9660163624921334,
"grad_norm": 69.16776275634766,
"learning_rate": 1.7932352922496844e-07,
"logits/chosen": -1.3238952159881592,
"logits/rejected": -1.4009875059127808,
"logps/chosen": -4.168734550476074,
"logps/rejected": -5.520012855529785,
"loss": 18.6757,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.3233780264854431,
"rewards/margins": 0.10560549795627594,
"rewards/rejected": -0.42898350954055786,
"step": 1535
},
{
"epoch": 0.9691629955947136,
"grad_norm": 87.41554260253906,
"learning_rate": 1.5116661024584756e-07,
"logits/chosen": -1.3047425746917725,
"logits/rejected": -1.2935268878936768,
"logps/chosen": -3.8972859382629395,
"logps/rejected": -5.7956743240356445,
"loss": 19.4437,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.2976101040840149,
"rewards/margins": 0.13567054271697998,
"rewards/rejected": -0.4332806169986725,
"step": 1540
},
{
"epoch": 0.9723096286972939,
"grad_norm": 129.71432495117188,
"learning_rate": 1.254046514553986e-07,
"logits/chosen": -1.3411355018615723,
"logits/rejected": -1.3150873184204102,
"logps/chosen": -4.793996334075928,
"logps/rejected": -6.1579155921936035,
"loss": 22.5465,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.36123085021972656,
"rewards/margins": 0.08643898367881775,
"rewards/rejected": -0.4476698338985443,
"step": 1545
},
{
"epoch": 0.9754562617998741,
"grad_norm": 156.82296752929688,
"learning_rate": 1.0204076129150198e-07,
"logits/chosen": -1.3176259994506836,
"logits/rejected": -1.371140956878662,
"logps/chosen": -4.381787300109863,
"logps/rejected": -5.822647571563721,
"loss": 20.2445,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.322052001953125,
"rewards/margins": 0.08496570587158203,
"rewards/rejected": -0.40701770782470703,
"step": 1550
},
{
"epoch": 0.9786028949024543,
"grad_norm": 101.22770690917969,
"learning_rate": 8.107775884109048e-08,
"logits/chosen": -1.377939224243164,
"logits/rejected": -1.460756540298462,
"logps/chosen": -4.821037292480469,
"logps/rejected": -5.5621137619018555,
"loss": 23.1685,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.37864041328430176,
"rewards/margins": 0.05817138031125069,
"rewards/rejected": -0.43681177496910095,
"step": 1555
},
{
"epoch": 0.9817495280050346,
"grad_norm": 93.55181884765625,
"learning_rate": 6.251817349998578e-08,
"logits/chosen": -1.2559947967529297,
"logits/rejected": -1.3171112537384033,
"logps/chosen": -3.9931647777557373,
"logps/rejected": -5.348459243774414,
"loss": 22.9477,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.30862289667129517,
"rewards/margins": 0.0842631608247757,
"rewards/rejected": -0.39288607239723206,
"step": 1560
},
{
"epoch": 0.9848961611076148,
"grad_norm": 80.63821411132812,
"learning_rate": 4.636424466771372e-08,
"logits/chosen": -1.24492347240448,
"logits/rejected": -1.3349525928497314,
"logps/chosen": -4.380553245544434,
"logps/rejected": -5.421158313751221,
"loss": 22.0329,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.34523719549179077,
"rewards/margins": 0.07052381336688995,
"rewards/rejected": -0.4157610535621643,
"step": 1565
},
{
"epoch": 0.9880427942101951,
"grad_norm": 55.254642486572266,
"learning_rate": 3.261792147728704e-08,
"logits/chosen": -1.3501121997833252,
"logits/rejected": -1.3522610664367676,
"logps/chosen": -4.829428195953369,
"logps/rejected": -5.480432033538818,
"loss": 22.6751,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.3382914662361145,
"rewards/margins": 0.05635923147201538,
"rewards/rejected": -0.3946506381034851,
"step": 1570
},
{
"epoch": 0.9911894273127754,
"grad_norm": 102.65123748779297,
"learning_rate": 2.1280862560026927e-08,
"logits/chosen": -1.350527048110962,
"logits/rejected": -1.3495935201644897,
"logps/chosen": -3.8183772563934326,
"logps/rejected": -4.949650764465332,
"loss": 22.3353,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.3000851273536682,
"rewards/margins": 0.07860491424798965,
"rewards/rejected": -0.37869006395339966,
"step": 1575
},
{
"epoch": 0.9943360604153556,
"grad_norm": 67.94386291503906,
"learning_rate": 1.2354435845436385e-08,
"logits/chosen": -1.2602336406707764,
"logits/rejected": -1.2594802379608154,
"logps/chosen": -3.5885491371154785,
"logps/rejected": -4.909377098083496,
"loss": 18.7801,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.27459749579429626,
"rewards/margins": 0.10287781804800034,
"rewards/rejected": -0.3774753212928772,
"step": 1580
},
{
"epoch": 0.9974826935179358,
"grad_norm": 78.847412109375,
"learning_rate": 5.8397183961411694e-09,
"logits/chosen": -1.4188308715820312,
"logits/rejected": -1.3911654949188232,
"logps/chosen": -4.257325649261475,
"logps/rejected": -5.559029579162598,
"loss": 20.67,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.30969610810279846,
"rewards/margins": 0.08111827820539474,
"rewards/rejected": -0.3908143639564514,
"step": 1585
},
{
"epoch": 1.0,
"step": 1589,
"total_flos": 0.0,
"train_loss": 22.009478435192264,
"train_runtime": 23016.83,
"train_samples_per_second": 1.105,
"train_steps_per_second": 0.069
}
],
"logging_steps": 5,
"max_steps": 1589,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 10,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}