{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 1589, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0031466331025802393, "grad_norm": 8.804184913635254, "learning_rate": 1.257861635220126e-06, "logits/chosen": -0.11013289541006088, "logits/rejected": -0.5208367109298706, "logps/chosen": -0.8537980914115906, "logps/rejected": -1.0550096035003662, "loss": 24.9985, "rewards/accuracies": 0.3125, "rewards/chosen": -5.359128408599645e-06, "rewards/margins": 1.545622944831848e-05, "rewards/rejected": -2.081535967590753e-05, "step": 5 }, { "epoch": 0.0062932662051604785, "grad_norm": 18.678768157958984, "learning_rate": 2.2641509433962266e-06, "logits/chosen": -0.3030635714530945, "logits/rejected": -0.5435053706169128, "logps/chosen": -0.9865642786026001, "logps/rejected": -1.107262372970581, "loss": 24.9967, "rewards/accuracies": 0.5625, "rewards/chosen": -0.00010868474782910198, "rewards/margins": 3.348257814650424e-05, "rewards/rejected": -0.00014216733688954264, "step": 10 }, { "epoch": 0.009439899307740718, "grad_norm": 11.281435012817383, "learning_rate": 3.5220125786163524e-06, "logits/chosen": -0.5111545324325562, "logits/rejected": -0.8536307215690613, "logps/chosen": -1.0305876731872559, "logps/rejected": -1.2494089603424072, "loss": 24.9847, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.00031705142464488745, "rewards/margins": 0.000152838954818435, "rewards/rejected": -0.00046989036491140723, "step": 15 }, { "epoch": 0.012586532410320957, "grad_norm": 59.5455322265625, "learning_rate": 4.528301886792453e-06, "logits/chosen": -0.616014838218689, "logits/rejected": -0.6851056218147278, "logps/chosen": -1.130916953086853, "logps/rejected": -1.46986985206604, "loss": 24.9645, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0012707245768979192, "rewards/margins": 0.0003901523014064878, "rewards/rejected": -0.0016608769074082375, "step": 20 }, { "epoch": 0.015733165512901194, "grad_norm": 9.532500267028809, "learning_rate": 5.786163522012579e-06, "logits/chosen": -0.12423186004161835, "logits/rejected": -0.4599896967411041, "logps/chosen": -0.8485546112060547, "logps/rejected": -1.0018525123596191, "loss": 24.9267, "rewards/accuracies": 0.6875, "rewards/chosen": -0.001064571551978588, "rewards/margins": 0.0007417487213388085, "rewards/rejected": -0.0018063202733173966, "step": 25 }, { "epoch": 0.018879798615481436, "grad_norm": 11.064372062683105, "learning_rate": 7.044025157232705e-06, "logits/chosen": -0.1580429971218109, "logits/rejected": -0.38266992568969727, "logps/chosen": -0.8662201166152954, "logps/rejected": -1.0262982845306396, "loss": 24.8872, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0021042851731181145, "rewards/margins": 0.0011668736115098, "rewards/rejected": -0.0032711587846279144, "step": 30 }, { "epoch": 0.022026431718061675, "grad_norm": 37.646690368652344, "learning_rate": 8.301886792452832e-06, "logits/chosen": 0.026471847668290138, "logits/rejected": -0.49966034293174744, "logps/chosen": -0.8883110880851746, "logps/rejected": -1.199055790901184, "loss": 24.6732, "rewards/accuracies": 0.5625, "rewards/chosen": -0.003758195089176297, "rewards/margins": 0.0034271504264324903, "rewards/rejected": -0.0071853450499475, "step": 35 }, { "epoch": 0.025173064820641914, "grad_norm": 34.88091278076172, "learning_rate": 9.559748427672956e-06, "logits/chosen": -0.36181551218032837, "logits/rejected": -0.6659843325614929, "logps/chosen": -0.9565097689628601, "logps/rejected": -1.183980941772461, "loss": 24.6032, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.007488996721804142, "rewards/margins": 0.004166100639849901, "rewards/rejected": -0.011655096895992756, "step": 40 }, { "epoch": 0.028319697923222153, "grad_norm": 20.635541915893555, "learning_rate": 1.0817610062893083e-05, "logits/chosen": -0.5469181537628174, "logits/rejected": -0.7580572366714478, "logps/chosen": -1.0930149555206299, "logps/rejected": -1.2114075422286987, "loss": 24.7536, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0149534298107028, "rewards/margins": 0.002928597154095769, "rewards/rejected": -0.017882030457258224, "step": 45 }, { "epoch": 0.03146633102580239, "grad_norm": 36.45863723754883, "learning_rate": 1.2075471698113209e-05, "logits/chosen": -0.5085287094116211, "logits/rejected": -0.7208930253982544, "logps/chosen": -1.083438515663147, "logps/rejected": -1.3884985446929932, "loss": 23.9964, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.018525470048189163, "rewards/margins": 0.011683688499033451, "rewards/rejected": -0.03020915761590004, "step": 50 }, { "epoch": 0.034612964128382634, "grad_norm": 45.08958053588867, "learning_rate": 1.3081761006289308e-05, "logits/chosen": -0.6235328912734985, "logits/rejected": -0.8463523983955383, "logps/chosen": -1.1567853689193726, "logps/rejected": -2.149567127227783, "loss": 23.8291, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.028398022055625916, "rewards/margins": 0.03046615794301033, "rewards/rejected": -0.05886417627334595, "step": 55 }, { "epoch": 0.03775959723096287, "grad_norm": 48.65830612182617, "learning_rate": 1.408805031446541e-05, "logits/chosen": -0.6318017244338989, "logits/rejected": -0.9939996600151062, "logps/chosen": -1.7119739055633545, "logps/rejected": -2.3402199745178223, "loss": 23.5592, "rewards/accuracies": 0.6875, "rewards/chosen": -0.060618169605731964, "rewards/margins": 0.028203105553984642, "rewards/rejected": -0.08882127702236176, "step": 60 }, { "epoch": 0.04090623033354311, "grad_norm": 138.4451904296875, "learning_rate": 1.5345911949685536e-05, "logits/chosen": -0.9717090725898743, "logits/rejected": -1.1959871053695679, "logps/chosen": -1.9082473516464233, "logps/rejected": -2.449486494064331, "loss": 22.7524, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08517814427614212, "rewards/margins": 0.03464372828602791, "rewards/rejected": -0.11982186883687973, "step": 65 }, { "epoch": 0.04405286343612335, "grad_norm": 49.257568359375, "learning_rate": 1.6603773584905664e-05, "logits/chosen": -0.7433441281318665, "logits/rejected": -1.0399134159088135, "logps/chosen": -2.255545139312744, "logps/rejected": -2.98321533203125, "loss": 23.4113, "rewards/accuracies": 0.5625, "rewards/chosen": -0.11126575618982315, "rewards/margins": 0.04789165034890175, "rewards/rejected": -0.159157395362854, "step": 70 }, { "epoch": 0.04719949653870359, "grad_norm": 56.168670654296875, "learning_rate": 1.7861635220125788e-05, "logits/chosen": -1.0234445333480835, "logits/rejected": -1.288999080657959, "logps/chosen": -1.653058648109436, "logps/rejected": -2.370941162109375, "loss": 22.181, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08025868237018585, "rewards/margins": 0.044546954333782196, "rewards/rejected": -0.12480561435222626, "step": 75 }, { "epoch": 0.05034612964128383, "grad_norm": NaN, "learning_rate": 1.8867924528301888e-05, "logits/chosen": -1.1835613250732422, "logits/rejected": -1.4036767482757568, "logps/chosen": -1.90883469581604, "logps/rejected": -2.1450889110565186, "loss": 25.8869, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.09280741214752197, "rewards/margins": 0.021463319659233093, "rewards/rejected": -0.11427073180675507, "step": 80 }, { "epoch": 0.05349276274386407, "grad_norm": 103.32862091064453, "learning_rate": 2.0125786163522016e-05, "logits/chosen": -1.539898157119751, "logits/rejected": -1.6518356800079346, "logps/chosen": -2.0776684284210205, "logps/rejected": -2.5599067211151123, "loss": 24.1212, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.12567706406116486, "rewards/margins": 0.026400262489914894, "rewards/rejected": -0.152077317237854, "step": 85 }, { "epoch": 0.056639395846444306, "grad_norm": 176.14784240722656, "learning_rate": 2.138364779874214e-05, "logits/chosen": -1.3818124532699585, "logits/rejected": -1.5843524932861328, "logps/chosen": -2.48514461517334, "logps/rejected": -2.8115108013153076, "loss": 26.3518, "rewards/accuracies": 0.625, "rewards/chosen": -0.15766306221485138, "rewards/margins": 0.025023411959409714, "rewards/rejected": -0.1826864778995514, "step": 90 }, { "epoch": 0.059786028949024544, "grad_norm": 106.15309143066406, "learning_rate": 2.2641509433962265e-05, "logits/chosen": -1.5876004695892334, "logits/rejected": -1.7525005340576172, "logps/chosen": -2.2529654502868652, "logps/rejected": -3.2197937965393066, "loss": 20.8074, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.14453086256980896, "rewards/margins": 0.07421146333217621, "rewards/rejected": -0.21874232590198517, "step": 95 }, { "epoch": 0.06293266205160478, "grad_norm": 91.7872085571289, "learning_rate": 2.3899371069182393e-05, "logits/chosen": -1.6880552768707275, "logits/rejected": -1.667741060256958, "logps/chosen": -3.5453040599823, "logps/rejected": -3.8808798789978027, "loss": 24.6555, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.2320992648601532, "rewards/margins": 0.020265836268663406, "rewards/rejected": -0.2523651123046875, "step": 100 }, { "epoch": 0.06607929515418502, "grad_norm": 778.8959350585938, "learning_rate": 2.4905660377358492e-05, "logits/chosen": -1.8318984508514404, "logits/rejected": -1.8932411670684814, "logps/chosen": -3.125164031982422, "logps/rejected": -4.746774673461914, "loss": 27.3293, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.19288301467895508, "rewards/margins": 0.096275694668293, "rewards/rejected": -0.28915873169898987, "step": 105 }, { "epoch": 0.06922592825676527, "grad_norm": 132.40354919433594, "learning_rate": 2.6163522012578617e-05, "logits/chosen": -1.7445348501205444, "logits/rejected": -1.902320146560669, "logps/chosen": -1.9325546026229858, "logps/rejected": -3.3019511699676514, "loss": 21.7317, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.10555150359869003, "rewards/margins": 0.07426220178604126, "rewards/rejected": -0.1798136979341507, "step": 110 }, { "epoch": 0.0723725613593455, "grad_norm": 98.48819732666016, "learning_rate": 2.742138364779874e-05, "logits/chosen": -1.7994951009750366, "logits/rejected": -1.9057296514511108, "logps/chosen": -2.1663613319396973, "logps/rejected": -2.82452392578125, "loss": 22.7429, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1297454833984375, "rewards/margins": 0.04077299311757088, "rewards/rejected": -0.17051845788955688, "step": 115 }, { "epoch": 0.07551919446192575, "grad_norm": 93.13198852539062, "learning_rate": 2.867924528301887e-05, "logits/chosen": -1.6606374979019165, "logits/rejected": -1.7864787578582764, "logps/chosen": -2.2936453819274902, "logps/rejected": -2.5704400539398193, "loss": 24.0989, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.12692785263061523, "rewards/margins": 0.020071204751729965, "rewards/rejected": -0.1469990611076355, "step": 120 }, { "epoch": 0.07866582756450598, "grad_norm": 101.10535430908203, "learning_rate": 2.968553459119497e-05, "logits/chosen": -1.648816704750061, "logits/rejected": -1.6658546924591064, "logps/chosen": -2.0479884147644043, "logps/rejected": -2.8278560638427734, "loss": 27.9983, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.11854568868875504, "rewards/margins": 0.0439017117023468, "rewards/rejected": -0.16244739294052124, "step": 125 }, { "epoch": 0.08181246066708622, "grad_norm": 92.93740844726562, "learning_rate": 3.09433962264151e-05, "logits/chosen": -1.7306410074234009, "logits/rejected": -1.8349339962005615, "logps/chosen": -2.082920551300049, "logps/rejected": -3.115952253341675, "loss": 23.5299, "rewards/accuracies": 0.625, "rewards/chosen": -0.11979808658361435, "rewards/margins": 0.06730449199676514, "rewards/rejected": -0.18710258603096008, "step": 130 }, { "epoch": 0.08495909376966645, "grad_norm": 123.31324768066406, "learning_rate": 3.220125786163522e-05, "logits/chosen": -1.8235572576522827, "logits/rejected": -1.8541405200958252, "logps/chosen": -1.9667946100234985, "logps/rejected": -2.772089958190918, "loss": 22.6137, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.10746946185827255, "rewards/margins": 0.04866869002580643, "rewards/rejected": -0.15613815188407898, "step": 135 }, { "epoch": 0.0881057268722467, "grad_norm": 126.42218017578125, "learning_rate": 3.345911949685535e-05, "logits/chosen": -1.674515962600708, "logits/rejected": -1.8894662857055664, "logps/chosen": -2.245245933532715, "logps/rejected": -3.0301966667175293, "loss": 22.6984, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.11901465803384781, "rewards/margins": 0.049767203629016876, "rewards/rejected": -0.16878187656402588, "step": 140 }, { "epoch": 0.09125235997482693, "grad_norm": 114.61023712158203, "learning_rate": 3.471698113207548e-05, "logits/chosen": -1.7905619144439697, "logits/rejected": -1.8821656703948975, "logps/chosen": -3.373708724975586, "logps/rejected": -4.691153526306152, "loss": 22.2353, "rewards/accuracies": 0.75, "rewards/chosen": -0.18414758145809174, "rewards/margins": 0.0776277631521225, "rewards/rejected": -0.26177531480789185, "step": 145 }, { "epoch": 0.09439899307740718, "grad_norm": 296.22955322265625, "learning_rate": 3.59748427672956e-05, "logits/chosen": -1.654166579246521, "logits/rejected": -1.847495436668396, "logps/chosen": -3.2497410774230957, "logps/rejected": -4.303386688232422, "loss": 20.9992, "rewards/accuracies": 0.6875, "rewards/chosen": -0.22493870556354523, "rewards/margins": 0.07029641419649124, "rewards/rejected": -0.2952350974082947, "step": 150 }, { "epoch": 0.09754562617998741, "grad_norm": 579.211669921875, "learning_rate": 3.7232704402515726e-05, "logits/chosen": -1.6689754724502563, "logits/rejected": -1.7173473834991455, "logps/chosen": -3.7695910930633545, "logps/rejected": -4.783900260925293, "loss": 25.2195, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.2695242762565613, "rewards/margins": 0.05760473012924194, "rewards/rejected": -0.3271290063858032, "step": 155 }, { "epoch": 0.10069225928256766, "grad_norm": 200.13609313964844, "learning_rate": 3.8490566037735854e-05, "logits/chosen": -1.7428325414657593, "logits/rejected": -1.74752938747406, "logps/chosen": -3.6156649589538574, "logps/rejected": -4.805546760559082, "loss": 22.7118, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.2704419791698456, "rewards/margins": 0.06444540619850159, "rewards/rejected": -0.33488741517066956, "step": 160 }, { "epoch": 0.10383889238514789, "grad_norm": 172.51622009277344, "learning_rate": 3.9748427672955975e-05, "logits/chosen": -1.7474384307861328, "logits/rejected": -1.7428706884384155, "logps/chosen": -3.276729106903076, "logps/rejected": -4.082120418548584, "loss": 22.3077, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.2311684638261795, "rewards/margins": 0.051828593015670776, "rewards/rejected": -0.2829970717430115, "step": 165 }, { "epoch": 0.10698552548772813, "grad_norm": 146.08352661132812, "learning_rate": 3.9999227773220194e-05, "logits/chosen": -1.6052366495132446, "logits/rejected": -1.6235520839691162, "logps/chosen": -3.030139207839966, "logps/rejected": -4.707204818725586, "loss": 20.0014, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.21569938957691193, "rewards/margins": 0.12310032546520233, "rewards/rejected": -0.33879974484443665, "step": 170 }, { "epoch": 0.11013215859030837, "grad_norm": 133.93203735351562, "learning_rate": 3.9996090704130684e-05, "logits/chosen": -1.7021839618682861, "logits/rejected": -1.7295335531234741, "logps/chosen": -3.9147982597351074, "logps/rejected": -5.332208633422852, "loss": 20.047, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.3039882779121399, "rewards/margins": 0.1180083155632019, "rewards/rejected": -0.4219965934753418, "step": 175 }, { "epoch": 0.11327879169288861, "grad_norm": 558.7332763671875, "learning_rate": 3.999054090678532e-05, "logits/chosen": -1.5368597507476807, "logits/rejected": -1.592637300491333, "logps/chosen": -6.026860237121582, "logps/rejected": -6.550711631774902, "loss": 29.6933, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.4695097804069519, "rewards/margins": 0.02213056944310665, "rewards/rejected": -0.4916403889656067, "step": 180 }, { "epoch": 0.11642542479546884, "grad_norm": 212.05760192871094, "learning_rate": 3.9982579050822615e-05, "logits/chosen": -1.5933212041854858, "logits/rejected": -1.5753694772720337, "logps/chosen": -4.716382026672363, "logps/rejected": -5.257371425628662, "loss": 27.5815, "rewards/accuracies": 0.625, "rewards/chosen": -0.3512773811817169, "rewards/margins": 0.033867720514535904, "rewards/rejected": -0.3851450979709625, "step": 185 }, { "epoch": 0.11957205789804909, "grad_norm": 134.0122833251953, "learning_rate": 3.997220609692011e-05, "logits/chosen": -1.6495559215545654, "logits/rejected": -1.6725133657455444, "logps/chosen": -3.984989643096924, "logps/rejected": -5.001562595367432, "loss": 22.6766, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.2919956147670746, "rewards/margins": 0.05422482639551163, "rewards/rejected": -0.3462204337120056, "step": 190 }, { "epoch": 0.12271869100062932, "grad_norm": 151.4617919921875, "learning_rate": 3.9959423296678384e-05, "logits/chosen": -1.7128961086273193, "logits/rejected": -1.6318174600601196, "logps/chosen": -3.3435721397399902, "logps/rejected": -4.078289985656738, "loss": 25.0994, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.23941469192504883, "rewards/margins": 0.04007013887166977, "rewards/rejected": -0.2794848084449768, "step": 195 }, { "epoch": 0.12586532410320955, "grad_norm": 115.02042388916016, "learning_rate": 3.9944232192470094e-05, "logits/chosen": -1.7137172222137451, "logits/rejected": -1.7910420894622803, "logps/chosen": -3.106358051300049, "logps/rejected": -3.97111439704895, "loss": 21.6293, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.2061949521303177, "rewards/margins": 0.04795767739415169, "rewards/rejected": -0.2541525959968567, "step": 200 }, { "epoch": 0.1290119572057898, "grad_norm": 81.87369537353516, "learning_rate": 3.992663461725383e-05, "logits/chosen": -1.5431886911392212, "logits/rejected": -1.557018518447876, "logps/chosen": -2.805392026901245, "logps/rejected": -4.356006622314453, "loss": 21.8817, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.1901206076145172, "rewards/margins": 0.0818587988615036, "rewards/rejected": -0.2719793915748596, "step": 205 }, { "epoch": 0.13215859030837004, "grad_norm": 188.29173278808594, "learning_rate": 3.990663269435298e-05, "logits/chosen": -1.6920125484466553, "logits/rejected": -1.6854931116104126, "logps/chosen": -3.156735897064209, "logps/rejected": -4.396471977233887, "loss": 27.5638, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.20159177482128143, "rewards/margins": 0.07304342836141586, "rewards/rejected": -0.2746351957321167, "step": 210 }, { "epoch": 0.13530522341095028, "grad_norm": 141.69142150878906, "learning_rate": 3.98842288371995e-05, "logits/chosen": -1.6487762928009033, "logits/rejected": -1.7372210025787354, "logps/chosen": -2.6156325340270996, "logps/rejected": -3.677928924560547, "loss": 21.5613, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1552310734987259, "rewards/margins": 0.0671745091676712, "rewards/rejected": -0.2224055826663971, "step": 215 }, { "epoch": 0.13845185651353054, "grad_norm": 92.31954193115234, "learning_rate": 3.985942574904275e-05, "logits/chosen": -1.677199363708496, "logits/rejected": -1.6414434909820557, "logps/chosen": -2.499932050704956, "logps/rejected": -3.3010895252227783, "loss": 22.2151, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.16245323419570923, "rewards/margins": 0.05558500811457634, "rewards/rejected": -0.21803824603557587, "step": 220 }, { "epoch": 0.14159848961611077, "grad_norm": 106.32303619384766, "learning_rate": 3.983222642262329e-05, "logits/chosen": -1.6422779560089111, "logits/rejected": -1.6500838994979858, "logps/chosen": -2.66230845451355, "logps/rejected": -3.7150185108184814, "loss": 20.2102, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.18372972309589386, "rewards/margins": 0.08325181156396866, "rewards/rejected": -0.2669815421104431, "step": 225 }, { "epoch": 0.144745122718691, "grad_norm": 113.5155029296875, "learning_rate": 3.980263413981178e-05, "logits/chosen": -1.5669496059417725, "logits/rejected": -1.5747731924057007, "logps/chosen": -3.1706671714782715, "logps/rejected": -3.948005199432373, "loss": 21.8852, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2336561679840088, "rewards/margins": 0.06708776950836182, "rewards/rejected": -0.300743967294693, "step": 230 }, { "epoch": 0.14789175582127123, "grad_norm": 99.03396606445312, "learning_rate": 3.977065247121298e-05, "logits/chosen": -1.639129400253296, "logits/rejected": -1.6693006753921509, "logps/chosen": -3.2495856285095215, "logps/rejected": -4.634251594543457, "loss": 22.4292, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.23980684578418732, "rewards/margins": 0.10619230568408966, "rewards/rejected": -0.345999151468277, "step": 235 }, { "epoch": 0.1510383889238515, "grad_norm": 254.25714111328125, "learning_rate": 3.973628527573495e-05, "logits/chosen": -1.4451357126235962, "logits/rejected": -1.415290355682373, "logps/chosen": -4.496035575866699, "logps/rejected": -5.4740800857543945, "loss": 24.0697, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.3446175158023834, "rewards/margins": 0.07305804640054703, "rewards/rejected": -0.41767558455467224, "step": 240 }, { "epoch": 0.15418502202643172, "grad_norm": 98.8416976928711, "learning_rate": 3.969953670012342e-05, "logits/chosen": -1.6127903461456299, "logits/rejected": -1.529802918434143, "logps/chosen": -3.744677782058716, "logps/rejected": -5.76874303817749, "loss": 20.2498, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.2814852297306061, "rewards/margins": 0.12259259074926376, "rewards/rejected": -0.40407782793045044, "step": 245 }, { "epoch": 0.15733165512901195, "grad_norm": 174.65185546875, "learning_rate": 3.9660411178461427e-05, "logits/chosen": -1.6170070171356201, "logits/rejected": -1.5994997024536133, "logps/chosen": -3.390500545501709, "logps/rejected": -4.377715587615967, "loss": 22.3596, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2286950647830963, "rewards/margins": 0.07162559777498245, "rewards/rejected": -0.30032065510749817, "step": 250 }, { "epoch": 0.1604782882315922, "grad_norm": 98.30694580078125, "learning_rate": 3.9618913431634326e-05, "logits/chosen": -1.5248662233352661, "logits/rejected": -1.570233702659607, "logps/chosen": -2.914156436920166, "logps/rejected": -3.4877963066101074, "loss": 21.8392, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.17873355746269226, "rewards/margins": 0.04700728505849838, "rewards/rejected": -0.22574086487293243, "step": 255 }, { "epoch": 0.16362492133417245, "grad_norm": 108.39894104003906, "learning_rate": 3.957504846676015e-05, "logits/chosen": -1.5246005058288574, "logits/rejected": -1.6037238836288452, "logps/chosen": -3.113523006439209, "logps/rejected": -4.024534702301025, "loss": 21.9178, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.21206626296043396, "rewards/margins": 0.06214412301778793, "rewards/rejected": -0.2742103934288025, "step": 260 }, { "epoch": 0.16677155443675268, "grad_norm": 122.53215026855469, "learning_rate": 3.952882157658545e-05, "logits/chosen": -1.4534975290298462, "logits/rejected": -1.4294064044952393, "logps/chosen": -3.44130277633667, "logps/rejected": -3.7570698261260986, "loss": 25.6929, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.2504199147224426, "rewards/margins": 0.021822316572070122, "rewards/rejected": -0.2722422182559967, "step": 265 }, { "epoch": 0.1699181875393329, "grad_norm": 117.72502899169922, "learning_rate": 3.948023833884667e-05, "logits/chosen": -1.596609354019165, "logits/rejected": -1.6202917098999023, "logps/chosen": -3.7515816688537598, "logps/rejected": -3.9420647621154785, "loss": 25.1709, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.23414090275764465, "rewards/margins": 0.027978042140603065, "rewards/rejected": -0.26211896538734436, "step": 270 }, { "epoch": 0.17306482064191314, "grad_norm": 84.38936614990234, "learning_rate": 3.942930461559718e-05, "logits/chosen": -1.5714600086212158, "logits/rejected": -1.683579683303833, "logps/chosen": -3.3148865699768066, "logps/rejected": -3.7648849487304688, "loss": 24.1859, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.2219853699207306, "rewards/margins": 0.02847103402018547, "rewards/rejected": -0.25045639276504517, "step": 275 }, { "epoch": 0.1762114537444934, "grad_norm": 122.990478515625, "learning_rate": 3.9376026552499894e-05, "logits/chosen": -1.5986852645874023, "logits/rejected": -1.6811764240264893, "logps/chosen": -3.261617660522461, "logps/rejected": -4.3173418045043945, "loss": 19.8872, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.22893181443214417, "rewards/margins": 0.0762997642159462, "rewards/rejected": -0.30523157119750977, "step": 280 }, { "epoch": 0.17935808684707363, "grad_norm": 128.1126251220703, "learning_rate": 3.9320410578085774e-05, "logits/chosen": -1.5240422487258911, "logits/rejected": -1.5410079956054688, "logps/chosen": -3.7498767375946045, "logps/rejected": -4.466190338134766, "loss": 22.8035, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.2702712118625641, "rewards/margins": 0.0467303991317749, "rewards/rejected": -0.3170016407966614, "step": 285 }, { "epoch": 0.18250471994965387, "grad_norm": 160.00189208984375, "learning_rate": 3.9262463402978165e-05, "logits/chosen": -1.413119912147522, "logits/rejected": -1.3633155822753906, "logps/chosen": -3.8721237182617188, "logps/rejected": -5.0125298500061035, "loss": 22.2056, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3019005358219147, "rewards/margins": 0.08287017047405243, "rewards/rejected": -0.3847707211971283, "step": 290 }, { "epoch": 0.1856513530522341, "grad_norm": 168.05908203125, "learning_rate": 3.920219201908306e-05, "logits/chosen": -1.2270746231079102, "logits/rejected": -1.2809008359909058, "logps/chosen": -4.052460670471191, "logps/rejected": -5.228961944580078, "loss": 21.1427, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3124389052391052, "rewards/margins": 0.0833948403596878, "rewards/rejected": -0.3958337903022766, "step": 295 }, { "epoch": 0.18879798615481436, "grad_norm": 94.47506713867188, "learning_rate": 3.9139603698745514e-05, "logits/chosen": -1.1681110858917236, "logits/rejected": -1.2372829914093018, "logps/chosen": -3.511944532394409, "logps/rejected": -4.100220680236816, "loss": 22.7025, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.24873590469360352, "rewards/margins": 0.03639525547623634, "rewards/rejected": -0.28513115644454956, "step": 300 }, { "epoch": 0.1919446192573946, "grad_norm": 560.2835083007812, "learning_rate": 3.907470599387209e-05, "logits/chosen": -1.101466178894043, "logits/rejected": -1.0982881784439087, "logps/chosen": -3.0287392139434814, "logps/rejected": -3.3412985801696777, "loss": 25.2732, "rewards/accuracies": 0.5625, "rewards/chosen": -0.210398867726326, "rewards/margins": 0.023965148255228996, "rewards/rejected": -0.23436403274536133, "step": 305 }, { "epoch": 0.19509125235997482, "grad_norm": 190.03529357910156, "learning_rate": 3.900750673501971e-05, "logits/chosen": -0.8078586459159851, "logits/rejected": -1.0514795780181885, "logps/chosen": -2.391371250152588, "logps/rejected": -3.401437282562256, "loss": 21.6721, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.14814777672290802, "rewards/margins": 0.07396493852138519, "rewards/rejected": -0.22211270034313202, "step": 310 }, { "epoch": 0.19823788546255505, "grad_norm": 127.30278778076172, "learning_rate": 3.893801403045078e-05, "logits/chosen": -0.9948938488960266, "logits/rejected": -1.1343729496002197, "logps/chosen": -2.520848274230957, "logps/rejected": -3.695737838745117, "loss": 21.1395, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.16981211304664612, "rewards/margins": 0.08695949614048004, "rewards/rejected": -0.25677159428596497, "step": 315 }, { "epoch": 0.2013845185651353, "grad_norm": 164.39279174804688, "learning_rate": 3.8866236265154864e-05, "logits/chosen": -1.059020757675171, "logits/rejected": -1.1909369230270386, "logps/chosen": -3.2958297729492188, "logps/rejected": -4.60178279876709, "loss": 23.1263, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.24349625408649445, "rewards/margins": 0.08941353857517242, "rewards/rejected": -0.33290979266166687, "step": 320 }, { "epoch": 0.20453115166771554, "grad_norm": 317.5319519042969, "learning_rate": 3.8792182099836956e-05, "logits/chosen": -1.1690977811813354, "logits/rejected": -1.221868872642517, "logps/chosen": -3.4916579723358154, "logps/rejected": -4.967286109924316, "loss": 19.5685, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.2618243992328644, "rewards/margins": 0.09256020933389664, "rewards/rejected": -0.35438457131385803, "step": 325 }, { "epoch": 0.20767778477029578, "grad_norm": 113.65757751464844, "learning_rate": 3.8715860469872456e-05, "logits/chosen": -1.230567216873169, "logits/rejected": -1.2354533672332764, "logps/chosen": -4.1219682693481445, "logps/rejected": -5.140664577484131, "loss": 24.1262, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.3101332485675812, "rewards/margins": 0.07826542854309082, "rewards/rejected": -0.3883987069129944, "step": 330 }, { "epoch": 0.21082441787287604, "grad_norm": 103.66908264160156, "learning_rate": 3.863728058422905e-05, "logits/chosen": -1.1679656505584717, "logits/rejected": -1.2492824792861938, "logps/chosen": -4.176590442657471, "logps/rejected": -5.121442794799805, "loss": 21.9799, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3107621371746063, "rewards/margins": 0.07555123418569565, "rewards/rejected": -0.38631340861320496, "step": 335 }, { "epoch": 0.21397105097545627, "grad_norm": 187.34596252441406, "learning_rate": 3.855645192435555e-05, "logits/chosen": -1.4208840131759644, "logits/rejected": -1.357755422592163, "logps/chosen": -3.746802568435669, "logps/rejected": -4.651678562164307, "loss": 21.8739, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.2814106345176697, "rewards/margins": 0.06742358207702637, "rewards/rejected": -0.34883421659469604, "step": 340 }, { "epoch": 0.2171176840780365, "grad_norm": 128.47970581054688, "learning_rate": 3.847338424303787e-05, "logits/chosen": -1.403939962387085, "logits/rejected": -1.3926942348480225, "logps/chosen": -3.540362596511841, "logps/rejected": -4.463648796081543, "loss": 22.9837, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2591942250728607, "rewards/margins": 0.06667342782020569, "rewards/rejected": -0.3258676528930664, "step": 345 }, { "epoch": 0.22026431718061673, "grad_norm": 91.00343322753906, "learning_rate": 3.838808756322222e-05, "logits/chosen": -1.4555909633636475, "logits/rejected": -1.4179480075836182, "logps/chosen": -3.3319029808044434, "logps/rejected": -4.7188615798950195, "loss": 22.182, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.24019880592823029, "rewards/margins": 0.09150617569684982, "rewards/rejected": -0.3317049741744995, "step": 350 }, { "epoch": 0.223410950283197, "grad_norm": 89.21013641357422, "learning_rate": 3.8300572176805796e-05, "logits/chosen": -1.505953073501587, "logits/rejected": -1.4713289737701416, "logps/chosen": -3.2633144855499268, "logps/rejected": -4.148341655731201, "loss": 22.4622, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.23655016720294952, "rewards/margins": 0.04711543396115303, "rewards/rejected": -0.28366559743881226, "step": 355 }, { "epoch": 0.22655758338577722, "grad_norm": 136.71780395507812, "learning_rate": 3.82108486433949e-05, "logits/chosen": -1.4959208965301514, "logits/rejected": -1.4308115243911743, "logps/chosen": -3.161681652069092, "logps/rejected": -3.9897868633270264, "loss": 23.3097, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.2291373759508133, "rewards/margins": 0.045841820538043976, "rewards/rejected": -0.2749791741371155, "step": 360 }, { "epoch": 0.22970421648835745, "grad_norm": 233.93896484375, "learning_rate": 3.8118927789030854e-05, "logits/chosen": -1.5138304233551025, "logits/rejected": -1.5346544981002808, "logps/chosen": -4.37386417388916, "logps/rejected": -5.469226837158203, "loss": 20.9319, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3008665442466736, "rewards/margins": 0.07115120440721512, "rewards/rejected": -0.3720177412033081, "step": 365 }, { "epoch": 0.2328508495909377, "grad_norm": 100.57418060302734, "learning_rate": 3.802482070488373e-05, "logits/chosen": -1.3890790939331055, "logits/rejected": -1.4434179067611694, "logps/chosen": -3.4095160961151123, "logps/rejected": -4.254734039306641, "loss": 21.2175, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.24669814109802246, "rewards/margins": 0.06480761617422104, "rewards/rejected": -0.3115057349205017, "step": 370 }, { "epoch": 0.23599748269351795, "grad_norm": 194.1370391845703, "learning_rate": 3.792853874591408e-05, "logits/chosen": -1.5562362670898438, "logits/rejected": -1.4487522840499878, "logps/chosen": -3.45831561088562, "logps/rejected": -4.16960334777832, "loss": 24.8363, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.23216786980628967, "rewards/margins": 0.047336287796497345, "rewards/rejected": -0.2795041799545288, "step": 375 }, { "epoch": 0.23914411579609818, "grad_norm": 88.31356811523438, "learning_rate": 3.783009352950282e-05, "logits/chosen": -1.371385097503662, "logits/rejected": -1.373175859451294, "logps/chosen": -2.55993390083313, "logps/rejected": -3.111349105834961, "loss": 22.3814, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.17149809002876282, "rewards/margins": 0.04337615519762039, "rewards/rejected": -0.214874267578125, "step": 380 }, { "epoch": 0.2422907488986784, "grad_norm": 126.74950408935547, "learning_rate": 3.772949693404954e-05, "logits/chosen": -1.33748459815979, "logits/rejected": -1.3754979372024536, "logps/chosen": -2.633439064025879, "logps/rejected": -3.534024715423584, "loss": 20.4661, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.17298361659049988, "rewards/margins": 0.07067564874887466, "rewards/rejected": -0.24365928769111633, "step": 385 }, { "epoch": 0.24543738200125864, "grad_norm": 90.40318298339844, "learning_rate": 3.762676109753919e-05, "logits/chosen": -1.2709859609603882, "logits/rejected": -1.294306755065918, "logps/chosen": -3.954099655151367, "logps/rejected": -5.9721527099609375, "loss": 21.932, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.29533275961875916, "rewards/margins": 0.12940457463264465, "rewards/rejected": -0.4247373640537262, "step": 390 }, { "epoch": 0.2485840151038389, "grad_norm": 84.59414672851562, "learning_rate": 3.7521898416077565e-05, "logits/chosen": -1.4984506368637085, "logits/rejected": -1.5229644775390625, "logps/chosen": -4.4091901779174805, "logps/rejected": -5.3940815925598145, "loss": 21.5859, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3109613358974457, "rewards/margins": 0.08055521547794342, "rewards/rejected": -0.3915165364742279, "step": 395 }, { "epoch": 0.2517306482064191, "grad_norm": 120.28202056884766, "learning_rate": 3.7414921542395546e-05, "logits/chosen": -1.5182693004608154, "logits/rejected": -1.5193490982055664, "logps/chosen": -4.545083045959473, "logps/rejected": -5.492725372314453, "loss": 21.539, "rewards/accuracies": 0.6875, "rewards/chosen": -0.36579376459121704, "rewards/margins": 0.06641928851604462, "rewards/rejected": -0.4322130084037781, "step": 400 }, { "epoch": 0.2548772813089994, "grad_norm": 143.28396606445312, "learning_rate": 3.7305843384322466e-05, "logits/chosen": -1.5114035606384277, "logits/rejected": -1.5092270374298096, "logps/chosen": -5.28603982925415, "logps/rejected": -6.232533931732178, "loss": 21.4891, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.43935927748680115, "rewards/margins": 0.08039890229701996, "rewards/rejected": -0.5197581648826599, "step": 405 }, { "epoch": 0.2580239144115796, "grad_norm": 129.09864807128906, "learning_rate": 3.71946771032286e-05, "logits/chosen": -1.6940416097640991, "logits/rejected": -1.6389005184173584, "logps/chosen": -5.122313022613525, "logps/rejected": -6.010600566864014, "loss": 21.8681, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.42212480306625366, "rewards/margins": 0.076592817902565, "rewards/rejected": -0.49871763586997986, "step": 410 }, { "epoch": 0.26117054751415986, "grad_norm": 1118.02392578125, "learning_rate": 3.708143611243716e-05, "logits/chosen": -1.65127432346344, "logits/rejected": -1.6758639812469482, "logps/chosen": -5.203777313232422, "logps/rejected": -6.3162078857421875, "loss": 21.2512, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.37822961807250977, "rewards/margins": 0.09629149734973907, "rewards/rejected": -0.4745211601257324, "step": 415 }, { "epoch": 0.2643171806167401, "grad_norm": 109.98821258544922, "learning_rate": 3.696613407560582e-05, "logits/chosen": -1.6237115859985352, "logits/rejected": -1.5712984800338745, "logps/chosen": -4.632975101470947, "logps/rejected": -6.082078456878662, "loss": 20.9477, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.3740273118019104, "rewards/margins": 0.103847935795784, "rewards/rejected": -0.4778752326965332, "step": 420 }, { "epoch": 0.2674638137193203, "grad_norm": 95.2988052368164, "learning_rate": 3.684878490507808e-05, "logits/chosen": -1.5806386470794678, "logits/rejected": -1.6192169189453125, "logps/chosen": -4.849827766418457, "logps/rejected": -5.800168037414551, "loss": 23.5806, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.3749791085720062, "rewards/margins": 0.07270670682191849, "rewards/rejected": -0.4476858079433441, "step": 425 }, { "epoch": 0.27061044682190055, "grad_norm": 111.99176788330078, "learning_rate": 3.6729402760204535e-05, "logits/chosen": -1.6522388458251953, "logits/rejected": -1.6433773040771484, "logps/chosen": -3.4129672050476074, "logps/rejected": -4.362156867980957, "loss": 21.9253, "rewards/accuracies": 0.6875, "rewards/chosen": -0.25616371631622314, "rewards/margins": 0.07649270445108414, "rewards/rejected": -0.3326564431190491, "step": 430 }, { "epoch": 0.2737570799244808, "grad_norm": 219.88124084472656, "learning_rate": 3.6608002045634535e-05, "logits/chosen": -1.7825971841812134, "logits/rejected": -1.6959110498428345, "logps/chosen": -3.785250425338745, "logps/rejected": -4.989777565002441, "loss": 22.1928, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.26640018820762634, "rewards/margins": 0.07046084105968475, "rewards/rejected": -0.3368610143661499, "step": 435 }, { "epoch": 0.27690371302706107, "grad_norm": 110.93528747558594, "learning_rate": 3.6484597409577975e-05, "logits/chosen": -1.8389028310775757, "logits/rejected": -1.7533693313598633, "logps/chosen": -3.4091110229492188, "logps/rejected": -4.324118614196777, "loss": 21.2394, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.25699272751808167, "rewards/margins": 0.06507135927677155, "rewards/rejected": -0.322064071893692, "step": 440 }, { "epoch": 0.2800503461296413, "grad_norm": 128.312255859375, "learning_rate": 3.6359203742037966e-05, "logits/chosen": -1.8402115106582642, "logits/rejected": -1.7344493865966797, "logps/chosen": -4.041749000549316, "logps/rejected": -4.417330741882324, "loss": 22.7853, "rewards/accuracies": 0.625, "rewards/chosen": -0.2921445965766907, "rewards/margins": 0.04909106716513634, "rewards/rejected": -0.3412356376647949, "step": 445 }, { "epoch": 0.28319697923222154, "grad_norm": 121.12706756591797, "learning_rate": 3.623183617301411e-05, "logits/chosen": -1.7311460971832275, "logits/rejected": -1.7096502780914307, "logps/chosen": -3.8819706439971924, "logps/rejected": -4.670237064361572, "loss": 22.6275, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.30139902234077454, "rewards/margins": 0.05851779133081436, "rewards/rejected": -0.3599168360233307, "step": 450 }, { "epoch": 0.28634361233480177, "grad_norm": 93.03048706054688, "learning_rate": 3.610251007067699e-05, "logits/chosen": -1.836363434791565, "logits/rejected": -1.736104965209961, "logps/chosen": -4.1447577476501465, "logps/rejected": -4.325010299682617, "loss": 26.2728, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.32724231481552124, "rewards/margins": 0.010385597124695778, "rewards/rejected": -0.33762794733047485, "step": 455 }, { "epoch": 0.289490245437382, "grad_norm": 76.58390808105469, "learning_rate": 3.597124103951379e-05, "logits/chosen": -1.7278220653533936, "logits/rejected": -1.7181174755096436, "logps/chosen": -4.0262017250061035, "logps/rejected": -4.855641841888428, "loss": 22.3804, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2886626124382019, "rewards/margins": 0.06016182899475098, "rewards/rejected": -0.3488244414329529, "step": 460 }, { "epoch": 0.29263687853996223, "grad_norm": 80.33660888671875, "learning_rate": 3.583804491844551e-05, "logits/chosen": -1.8658571243286133, "logits/rejected": -1.7413606643676758, "logps/chosen": -3.758129835128784, "logps/rejected": -4.306906223297119, "loss": 26.088, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2766272723674774, "rewards/margins": 0.03810672461986542, "rewards/rejected": -0.31473398208618164, "step": 465 }, { "epoch": 0.29578351164254246, "grad_norm": 66.17215728759766, "learning_rate": 3.5702937778915765e-05, "logits/chosen": -1.8694692850112915, "logits/rejected": -1.82939875125885, "logps/chosen": -2.9322712421417236, "logps/rejected": -3.7157013416290283, "loss": 21.7852, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.2061152458190918, "rewards/margins": 0.056372471153736115, "rewards/rejected": -0.2624877095222473, "step": 470 }, { "epoch": 0.2989301447451227, "grad_norm": 95.2267837524414, "learning_rate": 3.556593592295171e-05, "logits/chosen": -1.8632274866104126, "logits/rejected": -1.8683363199234009, "logps/chosen": -2.8304595947265625, "logps/rejected": -3.464296817779541, "loss": 22.1458, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.19707690179347992, "rewards/margins": 0.04870922490954399, "rewards/rejected": -0.24578611552715302, "step": 475 }, { "epoch": 0.302076777847703, "grad_norm": 128.1005096435547, "learning_rate": 3.5427055881196946e-05, "logits/chosen": -1.7504918575286865, "logits/rejected": -1.8846075534820557, "logps/chosen": -2.7551674842834473, "logps/rejected": -3.501314163208008, "loss": 21.4037, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.192325159907341, "rewards/margins": 0.05459358170628548, "rewards/rejected": -0.2469187080860138, "step": 480 }, { "epoch": 0.3052234109502832, "grad_norm": 64.81920623779297, "learning_rate": 3.5286314410916967e-05, "logits/chosen": -1.8015562295913696, "logits/rejected": -1.9157085418701172, "logps/chosen": -3.297150135040283, "logps/rejected": -4.347265243530273, "loss": 20.2599, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.24196556210517883, "rewards/margins": 0.06687469035387039, "rewards/rejected": -0.30884024500846863, "step": 485 }, { "epoch": 0.30837004405286345, "grad_norm": 121.4966812133789, "learning_rate": 3.5143728493977245e-05, "logits/chosen": -1.7404873371124268, "logits/rejected": -1.8498218059539795, "logps/chosen": -3.553678035736084, "logps/rejected": -4.084536075592041, "loss": 24.4702, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.2725631594657898, "rewards/margins": 0.037132084369659424, "rewards/rejected": -0.3096952736377716, "step": 490 }, { "epoch": 0.3115166771554437, "grad_norm": 102.46180725097656, "learning_rate": 3.499931533479417e-05, "logits/chosen": -1.7682313919067383, "logits/rejected": -1.7660820484161377, "logps/chosen": -3.595475435256958, "logps/rejected": -4.801576137542725, "loss": 20.9722, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.2746976315975189, "rewards/margins": 0.10004003345966339, "rewards/rejected": -0.3747376501560211, "step": 495 }, { "epoch": 0.3146633102580239, "grad_norm": 100.82923889160156, "learning_rate": 3.485309235825916e-05, "logits/chosen": -1.7638380527496338, "logits/rejected": -1.857962965965271, "logps/chosen": -4.1785569190979, "logps/rejected": -5.445678234100342, "loss": 20.121, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.30823373794555664, "rewards/margins": 0.09736496210098267, "rewards/rejected": -0.4055987298488617, "step": 500 }, { "epoch": 0.31780994336060414, "grad_norm": 299.635009765625, "learning_rate": 3.470507720763625e-05, "logits/chosen": -1.7603092193603516, "logits/rejected": -1.8294856548309326, "logps/chosen": -3.818953037261963, "logps/rejected": -4.965951442718506, "loss": 24.0421, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2867090702056885, "rewards/margins": 0.09908684343099594, "rewards/rejected": -0.385795921087265, "step": 505 }, { "epoch": 0.3209565764631844, "grad_norm": 121.77188110351562, "learning_rate": 3.4555287742433115e-05, "logits/chosen": -1.8968608379364014, "logits/rejected": -1.863628625869751, "logps/chosen": -3.3851046562194824, "logps/rejected": -4.313992500305176, "loss": 21.5651, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2504531145095825, "rewards/margins": 0.07505444437265396, "rewards/rejected": -0.3255075514316559, "step": 510 }, { "epoch": 0.3241032095657646, "grad_norm": 84.7723617553711, "learning_rate": 3.440374203624628e-05, "logits/chosen": -1.8949018716812134, "logits/rejected": -2.03389573097229, "logps/chosen": -3.739046573638916, "logps/rejected": -4.937285423278809, "loss": 22.0895, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2827950417995453, "rewards/margins": 0.07987246662378311, "rewards/rejected": -0.3626675605773926, "step": 515 }, { "epoch": 0.3272498426683449, "grad_norm": 96.02967071533203, "learning_rate": 3.425045837458028e-05, "logits/chosen": -1.9336235523223877, "logits/rejected": -1.9811556339263916, "logps/chosen": -3.5748794078826904, "logps/rejected": -4.64247465133667, "loss": 20.7454, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2698992192745209, "rewards/margins": 0.07278282940387726, "rewards/rejected": -0.3426820635795593, "step": 520 }, { "epoch": 0.3303964757709251, "grad_norm": 138.71051025390625, "learning_rate": 3.4095455252641376e-05, "logits/chosen": -1.938104271888733, "logits/rejected": -2.024137020111084, "logps/chosen": -4.332060813903809, "logps/rejected": -5.391437530517578, "loss": 23.3511, "rewards/accuracies": 0.625, "rewards/chosen": -0.3168641924858093, "rewards/margins": 0.049729883670806885, "rewards/rejected": -0.3665940761566162, "step": 525 }, { "epoch": 0.33354310887350536, "grad_norm": 93.8726577758789, "learning_rate": 3.393875137310588e-05, "logits/chosen": -1.8752260208129883, "logits/rejected": -1.8945411443710327, "logps/chosen": -4.053868770599365, "logps/rejected": -5.044325828552246, "loss": 21.8528, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.3227534890174866, "rewards/margins": 0.0821223258972168, "rewards/rejected": -0.4048757553100586, "step": 530 }, { "epoch": 0.3366897419760856, "grad_norm": 261.39129638671875, "learning_rate": 3.378036564386349e-05, "logits/chosen": -1.770957589149475, "logits/rejected": -1.8808790445327759, "logps/chosen": -3.8808326721191406, "logps/rejected": -4.960693836212158, "loss": 23.7267, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.3041539788246155, "rewards/margins": 0.08733677119016647, "rewards/rejected": -0.39149072766304016, "step": 535 }, { "epoch": 0.3398363750786658, "grad_norm": 141.79991149902344, "learning_rate": 3.3620317175735945e-05, "logits/chosen": -1.929517149925232, "logits/rejected": -1.8599262237548828, "logps/chosen": -4.427219867706299, "logps/rejected": -5.757664680480957, "loss": 20.8591, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.3481447994709015, "rewards/margins": 0.0858476310968399, "rewards/rejected": -0.4339924454689026, "step": 540 }, { "epoch": 0.34298300818124605, "grad_norm": 76.495361328125, "learning_rate": 3.345862528017101e-05, "logits/chosen": -1.8648240566253662, "logits/rejected": -1.899430513381958, "logps/chosen": -4.430551528930664, "logps/rejected": -5.134209156036377, "loss": 21.6823, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3582889139652252, "rewards/margins": 0.05610053986310959, "rewards/rejected": -0.4143894612789154, "step": 545 }, { "epoch": 0.3461296412838263, "grad_norm": 65.95896911621094, "learning_rate": 3.32953094669124e-05, "logits/chosen": -1.6951459646224976, "logits/rejected": -1.7398831844329834, "logps/chosen": -5.35291051864624, "logps/rejected": -6.347973823547363, "loss": 24.8551, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.4343182146549225, "rewards/margins": 0.085027314722538, "rewards/rejected": -0.5193454623222351, "step": 550 }, { "epoch": 0.34927627438640657, "grad_norm": 64.50738525390625, "learning_rate": 3.313038944164577e-05, "logits/chosen": -1.7779582738876343, "logits/rejected": -1.8077032566070557, "logps/chosen": -4.008457183837891, "logps/rejected": -5.838412761688232, "loss": 19.2472, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.3185553550720215, "rewards/margins": 0.10776933282613754, "rewards/rejected": -0.4263246953487396, "step": 555 }, { "epoch": 0.3524229074889868, "grad_norm": 62.579227447509766, "learning_rate": 3.296388510362095e-05, "logits/chosen": -1.5932537317276, "logits/rejected": -1.7019790410995483, "logps/chosen": -4.049741268157959, "logps/rejected": -4.859818935394287, "loss": 21.4107, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.29325228929519653, "rewards/margins": 0.06688085943460464, "rewards/rejected": -0.36013317108154297, "step": 560 }, { "epoch": 0.35556954059156703, "grad_norm": 105.9216079711914, "learning_rate": 3.2795816543250977e-05, "logits/chosen": -1.5411794185638428, "logits/rejected": -1.5789968967437744, "logps/chosen": -3.8824076652526855, "logps/rejected": -4.560225486755371, "loss": 23.1195, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.2929798662662506, "rewards/margins": 0.05188722163438797, "rewards/rejected": -0.34486711025238037, "step": 565 }, { "epoch": 0.35871617369414727, "grad_norm": 55.46923065185547, "learning_rate": 3.262620403968792e-05, "logits/chosen": -1.5855820178985596, "logits/rejected": -1.7370961904525757, "logps/chosen": -3.6918272972106934, "logps/rejected": -5.205948352813721, "loss": 19.1367, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.27848196029663086, "rewards/margins": 0.11322972923517227, "rewards/rejected": -0.3917117416858673, "step": 570 }, { "epoch": 0.3618628067967275, "grad_norm": 114.82603454589844, "learning_rate": 3.245506805837605e-05, "logits/chosen": -1.6395822763442993, "logits/rejected": -1.8543764352798462, "logps/chosen": -4.298351287841797, "logps/rejected": -5.546226501464844, "loss": 19.9406, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.30993199348449707, "rewards/margins": 0.08511951565742493, "rewards/rejected": -0.3950514793395996, "step": 575 }, { "epoch": 0.36500943989930773, "grad_norm": 174.55496215820312, "learning_rate": 3.228242924858248e-05, "logits/chosen": -1.5872471332550049, "logits/rejected": -1.688132882118225, "logps/chosen": -4.568819999694824, "logps/rejected": -5.411607265472412, "loss": 22.4314, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.34597450494766235, "rewards/margins": 0.07728902995586395, "rewards/rejected": -0.4232635498046875, "step": 580 }, { "epoch": 0.36815607300188796, "grad_norm": 70.5542221069336, "learning_rate": 3.210830844090555e-05, "logits/chosen": -1.6192104816436768, "logits/rejected": -1.6785539388656616, "logps/chosen": -5.1252007484436035, "logps/rejected": -5.851187705993652, "loss": 25.8619, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.40460100769996643, "rewards/margins": 0.06072293594479561, "rewards/rejected": -0.46532392501831055, "step": 585 }, { "epoch": 0.3713027061044682, "grad_norm": 100.62268829345703, "learning_rate": 3.193272664476152e-05, "logits/chosen": -1.7602649927139282, "logits/rejected": -1.9346716403961182, "logps/chosen": -4.961272239685059, "logps/rejected": -5.8130645751953125, "loss": 22.8852, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.3982604444026947, "rewards/margins": 0.059664536267519, "rewards/rejected": -0.457925021648407, "step": 590 }, { "epoch": 0.3744493392070485, "grad_norm": 411.0801696777344, "learning_rate": 3.1755705045849465e-05, "logits/chosen": -1.7633399963378906, "logits/rejected": -1.818737268447876, "logps/chosen": -5.510100364685059, "logps/rejected": -6.382575035095215, "loss": 23.8471, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.4236491620540619, "rewards/margins": 0.06903719902038574, "rewards/rejected": -0.49268636107444763, "step": 595 }, { "epoch": 0.3775959723096287, "grad_norm": 98.035888671875, "learning_rate": 3.157726500359509e-05, "logits/chosen": -1.825554609298706, "logits/rejected": -1.907472014427185, "logps/chosen": -5.569567680358887, "logps/rejected": -6.1025004386901855, "loss": 24.087, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.4460601210594177, "rewards/margins": 0.03472483158111572, "rewards/rejected": -0.48078498244285583, "step": 600 }, { "epoch": 0.38074260541220895, "grad_norm": 80.47187805175781, "learning_rate": 3.1397428048573465e-05, "logits/chosen": -1.798015832901001, "logits/rejected": -1.9216489791870117, "logps/chosen": -4.644695281982422, "logps/rejected": -5.7896294593811035, "loss": 19.835, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.380901575088501, "rewards/margins": 0.08407244086265564, "rewards/rejected": -0.4649740159511566, "step": 605 }, { "epoch": 0.3838892385147892, "grad_norm": 65.88395690917969, "learning_rate": 3.121621587991113e-05, "logits/chosen": -1.9489303827285767, "logits/rejected": -1.9782030582427979, "logps/chosen": -4.736275672912598, "logps/rejected": -5.893181800842285, "loss": 21.2523, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.37491172552108765, "rewards/margins": 0.09068160504102707, "rewards/rejected": -0.46559327840805054, "step": 610 }, { "epoch": 0.3870358716173694, "grad_norm": 126.57975769042969, "learning_rate": 3.1033650362667935e-05, "logits/chosen": -1.945927619934082, "logits/rejected": -2.0246009826660156, "logps/chosen": -4.42104434967041, "logps/rejected": -5.623631000518799, "loss": 20.477, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.3376965820789337, "rewards/margins": 0.07996558398008347, "rewards/rejected": -0.41766220331192017, "step": 615 }, { "epoch": 0.39018250471994964, "grad_norm": 88.92438507080078, "learning_rate": 3.084975352519874e-05, "logits/chosen": -2.063378095626831, "logits/rejected": -2.161208391189575, "logps/chosen": -4.2682085037231445, "logps/rejected": -5.291066646575928, "loss": 22.2295, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.3386593759059906, "rewards/margins": 0.07158732414245605, "rewards/rejected": -0.41024675965309143, "step": 620 }, { "epoch": 0.3933291378225299, "grad_norm": 53.47737503051758, "learning_rate": 3.06645475564955e-05, "logits/chosen": -1.9409205913543701, "logits/rejected": -2.0371243953704834, "logps/chosen": -3.6241352558135986, "logps/rejected": -5.033164978027344, "loss": 20.5698, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.27396219968795776, "rewards/margins": 0.09085332602262497, "rewards/rejected": -0.36481553316116333, "step": 625 }, { "epoch": 0.3964757709251101, "grad_norm": 87.2447738647461, "learning_rate": 3.0478054803509975e-05, "logits/chosen": -1.9413238763809204, "logits/rejected": -1.989638328552246, "logps/chosen": -3.974926710128784, "logps/rejected": -5.115756034851074, "loss": 20.8679, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.3056657314300537, "rewards/margins": 0.09486590325832367, "rewards/rejected": -0.4005316197872162, "step": 630 }, { "epoch": 0.3996224040276904, "grad_norm": 105.37754821777344, "learning_rate": 3.029029776845726e-05, "logits/chosen": -1.9769777059555054, "logits/rejected": -2.0631349086761475, "logps/chosen": -4.811491012573242, "logps/rejected": -6.024916648864746, "loss": 22.3949, "rewards/accuracies": 0.6875, "rewards/chosen": -0.36858421564102173, "rewards/margins": 0.09452919661998749, "rewards/rejected": -0.463113397359848, "step": 635 }, { "epoch": 0.4027690371302706, "grad_norm": 107.63380432128906, "learning_rate": 3.0101299106100766e-05, "logits/chosen": -1.9259755611419678, "logits/rejected": -2.0011420249938965, "logps/chosen": -4.672276496887207, "logps/rejected": -5.433979034423828, "loss": 23.4548, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.34038934111595154, "rewards/margins": 0.05264373868703842, "rewards/rejected": -0.39303308725357056, "step": 640 }, { "epoch": 0.40591567023285086, "grad_norm": 72.93191528320312, "learning_rate": 2.991108162101862e-05, "logits/chosen": -1.8639154434204102, "logits/rejected": -2.00860333442688, "logps/chosen": -4.0379438400268555, "logps/rejected": -4.966481685638428, "loss": 24.2063, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.3016512095928192, "rewards/margins": 0.05989114195108414, "rewards/rejected": -0.36154234409332275, "step": 645 }, { "epoch": 0.4090623033354311, "grad_norm": 241.30491638183594, "learning_rate": 2.971966826485212e-05, "logits/chosen": -2.0276923179626465, "logits/rejected": -2.075092077255249, "logps/chosen": -3.9584078788757324, "logps/rejected": -4.5398454666137695, "loss": 22.3358, "rewards/accuracies": 0.625, "rewards/chosen": -0.2686071991920471, "rewards/margins": 0.05414595082402229, "rewards/rejected": -0.3227531313896179, "step": 650 }, { "epoch": 0.4122089364380113, "grad_norm": 72.65229797363281, "learning_rate": 2.952708213353636e-05, "logits/chosen": -2.087306499481201, "logits/rejected": -2.120595932006836, "logps/chosen": -2.7464280128479004, "logps/rejected": -3.2665913105010986, "loss": 23.396, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.19495923817157745, "rewards/margins": 0.03470323234796524, "rewards/rejected": -0.2296624630689621, "step": 655 }, { "epoch": 0.41535556954059155, "grad_norm": 36.565982818603516, "learning_rate": 2.9333346464513476e-05, "logits/chosen": -2.0568580627441406, "logits/rejected": -2.171510934829712, "logps/chosen": -3.1527762413024902, "logps/rejected": -3.5696024894714355, "loss": 23.204, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.2180822342634201, "rewards/margins": 0.029619824141263962, "rewards/rejected": -0.24770204722881317, "step": 660 }, { "epoch": 0.4185022026431718, "grad_norm": 57.84255599975586, "learning_rate": 2.9138484633928818e-05, "logits/chosen": -1.940320372581482, "logits/rejected": -1.9845908880233765, "logps/chosen": -3.0434772968292236, "logps/rejected": -3.5398964881896973, "loss": 24.3501, "rewards/accuracies": 0.625, "rewards/chosen": -0.2063741683959961, "rewards/margins": 0.023456847295165062, "rewards/rejected": -0.2298310250043869, "step": 665 }, { "epoch": 0.42164883574575207, "grad_norm": 56.995887756347656, "learning_rate": 2.8942520153810396e-05, "logits/chosen": -2.0002236366271973, "logits/rejected": -2.08671498298645, "logps/chosen": -2.834512710571289, "logps/rejected": -3.5050129890441895, "loss": 22.4039, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.18090704083442688, "rewards/margins": 0.04532923549413681, "rewards/rejected": -0.2262362688779831, "step": 670 }, { "epoch": 0.4247954688483323, "grad_norm": 75.65125274658203, "learning_rate": 2.8745476669231894e-05, "logits/chosen": -2.020886182785034, "logits/rejected": -2.111823558807373, "logps/chosen": -3.5571112632751465, "logps/rejected": -4.481097221374512, "loss": 22.9676, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.22384686768054962, "rewards/margins": 0.04108366742730141, "rewards/rejected": -0.2649305462837219, "step": 675 }, { "epoch": 0.42794210195091253, "grad_norm": 77.30415344238281, "learning_rate": 2.8547377955459704e-05, "logits/chosen": -1.9961265325546265, "logits/rejected": -2.0482177734375, "logps/chosen": -2.892690658569336, "logps/rejected": -3.2253260612487793, "loss": 25.6658, "rewards/accuracies": 0.5, "rewards/chosen": -0.19483526051044464, "rewards/margins": 0.01912742853164673, "rewards/rejected": -0.21396267414093018, "step": 680 }, { "epoch": 0.43108873505349277, "grad_norm": 49.21062088012695, "learning_rate": 2.834824791508413e-05, "logits/chosen": -1.930086374282837, "logits/rejected": -2.131298542022705, "logps/chosen": -2.739534854888916, "logps/rejected": -3.5602822303771973, "loss": 21.1908, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.17746233940124512, "rewards/margins": 0.06554970890283585, "rewards/rejected": -0.24301204085350037, "step": 685 }, { "epoch": 0.434235368156073, "grad_norm": 64.88590240478516, "learning_rate": 2.814811057513537e-05, "logits/chosen": -2.0517029762268066, "logits/rejected": -2.067883253097534, "logps/chosen": -2.82458758354187, "logps/rejected": -3.6670260429382324, "loss": 21.8595, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.1833667755126953, "rewards/margins": 0.0560932457447052, "rewards/rejected": -0.2394600361585617, "step": 690 }, { "epoch": 0.43738200125865323, "grad_norm": 48.841331481933594, "learning_rate": 2.7946990084184383e-05, "logits/chosen": -1.798683524131775, "logits/rejected": -1.9806129932403564, "logps/chosen": -3.2995662689208984, "logps/rejected": -4.0815110206604, "loss": 22.0918, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2146444320678711, "rewards/margins": 0.05965212732553482, "rewards/rejected": -0.27429652214050293, "step": 695 }, { "epoch": 0.44052863436123346, "grad_norm": 266.59381103515625, "learning_rate": 2.7744910709429104e-05, "logits/chosen": -1.800355315208435, "logits/rejected": -1.9262745380401611, "logps/chosen": -3.308371067047119, "logps/rejected": -4.3786821365356445, "loss": 22.6616, "rewards/accuracies": 0.6875, "rewards/chosen": -0.20500688254833221, "rewards/margins": 0.07705695927143097, "rewards/rejected": -0.2820638120174408, "step": 700 }, { "epoch": 0.4436752674638137, "grad_norm": 45.74457550048828, "learning_rate": 2.754189683376641e-05, "logits/chosen": -1.8245214223861694, "logits/rejected": -1.9188095331192017, "logps/chosen": -2.6574292182922363, "logps/rejected": -3.3347110748291016, "loss": 21.6472, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1792256087064743, "rewards/margins": 0.054762959480285645, "rewards/rejected": -0.23398856818675995, "step": 705 }, { "epoch": 0.446821900566394, "grad_norm": 82.67216491699219, "learning_rate": 2.7337972952850047e-05, "logits/chosen": -1.764173150062561, "logits/rejected": -1.9260650873184204, "logps/chosen": -2.8055293560028076, "logps/rejected": -3.9603447914123535, "loss": 21.7022, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.19627173244953156, "rewards/margins": 0.07794789969921112, "rewards/rejected": -0.2742196321487427, "step": 710 }, { "epoch": 0.4499685336689742, "grad_norm": 63.396240234375, "learning_rate": 2.713316367213499e-05, "logits/chosen": -1.6747219562530518, "logits/rejected": -1.8347587585449219, "logps/chosen": -2.9625911712646484, "logps/rejected": -3.7656357288360596, "loss": 22.6149, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.21037223935127258, "rewards/margins": 0.05833571031689644, "rewards/rejected": -0.26870793104171753, "step": 715 }, { "epoch": 0.45311516677155445, "grad_norm": 118.00112915039062, "learning_rate": 2.692749370390855e-05, "logits/chosen": -1.7990179061889648, "logits/rejected": -1.8915067911148071, "logps/chosen": -3.0249316692352295, "logps/rejected": -4.06134033203125, "loss": 23.4425, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.21054935455322266, "rewards/margins": 0.05246324464678764, "rewards/rejected": -0.2630125880241394, "step": 720 }, { "epoch": 0.4562617998741347, "grad_norm": 64.52631378173828, "learning_rate": 2.6720987864308603e-05, "logits/chosen": -1.695908546447754, "logits/rejected": -1.7583353519439697, "logps/chosen": -2.815432548522949, "logps/rejected": -4.123710632324219, "loss": 21.0095, "rewards/accuracies": 0.625, "rewards/chosen": -0.1960502415895462, "rewards/margins": 0.08241166174411774, "rewards/rejected": -0.27846187353134155, "step": 725 }, { "epoch": 0.4594084329767149, "grad_norm": 59.4410285949707, "learning_rate": 2.6513671070329244e-05, "logits/chosen": -1.7788522243499756, "logits/rejected": -1.8245208263397217, "logps/chosen": -3.012934446334839, "logps/rejected": -4.003429412841797, "loss": 21.1484, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.2150099277496338, "rewards/margins": 0.07829871028661728, "rewards/rejected": -0.2933086156845093, "step": 730 }, { "epoch": 0.46255506607929514, "grad_norm": 84.89627075195312, "learning_rate": 2.630556833681434e-05, "logits/chosen": -1.738438606262207, "logits/rejected": -1.8424345254898071, "logps/chosen": -2.7983458042144775, "logps/rejected": -4.087245941162109, "loss": 19.2453, "rewards/accuracies": 0.75, "rewards/chosen": -0.19751907885074615, "rewards/margins": 0.09776587784290314, "rewards/rejected": -0.2952849566936493, "step": 735 }, { "epoch": 0.4657016991818754, "grad_norm": 101.38806915283203, "learning_rate": 2.609670477343921e-05, "logits/chosen": -1.6957628726959229, "logits/rejected": -1.825757384300232, "logps/chosen": -4.030215263366699, "logps/rejected": -5.008100509643555, "loss": 22.1478, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.30844345688819885, "rewards/margins": 0.0614703968167305, "rewards/rejected": -0.36991381645202637, "step": 740 }, { "epoch": 0.46884833228445566, "grad_norm": 101.18181610107422, "learning_rate": 2.5887105581680905e-05, "logits/chosen": -1.7838348150253296, "logits/rejected": -1.7674500942230225, "logps/chosen": -4.438131809234619, "logps/rejected": -5.542893886566162, "loss": 23.806, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.36128634214401245, "rewards/margins": 0.07241909205913544, "rewards/rejected": -0.43370547890663147, "step": 745 }, { "epoch": 0.4719949653870359, "grad_norm": 89.2279052734375, "learning_rate": 2.567679605177739e-05, "logits/chosen": -1.7873433828353882, "logits/rejected": -1.831865906715393, "logps/chosen": -4.315898895263672, "logps/rejected": -5.43391227722168, "loss": 20.4258, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.32439109683036804, "rewards/margins": 0.09124849736690521, "rewards/rejected": -0.41563957929611206, "step": 750 }, { "epoch": 0.4751415984896161, "grad_norm": 68.27491760253906, "learning_rate": 2.5465801559676033e-05, "logits/chosen": -1.716103196144104, "logits/rejected": -1.744837999343872, "logps/chosen": -3.913160800933838, "logps/rejected": -5.709442615509033, "loss": 19.3215, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.30374833941459656, "rewards/margins": 0.12692494690418243, "rewards/rejected": -0.4306732714176178, "step": 755 }, { "epoch": 0.47828823159219636, "grad_norm": 149.6294708251953, "learning_rate": 2.525414756397174e-05, "logits/chosen": -1.7440742254257202, "logits/rejected": -1.8239097595214844, "logps/chosen": -3.586292266845703, "logps/rejected": -4.596356391906738, "loss": 19.9662, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.2702713906764984, "rewards/margins": 0.08218260109424591, "rewards/rejected": -0.3524540364742279, "step": 760 }, { "epoch": 0.4814348646947766, "grad_norm": 102.944580078125, "learning_rate": 2.504185960283512e-05, "logits/chosen": -1.7996543645858765, "logits/rejected": -1.8109557628631592, "logps/chosen": -4.447735786437988, "logps/rejected": -5.870986461639404, "loss": 20.4207, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.35062670707702637, "rewards/margins": 0.09269314259290695, "rewards/rejected": -0.4433198869228363, "step": 765 }, { "epoch": 0.4845814977973568, "grad_norm": 128.53907775878906, "learning_rate": 2.482896329093106e-05, "logits/chosen": -1.9051790237426758, "logits/rejected": -1.9270706176757812, "logps/chosen": -5.1721906661987305, "logps/rejected": -6.744166374206543, "loss": 19.0615, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.4306749701499939, "rewards/margins": 0.1142655462026596, "rewards/rejected": -0.5449405312538147, "step": 770 }, { "epoch": 0.48772813089993705, "grad_norm": 123.44400024414062, "learning_rate": 2.4615484316328023e-05, "logits/chosen": -1.8487358093261719, "logits/rejected": -1.8219711780548096, "logps/chosen": -5.741638660430908, "logps/rejected": -7.048303127288818, "loss": 22.6075, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.4748842120170593, "rewards/margins": 0.09859482944011688, "rewards/rejected": -0.5734790563583374, "step": 775 }, { "epoch": 0.4908747640025173, "grad_norm": 97.28683471679688, "learning_rate": 2.440144843739857e-05, "logits/chosen": -1.8166711330413818, "logits/rejected": -1.856359839439392, "logps/chosen": -6.369978904724121, "logps/rejected": -7.745943546295166, "loss": 21.1624, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5159797072410583, "rewards/margins": 0.09467221796512604, "rewards/rejected": -0.610651969909668, "step": 780 }, { "epoch": 0.49402139710509757, "grad_norm": 94.76971435546875, "learning_rate": 2.4186881479711338e-05, "logits/chosen": -1.8901869058609009, "logits/rejected": -1.996917724609375, "logps/chosen": -5.151943206787109, "logps/rejected": -6.655333518981934, "loss": 17.5696, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3987768888473511, "rewards/margins": 0.11971308290958405, "rewards/rejected": -0.5184900164604187, "step": 785 }, { "epoch": 0.4971680302076778, "grad_norm": 362.07489013671875, "learning_rate": 2.397180933291491e-05, "logits/chosen": -1.6789305210113525, "logits/rejected": -1.75827157497406, "logps/chosen": -4.5332841873168945, "logps/rejected": -5.266444206237793, "loss": 22.7215, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.3525087535381317, "rewards/margins": 0.07219593226909637, "rewards/rejected": -0.42470473051071167, "step": 790 }, { "epoch": 0.500314663310258, "grad_norm": 181.0984344482422, "learning_rate": 2.375625794761401e-05, "logits/chosen": -1.769201636314392, "logits/rejected": -1.7219161987304688, "logps/chosen": -4.633937358856201, "logps/rejected": -5.043046474456787, "loss": 26.0541, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.3703366816043854, "rewards/margins": 0.028562629595398903, "rewards/rejected": -0.3988993167877197, "step": 795 }, { "epoch": 0.5034612964128382, "grad_norm": 120.9494857788086, "learning_rate": 2.3540253332238266e-05, "logits/chosen": -1.6151552200317383, "logits/rejected": -1.646795630455017, "logps/chosen": -4.029574394226074, "logps/rejected": -5.215254783630371, "loss": 20.2479, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.314275324344635, "rewards/margins": 0.08437344431877136, "rewards/rejected": -0.39864879846572876, "step": 800 }, { "epoch": 0.5066079295154186, "grad_norm": 119.4858169555664, "learning_rate": 2.3323821549904038e-05, "logits/chosen": -1.670577049255371, "logits/rejected": -1.5533939599990845, "logps/chosen": -3.9187912940979004, "logps/rejected": -4.743254661560059, "loss": 23.6037, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.3001677095890045, "rewards/margins": 0.06169123575091362, "rewards/rejected": -0.36185896396636963, "step": 805 }, { "epoch": 0.5097545626179988, "grad_norm": 316.2997741699219, "learning_rate": 2.310698871526966e-05, "logits/chosen": -1.5207440853118896, "logits/rejected": -1.6267799139022827, "logps/chosen": -3.097418785095215, "logps/rejected": -4.804646015167236, "loss": 21.8575, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.22499537467956543, "rewards/margins": 0.11616162210702896, "rewards/rejected": -0.3411570191383362, "step": 810 }, { "epoch": 0.512901195720579, "grad_norm": 78.00189971923828, "learning_rate": 2.288978099138443e-05, "logits/chosen": -1.5745933055877686, "logits/rejected": -1.5564606189727783, "logps/chosen": -2.8804163932800293, "logps/rejected": -3.5308539867401123, "loss": 22.241, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.20254115760326385, "rewards/margins": 0.05405501648783684, "rewards/rejected": -0.2565961480140686, "step": 815 }, { "epoch": 0.5160478288231592, "grad_norm": 118.51643371582031, "learning_rate": 2.267222458653179e-05, "logits/chosen": -1.5091989040374756, "logits/rejected": -1.6645923852920532, "logps/chosen": -3.255237579345703, "logps/rejected": -4.126650333404541, "loss": 22.0187, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.23314771056175232, "rewards/margins": 0.06177164986729622, "rewards/rejected": -0.29491934180259705, "step": 820 }, { "epoch": 0.5191944619257395, "grad_norm": 68.80047607421875, "learning_rate": 2.245434575106702e-05, "logits/chosen": -1.525356411933899, "logits/rejected": -1.701436996459961, "logps/chosen": -3.166797161102295, "logps/rejected": -4.742985248565674, "loss": 20.3686, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2261020839214325, "rewards/margins": 0.08829782903194427, "rewards/rejected": -0.3143998980522156, "step": 825 }, { "epoch": 0.5223410950283197, "grad_norm": 73.1375503540039, "learning_rate": 2.223617077424988e-05, "logits/chosen": -1.6771663427352905, "logits/rejected": -1.7121098041534424, "logps/chosen": -3.020296573638916, "logps/rejected": -4.426422119140625, "loss": 20.0836, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.21738722920417786, "rewards/margins": 0.09777109324932098, "rewards/rejected": -0.31515830755233765, "step": 830 }, { "epoch": 0.5254877281309, "grad_norm": 76.68984985351562, "learning_rate": 2.2017725981072536e-05, "logits/chosen": -1.4603363275527954, "logits/rejected": -1.5595886707305908, "logps/chosen": -3.6973624229431152, "logps/rejected": -5.027807712554932, "loss": 20.512, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.2720819115638733, "rewards/margins": 0.08642515540122986, "rewards/rejected": -0.35850709676742554, "step": 835 }, { "epoch": 0.5286343612334802, "grad_norm": 122.99668884277344, "learning_rate": 2.1799037729083213e-05, "logits/chosen": -1.5949891805648804, "logits/rejected": -1.7137962579727173, "logps/chosen": -3.5109829902648926, "logps/rejected": -4.95348596572876, "loss": 21.517, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.27030450105667114, "rewards/margins": 0.09910550713539124, "rewards/rejected": -0.36940997838974, "step": 840 }, { "epoch": 0.5317809943360604, "grad_norm": 65.23582458496094, "learning_rate": 2.1580132405205862e-05, "logits/chosen": -1.4871020317077637, "logits/rejected": -1.5624678134918213, "logps/chosen": -4.474881172180176, "logps/rejected": -5.375269412994385, "loss": 23.3138, "rewards/accuracies": 0.625, "rewards/chosen": -0.3389451503753662, "rewards/margins": 0.06582923233509064, "rewards/rejected": -0.40477436780929565, "step": 845 }, { "epoch": 0.5349276274386406, "grad_norm": 175.08432006835938, "learning_rate": 2.1361036422556337e-05, "logits/chosen": -1.5353832244873047, "logits/rejected": -1.596407175064087, "logps/chosen": -3.814873218536377, "logps/rejected": -4.92036771774292, "loss": 21.5442, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2893931567668915, "rewards/margins": 0.07075894623994827, "rewards/rejected": -0.36015206575393677, "step": 850 }, { "epoch": 0.5380742605412209, "grad_norm": 64.21197509765625, "learning_rate": 2.1141776217255365e-05, "logits/chosen": -1.567317247390747, "logits/rejected": -1.5555747747421265, "logps/chosen": -3.8906242847442627, "logps/rejected": -4.897479057312012, "loss": 21.8379, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.29526472091674805, "rewards/margins": 0.06354343891143799, "rewards/rejected": -0.35880815982818604, "step": 855 }, { "epoch": 0.5412208936438011, "grad_norm": 104.57052612304688, "learning_rate": 2.0922378245238787e-05, "logits/chosen": -1.5869696140289307, "logits/rejected": -1.6049997806549072, "logps/chosen": -3.8140482902526855, "logps/rejected": -4.755133628845215, "loss": 23.1968, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.29255491495132446, "rewards/margins": 0.052004069089889526, "rewards/rejected": -0.3445590138435364, "step": 860 }, { "epoch": 0.5443675267463813, "grad_norm": 92.2053451538086, "learning_rate": 2.070286897906537e-05, "logits/chosen": -1.602929711341858, "logits/rejected": -1.6071062088012695, "logps/chosen": -3.990319013595581, "logps/rejected": -5.2248215675354, "loss": 20.3706, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3073904812335968, "rewards/margins": 0.09087739139795303, "rewards/rejected": -0.39826786518096924, "step": 865 }, { "epoch": 0.5475141598489616, "grad_norm": 83.128662109375, "learning_rate": 2.0483274904722647e-05, "logits/chosen": -1.7051680088043213, "logits/rejected": -1.6087182760238647, "logps/chosen": -3.986027956008911, "logps/rejected": -4.851881980895996, "loss": 21.4848, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.30843600630760193, "rewards/margins": 0.06898938864469528, "rewards/rejected": -0.3774254322052002, "step": 870 }, { "epoch": 0.5506607929515418, "grad_norm": 62.2298583984375, "learning_rate": 2.026362251843109e-05, "logits/chosen": -1.6034513711929321, "logits/rejected": -1.699464201927185, "logps/chosen": -3.4193336963653564, "logps/rejected": -4.403960227966309, "loss": 21.3108, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.2610613703727722, "rewards/margins": 0.08181565254926682, "rewards/rejected": -0.34287700057029724, "step": 875 }, { "epoch": 0.5538074260541221, "grad_norm": 88.62437438964844, "learning_rate": 2.004393832344711e-05, "logits/chosen": -1.6719697713851929, "logits/rejected": -1.5851457118988037, "logps/chosen": -3.8325066566467285, "logps/rejected": -5.3017473220825195, "loss": 19.635, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.3032756447792053, "rewards/margins": 0.09231220185756683, "rewards/rejected": -0.39558783173561096, "step": 880 }, { "epoch": 0.5569540591567024, "grad_norm": 64.06165313720703, "learning_rate": 1.9824248826865124e-05, "logits/chosen": -1.5828460454940796, "logits/rejected": -1.6327168941497803, "logps/chosen": -4.681789398193359, "logps/rejected": -6.566616058349609, "loss": 18.3853, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.3667379915714264, "rewards/margins": 0.12741395831108093, "rewards/rejected": -0.49415192008018494, "step": 885 }, { "epoch": 0.5601006922592826, "grad_norm": 204.93890380859375, "learning_rate": 1.9604580536419254e-05, "logits/chosen": -1.572584867477417, "logits/rejected": -1.6088756322860718, "logps/chosen": -5.441628456115723, "logps/rejected": -7.085760593414307, "loss": 24.9097, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.45653265714645386, "rewards/margins": 0.0925588458776474, "rewards/rejected": -0.5490914583206177, "step": 890 }, { "epoch": 0.5632473253618628, "grad_norm": 162.79714965820312, "learning_rate": 1.93849599572849e-05, "logits/chosen": -1.6288610696792603, "logits/rejected": -1.6398794651031494, "logps/chosen": -5.213116645812988, "logps/rejected": -6.9830803871154785, "loss": 20.22, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.42777156829833984, "rewards/margins": 0.12980665266513824, "rewards/rejected": -0.5575782060623169, "step": 895 }, { "epoch": 0.5663939584644431, "grad_norm": 75.16659545898438, "learning_rate": 1.916541358888062e-05, "logits/chosen": -1.6041675806045532, "logits/rejected": -1.6970984935760498, "logps/chosen": -4.644831657409668, "logps/rejected": -5.80092716217041, "loss": 20.4964, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.37974128127098083, "rewards/margins": 0.09219308942556381, "rewards/rejected": -0.47193440794944763, "step": 900 }, { "epoch": 0.5695405915670233, "grad_norm": 110.90229797363281, "learning_rate": 1.8945967921670676e-05, "logits/chosen": -1.619327187538147, "logits/rejected": -1.6541610956192017, "logps/chosen": -5.146854400634766, "logps/rejected": -6.011466026306152, "loss": 22.4066, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.41492849588394165, "rewards/margins": 0.07109946012496948, "rewards/rejected": -0.48602795600891113, "step": 905 }, { "epoch": 0.5726872246696035, "grad_norm": 139.65293884277344, "learning_rate": 1.872664943396875e-05, "logits/chosen": -1.6764265298843384, "logits/rejected": -1.6785293817520142, "logps/chosen": -4.107344150543213, "logps/rejected": -5.6308698654174805, "loss": 20.0103, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3305855095386505, "rewards/margins": 0.11647170782089233, "rewards/rejected": -0.44705715775489807, "step": 910 }, { "epoch": 0.5758338577721838, "grad_norm": 147.52713012695312, "learning_rate": 1.8507484588743025e-05, "logits/chosen": -1.7002742290496826, "logits/rejected": -1.7680556774139404, "logps/chosen": -4.6784772872924805, "logps/rejected": -5.973324775695801, "loss": 21.0769, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3756680190563202, "rewards/margins": 0.09194694459438324, "rewards/rejected": -0.4676149785518646, "step": 915 }, { "epoch": 0.578980490874764, "grad_norm": 71.16407012939453, "learning_rate": 1.828849983042321e-05, "logits/chosen": -1.7075554132461548, "logits/rejected": -1.6953094005584717, "logps/chosen": -4.460357666015625, "logps/rejected": -5.521221160888672, "loss": 21.7677, "rewards/accuracies": 0.625, "rewards/chosen": -0.35953736305236816, "rewards/margins": 0.08199813961982727, "rewards/rejected": -0.44153547286987305, "step": 920 }, { "epoch": 0.5821271239773442, "grad_norm": 114.27317810058594, "learning_rate": 1.8069721581709697e-05, "logits/chosen": -1.6304935216903687, "logits/rejected": -1.6967551708221436, "logps/chosen": -4.526963233947754, "logps/rejected": -5.7123494148254395, "loss": 21.5069, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.35851508378982544, "rewards/margins": 0.07467497885227203, "rewards/rejected": -0.4331900477409363, "step": 925 }, { "epoch": 0.5852737570799245, "grad_norm": 71.74990844726562, "learning_rate": 1.785117624038546e-05, "logits/chosen": -1.704414963722229, "logits/rejected": -1.7506616115570068, "logps/chosen": -5.388034820556641, "logps/rejected": -6.3465657234191895, "loss": 21.8977, "rewards/accuracies": 0.625, "rewards/chosen": -0.4279704689979553, "rewards/margins": 0.05819786712527275, "rewards/rejected": -0.48616838455200195, "step": 930 }, { "epoch": 0.5884203901825047, "grad_norm": 78.14295196533203, "learning_rate": 1.763289017613085e-05, "logits/chosen": -1.6152721643447876, "logits/rejected": -1.640634536743164, "logps/chosen": -4.3263750076293945, "logps/rejected": -5.279467582702637, "loss": 21.887, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.34328222274780273, "rewards/margins": 0.07140573114156723, "rewards/rejected": -0.41468796133995056, "step": 935 }, { "epoch": 0.5915670232850849, "grad_norm": 219.88279724121094, "learning_rate": 1.741488972734184e-05, "logits/chosen": -1.5857679843902588, "logits/rejected": -1.65940260887146, "logps/chosen": -4.669988632202148, "logps/rejected": -6.202586650848389, "loss": 20.5667, "rewards/accuracies": 0.75, "rewards/chosen": -0.3540535271167755, "rewards/margins": 0.10686901956796646, "rewards/rejected": -0.46092256903648376, "step": 940 }, { "epoch": 0.5947136563876652, "grad_norm": 90.00337219238281, "learning_rate": 1.7197201197952065e-05, "logits/chosen": -1.5206947326660156, "logits/rejected": -1.53545343875885, "logps/chosen": -4.086690902709961, "logps/rejected": -4.490893363952637, "loss": 25.9453, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.30406898260116577, "rewards/margins": 0.034761372953653336, "rewards/rejected": -0.3388303220272064, "step": 945 }, { "epoch": 0.5978602894902454, "grad_norm": 79.93099212646484, "learning_rate": 1.6979850854258938e-05, "logits/chosen": -1.3608052730560303, "logits/rejected": -1.4760938882827759, "logps/chosen": -3.6326985359191895, "logps/rejected": -5.186118125915527, "loss": 20.6064, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2708989083766937, "rewards/margins": 0.10907317698001862, "rewards/rejected": -0.37997210025787354, "step": 950 }, { "epoch": 0.6010069225928257, "grad_norm": 54.11685562133789, "learning_rate": 1.6762864921754426e-05, "logits/chosen": -1.3788961172103882, "logits/rejected": -1.4954605102539062, "logps/chosen": -3.189054250717163, "logps/rejected": -4.365990161895752, "loss": 20.0193, "rewards/accuracies": 0.6875, "rewards/chosen": -0.23121857643127441, "rewards/margins": 0.09906688332557678, "rewards/rejected": -0.3302854597568512, "step": 955 }, { "epoch": 0.604153555695406, "grad_norm": 78.23949432373047, "learning_rate": 1.654626958196059e-05, "logits/chosen": -1.509225606918335, "logits/rejected": -1.4755313396453857, "logps/chosen": -4.190049648284912, "logps/rejected": -5.553238391876221, "loss": 18.6024, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.3084833025932312, "rewards/margins": 0.10999338328838348, "rewards/rejected": -0.4184766709804535, "step": 960 }, { "epoch": 0.6073001887979862, "grad_norm": 46.66254806518555, "learning_rate": 1.633009096927062e-05, "logits/chosen": -1.5157467126846313, "logits/rejected": -1.6129589080810547, "logps/chosen": -3.3808016777038574, "logps/rejected": -4.686802864074707, "loss": 18.8156, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.26101940870285034, "rewards/margins": 0.10853584110736847, "rewards/rejected": -0.3695552349090576, "step": 965 }, { "epoch": 0.6104468219005664, "grad_norm": 76.67229461669922, "learning_rate": 1.6114355167795407e-05, "logits/chosen": -1.507666826248169, "logits/rejected": -1.642401099205017, "logps/chosen": -4.4493513107299805, "logps/rejected": -5.8435235023498535, "loss": 20.6314, "rewards/accuracies": 0.6875, "rewards/chosen": -0.32613635063171387, "rewards/margins": 0.10264672338962555, "rewards/rejected": -0.42878302931785583, "step": 970 }, { "epoch": 0.6135934550031467, "grad_norm": 97.02481842041016, "learning_rate": 1.5899088208216215e-05, "logits/chosen": -1.501697301864624, "logits/rejected": -1.594618558883667, "logps/chosen": -4.284520149230957, "logps/rejected": -4.852963447570801, "loss": 26.4688, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.33568352460861206, "rewards/margins": 0.03864779695868492, "rewards/rejected": -0.37433135509490967, "step": 975 }, { "epoch": 0.6167400881057269, "grad_norm": 176.32850646972656, "learning_rate": 1.568431606464388e-05, "logits/chosen": -1.595866084098816, "logits/rejected": -1.6668930053710938, "logps/chosen": -4.345438480377197, "logps/rejected": -5.242307662963867, "loss": 21.0145, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.3100406527519226, "rewards/margins": 0.0767713412642479, "rewards/rejected": -0.3868120312690735, "step": 980 }, { "epoch": 0.6198867212083071, "grad_norm": 76.86431884765625, "learning_rate": 1.547006465148471e-05, "logits/chosen": -1.5940501689910889, "logits/rejected": -1.7789547443389893, "logps/chosen": -4.4857177734375, "logps/rejected": -5.875302314758301, "loss": 21.8847, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3244941830635071, "rewards/margins": 0.08251677453517914, "rewards/rejected": -0.4070109724998474, "step": 985 }, { "epoch": 0.6230333543108874, "grad_norm": 49.81745147705078, "learning_rate": 1.5256359820313718e-05, "logits/chosen": -1.550085425376892, "logits/rejected": -1.5959933996200562, "logps/chosen": -3.699030637741089, "logps/rejected": -4.6470842361450195, "loss": 20.7306, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2788650095462799, "rewards/margins": 0.0799705758690834, "rewards/rejected": -0.3588356077671051, "step": 990 }, { "epoch": 0.6261799874134676, "grad_norm": 81.01653289794922, "learning_rate": 1.5043227356755292e-05, "logits/chosen": -1.58163321018219, "logits/rejected": -1.663260817527771, "logps/chosen": -4.869448661804199, "logps/rejected": -5.365525245666504, "loss": 24.1646, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.34130221605300903, "rewards/margins": 0.04551910609006882, "rewards/rejected": -0.38682132959365845, "step": 995 }, { "epoch": 0.6293266205160478, "grad_norm": 101.5945053100586, "learning_rate": 1.4830692977371985e-05, "logits/chosen": -1.747009038925171, "logits/rejected": -1.7761609554290771, "logps/chosen": -4.585317134857178, "logps/rejected": -5.033480644226074, "loss": 23.2309, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.3538682162761688, "rewards/margins": 0.037090349942445755, "rewards/rejected": -0.39095860719680786, "step": 1000 }, { "epoch": 0.632473253618628, "grad_norm": 55.57672882080078, "learning_rate": 1.4618782326561483e-05, "logits/chosen": -1.7331736087799072, "logits/rejected": -1.771627426147461, "logps/chosen": -3.9518864154815674, "logps/rejected": -4.847538948059082, "loss": 20.4833, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.2925838530063629, "rewards/margins": 0.0719093531370163, "rewards/rejected": -0.3644932210445404, "step": 1005 }, { "epoch": 0.6356198867212083, "grad_norm": 75.53394317626953, "learning_rate": 1.4407520973462408e-05, "logits/chosen": -1.7358888387680054, "logits/rejected": -1.7642987966537476, "logps/chosen": -4.450674057006836, "logps/rejected": -5.2704572677612305, "loss": 22.8124, "rewards/accuracies": 0.625, "rewards/chosen": -0.3556494116783142, "rewards/margins": 0.04838743433356285, "rewards/rejected": -0.40403684973716736, "step": 1010 }, { "epoch": 0.6387665198237885, "grad_norm": 67.8470230102539, "learning_rate": 1.4196934408869118e-05, "logits/chosen": -1.8153152465820312, "logits/rejected": -1.8065166473388672, "logps/chosen": -5.316075325012207, "logps/rejected": -6.770912170410156, "loss": 21.5925, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3945319950580597, "rewards/margins": 0.06610045582056046, "rewards/rejected": -0.46063241362571716, "step": 1015 }, { "epoch": 0.6419131529263687, "grad_norm": 104.53321075439453, "learning_rate": 1.3987048042155977e-05, "logits/chosen": -1.6470744609832764, "logits/rejected": -1.6989984512329102, "logps/chosen": -4.787189960479736, "logps/rejected": -5.5443525314331055, "loss": 22.5867, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.3872155249118805, "rewards/margins": 0.05858270451426506, "rewards/rejected": -0.44579824805259705, "step": 1020 }, { "epoch": 0.645059786028949, "grad_norm": 122.49982452392578, "learning_rate": 1.377788719821149e-05, "logits/chosen": -1.6421356201171875, "logits/rejected": -1.702820062637329, "logps/chosen": -4.435242652893066, "logps/rejected": -4.579672336578369, "loss": 25.1424, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3478389382362366, "rewards/margins": 0.0215731430798769, "rewards/rejected": -0.3694121241569519, "step": 1025 }, { "epoch": 0.6482064191315292, "grad_norm": 145.1405487060547, "learning_rate": 1.3569477114382568e-05, "logits/chosen": -1.6365470886230469, "logits/rejected": -1.6954962015151978, "logps/chosen": -4.985340595245361, "logps/rejected": -5.898791313171387, "loss": 21.7627, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.385195255279541, "rewards/margins": 0.05228766053915024, "rewards/rejected": -0.43748289346694946, "step": 1030 }, { "epoch": 0.6513530522341096, "grad_norm": 82.04701232910156, "learning_rate": 1.3361842937429436e-05, "logits/chosen": -1.6654088497161865, "logits/rejected": -1.732187032699585, "logps/chosen": -4.262317180633545, "logps/rejected": -5.410677909851074, "loss": 20.2359, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.3416784703731537, "rewards/margins": 0.08638517558574677, "rewards/rejected": -0.42806363105773926, "step": 1035 }, { "epoch": 0.6544996853366898, "grad_norm": 95.95136260986328, "learning_rate": 1.3155009720491368e-05, "logits/chosen": -1.5801721811294556, "logits/rejected": -1.5603923797607422, "logps/chosen": -5.278650760650635, "logps/rejected": -6.190367698669434, "loss": 22.4881, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.3998781740665436, "rewards/margins": 0.06602592766284943, "rewards/rejected": -0.4659040868282318, "step": 1040 }, { "epoch": 0.65764631843927, "grad_norm": 60.0530891418457, "learning_rate": 1.2949002420063828e-05, "logits/chosen": -1.6326820850372314, "logits/rejected": -1.720810890197754, "logps/chosen": -4.082489967346191, "logps/rejected": -5.006215572357178, "loss": 21.0105, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.32317107915878296, "rewards/margins": 0.07444654405117035, "rewards/rejected": -0.3976176679134369, "step": 1045 }, { "epoch": 0.6607929515418502, "grad_norm": 221.81906127929688, "learning_rate": 1.2743845892987183e-05, "logits/chosen": -1.6526765823364258, "logits/rejected": -1.697488784790039, "logps/chosen": -4.53380823135376, "logps/rejected": -5.771850109100342, "loss": 23.2634, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.34263211488723755, "rewards/margins": 0.07287438213825226, "rewards/rejected": -0.4155064523220062, "step": 1050 }, { "epoch": 0.6639395846444305, "grad_norm": 137.2283172607422, "learning_rate": 1.2539564893447489e-05, "logits/chosen": -1.631956696510315, "logits/rejected": -1.654306173324585, "logps/chosen": -4.1559600830078125, "logps/rejected": -5.033182621002197, "loss": 22.6183, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.32424792647361755, "rewards/margins": 0.06618380546569824, "rewards/rejected": -0.3904317319393158, "step": 1055 }, { "epoch": 0.6670862177470107, "grad_norm": 72.95520782470703, "learning_rate": 1.2336184069989663e-05, "logits/chosen": -1.670440435409546, "logits/rejected": -1.6872297525405884, "logps/chosen": -3.9552032947540283, "logps/rejected": -5.303035259246826, "loss": 19.5681, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.31223589181900024, "rewards/margins": 0.09164074063301086, "rewards/rejected": -0.4038766026496887, "step": 1060 }, { "epoch": 0.6702328508495909, "grad_norm": 90.91898345947266, "learning_rate": 1.2133727962543356e-05, "logits/chosen": -1.6696465015411377, "logits/rejected": -1.6963016986846924, "logps/chosen": -4.434679985046387, "logps/rejected": -5.158357620239258, "loss": 21.8675, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.3489730954170227, "rewards/margins": 0.05557180196046829, "rewards/rejected": -0.4045449197292328, "step": 1065 }, { "epoch": 0.6733794839521712, "grad_norm": 185.79261779785156, "learning_rate": 1.193222099946202e-05, "logits/chosen": -1.6571991443634033, "logits/rejected": -1.7073132991790771, "logps/chosen": -4.607517242431641, "logps/rejected": -5.376668930053711, "loss": 22.3462, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.35802438855171204, "rewards/margins": 0.0643647164106369, "rewards/rejected": -0.42238911986351013, "step": 1070 }, { "epoch": 0.6765261170547514, "grad_norm": 71.50703430175781, "learning_rate": 1.1731687494575319e-05, "logits/chosen": -1.585889458656311, "logits/rejected": -1.6507800817489624, "logps/chosen": -4.845611572265625, "logps/rejected": -6.422255516052246, "loss": 18.5681, "rewards/accuracies": 0.75, "rewards/chosen": -0.37992939352989197, "rewards/margins": 0.10727685689926147, "rewards/rejected": -0.48720628023147583, "step": 1075 }, { "epoch": 0.6796727501573316, "grad_norm": 210.3772430419922, "learning_rate": 1.153215164425547e-05, "logits/chosen": -1.5637327432632446, "logits/rejected": -1.628791093826294, "logps/chosen": -4.643498420715332, "logps/rejected": -5.90508508682251, "loss": 22.429, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.3530879020690918, "rewards/margins": 0.07370196282863617, "rewards/rejected": -0.42678990960121155, "step": 1080 }, { "epoch": 0.6828193832599119, "grad_norm": 324.6168212890625, "learning_rate": 1.133363752449768e-05, "logits/chosen": -1.6127498149871826, "logits/rejected": -1.5895841121673584, "logps/chosen": -3.8858344554901123, "logps/rejected": -5.141265392303467, "loss": 18.9867, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.31075209379196167, "rewards/margins": 0.10046511888504028, "rewards/rejected": -0.41121721267700195, "step": 1085 }, { "epoch": 0.6859660163624921, "grad_norm": 269.12744140625, "learning_rate": 1.1136169088015177e-05, "logits/chosen": -1.5152666568756104, "logits/rejected": -1.5772387981414795, "logps/chosen": -4.37540864944458, "logps/rejected": -5.073463439941406, "loss": 22.4614, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.3488185405731201, "rewards/margins": 0.05801800638437271, "rewards/rejected": -0.40683650970458984, "step": 1090 }, { "epoch": 0.6891126494650723, "grad_norm": 407.53985595703125, "learning_rate": 1.0939770161349015e-05, "logits/chosen": -1.604278802871704, "logits/rejected": -1.6394538879394531, "logps/chosen": -4.725668907165527, "logps/rejected": -6.037966728210449, "loss": 23.0495, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3864028751850128, "rewards/margins": 0.09246636927127838, "rewards/rejected": -0.4788691997528076, "step": 1095 }, { "epoch": 0.6922592825676526, "grad_norm": 65.52562713623047, "learning_rate": 1.0744464441993205e-05, "logits/chosen": -1.4906436204910278, "logits/rejected": -1.570569634437561, "logps/chosen": -4.404895782470703, "logps/rejected": -5.454612731933594, "loss": 21.9146, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.3443445563316345, "rewards/margins": 0.07481996715068817, "rewards/rejected": -0.4191645085811615, "step": 1100 }, { "epoch": 0.6954059156702328, "grad_norm": 60.899654388427734, "learning_rate": 1.0550275495535382e-05, "logits/chosen": -1.5062484741210938, "logits/rejected": -1.5998207330703735, "logps/chosen": -5.046140193939209, "logps/rejected": -6.212726593017578, "loss": 22.0906, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.3926524817943573, "rewards/margins": 0.08822645246982574, "rewards/rejected": -0.48087891936302185, "step": 1105 }, { "epoch": 0.6985525487728131, "grad_norm": 85.36582946777344, "learning_rate": 1.0357226752813343e-05, "logits/chosen": -1.48141348361969, "logits/rejected": -1.532138705253601, "logps/chosen": -4.955922603607178, "logps/rejected": -6.1522979736328125, "loss": 19.2663, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.3955458402633667, "rewards/margins": 0.09683366119861603, "rewards/rejected": -0.4923795163631439, "step": 1110 }, { "epoch": 0.7016991818753934, "grad_norm": 92.5035171508789, "learning_rate": 1.0165341507087922e-05, "logits/chosen": -1.4898306131362915, "logits/rejected": -1.589817762374878, "logps/chosen": -4.877270221710205, "logps/rejected": -6.326567649841309, "loss": 21.0751, "rewards/accuracies": 0.625, "rewards/chosen": -0.3771550953388214, "rewards/margins": 0.10272278636693954, "rewards/rejected": -0.47987785935401917, "step": 1115 }, { "epoch": 0.7048458149779736, "grad_norm": 100.18026733398438, "learning_rate": 9.974642911232413e-06, "logits/chosen": -1.5176981687545776, "logits/rejected": -1.5406978130340576, "logps/chosen": -5.319207191467285, "logps/rejected": -6.242737770080566, "loss": 20.9524, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.4210137724876404, "rewards/margins": 0.07255946844816208, "rewards/rejected": -0.49357327818870544, "step": 1120 }, { "epoch": 0.7079924480805538, "grad_norm": 176.3753662109375, "learning_rate": 9.785153974938912e-06, "logits/chosen": -1.5830824375152588, "logits/rejected": -1.6101982593536377, "logps/chosen": -5.879128456115723, "logps/rejected": -6.807085990905762, "loss": 22.111, "rewards/accuracies": 0.625, "rewards/chosen": -0.44119253754615784, "rewards/margins": 0.07570262253284454, "rewards/rejected": -0.5168951749801636, "step": 1125 }, { "epoch": 0.7111390811831341, "grad_norm": 67.40308380126953, "learning_rate": 9.596897561942026e-06, "logits/chosen": -1.463176965713501, "logits/rejected": -1.4804832935333252, "logps/chosen": -4.481048107147217, "logps/rejected": -5.287797451019287, "loss": 22.1994, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.3575854003429413, "rewards/margins": 0.06364957243204117, "rewards/rejected": -0.42123493552207947, "step": 1130 }, { "epoch": 0.7142857142857143, "grad_norm": 93.39257049560547, "learning_rate": 9.409896387260082e-06, "logits/chosen": -1.4179964065551758, "logits/rejected": -1.4655730724334717, "logps/chosen": -4.708760738372803, "logps/rejected": -6.217686653137207, "loss": 21.4161, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.37350720167160034, "rewards/margins": 0.10105752944946289, "rewards/rejected": -0.47456473112106323, "step": 1135 }, { "epoch": 0.7174323473882945, "grad_norm": 97.41583251953125, "learning_rate": 9.224173014454372e-06, "logits/chosen": -1.4397246837615967, "logits/rejected": -1.4766523838043213, "logps/chosen": -4.817109107971191, "logps/rejected": -6.214907169342041, "loss": 22.7104, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.384985089302063, "rewards/margins": 0.0952010303735733, "rewards/rejected": -0.4801861345767975, "step": 1140 }, { "epoch": 0.7205789804908748, "grad_norm": 103.4198989868164, "learning_rate": 9.039749852906606e-06, "logits/chosen": -1.368666648864746, "logits/rejected": -1.4239342212677002, "logps/chosen": -4.382673740386963, "logps/rejected": -5.262811183929443, "loss": 20.8727, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.35242724418640137, "rewards/margins": 0.075123131275177, "rewards/rejected": -0.42755040526390076, "step": 1145 }, { "epoch": 0.723725613593455, "grad_norm": 131.38589477539062, "learning_rate": 8.856649155115002e-06, "logits/chosen": -1.409711241722107, "logits/rejected": -1.455235481262207, "logps/chosen": -4.550191402435303, "logps/rejected": -5.52540922164917, "loss": 23.0103, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3620757460594177, "rewards/margins": 0.06903600692749023, "rewards/rejected": -0.43111175298690796, "step": 1150 }, { "epoch": 0.7268722466960352, "grad_norm": 60.0385627746582, "learning_rate": 8.674893014009311e-06, "logits/chosen": -1.3705095052719116, "logits/rejected": -1.4764083623886108, "logps/chosen": -4.423483848571777, "logps/rejected": -5.486600875854492, "loss": 21.3505, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3566300570964813, "rewards/margins": 0.07945708185434341, "rewards/rejected": -0.4360871911048889, "step": 1155 }, { "epoch": 0.7300188797986155, "grad_norm": 80.497802734375, "learning_rate": 8.494503360285084e-06, "logits/chosen": -1.406087875366211, "logits/rejected": -1.5597848892211914, "logps/chosen": -4.28043270111084, "logps/rejected": -5.639766216278076, "loss": 21.9094, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.3276395797729492, "rewards/margins": 0.07139433920383453, "rewards/rejected": -0.39903393387794495, "step": 1160 }, { "epoch": 0.7331655129011957, "grad_norm": 106.78560638427734, "learning_rate": 8.315501959757506e-06, "logits/chosen": -1.4479920864105225, "logits/rejected": -1.530386209487915, "logps/chosen": -5.356269836425781, "logps/rejected": -6.295357704162598, "loss": 20.2622, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.39192715287208557, "rewards/margins": 0.07843243330717087, "rewards/rejected": -0.47035956382751465, "step": 1165 }, { "epoch": 0.7363121460037759, "grad_norm": 70.2252426147461, "learning_rate": 8.137910410735119e-06, "logits/chosen": -1.3913201093673706, "logits/rejected": -1.5211797952651978, "logps/chosen": -4.186515808105469, "logps/rejected": -5.630705833435059, "loss": 19.5955, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3086017966270447, "rewards/margins": 0.1026659831404686, "rewards/rejected": -0.4112677574157715, "step": 1170 }, { "epoch": 0.7394587791063562, "grad_norm": 192.9811553955078, "learning_rate": 7.961750141413811e-06, "logits/chosen": -1.4113714694976807, "logits/rejected": -1.4863709211349487, "logps/chosen": -4.043957710266113, "logps/rejected": -4.903926849365234, "loss": 21.1766, "rewards/accuracies": 0.6875, "rewards/chosen": -0.30225270986557007, "rewards/margins": 0.0713002160191536, "rewards/rejected": -0.3735528886318207, "step": 1175 }, { "epoch": 0.7426054122089364, "grad_norm": 120.66477966308594, "learning_rate": 7.787042407291236e-06, "logits/chosen": -1.4459470510482788, "logits/rejected": -1.4732497930526733, "logps/chosen": -4.194180488586426, "logps/rejected": -5.103634834289551, "loss": 21.7414, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.33040323853492737, "rewards/margins": 0.07123108208179474, "rewards/rejected": -0.4016343653202057, "step": 1180 }, { "epoch": 0.7457520453115167, "grad_norm": 76.2223892211914, "learning_rate": 7.613808288602185e-06, "logits/chosen": -1.3101516962051392, "logits/rejected": -1.410070776939392, "logps/chosen": -3.897928237915039, "logps/rejected": -4.853459358215332, "loss": 20.4936, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.30166110396385193, "rewards/margins": 0.07222743332386017, "rewards/rejected": -0.3738885223865509, "step": 1185 }, { "epoch": 0.748898678414097, "grad_norm": 71.2408676147461, "learning_rate": 7.442068687774983e-06, "logits/chosen": -1.3900350332260132, "logits/rejected": -1.4306429624557495, "logps/chosen": -4.03500509262085, "logps/rejected": -4.971550941467285, "loss": 20.8514, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.31287750601768494, "rewards/margins": 0.07024455070495605, "rewards/rejected": -0.383122056722641, "step": 1190 }, { "epoch": 0.7520453115166772, "grad_norm": 174.92535400390625, "learning_rate": 7.271844326909465e-06, "logits/chosen": -1.3968006372451782, "logits/rejected": -1.3997862339019775, "logps/chosen": -4.94242000579834, "logps/rejected": -5.543642520904541, "loss": 23.6965, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.36712104082107544, "rewards/margins": 0.041298940777778625, "rewards/rejected": -0.40841999650001526, "step": 1195 }, { "epoch": 0.7551919446192574, "grad_norm": 83.12405395507812, "learning_rate": 7.1031557452765934e-06, "logits/chosen": -1.4155142307281494, "logits/rejected": -1.4555690288543701, "logps/chosen": -3.987143039703369, "logps/rejected": -5.240988731384277, "loss": 20.4557, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.312110960483551, "rewards/margins": 0.08850479125976562, "rewards/rejected": -0.40061575174331665, "step": 1200 }, { "epoch": 0.7583385777218377, "grad_norm": 82.25894165039062, "learning_rate": 6.936023296840211e-06, "logits/chosen": -1.3227570056915283, "logits/rejected": -1.4542601108551025, "logps/chosen": -4.520358562469482, "logps/rejected": -5.628200531005859, "loss": 21.0717, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3516823947429657, "rewards/margins": 0.07068557292222977, "rewards/rejected": -0.42236796021461487, "step": 1205 }, { "epoch": 0.7614852108244179, "grad_norm": 63.93009567260742, "learning_rate": 6.770467147801152e-06, "logits/chosen": -1.3352692127227783, "logits/rejected": -1.4765124320983887, "logps/chosen": -3.903353452682495, "logps/rejected": -5.777923107147217, "loss": 18.1176, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.30418699979782104, "rewards/margins": 0.1330038160085678, "rewards/rejected": -0.43719083070755005, "step": 1210 }, { "epoch": 0.7646318439269981, "grad_norm": 123.6090087890625, "learning_rate": 6.606507274163949e-06, "logits/chosen": -1.4196144342422485, "logits/rejected": -1.5160802602767944, "logps/chosen": -4.3763604164123535, "logps/rejected": -5.507956504821777, "loss": 21.3593, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3455001711845398, "rewards/margins": 0.08908528089523315, "rewards/rejected": -0.43458548188209534, "step": 1215 }, { "epoch": 0.7677784770295784, "grad_norm": 79.51527404785156, "learning_rate": 6.444163459326569e-06, "logits/chosen": -1.3841816186904907, "logits/rejected": -1.44673752784729, "logps/chosen": -4.642246723175049, "logps/rejected": -5.952216625213623, "loss": 20.2826, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.37046322226524353, "rewards/margins": 0.10095451772212982, "rewards/rejected": -0.47141775488853455, "step": 1220 }, { "epoch": 0.7709251101321586, "grad_norm": 115.33991241455078, "learning_rate": 6.283455291693303e-06, "logits/chosen": -1.2804498672485352, "logits/rejected": -1.336126446723938, "logps/chosen": -4.530810356140137, "logps/rejected": -5.714901924133301, "loss": 23.5811, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3543975353240967, "rewards/margins": 0.07916755974292755, "rewards/rejected": -0.43356508016586304, "step": 1225 }, { "epoch": 0.7740717432347388, "grad_norm": 102.68350219726562, "learning_rate": 6.124402162311274e-06, "logits/chosen": -1.3455007076263428, "logits/rejected": -1.3819594383239746, "logps/chosen": -4.560150146484375, "logps/rejected": -5.909863471984863, "loss": 21.4806, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.36466413736343384, "rewards/margins": 0.07453545182943344, "rewards/rejected": -0.4391995966434479, "step": 1230 }, { "epoch": 0.777218376337319, "grad_norm": 78.0007553100586, "learning_rate": 5.9670232625306955e-06, "logits/chosen": -1.3267484903335571, "logits/rejected": -1.3938989639282227, "logps/chosen": -4.1908979415893555, "logps/rejected": -4.819875240325928, "loss": 24.9323, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.3211560845375061, "rewards/margins": 0.054157011210918427, "rewards/rejected": -0.3753131031990051, "step": 1235 }, { "epoch": 0.7803650094398993, "grad_norm": 910.2023315429688, "learning_rate": 5.81133758168922e-06, "logits/chosen": -1.4007585048675537, "logits/rejected": -1.4542076587677002, "logps/chosen": -5.091724872589111, "logps/rejected": -6.444447994232178, "loss": 20.9318, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.36028310656547546, "rewards/margins": 0.09358057379722595, "rewards/rejected": -0.4538637101650238, "step": 1240 }, { "epoch": 0.7835116425424795, "grad_norm": 68.25672912597656, "learning_rate": 5.6573639048207315e-06, "logits/chosen": -1.3604391813278198, "logits/rejected": -1.3182973861694336, "logps/chosen": -4.621526718139648, "logps/rejected": -5.245944023132324, "loss": 21.9955, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.32450687885284424, "rewards/margins": 0.07323630154132843, "rewards/rejected": -0.3977431654930115, "step": 1245 }, { "epoch": 0.7866582756450597, "grad_norm": 106.51322937011719, "learning_rate": 5.5051208103887025e-06, "logits/chosen": -1.3608815670013428, "logits/rejected": -1.4448637962341309, "logps/chosen": -4.045924663543701, "logps/rejected": -5.57630729675293, "loss": 20.889, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3083491623401642, "rewards/margins": 0.09919731318950653, "rewards/rejected": -0.40754643082618713, "step": 1250 }, { "epoch": 0.78980490874764, "grad_norm": 70.59749603271484, "learning_rate": 5.354626668044535e-06, "logits/chosen": -1.3460859060287476, "logits/rejected": -1.412706732749939, "logps/chosen": -3.734891891479492, "logps/rejected": -4.818475246429443, "loss": 21.0468, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2917167842388153, "rewards/margins": 0.07943135499954224, "rewards/rejected": -0.37114813923835754, "step": 1255 }, { "epoch": 0.7929515418502202, "grad_norm": 83.2120361328125, "learning_rate": 5.205899636411078e-06, "logits/chosen": -1.3329652547836304, "logits/rejected": -1.3952248096466064, "logps/chosen": -4.460053443908691, "logps/rejected": -4.993377685546875, "loss": 25.4182, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.34508955478668213, "rewards/margins": 0.03861779719591141, "rewards/rejected": -0.38370734453201294, "step": 1260 }, { "epoch": 0.7960981749528006, "grad_norm": 74.94086456298828, "learning_rate": 5.058957660891613e-06, "logits/chosen": -1.353829264640808, "logits/rejected": -1.36537766456604, "logps/chosen": -3.8537967205047607, "logps/rejected": -4.86336612701416, "loss": 21.0046, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.29704272747039795, "rewards/margins": 0.07927833497524261, "rewards/rejected": -0.376321017742157, "step": 1265 }, { "epoch": 0.7992448080553808, "grad_norm": 68.53548431396484, "learning_rate": 4.913818471504552e-06, "logits/chosen": -1.3891483545303345, "logits/rejected": -1.4956327676773071, "logps/chosen": -3.83349609375, "logps/rejected": -5.111277103424072, "loss": 20.258, "rewards/accuracies": 0.75, "rewards/chosen": -0.299319326877594, "rewards/margins": 0.09995778650045395, "rewards/rejected": -0.3992771506309509, "step": 1270 }, { "epoch": 0.802391441157961, "grad_norm": 161.29922485351562, "learning_rate": 4.770499580744125e-06, "logits/chosen": -1.3398183584213257, "logits/rejected": -1.3453642129898071, "logps/chosen": -3.9315247535705566, "logps/rejected": -4.841611862182617, "loss": 22.4824, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.30572158098220825, "rewards/margins": 0.06097061559557915, "rewards/rejected": -0.3666921854019165, "step": 1275 }, { "epoch": 0.8055380742605412, "grad_norm": 68.45879364013672, "learning_rate": 4.629018281467357e-06, "logits/chosen": -1.297154188156128, "logits/rejected": -1.338921070098877, "logps/chosen": -3.7794177532196045, "logps/rejected": -4.509110927581787, "loss": 21.658, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2888321876525879, "rewards/margins": 0.05916588753461838, "rewards/rejected": -0.3479980528354645, "step": 1280 }, { "epoch": 0.8086847073631215, "grad_norm": 74.77375793457031, "learning_rate": 4.489391644807462e-06, "logits/chosen": -1.4385647773742676, "logits/rejected": -1.5144340991973877, "logps/chosen": -3.69215726852417, "logps/rejected": -4.667183876037598, "loss": 21.0338, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2842097282409668, "rewards/margins": 0.07265909761190414, "rewards/rejected": -0.35686883330345154, "step": 1285 }, { "epoch": 0.8118313404657017, "grad_norm": 78.63387298583984, "learning_rate": 4.351636518114091e-06, "logits/chosen": -1.3093000650405884, "logits/rejected": -1.3893928527832031, "logps/chosen": -3.599902629852295, "logps/rejected": -4.570587635040283, "loss": 22.1635, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.2749050259590149, "rewards/margins": 0.08025064319372177, "rewards/rejected": -0.3551556468009949, "step": 1290 }, { "epoch": 0.8149779735682819, "grad_norm": 78.53893280029297, "learning_rate": 4.215769522920487e-06, "logits/chosen": -1.2443653345108032, "logits/rejected": -1.3605782985687256, "logps/chosen": -3.2713770866394043, "logps/rejected": -4.569630146026611, "loss": 20.9369, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.24853453040122986, "rewards/margins": 0.10017738491296768, "rewards/rejected": -0.34871190786361694, "step": 1295 }, { "epoch": 0.8181246066708622, "grad_norm": 82.4554672241211, "learning_rate": 4.0818070529379715e-06, "logits/chosen": -1.383690357208252, "logits/rejected": -1.4704560041427612, "logps/chosen": -4.524319171905518, "logps/rejected": -5.7077460289001465, "loss": 21.9118, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.316571444272995, "rewards/margins": 0.0641409307718277, "rewards/rejected": -0.3807123601436615, "step": 1300 }, { "epoch": 0.8212712397734424, "grad_norm": 71.1880111694336, "learning_rate": 3.949765272077843e-06, "logits/chosen": -1.3107343912124634, "logits/rejected": -1.3561115264892578, "logps/chosen": -3.846195936203003, "logps/rejected": -4.79428768157959, "loss": 21.0994, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.27155357599258423, "rewards/margins": 0.07163957506418228, "rewards/rejected": -0.3431931734085083, "step": 1305 }, { "epoch": 0.8244178728760226, "grad_norm": 50.073204040527344, "learning_rate": 3.819660112501053e-06, "logits/chosen": -1.2764497995376587, "logits/rejected": -1.3517284393310547, "logps/chosen": -3.5745315551757812, "logps/rejected": -4.921723365783691, "loss": 19.6469, "rewards/accuracies": 0.6875, "rewards/chosen": -0.27279648184776306, "rewards/margins": 0.1019618958234787, "rewards/rejected": -0.37475839257240295, "step": 1310 }, { "epoch": 0.8275645059786029, "grad_norm": 83.62207794189453, "learning_rate": 3.6915072726958514e-06, "logits/chosen": -1.2466180324554443, "logits/rejected": -1.2861813306808472, "logps/chosen": -3.430490016937256, "logps/rejected": -4.824821949005127, "loss": 20.5161, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.2643739581108093, "rewards/margins": 0.1028999462723732, "rewards/rejected": -0.3672739565372467, "step": 1315 }, { "epoch": 0.8307111390811831, "grad_norm": 76.6629638671875, "learning_rate": 3.5653222155835686e-06, "logits/chosen": -1.2766977548599243, "logits/rejected": -1.3114259243011475, "logps/chosen": -4.222517967224121, "logps/rejected": -5.029845714569092, "loss": 22.1218, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3145274221897125, "rewards/margins": 0.06165830045938492, "rewards/rejected": -0.37618574500083923, "step": 1320 }, { "epoch": 0.8338577721837633, "grad_norm": 159.4115447998047, "learning_rate": 3.4411201666529003e-06, "logits/chosen": -1.3758924007415771, "logits/rejected": -1.4244683980941772, "logps/chosen": -4.457423210144043, "logps/rejected": -5.342848300933838, "loss": 23.3834, "rewards/accuracies": 0.5625, "rewards/chosen": -0.31679028272628784, "rewards/margins": 0.06267707049846649, "rewards/rejected": -0.3794673979282379, "step": 1325 }, { "epoch": 0.8370044052863436, "grad_norm": 56.71870803833008, "learning_rate": 3.3189161121227564e-06, "logits/chosen": -1.3166803121566772, "logits/rejected": -1.385522723197937, "logps/chosen": -3.8323776721954346, "logps/rejected": -4.732277870178223, "loss": 23.3384, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.2978932559490204, "rewards/margins": 0.0644349679350853, "rewards/rejected": -0.3623282313346863, "step": 1330 }, { "epoch": 0.8401510383889238, "grad_norm": 66.62996673583984, "learning_rate": 3.198724797134074e-06, "logits/chosen": -1.2822662591934204, "logits/rejected": -1.4124181270599365, "logps/chosen": -3.9724369049072266, "logps/rejected": -5.0466437339782715, "loss": 22.4903, "rewards/accuracies": 0.625, "rewards/chosen": -0.2994682192802429, "rewards/margins": 0.0788046196103096, "rewards/rejected": -0.3782728910446167, "step": 1335 }, { "epoch": 0.8432976714915041, "grad_norm": 70.8177261352539, "learning_rate": 3.080560723970616e-06, "logits/chosen": -1.2813329696655273, "logits/rejected": -1.3586981296539307, "logps/chosen": -3.6214439868927, "logps/rejected": -4.637081623077393, "loss": 20.5515, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.28146275877952576, "rewards/margins": 0.07804764062166214, "rewards/rejected": -0.3595103919506073, "step": 1340 }, { "epoch": 0.8464443045940844, "grad_norm": 64.40753173828125, "learning_rate": 2.96443815030917e-06, "logits/chosen": -1.3396605253219604, "logits/rejected": -1.4255945682525635, "logps/chosen": -3.604154586791992, "logps/rejected": -4.95128059387207, "loss": 20.7037, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.2776135206222534, "rewards/margins": 0.09353432059288025, "rewards/rejected": -0.37114784121513367, "step": 1345 }, { "epoch": 0.8495909376966646, "grad_norm": 93.99842071533203, "learning_rate": 2.850371087499195e-06, "logits/chosen": -1.381260633468628, "logits/rejected": -1.4631612300872803, "logps/chosen": -4.883763790130615, "logps/rejected": -6.07845401763916, "loss": 21.0858, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.3591059148311615, "rewards/margins": 0.09570769965648651, "rewards/rejected": -0.4548136591911316, "step": 1350 }, { "epoch": 0.8527375707992448, "grad_norm": 62.075279235839844, "learning_rate": 2.7383732988722057e-06, "logits/chosen": -1.3089946508407593, "logits/rejected": -1.3634613752365112, "logps/chosen": -3.7724010944366455, "logps/rejected": -4.929832458496094, "loss": 19.0202, "rewards/accuracies": 0.6875, "rewards/chosen": -0.273120641708374, "rewards/margins": 0.09602681547403336, "rewards/rejected": -0.36914747953414917, "step": 1355 }, { "epoch": 0.8558842039018251, "grad_norm": 80.0210189819336, "learning_rate": 2.6284582980811136e-06, "logits/chosen": -1.4461333751678467, "logits/rejected": -1.370339035987854, "logps/chosen": -4.136780738830566, "logps/rejected": -5.008397579193115, "loss": 23.5672, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3027392327785492, "rewards/margins": 0.062295325100421906, "rewards/rejected": -0.3650345206260681, "step": 1360 }, { "epoch": 0.8590308370044053, "grad_norm": 169.91099548339844, "learning_rate": 2.5206393474696422e-06, "logits/chosen": -1.2922241687774658, "logits/rejected": -1.3685882091522217, "logps/chosen": -3.8860459327697754, "logps/rejected": -4.820228099822998, "loss": 20.1345, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2878992557525635, "rewards/margins": 0.07816118001937866, "rewards/rejected": -0.36606043577194214, "step": 1365 }, { "epoch": 0.8621774701069855, "grad_norm": 291.87542724609375, "learning_rate": 2.4149294564721146e-06, "logits/chosen": -1.390933632850647, "logits/rejected": -1.477757215499878, "logps/chosen": -4.5947346687316895, "logps/rejected": -5.662859916687012, "loss": 22.1173, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.32581329345703125, "rewards/margins": 0.0882103443145752, "rewards/rejected": -0.4140236973762512, "step": 1370 }, { "epoch": 0.8653241032095658, "grad_norm": 50.774810791015625, "learning_rate": 2.3113413800437145e-06, "logits/chosen": -1.3678381443023682, "logits/rejected": -1.4147788286209106, "logps/chosen": -4.411424160003662, "logps/rejected": -5.547976970672607, "loss": 20.419, "rewards/accuracies": 0.625, "rewards/chosen": -0.3136950135231018, "rewards/margins": 0.08119923621416092, "rewards/rejected": -0.3948942720890045, "step": 1375 }, { "epoch": 0.868470736312146, "grad_norm": 75.1661605834961, "learning_rate": 2.2098876171215e-06, "logits/chosen": -1.2949163913726807, "logits/rejected": -1.4591166973114014, "logps/chosen": -3.913958787918091, "logps/rejected": -4.945563316345215, "loss": 20.5075, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.30175596475601196, "rewards/margins": 0.09277000278234482, "rewards/rejected": -0.394525945186615, "step": 1380 }, { "epoch": 0.8716173694147262, "grad_norm": 116.18523406982422, "learning_rate": 2.110580409116261e-06, "logits/chosen": -1.3234283924102783, "logits/rejected": -1.3651349544525146, "logps/chosen": -4.782530307769775, "logps/rejected": -5.800885200500488, "loss": 22.8406, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.3511677384376526, "rewards/margins": 0.07397367060184479, "rewards/rejected": -0.4251413345336914, "step": 1385 }, { "epoch": 0.8747640025173065, "grad_norm": 145.46861267089844, "learning_rate": 2.013431738435465e-06, "logits/chosen": -1.3332188129425049, "logits/rejected": -1.4134724140167236, "logps/chosen": -4.268718242645264, "logps/rejected": -5.433601379394531, "loss": 22.5056, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.3332800269126892, "rewards/margins": 0.07072637230157852, "rewards/rejected": -0.4040064215660095, "step": 1390 }, { "epoch": 0.8779106356198867, "grad_norm": 117.83720397949219, "learning_rate": 1.9184533270374928e-06, "logits/chosen": -1.3927792310714722, "logits/rejected": -1.4590123891830444, "logps/chosen": -4.519114017486572, "logps/rejected": -5.810807228088379, "loss": 21.2018, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.34003710746765137, "rewards/margins": 0.08822458237409592, "rewards/rejected": -0.4282616972923279, "step": 1395 }, { "epoch": 0.8810572687224669, "grad_norm": 128.75563049316406, "learning_rate": 1.8256566350172211e-06, "logits/chosen": -1.4642970561981201, "logits/rejected": -1.56011962890625, "logps/chosen": -5.124087810516357, "logps/rejected": -6.271437168121338, "loss": 20.9824, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.34806352853775024, "rewards/margins": 0.0969148576259613, "rewards/rejected": -0.44497838616371155, "step": 1400 }, { "epoch": 0.8842039018250472, "grad_norm": 88.87577056884766, "learning_rate": 1.7350528592232962e-06, "logits/chosen": -1.3359493017196655, "logits/rejected": -1.4811887741088867, "logps/chosen": -4.525036811828613, "logps/rejected": -5.623012542724609, "loss": 22.1104, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.3581879138946533, "rewards/margins": 0.07608196139335632, "rewards/rejected": -0.43426984548568726, "step": 1405 }, { "epoch": 0.8873505349276274, "grad_norm": 69.19255065917969, "learning_rate": 1.6466529319070735e-06, "logits/chosen": -1.2726246118545532, "logits/rejected": -1.39580237865448, "logps/chosen": -3.7457852363586426, "logps/rejected": -5.324977397918701, "loss": 18.2434, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.2871127724647522, "rewards/margins": 0.11219409853219986, "rewards/rejected": -0.39930686354637146, "step": 1410 }, { "epoch": 0.8904971680302077, "grad_norm": 73.79737854003906, "learning_rate": 1.560467519403579e-06, "logits/chosen": -1.3266379833221436, "logits/rejected": -1.3948261737823486, "logps/chosen": -4.1067681312561035, "logps/rejected": -4.673392295837402, "loss": 22.1702, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.3159501254558563, "rewards/margins": 0.04971395805478096, "rewards/rejected": -0.3656640946865082, "step": 1415 }, { "epoch": 0.893643801132788, "grad_norm": 106.870361328125, "learning_rate": 1.4765070208444732e-06, "logits/chosen": -1.3216549158096313, "logits/rejected": -1.35343337059021, "logps/chosen": -4.343778133392334, "logps/rejected": -5.122066497802734, "loss": 22.7187, "rewards/accuracies": 0.5625, "rewards/chosen": -0.33430585265159607, "rewards/margins": 0.06294408440589905, "rewards/rejected": -0.3972499370574951, "step": 1420 }, { "epoch": 0.8967904342353682, "grad_norm": 62.6711311340332, "learning_rate": 1.3947815669033026e-06, "logits/chosen": -1.3594673871994019, "logits/rejected": -1.4739999771118164, "logps/chosen": -4.087611198425293, "logps/rejected": -5.339770317077637, "loss": 20.526, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.31806594133377075, "rewards/margins": 0.08883042633533478, "rewards/rejected": -0.40689635276794434, "step": 1425 }, { "epoch": 0.8999370673379484, "grad_norm": 98.1043930053711, "learning_rate": 1.3153010185731495e-06, "logits/chosen": -1.2508734464645386, "logits/rejected": -1.32900869846344, "logps/chosen": -4.235801696777344, "logps/rejected": -5.670529842376709, "loss": 20.3076, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3296756148338318, "rewards/margins": 0.09636791795492172, "rewards/rejected": -0.4260435700416565, "step": 1430 }, { "epoch": 0.9030837004405287, "grad_norm": 87.73750305175781, "learning_rate": 1.2380749659767766e-06, "logits/chosen": -1.3343340158462524, "logits/rejected": -1.3880221843719482, "logps/chosen": -4.322578430175781, "logps/rejected": -5.371191501617432, "loss": 20.9961, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.33630794286727905, "rewards/margins": 0.0794602781534195, "rewards/rejected": -0.41576823592185974, "step": 1435 }, { "epoch": 0.9062303335431089, "grad_norm": 72.0036392211914, "learning_rate": 1.1631127272095077e-06, "logits/chosen": -1.3422092199325562, "logits/rejected": -1.4017739295959473, "logps/chosen": -3.97587251663208, "logps/rejected": -5.63102388381958, "loss": 18.4484, "rewards/accuracies": 0.6875, "rewards/chosen": -0.302670419216156, "rewards/margins": 0.1103433147072792, "rewards/rejected": -0.413013756275177, "step": 1440 }, { "epoch": 0.9093769666456891, "grad_norm": 55.72761917114258, "learning_rate": 1.0904233472148862e-06, "logits/chosen": -1.4325498342514038, "logits/rejected": -1.5191594362258911, "logps/chosen": -4.523946285247803, "logps/rejected": -5.913887023925781, "loss": 20.9945, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.34643903374671936, "rewards/margins": 0.07747067511081696, "rewards/rejected": -0.4239097237586975, "step": 1445 }, { "epoch": 0.9125235997482694, "grad_norm": 74.03398132324219, "learning_rate": 1.0200155966933333e-06, "logits/chosen": -1.3860814571380615, "logits/rejected": -1.4824600219726562, "logps/chosen": -4.180668830871582, "logps/rejected": -5.086295127868652, "loss": 22.6256, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.31416332721710205, "rewards/margins": 0.06807545572519302, "rewards/rejected": -0.3822387754917145, "step": 1450 }, { "epoch": 0.9156702328508496, "grad_norm": 55.17578887939453, "learning_rate": 9.51897971043847e-07, "logits/chosen": -1.277956485748291, "logits/rejected": -1.4699045419692993, "logps/chosen": -3.923815965652466, "logps/rejected": -5.776226997375488, "loss": 18.1837, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.30271369218826294, "rewards/margins": 0.13357527554035187, "rewards/rejected": -0.4362889230251312, "step": 1455 }, { "epoch": 0.9188168659534298, "grad_norm": 67.42135620117188, "learning_rate": 8.860786893389761e-07, "logits/chosen": -1.3501498699188232, "logits/rejected": -1.4162402153015137, "logps/chosen": -4.456291198730469, "logps/rejected": -4.891867637634277, "loss": 23.4746, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.35184237360954285, "rewards/margins": 0.03937570005655289, "rewards/rejected": -0.3912180960178375, "step": 1460 }, { "epoch": 0.92196349905601, "grad_norm": 86.8721923828125, "learning_rate": 8.225656933330972e-07, "logits/chosen": -1.396032691001892, "logits/rejected": -1.3607252836227417, "logps/chosen": -4.139504909515381, "logps/rejected": -5.256811618804932, "loss": 20.6197, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.31887346506118774, "rewards/margins": 0.08756524324417114, "rewards/rejected": -0.4064387381076813, "step": 1465 }, { "epoch": 0.9251101321585903, "grad_norm": 63.26131057739258, "learning_rate": 7.613666465041492e-07, "logits/chosen": -1.296687364578247, "logits/rejected": -1.338370442390442, "logps/chosen": -4.0869526863098145, "logps/rejected": -4.680004596710205, "loss": 22.3496, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.30148619413375854, "rewards/margins": 0.06435124576091766, "rewards/rejected": -0.365837424993515, "step": 1470 }, { "epoch": 0.9282567652611705, "grad_norm": 64.71456909179688, "learning_rate": 7.024889331289731e-07, "logits/chosen": -1.3576750755310059, "logits/rejected": -1.4629138708114624, "logps/chosen": -4.305732250213623, "logps/rejected": -6.287524700164795, "loss": 19.0147, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.3270648717880249, "rewards/margins": 0.12565208971500397, "rewards/rejected": -0.45271697640419006, "step": 1475 }, { "epoch": 0.9314033983637507, "grad_norm": 79.55664825439453, "learning_rate": 6.459396573923227e-07, "logits/chosen": -1.2750294208526611, "logits/rejected": -1.3182651996612549, "logps/chosen": -3.8780131340026855, "logps/rejected": -5.497721195220947, "loss": 19.3141, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.29957860708236694, "rewards/margins": 0.11124887317419052, "rewards/rejected": -0.41082748770713806, "step": 1480 }, { "epoch": 0.934550031466331, "grad_norm": 97.28962707519531, "learning_rate": 5.917256425296725e-07, "logits/chosen": -1.3326900005340576, "logits/rejected": -1.3848145008087158, "logps/chosen": -4.326709270477295, "logps/rejected": -5.8570427894592285, "loss": 17.956, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.32169783115386963, "rewards/margins": 0.11987517029047012, "rewards/rejected": -0.44157299399375916, "step": 1485 }, { "epoch": 0.9376966645689113, "grad_norm": 104.4383773803711, "learning_rate": 5.398534300039227e-07, "logits/chosen": -1.3669896125793457, "logits/rejected": -1.4102351665496826, "logps/chosen": -4.2153167724609375, "logps/rejected": -5.1999030113220215, "loss": 20.9588, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3310829997062683, "rewards/margins": 0.07336001843214035, "rewards/rejected": -0.40444302558898926, "step": 1490 }, { "epoch": 0.9408432976714916, "grad_norm": 59.6121826171875, "learning_rate": 4.903292787161129e-07, "logits/chosen": -1.4228112697601318, "logits/rejected": -1.528313159942627, "logps/chosen": -4.338911533355713, "logps/rejected": -5.048561096191406, "loss": 22.4697, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.3458613455295563, "rewards/margins": 0.05565253645181656, "rewards/rejected": -0.40151387453079224, "step": 1495 }, { "epoch": 0.9439899307740718, "grad_norm": 134.8368377685547, "learning_rate": 4.4315916425021755e-07, "logits/chosen": -1.4706683158874512, "logits/rejected": -1.5189244747161865, "logps/chosen": -4.430064678192139, "logps/rejected": -4.881100177764893, "loss": 24.7599, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.34278133511543274, "rewards/margins": 0.03427756577730179, "rewards/rejected": -0.37705889344215393, "step": 1500 }, { "epoch": 0.947136563876652, "grad_norm": 75.44186401367188, "learning_rate": 3.983487781521311e-07, "logits/chosen": -1.3628993034362793, "logits/rejected": -1.5227676630020142, "logps/chosen": -4.508485317230225, "logps/rejected": -5.836249351501465, "loss": 21.4824, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.35081833600997925, "rewards/margins": 0.0795225128531456, "rewards/rejected": -0.43034085631370544, "step": 1505 }, { "epoch": 0.9502831969792322, "grad_norm": 53.86139678955078, "learning_rate": 3.5590352724293565e-07, "logits/chosen": -1.2814509868621826, "logits/rejected": -1.383336067199707, "logps/chosen": -3.697767972946167, "logps/rejected": -5.5374345779418945, "loss": 18.3089, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.28700894117355347, "rewards/margins": 0.12951508164405823, "rewards/rejected": -0.4165240228176117, "step": 1510 }, { "epoch": 0.9534298300818125, "grad_norm": 55.83627700805664, "learning_rate": 3.1582853296649785e-07, "logits/chosen": -1.3301982879638672, "logits/rejected": -1.4231036901474, "logps/chosen": -3.7521042823791504, "logps/rejected": -4.861963748931885, "loss": 19.3616, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2891950309276581, "rewards/margins": 0.09701049327850342, "rewards/rejected": -0.3862055242061615, "step": 1515 }, { "epoch": 0.9565764631843927, "grad_norm": 88.61446380615234, "learning_rate": 2.7812863077153253e-07, "logits/chosen": -1.2899259328842163, "logits/rejected": -1.398050308227539, "logps/chosen": -4.068936824798584, "logps/rejected": -5.717960357666016, "loss": 17.8938, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.31747734546661377, "rewards/margins": 0.11797485500574112, "rewards/rejected": -0.4354521632194519, "step": 1520 }, { "epoch": 0.9597230962869729, "grad_norm": 58.96453857421875, "learning_rate": 2.4280836952814913e-07, "logits/chosen": -1.3611301183700562, "logits/rejected": -1.4117127656936646, "logps/chosen": -4.0526018142700195, "logps/rejected": -5.437824249267578, "loss": 21.3406, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.31340762972831726, "rewards/margins": 0.07557393610477448, "rewards/rejected": -0.38898158073425293, "step": 1525 }, { "epoch": 0.9628697293895532, "grad_norm": 82.22030639648438, "learning_rate": 2.0987201097897757e-07, "logits/chosen": -1.290305256843567, "logits/rejected": -1.3669493198394775, "logps/chosen": -4.012240409851074, "logps/rejected": -6.001503944396973, "loss": 18.4697, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.3110642433166504, "rewards/margins": 0.14227357506752014, "rewards/rejected": -0.45333781838417053, "step": 1530 }, { "epoch": 0.9660163624921334, "grad_norm": 69.16776275634766, "learning_rate": 1.7932352922496844e-07, "logits/chosen": -1.3238952159881592, "logits/rejected": -1.4009875059127808, "logps/chosen": -4.168734550476074, "logps/rejected": -5.520012855529785, "loss": 18.6757, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.3233780264854431, "rewards/margins": 0.10560549795627594, "rewards/rejected": -0.42898350954055786, "step": 1535 }, { "epoch": 0.9691629955947136, "grad_norm": 87.41554260253906, "learning_rate": 1.5116661024584756e-07, "logits/chosen": -1.3047425746917725, "logits/rejected": -1.2935268878936768, "logps/chosen": -3.8972859382629395, "logps/rejected": -5.7956743240356445, "loss": 19.4437, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2976101040840149, "rewards/margins": 0.13567054271697998, "rewards/rejected": -0.4332806169986725, "step": 1540 }, { "epoch": 0.9723096286972939, "grad_norm": 129.71432495117188, "learning_rate": 1.254046514553986e-07, "logits/chosen": -1.3411355018615723, "logits/rejected": -1.3150873184204102, "logps/chosen": -4.793996334075928, "logps/rejected": -6.1579155921936035, "loss": 22.5465, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.36123085021972656, "rewards/margins": 0.08643898367881775, "rewards/rejected": -0.4476698338985443, "step": 1545 }, { "epoch": 0.9754562617998741, "grad_norm": 156.82296752929688, "learning_rate": 1.0204076129150198e-07, "logits/chosen": -1.3176259994506836, "logits/rejected": -1.371140956878662, "logps/chosen": -4.381787300109863, "logps/rejected": -5.822647571563721, "loss": 20.2445, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.322052001953125, "rewards/margins": 0.08496570587158203, "rewards/rejected": -0.40701770782470703, "step": 1550 }, { "epoch": 0.9786028949024543, "grad_norm": 101.22770690917969, "learning_rate": 8.107775884109048e-08, "logits/chosen": -1.377939224243164, "logits/rejected": -1.460756540298462, "logps/chosen": -4.821037292480469, "logps/rejected": -5.5621137619018555, "loss": 23.1685, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.37864041328430176, "rewards/margins": 0.05817138031125069, "rewards/rejected": -0.43681177496910095, "step": 1555 }, { "epoch": 0.9817495280050346, "grad_norm": 93.55181884765625, "learning_rate": 6.251817349998578e-08, "logits/chosen": -1.2559947967529297, "logits/rejected": -1.3171112537384033, "logps/chosen": -3.9931647777557373, "logps/rejected": -5.348459243774414, "loss": 22.9477, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.30862289667129517, "rewards/margins": 0.0842631608247757, "rewards/rejected": -0.39288607239723206, "step": 1560 }, { "epoch": 0.9848961611076148, "grad_norm": 80.63821411132812, "learning_rate": 4.636424466771372e-08, "logits/chosen": -1.24492347240448, "logits/rejected": -1.3349525928497314, "logps/chosen": -4.380553245544434, "logps/rejected": -5.421158313751221, "loss": 22.0329, "rewards/accuracies": 0.6875, "rewards/chosen": -0.34523719549179077, "rewards/margins": 0.07052381336688995, "rewards/rejected": -0.4157610535621643, "step": 1565 }, { "epoch": 0.9880427942101951, "grad_norm": 55.254642486572266, "learning_rate": 3.261792147728704e-08, "logits/chosen": -1.3501121997833252, "logits/rejected": -1.3522610664367676, "logps/chosen": -4.829428195953369, "logps/rejected": -5.480432033538818, "loss": 22.6751, "rewards/accuracies": 0.625, "rewards/chosen": -0.3382914662361145, "rewards/margins": 0.05635923147201538, "rewards/rejected": -0.3946506381034851, "step": 1570 }, { "epoch": 0.9911894273127754, "grad_norm": 102.65123748779297, "learning_rate": 2.1280862560026927e-08, "logits/chosen": -1.350527048110962, "logits/rejected": -1.3495935201644897, "logps/chosen": -3.8183772563934326, "logps/rejected": -4.949650764465332, "loss": 22.3353, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.3000851273536682, "rewards/margins": 0.07860491424798965, "rewards/rejected": -0.37869006395339966, "step": 1575 }, { "epoch": 0.9943360604153556, "grad_norm": 67.94386291503906, "learning_rate": 1.2354435845436385e-08, "logits/chosen": -1.2602336406707764, "logits/rejected": -1.2594802379608154, "logps/chosen": -3.5885491371154785, "logps/rejected": -4.909377098083496, "loss": 18.7801, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.27459749579429626, "rewards/margins": 0.10287781804800034, "rewards/rejected": -0.3774753212928772, "step": 1580 }, { "epoch": 0.9974826935179358, "grad_norm": 78.847412109375, "learning_rate": 5.8397183961411694e-09, "logits/chosen": -1.4188308715820312, "logits/rejected": -1.3911654949188232, "logps/chosen": -4.257325649261475, "logps/rejected": -5.559029579162598, "loss": 20.67, "rewards/accuracies": 0.6875, "rewards/chosen": -0.30969610810279846, "rewards/margins": 0.08111827820539474, "rewards/rejected": -0.3908143639564514, "step": 1585 }, { "epoch": 1.0, "step": 1589, "total_flos": 0.0, "train_loss": 22.009478435192264, "train_runtime": 23016.83, "train_samples_per_second": 1.105, "train_steps_per_second": 0.069 } ], "logging_steps": 5, "max_steps": 1589, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }