diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4797 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 100, + "global_step": 1589, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0031466331025802393, + "grad_norm": 8.804184913635254, + "learning_rate": 1.257861635220126e-06, + "logits/chosen": -0.11013289541006088, + "logits/rejected": -0.5208367109298706, + "logps/chosen": -0.8537980914115906, + "logps/rejected": -1.0550096035003662, + "loss": 24.9985, + "rewards/accuracies": 0.3125, + "rewards/chosen": -5.359128408599645e-06, + "rewards/margins": 1.545622944831848e-05, + "rewards/rejected": -2.081535967590753e-05, + "step": 5 + }, + { + "epoch": 0.0062932662051604785, + "grad_norm": 18.678768157958984, + "learning_rate": 2.2641509433962266e-06, + "logits/chosen": -0.3030635714530945, + "logits/rejected": -0.5435053706169128, + "logps/chosen": -0.9865642786026001, + "logps/rejected": -1.107262372970581, + "loss": 24.9967, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.00010868474782910198, + "rewards/margins": 3.348257814650424e-05, + "rewards/rejected": -0.00014216733688954264, + "step": 10 + }, + { + "epoch": 0.009439899307740718, + "grad_norm": 11.281435012817383, + "learning_rate": 3.5220125786163524e-06, + "logits/chosen": -0.5111545324325562, + "logits/rejected": -0.8536307215690613, + "logps/chosen": -1.0305876731872559, + "logps/rejected": -1.2494089603424072, + "loss": 24.9847, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.00031705142464488745, + "rewards/margins": 0.000152838954818435, + "rewards/rejected": -0.00046989036491140723, + "step": 15 + }, + { + "epoch": 0.012586532410320957, + "grad_norm": 59.5455322265625, + "learning_rate": 4.528301886792453e-06, + "logits/chosen": -0.616014838218689, + "logits/rejected": -0.6851056218147278, + "logps/chosen": -1.130916953086853, + "logps/rejected": -1.46986985206604, + "loss": 24.9645, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.0012707245768979192, + "rewards/margins": 0.0003901523014064878, + "rewards/rejected": -0.0016608769074082375, + "step": 20 + }, + { + "epoch": 0.015733165512901194, + "grad_norm": 9.532500267028809, + "learning_rate": 5.786163522012579e-06, + "logits/chosen": -0.12423186004161835, + "logits/rejected": -0.4599896967411041, + "logps/chosen": -0.8485546112060547, + "logps/rejected": -1.0018525123596191, + "loss": 24.9267, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.001064571551978588, + "rewards/margins": 0.0007417487213388085, + "rewards/rejected": -0.0018063202733173966, + "step": 25 + }, + { + "epoch": 0.018879798615481436, + "grad_norm": 11.064372062683105, + "learning_rate": 7.044025157232705e-06, + "logits/chosen": -0.1580429971218109, + "logits/rejected": -0.38266992568969727, + "logps/chosen": -0.8662201166152954, + "logps/rejected": -1.0262982845306396, + "loss": 24.8872, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.0021042851731181145, + "rewards/margins": 0.0011668736115098, + "rewards/rejected": -0.0032711587846279144, + "step": 30 + }, + { + "epoch": 0.022026431718061675, + "grad_norm": 37.646690368652344, + "learning_rate": 8.301886792452832e-06, + "logits/chosen": 0.026471847668290138, + "logits/rejected": -0.49966034293174744, + "logps/chosen": -0.8883110880851746, + "logps/rejected": -1.199055790901184, + "loss": 24.6732, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.003758195089176297, + "rewards/margins": 0.0034271504264324903, + "rewards/rejected": -0.0071853450499475, + "step": 35 + }, + { + "epoch": 0.025173064820641914, + "grad_norm": 34.88091278076172, + "learning_rate": 9.559748427672956e-06, + "logits/chosen": -0.36181551218032837, + "logits/rejected": -0.6659843325614929, + "logps/chosen": -0.9565097689628601, + "logps/rejected": -1.183980941772461, + "loss": 24.6032, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.007488996721804142, + "rewards/margins": 0.004166100639849901, + "rewards/rejected": -0.011655096895992756, + "step": 40 + }, + { + "epoch": 0.028319697923222153, + "grad_norm": 20.635541915893555, + "learning_rate": 1.0817610062893083e-05, + "logits/chosen": -0.5469181537628174, + "logits/rejected": -0.7580572366714478, + "logps/chosen": -1.0930149555206299, + "logps/rejected": -1.2114075422286987, + "loss": 24.7536, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0149534298107028, + "rewards/margins": 0.002928597154095769, + "rewards/rejected": -0.017882030457258224, + "step": 45 + }, + { + "epoch": 0.03146633102580239, + "grad_norm": 36.45863723754883, + "learning_rate": 1.2075471698113209e-05, + "logits/chosen": -0.5085287094116211, + "logits/rejected": -0.7208930253982544, + "logps/chosen": -1.083438515663147, + "logps/rejected": -1.3884985446929932, + "loss": 23.9964, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.018525470048189163, + "rewards/margins": 0.011683688499033451, + "rewards/rejected": -0.03020915761590004, + "step": 50 + }, + { + "epoch": 0.034612964128382634, + "grad_norm": 45.08958053588867, + "learning_rate": 1.3081761006289308e-05, + "logits/chosen": -0.6235328912734985, + "logits/rejected": -0.8463523983955383, + "logps/chosen": -1.1567853689193726, + "logps/rejected": -2.149567127227783, + "loss": 23.8291, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.028398022055625916, + "rewards/margins": 0.03046615794301033, + "rewards/rejected": -0.05886417627334595, + "step": 55 + }, + { + "epoch": 0.03775959723096287, + "grad_norm": 48.65830612182617, + "learning_rate": 1.408805031446541e-05, + "logits/chosen": -0.6318017244338989, + "logits/rejected": -0.9939996600151062, + "logps/chosen": -1.7119739055633545, + "logps/rejected": -2.3402199745178223, + "loss": 23.5592, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.060618169605731964, + "rewards/margins": 0.028203105553984642, + "rewards/rejected": -0.08882127702236176, + "step": 60 + }, + { + "epoch": 0.04090623033354311, + "grad_norm": 138.4451904296875, + "learning_rate": 1.5345911949685536e-05, + "logits/chosen": -0.9717090725898743, + "logits/rejected": -1.1959871053695679, + "logps/chosen": -1.9082473516464233, + "logps/rejected": -2.449486494064331, + "loss": 22.7524, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08517814427614212, + "rewards/margins": 0.03464372828602791, + "rewards/rejected": -0.11982186883687973, + "step": 65 + }, + { + "epoch": 0.04405286343612335, + "grad_norm": 49.257568359375, + "learning_rate": 1.6603773584905664e-05, + "logits/chosen": -0.7433441281318665, + "logits/rejected": -1.0399134159088135, + "logps/chosen": -2.255545139312744, + "logps/rejected": -2.98321533203125, + "loss": 23.4113, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.11126575618982315, + "rewards/margins": 0.04789165034890175, + "rewards/rejected": -0.159157395362854, + "step": 70 + }, + { + "epoch": 0.04719949653870359, + "grad_norm": 56.168670654296875, + "learning_rate": 1.7861635220125788e-05, + "logits/chosen": -1.0234445333480835, + "logits/rejected": -1.288999080657959, + "logps/chosen": -1.653058648109436, + "logps/rejected": -2.370941162109375, + "loss": 22.181, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08025868237018585, + "rewards/margins": 0.044546954333782196, + "rewards/rejected": -0.12480561435222626, + "step": 75 + }, + { + "epoch": 0.05034612964128383, + "grad_norm": NaN, + "learning_rate": 1.8867924528301888e-05, + "logits/chosen": -1.1835613250732422, + "logits/rejected": -1.4036767482757568, + "logps/chosen": -1.90883469581604, + "logps/rejected": -2.1450889110565186, + "loss": 25.8869, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.09280741214752197, + "rewards/margins": 0.021463319659233093, + "rewards/rejected": -0.11427073180675507, + "step": 80 + }, + { + "epoch": 0.05349276274386407, + "grad_norm": 103.32862091064453, + "learning_rate": 2.0125786163522016e-05, + "logits/chosen": -1.539898157119751, + "logits/rejected": -1.6518356800079346, + "logps/chosen": -2.0776684284210205, + "logps/rejected": -2.5599067211151123, + "loss": 24.1212, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.12567706406116486, + "rewards/margins": 0.026400262489914894, + "rewards/rejected": -0.152077317237854, + "step": 85 + }, + { + "epoch": 0.056639395846444306, + "grad_norm": 176.14784240722656, + "learning_rate": 2.138364779874214e-05, + "logits/chosen": -1.3818124532699585, + "logits/rejected": -1.5843524932861328, + "logps/chosen": -2.48514461517334, + "logps/rejected": -2.8115108013153076, + "loss": 26.3518, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15766306221485138, + "rewards/margins": 0.025023411959409714, + "rewards/rejected": -0.1826864778995514, + "step": 90 + }, + { + "epoch": 0.059786028949024544, + "grad_norm": 106.15309143066406, + "learning_rate": 2.2641509433962265e-05, + "logits/chosen": -1.5876004695892334, + "logits/rejected": -1.7525005340576172, + "logps/chosen": -2.2529654502868652, + "logps/rejected": -3.2197937965393066, + "loss": 20.8074, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.14453086256980896, + "rewards/margins": 0.07421146333217621, + "rewards/rejected": -0.21874232590198517, + "step": 95 + }, + { + "epoch": 0.06293266205160478, + "grad_norm": 91.7872085571289, + "learning_rate": 2.3899371069182393e-05, + "logits/chosen": -1.6880552768707275, + "logits/rejected": -1.667741060256958, + "logps/chosen": -3.5453040599823, + "logps/rejected": -3.8808798789978027, + "loss": 24.6555, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.2320992648601532, + "rewards/margins": 0.020265836268663406, + "rewards/rejected": -0.2523651123046875, + "step": 100 + }, + { + "epoch": 0.06607929515418502, + "grad_norm": 778.8959350585938, + "learning_rate": 2.4905660377358492e-05, + "logits/chosen": -1.8318984508514404, + "logits/rejected": -1.8932411670684814, + "logps/chosen": -3.125164031982422, + "logps/rejected": -4.746774673461914, + "loss": 27.3293, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.19288301467895508, + "rewards/margins": 0.096275694668293, + "rewards/rejected": -0.28915873169898987, + "step": 105 + }, + { + "epoch": 0.06922592825676527, + "grad_norm": 132.40354919433594, + "learning_rate": 2.6163522012578617e-05, + "logits/chosen": -1.7445348501205444, + "logits/rejected": -1.902320146560669, + "logps/chosen": -1.9325546026229858, + "logps/rejected": -3.3019511699676514, + "loss": 21.7317, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.10555150359869003, + "rewards/margins": 0.07426220178604126, + "rewards/rejected": -0.1798136979341507, + "step": 110 + }, + { + "epoch": 0.0723725613593455, + "grad_norm": 98.48819732666016, + "learning_rate": 2.742138364779874e-05, + "logits/chosen": -1.7994951009750366, + "logits/rejected": -1.9057296514511108, + "logps/chosen": -2.1663613319396973, + "logps/rejected": -2.82452392578125, + "loss": 22.7429, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.1297454833984375, + "rewards/margins": 0.04077299311757088, + "rewards/rejected": -0.17051845788955688, + "step": 115 + }, + { + "epoch": 0.07551919446192575, + "grad_norm": 93.13198852539062, + "learning_rate": 2.867924528301887e-05, + "logits/chosen": -1.6606374979019165, + "logits/rejected": -1.7864787578582764, + "logps/chosen": -2.2936453819274902, + "logps/rejected": -2.5704400539398193, + "loss": 24.0989, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.12692785263061523, + "rewards/margins": 0.020071204751729965, + "rewards/rejected": -0.1469990611076355, + "step": 120 + }, + { + "epoch": 0.07866582756450598, + "grad_norm": 101.10535430908203, + "learning_rate": 2.968553459119497e-05, + "logits/chosen": -1.648816704750061, + "logits/rejected": -1.6658546924591064, + "logps/chosen": -2.0479884147644043, + "logps/rejected": -2.8278560638427734, + "loss": 27.9983, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.11854568868875504, + "rewards/margins": 0.0439017117023468, + "rewards/rejected": -0.16244739294052124, + "step": 125 + }, + { + "epoch": 0.08181246066708622, + "grad_norm": 92.93740844726562, + "learning_rate": 3.09433962264151e-05, + "logits/chosen": -1.7306410074234009, + "logits/rejected": -1.8349339962005615, + "logps/chosen": -2.082920551300049, + "logps/rejected": -3.115952253341675, + "loss": 23.5299, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11979808658361435, + "rewards/margins": 0.06730449199676514, + "rewards/rejected": -0.18710258603096008, + "step": 130 + }, + { + "epoch": 0.08495909376966645, + "grad_norm": 123.31324768066406, + "learning_rate": 3.220125786163522e-05, + "logits/chosen": -1.8235572576522827, + "logits/rejected": -1.8541405200958252, + "logps/chosen": -1.9667946100234985, + "logps/rejected": -2.772089958190918, + "loss": 22.6137, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.10746946185827255, + "rewards/margins": 0.04866869002580643, + "rewards/rejected": -0.15613815188407898, + "step": 135 + }, + { + "epoch": 0.0881057268722467, + "grad_norm": 126.42218017578125, + "learning_rate": 3.345911949685535e-05, + "logits/chosen": -1.674515962600708, + "logits/rejected": -1.8894662857055664, + "logps/chosen": -2.245245933532715, + "logps/rejected": -3.0301966667175293, + "loss": 22.6984, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.11901465803384781, + "rewards/margins": 0.049767203629016876, + "rewards/rejected": -0.16878187656402588, + "step": 140 + }, + { + "epoch": 0.09125235997482693, + "grad_norm": 114.61023712158203, + "learning_rate": 3.471698113207548e-05, + "logits/chosen": -1.7905619144439697, + "logits/rejected": -1.8821656703948975, + "logps/chosen": -3.373708724975586, + "logps/rejected": -4.691153526306152, + "loss": 22.2353, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18414758145809174, + "rewards/margins": 0.0776277631521225, + "rewards/rejected": -0.26177531480789185, + "step": 145 + }, + { + "epoch": 0.09439899307740718, + "grad_norm": 296.22955322265625, + "learning_rate": 3.59748427672956e-05, + "logits/chosen": -1.654166579246521, + "logits/rejected": -1.847495436668396, + "logps/chosen": -3.2497410774230957, + "logps/rejected": -4.303386688232422, + "loss": 20.9992, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.22493870556354523, + "rewards/margins": 0.07029641419649124, + "rewards/rejected": -0.2952350974082947, + "step": 150 + }, + { + "epoch": 0.09754562617998741, + "grad_norm": 579.211669921875, + "learning_rate": 3.7232704402515726e-05, + "logits/chosen": -1.6689754724502563, + "logits/rejected": -1.7173473834991455, + "logps/chosen": -3.7695910930633545, + "logps/rejected": -4.783900260925293, + "loss": 25.2195, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.2695242762565613, + "rewards/margins": 0.05760473012924194, + "rewards/rejected": -0.3271290063858032, + "step": 155 + }, + { + "epoch": 0.10069225928256766, + "grad_norm": 200.13609313964844, + "learning_rate": 3.8490566037735854e-05, + "logits/chosen": -1.7428325414657593, + "logits/rejected": -1.74752938747406, + "logps/chosen": -3.6156649589538574, + "logps/rejected": -4.805546760559082, + "loss": 22.7118, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.2704419791698456, + "rewards/margins": 0.06444540619850159, + "rewards/rejected": -0.33488741517066956, + "step": 160 + }, + { + "epoch": 0.10383889238514789, + "grad_norm": 172.51622009277344, + "learning_rate": 3.9748427672955975e-05, + "logits/chosen": -1.7474384307861328, + "logits/rejected": -1.7428706884384155, + "logps/chosen": -3.276729106903076, + "logps/rejected": -4.082120418548584, + "loss": 22.3077, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.2311684638261795, + "rewards/margins": 0.051828593015670776, + "rewards/rejected": -0.2829970717430115, + "step": 165 + }, + { + "epoch": 0.10698552548772813, + "grad_norm": 146.08352661132812, + "learning_rate": 3.9999227773220194e-05, + "logits/chosen": -1.6052366495132446, + "logits/rejected": -1.6235520839691162, + "logps/chosen": -3.030139207839966, + "logps/rejected": -4.707204818725586, + "loss": 20.0014, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.21569938957691193, + "rewards/margins": 0.12310032546520233, + "rewards/rejected": -0.33879974484443665, + "step": 170 + }, + { + "epoch": 0.11013215859030837, + "grad_norm": 133.93203735351562, + "learning_rate": 3.9996090704130684e-05, + "logits/chosen": -1.7021839618682861, + "logits/rejected": -1.7295335531234741, + "logps/chosen": -3.9147982597351074, + "logps/rejected": -5.332208633422852, + "loss": 20.047, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.3039882779121399, + "rewards/margins": 0.1180083155632019, + "rewards/rejected": -0.4219965934753418, + "step": 175 + }, + { + "epoch": 0.11327879169288861, + "grad_norm": 558.7332763671875, + "learning_rate": 3.999054090678532e-05, + "logits/chosen": -1.5368597507476807, + "logits/rejected": -1.592637300491333, + "logps/chosen": -6.026860237121582, + "logps/rejected": -6.550711631774902, + "loss": 29.6933, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.4695097804069519, + "rewards/margins": 0.02213056944310665, + "rewards/rejected": -0.4916403889656067, + "step": 180 + }, + { + "epoch": 0.11642542479546884, + "grad_norm": 212.05760192871094, + "learning_rate": 3.9982579050822615e-05, + "logits/chosen": -1.5933212041854858, + "logits/rejected": -1.5753694772720337, + "logps/chosen": -4.716382026672363, + "logps/rejected": -5.257371425628662, + "loss": 27.5815, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3512773811817169, + "rewards/margins": 0.033867720514535904, + "rewards/rejected": -0.3851450979709625, + "step": 185 + }, + { + "epoch": 0.11957205789804909, + "grad_norm": 134.0122833251953, + "learning_rate": 3.997220609692011e-05, + "logits/chosen": -1.6495559215545654, + "logits/rejected": -1.6725133657455444, + "logps/chosen": -3.984989643096924, + "logps/rejected": -5.001562595367432, + "loss": 22.6766, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2919956147670746, + "rewards/margins": 0.05422482639551163, + "rewards/rejected": -0.3462204337120056, + "step": 190 + }, + { + "epoch": 0.12271869100062932, + "grad_norm": 151.4617919921875, + "learning_rate": 3.9959423296678384e-05, + "logits/chosen": -1.7128961086273193, + "logits/rejected": -1.6318174600601196, + "logps/chosen": -3.3435721397399902, + "logps/rejected": -4.078289985656738, + "loss": 25.0994, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.23941469192504883, + "rewards/margins": 0.04007013887166977, + "rewards/rejected": -0.2794848084449768, + "step": 195 + }, + { + "epoch": 0.12586532410320955, + "grad_norm": 115.02042388916016, + "learning_rate": 3.9944232192470094e-05, + "logits/chosen": -1.7137172222137451, + "logits/rejected": -1.7910420894622803, + "logps/chosen": -3.106358051300049, + "logps/rejected": -3.97111439704895, + "loss": 21.6293, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.2061949521303177, + "rewards/margins": 0.04795767739415169, + "rewards/rejected": -0.2541525959968567, + "step": 200 + }, + { + "epoch": 0.1290119572057898, + "grad_norm": 81.87369537353516, + "learning_rate": 3.992663461725383e-05, + "logits/chosen": -1.5431886911392212, + "logits/rejected": -1.557018518447876, + "logps/chosen": -2.805392026901245, + "logps/rejected": -4.356006622314453, + "loss": 21.8817, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.1901206076145172, + "rewards/margins": 0.0818587988615036, + "rewards/rejected": -0.2719793915748596, + "step": 205 + }, + { + "epoch": 0.13215859030837004, + "grad_norm": 188.29173278808594, + "learning_rate": 3.990663269435298e-05, + "logits/chosen": -1.6920125484466553, + "logits/rejected": -1.6854931116104126, + "logps/chosen": -3.156735897064209, + "logps/rejected": -4.396471977233887, + "loss": 27.5638, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.20159177482128143, + "rewards/margins": 0.07304342836141586, + "rewards/rejected": -0.2746351957321167, + "step": 210 + }, + { + "epoch": 0.13530522341095028, + "grad_norm": 141.69142150878906, + "learning_rate": 3.98842288371995e-05, + "logits/chosen": -1.6487762928009033, + "logits/rejected": -1.7372210025787354, + "logps/chosen": -2.6156325340270996, + "logps/rejected": -3.677928924560547, + "loss": 21.5613, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.1552310734987259, + "rewards/margins": 0.0671745091676712, + "rewards/rejected": -0.2224055826663971, + "step": 215 + }, + { + "epoch": 0.13845185651353054, + "grad_norm": 92.31954193115234, + "learning_rate": 3.985942574904275e-05, + "logits/chosen": -1.677199363708496, + "logits/rejected": -1.6414434909820557, + "logps/chosen": -2.499932050704956, + "logps/rejected": -3.3010895252227783, + "loss": 22.2151, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.16245323419570923, + "rewards/margins": 0.05558500811457634, + "rewards/rejected": -0.21803824603557587, + "step": 220 + }, + { + "epoch": 0.14159848961611077, + "grad_norm": 106.32303619384766, + "learning_rate": 3.983222642262329e-05, + "logits/chosen": -1.6422779560089111, + "logits/rejected": -1.6500838994979858, + "logps/chosen": -2.66230845451355, + "logps/rejected": -3.7150185108184814, + "loss": 20.2102, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.18372972309589386, + "rewards/margins": 0.08325181156396866, + "rewards/rejected": -0.2669815421104431, + "step": 225 + }, + { + "epoch": 0.144745122718691, + "grad_norm": 113.5155029296875, + "learning_rate": 3.980263413981178e-05, + "logits/chosen": -1.5669496059417725, + "logits/rejected": -1.5747731924057007, + "logps/chosen": -3.1706671714782715, + "logps/rejected": -3.948005199432373, + "loss": 21.8852, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2336561679840088, + "rewards/margins": 0.06708776950836182, + "rewards/rejected": -0.300743967294693, + "step": 230 + }, + { + "epoch": 0.14789175582127123, + "grad_norm": 99.03396606445312, + "learning_rate": 3.977065247121298e-05, + "logits/chosen": -1.639129400253296, + "logits/rejected": -1.6693006753921509, + "logps/chosen": -3.2495856285095215, + "logps/rejected": -4.634251594543457, + "loss": 22.4292, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.23980684578418732, + "rewards/margins": 0.10619230568408966, + "rewards/rejected": -0.345999151468277, + "step": 235 + }, + { + "epoch": 0.1510383889238515, + "grad_norm": 254.25714111328125, + "learning_rate": 3.973628527573495e-05, + "logits/chosen": -1.4451357126235962, + "logits/rejected": -1.415290355682373, + "logps/chosen": -4.496035575866699, + "logps/rejected": -5.4740800857543945, + "loss": 24.0697, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.3446175158023834, + "rewards/margins": 0.07305804640054703, + "rewards/rejected": -0.41767558455467224, + "step": 240 + }, + { + "epoch": 0.15418502202643172, + "grad_norm": 98.8416976928711, + "learning_rate": 3.969953670012342e-05, + "logits/chosen": -1.6127903461456299, + "logits/rejected": -1.529802918434143, + "logps/chosen": -3.744677782058716, + "logps/rejected": -5.76874303817749, + "loss": 20.2498, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2814852297306061, + "rewards/margins": 0.12259259074926376, + "rewards/rejected": -0.40407782793045044, + "step": 245 + }, + { + "epoch": 0.15733165512901195, + "grad_norm": 174.65185546875, + "learning_rate": 3.9660411178461427e-05, + "logits/chosen": -1.6170070171356201, + "logits/rejected": -1.5994997024536133, + "logps/chosen": -3.390500545501709, + "logps/rejected": -4.377715587615967, + "loss": 22.3596, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2286950647830963, + "rewards/margins": 0.07162559777498245, + "rewards/rejected": -0.30032065510749817, + "step": 250 + }, + { + "epoch": 0.1604782882315922, + "grad_norm": 98.30694580078125, + "learning_rate": 3.9618913431634326e-05, + "logits/chosen": -1.5248662233352661, + "logits/rejected": -1.570233702659607, + "logps/chosen": -2.914156436920166, + "logps/rejected": -3.4877963066101074, + "loss": 21.8392, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.17873355746269226, + "rewards/margins": 0.04700728505849838, + "rewards/rejected": -0.22574086487293243, + "step": 255 + }, + { + "epoch": 0.16362492133417245, + "grad_norm": 108.39894104003906, + "learning_rate": 3.957504846676015e-05, + "logits/chosen": -1.5246005058288574, + "logits/rejected": -1.6037238836288452, + "logps/chosen": -3.113523006439209, + "logps/rejected": -4.024534702301025, + "loss": 21.9178, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.21206626296043396, + "rewards/margins": 0.06214412301778793, + "rewards/rejected": -0.2742103934288025, + "step": 260 + }, + { + "epoch": 0.16677155443675268, + "grad_norm": 122.53215026855469, + "learning_rate": 3.952882157658545e-05, + "logits/chosen": -1.4534975290298462, + "logits/rejected": -1.4294064044952393, + "logps/chosen": -3.44130277633667, + "logps/rejected": -3.7570698261260986, + "loss": 25.6929, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.2504199147224426, + "rewards/margins": 0.021822316572070122, + "rewards/rejected": -0.2722422182559967, + "step": 265 + }, + { + "epoch": 0.1699181875393329, + "grad_norm": 117.72502899169922, + "learning_rate": 3.948023833884667e-05, + "logits/chosen": -1.596609354019165, + "logits/rejected": -1.6202917098999023, + "logps/chosen": -3.7515816688537598, + "logps/rejected": -3.9420647621154785, + "loss": 25.1709, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.23414090275764465, + "rewards/margins": 0.027978042140603065, + "rewards/rejected": -0.26211896538734436, + "step": 270 + }, + { + "epoch": 0.17306482064191314, + "grad_norm": 84.38936614990234, + "learning_rate": 3.942930461559718e-05, + "logits/chosen": -1.5714600086212158, + "logits/rejected": -1.683579683303833, + "logps/chosen": -3.3148865699768066, + "logps/rejected": -3.7648849487304688, + "loss": 24.1859, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.2219853699207306, + "rewards/margins": 0.02847103402018547, + "rewards/rejected": -0.25045639276504517, + "step": 275 + }, + { + "epoch": 0.1762114537444934, + "grad_norm": 122.990478515625, + "learning_rate": 3.9376026552499894e-05, + "logits/chosen": -1.5986852645874023, + "logits/rejected": -1.6811764240264893, + "logps/chosen": -3.261617660522461, + "logps/rejected": -4.3173418045043945, + "loss": 19.8872, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.22893181443214417, + "rewards/margins": 0.0762997642159462, + "rewards/rejected": -0.30523157119750977, + "step": 280 + }, + { + "epoch": 0.17935808684707363, + "grad_norm": 128.1126251220703, + "learning_rate": 3.9320410578085774e-05, + "logits/chosen": -1.5240422487258911, + "logits/rejected": -1.5410079956054688, + "logps/chosen": -3.7498767375946045, + "logps/rejected": -4.466190338134766, + "loss": 22.8035, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.2702712118625641, + "rewards/margins": 0.0467303991317749, + "rewards/rejected": -0.3170016407966614, + "step": 285 + }, + { + "epoch": 0.18250471994965387, + "grad_norm": 160.00189208984375, + "learning_rate": 3.9262463402978165e-05, + "logits/chosen": -1.413119912147522, + "logits/rejected": -1.3633155822753906, + "logps/chosen": -3.8721237182617188, + "logps/rejected": -5.0125298500061035, + "loss": 22.2056, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3019005358219147, + "rewards/margins": 0.08287017047405243, + "rewards/rejected": -0.3847707211971283, + "step": 290 + }, + { + "epoch": 0.1856513530522341, + "grad_norm": 168.05908203125, + "learning_rate": 3.920219201908306e-05, + "logits/chosen": -1.2270746231079102, + "logits/rejected": -1.2809008359909058, + "logps/chosen": -4.052460670471191, + "logps/rejected": -5.228961944580078, + "loss": 21.1427, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.3124389052391052, + "rewards/margins": 0.0833948403596878, + "rewards/rejected": -0.3958337903022766, + "step": 295 + }, + { + "epoch": 0.18879798615481436, + "grad_norm": 94.47506713867188, + "learning_rate": 3.9139603698745514e-05, + "logits/chosen": -1.1681110858917236, + "logits/rejected": -1.2372829914093018, + "logps/chosen": -3.511944532394409, + "logps/rejected": -4.100220680236816, + "loss": 22.7025, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.24873590469360352, + "rewards/margins": 0.03639525547623634, + "rewards/rejected": -0.28513115644454956, + "step": 300 + }, + { + "epoch": 0.1919446192573946, + "grad_norm": 560.2835083007812, + "learning_rate": 3.907470599387209e-05, + "logits/chosen": -1.101466178894043, + "logits/rejected": -1.0982881784439087, + "logps/chosen": -3.0287392139434814, + "logps/rejected": -3.3412985801696777, + "loss": 25.2732, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.210398867726326, + "rewards/margins": 0.023965148255228996, + "rewards/rejected": -0.23436403274536133, + "step": 305 + }, + { + "epoch": 0.19509125235997482, + "grad_norm": 190.03529357910156, + "learning_rate": 3.900750673501971e-05, + "logits/chosen": -0.8078586459159851, + "logits/rejected": -1.0514795780181885, + "logps/chosen": -2.391371250152588, + "logps/rejected": -3.401437282562256, + "loss": 21.6721, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.14814777672290802, + "rewards/margins": 0.07396493852138519, + "rewards/rejected": -0.22211270034313202, + "step": 310 + }, + { + "epoch": 0.19823788546255505, + "grad_norm": 127.30278778076172, + "learning_rate": 3.893801403045078e-05, + "logits/chosen": -0.9948938488960266, + "logits/rejected": -1.1343729496002197, + "logps/chosen": -2.520848274230957, + "logps/rejected": -3.695737838745117, + "loss": 21.1395, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.16981211304664612, + "rewards/margins": 0.08695949614048004, + "rewards/rejected": -0.25677159428596497, + "step": 315 + }, + { + "epoch": 0.2013845185651353, + "grad_norm": 164.39279174804688, + "learning_rate": 3.8866236265154864e-05, + "logits/chosen": -1.059020757675171, + "logits/rejected": -1.1909369230270386, + "logps/chosen": -3.2958297729492188, + "logps/rejected": -4.60178279876709, + "loss": 23.1263, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.24349625408649445, + "rewards/margins": 0.08941353857517242, + "rewards/rejected": -0.33290979266166687, + "step": 320 + }, + { + "epoch": 0.20453115166771554, + "grad_norm": 317.5319519042969, + "learning_rate": 3.8792182099836956e-05, + "logits/chosen": -1.1690977811813354, + "logits/rejected": -1.221868872642517, + "logps/chosen": -3.4916579723358154, + "logps/rejected": -4.967286109924316, + "loss": 19.5685, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.2618243992328644, + "rewards/margins": 0.09256020933389664, + "rewards/rejected": -0.35438457131385803, + "step": 325 + }, + { + "epoch": 0.20767778477029578, + "grad_norm": 113.65757751464844, + "learning_rate": 3.8715860469872456e-05, + "logits/chosen": -1.230567216873169, + "logits/rejected": -1.2354533672332764, + "logps/chosen": -4.1219682693481445, + "logps/rejected": -5.140664577484131, + "loss": 24.1262, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.3101332485675812, + "rewards/margins": 0.07826542854309082, + "rewards/rejected": -0.3883987069129944, + "step": 330 + }, + { + "epoch": 0.21082441787287604, + "grad_norm": 103.66908264160156, + "learning_rate": 3.863728058422905e-05, + "logits/chosen": -1.1679656505584717, + "logits/rejected": -1.2492824792861938, + "logps/chosen": -4.176590442657471, + "logps/rejected": -5.121442794799805, + "loss": 21.9799, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3107621371746063, + "rewards/margins": 0.07555123418569565, + "rewards/rejected": -0.38631340861320496, + "step": 335 + }, + { + "epoch": 0.21397105097545627, + "grad_norm": 187.34596252441406, + "learning_rate": 3.855645192435555e-05, + "logits/chosen": -1.4208840131759644, + "logits/rejected": -1.357755422592163, + "logps/chosen": -3.746802568435669, + "logps/rejected": -4.651678562164307, + "loss": 21.8739, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.2814106345176697, + "rewards/margins": 0.06742358207702637, + "rewards/rejected": -0.34883421659469604, + "step": 340 + }, + { + "epoch": 0.2171176840780365, + "grad_norm": 128.47970581054688, + "learning_rate": 3.847338424303787e-05, + "logits/chosen": -1.403939962387085, + "logits/rejected": -1.3926942348480225, + "logps/chosen": -3.540362596511841, + "logps/rejected": -4.463648796081543, + "loss": 22.9837, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2591942250728607, + "rewards/margins": 0.06667342782020569, + "rewards/rejected": -0.3258676528930664, + "step": 345 + }, + { + "epoch": 0.22026431718061673, + "grad_norm": 91.00343322753906, + "learning_rate": 3.838808756322222e-05, + "logits/chosen": -1.4555909633636475, + "logits/rejected": -1.4179480075836182, + "logps/chosen": -3.3319029808044434, + "logps/rejected": -4.7188615798950195, + "loss": 22.182, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.24019880592823029, + "rewards/margins": 0.09150617569684982, + "rewards/rejected": -0.3317049741744995, + "step": 350 + }, + { + "epoch": 0.223410950283197, + "grad_norm": 89.21013641357422, + "learning_rate": 3.8300572176805796e-05, + "logits/chosen": -1.505953073501587, + "logits/rejected": -1.4713289737701416, + "logps/chosen": -3.2633144855499268, + "logps/rejected": -4.148341655731201, + "loss": 22.4622, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.23655016720294952, + "rewards/margins": 0.04711543396115303, + "rewards/rejected": -0.28366559743881226, + "step": 355 + }, + { + "epoch": 0.22655758338577722, + "grad_norm": 136.71780395507812, + "learning_rate": 3.82108486433949e-05, + "logits/chosen": -1.4959208965301514, + "logits/rejected": -1.4308115243911743, + "logps/chosen": -3.161681652069092, + "logps/rejected": -3.9897868633270264, + "loss": 23.3097, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.2291373759508133, + "rewards/margins": 0.045841820538043976, + "rewards/rejected": -0.2749791741371155, + "step": 360 + }, + { + "epoch": 0.22970421648835745, + "grad_norm": 233.93896484375, + "learning_rate": 3.8118927789030854e-05, + "logits/chosen": -1.5138304233551025, + "logits/rejected": -1.5346544981002808, + "logps/chosen": -4.37386417388916, + "logps/rejected": -5.469226837158203, + "loss": 20.9319, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3008665442466736, + "rewards/margins": 0.07115120440721512, + "rewards/rejected": -0.3720177412033081, + "step": 365 + }, + { + "epoch": 0.2328508495909377, + "grad_norm": 100.57418060302734, + "learning_rate": 3.802482070488373e-05, + "logits/chosen": -1.3890790939331055, + "logits/rejected": -1.4434179067611694, + "logps/chosen": -3.4095160961151123, + "logps/rejected": -4.254734039306641, + "loss": 21.2175, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.24669814109802246, + "rewards/margins": 0.06480761617422104, + "rewards/rejected": -0.3115057349205017, + "step": 370 + }, + { + "epoch": 0.23599748269351795, + "grad_norm": 194.1370391845703, + "learning_rate": 3.792853874591408e-05, + "logits/chosen": -1.5562362670898438, + "logits/rejected": -1.4487522840499878, + "logps/chosen": -3.45831561088562, + "logps/rejected": -4.16960334777832, + "loss": 24.8363, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.23216786980628967, + "rewards/margins": 0.047336287796497345, + "rewards/rejected": -0.2795041799545288, + "step": 375 + }, + { + "epoch": 0.23914411579609818, + "grad_norm": 88.31356811523438, + "learning_rate": 3.783009352950282e-05, + "logits/chosen": -1.371385097503662, + "logits/rejected": -1.373175859451294, + "logps/chosen": -2.55993390083313, + "logps/rejected": -3.111349105834961, + "loss": 22.3814, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.17149809002876282, + "rewards/margins": 0.04337615519762039, + "rewards/rejected": -0.214874267578125, + "step": 380 + }, + { + "epoch": 0.2422907488986784, + "grad_norm": 126.74950408935547, + "learning_rate": 3.772949693404954e-05, + "logits/chosen": -1.33748459815979, + "logits/rejected": -1.3754979372024536, + "logps/chosen": -2.633439064025879, + "logps/rejected": -3.534024715423584, + "loss": 20.4661, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.17298361659049988, + "rewards/margins": 0.07067564874887466, + "rewards/rejected": -0.24365928769111633, + "step": 385 + }, + { + "epoch": 0.24543738200125864, + "grad_norm": 90.40318298339844, + "learning_rate": 3.762676109753919e-05, + "logits/chosen": -1.2709859609603882, + "logits/rejected": -1.294306755065918, + "logps/chosen": -3.954099655151367, + "logps/rejected": -5.9721527099609375, + "loss": 21.932, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.29533275961875916, + "rewards/margins": 0.12940457463264465, + "rewards/rejected": -0.4247373640537262, + "step": 390 + }, + { + "epoch": 0.2485840151038389, + "grad_norm": 84.59414672851562, + "learning_rate": 3.7521898416077565e-05, + "logits/chosen": -1.4984506368637085, + "logits/rejected": -1.5229644775390625, + "logps/chosen": -4.4091901779174805, + "logps/rejected": -5.3940815925598145, + "loss": 21.5859, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3109613358974457, + "rewards/margins": 0.08055521547794342, + "rewards/rejected": -0.3915165364742279, + "step": 395 + }, + { + "epoch": 0.2517306482064191, + "grad_norm": 120.28202056884766, + "learning_rate": 3.7414921542395546e-05, + "logits/chosen": -1.5182693004608154, + "logits/rejected": -1.5193490982055664, + "logps/chosen": -4.545083045959473, + "logps/rejected": -5.492725372314453, + "loss": 21.539, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.36579376459121704, + "rewards/margins": 0.06641928851604462, + "rewards/rejected": -0.4322130084037781, + "step": 400 + }, + { + "epoch": 0.2548772813089994, + "grad_norm": 143.28396606445312, + "learning_rate": 3.7305843384322466e-05, + "logits/chosen": -1.5114035606384277, + "logits/rejected": -1.5092270374298096, + "logps/chosen": -5.28603982925415, + "logps/rejected": -6.232533931732178, + "loss": 21.4891, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.43935927748680115, + "rewards/margins": 0.08039890229701996, + "rewards/rejected": -0.5197581648826599, + "step": 405 + }, + { + "epoch": 0.2580239144115796, + "grad_norm": 129.09864807128906, + "learning_rate": 3.71946771032286e-05, + "logits/chosen": -1.6940416097640991, + "logits/rejected": -1.6389005184173584, + "logps/chosen": -5.122313022613525, + "logps/rejected": -6.010600566864014, + "loss": 21.8681, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.42212480306625366, + "rewards/margins": 0.076592817902565, + "rewards/rejected": -0.49871763586997986, + "step": 410 + }, + { + "epoch": 0.26117054751415986, + "grad_norm": 1118.02392578125, + "learning_rate": 3.708143611243716e-05, + "logits/chosen": -1.65127432346344, + "logits/rejected": -1.6758639812469482, + "logps/chosen": -5.203777313232422, + "logps/rejected": -6.3162078857421875, + "loss": 21.2512, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.37822961807250977, + "rewards/margins": 0.09629149734973907, + "rewards/rejected": -0.4745211601257324, + "step": 415 + }, + { + "epoch": 0.2643171806167401, + "grad_norm": 109.98821258544922, + "learning_rate": 3.696613407560582e-05, + "logits/chosen": -1.6237115859985352, + "logits/rejected": -1.5712984800338745, + "logps/chosen": -4.632975101470947, + "logps/rejected": -6.082078456878662, + "loss": 20.9477, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.3740273118019104, + "rewards/margins": 0.103847935795784, + "rewards/rejected": -0.4778752326965332, + "step": 420 + }, + { + "epoch": 0.2674638137193203, + "grad_norm": 95.2988052368164, + "learning_rate": 3.684878490507808e-05, + "logits/chosen": -1.5806386470794678, + "logits/rejected": -1.6192169189453125, + "logps/chosen": -4.849827766418457, + "logps/rejected": -5.800168037414551, + "loss": 23.5806, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.3749791085720062, + "rewards/margins": 0.07270670682191849, + "rewards/rejected": -0.4476858079433441, + "step": 425 + }, + { + "epoch": 0.27061044682190055, + "grad_norm": 111.99176788330078, + "learning_rate": 3.6729402760204535e-05, + "logits/chosen": -1.6522388458251953, + "logits/rejected": -1.6433773040771484, + "logps/chosen": -3.4129672050476074, + "logps/rejected": -4.362156867980957, + "loss": 21.9253, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.25616371631622314, + "rewards/margins": 0.07649270445108414, + "rewards/rejected": -0.3326564431190491, + "step": 430 + }, + { + "epoch": 0.2737570799244808, + "grad_norm": 219.88124084472656, + "learning_rate": 3.6608002045634535e-05, + "logits/chosen": -1.7825971841812134, + "logits/rejected": -1.6959110498428345, + "logps/chosen": -3.785250425338745, + "logps/rejected": -4.989777565002441, + "loss": 22.1928, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.26640018820762634, + "rewards/margins": 0.07046084105968475, + "rewards/rejected": -0.3368610143661499, + "step": 435 + }, + { + "epoch": 0.27690371302706107, + "grad_norm": 110.93528747558594, + "learning_rate": 3.6484597409577975e-05, + "logits/chosen": -1.8389028310775757, + "logits/rejected": -1.7533693313598633, + "logps/chosen": -3.4091110229492188, + "logps/rejected": -4.324118614196777, + "loss": 21.2394, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.25699272751808167, + "rewards/margins": 0.06507135927677155, + "rewards/rejected": -0.322064071893692, + "step": 440 + }, + { + "epoch": 0.2800503461296413, + "grad_norm": 128.312255859375, + "learning_rate": 3.6359203742037966e-05, + "logits/chosen": -1.8402115106582642, + "logits/rejected": -1.7344493865966797, + "logps/chosen": -4.041749000549316, + "logps/rejected": -4.417330741882324, + "loss": 22.7853, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2921445965766907, + "rewards/margins": 0.04909106716513634, + "rewards/rejected": -0.3412356376647949, + "step": 445 + }, + { + "epoch": 0.28319697923222154, + "grad_norm": 121.12706756591797, + "learning_rate": 3.623183617301411e-05, + "logits/chosen": -1.7311460971832275, + "logits/rejected": -1.7096502780914307, + "logps/chosen": -3.8819706439971924, + "logps/rejected": -4.670237064361572, + "loss": 22.6275, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.30139902234077454, + "rewards/margins": 0.05851779133081436, + "rewards/rejected": -0.3599168360233307, + "step": 450 + }, + { + "epoch": 0.28634361233480177, + "grad_norm": 93.03048706054688, + "learning_rate": 3.610251007067699e-05, + "logits/chosen": -1.836363434791565, + "logits/rejected": -1.736104965209961, + "logps/chosen": -4.1447577476501465, + "logps/rejected": -4.325010299682617, + "loss": 26.2728, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.32724231481552124, + "rewards/margins": 0.010385597124695778, + "rewards/rejected": -0.33762794733047485, + "step": 455 + }, + { + "epoch": 0.289490245437382, + "grad_norm": 76.58390808105469, + "learning_rate": 3.597124103951379e-05, + "logits/chosen": -1.7278220653533936, + "logits/rejected": -1.7181174755096436, + "logps/chosen": -4.0262017250061035, + "logps/rejected": -4.855641841888428, + "loss": 22.3804, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2886626124382019, + "rewards/margins": 0.06016182899475098, + "rewards/rejected": -0.3488244414329529, + "step": 460 + }, + { + "epoch": 0.29263687853996223, + "grad_norm": 80.33660888671875, + "learning_rate": 3.583804491844551e-05, + "logits/chosen": -1.8658571243286133, + "logits/rejected": -1.7413606643676758, + "logps/chosen": -3.758129835128784, + "logps/rejected": -4.306906223297119, + "loss": 26.088, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2766272723674774, + "rewards/margins": 0.03810672461986542, + "rewards/rejected": -0.31473398208618164, + "step": 465 + }, + { + "epoch": 0.29578351164254246, + "grad_norm": 66.17215728759766, + "learning_rate": 3.5702937778915765e-05, + "logits/chosen": -1.8694692850112915, + "logits/rejected": -1.82939875125885, + "logps/chosen": -2.9322712421417236, + "logps/rejected": -3.7157013416290283, + "loss": 21.7852, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.2061152458190918, + "rewards/margins": 0.056372471153736115, + "rewards/rejected": -0.2624877095222473, + "step": 470 + }, + { + "epoch": 0.2989301447451227, + "grad_norm": 95.2267837524414, + "learning_rate": 3.556593592295171e-05, + "logits/chosen": -1.8632274866104126, + "logits/rejected": -1.8683363199234009, + "logps/chosen": -2.8304595947265625, + "logps/rejected": -3.464296817779541, + "loss": 22.1458, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.19707690179347992, + "rewards/margins": 0.04870922490954399, + "rewards/rejected": -0.24578611552715302, + "step": 475 + }, + { + "epoch": 0.302076777847703, + "grad_norm": 128.1005096435547, + "learning_rate": 3.5427055881196946e-05, + "logits/chosen": -1.7504918575286865, + "logits/rejected": -1.8846075534820557, + "logps/chosen": -2.7551674842834473, + "logps/rejected": -3.501314163208008, + "loss": 21.4037, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.192325159907341, + "rewards/margins": 0.05459358170628548, + "rewards/rejected": -0.2469187080860138, + "step": 480 + }, + { + "epoch": 0.3052234109502832, + "grad_norm": 64.81920623779297, + "learning_rate": 3.5286314410916967e-05, + "logits/chosen": -1.8015562295913696, + "logits/rejected": -1.9157085418701172, + "logps/chosen": -3.297150135040283, + "logps/rejected": -4.347265243530273, + "loss": 20.2599, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.24196556210517883, + "rewards/margins": 0.06687469035387039, + "rewards/rejected": -0.30884024500846863, + "step": 485 + }, + { + "epoch": 0.30837004405286345, + "grad_norm": 121.4966812133789, + "learning_rate": 3.5143728493977245e-05, + "logits/chosen": -1.7404873371124268, + "logits/rejected": -1.8498218059539795, + "logps/chosen": -3.553678035736084, + "logps/rejected": -4.084536075592041, + "loss": 24.4702, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.2725631594657898, + "rewards/margins": 0.037132084369659424, + "rewards/rejected": -0.3096952736377716, + "step": 490 + }, + { + "epoch": 0.3115166771554437, + "grad_norm": 102.46180725097656, + "learning_rate": 3.499931533479417e-05, + "logits/chosen": -1.7682313919067383, + "logits/rejected": -1.7660820484161377, + "logps/chosen": -3.595475435256958, + "logps/rejected": -4.801576137542725, + "loss": 20.9722, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.2746976315975189, + "rewards/margins": 0.10004003345966339, + "rewards/rejected": -0.3747376501560211, + "step": 495 + }, + { + "epoch": 0.3146633102580239, + "grad_norm": 100.82923889160156, + "learning_rate": 3.485309235825916e-05, + "logits/chosen": -1.7638380527496338, + "logits/rejected": -1.857962965965271, + "logps/chosen": -4.1785569190979, + "logps/rejected": -5.445678234100342, + "loss": 20.121, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.30823373794555664, + "rewards/margins": 0.09736496210098267, + "rewards/rejected": -0.4055987298488617, + "step": 500 + }, + { + "epoch": 0.31780994336060414, + "grad_norm": 299.635009765625, + "learning_rate": 3.470507720763625e-05, + "logits/chosen": -1.7603092193603516, + "logits/rejected": -1.8294856548309326, + "logps/chosen": -3.818953037261963, + "logps/rejected": -4.965951442718506, + "loss": 24.0421, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2867090702056885, + "rewards/margins": 0.09908684343099594, + "rewards/rejected": -0.385795921087265, + "step": 505 + }, + { + "epoch": 0.3209565764631844, + "grad_norm": 121.77188110351562, + "learning_rate": 3.4555287742433115e-05, + "logits/chosen": -1.8968608379364014, + "logits/rejected": -1.863628625869751, + "logps/chosen": -3.3851046562194824, + "logps/rejected": -4.313992500305176, + "loss": 21.5651, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2504531145095825, + "rewards/margins": 0.07505444437265396, + "rewards/rejected": -0.3255075514316559, + "step": 510 + }, + { + "epoch": 0.3241032095657646, + "grad_norm": 84.7723617553711, + "learning_rate": 3.440374203624628e-05, + "logits/chosen": -1.8949018716812134, + "logits/rejected": -2.03389573097229, + "logps/chosen": -3.739046573638916, + "logps/rejected": -4.937285423278809, + "loss": 22.0895, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2827950417995453, + "rewards/margins": 0.07987246662378311, + "rewards/rejected": -0.3626675605773926, + "step": 515 + }, + { + "epoch": 0.3272498426683449, + "grad_norm": 96.02967071533203, + "learning_rate": 3.425045837458028e-05, + "logits/chosen": -1.9336235523223877, + "logits/rejected": -1.9811556339263916, + "logps/chosen": -3.5748794078826904, + "logps/rejected": -4.64247465133667, + "loss": 20.7454, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2698992192745209, + "rewards/margins": 0.07278282940387726, + "rewards/rejected": -0.3426820635795593, + "step": 520 + }, + { + "epoch": 0.3303964757709251, + "grad_norm": 138.71051025390625, + "learning_rate": 3.4095455252641376e-05, + "logits/chosen": -1.938104271888733, + "logits/rejected": -2.024137020111084, + "logps/chosen": -4.332060813903809, + "logps/rejected": -5.391437530517578, + "loss": 23.3511, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3168641924858093, + "rewards/margins": 0.049729883670806885, + "rewards/rejected": -0.3665940761566162, + "step": 525 + }, + { + "epoch": 0.33354310887350536, + "grad_norm": 93.8726577758789, + "learning_rate": 3.393875137310588e-05, + "logits/chosen": -1.8752260208129883, + "logits/rejected": -1.8945411443710327, + "logps/chosen": -4.053868770599365, + "logps/rejected": -5.044325828552246, + "loss": 21.8528, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.3227534890174866, + "rewards/margins": 0.0821223258972168, + "rewards/rejected": -0.4048757553100586, + "step": 530 + }, + { + "epoch": 0.3366897419760856, + "grad_norm": 261.39129638671875, + "learning_rate": 3.378036564386349e-05, + "logits/chosen": -1.770957589149475, + "logits/rejected": -1.8808790445327759, + "logps/chosen": -3.8808326721191406, + "logps/rejected": -4.960693836212158, + "loss": 23.7267, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.3041539788246155, + "rewards/margins": 0.08733677119016647, + "rewards/rejected": -0.39149072766304016, + "step": 535 + }, + { + "epoch": 0.3398363750786658, + "grad_norm": 141.79991149902344, + "learning_rate": 3.3620317175735945e-05, + "logits/chosen": -1.929517149925232, + "logits/rejected": -1.8599262237548828, + "logps/chosen": -4.427219867706299, + "logps/rejected": -5.757664680480957, + "loss": 20.8591, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.3481447994709015, + "rewards/margins": 0.0858476310968399, + "rewards/rejected": -0.4339924454689026, + "step": 540 + }, + { + "epoch": 0.34298300818124605, + "grad_norm": 76.495361328125, + "learning_rate": 3.345862528017101e-05, + "logits/chosen": -1.8648240566253662, + "logits/rejected": -1.899430513381958, + "logps/chosen": -4.430551528930664, + "logps/rejected": -5.134209156036377, + "loss": 21.6823, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3582889139652252, + "rewards/margins": 0.05610053986310959, + "rewards/rejected": -0.4143894612789154, + "step": 545 + }, + { + "epoch": 0.3461296412838263, + "grad_norm": 65.95896911621094, + "learning_rate": 3.32953094669124e-05, + "logits/chosen": -1.6951459646224976, + "logits/rejected": -1.7398831844329834, + "logps/chosen": -5.35291051864624, + "logps/rejected": -6.347973823547363, + "loss": 24.8551, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4343182146549225, + "rewards/margins": 0.085027314722538, + "rewards/rejected": -0.5193454623222351, + "step": 550 + }, + { + "epoch": 0.34927627438640657, + "grad_norm": 64.50738525390625, + "learning_rate": 3.313038944164577e-05, + "logits/chosen": -1.7779582738876343, + "logits/rejected": -1.8077032566070557, + "logps/chosen": -4.008457183837891, + "logps/rejected": -5.838412761688232, + "loss": 19.2472, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3185553550720215, + "rewards/margins": 0.10776933282613754, + "rewards/rejected": -0.4263246953487396, + "step": 555 + }, + { + "epoch": 0.3524229074889868, + "grad_norm": 62.579227447509766, + "learning_rate": 3.296388510362095e-05, + "logits/chosen": -1.5932537317276, + "logits/rejected": -1.7019790410995483, + "logps/chosen": -4.049741268157959, + "logps/rejected": -4.859818935394287, + "loss": 21.4107, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.29325228929519653, + "rewards/margins": 0.06688085943460464, + "rewards/rejected": -0.36013317108154297, + "step": 560 + }, + { + "epoch": 0.35556954059156703, + "grad_norm": 105.9216079711914, + "learning_rate": 3.2795816543250977e-05, + "logits/chosen": -1.5411794185638428, + "logits/rejected": -1.5789968967437744, + "logps/chosen": -3.8824076652526855, + "logps/rejected": -4.560225486755371, + "loss": 23.1195, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.2929798662662506, + "rewards/margins": 0.05188722163438797, + "rewards/rejected": -0.34486711025238037, + "step": 565 + }, + { + "epoch": 0.35871617369414727, + "grad_norm": 55.46923065185547, + "learning_rate": 3.262620403968792e-05, + "logits/chosen": -1.5855820178985596, + "logits/rejected": -1.7370961904525757, + "logps/chosen": -3.6918272972106934, + "logps/rejected": -5.205948352813721, + "loss": 19.1367, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.27848196029663086, + "rewards/margins": 0.11322972923517227, + "rewards/rejected": -0.3917117416858673, + "step": 570 + }, + { + "epoch": 0.3618628067967275, + "grad_norm": 114.82603454589844, + "learning_rate": 3.245506805837605e-05, + "logits/chosen": -1.6395822763442993, + "logits/rejected": -1.8543764352798462, + "logps/chosen": -4.298351287841797, + "logps/rejected": -5.546226501464844, + "loss": 19.9406, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.30993199348449707, + "rewards/margins": 0.08511951565742493, + "rewards/rejected": -0.3950514793395996, + "step": 575 + }, + { + "epoch": 0.36500943989930773, + "grad_norm": 174.55496215820312, + "learning_rate": 3.228242924858248e-05, + "logits/chosen": -1.5872471332550049, + "logits/rejected": -1.688132882118225, + "logps/chosen": -4.568819999694824, + "logps/rejected": -5.411607265472412, + "loss": 22.4314, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.34597450494766235, + "rewards/margins": 0.07728902995586395, + "rewards/rejected": -0.4232635498046875, + "step": 580 + }, + { + "epoch": 0.36815607300188796, + "grad_norm": 70.5542221069336, + "learning_rate": 3.210830844090555e-05, + "logits/chosen": -1.6192104816436768, + "logits/rejected": -1.6785539388656616, + "logps/chosen": -5.1252007484436035, + "logps/rejected": -5.851187705993652, + "loss": 25.8619, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.40460100769996643, + "rewards/margins": 0.06072293594479561, + "rewards/rejected": -0.46532392501831055, + "step": 585 + }, + { + "epoch": 0.3713027061044682, + "grad_norm": 100.62268829345703, + "learning_rate": 3.193272664476152e-05, + "logits/chosen": -1.7602649927139282, + "logits/rejected": -1.9346716403961182, + "logps/chosen": -4.961272239685059, + "logps/rejected": -5.8130645751953125, + "loss": 22.8852, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.3982604444026947, + "rewards/margins": 0.059664536267519, + "rewards/rejected": -0.457925021648407, + "step": 590 + }, + { + "epoch": 0.3744493392070485, + "grad_norm": 411.0801696777344, + "learning_rate": 3.1755705045849465e-05, + "logits/chosen": -1.7633399963378906, + "logits/rejected": -1.818737268447876, + "logps/chosen": -5.510100364685059, + "logps/rejected": -6.382575035095215, + "loss": 23.8471, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4236491620540619, + "rewards/margins": 0.06903719902038574, + "rewards/rejected": -0.49268636107444763, + "step": 595 + }, + { + "epoch": 0.3775959723096287, + "grad_norm": 98.035888671875, + "learning_rate": 3.157726500359509e-05, + "logits/chosen": -1.825554609298706, + "logits/rejected": -1.907472014427185, + "logps/chosen": -5.569567680358887, + "logps/rejected": -6.1025004386901855, + "loss": 24.087, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.4460601210594177, + "rewards/margins": 0.03472483158111572, + "rewards/rejected": -0.48078498244285583, + "step": 600 + }, + { + "epoch": 0.38074260541220895, + "grad_norm": 80.47187805175781, + "learning_rate": 3.1397428048573465e-05, + "logits/chosen": -1.798015832901001, + "logits/rejected": -1.9216489791870117, + "logps/chosen": -4.644695281982422, + "logps/rejected": -5.7896294593811035, + "loss": 19.835, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.380901575088501, + "rewards/margins": 0.08407244086265564, + "rewards/rejected": -0.4649740159511566, + "step": 605 + }, + { + "epoch": 0.3838892385147892, + "grad_norm": 65.88395690917969, + "learning_rate": 3.121621587991113e-05, + "logits/chosen": -1.9489303827285767, + "logits/rejected": -1.9782030582427979, + "logps/chosen": -4.736275672912598, + "logps/rejected": -5.893181800842285, + "loss": 21.2523, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.37491172552108765, + "rewards/margins": 0.09068160504102707, + "rewards/rejected": -0.46559327840805054, + "step": 610 + }, + { + "epoch": 0.3870358716173694, + "grad_norm": 126.57975769042969, + "learning_rate": 3.1033650362667935e-05, + "logits/chosen": -1.945927619934082, + "logits/rejected": -2.0246009826660156, + "logps/chosen": -4.42104434967041, + "logps/rejected": -5.623631000518799, + "loss": 20.477, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.3376965820789337, + "rewards/margins": 0.07996558398008347, + "rewards/rejected": -0.41766220331192017, + "step": 615 + }, + { + "epoch": 0.39018250471994964, + "grad_norm": 88.92438507080078, + "learning_rate": 3.084975352519874e-05, + "logits/chosen": -2.063378095626831, + "logits/rejected": -2.161208391189575, + "logps/chosen": -4.2682085037231445, + "logps/rejected": -5.291066646575928, + "loss": 22.2295, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3386593759059906, + "rewards/margins": 0.07158732414245605, + "rewards/rejected": -0.41024675965309143, + "step": 620 + }, + { + "epoch": 0.3933291378225299, + "grad_norm": 53.47737503051758, + "learning_rate": 3.06645475564955e-05, + "logits/chosen": -1.9409205913543701, + "logits/rejected": -2.0371243953704834, + "logps/chosen": -3.6241352558135986, + "logps/rejected": -5.033164978027344, + "loss": 20.5698, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.27396219968795776, + "rewards/margins": 0.09085332602262497, + "rewards/rejected": -0.36481553316116333, + "step": 625 + }, + { + "epoch": 0.3964757709251101, + "grad_norm": 87.2447738647461, + "learning_rate": 3.0478054803509975e-05, + "logits/chosen": -1.9413238763809204, + "logits/rejected": -1.989638328552246, + "logps/chosen": -3.974926710128784, + "logps/rejected": -5.115756034851074, + "loss": 20.8679, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.3056657314300537, + "rewards/margins": 0.09486590325832367, + "rewards/rejected": -0.4005316197872162, + "step": 630 + }, + { + "epoch": 0.3996224040276904, + "grad_norm": 105.37754821777344, + "learning_rate": 3.029029776845726e-05, + "logits/chosen": -1.9769777059555054, + "logits/rejected": -2.0631349086761475, + "logps/chosen": -4.811491012573242, + "logps/rejected": -6.024916648864746, + "loss": 22.3949, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.36858421564102173, + "rewards/margins": 0.09452919661998749, + "rewards/rejected": -0.463113397359848, + "step": 635 + }, + { + "epoch": 0.4027690371302706, + "grad_norm": 107.63380432128906, + "learning_rate": 3.0101299106100766e-05, + "logits/chosen": -1.9259755611419678, + "logits/rejected": -2.0011420249938965, + "logps/chosen": -4.672276496887207, + "logps/rejected": -5.433979034423828, + "loss": 23.4548, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.34038934111595154, + "rewards/margins": 0.05264373868703842, + "rewards/rejected": -0.39303308725357056, + "step": 640 + }, + { + "epoch": 0.40591567023285086, + "grad_norm": 72.93191528320312, + "learning_rate": 2.991108162101862e-05, + "logits/chosen": -1.8639154434204102, + "logits/rejected": -2.00860333442688, + "logps/chosen": -4.0379438400268555, + "logps/rejected": -4.966481685638428, + "loss": 24.2063, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.3016512095928192, + "rewards/margins": 0.05989114195108414, + "rewards/rejected": -0.36154234409332275, + "step": 645 + }, + { + "epoch": 0.4090623033354311, + "grad_norm": 241.30491638183594, + "learning_rate": 2.971966826485212e-05, + "logits/chosen": -2.0276923179626465, + "logits/rejected": -2.075092077255249, + "logps/chosen": -3.9584078788757324, + "logps/rejected": -4.5398454666137695, + "loss": 22.3358, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2686071991920471, + "rewards/margins": 0.05414595082402229, + "rewards/rejected": -0.3227531313896179, + "step": 650 + }, + { + "epoch": 0.4122089364380113, + "grad_norm": 72.65229797363281, + "learning_rate": 2.952708213353636e-05, + "logits/chosen": -2.087306499481201, + "logits/rejected": -2.120595932006836, + "logps/chosen": -2.7464280128479004, + "logps/rejected": -3.2665913105010986, + "loss": 23.396, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.19495923817157745, + "rewards/margins": 0.03470323234796524, + "rewards/rejected": -0.2296624630689621, + "step": 655 + }, + { + "epoch": 0.41535556954059155, + "grad_norm": 36.565982818603516, + "learning_rate": 2.9333346464513476e-05, + "logits/chosen": -2.0568580627441406, + "logits/rejected": -2.171510934829712, + "logps/chosen": -3.1527762413024902, + "logps/rejected": -3.5696024894714355, + "loss": 23.204, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.2180822342634201, + "rewards/margins": 0.029619824141263962, + "rewards/rejected": -0.24770204722881317, + "step": 660 + }, + { + "epoch": 0.4185022026431718, + "grad_norm": 57.84255599975586, + "learning_rate": 2.9138484633928818e-05, + "logits/chosen": -1.940320372581482, + "logits/rejected": -1.9845908880233765, + "logps/chosen": -3.0434772968292236, + "logps/rejected": -3.5398964881896973, + "loss": 24.3501, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2063741683959961, + "rewards/margins": 0.023456847295165062, + "rewards/rejected": -0.2298310250043869, + "step": 665 + }, + { + "epoch": 0.42164883574575207, + "grad_norm": 56.995887756347656, + "learning_rate": 2.8942520153810396e-05, + "logits/chosen": -2.0002236366271973, + "logits/rejected": -2.08671498298645, + "logps/chosen": -2.834512710571289, + "logps/rejected": -3.5050129890441895, + "loss": 22.4039, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.18090704083442688, + "rewards/margins": 0.04532923549413681, + "rewards/rejected": -0.2262362688779831, + "step": 670 + }, + { + "epoch": 0.4247954688483323, + "grad_norm": 75.65125274658203, + "learning_rate": 2.8745476669231894e-05, + "logits/chosen": -2.020886182785034, + "logits/rejected": -2.111823558807373, + "logps/chosen": -3.5571112632751465, + "logps/rejected": -4.481097221374512, + "loss": 22.9676, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.22384686768054962, + "rewards/margins": 0.04108366742730141, + "rewards/rejected": -0.2649305462837219, + "step": 675 + }, + { + "epoch": 0.42794210195091253, + "grad_norm": 77.30415344238281, + "learning_rate": 2.8547377955459704e-05, + "logits/chosen": -1.9961265325546265, + "logits/rejected": -2.0482177734375, + "logps/chosen": -2.892690658569336, + "logps/rejected": -3.2253260612487793, + "loss": 25.6658, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19483526051044464, + "rewards/margins": 0.01912742853164673, + "rewards/rejected": -0.21396267414093018, + "step": 680 + }, + { + "epoch": 0.43108873505349277, + "grad_norm": 49.21062088012695, + "learning_rate": 2.834824791508413e-05, + "logits/chosen": -1.930086374282837, + "logits/rejected": -2.131298542022705, + "logps/chosen": -2.739534854888916, + "logps/rejected": -3.5602822303771973, + "loss": 21.1908, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.17746233940124512, + "rewards/margins": 0.06554970890283585, + "rewards/rejected": -0.24301204085350037, + "step": 685 + }, + { + "epoch": 0.434235368156073, + "grad_norm": 64.88590240478516, + "learning_rate": 2.814811057513537e-05, + "logits/chosen": -2.0517029762268066, + "logits/rejected": -2.067883253097534, + "logps/chosen": -2.82458758354187, + "logps/rejected": -3.6670260429382324, + "loss": 21.8595, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.1833667755126953, + "rewards/margins": 0.0560932457447052, + "rewards/rejected": -0.2394600361585617, + "step": 690 + }, + { + "epoch": 0.43738200125865323, + "grad_norm": 48.841331481933594, + "learning_rate": 2.7946990084184383e-05, + "logits/chosen": -1.798683524131775, + "logits/rejected": -1.9806129932403564, + "logps/chosen": -3.2995662689208984, + "logps/rejected": -4.0815110206604, + "loss": 22.0918, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2146444320678711, + "rewards/margins": 0.05965212732553482, + "rewards/rejected": -0.27429652214050293, + "step": 695 + }, + { + "epoch": 0.44052863436123346, + "grad_norm": 266.59381103515625, + "learning_rate": 2.7744910709429104e-05, + "logits/chosen": -1.800355315208435, + "logits/rejected": -1.9262745380401611, + "logps/chosen": -3.308371067047119, + "logps/rejected": -4.3786821365356445, + "loss": 22.6616, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.20500688254833221, + "rewards/margins": 0.07705695927143097, + "rewards/rejected": -0.2820638120174408, + "step": 700 + }, + { + "epoch": 0.4436752674638137, + "grad_norm": 45.74457550048828, + "learning_rate": 2.754189683376641e-05, + "logits/chosen": -1.8245214223861694, + "logits/rejected": -1.9188095331192017, + "logps/chosen": -2.6574292182922363, + "logps/rejected": -3.3347110748291016, + "loss": 21.6472, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.1792256087064743, + "rewards/margins": 0.054762959480285645, + "rewards/rejected": -0.23398856818675995, + "step": 705 + }, + { + "epoch": 0.446821900566394, + "grad_norm": 82.67216491699219, + "learning_rate": 2.7337972952850047e-05, + "logits/chosen": -1.764173150062561, + "logits/rejected": -1.9260650873184204, + "logps/chosen": -2.8055293560028076, + "logps/rejected": -3.9603447914123535, + "loss": 21.7022, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.19627173244953156, + "rewards/margins": 0.07794789969921112, + "rewards/rejected": -0.2742196321487427, + "step": 710 + }, + { + "epoch": 0.4499685336689742, + "grad_norm": 63.396240234375, + "learning_rate": 2.713316367213499e-05, + "logits/chosen": -1.6747219562530518, + "logits/rejected": -1.8347587585449219, + "logps/chosen": -2.9625911712646484, + "logps/rejected": -3.7656357288360596, + "loss": 22.6149, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.21037223935127258, + "rewards/margins": 0.05833571031689644, + "rewards/rejected": -0.26870793104171753, + "step": 715 + }, + { + "epoch": 0.45311516677155445, + "grad_norm": 118.00112915039062, + "learning_rate": 2.692749370390855e-05, + "logits/chosen": -1.7990179061889648, + "logits/rejected": -1.8915067911148071, + "logps/chosen": -3.0249316692352295, + "logps/rejected": -4.06134033203125, + "loss": 23.4425, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.21054935455322266, + "rewards/margins": 0.05246324464678764, + "rewards/rejected": -0.2630125880241394, + "step": 720 + }, + { + "epoch": 0.4562617998741347, + "grad_norm": 64.52631378173828, + "learning_rate": 2.6720987864308603e-05, + "logits/chosen": -1.695908546447754, + "logits/rejected": -1.7583353519439697, + "logps/chosen": -2.815432548522949, + "logps/rejected": -4.123710632324219, + "loss": 21.0095, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1960502415895462, + "rewards/margins": 0.08241166174411774, + "rewards/rejected": -0.27846187353134155, + "step": 725 + }, + { + "epoch": 0.4594084329767149, + "grad_norm": 59.4410285949707, + "learning_rate": 2.6513671070329244e-05, + "logits/chosen": -1.7788522243499756, + "logits/rejected": -1.8245208263397217, + "logps/chosen": -3.012934446334839, + "logps/rejected": -4.003429412841797, + "loss": 21.1484, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2150099277496338, + "rewards/margins": 0.07829871028661728, + "rewards/rejected": -0.2933086156845093, + "step": 730 + }, + { + "epoch": 0.46255506607929514, + "grad_norm": 84.89627075195312, + "learning_rate": 2.630556833681434e-05, + "logits/chosen": -1.738438606262207, + "logits/rejected": -1.8424345254898071, + "logps/chosen": -2.7983458042144775, + "logps/rejected": -4.087245941162109, + "loss": 19.2453, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19751907885074615, + "rewards/margins": 0.09776587784290314, + "rewards/rejected": -0.2952849566936493, + "step": 735 + }, + { + "epoch": 0.4657016991818754, + "grad_norm": 101.38806915283203, + "learning_rate": 2.609670477343921e-05, + "logits/chosen": -1.6957628726959229, + "logits/rejected": -1.825757384300232, + "logps/chosen": -4.030215263366699, + "logps/rejected": -5.008100509643555, + "loss": 22.1478, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.30844345688819885, + "rewards/margins": 0.0614703968167305, + "rewards/rejected": -0.36991381645202637, + "step": 740 + }, + { + "epoch": 0.46884833228445566, + "grad_norm": 101.18181610107422, + "learning_rate": 2.5887105581680905e-05, + "logits/chosen": -1.7838348150253296, + "logits/rejected": -1.7674500942230225, + "logps/chosen": -4.438131809234619, + "logps/rejected": -5.542893886566162, + "loss": 23.806, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.36128634214401245, + "rewards/margins": 0.07241909205913544, + "rewards/rejected": -0.43370547890663147, + "step": 745 + }, + { + "epoch": 0.4719949653870359, + "grad_norm": 89.2279052734375, + "learning_rate": 2.567679605177739e-05, + "logits/chosen": -1.7873433828353882, + "logits/rejected": -1.831865906715393, + "logps/chosen": -4.315898895263672, + "logps/rejected": -5.43391227722168, + "loss": 20.4258, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.32439109683036804, + "rewards/margins": 0.09124849736690521, + "rewards/rejected": -0.41563957929611206, + "step": 750 + }, + { + "epoch": 0.4751415984896161, + "grad_norm": 68.27491760253906, + "learning_rate": 2.5465801559676033e-05, + "logits/chosen": -1.716103196144104, + "logits/rejected": -1.744837999343872, + "logps/chosen": -3.913160800933838, + "logps/rejected": -5.709442615509033, + "loss": 19.3215, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.30374833941459656, + "rewards/margins": 0.12692494690418243, + "rewards/rejected": -0.4306732714176178, + "step": 755 + }, + { + "epoch": 0.47828823159219636, + "grad_norm": 149.6294708251953, + "learning_rate": 2.525414756397174e-05, + "logits/chosen": -1.7440742254257202, + "logits/rejected": -1.8239097595214844, + "logps/chosen": -3.586292266845703, + "logps/rejected": -4.596356391906738, + "loss": 19.9662, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.2702713906764984, + "rewards/margins": 0.08218260109424591, + "rewards/rejected": -0.3524540364742279, + "step": 760 + }, + { + "epoch": 0.4814348646947766, + "grad_norm": 102.944580078125, + "learning_rate": 2.504185960283512e-05, + "logits/chosen": -1.7996543645858765, + "logits/rejected": -1.8109557628631592, + "logps/chosen": -4.447735786437988, + "logps/rejected": -5.870986461639404, + "loss": 20.4207, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.35062670707702637, + "rewards/margins": 0.09269314259290695, + "rewards/rejected": -0.4433198869228363, + "step": 765 + }, + { + "epoch": 0.4845814977973568, + "grad_norm": 128.53907775878906, + "learning_rate": 2.482896329093106e-05, + "logits/chosen": -1.9051790237426758, + "logits/rejected": -1.9270706176757812, + "logps/chosen": -5.1721906661987305, + "logps/rejected": -6.744166374206543, + "loss": 19.0615, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4306749701499939, + "rewards/margins": 0.1142655462026596, + "rewards/rejected": -0.5449405312538147, + "step": 770 + }, + { + "epoch": 0.48772813089993705, + "grad_norm": 123.44400024414062, + "learning_rate": 2.4615484316328023e-05, + "logits/chosen": -1.8487358093261719, + "logits/rejected": -1.8219711780548096, + "logps/chosen": -5.741638660430908, + "logps/rejected": -7.048303127288818, + "loss": 22.6075, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.4748842120170593, + "rewards/margins": 0.09859482944011688, + "rewards/rejected": -0.5734790563583374, + "step": 775 + }, + { + "epoch": 0.4908747640025173, + "grad_norm": 97.28683471679688, + "learning_rate": 2.440144843739857e-05, + "logits/chosen": -1.8166711330413818, + "logits/rejected": -1.856359839439392, + "logps/chosen": -6.369978904724121, + "logps/rejected": -7.745943546295166, + "loss": 21.1624, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5159797072410583, + "rewards/margins": 0.09467221796512604, + "rewards/rejected": -0.610651969909668, + "step": 780 + }, + { + "epoch": 0.49402139710509757, + "grad_norm": 94.76971435546875, + "learning_rate": 2.4186881479711338e-05, + "logits/chosen": -1.8901869058609009, + "logits/rejected": -1.996917724609375, + "logps/chosen": -5.151943206787109, + "logps/rejected": -6.655333518981934, + "loss": 17.5696, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3987768888473511, + "rewards/margins": 0.11971308290958405, + "rewards/rejected": -0.5184900164604187, + "step": 785 + }, + { + "epoch": 0.4971680302076778, + "grad_norm": 362.07489013671875, + "learning_rate": 2.397180933291491e-05, + "logits/chosen": -1.6789305210113525, + "logits/rejected": -1.75827157497406, + "logps/chosen": -4.5332841873168945, + "logps/rejected": -5.266444206237793, + "loss": 22.7215, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.3525087535381317, + "rewards/margins": 0.07219593226909637, + "rewards/rejected": -0.42470473051071167, + "step": 790 + }, + { + "epoch": 0.500314663310258, + "grad_norm": 181.0984344482422, + "learning_rate": 2.375625794761401e-05, + "logits/chosen": -1.769201636314392, + "logits/rejected": -1.7219161987304688, + "logps/chosen": -4.633937358856201, + "logps/rejected": -5.043046474456787, + "loss": 26.0541, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.3703366816043854, + "rewards/margins": 0.028562629595398903, + "rewards/rejected": -0.3988993167877197, + "step": 795 + }, + { + "epoch": 0.5034612964128382, + "grad_norm": 120.9494857788086, + "learning_rate": 2.3540253332238266e-05, + "logits/chosen": -1.6151552200317383, + "logits/rejected": -1.646795630455017, + "logps/chosen": -4.029574394226074, + "logps/rejected": -5.215254783630371, + "loss": 20.2479, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.314275324344635, + "rewards/margins": 0.08437344431877136, + "rewards/rejected": -0.39864879846572876, + "step": 800 + }, + { + "epoch": 0.5066079295154186, + "grad_norm": 119.4858169555664, + "learning_rate": 2.3323821549904038e-05, + "logits/chosen": -1.670577049255371, + "logits/rejected": -1.5533939599990845, + "logps/chosen": -3.9187912940979004, + "logps/rejected": -4.743254661560059, + "loss": 23.6037, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.3001677095890045, + "rewards/margins": 0.06169123575091362, + "rewards/rejected": -0.36185896396636963, + "step": 805 + }, + { + "epoch": 0.5097545626179988, + "grad_norm": 316.2997741699219, + "learning_rate": 2.310698871526966e-05, + "logits/chosen": -1.5207440853118896, + "logits/rejected": -1.6267799139022827, + "logps/chosen": -3.097418785095215, + "logps/rejected": -4.804646015167236, + "loss": 21.8575, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.22499537467956543, + "rewards/margins": 0.11616162210702896, + "rewards/rejected": -0.3411570191383362, + "step": 810 + }, + { + "epoch": 0.512901195720579, + "grad_norm": 78.00189971923828, + "learning_rate": 2.288978099138443e-05, + "logits/chosen": -1.5745933055877686, + "logits/rejected": -1.5564606189727783, + "logps/chosen": -2.8804163932800293, + "logps/rejected": -3.5308539867401123, + "loss": 22.241, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.20254115760326385, + "rewards/margins": 0.05405501648783684, + "rewards/rejected": -0.2565961480140686, + "step": 815 + }, + { + "epoch": 0.5160478288231592, + "grad_norm": 118.51643371582031, + "learning_rate": 2.267222458653179e-05, + "logits/chosen": -1.5091989040374756, + "logits/rejected": -1.6645923852920532, + "logps/chosen": -3.255237579345703, + "logps/rejected": -4.126650333404541, + "loss": 22.0187, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.23314771056175232, + "rewards/margins": 0.06177164986729622, + "rewards/rejected": -0.29491934180259705, + "step": 820 + }, + { + "epoch": 0.5191944619257395, + "grad_norm": 68.80047607421875, + "learning_rate": 2.245434575106702e-05, + "logits/chosen": -1.525356411933899, + "logits/rejected": -1.701436996459961, + "logps/chosen": -3.166797161102295, + "logps/rejected": -4.742985248565674, + "loss": 20.3686, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2261020839214325, + "rewards/margins": 0.08829782903194427, + "rewards/rejected": -0.3143998980522156, + "step": 825 + }, + { + "epoch": 0.5223410950283197, + "grad_norm": 73.1375503540039, + "learning_rate": 2.223617077424988e-05, + "logits/chosen": -1.6771663427352905, + "logits/rejected": -1.7121098041534424, + "logps/chosen": -3.020296573638916, + "logps/rejected": -4.426422119140625, + "loss": 20.0836, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.21738722920417786, + "rewards/margins": 0.09777109324932098, + "rewards/rejected": -0.31515830755233765, + "step": 830 + }, + { + "epoch": 0.5254877281309, + "grad_norm": 76.68984985351562, + "learning_rate": 2.2017725981072536e-05, + "logits/chosen": -1.4603363275527954, + "logits/rejected": -1.5595886707305908, + "logps/chosen": -3.6973624229431152, + "logps/rejected": -5.027807712554932, + "loss": 20.512, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.2720819115638733, + "rewards/margins": 0.08642515540122986, + "rewards/rejected": -0.35850709676742554, + "step": 835 + }, + { + "epoch": 0.5286343612334802, + "grad_norm": 122.99668884277344, + "learning_rate": 2.1799037729083213e-05, + "logits/chosen": -1.5949891805648804, + "logits/rejected": -1.7137962579727173, + "logps/chosen": -3.5109829902648926, + "logps/rejected": -4.95348596572876, + "loss": 21.517, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.27030450105667114, + "rewards/margins": 0.09910550713539124, + "rewards/rejected": -0.36940997838974, + "step": 840 + }, + { + "epoch": 0.5317809943360604, + "grad_norm": 65.23582458496094, + "learning_rate": 2.1580132405205862e-05, + "logits/chosen": -1.4871020317077637, + "logits/rejected": -1.5624678134918213, + "logps/chosen": -4.474881172180176, + "logps/rejected": -5.375269412994385, + "loss": 23.3138, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3389451503753662, + "rewards/margins": 0.06582923233509064, + "rewards/rejected": -0.40477436780929565, + "step": 845 + }, + { + "epoch": 0.5349276274386406, + "grad_norm": 175.08432006835938, + "learning_rate": 2.1361036422556337e-05, + "logits/chosen": -1.5353832244873047, + "logits/rejected": -1.596407175064087, + "logps/chosen": -3.814873218536377, + "logps/rejected": -4.92036771774292, + "loss": 21.5442, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2893931567668915, + "rewards/margins": 0.07075894623994827, + "rewards/rejected": -0.36015206575393677, + "step": 850 + }, + { + "epoch": 0.5380742605412209, + "grad_norm": 64.21197509765625, + "learning_rate": 2.1141776217255365e-05, + "logits/chosen": -1.567317247390747, + "logits/rejected": -1.5555747747421265, + "logps/chosen": -3.8906242847442627, + "logps/rejected": -4.897479057312012, + "loss": 21.8379, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.29526472091674805, + "rewards/margins": 0.06354343891143799, + "rewards/rejected": -0.35880815982818604, + "step": 855 + }, + { + "epoch": 0.5412208936438011, + "grad_norm": 104.57052612304688, + "learning_rate": 2.0922378245238787e-05, + "logits/chosen": -1.5869696140289307, + "logits/rejected": -1.6049997806549072, + "logps/chosen": -3.8140482902526855, + "logps/rejected": -4.755133628845215, + "loss": 23.1968, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.29255491495132446, + "rewards/margins": 0.052004069089889526, + "rewards/rejected": -0.3445590138435364, + "step": 860 + }, + { + "epoch": 0.5443675267463813, + "grad_norm": 92.2053451538086, + "learning_rate": 2.070286897906537e-05, + "logits/chosen": -1.602929711341858, + "logits/rejected": -1.6071062088012695, + "logps/chosen": -3.990319013595581, + "logps/rejected": -5.2248215675354, + "loss": 20.3706, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.3073904812335968, + "rewards/margins": 0.09087739139795303, + "rewards/rejected": -0.39826786518096924, + "step": 865 + }, + { + "epoch": 0.5475141598489616, + "grad_norm": 83.128662109375, + "learning_rate": 2.0483274904722647e-05, + "logits/chosen": -1.7051680088043213, + "logits/rejected": -1.6087182760238647, + "logps/chosen": -3.986027956008911, + "logps/rejected": -4.851881980895996, + "loss": 21.4848, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.30843600630760193, + "rewards/margins": 0.06898938864469528, + "rewards/rejected": -0.3774254322052002, + "step": 870 + }, + { + "epoch": 0.5506607929515418, + "grad_norm": 62.2298583984375, + "learning_rate": 2.026362251843109e-05, + "logits/chosen": -1.6034513711929321, + "logits/rejected": -1.699464201927185, + "logps/chosen": -3.4193336963653564, + "logps/rejected": -4.403960227966309, + "loss": 21.3108, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.2610613703727722, + "rewards/margins": 0.08181565254926682, + "rewards/rejected": -0.34287700057029724, + "step": 875 + }, + { + "epoch": 0.5538074260541221, + "grad_norm": 88.62437438964844, + "learning_rate": 2.004393832344711e-05, + "logits/chosen": -1.6719697713851929, + "logits/rejected": -1.5851457118988037, + "logps/chosen": -3.8325066566467285, + "logps/rejected": -5.3017473220825195, + "loss": 19.635, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3032756447792053, + "rewards/margins": 0.09231220185756683, + "rewards/rejected": -0.39558783173561096, + "step": 880 + }, + { + "epoch": 0.5569540591567024, + "grad_norm": 64.06165313720703, + "learning_rate": 1.9824248826865124e-05, + "logits/chosen": -1.5828460454940796, + "logits/rejected": -1.6327168941497803, + "logps/chosen": -4.681789398193359, + "logps/rejected": -6.566616058349609, + "loss": 18.3853, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.3667379915714264, + "rewards/margins": 0.12741395831108093, + "rewards/rejected": -0.49415192008018494, + "step": 885 + }, + { + "epoch": 0.5601006922592826, + "grad_norm": 204.93890380859375, + "learning_rate": 1.9604580536419254e-05, + "logits/chosen": -1.572584867477417, + "logits/rejected": -1.6088756322860718, + "logps/chosen": -5.441628456115723, + "logps/rejected": -7.085760593414307, + "loss": 24.9097, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.45653265714645386, + "rewards/margins": 0.0925588458776474, + "rewards/rejected": -0.5490914583206177, + "step": 890 + }, + { + "epoch": 0.5632473253618628, + "grad_norm": 162.79714965820312, + "learning_rate": 1.93849599572849e-05, + "logits/chosen": -1.6288610696792603, + "logits/rejected": -1.6398794651031494, + "logps/chosen": -5.213116645812988, + "logps/rejected": -6.9830803871154785, + "loss": 20.22, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.42777156829833984, + "rewards/margins": 0.12980665266513824, + "rewards/rejected": -0.5575782060623169, + "step": 895 + }, + { + "epoch": 0.5663939584644431, + "grad_norm": 75.16659545898438, + "learning_rate": 1.916541358888062e-05, + "logits/chosen": -1.6041675806045532, + "logits/rejected": -1.6970984935760498, + "logps/chosen": -4.644831657409668, + "logps/rejected": -5.80092716217041, + "loss": 20.4964, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.37974128127098083, + "rewards/margins": 0.09219308942556381, + "rewards/rejected": -0.47193440794944763, + "step": 900 + }, + { + "epoch": 0.5695405915670233, + "grad_norm": 110.90229797363281, + "learning_rate": 1.8945967921670676e-05, + "logits/chosen": -1.619327187538147, + "logits/rejected": -1.6541610956192017, + "logps/chosen": -5.146854400634766, + "logps/rejected": -6.011466026306152, + "loss": 22.4066, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.41492849588394165, + "rewards/margins": 0.07109946012496948, + "rewards/rejected": -0.48602795600891113, + "step": 905 + }, + { + "epoch": 0.5726872246696035, + "grad_norm": 139.65293884277344, + "learning_rate": 1.872664943396875e-05, + "logits/chosen": -1.6764265298843384, + "logits/rejected": -1.6785293817520142, + "logps/chosen": -4.107344150543213, + "logps/rejected": -5.6308698654174805, + "loss": 20.0103, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3305855095386505, + "rewards/margins": 0.11647170782089233, + "rewards/rejected": -0.44705715775489807, + "step": 910 + }, + { + "epoch": 0.5758338577721838, + "grad_norm": 147.52713012695312, + "learning_rate": 1.8507484588743025e-05, + "logits/chosen": -1.7002742290496826, + "logits/rejected": -1.7680556774139404, + "logps/chosen": -4.6784772872924805, + "logps/rejected": -5.973324775695801, + "loss": 21.0769, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3756680190563202, + "rewards/margins": 0.09194694459438324, + "rewards/rejected": -0.4676149785518646, + "step": 915 + }, + { + "epoch": 0.578980490874764, + "grad_norm": 71.16407012939453, + "learning_rate": 1.828849983042321e-05, + "logits/chosen": -1.7075554132461548, + "logits/rejected": -1.6953094005584717, + "logps/chosen": -4.460357666015625, + "logps/rejected": -5.521221160888672, + "loss": 21.7677, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.35953736305236816, + "rewards/margins": 0.08199813961982727, + "rewards/rejected": -0.44153547286987305, + "step": 920 + }, + { + "epoch": 0.5821271239773442, + "grad_norm": 114.27317810058594, + "learning_rate": 1.8069721581709697e-05, + "logits/chosen": -1.6304935216903687, + "logits/rejected": -1.6967551708221436, + "logps/chosen": -4.526963233947754, + "logps/rejected": -5.7123494148254395, + "loss": 21.5069, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.35851508378982544, + "rewards/margins": 0.07467497885227203, + "rewards/rejected": -0.4331900477409363, + "step": 925 + }, + { + "epoch": 0.5852737570799245, + "grad_norm": 71.74990844726562, + "learning_rate": 1.785117624038546e-05, + "logits/chosen": -1.704414963722229, + "logits/rejected": -1.7506616115570068, + "logps/chosen": -5.388034820556641, + "logps/rejected": -6.3465657234191895, + "loss": 21.8977, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4279704689979553, + "rewards/margins": 0.05819786712527275, + "rewards/rejected": -0.48616838455200195, + "step": 930 + }, + { + "epoch": 0.5884203901825047, + "grad_norm": 78.14295196533203, + "learning_rate": 1.763289017613085e-05, + "logits/chosen": -1.6152721643447876, + "logits/rejected": -1.640634536743164, + "logps/chosen": -4.3263750076293945, + "logps/rejected": -5.279467582702637, + "loss": 21.887, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.34328222274780273, + "rewards/margins": 0.07140573114156723, + "rewards/rejected": -0.41468796133995056, + "step": 935 + }, + { + "epoch": 0.5915670232850849, + "grad_norm": 219.88279724121094, + "learning_rate": 1.741488972734184e-05, + "logits/chosen": -1.5857679843902588, + "logits/rejected": -1.65940260887146, + "logps/chosen": -4.669988632202148, + "logps/rejected": -6.202586650848389, + "loss": 20.5667, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3540535271167755, + "rewards/margins": 0.10686901956796646, + "rewards/rejected": -0.46092256903648376, + "step": 940 + }, + { + "epoch": 0.5947136563876652, + "grad_norm": 90.00337219238281, + "learning_rate": 1.7197201197952065e-05, + "logits/chosen": -1.5206947326660156, + "logits/rejected": -1.53545343875885, + "logps/chosen": -4.086690902709961, + "logps/rejected": -4.490893363952637, + "loss": 25.9453, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.30406898260116577, + "rewards/margins": 0.034761372953653336, + "rewards/rejected": -0.3388303220272064, + "step": 945 + }, + { + "epoch": 0.5978602894902454, + "grad_norm": 79.93099212646484, + "learning_rate": 1.6979850854258938e-05, + "logits/chosen": -1.3608052730560303, + "logits/rejected": -1.4760938882827759, + "logps/chosen": -3.6326985359191895, + "logps/rejected": -5.186118125915527, + "loss": 20.6064, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2708989083766937, + "rewards/margins": 0.10907317698001862, + "rewards/rejected": -0.37997210025787354, + "step": 950 + }, + { + "epoch": 0.6010069225928257, + "grad_norm": 54.11685562133789, + "learning_rate": 1.6762864921754426e-05, + "logits/chosen": -1.3788961172103882, + "logits/rejected": -1.4954605102539062, + "logps/chosen": -3.189054250717163, + "logps/rejected": -4.365990161895752, + "loss": 20.0193, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.23121857643127441, + "rewards/margins": 0.09906688332557678, + "rewards/rejected": -0.3302854597568512, + "step": 955 + }, + { + "epoch": 0.604153555695406, + "grad_norm": 78.23949432373047, + "learning_rate": 1.654626958196059e-05, + "logits/chosen": -1.509225606918335, + "logits/rejected": -1.4755313396453857, + "logps/chosen": -4.190049648284912, + "logps/rejected": -5.553238391876221, + "loss": 18.6024, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3084833025932312, + "rewards/margins": 0.10999338328838348, + "rewards/rejected": -0.4184766709804535, + "step": 960 + }, + { + "epoch": 0.6073001887979862, + "grad_norm": 46.66254806518555, + "learning_rate": 1.633009096927062e-05, + "logits/chosen": -1.5157467126846313, + "logits/rejected": -1.6129589080810547, + "logps/chosen": -3.3808016777038574, + "logps/rejected": -4.686802864074707, + "loss": 18.8156, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.26101940870285034, + "rewards/margins": 0.10853584110736847, + "rewards/rejected": -0.3695552349090576, + "step": 965 + }, + { + "epoch": 0.6104468219005664, + "grad_norm": 76.67229461669922, + "learning_rate": 1.6114355167795407e-05, + "logits/chosen": -1.507666826248169, + "logits/rejected": -1.642401099205017, + "logps/chosen": -4.4493513107299805, + "logps/rejected": -5.8435235023498535, + "loss": 20.6314, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.32613635063171387, + "rewards/margins": 0.10264672338962555, + "rewards/rejected": -0.42878302931785583, + "step": 970 + }, + { + "epoch": 0.6135934550031467, + "grad_norm": 97.02481842041016, + "learning_rate": 1.5899088208216215e-05, + "logits/chosen": -1.501697301864624, + "logits/rejected": -1.594618558883667, + "logps/chosen": -4.284520149230957, + "logps/rejected": -4.852963447570801, + "loss": 26.4688, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.33568352460861206, + "rewards/margins": 0.03864779695868492, + "rewards/rejected": -0.37433135509490967, + "step": 975 + }, + { + "epoch": 0.6167400881057269, + "grad_norm": 176.32850646972656, + "learning_rate": 1.568431606464388e-05, + "logits/chosen": -1.595866084098816, + "logits/rejected": -1.6668930053710938, + "logps/chosen": -4.345438480377197, + "logps/rejected": -5.242307662963867, + "loss": 21.0145, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.3100406527519226, + "rewards/margins": 0.0767713412642479, + "rewards/rejected": -0.3868120312690735, + "step": 980 + }, + { + "epoch": 0.6198867212083071, + "grad_norm": 76.86431884765625, + "learning_rate": 1.547006465148471e-05, + "logits/chosen": -1.5940501689910889, + "logits/rejected": -1.7789547443389893, + "logps/chosen": -4.4857177734375, + "logps/rejected": -5.875302314758301, + "loss": 21.8847, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3244941830635071, + "rewards/margins": 0.08251677453517914, + "rewards/rejected": -0.4070109724998474, + "step": 985 + }, + { + "epoch": 0.6230333543108874, + "grad_norm": 49.81745147705078, + "learning_rate": 1.5256359820313718e-05, + "logits/chosen": -1.550085425376892, + "logits/rejected": -1.5959933996200562, + "logps/chosen": -3.699030637741089, + "logps/rejected": -4.6470842361450195, + "loss": 20.7306, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2788650095462799, + "rewards/margins": 0.0799705758690834, + "rewards/rejected": -0.3588356077671051, + "step": 990 + }, + { + "epoch": 0.6261799874134676, + "grad_norm": 81.01653289794922, + "learning_rate": 1.5043227356755292e-05, + "logits/chosen": -1.58163321018219, + "logits/rejected": -1.663260817527771, + "logps/chosen": -4.869448661804199, + "logps/rejected": -5.365525245666504, + "loss": 24.1646, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.34130221605300903, + "rewards/margins": 0.04551910609006882, + "rewards/rejected": -0.38682132959365845, + "step": 995 + }, + { + "epoch": 0.6293266205160478, + "grad_norm": 101.5945053100586, + "learning_rate": 1.4830692977371985e-05, + "logits/chosen": -1.747009038925171, + "logits/rejected": -1.7761609554290771, + "logps/chosen": -4.585317134857178, + "logps/rejected": -5.033480644226074, + "loss": 23.2309, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.3538682162761688, + "rewards/margins": 0.037090349942445755, + "rewards/rejected": -0.39095860719680786, + "step": 1000 + }, + { + "epoch": 0.632473253618628, + "grad_norm": 55.57672882080078, + "learning_rate": 1.4618782326561483e-05, + "logits/chosen": -1.7331736087799072, + "logits/rejected": -1.771627426147461, + "logps/chosen": -3.9518864154815674, + "logps/rejected": -4.847538948059082, + "loss": 20.4833, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.2925838530063629, + "rewards/margins": 0.0719093531370163, + "rewards/rejected": -0.3644932210445404, + "step": 1005 + }, + { + "epoch": 0.6356198867212083, + "grad_norm": 75.53394317626953, + "learning_rate": 1.4407520973462408e-05, + "logits/chosen": -1.7358888387680054, + "logits/rejected": -1.7642987966537476, + "logps/chosen": -4.450674057006836, + "logps/rejected": -5.2704572677612305, + "loss": 22.8124, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3556494116783142, + "rewards/margins": 0.04838743433356285, + "rewards/rejected": -0.40403684973716736, + "step": 1010 + }, + { + "epoch": 0.6387665198237885, + "grad_norm": 67.8470230102539, + "learning_rate": 1.4196934408869118e-05, + "logits/chosen": -1.8153152465820312, + "logits/rejected": -1.8065166473388672, + "logps/chosen": -5.316075325012207, + "logps/rejected": -6.770912170410156, + "loss": 21.5925, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3945319950580597, + "rewards/margins": 0.06610045582056046, + "rewards/rejected": -0.46063241362571716, + "step": 1015 + }, + { + "epoch": 0.6419131529263687, + "grad_norm": 104.53321075439453, + "learning_rate": 1.3987048042155977e-05, + "logits/chosen": -1.6470744609832764, + "logits/rejected": -1.6989984512329102, + "logps/chosen": -4.787189960479736, + "logps/rejected": -5.5443525314331055, + "loss": 22.5867, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.3872155249118805, + "rewards/margins": 0.05858270451426506, + "rewards/rejected": -0.44579824805259705, + "step": 1020 + }, + { + "epoch": 0.645059786028949, + "grad_norm": 122.49982452392578, + "learning_rate": 1.377788719821149e-05, + "logits/chosen": -1.6421356201171875, + "logits/rejected": -1.702820062637329, + "logps/chosen": -4.435242652893066, + "logps/rejected": -4.579672336578369, + "loss": 25.1424, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3478389382362366, + "rewards/margins": 0.0215731430798769, + "rewards/rejected": -0.3694121241569519, + "step": 1025 + }, + { + "epoch": 0.6482064191315292, + "grad_norm": 145.1405487060547, + "learning_rate": 1.3569477114382568e-05, + "logits/chosen": -1.6365470886230469, + "logits/rejected": -1.6954962015151978, + "logps/chosen": -4.985340595245361, + "logps/rejected": -5.898791313171387, + "loss": 21.7627, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.385195255279541, + "rewards/margins": 0.05228766053915024, + "rewards/rejected": -0.43748289346694946, + "step": 1030 + }, + { + "epoch": 0.6513530522341096, + "grad_norm": 82.04701232910156, + "learning_rate": 1.3361842937429436e-05, + "logits/chosen": -1.6654088497161865, + "logits/rejected": -1.732187032699585, + "logps/chosen": -4.262317180633545, + "logps/rejected": -5.410677909851074, + "loss": 20.2359, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.3416784703731537, + "rewards/margins": 0.08638517558574677, + "rewards/rejected": -0.42806363105773926, + "step": 1035 + }, + { + "epoch": 0.6544996853366898, + "grad_norm": 95.95136260986328, + "learning_rate": 1.3155009720491368e-05, + "logits/chosen": -1.5801721811294556, + "logits/rejected": -1.5603923797607422, + "logps/chosen": -5.278650760650635, + "logps/rejected": -6.190367698669434, + "loss": 22.4881, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3998781740665436, + "rewards/margins": 0.06602592766284943, + "rewards/rejected": -0.4659040868282318, + "step": 1040 + }, + { + "epoch": 0.65764631843927, + "grad_norm": 60.0530891418457, + "learning_rate": 1.2949002420063828e-05, + "logits/chosen": -1.6326820850372314, + "logits/rejected": -1.720810890197754, + "logps/chosen": -4.082489967346191, + "logps/rejected": -5.006215572357178, + "loss": 21.0105, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.32317107915878296, + "rewards/margins": 0.07444654405117035, + "rewards/rejected": -0.3976176679134369, + "step": 1045 + }, + { + "epoch": 0.6607929515418502, + "grad_norm": 221.81906127929688, + "learning_rate": 1.2743845892987183e-05, + "logits/chosen": -1.6526765823364258, + "logits/rejected": -1.697488784790039, + "logps/chosen": -4.53380823135376, + "logps/rejected": -5.771850109100342, + "loss": 23.2634, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.34263211488723755, + "rewards/margins": 0.07287438213825226, + "rewards/rejected": -0.4155064523220062, + "step": 1050 + }, + { + "epoch": 0.6639395846444305, + "grad_norm": 137.2283172607422, + "learning_rate": 1.2539564893447489e-05, + "logits/chosen": -1.631956696510315, + "logits/rejected": -1.654306173324585, + "logps/chosen": -4.1559600830078125, + "logps/rejected": -5.033182621002197, + "loss": 22.6183, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.32424792647361755, + "rewards/margins": 0.06618380546569824, + "rewards/rejected": -0.3904317319393158, + "step": 1055 + }, + { + "epoch": 0.6670862177470107, + "grad_norm": 72.95520782470703, + "learning_rate": 1.2336184069989663e-05, + "logits/chosen": -1.670440435409546, + "logits/rejected": -1.6872297525405884, + "logps/chosen": -3.9552032947540283, + "logps/rejected": -5.303035259246826, + "loss": 19.5681, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.31223589181900024, + "rewards/margins": 0.09164074063301086, + "rewards/rejected": -0.4038766026496887, + "step": 1060 + }, + { + "epoch": 0.6702328508495909, + "grad_norm": 90.91898345947266, + "learning_rate": 1.2133727962543356e-05, + "logits/chosen": -1.6696465015411377, + "logits/rejected": -1.6963016986846924, + "logps/chosen": -4.434679985046387, + "logps/rejected": -5.158357620239258, + "loss": 21.8675, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.3489730954170227, + "rewards/margins": 0.05557180196046829, + "rewards/rejected": -0.4045449197292328, + "step": 1065 + }, + { + "epoch": 0.6733794839521712, + "grad_norm": 185.79261779785156, + "learning_rate": 1.193222099946202e-05, + "logits/chosen": -1.6571991443634033, + "logits/rejected": -1.7073132991790771, + "logps/chosen": -4.607517242431641, + "logps/rejected": -5.376668930053711, + "loss": 22.3462, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.35802438855171204, + "rewards/margins": 0.0643647164106369, + "rewards/rejected": -0.42238911986351013, + "step": 1070 + }, + { + "epoch": 0.6765261170547514, + "grad_norm": 71.50703430175781, + "learning_rate": 1.1731687494575319e-05, + "logits/chosen": -1.585889458656311, + "logits/rejected": -1.6507800817489624, + "logps/chosen": -4.845611572265625, + "logps/rejected": -6.422255516052246, + "loss": 18.5681, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.37992939352989197, + "rewards/margins": 0.10727685689926147, + "rewards/rejected": -0.48720628023147583, + "step": 1075 + }, + { + "epoch": 0.6796727501573316, + "grad_norm": 210.3772430419922, + "learning_rate": 1.153215164425547e-05, + "logits/chosen": -1.5637327432632446, + "logits/rejected": -1.628791093826294, + "logps/chosen": -4.643498420715332, + "logps/rejected": -5.90508508682251, + "loss": 22.429, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.3530879020690918, + "rewards/margins": 0.07370196282863617, + "rewards/rejected": -0.42678990960121155, + "step": 1080 + }, + { + "epoch": 0.6828193832599119, + "grad_norm": 324.6168212890625, + "learning_rate": 1.133363752449768e-05, + "logits/chosen": -1.6127498149871826, + "logits/rejected": -1.5895841121673584, + "logps/chosen": -3.8858344554901123, + "logps/rejected": -5.141265392303467, + "loss": 18.9867, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.31075209379196167, + "rewards/margins": 0.10046511888504028, + "rewards/rejected": -0.41121721267700195, + "step": 1085 + }, + { + "epoch": 0.6859660163624921, + "grad_norm": 269.12744140625, + "learning_rate": 1.1136169088015177e-05, + "logits/chosen": -1.5152666568756104, + "logits/rejected": -1.5772387981414795, + "logps/chosen": -4.37540864944458, + "logps/rejected": -5.073463439941406, + "loss": 22.4614, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.3488185405731201, + "rewards/margins": 0.05801800638437271, + "rewards/rejected": -0.40683650970458984, + "step": 1090 + }, + { + "epoch": 0.6891126494650723, + "grad_norm": 407.53985595703125, + "learning_rate": 1.0939770161349015e-05, + "logits/chosen": -1.604278802871704, + "logits/rejected": -1.6394538879394531, + "logps/chosen": -4.725668907165527, + "logps/rejected": -6.037966728210449, + "loss": 23.0495, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3864028751850128, + "rewards/margins": 0.09246636927127838, + "rewards/rejected": -0.4788691997528076, + "step": 1095 + }, + { + "epoch": 0.6922592825676526, + "grad_norm": 65.52562713623047, + "learning_rate": 1.0744464441993205e-05, + "logits/chosen": -1.4906436204910278, + "logits/rejected": -1.570569634437561, + "logps/chosen": -4.404895782470703, + "logps/rejected": -5.454612731933594, + "loss": 21.9146, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.3443445563316345, + "rewards/margins": 0.07481996715068817, + "rewards/rejected": -0.4191645085811615, + "step": 1100 + }, + { + "epoch": 0.6954059156702328, + "grad_norm": 60.899654388427734, + "learning_rate": 1.0550275495535382e-05, + "logits/chosen": -1.5062484741210938, + "logits/rejected": -1.5998207330703735, + "logps/chosen": -5.046140193939209, + "logps/rejected": -6.212726593017578, + "loss": 22.0906, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.3926524817943573, + "rewards/margins": 0.08822645246982574, + "rewards/rejected": -0.48087891936302185, + "step": 1105 + }, + { + "epoch": 0.6985525487728131, + "grad_norm": 85.36582946777344, + "learning_rate": 1.0357226752813343e-05, + "logits/chosen": -1.48141348361969, + "logits/rejected": -1.532138705253601, + "logps/chosen": -4.955922603607178, + "logps/rejected": -6.1522979736328125, + "loss": 19.2663, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.3955458402633667, + "rewards/margins": 0.09683366119861603, + "rewards/rejected": -0.4923795163631439, + "step": 1110 + }, + { + "epoch": 0.7016991818753934, + "grad_norm": 92.5035171508789, + "learning_rate": 1.0165341507087922e-05, + "logits/chosen": -1.4898306131362915, + "logits/rejected": -1.589817762374878, + "logps/chosen": -4.877270221710205, + "logps/rejected": -6.326567649841309, + "loss": 21.0751, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3771550953388214, + "rewards/margins": 0.10272278636693954, + "rewards/rejected": -0.47987785935401917, + "step": 1115 + }, + { + "epoch": 0.7048458149779736, + "grad_norm": 100.18026733398438, + "learning_rate": 9.974642911232413e-06, + "logits/chosen": -1.5176981687545776, + "logits/rejected": -1.5406978130340576, + "logps/chosen": -5.319207191467285, + "logps/rejected": -6.242737770080566, + "loss": 20.9524, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4210137724876404, + "rewards/margins": 0.07255946844816208, + "rewards/rejected": -0.49357327818870544, + "step": 1120 + }, + { + "epoch": 0.7079924480805538, + "grad_norm": 176.3753662109375, + "learning_rate": 9.785153974938912e-06, + "logits/chosen": -1.5830824375152588, + "logits/rejected": -1.6101982593536377, + "logps/chosen": -5.879128456115723, + "logps/rejected": -6.807085990905762, + "loss": 22.111, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.44119253754615784, + "rewards/margins": 0.07570262253284454, + "rewards/rejected": -0.5168951749801636, + "step": 1125 + }, + { + "epoch": 0.7111390811831341, + "grad_norm": 67.40308380126953, + "learning_rate": 9.596897561942026e-06, + "logits/chosen": -1.463176965713501, + "logits/rejected": -1.4804832935333252, + "logps/chosen": -4.481048107147217, + "logps/rejected": -5.287797451019287, + "loss": 22.1994, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.3575854003429413, + "rewards/margins": 0.06364957243204117, + "rewards/rejected": -0.42123493552207947, + "step": 1130 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 93.39257049560547, + "learning_rate": 9.409896387260082e-06, + "logits/chosen": -1.4179964065551758, + "logits/rejected": -1.4655730724334717, + "logps/chosen": -4.708760738372803, + "logps/rejected": -6.217686653137207, + "loss": 21.4161, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.37350720167160034, + "rewards/margins": 0.10105752944946289, + "rewards/rejected": -0.47456473112106323, + "step": 1135 + }, + { + "epoch": 0.7174323473882945, + "grad_norm": 97.41583251953125, + "learning_rate": 9.224173014454372e-06, + "logits/chosen": -1.4397246837615967, + "logits/rejected": -1.4766523838043213, + "logps/chosen": -4.817109107971191, + "logps/rejected": -6.214907169342041, + "loss": 22.7104, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.384985089302063, + "rewards/margins": 0.0952010303735733, + "rewards/rejected": -0.4801861345767975, + "step": 1140 + }, + { + "epoch": 0.7205789804908748, + "grad_norm": 103.4198989868164, + "learning_rate": 9.039749852906606e-06, + "logits/chosen": -1.368666648864746, + "logits/rejected": -1.4239342212677002, + "logps/chosen": -4.382673740386963, + "logps/rejected": -5.262811183929443, + "loss": 20.8727, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.35242724418640137, + "rewards/margins": 0.075123131275177, + "rewards/rejected": -0.42755040526390076, + "step": 1145 + }, + { + "epoch": 0.723725613593455, + "grad_norm": 131.38589477539062, + "learning_rate": 8.856649155115002e-06, + "logits/chosen": -1.409711241722107, + "logits/rejected": -1.455235481262207, + "logps/chosen": -4.550191402435303, + "logps/rejected": -5.52540922164917, + "loss": 23.0103, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3620757460594177, + "rewards/margins": 0.06903600692749023, + "rewards/rejected": -0.43111175298690796, + "step": 1150 + }, + { + "epoch": 0.7268722466960352, + "grad_norm": 60.0385627746582, + "learning_rate": 8.674893014009311e-06, + "logits/chosen": -1.3705095052719116, + "logits/rejected": -1.4764083623886108, + "logps/chosen": -4.423483848571777, + "logps/rejected": -5.486600875854492, + "loss": 21.3505, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.3566300570964813, + "rewards/margins": 0.07945708185434341, + "rewards/rejected": -0.4360871911048889, + "step": 1155 + }, + { + "epoch": 0.7300188797986155, + "grad_norm": 80.497802734375, + "learning_rate": 8.494503360285084e-06, + "logits/chosen": -1.406087875366211, + "logits/rejected": -1.5597848892211914, + "logps/chosen": -4.28043270111084, + "logps/rejected": -5.639766216278076, + "loss": 21.9094, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3276395797729492, + "rewards/margins": 0.07139433920383453, + "rewards/rejected": -0.39903393387794495, + "step": 1160 + }, + { + "epoch": 0.7331655129011957, + "grad_norm": 106.78560638427734, + "learning_rate": 8.315501959757506e-06, + "logits/chosen": -1.4479920864105225, + "logits/rejected": -1.530386209487915, + "logps/chosen": -5.356269836425781, + "logps/rejected": -6.295357704162598, + "loss": 20.2622, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.39192715287208557, + "rewards/margins": 0.07843243330717087, + "rewards/rejected": -0.47035956382751465, + "step": 1165 + }, + { + "epoch": 0.7363121460037759, + "grad_norm": 70.2252426147461, + "learning_rate": 8.137910410735119e-06, + "logits/chosen": -1.3913201093673706, + "logits/rejected": -1.5211797952651978, + "logps/chosen": -4.186515808105469, + "logps/rejected": -5.630705833435059, + "loss": 19.5955, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3086017966270447, + "rewards/margins": 0.1026659831404686, + "rewards/rejected": -0.4112677574157715, + "step": 1170 + }, + { + "epoch": 0.7394587791063562, + "grad_norm": 192.9811553955078, + "learning_rate": 7.961750141413811e-06, + "logits/chosen": -1.4113714694976807, + "logits/rejected": -1.4863709211349487, + "logps/chosen": -4.043957710266113, + "logps/rejected": -4.903926849365234, + "loss": 21.1766, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.30225270986557007, + "rewards/margins": 0.0713002160191536, + "rewards/rejected": -0.3735528886318207, + "step": 1175 + }, + { + "epoch": 0.7426054122089364, + "grad_norm": 120.66477966308594, + "learning_rate": 7.787042407291236e-06, + "logits/chosen": -1.4459470510482788, + "logits/rejected": -1.4732497930526733, + "logps/chosen": -4.194180488586426, + "logps/rejected": -5.103634834289551, + "loss": 21.7414, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.33040323853492737, + "rewards/margins": 0.07123108208179474, + "rewards/rejected": -0.4016343653202057, + "step": 1180 + }, + { + "epoch": 0.7457520453115167, + "grad_norm": 76.2223892211914, + "learning_rate": 7.613808288602185e-06, + "logits/chosen": -1.3101516962051392, + "logits/rejected": -1.410070776939392, + "logps/chosen": -3.897928237915039, + "logps/rejected": -4.853459358215332, + "loss": 20.4936, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.30166110396385193, + "rewards/margins": 0.07222743332386017, + "rewards/rejected": -0.3738885223865509, + "step": 1185 + }, + { + "epoch": 0.748898678414097, + "grad_norm": 71.2408676147461, + "learning_rate": 7.442068687774983e-06, + "logits/chosen": -1.3900350332260132, + "logits/rejected": -1.4306429624557495, + "logps/chosen": -4.03500509262085, + "logps/rejected": -4.971550941467285, + "loss": 20.8514, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.31287750601768494, + "rewards/margins": 0.07024455070495605, + "rewards/rejected": -0.383122056722641, + "step": 1190 + }, + { + "epoch": 0.7520453115166772, + "grad_norm": 174.92535400390625, + "learning_rate": 7.271844326909465e-06, + "logits/chosen": -1.3968006372451782, + "logits/rejected": -1.3997862339019775, + "logps/chosen": -4.94242000579834, + "logps/rejected": -5.543642520904541, + "loss": 23.6965, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.36712104082107544, + "rewards/margins": 0.041298940777778625, + "rewards/rejected": -0.40841999650001526, + "step": 1195 + }, + { + "epoch": 0.7551919446192574, + "grad_norm": 83.12405395507812, + "learning_rate": 7.1031557452765934e-06, + "logits/chosen": -1.4155142307281494, + "logits/rejected": -1.4555690288543701, + "logps/chosen": -3.987143039703369, + "logps/rejected": -5.240988731384277, + "loss": 20.4557, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.312110960483551, + "rewards/margins": 0.08850479125976562, + "rewards/rejected": -0.40061575174331665, + "step": 1200 + }, + { + "epoch": 0.7583385777218377, + "grad_norm": 82.25894165039062, + "learning_rate": 6.936023296840211e-06, + "logits/chosen": -1.3227570056915283, + "logits/rejected": -1.4542601108551025, + "logps/chosen": -4.520358562469482, + "logps/rejected": -5.628200531005859, + "loss": 21.0717, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3516823947429657, + "rewards/margins": 0.07068557292222977, + "rewards/rejected": -0.42236796021461487, + "step": 1205 + }, + { + "epoch": 0.7614852108244179, + "grad_norm": 63.93009567260742, + "learning_rate": 6.770467147801152e-06, + "logits/chosen": -1.3352692127227783, + "logits/rejected": -1.4765124320983887, + "logps/chosen": -3.903353452682495, + "logps/rejected": -5.777923107147217, + "loss": 18.1176, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.30418699979782104, + "rewards/margins": 0.1330038160085678, + "rewards/rejected": -0.43719083070755005, + "step": 1210 + }, + { + "epoch": 0.7646318439269981, + "grad_norm": 123.6090087890625, + "learning_rate": 6.606507274163949e-06, + "logits/chosen": -1.4196144342422485, + "logits/rejected": -1.5160802602767944, + "logps/chosen": -4.3763604164123535, + "logps/rejected": -5.507956504821777, + "loss": 21.3593, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.3455001711845398, + "rewards/margins": 0.08908528089523315, + "rewards/rejected": -0.43458548188209534, + "step": 1215 + }, + { + "epoch": 0.7677784770295784, + "grad_norm": 79.51527404785156, + "learning_rate": 6.444163459326569e-06, + "logits/chosen": -1.3841816186904907, + "logits/rejected": -1.44673752784729, + "logps/chosen": -4.642246723175049, + "logps/rejected": -5.952216625213623, + "loss": 20.2826, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.37046322226524353, + "rewards/margins": 0.10095451772212982, + "rewards/rejected": -0.47141775488853455, + "step": 1220 + }, + { + "epoch": 0.7709251101321586, + "grad_norm": 115.33991241455078, + "learning_rate": 6.283455291693303e-06, + "logits/chosen": -1.2804498672485352, + "logits/rejected": -1.336126446723938, + "logps/chosen": -4.530810356140137, + "logps/rejected": -5.714901924133301, + "loss": 23.5811, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.3543975353240967, + "rewards/margins": 0.07916755974292755, + "rewards/rejected": -0.43356508016586304, + "step": 1225 + }, + { + "epoch": 0.7740717432347388, + "grad_norm": 102.68350219726562, + "learning_rate": 6.124402162311274e-06, + "logits/chosen": -1.3455007076263428, + "logits/rejected": -1.3819594383239746, + "logps/chosen": -4.560150146484375, + "logps/rejected": -5.909863471984863, + "loss": 21.4806, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.36466413736343384, + "rewards/margins": 0.07453545182943344, + "rewards/rejected": -0.4391995966434479, + "step": 1230 + }, + { + "epoch": 0.777218376337319, + "grad_norm": 78.0007553100586, + "learning_rate": 5.9670232625306955e-06, + "logits/chosen": -1.3267484903335571, + "logits/rejected": -1.3938989639282227, + "logps/chosen": -4.1908979415893555, + "logps/rejected": -4.819875240325928, + "loss": 24.9323, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.3211560845375061, + "rewards/margins": 0.054157011210918427, + "rewards/rejected": -0.3753131031990051, + "step": 1235 + }, + { + "epoch": 0.7803650094398993, + "grad_norm": 910.2023315429688, + "learning_rate": 5.81133758168922e-06, + "logits/chosen": -1.4007585048675537, + "logits/rejected": -1.4542076587677002, + "logps/chosen": -5.091724872589111, + "logps/rejected": -6.444447994232178, + "loss": 20.9318, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.36028310656547546, + "rewards/margins": 0.09358057379722595, + "rewards/rejected": -0.4538637101650238, + "step": 1240 + }, + { + "epoch": 0.7835116425424795, + "grad_norm": 68.25672912597656, + "learning_rate": 5.6573639048207315e-06, + "logits/chosen": -1.3604391813278198, + "logits/rejected": -1.3182973861694336, + "logps/chosen": -4.621526718139648, + "logps/rejected": -5.245944023132324, + "loss": 21.9955, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.32450687885284424, + "rewards/margins": 0.07323630154132843, + "rewards/rejected": -0.3977431654930115, + "step": 1245 + }, + { + "epoch": 0.7866582756450597, + "grad_norm": 106.51322937011719, + "learning_rate": 5.5051208103887025e-06, + "logits/chosen": -1.3608815670013428, + "logits/rejected": -1.4448637962341309, + "logps/chosen": -4.045924663543701, + "logps/rejected": -5.57630729675293, + "loss": 20.889, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3083491623401642, + "rewards/margins": 0.09919731318950653, + "rewards/rejected": -0.40754643082618713, + "step": 1250 + }, + { + "epoch": 0.78980490874764, + "grad_norm": 70.59749603271484, + "learning_rate": 5.354626668044535e-06, + "logits/chosen": -1.3460859060287476, + "logits/rejected": -1.412706732749939, + "logps/chosen": -3.734891891479492, + "logps/rejected": -4.818475246429443, + "loss": 21.0468, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2917167842388153, + "rewards/margins": 0.07943135499954224, + "rewards/rejected": -0.37114813923835754, + "step": 1255 + }, + { + "epoch": 0.7929515418502202, + "grad_norm": 83.2120361328125, + "learning_rate": 5.205899636411078e-06, + "logits/chosen": -1.3329652547836304, + "logits/rejected": -1.3952248096466064, + "logps/chosen": -4.460053443908691, + "logps/rejected": -4.993377685546875, + "loss": 25.4182, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.34508955478668213, + "rewards/margins": 0.03861779719591141, + "rewards/rejected": -0.38370734453201294, + "step": 1260 + }, + { + "epoch": 0.7960981749528006, + "grad_norm": 74.94086456298828, + "learning_rate": 5.058957660891613e-06, + "logits/chosen": -1.353829264640808, + "logits/rejected": -1.36537766456604, + "logps/chosen": -3.8537967205047607, + "logps/rejected": -4.86336612701416, + "loss": 21.0046, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.29704272747039795, + "rewards/margins": 0.07927833497524261, + "rewards/rejected": -0.376321017742157, + "step": 1265 + }, + { + "epoch": 0.7992448080553808, + "grad_norm": 68.53548431396484, + "learning_rate": 4.913818471504552e-06, + "logits/chosen": -1.3891483545303345, + "logits/rejected": -1.4956327676773071, + "logps/chosen": -3.83349609375, + "logps/rejected": -5.111277103424072, + "loss": 20.258, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.299319326877594, + "rewards/margins": 0.09995778650045395, + "rewards/rejected": -0.3992771506309509, + "step": 1270 + }, + { + "epoch": 0.802391441157961, + "grad_norm": 161.29922485351562, + "learning_rate": 4.770499580744125e-06, + "logits/chosen": -1.3398183584213257, + "logits/rejected": -1.3453642129898071, + "logps/chosen": -3.9315247535705566, + "logps/rejected": -4.841611862182617, + "loss": 22.4824, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.30572158098220825, + "rewards/margins": 0.06097061559557915, + "rewards/rejected": -0.3666921854019165, + "step": 1275 + }, + { + "epoch": 0.8055380742605412, + "grad_norm": 68.45879364013672, + "learning_rate": 4.629018281467357e-06, + "logits/chosen": -1.297154188156128, + "logits/rejected": -1.338921070098877, + "logps/chosen": -3.7794177532196045, + "logps/rejected": -4.509110927581787, + "loss": 21.658, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2888321876525879, + "rewards/margins": 0.05916588753461838, + "rewards/rejected": -0.3479980528354645, + "step": 1280 + }, + { + "epoch": 0.8086847073631215, + "grad_norm": 74.77375793457031, + "learning_rate": 4.489391644807462e-06, + "logits/chosen": -1.4385647773742676, + "logits/rejected": -1.5144340991973877, + "logps/chosen": -3.69215726852417, + "logps/rejected": -4.667183876037598, + "loss": 21.0338, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2842097282409668, + "rewards/margins": 0.07265909761190414, + "rewards/rejected": -0.35686883330345154, + "step": 1285 + }, + { + "epoch": 0.8118313404657017, + "grad_norm": 78.63387298583984, + "learning_rate": 4.351636518114091e-06, + "logits/chosen": -1.3093000650405884, + "logits/rejected": -1.3893928527832031, + "logps/chosen": -3.599902629852295, + "logps/rejected": -4.570587635040283, + "loss": 22.1635, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.2749050259590149, + "rewards/margins": 0.08025064319372177, + "rewards/rejected": -0.3551556468009949, + "step": 1290 + }, + { + "epoch": 0.8149779735682819, + "grad_norm": 78.53893280029297, + "learning_rate": 4.215769522920487e-06, + "logits/chosen": -1.2443653345108032, + "logits/rejected": -1.3605782985687256, + "logps/chosen": -3.2713770866394043, + "logps/rejected": -4.569630146026611, + "loss": 20.9369, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.24853453040122986, + "rewards/margins": 0.10017738491296768, + "rewards/rejected": -0.34871190786361694, + "step": 1295 + }, + { + "epoch": 0.8181246066708622, + "grad_norm": 82.4554672241211, + "learning_rate": 4.0818070529379715e-06, + "logits/chosen": -1.383690357208252, + "logits/rejected": -1.4704560041427612, + "logps/chosen": -4.524319171905518, + "logps/rejected": -5.7077460289001465, + "loss": 21.9118, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.316571444272995, + "rewards/margins": 0.0641409307718277, + "rewards/rejected": -0.3807123601436615, + "step": 1300 + }, + { + "epoch": 0.8212712397734424, + "grad_norm": 71.1880111694336, + "learning_rate": 3.949765272077843e-06, + "logits/chosen": -1.3107343912124634, + "logits/rejected": -1.3561115264892578, + "logps/chosen": -3.846195936203003, + "logps/rejected": -4.79428768157959, + "loss": 21.0994, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.27155357599258423, + "rewards/margins": 0.07163957506418228, + "rewards/rejected": -0.3431931734085083, + "step": 1305 + }, + { + "epoch": 0.8244178728760226, + "grad_norm": 50.073204040527344, + "learning_rate": 3.819660112501053e-06, + "logits/chosen": -1.2764497995376587, + "logits/rejected": -1.3517284393310547, + "logps/chosen": -3.5745315551757812, + "logps/rejected": -4.921723365783691, + "loss": 19.6469, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.27279648184776306, + "rewards/margins": 0.1019618958234787, + "rewards/rejected": -0.37475839257240295, + "step": 1310 + }, + { + "epoch": 0.8275645059786029, + "grad_norm": 83.62207794189453, + "learning_rate": 3.6915072726958514e-06, + "logits/chosen": -1.2466180324554443, + "logits/rejected": -1.2861813306808472, + "logps/chosen": -3.430490016937256, + "logps/rejected": -4.824821949005127, + "loss": 20.5161, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2643739581108093, + "rewards/margins": 0.1028999462723732, + "rewards/rejected": -0.3672739565372467, + "step": 1315 + }, + { + "epoch": 0.8307111390811831, + "grad_norm": 76.6629638671875, + "learning_rate": 3.5653222155835686e-06, + "logits/chosen": -1.2766977548599243, + "logits/rejected": -1.3114259243011475, + "logps/chosen": -4.222517967224121, + "logps/rejected": -5.029845714569092, + "loss": 22.1218, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.3145274221897125, + "rewards/margins": 0.06165830045938492, + "rewards/rejected": -0.37618574500083923, + "step": 1320 + }, + { + "epoch": 0.8338577721837633, + "grad_norm": 159.4115447998047, + "learning_rate": 3.4411201666529003e-06, + "logits/chosen": -1.3758924007415771, + "logits/rejected": -1.4244683980941772, + "logps/chosen": -4.457423210144043, + "logps/rejected": -5.342848300933838, + "loss": 23.3834, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.31679028272628784, + "rewards/margins": 0.06267707049846649, + "rewards/rejected": -0.3794673979282379, + "step": 1325 + }, + { + "epoch": 0.8370044052863436, + "grad_norm": 56.71870803833008, + "learning_rate": 3.3189161121227564e-06, + "logits/chosen": -1.3166803121566772, + "logits/rejected": -1.385522723197937, + "logps/chosen": -3.8323776721954346, + "logps/rejected": -4.732277870178223, + "loss": 23.3384, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.2978932559490204, + "rewards/margins": 0.0644349679350853, + "rewards/rejected": -0.3623282313346863, + "step": 1330 + }, + { + "epoch": 0.8401510383889238, + "grad_norm": 66.62996673583984, + "learning_rate": 3.198724797134074e-06, + "logits/chosen": -1.2822662591934204, + "logits/rejected": -1.4124181270599365, + "logps/chosen": -3.9724369049072266, + "logps/rejected": -5.0466437339782715, + "loss": 22.4903, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2994682192802429, + "rewards/margins": 0.0788046196103096, + "rewards/rejected": -0.3782728910446167, + "step": 1335 + }, + { + "epoch": 0.8432976714915041, + "grad_norm": 70.8177261352539, + "learning_rate": 3.080560723970616e-06, + "logits/chosen": -1.2813329696655273, + "logits/rejected": -1.3586981296539307, + "logps/chosen": -3.6214439868927, + "logps/rejected": -4.637081623077393, + "loss": 20.5515, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.28146275877952576, + "rewards/margins": 0.07804764062166214, + "rewards/rejected": -0.3595103919506073, + "step": 1340 + }, + { + "epoch": 0.8464443045940844, + "grad_norm": 64.40753173828125, + "learning_rate": 2.96443815030917e-06, + "logits/chosen": -1.3396605253219604, + "logits/rejected": -1.4255945682525635, + "logps/chosen": -3.604154586791992, + "logps/rejected": -4.95128059387207, + "loss": 20.7037, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.2776135206222534, + "rewards/margins": 0.09353432059288025, + "rewards/rejected": -0.37114784121513367, + "step": 1345 + }, + { + "epoch": 0.8495909376966646, + "grad_norm": 93.99842071533203, + "learning_rate": 2.850371087499195e-06, + "logits/chosen": -1.381260633468628, + "logits/rejected": -1.4631612300872803, + "logps/chosen": -4.883763790130615, + "logps/rejected": -6.07845401763916, + "loss": 21.0858, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.3591059148311615, + "rewards/margins": 0.09570769965648651, + "rewards/rejected": -0.4548136591911316, + "step": 1350 + }, + { + "epoch": 0.8527375707992448, + "grad_norm": 62.075279235839844, + "learning_rate": 2.7383732988722057e-06, + "logits/chosen": -1.3089946508407593, + "logits/rejected": -1.3634613752365112, + "logps/chosen": -3.7724010944366455, + "logps/rejected": -4.929832458496094, + "loss": 19.0202, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.273120641708374, + "rewards/margins": 0.09602681547403336, + "rewards/rejected": -0.36914747953414917, + "step": 1355 + }, + { + "epoch": 0.8558842039018251, + "grad_norm": 80.0210189819336, + "learning_rate": 2.6284582980811136e-06, + "logits/chosen": -1.4461333751678467, + "logits/rejected": -1.370339035987854, + "logps/chosen": -4.136780738830566, + "logps/rejected": -5.008397579193115, + "loss": 23.5672, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3027392327785492, + "rewards/margins": 0.062295325100421906, + "rewards/rejected": -0.3650345206260681, + "step": 1360 + }, + { + "epoch": 0.8590308370044053, + "grad_norm": 169.91099548339844, + "learning_rate": 2.5206393474696422e-06, + "logits/chosen": -1.2922241687774658, + "logits/rejected": -1.3685882091522217, + "logps/chosen": -3.8860459327697754, + "logps/rejected": -4.820228099822998, + "loss": 20.1345, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2878992557525635, + "rewards/margins": 0.07816118001937866, + "rewards/rejected": -0.36606043577194214, + "step": 1365 + }, + { + "epoch": 0.8621774701069855, + "grad_norm": 291.87542724609375, + "learning_rate": 2.4149294564721146e-06, + "logits/chosen": -1.390933632850647, + "logits/rejected": -1.477757215499878, + "logps/chosen": -4.5947346687316895, + "logps/rejected": -5.662859916687012, + "loss": 22.1173, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.32581329345703125, + "rewards/margins": 0.0882103443145752, + "rewards/rejected": -0.4140236973762512, + "step": 1370 + }, + { + "epoch": 0.8653241032095658, + "grad_norm": 50.774810791015625, + "learning_rate": 2.3113413800437145e-06, + "logits/chosen": -1.3678381443023682, + "logits/rejected": -1.4147788286209106, + "logps/chosen": -4.411424160003662, + "logps/rejected": -5.547976970672607, + "loss": 20.419, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3136950135231018, + "rewards/margins": 0.08119923621416092, + "rewards/rejected": -0.3948942720890045, + "step": 1375 + }, + { + "epoch": 0.868470736312146, + "grad_norm": 75.1661605834961, + "learning_rate": 2.2098876171215e-06, + "logits/chosen": -1.2949163913726807, + "logits/rejected": -1.4591166973114014, + "logps/chosen": -3.913958787918091, + "logps/rejected": -4.945563316345215, + "loss": 20.5075, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.30175596475601196, + "rewards/margins": 0.09277000278234482, + "rewards/rejected": -0.394525945186615, + "step": 1380 + }, + { + "epoch": 0.8716173694147262, + "grad_norm": 116.18523406982422, + "learning_rate": 2.110580409116261e-06, + "logits/chosen": -1.3234283924102783, + "logits/rejected": -1.3651349544525146, + "logps/chosen": -4.782530307769775, + "logps/rejected": -5.800885200500488, + "loss": 22.8406, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.3511677384376526, + "rewards/margins": 0.07397367060184479, + "rewards/rejected": -0.4251413345336914, + "step": 1385 + }, + { + "epoch": 0.8747640025173065, + "grad_norm": 145.46861267089844, + "learning_rate": 2.013431738435465e-06, + "logits/chosen": -1.3332188129425049, + "logits/rejected": -1.4134724140167236, + "logps/chosen": -4.268718242645264, + "logps/rejected": -5.433601379394531, + "loss": 22.5056, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.3332800269126892, + "rewards/margins": 0.07072637230157852, + "rewards/rejected": -0.4040064215660095, + "step": 1390 + }, + { + "epoch": 0.8779106356198867, + "grad_norm": 117.83720397949219, + "learning_rate": 1.9184533270374928e-06, + "logits/chosen": -1.3927792310714722, + "logits/rejected": -1.4590123891830444, + "logps/chosen": -4.519114017486572, + "logps/rejected": -5.810807228088379, + "loss": 21.2018, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.34003710746765137, + "rewards/margins": 0.08822458237409592, + "rewards/rejected": -0.4282616972923279, + "step": 1395 + }, + { + "epoch": 0.8810572687224669, + "grad_norm": 128.75563049316406, + "learning_rate": 1.8256566350172211e-06, + "logits/chosen": -1.4642970561981201, + "logits/rejected": -1.56011962890625, + "logps/chosen": -5.124087810516357, + "logps/rejected": -6.271437168121338, + "loss": 20.9824, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.34806352853775024, + "rewards/margins": 0.0969148576259613, + "rewards/rejected": -0.44497838616371155, + "step": 1400 + }, + { + "epoch": 0.8842039018250472, + "grad_norm": 88.87577056884766, + "learning_rate": 1.7350528592232962e-06, + "logits/chosen": -1.3359493017196655, + "logits/rejected": -1.4811887741088867, + "logps/chosen": -4.525036811828613, + "logps/rejected": -5.623012542724609, + "loss": 22.1104, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.3581879138946533, + "rewards/margins": 0.07608196139335632, + "rewards/rejected": -0.43426984548568726, + "step": 1405 + }, + { + "epoch": 0.8873505349276274, + "grad_norm": 69.19255065917969, + "learning_rate": 1.6466529319070735e-06, + "logits/chosen": -1.2726246118545532, + "logits/rejected": -1.39580237865448, + "logps/chosen": -3.7457852363586426, + "logps/rejected": -5.324977397918701, + "loss": 18.2434, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.2871127724647522, + "rewards/margins": 0.11219409853219986, + "rewards/rejected": -0.39930686354637146, + "step": 1410 + }, + { + "epoch": 0.8904971680302077, + "grad_norm": 73.79737854003906, + "learning_rate": 1.560467519403579e-06, + "logits/chosen": -1.3266379833221436, + "logits/rejected": -1.3948261737823486, + "logps/chosen": -4.1067681312561035, + "logps/rejected": -4.673392295837402, + "loss": 22.1702, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3159501254558563, + "rewards/margins": 0.04971395805478096, + "rewards/rejected": -0.3656640946865082, + "step": 1415 + }, + { + "epoch": 0.893643801132788, + "grad_norm": 106.870361328125, + "learning_rate": 1.4765070208444732e-06, + "logits/chosen": -1.3216549158096313, + "logits/rejected": -1.35343337059021, + "logps/chosen": -4.343778133392334, + "logps/rejected": -5.122066497802734, + "loss": 22.7187, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.33430585265159607, + "rewards/margins": 0.06294408440589905, + "rewards/rejected": -0.3972499370574951, + "step": 1420 + }, + { + "epoch": 0.8967904342353682, + "grad_norm": 62.6711311340332, + "learning_rate": 1.3947815669033026e-06, + "logits/chosen": -1.3594673871994019, + "logits/rejected": -1.4739999771118164, + "logps/chosen": -4.087611198425293, + "logps/rejected": -5.339770317077637, + "loss": 20.526, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.31806594133377075, + "rewards/margins": 0.08883042633533478, + "rewards/rejected": -0.40689635276794434, + "step": 1425 + }, + { + "epoch": 0.8999370673379484, + "grad_norm": 98.1043930053711, + "learning_rate": 1.3153010185731495e-06, + "logits/chosen": -1.2508734464645386, + "logits/rejected": -1.32900869846344, + "logps/chosen": -4.235801696777344, + "logps/rejected": -5.670529842376709, + "loss": 20.3076, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3296756148338318, + "rewards/margins": 0.09636791795492172, + "rewards/rejected": -0.4260435700416565, + "step": 1430 + }, + { + "epoch": 0.9030837004405287, + "grad_norm": 87.73750305175781, + "learning_rate": 1.2380749659767766e-06, + "logits/chosen": -1.3343340158462524, + "logits/rejected": -1.3880221843719482, + "logps/chosen": -4.322578430175781, + "logps/rejected": -5.371191501617432, + "loss": 20.9961, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.33630794286727905, + "rewards/margins": 0.0794602781534195, + "rewards/rejected": -0.41576823592185974, + "step": 1435 + }, + { + "epoch": 0.9062303335431089, + "grad_norm": 72.0036392211914, + "learning_rate": 1.1631127272095077e-06, + "logits/chosen": -1.3422092199325562, + "logits/rejected": -1.4017739295959473, + "logps/chosen": -3.97587251663208, + "logps/rejected": -5.63102388381958, + "loss": 18.4484, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.302670419216156, + "rewards/margins": 0.1103433147072792, + "rewards/rejected": -0.413013756275177, + "step": 1440 + }, + { + "epoch": 0.9093769666456891, + "grad_norm": 55.72761917114258, + "learning_rate": 1.0904233472148862e-06, + "logits/chosen": -1.4325498342514038, + "logits/rejected": -1.5191594362258911, + "logps/chosen": -4.523946285247803, + "logps/rejected": -5.913887023925781, + "loss": 20.9945, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.34643903374671936, + "rewards/margins": 0.07747067511081696, + "rewards/rejected": -0.4239097237586975, + "step": 1445 + }, + { + "epoch": 0.9125235997482694, + "grad_norm": 74.03398132324219, + "learning_rate": 1.0200155966933333e-06, + "logits/chosen": -1.3860814571380615, + "logits/rejected": -1.4824600219726562, + "logps/chosen": -4.180668830871582, + "logps/rejected": -5.086295127868652, + "loss": 22.6256, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.31416332721710205, + "rewards/margins": 0.06807545572519302, + "rewards/rejected": -0.3822387754917145, + "step": 1450 + }, + { + "epoch": 0.9156702328508496, + "grad_norm": 55.17578887939453, + "learning_rate": 9.51897971043847e-07, + "logits/chosen": -1.277956485748291, + "logits/rejected": -1.4699045419692993, + "logps/chosen": -3.923815965652466, + "logps/rejected": -5.776226997375488, + "loss": 18.1837, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.30271369218826294, + "rewards/margins": 0.13357527554035187, + "rewards/rejected": -0.4362889230251312, + "step": 1455 + }, + { + "epoch": 0.9188168659534298, + "grad_norm": 67.42135620117188, + "learning_rate": 8.860786893389761e-07, + "logits/chosen": -1.3501498699188232, + "logits/rejected": -1.4162402153015137, + "logps/chosen": -4.456291198730469, + "logps/rejected": -4.891867637634277, + "loss": 23.4746, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.35184237360954285, + "rewards/margins": 0.03937570005655289, + "rewards/rejected": -0.3912180960178375, + "step": 1460 + }, + { + "epoch": 0.92196349905601, + "grad_norm": 86.8721923828125, + "learning_rate": 8.225656933330972e-07, + "logits/chosen": -1.396032691001892, + "logits/rejected": -1.3607252836227417, + "logps/chosen": -4.139504909515381, + "logps/rejected": -5.256811618804932, + "loss": 20.6197, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.31887346506118774, + "rewards/margins": 0.08756524324417114, + "rewards/rejected": -0.4064387381076813, + "step": 1465 + }, + { + "epoch": 0.9251101321585903, + "grad_norm": 63.26131057739258, + "learning_rate": 7.613666465041492e-07, + "logits/chosen": -1.296687364578247, + "logits/rejected": -1.338370442390442, + "logps/chosen": -4.0869526863098145, + "logps/rejected": -4.680004596710205, + "loss": 22.3496, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.30148619413375854, + "rewards/margins": 0.06435124576091766, + "rewards/rejected": -0.365837424993515, + "step": 1470 + }, + { + "epoch": 0.9282567652611705, + "grad_norm": 64.71456909179688, + "learning_rate": 7.024889331289731e-07, + "logits/chosen": -1.3576750755310059, + "logits/rejected": -1.4629138708114624, + "logps/chosen": -4.305732250213623, + "logps/rejected": -6.287524700164795, + "loss": 19.0147, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3270648717880249, + "rewards/margins": 0.12565208971500397, + "rewards/rejected": -0.45271697640419006, + "step": 1475 + }, + { + "epoch": 0.9314033983637507, + "grad_norm": 79.55664825439453, + "learning_rate": 6.459396573923227e-07, + "logits/chosen": -1.2750294208526611, + "logits/rejected": -1.3182651996612549, + "logps/chosen": -3.8780131340026855, + "logps/rejected": -5.497721195220947, + "loss": 19.3141, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.29957860708236694, + "rewards/margins": 0.11124887317419052, + "rewards/rejected": -0.41082748770713806, + "step": 1480 + }, + { + "epoch": 0.934550031466331, + "grad_norm": 97.28962707519531, + "learning_rate": 5.917256425296725e-07, + "logits/chosen": -1.3326900005340576, + "logits/rejected": -1.3848145008087158, + "logps/chosen": -4.326709270477295, + "logps/rejected": -5.8570427894592285, + "loss": 17.956, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.32169783115386963, + "rewards/margins": 0.11987517029047012, + "rewards/rejected": -0.44157299399375916, + "step": 1485 + }, + { + "epoch": 0.9376966645689113, + "grad_norm": 104.4383773803711, + "learning_rate": 5.398534300039227e-07, + "logits/chosen": -1.3669896125793457, + "logits/rejected": -1.4102351665496826, + "logps/chosen": -4.2153167724609375, + "logps/rejected": -5.1999030113220215, + "loss": 20.9588, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3310829997062683, + "rewards/margins": 0.07336001843214035, + "rewards/rejected": -0.40444302558898926, + "step": 1490 + }, + { + "epoch": 0.9408432976714916, + "grad_norm": 59.6121826171875, + "learning_rate": 4.903292787161129e-07, + "logits/chosen": -1.4228112697601318, + "logits/rejected": -1.528313159942627, + "logps/chosen": -4.338911533355713, + "logps/rejected": -5.048561096191406, + "loss": 22.4697, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.3458613455295563, + "rewards/margins": 0.05565253645181656, + "rewards/rejected": -0.40151387453079224, + "step": 1495 + }, + { + "epoch": 0.9439899307740718, + "grad_norm": 134.8368377685547, + "learning_rate": 4.4315916425021755e-07, + "logits/chosen": -1.4706683158874512, + "logits/rejected": -1.5189244747161865, + "logps/chosen": -4.430064678192139, + "logps/rejected": -4.881100177764893, + "loss": 24.7599, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.34278133511543274, + "rewards/margins": 0.03427756577730179, + "rewards/rejected": -0.37705889344215393, + "step": 1500 + }, + { + "epoch": 0.947136563876652, + "grad_norm": 75.44186401367188, + "learning_rate": 3.983487781521311e-07, + "logits/chosen": -1.3628993034362793, + "logits/rejected": -1.5227676630020142, + "logps/chosen": -4.508485317230225, + "logps/rejected": -5.836249351501465, + "loss": 21.4824, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.35081833600997925, + "rewards/margins": 0.0795225128531456, + "rewards/rejected": -0.43034085631370544, + "step": 1505 + }, + { + "epoch": 0.9502831969792322, + "grad_norm": 53.86139678955078, + "learning_rate": 3.5590352724293565e-07, + "logits/chosen": -1.2814509868621826, + "logits/rejected": -1.383336067199707, + "logps/chosen": -3.697767972946167, + "logps/rejected": -5.5374345779418945, + "loss": 18.3089, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.28700894117355347, + "rewards/margins": 0.12951508164405823, + "rewards/rejected": -0.4165240228176117, + "step": 1510 + }, + { + "epoch": 0.9534298300818125, + "grad_norm": 55.83627700805664, + "learning_rate": 3.1582853296649785e-07, + "logits/chosen": -1.3301982879638672, + "logits/rejected": -1.4231036901474, + "logps/chosen": -3.7521042823791504, + "logps/rejected": -4.861963748931885, + "loss": 19.3616, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2891950309276581, + "rewards/margins": 0.09701049327850342, + "rewards/rejected": -0.3862055242061615, + "step": 1515 + }, + { + "epoch": 0.9565764631843927, + "grad_norm": 88.61446380615234, + "learning_rate": 2.7812863077153253e-07, + "logits/chosen": -1.2899259328842163, + "logits/rejected": -1.398050308227539, + "logps/chosen": -4.068936824798584, + "logps/rejected": -5.717960357666016, + "loss": 17.8938, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.31747734546661377, + "rewards/margins": 0.11797485500574112, + "rewards/rejected": -0.4354521632194519, + "step": 1520 + }, + { + "epoch": 0.9597230962869729, + "grad_norm": 58.96453857421875, + "learning_rate": 2.4280836952814913e-07, + "logits/chosen": -1.3611301183700562, + "logits/rejected": -1.4117127656936646, + "logps/chosen": -4.0526018142700195, + "logps/rejected": -5.437824249267578, + "loss": 21.3406, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.31340762972831726, + "rewards/margins": 0.07557393610477448, + "rewards/rejected": -0.38898158073425293, + "step": 1525 + }, + { + "epoch": 0.9628697293895532, + "grad_norm": 82.22030639648438, + "learning_rate": 2.0987201097897757e-07, + "logits/chosen": -1.290305256843567, + "logits/rejected": -1.3669493198394775, + "logps/chosen": -4.012240409851074, + "logps/rejected": -6.001503944396973, + "loss": 18.4697, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3110642433166504, + "rewards/margins": 0.14227357506752014, + "rewards/rejected": -0.45333781838417053, + "step": 1530 + }, + { + "epoch": 0.9660163624921334, + "grad_norm": 69.16776275634766, + "learning_rate": 1.7932352922496844e-07, + "logits/chosen": -1.3238952159881592, + "logits/rejected": -1.4009875059127808, + "logps/chosen": -4.168734550476074, + "logps/rejected": -5.520012855529785, + "loss": 18.6757, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.3233780264854431, + "rewards/margins": 0.10560549795627594, + "rewards/rejected": -0.42898350954055786, + "step": 1535 + }, + { + "epoch": 0.9691629955947136, + "grad_norm": 87.41554260253906, + "learning_rate": 1.5116661024584756e-07, + "logits/chosen": -1.3047425746917725, + "logits/rejected": -1.2935268878936768, + "logps/chosen": -3.8972859382629395, + "logps/rejected": -5.7956743240356445, + "loss": 19.4437, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2976101040840149, + "rewards/margins": 0.13567054271697998, + "rewards/rejected": -0.4332806169986725, + "step": 1540 + }, + { + "epoch": 0.9723096286972939, + "grad_norm": 129.71432495117188, + "learning_rate": 1.254046514553986e-07, + "logits/chosen": -1.3411355018615723, + "logits/rejected": -1.3150873184204102, + "logps/chosen": -4.793996334075928, + "logps/rejected": -6.1579155921936035, + "loss": 22.5465, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.36123085021972656, + "rewards/margins": 0.08643898367881775, + "rewards/rejected": -0.4476698338985443, + "step": 1545 + }, + { + "epoch": 0.9754562617998741, + "grad_norm": 156.82296752929688, + "learning_rate": 1.0204076129150198e-07, + "logits/chosen": -1.3176259994506836, + "logits/rejected": -1.371140956878662, + "logps/chosen": -4.381787300109863, + "logps/rejected": -5.822647571563721, + "loss": 20.2445, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.322052001953125, + "rewards/margins": 0.08496570587158203, + "rewards/rejected": -0.40701770782470703, + "step": 1550 + }, + { + "epoch": 0.9786028949024543, + "grad_norm": 101.22770690917969, + "learning_rate": 8.107775884109048e-08, + "logits/chosen": -1.377939224243164, + "logits/rejected": -1.460756540298462, + "logps/chosen": -4.821037292480469, + "logps/rejected": -5.5621137619018555, + "loss": 23.1685, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.37864041328430176, + "rewards/margins": 0.05817138031125069, + "rewards/rejected": -0.43681177496910095, + "step": 1555 + }, + { + "epoch": 0.9817495280050346, + "grad_norm": 93.55181884765625, + "learning_rate": 6.251817349998578e-08, + "logits/chosen": -1.2559947967529297, + "logits/rejected": -1.3171112537384033, + "logps/chosen": -3.9931647777557373, + "logps/rejected": -5.348459243774414, + "loss": 22.9477, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.30862289667129517, + "rewards/margins": 0.0842631608247757, + "rewards/rejected": -0.39288607239723206, + "step": 1560 + }, + { + "epoch": 0.9848961611076148, + "grad_norm": 80.63821411132812, + "learning_rate": 4.636424466771372e-08, + "logits/chosen": -1.24492347240448, + "logits/rejected": -1.3349525928497314, + "logps/chosen": -4.380553245544434, + "logps/rejected": -5.421158313751221, + "loss": 22.0329, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.34523719549179077, + "rewards/margins": 0.07052381336688995, + "rewards/rejected": -0.4157610535621643, + "step": 1565 + }, + { + "epoch": 0.9880427942101951, + "grad_norm": 55.254642486572266, + "learning_rate": 3.261792147728704e-08, + "logits/chosen": -1.3501121997833252, + "logits/rejected": -1.3522610664367676, + "logps/chosen": -4.829428195953369, + "logps/rejected": -5.480432033538818, + "loss": 22.6751, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3382914662361145, + "rewards/margins": 0.05635923147201538, + "rewards/rejected": -0.3946506381034851, + "step": 1570 + }, + { + "epoch": 0.9911894273127754, + "grad_norm": 102.65123748779297, + "learning_rate": 2.1280862560026927e-08, + "logits/chosen": -1.350527048110962, + "logits/rejected": -1.3495935201644897, + "logps/chosen": -3.8183772563934326, + "logps/rejected": -4.949650764465332, + "loss": 22.3353, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.3000851273536682, + "rewards/margins": 0.07860491424798965, + "rewards/rejected": -0.37869006395339966, + "step": 1575 + }, + { + "epoch": 0.9943360604153556, + "grad_norm": 67.94386291503906, + "learning_rate": 1.2354435845436385e-08, + "logits/chosen": -1.2602336406707764, + "logits/rejected": -1.2594802379608154, + "logps/chosen": -3.5885491371154785, + "logps/rejected": -4.909377098083496, + "loss": 18.7801, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.27459749579429626, + "rewards/margins": 0.10287781804800034, + "rewards/rejected": -0.3774753212928772, + "step": 1580 + }, + { + "epoch": 0.9974826935179358, + "grad_norm": 78.847412109375, + "learning_rate": 5.8397183961411694e-09, + "logits/chosen": -1.4188308715820312, + "logits/rejected": -1.3911654949188232, + "logps/chosen": -4.257325649261475, + "logps/rejected": -5.559029579162598, + "loss": 20.67, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.30969610810279846, + "rewards/margins": 0.08111827820539474, + "rewards/rejected": -0.3908143639564514, + "step": 1585 + }, + { + "epoch": 1.0, + "step": 1589, + "total_flos": 0.0, + "train_loss": 22.009478435192264, + "train_runtime": 23016.83, + "train_samples_per_second": 1.105, + "train_steps_per_second": 0.069 + } + ], + "logging_steps": 5, + "max_steps": 1589, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 10, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}