{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 100, "global_step": 3112, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.0706638115631692e-10, "logits/chosen": 1.2566330432891846, "logits/rejected": 0.7730951309204102, "logps/chosen": -300.374267578125, "logps/rejected": -324.00494384765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 1.070663811563169e-09, "logits/chosen": 0.9792649745941162, "logits/rejected": 1.7012548446655273, "logps/chosen": -464.2229309082031, "logps/rejected": -332.3782653808594, "loss": 0.6952, "rewards/accuracies": 0.4861111044883728, "rewards/chosen": -0.006301212124526501, "rewards/margins": -0.0025307913310825825, "rewards/rejected": -0.0037704205606132746, "step": 10 }, { "epoch": 0.01, "learning_rate": 2.141327623126338e-09, "logits/chosen": 0.5618988871574402, "logits/rejected": 1.6265491247177124, "logps/chosen": -438.208984375, "logps/rejected": -328.3803405761719, "loss": 0.6981, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.012895757332444191, "rewards/margins": -0.00526293832808733, "rewards/rejected": -0.007632819004356861, "step": 20 }, { "epoch": 0.02, "learning_rate": 3.2119914346895075e-09, "logits/chosen": 0.8482489585876465, "logits/rejected": 1.8450462818145752, "logps/chosen": -437.23870849609375, "logps/rejected": -367.84637451171875, "loss": 0.6967, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.005116526037454605, "rewards/margins": -0.005225582513958216, "rewards/rejected": 0.010342106223106384, "step": 30 }, { "epoch": 0.03, "learning_rate": 4.282655246252676e-09, "logits/chosen": 0.9240902662277222, "logits/rejected": 2.074276924133301, "logps/chosen": -408.9275207519531, "logps/rejected": -335.3138122558594, "loss": 0.6953, "rewards/accuracies": 0.5, "rewards/chosen": 0.02186817303299904, "rewards/margins": 0.013794437050819397, "rewards/rejected": 0.008073735050857067, "step": 40 }, { "epoch": 0.03, "learning_rate": 5.353319057815846e-09, "logits/chosen": 0.9429410696029663, "logits/rejected": 1.3247915506362915, "logps/chosen": -487.283203125, "logps/rejected": -337.8562927246094, "loss": 0.6867, "rewards/accuracies": 0.5625, "rewards/chosen": 0.01094397995620966, "rewards/margins": 0.016293564811348915, "rewards/rejected": -0.005349582992494106, "step": 50 }, { "epoch": 0.04, "learning_rate": 6.423982869379015e-09, "logits/chosen": 0.9860417246818542, "logits/rejected": 1.649106740951538, "logps/chosen": -456.7554626464844, "logps/rejected": -330.7721252441406, "loss": 0.6692, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.0688941478729248, "rewards/margins": 0.04043982923030853, "rewards/rejected": 0.02845432423055172, "step": 60 }, { "epoch": 0.04, "learning_rate": 7.494646680942184e-09, "logits/chosen": 0.9182626008987427, "logits/rejected": 1.827265977859497, "logps/chosen": -396.9792785644531, "logps/rejected": -330.5106506347656, "loss": 0.6531, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.1199527382850647, "rewards/margins": 0.07451293617486954, "rewards/rejected": 0.04543980211019516, "step": 70 }, { "epoch": 0.05, "learning_rate": 8.565310492505352e-09, "logits/chosen": 0.5657048225402832, "logits/rejected": 2.0772719383239746, "logps/chosen": -467.70233154296875, "logps/rejected": -340.50457763671875, "loss": 0.6347, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.18721021711826324, "rewards/margins": 0.12396695464849472, "rewards/rejected": 0.06324325501918793, "step": 80 }, { "epoch": 0.06, "learning_rate": 9.635974304068522e-09, "logits/chosen": 0.9224559664726257, "logits/rejected": 1.6432113647460938, "logps/chosen": -410.8121032714844, "logps/rejected": -300.57012939453125, "loss": 0.608, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.28949785232543945, "rewards/margins": 0.21210959553718567, "rewards/rejected": 0.07738825678825378, "step": 90 }, { "epoch": 0.06, "learning_rate": 1.0706638115631692e-08, "logits/chosen": 0.8869959115982056, "logits/rejected": 1.8451831340789795, "logps/chosen": -403.8915710449219, "logps/rejected": -300.415771484375, "loss": 0.5911, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.3641214966773987, "rewards/margins": 0.22090363502502441, "rewards/rejected": 0.14321786165237427, "step": 100 }, { "epoch": 0.06, "eval_logits/chosen": 0.30295610427856445, "eval_logits/rejected": 0.8016409873962402, "eval_logps/chosen": -395.38714599609375, "eval_logps/rejected": -304.46209716796875, "eval_loss": 0.5531623363494873, "eval_rewards/accuracies": 0.84375, "eval_rewards/chosen": 0.39543235301971436, "eval_rewards/margins": 0.28056541085243225, "eval_rewards/rejected": 0.1148669496178627, "eval_runtime": 77.7884, "eval_samples_per_second": 12.855, "eval_steps_per_second": 0.411, "step": 100 }, { "epoch": 0.07, "learning_rate": 1.177730192719486e-08, "logits/chosen": 1.0650697946548462, "logits/rejected": 1.743814468383789, "logps/chosen": -376.599853515625, "logps/rejected": -334.667724609375, "loss": 0.5616, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.4621427655220032, "rewards/margins": 0.31689101457595825, "rewards/rejected": 0.14525175094604492, "step": 110 }, { "epoch": 0.08, "learning_rate": 1.284796573875803e-08, "logits/chosen": 0.7667558789253235, "logits/rejected": 1.5485883951187134, "logps/chosen": -444.92987060546875, "logps/rejected": -348.63372802734375, "loss": 0.532, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.5855865478515625, "rewards/margins": 0.441326379776001, "rewards/rejected": 0.14426018297672272, "step": 120 }, { "epoch": 0.08, "learning_rate": 1.3918629550321198e-08, "logits/chosen": 0.6718708276748657, "logits/rejected": 1.5146424770355225, "logps/chosen": -443.32568359375, "logps/rejected": -351.67010498046875, "loss": 0.5069, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6444682478904724, "rewards/margins": 0.5162140130996704, "rewards/rejected": 0.128254234790802, "step": 130 }, { "epoch": 0.09, "learning_rate": 1.4989293361884368e-08, "logits/chosen": 0.7301766872406006, "logits/rejected": 1.4742127656936646, "logps/chosen": -460.937255859375, "logps/rejected": -351.74871826171875, "loss": 0.4694, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7888091802597046, "rewards/margins": 0.612645149230957, "rewards/rejected": 0.17616406083106995, "step": 140 }, { "epoch": 0.1, "learning_rate": 1.6059957173447535e-08, "logits/chosen": 1.1185513734817505, "logits/rejected": 1.3702727556228638, "logps/chosen": -388.2076721191406, "logps/rejected": -313.1037902832031, "loss": 0.4438, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.8620562553405762, "rewards/margins": 0.5798792839050293, "rewards/rejected": 0.2821769118309021, "step": 150 }, { "epoch": 0.1, "learning_rate": 1.7130620985010704e-08, "logits/chosen": 1.1629040241241455, "logits/rejected": 1.1827285289764404, "logps/chosen": -448.11517333984375, "logps/rejected": -330.4544372558594, "loss": 0.4282, "rewards/accuracies": 0.875, "rewards/chosen": 1.1283848285675049, "rewards/margins": 0.9130982160568237, "rewards/rejected": 0.2152867317199707, "step": 160 }, { "epoch": 0.11, "learning_rate": 1.8201284796573874e-08, "logits/chosen": 0.9314751625061035, "logits/rejected": 1.7524402141571045, "logps/chosen": -433.95159912109375, "logps/rejected": -306.7549743652344, "loss": 0.3991, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 1.0592777729034424, "rewards/margins": 0.8145540952682495, "rewards/rejected": 0.24472376704216003, "step": 170 }, { "epoch": 0.12, "learning_rate": 1.9271948608137044e-08, "logits/chosen": 0.9510858654975891, "logits/rejected": 1.7319552898406982, "logps/chosen": -425.33148193359375, "logps/rejected": -348.006103515625, "loss": 0.3595, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.3629436492919922, "rewards/margins": 1.0799864530563354, "rewards/rejected": 0.2829572558403015, "step": 180 }, { "epoch": 0.12, "learning_rate": 2.0342612419700214e-08, "logits/chosen": 0.8850847482681274, "logits/rejected": 1.6263946294784546, "logps/chosen": -368.7896423339844, "logps/rejected": -318.7091064453125, "loss": 0.3786, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.226855993270874, "rewards/margins": 0.9453747868537903, "rewards/rejected": 0.2814810574054718, "step": 190 }, { "epoch": 0.13, "learning_rate": 2.1413276231263384e-08, "logits/chosen": 0.9113885760307312, "logits/rejected": 1.9366668462753296, "logps/chosen": -392.1955871582031, "logps/rejected": -291.93536376953125, "loss": 0.3425, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 1.2265583276748657, "rewards/margins": 1.0303113460540771, "rewards/rejected": 0.19624683260917664, "step": 200 }, { "epoch": 0.13, "eval_logits/chosen": 0.33036381006240845, "eval_logits/rejected": 0.8503063321113586, "eval_logps/chosen": -385.884765625, "eval_logps/rejected": -303.293212890625, "eval_loss": 0.31037652492523193, "eval_rewards/accuracies": 0.9453125, "eval_rewards/chosen": 1.345674753189087, "eval_rewards/margins": 1.1139166355133057, "eval_rewards/rejected": 0.23175781965255737, "eval_runtime": 77.5737, "eval_samples_per_second": 12.891, "eval_steps_per_second": 0.413, "step": 200 }, { "epoch": 0.13, "learning_rate": 2.248394004282655e-08, "logits/chosen": 1.403585433959961, "logits/rejected": 1.621122121810913, "logps/chosen": -385.9792175292969, "logps/rejected": -314.9217529296875, "loss": 0.3509, "rewards/accuracies": 0.9375, "rewards/chosen": 1.412534475326538, "rewards/margins": 1.2651290893554688, "rewards/rejected": 0.1474055051803589, "step": 210 }, { "epoch": 0.14, "learning_rate": 2.355460385438972e-08, "logits/chosen": 0.8744575381278992, "logits/rejected": 2.048430919647217, "logps/chosen": -420.71820068359375, "logps/rejected": -308.5834045410156, "loss": 0.3222, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.4642362594604492, "rewards/margins": 1.3160054683685303, "rewards/rejected": 0.14823095500469208, "step": 220 }, { "epoch": 0.15, "learning_rate": 2.462526766595289e-08, "logits/chosen": 1.1990312337875366, "logits/rejected": 1.7153043746948242, "logps/chosen": -431.5537109375, "logps/rejected": -328.9321594238281, "loss": 0.3009, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6640676259994507, "rewards/margins": 1.5493268966674805, "rewards/rejected": 0.11474086344242096, "step": 230 }, { "epoch": 0.15, "learning_rate": 2.569593147751606e-08, "logits/chosen": 1.2709139585494995, "logits/rejected": 2.0714378356933594, "logps/chosen": -390.81512451171875, "logps/rejected": -353.8885803222656, "loss": 0.2846, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.6071593761444092, "rewards/margins": 1.5323899984359741, "rewards/rejected": 0.07476941496133804, "step": 240 }, { "epoch": 0.16, "learning_rate": 2.676659528907923e-08, "logits/chosen": 1.2884495258331299, "logits/rejected": 1.4190622568130493, "logps/chosen": -402.27105712890625, "logps/rejected": -316.29681396484375, "loss": 0.2703, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.6437135934829712, "rewards/margins": 1.6835206747055054, "rewards/rejected": -0.039806898683309555, "step": 250 }, { "epoch": 0.17, "learning_rate": 2.7837259100642396e-08, "logits/chosen": 1.0041536092758179, "logits/rejected": 1.827345848083496, "logps/chosen": -352.20782470703125, "logps/rejected": -281.7880859375, "loss": 0.2568, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 1.6974023580551147, "rewards/margins": 1.677443265914917, "rewards/rejected": 0.019959043711423874, "step": 260 }, { "epoch": 0.17, "learning_rate": 2.890792291220557e-08, "logits/chosen": 1.280470848083496, "logits/rejected": 2.0490353107452393, "logps/chosen": -396.66546630859375, "logps/rejected": -352.89923095703125, "loss": 0.2423, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.8935625553131104, "rewards/margins": 1.9584296941757202, "rewards/rejected": -0.06486758589744568, "step": 270 }, { "epoch": 0.18, "learning_rate": 2.9978586723768736e-08, "logits/chosen": 1.1651164293289185, "logits/rejected": 1.8686307668685913, "logps/chosen": -369.2472229003906, "logps/rejected": -350.49395751953125, "loss": 0.2369, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.8536313772201538, "rewards/margins": 2.0920298099517822, "rewards/rejected": -0.23839814960956573, "step": 280 }, { "epoch": 0.19, "learning_rate": 3.1049250535331906e-08, "logits/chosen": 0.9764202833175659, "logits/rejected": 1.5498555898666382, "logps/chosen": -401.94708251953125, "logps/rejected": -309.38751220703125, "loss": 0.2172, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.9584842920303345, "rewards/margins": 2.0484821796417236, "rewards/rejected": -0.08999788761138916, "step": 290 }, { "epoch": 0.19, "learning_rate": 3.211991434689507e-08, "logits/chosen": 1.1941782236099243, "logits/rejected": 2.1323885917663574, "logps/chosen": -424.00299072265625, "logps/rejected": -329.6502380371094, "loss": 0.2046, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 2.1664984226226807, "rewards/margins": 2.4187095165252686, "rewards/rejected": -0.2522108256816864, "step": 300 }, { "epoch": 0.19, "eval_logits/chosen": 0.5110462307929993, "eval_logits/rejected": 1.0409064292907715, "eval_logps/chosen": -381.6942138671875, "eval_logps/rejected": -308.5774230957031, "eval_loss": 0.1841452568769455, "eval_rewards/accuracies": 0.9453125, "eval_rewards/chosen": 1.764728307723999, "eval_rewards/margins": 2.061392307281494, "eval_rewards/rejected": -0.2966638207435608, "eval_runtime": 77.5919, "eval_samples_per_second": 12.888, "eval_steps_per_second": 0.412, "step": 300 }, { "epoch": 0.2, "learning_rate": 3.3190578158458246e-08, "logits/chosen": 0.9687066078186035, "logits/rejected": 1.7230606079101562, "logps/chosen": -409.93109130859375, "logps/rejected": -367.07464599609375, "loss": 0.183, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.9146747589111328, "rewards/margins": 2.3619022369384766, "rewards/rejected": -0.4472277760505676, "step": 310 }, { "epoch": 0.21, "learning_rate": 3.426124197002141e-08, "logits/chosen": 0.9972401857376099, "logits/rejected": 2.0866451263427734, "logps/chosen": -459.5816345214844, "logps/rejected": -333.5121154785156, "loss": 0.1805, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.2129783630371094, "rewards/margins": 2.6930830478668213, "rewards/rejected": -0.4801049828529358, "step": 320 }, { "epoch": 0.21, "learning_rate": 3.533190578158458e-08, "logits/chosen": 1.417145013809204, "logits/rejected": 1.8801225423812866, "logps/chosen": -405.06280517578125, "logps/rejected": -345.85809326171875, "loss": 0.1807, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 2.1431164741516113, "rewards/margins": 2.6557106971740723, "rewards/rejected": -0.5125941038131714, "step": 330 }, { "epoch": 0.22, "learning_rate": 3.640256959314775e-08, "logits/chosen": 1.1074305772781372, "logits/rejected": 2.019841194152832, "logps/chosen": -463.04534912109375, "logps/rejected": -334.6724853515625, "loss": 0.1691, "rewards/accuracies": 1.0, "rewards/chosen": 2.148191452026367, "rewards/margins": 2.7028677463531494, "rewards/rejected": -0.5546759366989136, "step": 340 }, { "epoch": 0.22, "learning_rate": 3.747323340471092e-08, "logits/chosen": 1.4934161901474, "logits/rejected": 2.0615015029907227, "logps/chosen": -408.3346862792969, "logps/rejected": -348.22747802734375, "loss": 0.2014, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 2.1981351375579834, "rewards/margins": 2.8584563732147217, "rewards/rejected": -0.6603211164474487, "step": 350 }, { "epoch": 0.23, "learning_rate": 3.854389721627409e-08, "logits/chosen": 1.234470009803772, "logits/rejected": 1.3749644756317139, "logps/chosen": -429.24169921875, "logps/rejected": -359.1957092285156, "loss": 0.1651, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.1755428314208984, "rewards/margins": 3.0690488815307617, "rewards/rejected": -0.8935060501098633, "step": 360 }, { "epoch": 0.24, "learning_rate": 3.961456102783726e-08, "logits/chosen": 1.5004570484161377, "logits/rejected": 2.204867124557495, "logps/chosen": -432.32135009765625, "logps/rejected": -330.5525207519531, "loss": 0.1527, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 2.2549405097961426, "rewards/margins": 2.9733498096466064, "rewards/rejected": -0.7184091806411743, "step": 370 }, { "epoch": 0.24, "learning_rate": 4.068522483940043e-08, "logits/chosen": 1.4254530668258667, "logits/rejected": 1.9733701944351196, "logps/chosen": -372.3633117675781, "logps/rejected": -325.6457824707031, "loss": 0.1565, "rewards/accuracies": 0.9375, "rewards/chosen": 1.8286545276641846, "rewards/margins": 2.7208027839660645, "rewards/rejected": -0.8921481966972351, "step": 380 }, { "epoch": 0.25, "learning_rate": 4.175588865096359e-08, "logits/chosen": 1.5646907091140747, "logits/rejected": 1.819138526916504, "logps/chosen": -377.5379943847656, "logps/rejected": -343.8385925292969, "loss": 0.1354, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.9773420095443726, "rewards/margins": 2.9212684631347656, "rewards/rejected": -0.9439260363578796, "step": 390 }, { "epoch": 0.26, "learning_rate": 4.282655246252677e-08, "logits/chosen": 1.4840887784957886, "logits/rejected": 1.9725840091705322, "logps/chosen": -434.97784423828125, "logps/rejected": -331.9248352050781, "loss": 0.1596, "rewards/accuracies": 0.875, "rewards/chosen": 2.0752744674682617, "rewards/margins": 3.0398762226104736, "rewards/rejected": -0.9646021127700806, "step": 400 }, { "epoch": 0.26, "eval_logits/chosen": 0.6985446214675903, "eval_logits/rejected": 1.232299566268921, "eval_logps/chosen": -381.2061767578125, "eval_logps/rejected": -316.1226806640625, "eval_loss": 0.13778163492679596, "eval_rewards/accuracies": 0.953125, "eval_rewards/chosen": 1.813530683517456, "eval_rewards/margins": 2.8647243976593018, "eval_rewards/rejected": -1.0511937141418457, "eval_runtime": 77.7071, "eval_samples_per_second": 12.869, "eval_steps_per_second": 0.412, "step": 400 }, { "epoch": 0.26, "learning_rate": 4.389721627408993e-08, "logits/chosen": 1.466382622718811, "logits/rejected": 2.043748617172241, "logps/chosen": -327.84527587890625, "logps/rejected": -320.1709899902344, "loss": 0.1566, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 1.6711170673370361, "rewards/margins": 2.701075315475464, "rewards/rejected": -1.0299583673477173, "step": 410 }, { "epoch": 0.27, "learning_rate": 4.49678800856531e-08, "logits/chosen": 0.8124781847000122, "logits/rejected": 2.2421953678131104, "logps/chosen": -400.47650146484375, "logps/rejected": -366.773681640625, "loss": 0.1388, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.0225610733032227, "rewards/margins": 3.3162319660186768, "rewards/rejected": -1.293670654296875, "step": 420 }, { "epoch": 0.28, "learning_rate": 4.603854389721627e-08, "logits/chosen": 1.411001443862915, "logits/rejected": 2.1695659160614014, "logps/chosen": -381.0799255371094, "logps/rejected": -321.12640380859375, "loss": 0.1331, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.094374656677246, "rewards/margins": 3.4128735065460205, "rewards/rejected": -1.3184987306594849, "step": 430 }, { "epoch": 0.28, "learning_rate": 4.710920770877944e-08, "logits/chosen": 1.3558982610702515, "logits/rejected": 2.4410338401794434, "logps/chosen": -366.334716796875, "logps/rejected": -347.8705139160156, "loss": 0.1456, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.165606737136841, "rewards/margins": 3.6063430309295654, "rewards/rejected": -1.4407367706298828, "step": 440 }, { "epoch": 0.29, "learning_rate": 4.817987152034261e-08, "logits/chosen": 1.2681734561920166, "logits/rejected": 2.0421833992004395, "logps/chosen": -482.25091552734375, "logps/rejected": -355.57904052734375, "loss": 0.1114, "rewards/accuracies": 1.0, "rewards/chosen": 2.3174893856048584, "rewards/margins": 3.938343048095703, "rewards/rejected": -1.6208534240722656, "step": 450 }, { "epoch": 0.3, "learning_rate": 4.925053533190578e-08, "logits/chosen": 1.3757874965667725, "logits/rejected": 2.598388195037842, "logps/chosen": -371.0218200683594, "logps/rejected": -349.3445129394531, "loss": 0.1312, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.7905902862548828, "rewards/margins": 3.369292736053467, "rewards/rejected": -1.5787023305892944, "step": 460 }, { "epoch": 0.3, "learning_rate": 5.032119914346895e-08, "logits/chosen": 1.4324853420257568, "logits/rejected": 2.03037428855896, "logps/chosen": -403.80120849609375, "logps/rejected": -359.4237976074219, "loss": 0.0932, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.2499053478240967, "rewards/margins": 3.795927047729492, "rewards/rejected": -1.5460216999053955, "step": 470 }, { "epoch": 0.31, "learning_rate": 5.139186295503212e-08, "logits/chosen": 1.1828899383544922, "logits/rejected": 2.248396873474121, "logps/chosen": -442.19970703125, "logps/rejected": -360.25006103515625, "loss": 0.123, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.486384868621826, "rewards/margins": 4.412066459655762, "rewards/rejected": -1.9256811141967773, "step": 480 }, { "epoch": 0.31, "learning_rate": 5.246252676659528e-08, "logits/chosen": 0.9969785809516907, "logits/rejected": 2.016841173171997, "logps/chosen": -364.4849853515625, "logps/rejected": -346.4527893066406, "loss": 0.1157, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.7432949542999268, "rewards/margins": 3.481992721557617, "rewards/rejected": -1.7386982440948486, "step": 490 }, { "epoch": 0.32, "learning_rate": 5.353319057815846e-08, "logits/chosen": 1.628588080406189, "logits/rejected": 2.0982065200805664, "logps/chosen": -381.443115234375, "logps/rejected": -362.29486083984375, "loss": 0.1153, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.0162081718444824, "rewards/margins": 4.036345958709717, "rewards/rejected": -2.0201380252838135, "step": 500 }, { "epoch": 0.32, "eval_logits/chosen": 0.9163604378700256, "eval_logits/rejected": 1.455054521560669, "eval_logps/chosen": -380.87884521484375, "eval_logps/rejected": -324.88177490234375, "eval_loss": 0.10337930172681808, "eval_rewards/accuracies": 0.9375, "eval_rewards/chosen": 1.8462636470794678, "eval_rewards/margins": 3.773362159729004, "eval_rewards/rejected": -1.9270987510681152, "eval_runtime": 77.7201, "eval_samples_per_second": 12.867, "eval_steps_per_second": 0.412, "step": 500 }, { "epoch": 0.33, "learning_rate": 5.460385438972163e-08, "logits/chosen": 1.4893255233764648, "logits/rejected": 1.7933692932128906, "logps/chosen": -441.28399658203125, "logps/rejected": -357.0665588378906, "loss": 0.0929, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.2136569023132324, "rewards/margins": 4.3935346603393555, "rewards/rejected": -2.179877758026123, "step": 510 }, { "epoch": 0.33, "learning_rate": 5.567451820128479e-08, "logits/chosen": 1.6450592279434204, "logits/rejected": 2.41060209274292, "logps/chosen": -457.15667724609375, "logps/rejected": -374.75115966796875, "loss": 0.0995, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.662111759185791, "rewards/margins": 5.2003607749938965, "rewards/rejected": -2.5382492542266846, "step": 520 }, { "epoch": 0.34, "learning_rate": 5.6745182012847956e-08, "logits/chosen": 1.7802613973617554, "logits/rejected": 2.5348358154296875, "logps/chosen": -378.1894836425781, "logps/rejected": -335.26129150390625, "loss": 0.1062, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.415276050567627, "rewards/margins": 4.646345138549805, "rewards/rejected": -2.231069564819336, "step": 530 }, { "epoch": 0.35, "learning_rate": 5.781584582441114e-08, "logits/chosen": 1.8911815881729126, "logits/rejected": 2.2930028438568115, "logps/chosen": -445.2462463378906, "logps/rejected": -343.48150634765625, "loss": 0.102, "rewards/accuracies": 0.9375, "rewards/chosen": 1.925233244895935, "rewards/margins": 4.143408298492432, "rewards/rejected": -2.2181754112243652, "step": 540 }, { "epoch": 0.35, "learning_rate": 5.88865096359743e-08, "logits/chosen": 1.3622030019760132, "logits/rejected": 2.7591183185577393, "logps/chosen": -417.8857421875, "logps/rejected": -337.2507629394531, "loss": 0.1003, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.4883484840393066, "rewards/margins": 4.856560707092285, "rewards/rejected": -2.3682124614715576, "step": 550 }, { "epoch": 0.36, "learning_rate": 5.995717344753747e-08, "logits/chosen": 1.885152816772461, "logits/rejected": 2.445096492767334, "logps/chosen": -413.9400939941406, "logps/rejected": -347.16864013671875, "loss": 0.1078, "rewards/accuracies": 0.9375, "rewards/chosen": 1.7049505710601807, "rewards/margins": 4.317243576049805, "rewards/rejected": -2.612293243408203, "step": 560 }, { "epoch": 0.37, "learning_rate": 6.102783725910064e-08, "logits/chosen": 1.8548791408538818, "logits/rejected": 2.9488348960876465, "logps/chosen": -399.66888427734375, "logps/rejected": -361.6997375488281, "loss": 0.0833, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.391327381134033, "rewards/margins": 5.426538467407227, "rewards/rejected": -3.0352110862731934, "step": 570 }, { "epoch": 0.37, "learning_rate": 6.209850107066381e-08, "logits/chosen": 1.9805002212524414, "logits/rejected": 2.253380537033081, "logps/chosen": -363.88214111328125, "logps/rejected": -356.90264892578125, "loss": 0.0771, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.9852304458618164, "rewards/margins": 4.773615837097168, "rewards/rejected": -2.7883856296539307, "step": 580 }, { "epoch": 0.38, "learning_rate": 6.316916488222698e-08, "logits/chosen": 1.8189414739608765, "logits/rejected": 2.1621651649475098, "logps/chosen": -376.37664794921875, "logps/rejected": -344.7612609863281, "loss": 0.1041, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.4361340999603271, "rewards/margins": 4.317945957183838, "rewards/rejected": -2.8818118572235107, "step": 590 }, { "epoch": 0.39, "learning_rate": 6.423982869379014e-08, "logits/chosen": 1.9913393259048462, "logits/rejected": 2.2392477989196777, "logps/chosen": -409.7967834472656, "logps/rejected": -371.5293884277344, "loss": 0.0904, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.0530600547790527, "rewards/margins": 5.140936851501465, "rewards/rejected": -3.087876081466675, "step": 600 }, { "epoch": 0.39, "eval_logits/chosen": 1.1242132186889648, "eval_logits/rejected": 1.6480361223220825, "eval_logps/chosen": -383.8947448730469, "eval_logps/rejected": -335.6293029785156, "eval_loss": 0.08573687076568604, "eval_rewards/accuracies": 0.9453125, "eval_rewards/chosen": 1.5446751117706299, "eval_rewards/margins": 4.5465288162231445, "eval_rewards/rejected": -3.0018532276153564, "eval_runtime": 77.586, "eval_samples_per_second": 12.889, "eval_steps_per_second": 0.412, "step": 600 }, { "epoch": 0.39, "learning_rate": 6.531049250535332e-08, "logits/chosen": 1.5741275548934937, "logits/rejected": 2.2887330055236816, "logps/chosen": -405.8610534667969, "logps/rejected": -368.2715759277344, "loss": 0.0841, "rewards/accuracies": 1.0, "rewards/chosen": 1.9225307703018188, "rewards/margins": 5.187340259552002, "rewards/rejected": -3.2648093700408936, "step": 610 }, { "epoch": 0.4, "learning_rate": 6.638115631691649e-08, "logits/chosen": 1.3936151266098022, "logits/rejected": 2.8601880073547363, "logps/chosen": -414.2528381347656, "logps/rejected": -394.7596435546875, "loss": 0.079, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.7199567556381226, "rewards/margins": 5.62530517578125, "rewards/rejected": -3.905348300933838, "step": 620 }, { "epoch": 0.4, "learning_rate": 6.745182012847965e-08, "logits/chosen": 1.7785946130752563, "logits/rejected": 2.8865761756896973, "logps/chosen": -429.94580078125, "logps/rejected": -385.0089416503906, "loss": 0.085, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.8962761163711548, "rewards/margins": 5.370936393737793, "rewards/rejected": -3.474660873413086, "step": 630 }, { "epoch": 0.41, "learning_rate": 6.852248394004282e-08, "logits/chosen": 1.801674246788025, "logits/rejected": 2.2879269123077393, "logps/chosen": -424.087158203125, "logps/rejected": -360.2523498535156, "loss": 0.0998, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.9135773181915283, "rewards/margins": 5.04227352142334, "rewards/rejected": -3.1286959648132324, "step": 640 }, { "epoch": 0.42, "learning_rate": 6.9593147751606e-08, "logits/chosen": 1.8188073635101318, "logits/rejected": 2.1497268676757812, "logps/chosen": -423.1458435058594, "logps/rejected": -369.9307861328125, "loss": 0.082, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.321187138557434, "rewards/margins": 4.85211706161499, "rewards/rejected": -3.5309300422668457, "step": 650 }, { "epoch": 0.42, "learning_rate": 7.066381156316916e-08, "logits/chosen": 1.7404190301895142, "logits/rejected": 2.677952289581299, "logps/chosen": -402.8115234375, "logps/rejected": -340.89337158203125, "loss": 0.0725, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 2.031486749649048, "rewards/margins": 5.562923908233643, "rewards/rejected": -3.531437397003174, "step": 660 }, { "epoch": 0.43, "learning_rate": 7.173447537473233e-08, "logits/chosen": 1.8731329441070557, "logits/rejected": 2.752986431121826, "logps/chosen": -387.5521240234375, "logps/rejected": -344.8697204589844, "loss": 0.0797, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.3953731060028076, "rewards/margins": 5.006608963012695, "rewards/rejected": -3.611236095428467, "step": 670 }, { "epoch": 0.44, "learning_rate": 7.28051391862955e-08, "logits/chosen": 1.765631914138794, "logits/rejected": 2.9511940479278564, "logps/chosen": -412.4698791503906, "logps/rejected": -378.50537109375, "loss": 0.0922, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.7086801528930664, "rewards/margins": 5.677321434020996, "rewards/rejected": -3.968641757965088, "step": 680 }, { "epoch": 0.44, "learning_rate": 7.387580299785867e-08, "logits/chosen": 1.7808834314346313, "logits/rejected": 2.730713367462158, "logps/chosen": -375.7123718261719, "logps/rejected": -352.998046875, "loss": 0.0794, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.92323899269104, "rewards/margins": 5.590106010437012, "rewards/rejected": -3.6668670177459717, "step": 690 }, { "epoch": 0.45, "learning_rate": 7.494646680942184e-08, "logits/chosen": 1.7647113800048828, "logits/rejected": 2.6917636394500732, "logps/chosen": -393.0909729003906, "logps/rejected": -375.1470947265625, "loss": 0.0754, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.228740930557251, "rewards/margins": 6.255539417266846, "rewards/rejected": -4.026798248291016, "step": 700 }, { "epoch": 0.45, "eval_logits/chosen": 1.1851357221603394, "eval_logits/rejected": 1.7113410234451294, "eval_logps/chosen": -382.10595703125, "eval_logps/rejected": -341.21160888671875, "eval_loss": 0.07382317632436752, "eval_rewards/accuracies": 0.96875, "eval_rewards/chosen": 1.7235567569732666, "eval_rewards/margins": 5.283637523651123, "eval_rewards/rejected": -3.5600812435150146, "eval_runtime": 78.3559, "eval_samples_per_second": 12.762, "eval_steps_per_second": 0.408, "step": 700 }, { "epoch": 0.46, "learning_rate": 7.601713062098501e-08, "logits/chosen": 1.8762671947479248, "logits/rejected": 2.7768630981445312, "logps/chosen": -403.41461181640625, "logps/rejected": -348.1267395019531, "loss": 0.0767, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.008270502090454, "rewards/margins": 5.45863151550293, "rewards/rejected": -3.4503607749938965, "step": 710 }, { "epoch": 0.46, "learning_rate": 7.708779443254818e-08, "logits/chosen": 1.9685356616973877, "logits/rejected": 2.4259753227233887, "logps/chosen": -423.30694580078125, "logps/rejected": -355.1989440917969, "loss": 0.0738, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.1973319053649902, "rewards/margins": 5.911899566650391, "rewards/rejected": -3.7145678997039795, "step": 720 }, { "epoch": 0.47, "learning_rate": 7.815845824411135e-08, "logits/chosen": 1.7616288661956787, "logits/rejected": 2.778398036956787, "logps/chosen": -415.79833984375, "logps/rejected": -365.4802551269531, "loss": 0.0838, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.6791164875030518, "rewards/margins": 5.412492275238037, "rewards/rejected": -3.7333762645721436, "step": 730 }, { "epoch": 0.48, "learning_rate": 7.922912205567452e-08, "logits/chosen": 1.911268949508667, "logits/rejected": 2.5959761142730713, "logps/chosen": -399.43463134765625, "logps/rejected": -380.5097961425781, "loss": 0.0822, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.497511625289917, "rewards/margins": 5.510095596313477, "rewards/rejected": -4.0125837326049805, "step": 740 }, { "epoch": 0.48, "learning_rate": 8.029978586723767e-08, "logits/chosen": 1.8361726999282837, "logits/rejected": 2.3821051120758057, "logps/chosen": -372.65618896484375, "logps/rejected": -381.7906799316406, "loss": 0.0662, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.844112753868103, "rewards/margins": 6.229226589202881, "rewards/rejected": -4.385113716125488, "step": 750 }, { "epoch": 0.49, "learning_rate": 8.137044967880086e-08, "logits/chosen": 2.179152011871338, "logits/rejected": 2.5297486782073975, "logps/chosen": -396.2829284667969, "logps/rejected": -352.2908935546875, "loss": 0.0683, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.0879392623901367, "rewards/margins": 6.044869422912598, "rewards/rejected": -3.956930160522461, "step": 760 }, { "epoch": 0.49, "learning_rate": 8.244111349036403e-08, "logits/chosen": 1.5910015106201172, "logits/rejected": 2.9595401287078857, "logps/chosen": -386.81573486328125, "logps/rejected": -350.80303955078125, "loss": 0.0697, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.7900558710098267, "rewards/margins": 5.987098693847656, "rewards/rejected": -4.197042942047119, "step": 770 }, { "epoch": 0.5, "learning_rate": 8.351177730192718e-08, "logits/chosen": 1.8572914600372314, "logits/rejected": 2.7143654823303223, "logps/chosen": -416.63238525390625, "logps/rejected": -403.2386169433594, "loss": 0.0668, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.9369878768920898, "rewards/margins": 6.391732692718506, "rewards/rejected": -4.4547438621521, "step": 780 }, { "epoch": 0.51, "learning_rate": 8.458244111349035e-08, "logits/chosen": 1.7877603769302368, "logits/rejected": 3.0778090953826904, "logps/chosen": -436.93121337890625, "logps/rejected": -375.30499267578125, "loss": 0.0639, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.7189620733261108, "rewards/margins": 6.110939979553223, "rewards/rejected": -4.391977787017822, "step": 790 }, { "epoch": 0.51, "learning_rate": 8.565310492505354e-08, "logits/chosen": 1.8236795663833618, "logits/rejected": 3.053520441055298, "logps/chosen": -440.10711669921875, "logps/rejected": -371.2597351074219, "loss": 0.0748, "rewards/accuracies": 1.0, "rewards/chosen": 1.6093488931655884, "rewards/margins": 5.858896732330322, "rewards/rejected": -4.249547004699707, "step": 800 }, { "epoch": 0.51, "eval_logits/chosen": 1.2858448028564453, "eval_logits/rejected": 1.8164113759994507, "eval_logps/chosen": -386.32391357421875, "eval_logps/rejected": -349.60479736328125, "eval_loss": 0.0711909607052803, "eval_rewards/accuracies": 0.9765625, "eval_rewards/chosen": 1.3017570972442627, "eval_rewards/margins": 5.701159477233887, "eval_rewards/rejected": -4.399402618408203, "eval_runtime": 78.5065, "eval_samples_per_second": 12.738, "eval_steps_per_second": 0.408, "step": 800 }, { "epoch": 0.52, "learning_rate": 8.672376873661669e-08, "logits/chosen": 2.046654462814331, "logits/rejected": 2.875682830810547, "logps/chosen": -367.9713134765625, "logps/rejected": -358.0971984863281, "loss": 0.0733, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.6088149547576904, "rewards/margins": 6.157548427581787, "rewards/rejected": -4.548734188079834, "step": 810 }, { "epoch": 0.53, "learning_rate": 8.779443254817986e-08, "logits/chosen": 2.2108118534088135, "logits/rejected": 2.811412811279297, "logps/chosen": -388.9371032714844, "logps/rejected": -359.3924255371094, "loss": 0.0554, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.500943899154663, "rewards/margins": 6.306944370269775, "rewards/rejected": -4.806000709533691, "step": 820 }, { "epoch": 0.53, "learning_rate": 8.886509635974304e-08, "logits/chosen": 1.7628936767578125, "logits/rejected": 2.9087016582489014, "logps/chosen": -436.14886474609375, "logps/rejected": -375.5445251464844, "loss": 0.0587, "rewards/accuracies": 0.9375, "rewards/chosen": 1.7326514720916748, "rewards/margins": 6.393059730529785, "rewards/rejected": -4.660407543182373, "step": 830 }, { "epoch": 0.54, "learning_rate": 8.99357601713062e-08, "logits/chosen": 1.880814552307129, "logits/rejected": 2.7161240577697754, "logps/chosen": -395.0550231933594, "logps/rejected": -363.26849365234375, "loss": 0.0616, "rewards/accuracies": 0.9375, "rewards/chosen": 1.268677830696106, "rewards/margins": 5.726746559143066, "rewards/rejected": -4.45806884765625, "step": 840 }, { "epoch": 0.55, "learning_rate": 9.100642398286937e-08, "logits/chosen": 1.7003024816513062, "logits/rejected": 2.424133777618408, "logps/chosen": -389.845703125, "logps/rejected": -358.5777282714844, "loss": 0.0776, "rewards/accuracies": 0.9375, "rewards/chosen": 1.434398889541626, "rewards/margins": 5.904941082000732, "rewards/rejected": -4.470543384552002, "step": 850 }, { "epoch": 0.55, "learning_rate": 9.207708779443254e-08, "logits/chosen": 1.8526960611343384, "logits/rejected": 2.986250638961792, "logps/chosen": -414.87744140625, "logps/rejected": -394.83477783203125, "loss": 0.0555, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.5953035354614258, "rewards/margins": 6.830922603607178, "rewards/rejected": -5.23561954498291, "step": 860 }, { "epoch": 0.56, "learning_rate": 9.314775160599571e-08, "logits/chosen": 2.066636800765991, "logits/rejected": 2.8474645614624023, "logps/chosen": -422.1144104003906, "logps/rejected": -373.4290466308594, "loss": 0.059, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.4623820781707764, "rewards/margins": 6.585521697998047, "rewards/rejected": -5.123138904571533, "step": 870 }, { "epoch": 0.57, "learning_rate": 9.421841541755888e-08, "logits/chosen": 2.3123459815979004, "logits/rejected": 2.8036293983459473, "logps/chosen": -407.44525146484375, "logps/rejected": -358.09771728515625, "loss": 0.0736, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 1.7242000102996826, "rewards/margins": 6.361567974090576, "rewards/rejected": -4.637368202209473, "step": 880 }, { "epoch": 0.57, "learning_rate": 9.528907922912205e-08, "logits/chosen": 1.5694644451141357, "logits/rejected": 2.9323623180389404, "logps/chosen": -440.52264404296875, "logps/rejected": -416.00543212890625, "loss": 0.0513, "rewards/accuracies": 1.0, "rewards/chosen": 2.229987621307373, "rewards/margins": 7.332463264465332, "rewards/rejected": -5.102475166320801, "step": 890 }, { "epoch": 0.58, "learning_rate": 9.635974304068522e-08, "logits/chosen": 1.9838998317718506, "logits/rejected": 3.293696165084839, "logps/chosen": -377.60546875, "logps/rejected": -400.6976013183594, "loss": 0.0418, "rewards/accuracies": 1.0, "rewards/chosen": 1.361082911491394, "rewards/margins": 7.032995700836182, "rewards/rejected": -5.6719136238098145, "step": 900 }, { "epoch": 0.58, "eval_logits/chosen": 1.431064486503601, "eval_logits/rejected": 1.926888108253479, "eval_logps/chosen": -389.20965576171875, "eval_logps/rejected": -357.00994873046875, "eval_loss": 0.06582893431186676, "eval_rewards/accuracies": 0.96875, "eval_rewards/chosen": 1.0131824016571045, "eval_rewards/margins": 6.153097629547119, "eval_rewards/rejected": -5.1399149894714355, "eval_runtime": 78.1749, "eval_samples_per_second": 12.792, "eval_steps_per_second": 0.409, "step": 900 }, { "epoch": 0.58, "learning_rate": 9.743040685224839e-08, "logits/chosen": 2.3193790912628174, "logits/rejected": 2.9782004356384277, "logps/chosen": -355.58978271484375, "logps/rejected": -374.8011779785156, "loss": 0.0504, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.7792609930038452, "rewards/margins": 6.809684753417969, "rewards/rejected": -5.030424118041992, "step": 910 }, { "epoch": 0.59, "learning_rate": 9.850107066381156e-08, "logits/chosen": 1.8106858730316162, "logits/rejected": 2.700303316116333, "logps/chosen": -436.91912841796875, "logps/rejected": -376.2388610839844, "loss": 0.0524, "rewards/accuracies": 0.9375, "rewards/chosen": 2.020312786102295, "rewards/margins": 7.166808128356934, "rewards/rejected": -5.146495342254639, "step": 920 }, { "epoch": 0.6, "learning_rate": 9.957173447537473e-08, "logits/chosen": 1.956599235534668, "logits/rejected": 2.98834490776062, "logps/chosen": -432.9703063964844, "logps/rejected": -390.3727722167969, "loss": 0.0802, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.9984395503997803, "rewards/margins": 7.112124443054199, "rewards/rejected": -5.11368465423584, "step": 930 }, { "epoch": 0.6, "learning_rate": 9.992858843132586e-08, "logits/chosen": 2.160102605819702, "logits/rejected": 2.821474313735962, "logps/chosen": -424.42962646484375, "logps/rejected": -402.60235595703125, "loss": 0.059, "rewards/accuracies": 1.0, "rewards/chosen": 2.0432326793670654, "rewards/margins": 7.653304100036621, "rewards/rejected": -5.610072135925293, "step": 940 }, { "epoch": 0.61, "learning_rate": 9.980956915020233e-08, "logits/chosen": 1.941819190979004, "logits/rejected": 3.0604381561279297, "logps/chosen": -397.3748474121094, "logps/rejected": -423.2496643066406, "loss": 0.0557, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.0661425590515137, "rewards/margins": 7.666254997253418, "rewards/rejected": -5.600112438201904, "step": 950 }, { "epoch": 0.62, "learning_rate": 9.969054986907879e-08, "logits/chosen": 1.859452247619629, "logits/rejected": 3.1880416870117188, "logps/chosen": -423.7708435058594, "logps/rejected": -383.41259765625, "loss": 0.0419, "rewards/accuracies": 1.0, "rewards/chosen": 1.7157636880874634, "rewards/margins": 7.163266181945801, "rewards/rejected": -5.447502136230469, "step": 960 }, { "epoch": 0.62, "learning_rate": 9.957153058795524e-08, "logits/chosen": 1.755894660949707, "logits/rejected": 2.667271852493286, "logps/chosen": -405.3694763183594, "logps/rejected": -380.8141784667969, "loss": 0.0777, "rewards/accuracies": 0.9375, "rewards/chosen": 1.459249496459961, "rewards/margins": 6.949099540710449, "rewards/rejected": -5.489850044250488, "step": 970 }, { "epoch": 0.63, "learning_rate": 9.94525113068317e-08, "logits/chosen": 1.9811556339263916, "logits/rejected": 3.005946397781372, "logps/chosen": -364.09521484375, "logps/rejected": -364.8800048828125, "loss": 0.0533, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.8379303216934204, "rewards/margins": 6.930338382720947, "rewards/rejected": -5.092407703399658, "step": 980 }, { "epoch": 0.64, "learning_rate": 9.933349202570817e-08, "logits/chosen": 1.7156349420547485, "logits/rejected": 2.972471237182617, "logps/chosen": -424.38519287109375, "logps/rejected": -384.4205627441406, "loss": 0.0545, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.169966459274292, "rewards/margins": 7.766200065612793, "rewards/rejected": -5.596234321594238, "step": 990 }, { "epoch": 0.64, "learning_rate": 9.921447274458463e-08, "logits/chosen": 1.65463387966156, "logits/rejected": 3.0111804008483887, "logps/chosen": -466.137451171875, "logps/rejected": -387.94317626953125, "loss": 0.0597, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.5395125150680542, "rewards/margins": 7.153542518615723, "rewards/rejected": -5.614029884338379, "step": 1000 }, { "epoch": 0.64, "eval_logits/chosen": 1.426473617553711, "eval_logits/rejected": 1.9225867986679077, "eval_logps/chosen": -391.24505615234375, "eval_logps/rejected": -361.6107482910156, "eval_loss": 0.06333575397729874, "eval_rewards/accuracies": 0.96875, "eval_rewards/chosen": 0.8096399307250977, "eval_rewards/margins": 6.409637928009033, "eval_rewards/rejected": -5.5999979972839355, "eval_runtime": 78.1719, "eval_samples_per_second": 12.792, "eval_steps_per_second": 0.409, "step": 1000 }, { "epoch": 0.65, "learning_rate": 9.909545346346108e-08, "logits/chosen": 2.0751636028289795, "logits/rejected": 2.8259801864624023, "logps/chosen": -391.50396728515625, "logps/rejected": -390.41680908203125, "loss": 0.0479, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.0614975690841675, "rewards/margins": 6.492199897766113, "rewards/rejected": -5.430701732635498, "step": 1010 }, { "epoch": 0.66, "learning_rate": 9.897643418233753e-08, "logits/chosen": 1.8207337856292725, "logits/rejected": 2.810199737548828, "logps/chosen": -412.99658203125, "logps/rejected": -401.36126708984375, "loss": 0.0404, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.5562200546264648, "rewards/margins": 8.119148254394531, "rewards/rejected": -6.562928199768066, "step": 1020 }, { "epoch": 0.66, "learning_rate": 9.885741490121398e-08, "logits/chosen": 2.1346724033355713, "logits/rejected": 2.99312424659729, "logps/chosen": -427.60601806640625, "logps/rejected": -379.160400390625, "loss": 0.0493, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.149390697479248, "rewards/margins": 7.403092384338379, "rewards/rejected": -5.253701210021973, "step": 1030 }, { "epoch": 0.67, "learning_rate": 9.873839562009045e-08, "logits/chosen": 1.81674063205719, "logits/rejected": 3.14642071723938, "logps/chosen": -410.53436279296875, "logps/rejected": -422.78790283203125, "loss": 0.0625, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.664518117904663, "rewards/margins": 7.533532619476318, "rewards/rejected": -5.869014739990234, "step": 1040 }, { "epoch": 0.67, "learning_rate": 9.861937633896691e-08, "logits/chosen": 2.1859567165374756, "logits/rejected": 2.774512767791748, "logps/chosen": -371.1358642578125, "logps/rejected": -376.1512451171875, "loss": 0.0397, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8732522130012512, "rewards/margins": 6.792318820953369, "rewards/rejected": -5.919065952301025, "step": 1050 }, { "epoch": 0.68, "learning_rate": 9.850035705784336e-08, "logits/chosen": 2.508104085922241, "logits/rejected": 2.7455363273620605, "logps/chosen": -438.8089294433594, "logps/rejected": -426.82464599609375, "loss": 0.0433, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.799186110496521, "rewards/margins": 8.152082443237305, "rewards/rejected": -6.352896690368652, "step": 1060 }, { "epoch": 0.69, "learning_rate": 9.838133777671982e-08, "logits/chosen": 2.162297010421753, "logits/rejected": 2.9005820751190186, "logps/chosen": -393.8660583496094, "logps/rejected": -402.96649169921875, "loss": 0.0523, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.1076323986053467, "rewards/margins": 7.283668518066406, "rewards/rejected": -6.1760358810424805, "step": 1070 }, { "epoch": 0.69, "learning_rate": 9.826231849559629e-08, "logits/chosen": 2.274879217147827, "logits/rejected": 2.933121681213379, "logps/chosen": -436.8518981933594, "logps/rejected": -405.71246337890625, "loss": 0.0622, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.486023187637329, "rewards/margins": 7.599495887756348, "rewards/rejected": -6.1134724617004395, "step": 1080 }, { "epoch": 0.7, "learning_rate": 9.814329921447275e-08, "logits/chosen": 2.3372159004211426, "logits/rejected": 2.4765231609344482, "logps/chosen": -394.52398681640625, "logps/rejected": -383.6842956542969, "loss": 0.047, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.5595790147781372, "rewards/margins": 7.81160831451416, "rewards/rejected": -6.2520294189453125, "step": 1090 }, { "epoch": 0.71, "learning_rate": 9.80242799333492e-08, "logits/chosen": 2.076772689819336, "logits/rejected": 3.134021282196045, "logps/chosen": -424.93914794921875, "logps/rejected": -395.6697998046875, "loss": 0.045, "rewards/accuracies": 1.0, "rewards/chosen": 1.405705451965332, "rewards/margins": 7.619529724121094, "rewards/rejected": -6.213824272155762, "step": 1100 }, { "epoch": 0.71, "eval_logits/chosen": 1.5631608963012695, "eval_logits/rejected": 2.0626633167266846, "eval_logps/chosen": -392.5325012207031, "eval_logps/rejected": -367.3223876953125, "eval_loss": 0.060555677860975266, "eval_rewards/accuracies": 0.9609375, "eval_rewards/chosen": 0.6809001564979553, "eval_rewards/margins": 6.852060317993164, "eval_rewards/rejected": -6.171159744262695, "eval_runtime": 78.2923, "eval_samples_per_second": 12.773, "eval_steps_per_second": 0.409, "step": 1100 }, { "epoch": 0.71, "learning_rate": 9.790526065222565e-08, "logits/chosen": 2.313490629196167, "logits/rejected": 2.873136281967163, "logps/chosen": -429.7516174316406, "logps/rejected": -405.4490051269531, "loss": 0.0419, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.5168659687042236, "rewards/margins": 7.972109794616699, "rewards/rejected": -6.455244541168213, "step": 1110 }, { "epoch": 0.72, "learning_rate": 9.778624137110211e-08, "logits/chosen": 2.1459672451019287, "logits/rejected": 2.9044458866119385, "logps/chosen": -420.87744140625, "logps/rejected": -377.68896484375, "loss": 0.0336, "rewards/accuracies": 1.0, "rewards/chosen": 1.6729495525360107, "rewards/margins": 7.566412925720215, "rewards/rejected": -5.893463134765625, "step": 1120 }, { "epoch": 0.73, "learning_rate": 9.766722208997857e-08, "logits/chosen": 2.3557143211364746, "logits/rejected": 2.725691318511963, "logps/chosen": -405.68109130859375, "logps/rejected": -419.1329040527344, "loss": 0.0552, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.7717654705047607, "rewards/margins": 8.059925079345703, "rewards/rejected": -6.288159370422363, "step": 1130 }, { "epoch": 0.73, "learning_rate": 9.754820280885503e-08, "logits/chosen": 2.216296672821045, "logits/rejected": 2.695742130279541, "logps/chosen": -415.5310974121094, "logps/rejected": -400.45263671875, "loss": 0.0422, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.6496362686157227, "rewards/margins": 7.936199188232422, "rewards/rejected": -6.286562919616699, "step": 1140 }, { "epoch": 0.74, "learning_rate": 9.742918352773148e-08, "logits/chosen": 2.093048572540283, "logits/rejected": 2.7978243827819824, "logps/chosen": -450.3804626464844, "logps/rejected": -412.62908935546875, "loss": 0.0573, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.3106613159179688, "rewards/margins": 8.593067169189453, "rewards/rejected": -6.282405853271484, "step": 1150 }, { "epoch": 0.75, "learning_rate": 9.731016424660795e-08, "logits/chosen": 2.472761869430542, "logits/rejected": 3.0465588569641113, "logps/chosen": -426.41412353515625, "logps/rejected": -407.08612060546875, "loss": 0.0443, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.5518306493759155, "rewards/margins": 8.157126426696777, "rewards/rejected": -6.6052961349487305, "step": 1160 }, { "epoch": 0.75, "learning_rate": 9.719114496548441e-08, "logits/chosen": 1.941318154335022, "logits/rejected": 3.335367202758789, "logps/chosen": -418.33428955078125, "logps/rejected": -403.39337158203125, "loss": 0.0404, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.0809710025787354, "rewards/margins": 8.985780715942383, "rewards/rejected": -6.90480899810791, "step": 1170 }, { "epoch": 0.76, "learning_rate": 9.707212568436087e-08, "logits/chosen": 2.241579532623291, "logits/rejected": 2.8625073432922363, "logps/chosen": -408.77178955078125, "logps/rejected": -389.53778076171875, "loss": 0.0384, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.2259656190872192, "rewards/margins": 8.003189086914062, "rewards/rejected": -6.777223110198975, "step": 1180 }, { "epoch": 0.76, "learning_rate": 9.695310640323732e-08, "logits/chosen": 2.914790630340576, "logits/rejected": 2.8142731189727783, "logps/chosen": -387.0242614746094, "logps/rejected": -399.86749267578125, "loss": 0.0371, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.323935627937317, "rewards/margins": 8.005804061889648, "rewards/rejected": -6.681868553161621, "step": 1190 }, { "epoch": 0.77, "learning_rate": 9.683408712211378e-08, "logits/chosen": 2.1048951148986816, "logits/rejected": 2.841226816177368, "logps/chosen": -451.27459716796875, "logps/rejected": -403.28240966796875, "loss": 0.0669, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.7710305452346802, "rewards/margins": 7.861186981201172, "rewards/rejected": -7.090156555175781, "step": 1200 }, { "epoch": 0.77, "eval_logits/chosen": 1.5480470657348633, "eval_logits/rejected": 2.058847188949585, "eval_logps/chosen": -392.0873718261719, "eval_logps/rejected": -371.0377197265625, "eval_loss": 0.06311403959989548, "eval_rewards/accuracies": 0.9609375, "eval_rewards/chosen": 0.7254116535186768, "eval_rewards/margins": 7.268110275268555, "eval_rewards/rejected": -6.542698860168457, "eval_runtime": 78.0374, "eval_samples_per_second": 12.814, "eval_steps_per_second": 0.41, "step": 1200 }, { "epoch": 0.78, "learning_rate": 9.671506784099024e-08, "logits/chosen": 2.5204672813415527, "logits/rejected": 3.2195708751678467, "logps/chosen": -350.8345031738281, "logps/rejected": -381.27569580078125, "loss": 0.0321, "rewards/accuracies": 1.0, "rewards/chosen": 1.312830924987793, "rewards/margins": 8.153268814086914, "rewards/rejected": -6.840437889099121, "step": 1210 }, { "epoch": 0.78, "learning_rate": 9.659604855986669e-08, "logits/chosen": 2.0493383407592773, "logits/rejected": 2.925226926803589, "logps/chosen": -462.79638671875, "logps/rejected": -402.0813903808594, "loss": 0.0583, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.8163875341415405, "rewards/margins": 8.08704948425293, "rewards/rejected": -6.270661354064941, "step": 1220 }, { "epoch": 0.79, "learning_rate": 9.647702927874315e-08, "logits/chosen": 2.3467869758605957, "logits/rejected": 2.963789463043213, "logps/chosen": -448.3075256347656, "logps/rejected": -411.7268981933594, "loss": 0.0308, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.1376395225524902, "rewards/margins": 8.935708999633789, "rewards/rejected": -6.798068046569824, "step": 1230 }, { "epoch": 0.8, "learning_rate": 9.63580099976196e-08, "logits/chosen": 2.0899271965026855, "logits/rejected": 3.0291128158569336, "logps/chosen": -395.5885009765625, "logps/rejected": -395.0786437988281, "loss": 0.0437, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.3425300121307373, "rewards/margins": 7.873586177825928, "rewards/rejected": -6.5310564041137695, "step": 1240 }, { "epoch": 0.8, "learning_rate": 9.623899071649607e-08, "logits/chosen": 2.6475844383239746, "logits/rejected": 3.3692593574523926, "logps/chosen": -355.3011169433594, "logps/rejected": -404.4001770019531, "loss": 0.036, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.1569859981536865, "rewards/margins": 8.337442398071289, "rewards/rejected": -6.18045711517334, "step": 1250 }, { "epoch": 0.81, "learning_rate": 9.611997143537253e-08, "logits/chosen": 2.364384412765503, "logits/rejected": 2.7631328105926514, "logps/chosen": -441.07391357421875, "logps/rejected": -394.32122802734375, "loss": 0.0434, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.301563024520874, "rewards/margins": 8.577180862426758, "rewards/rejected": -6.275616645812988, "step": 1260 }, { "epoch": 0.82, "learning_rate": 9.600095215424899e-08, "logits/chosen": 2.9918723106384277, "logits/rejected": 3.4367504119873047, "logps/chosen": -358.42230224609375, "logps/rejected": -392.7529296875, "loss": 0.0499, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.5515286922454834, "rewards/margins": 8.478203773498535, "rewards/rejected": -6.926675319671631, "step": 1270 }, { "epoch": 0.82, "learning_rate": 9.588193287312544e-08, "logits/chosen": 2.2631287574768066, "logits/rejected": 3.006317138671875, "logps/chosen": -417.509033203125, "logps/rejected": -430.56353759765625, "loss": 0.0396, "rewards/accuracies": 1.0, "rewards/chosen": 1.066652536392212, "rewards/margins": 8.158655166625977, "rewards/rejected": -7.092002868652344, "step": 1280 }, { "epoch": 0.83, "learning_rate": 9.57629135920019e-08, "logits/chosen": 2.1602940559387207, "logits/rejected": 3.0440659523010254, "logps/chosen": -381.5716247558594, "logps/rejected": -442.3666076660156, "loss": 0.0371, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.255479335784912, "rewards/margins": 10.695677757263184, "rewards/rejected": -9.440199851989746, "step": 1290 }, { "epoch": 0.84, "learning_rate": 9.564389431087836e-08, "logits/chosen": 2.3889143466949463, "logits/rejected": 3.3642821311950684, "logps/chosen": -386.8828430175781, "logps/rejected": -414.80157470703125, "loss": 0.0349, "rewards/accuracies": 1.0, "rewards/chosen": 0.5850082635879517, "rewards/margins": 7.8508620262146, "rewards/rejected": -7.265854835510254, "step": 1300 }, { "epoch": 0.84, "eval_logits/chosen": 1.5706590414047241, "eval_logits/rejected": 2.0521459579467773, "eval_logps/chosen": -390.2462158203125, "eval_logps/rejected": -372.8173522949219, "eval_loss": 0.05391751974821091, "eval_rewards/accuracies": 0.9609375, "eval_rewards/chosen": 0.9095280170440674, "eval_rewards/margins": 7.630187034606934, "eval_rewards/rejected": -6.720658302307129, "eval_runtime": 77.9662, "eval_samples_per_second": 12.826, "eval_steps_per_second": 0.41, "step": 1300 }, { "epoch": 0.84, "learning_rate": 9.552487502975481e-08, "logits/chosen": 2.109424114227295, "logits/rejected": 3.372615098953247, "logps/chosen": -425.7452087402344, "logps/rejected": -420.9171447753906, "loss": 0.0561, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.8334258794784546, "rewards/margins": 8.975770950317383, "rewards/rejected": -7.142345428466797, "step": 1310 }, { "epoch": 0.85, "learning_rate": 9.540585574863127e-08, "logits/chosen": 1.9445127248764038, "logits/rejected": 2.690776824951172, "logps/chosen": -418.9251403808594, "logps/rejected": -406.52813720703125, "loss": 0.0519, "rewards/accuracies": 1.0, "rewards/chosen": 1.2240526676177979, "rewards/margins": 8.488465309143066, "rewards/rejected": -7.264412879943848, "step": 1320 }, { "epoch": 0.85, "learning_rate": 9.528683646750774e-08, "logits/chosen": 2.327258586883545, "logits/rejected": 3.642822265625, "logps/chosen": -422.3324279785156, "logps/rejected": -427.5508728027344, "loss": 0.0439, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.9490644335746765, "rewards/margins": 8.496426582336426, "rewards/rejected": -7.547361850738525, "step": 1330 }, { "epoch": 0.86, "learning_rate": 9.51678171863842e-08, "logits/chosen": 2.0918898582458496, "logits/rejected": 3.0718982219696045, "logps/chosen": -451.8070373535156, "logps/rejected": -396.4839782714844, "loss": 0.0282, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.8088127374649048, "rewards/margins": 8.987968444824219, "rewards/rejected": -7.179154872894287, "step": 1340 }, { "epoch": 0.87, "learning_rate": 9.504879790526065e-08, "logits/chosen": 2.251584529876709, "logits/rejected": 2.643411636352539, "logps/chosen": -390.68804931640625, "logps/rejected": -401.1186218261719, "loss": 0.0389, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.3335015773773193, "rewards/margins": 8.997198104858398, "rewards/rejected": -7.663697242736816, "step": 1350 }, { "epoch": 0.87, "learning_rate": 9.49297786241371e-08, "logits/chosen": 2.6715595722198486, "logits/rejected": 2.812282085418701, "logps/chosen": -398.098388671875, "logps/rejected": -429.861083984375, "loss": 0.0371, "rewards/accuracies": 1.0, "rewards/chosen": 1.4172677993774414, "rewards/margins": 9.3431396484375, "rewards/rejected": -7.925871849060059, "step": 1360 }, { "epoch": 0.88, "learning_rate": 9.481075934301356e-08, "logits/chosen": 2.6521875858306885, "logits/rejected": 3.3430676460266113, "logps/chosen": -384.2088928222656, "logps/rejected": -394.98565673828125, "loss": 0.0281, "rewards/accuracies": 1.0, "rewards/chosen": 1.5810503959655762, "rewards/margins": 8.41893482208252, "rewards/rejected": -6.837882995605469, "step": 1370 }, { "epoch": 0.89, "learning_rate": 9.469174006189002e-08, "logits/chosen": 1.8557714223861694, "logits/rejected": 2.911968946456909, "logps/chosen": -493.2205505371094, "logps/rejected": -419.2433166503906, "loss": 0.038, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.404146432876587, "rewards/margins": 9.705537796020508, "rewards/rejected": -7.3013916015625, "step": 1380 }, { "epoch": 0.89, "learning_rate": 9.457272078076648e-08, "logits/chosen": 2.248764991760254, "logits/rejected": 2.723816394805908, "logps/chosen": -448.24749755859375, "logps/rejected": -399.8122253417969, "loss": 0.0282, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.05086612701416, "rewards/margins": 8.73229694366455, "rewards/rejected": -6.681430816650391, "step": 1390 }, { "epoch": 0.9, "learning_rate": 9.445370149964293e-08, "logits/chosen": 2.173478603363037, "logits/rejected": 3.2225327491760254, "logps/chosen": -396.5003356933594, "logps/rejected": -385.8442687988281, "loss": 0.0493, "rewards/accuracies": 1.0, "rewards/chosen": 1.8472175598144531, "rewards/margins": 8.179679870605469, "rewards/rejected": -6.332461357116699, "step": 1400 }, { "epoch": 0.9, "eval_logits/chosen": 1.612717866897583, "eval_logits/rejected": 2.088186264038086, "eval_logps/chosen": -388.5294189453125, "eval_logps/rejected": -372.8053894042969, "eval_loss": 0.05290338769555092, "eval_rewards/accuracies": 0.9609375, "eval_rewards/chosen": 1.0812093019485474, "eval_rewards/margins": 7.80067253112793, "eval_rewards/rejected": -6.719463348388672, "eval_runtime": 77.9512, "eval_samples_per_second": 12.829, "eval_steps_per_second": 0.411, "step": 1400 }, { "epoch": 0.91, "learning_rate": 9.43346822185194e-08, "logits/chosen": 1.9332910776138306, "logits/rejected": 3.120542526245117, "logps/chosen": -405.4175109863281, "logps/rejected": -406.57781982421875, "loss": 0.0388, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.5408118963241577, "rewards/margins": 8.476791381835938, "rewards/rejected": -6.935980319976807, "step": 1410 }, { "epoch": 0.91, "learning_rate": 9.421566293739586e-08, "logits/chosen": 2.6357414722442627, "logits/rejected": 2.7694199085235596, "logps/chosen": -403.54632568359375, "logps/rejected": -362.77557373046875, "loss": 0.05, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.2265160083770752, "rewards/margins": 7.877467155456543, "rewards/rejected": -6.650951385498047, "step": 1420 }, { "epoch": 0.92, "learning_rate": 9.409664365627231e-08, "logits/chosen": 2.213944673538208, "logits/rejected": 2.804633617401123, "logps/chosen": -411.54608154296875, "logps/rejected": -416.67498779296875, "loss": 0.0319, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.015861988067627, "rewards/margins": 9.614884376525879, "rewards/rejected": -7.599021911621094, "step": 1430 }, { "epoch": 0.93, "learning_rate": 9.397762437514877e-08, "logits/chosen": 2.344874620437622, "logits/rejected": 2.9800264835357666, "logps/chosen": -387.05206298828125, "logps/rejected": -409.482421875, "loss": 0.0328, "rewards/accuracies": 1.0, "rewards/chosen": 0.7274560928344727, "rewards/margins": 8.34837818145752, "rewards/rejected": -7.620922088623047, "step": 1440 }, { "epoch": 0.93, "learning_rate": 9.385860509402523e-08, "logits/chosen": 2.6229679584503174, "logits/rejected": 2.767507553100586, "logps/chosen": -400.82647705078125, "logps/rejected": -437.8872985839844, "loss": 0.0437, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.4043718576431274, "rewards/margins": 9.666748046875, "rewards/rejected": -8.26237678527832, "step": 1450 }, { "epoch": 0.94, "learning_rate": 9.373958581290168e-08, "logits/chosen": 1.9819673299789429, "logits/rejected": 3.2101433277130127, "logps/chosen": -395.5453186035156, "logps/rejected": -420.27020263671875, "loss": 0.0334, "rewards/accuracies": 1.0, "rewards/chosen": 1.1504669189453125, "rewards/margins": 9.256044387817383, "rewards/rejected": -8.105578422546387, "step": 1460 }, { "epoch": 0.94, "learning_rate": 9.362056653177814e-08, "logits/chosen": 2.1626055240631104, "logits/rejected": 2.6654152870178223, "logps/chosen": -443.01470947265625, "logps/rejected": -387.11956787109375, "loss": 0.0519, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9894819259643555, "rewards/margins": 8.866507530212402, "rewards/rejected": -7.877026557922363, "step": 1470 }, { "epoch": 0.95, "learning_rate": 9.35015472506546e-08, "logits/chosen": 2.113184928894043, "logits/rejected": 2.9894702434539795, "logps/chosen": -426.97454833984375, "logps/rejected": -392.66937255859375, "loss": 0.0455, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.242422342300415, "rewards/margins": 8.285688400268555, "rewards/rejected": -7.043266296386719, "step": 1480 }, { "epoch": 0.96, "learning_rate": 9.338252796953105e-08, "logits/chosen": 2.179399013519287, "logits/rejected": 2.653637409210205, "logps/chosen": -417.0636291503906, "logps/rejected": -418.0546875, "loss": 0.0329, "rewards/accuracies": 1.0, "rewards/chosen": 1.5582334995269775, "rewards/margins": 9.643311500549316, "rewards/rejected": -8.085078239440918, "step": 1490 }, { "epoch": 0.96, "learning_rate": 9.326350868840752e-08, "logits/chosen": 2.342013120651245, "logits/rejected": 3.294114589691162, "logps/chosen": -410.2916564941406, "logps/rejected": -398.54022216796875, "loss": 0.0361, "rewards/accuracies": 1.0, "rewards/chosen": 0.9324003458023071, "rewards/margins": 8.479290008544922, "rewards/rejected": -7.5468902587890625, "step": 1500 }, { "epoch": 0.96, "eval_logits/chosen": 1.6103559732437134, "eval_logits/rejected": 2.1083178520202637, "eval_logps/chosen": -395.0100402832031, "eval_logps/rejected": -380.8038330078125, "eval_loss": 0.05386331304907799, "eval_rewards/accuracies": 0.96875, "eval_rewards/chosen": 0.4331449270248413, "eval_rewards/margins": 7.952449798583984, "eval_rewards/rejected": -7.519304275512695, "eval_runtime": 77.9615, "eval_samples_per_second": 12.827, "eval_steps_per_second": 0.41, "step": 1500 }, { "epoch": 0.97, "learning_rate": 9.314448940728398e-08, "logits/chosen": 1.7690677642822266, "logits/rejected": 3.269226551055908, "logps/chosen": -419.20867919921875, "logps/rejected": -420.2843322753906, "loss": 0.0332, "rewards/accuracies": 1.0, "rewards/chosen": 0.7724257707595825, "rewards/margins": 8.932862281799316, "rewards/rejected": -8.160436630249023, "step": 1510 }, { "epoch": 0.98, "learning_rate": 9.302547012616043e-08, "logits/chosen": 2.311052083969116, "logits/rejected": 2.7695891857147217, "logps/chosen": -420.0890197753906, "logps/rejected": -437.40863037109375, "loss": 0.0506, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9803401827812195, "rewards/margins": 9.851530075073242, "rewards/rejected": -8.871191024780273, "step": 1520 }, { "epoch": 0.98, "learning_rate": 9.290645084503689e-08, "logits/chosen": 2.257835626602173, "logits/rejected": 2.8854148387908936, "logps/chosen": -388.00152587890625, "logps/rejected": -398.3310546875, "loss": 0.0312, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7348248362541199, "rewards/margins": 8.951128959655762, "rewards/rejected": -8.216302871704102, "step": 1530 }, { "epoch": 0.99, "learning_rate": 9.278743156391336e-08, "logits/chosen": 1.8644654750823975, "logits/rejected": 3.318366527557373, "logps/chosen": -398.03961181640625, "logps/rejected": -404.6298522949219, "loss": 0.0329, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.6025460958480835, "rewards/margins": 8.443681716918945, "rewards/rejected": -7.8411359786987305, "step": 1540 }, { "epoch": 1.0, "learning_rate": 9.26684122827898e-08, "logits/chosen": 2.1022090911865234, "logits/rejected": 3.08918833732605, "logps/chosen": -380.1417236328125, "logps/rejected": -418.6334533691406, "loss": 0.0319, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.10860241949558258, "rewards/margins": 8.798811912536621, "rewards/rejected": -8.9074125289917, "step": 1550 }, { "epoch": 1.0, "learning_rate": 9.254939300166626e-08, "logits/chosen": 1.9660978317260742, "logits/rejected": 3.0992188453674316, "logps/chosen": -432.70379638671875, "logps/rejected": -398.1070861816406, "loss": 0.0321, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7886090874671936, "rewards/margins": 9.03126335144043, "rewards/rejected": -8.242653846740723, "step": 1560 }, { "epoch": 1.01, "learning_rate": 9.243037372054272e-08, "logits/chosen": 1.8959643840789795, "logits/rejected": 3.006300687789917, "logps/chosen": -485.46435546875, "logps/rejected": -439.8067321777344, "loss": 0.0181, "rewards/accuracies": 1.0, "rewards/chosen": 1.288769006729126, "rewards/margins": 10.527647018432617, "rewards/rejected": -9.238879203796387, "step": 1570 }, { "epoch": 1.02, "learning_rate": 9.231135443941919e-08, "logits/chosen": 2.1207528114318848, "logits/rejected": 3.0929980278015137, "logps/chosen": -427.466796875, "logps/rejected": -432.8934631347656, "loss": 0.0109, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8015705943107605, "rewards/margins": 10.440237998962402, "rewards/rejected": -9.638667106628418, "step": 1580 }, { "epoch": 1.02, "learning_rate": 9.219233515829564e-08, "logits/chosen": 2.253310441970825, "logits/rejected": 3.08237886428833, "logps/chosen": -423.25775146484375, "logps/rejected": -393.71966552734375, "loss": 0.0153, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.1011648178100586, "rewards/margins": 8.963667869567871, "rewards/rejected": -7.8625030517578125, "step": 1590 }, { "epoch": 1.03, "learning_rate": 9.20733158771721e-08, "logits/chosen": 1.8514906167984009, "logits/rejected": 2.837451934814453, "logps/chosen": -476.034912109375, "logps/rejected": -431.28369140625, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": 1.7295993566513062, "rewards/margins": 10.104246139526367, "rewards/rejected": -8.37464714050293, "step": 1600 }, { "epoch": 1.03, "eval_logits/chosen": 1.5608395338058472, "eval_logits/rejected": 2.154280662536621, "eval_logps/chosen": -400.8282470703125, "eval_logps/rejected": -387.81903076171875, "eval_loss": 0.05462770164012909, "eval_rewards/accuracies": 0.96875, "eval_rewards/chosen": -0.1486767828464508, "eval_rewards/margins": 8.072151184082031, "eval_rewards/rejected": -8.22082805633545, "eval_runtime": 78.122, "eval_samples_per_second": 12.8, "eval_steps_per_second": 0.41, "step": 1600 }, { "epoch": 1.03, "learning_rate": 9.195429659604855e-08, "logits/chosen": 2.027790069580078, "logits/rejected": 3.0657553672790527, "logps/chosen": -432.85369873046875, "logps/rejected": -412.8692321777344, "loss": 0.0163, "rewards/accuracies": 1.0, "rewards/chosen": 0.8429557681083679, "rewards/margins": 9.376811027526855, "rewards/rejected": -8.533855438232422, "step": 1610 }, { "epoch": 1.04, "learning_rate": 9.183527731492501e-08, "logits/chosen": 2.6315040588378906, "logits/rejected": 3.4493613243103027, "logps/chosen": -425.31756591796875, "logps/rejected": -431.07989501953125, "loss": 0.0162, "rewards/accuracies": 1.0, "rewards/chosen": 0.8139169812202454, "rewards/margins": 9.736674308776855, "rewards/rejected": -8.922757148742676, "step": 1620 }, { "epoch": 1.05, "learning_rate": 9.171625803380148e-08, "logits/chosen": 2.329026699066162, "logits/rejected": 3.3126883506774902, "logps/chosen": -426.11114501953125, "logps/rejected": -430.76934814453125, "loss": 0.0136, "rewards/accuracies": 1.0, "rewards/chosen": 1.6953893899917603, "rewards/margins": 10.516650199890137, "rewards/rejected": -8.821261405944824, "step": 1630 }, { "epoch": 1.05, "learning_rate": 9.159723875267794e-08, "logits/chosen": 2.058751106262207, "logits/rejected": 2.7939858436584473, "logps/chosen": -388.2889099121094, "logps/rejected": -386.5802307128906, "loss": 0.0199, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.9611308574676514, "rewards/margins": 9.938085556030273, "rewards/rejected": -7.976954460144043, "step": 1640 }, { "epoch": 1.06, "learning_rate": 9.147821947155438e-08, "logits/chosen": 2.1123156547546387, "logits/rejected": 3.6038296222686768, "logps/chosen": -393.5766296386719, "logps/rejected": -402.1626892089844, "loss": 0.0107, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8590052723884583, "rewards/margins": 9.311236381530762, "rewards/rejected": -8.452230453491211, "step": 1650 }, { "epoch": 1.07, "learning_rate": 9.135920019043084e-08, "logits/chosen": 2.1106104850769043, "logits/rejected": 3.1583571434020996, "logps/chosen": -374.95001220703125, "logps/rejected": -392.5960998535156, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": 0.768711268901825, "rewards/margins": 8.690801620483398, "rewards/rejected": -7.922091007232666, "step": 1660 }, { "epoch": 1.07, "learning_rate": 9.12401809093073e-08, "logits/chosen": 2.2228665351867676, "logits/rejected": 3.330571413040161, "logps/chosen": -443.146484375, "logps/rejected": -426.9916076660156, "loss": 0.0153, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.233867883682251, "rewards/margins": 10.038155555725098, "rewards/rejected": -8.804287910461426, "step": 1670 }, { "epoch": 1.08, "learning_rate": 9.112116162818376e-08, "logits/chosen": 2.2720413208007812, "logits/rejected": 3.6448235511779785, "logps/chosen": -428.63763427734375, "logps/rejected": -414.93768310546875, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": 1.4396426677703857, "rewards/margins": 9.752705574035645, "rewards/rejected": -8.31306266784668, "step": 1680 }, { "epoch": 1.09, "learning_rate": 9.100214234706022e-08, "logits/chosen": 2.2004613876342773, "logits/rejected": 3.101625919342041, "logps/chosen": -419.28143310546875, "logps/rejected": -431.95684814453125, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": 1.1783888339996338, "rewards/margins": 10.366841316223145, "rewards/rejected": -9.188451766967773, "step": 1690 }, { "epoch": 1.09, "learning_rate": 9.088312306593667e-08, "logits/chosen": 2.8064794540405273, "logits/rejected": 3.2689521312713623, "logps/chosen": -354.8829650878906, "logps/rejected": -406.4047546386719, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": 0.7891177535057068, "rewards/margins": 9.862220764160156, "rewards/rejected": -9.073102951049805, "step": 1700 }, { "epoch": 1.09, "eval_logits/chosen": 1.625260591506958, "eval_logits/rejected": 2.238248109817505, "eval_logps/chosen": -400.59375, "eval_logps/rejected": -390.6427917480469, "eval_loss": 0.057591091841459274, "eval_rewards/accuracies": 0.96875, "eval_rewards/chosen": -0.12522682547569275, "eval_rewards/margins": 8.377971649169922, "eval_rewards/rejected": -8.503198623657227, "eval_runtime": 78.0334, "eval_samples_per_second": 12.815, "eval_steps_per_second": 0.41, "step": 1700 }, { "epoch": 1.1, "learning_rate": 9.076410378481314e-08, "logits/chosen": 1.9525015354156494, "logits/rejected": 3.1185505390167236, "logps/chosen": -422.75225830078125, "logps/rejected": -421.87445068359375, "loss": 0.013, "rewards/accuracies": 1.0, "rewards/chosen": 1.9433252811431885, "rewards/margins": 10.827123641967773, "rewards/rejected": -8.883798599243164, "step": 1710 }, { "epoch": 1.11, "learning_rate": 9.06450845036896e-08, "logits/chosen": 2.2783799171447754, "logits/rejected": 3.1842360496520996, "logps/chosen": -418.703369140625, "logps/rejected": -418.055419921875, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": 1.2181668281555176, "rewards/margins": 9.945693016052246, "rewards/rejected": -8.727526664733887, "step": 1720 }, { "epoch": 1.11, "learning_rate": 9.052606522256606e-08, "logits/chosen": 2.0897390842437744, "logits/rejected": 3.332200288772583, "logps/chosen": -436.8134765625, "logps/rejected": -428.2256774902344, "loss": 0.0087, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5979622006416321, "rewards/margins": 9.419112205505371, "rewards/rejected": -8.821150779724121, "step": 1730 }, { "epoch": 1.12, "learning_rate": 9.04070459414425e-08, "logits/chosen": 1.7566314935684204, "logits/rejected": 3.1516499519348145, "logps/chosen": -444.9219665527344, "logps/rejected": -409.5821228027344, "loss": 0.0086, "rewards/accuracies": 1.0, "rewards/chosen": 0.8818896412849426, "rewards/margins": 10.398978233337402, "rewards/rejected": -9.517088890075684, "step": 1740 }, { "epoch": 1.12, "learning_rate": 9.028802666031897e-08, "logits/chosen": 2.2249584197998047, "logits/rejected": 3.220370054244995, "logps/chosen": -373.8511962890625, "logps/rejected": -429.1431579589844, "loss": 0.0122, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.09718599170446396, "rewards/margins": 9.384894371032715, "rewards/rejected": -9.28770923614502, "step": 1750 }, { "epoch": 1.13, "learning_rate": 9.016900737919543e-08, "logits/chosen": 2.468928337097168, "logits/rejected": 3.5063624382019043, "logps/chosen": -375.80279541015625, "logps/rejected": -422.55450439453125, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": 0.36610403656959534, "rewards/margins": 9.850809097290039, "rewards/rejected": -9.484704971313477, "step": 1760 }, { "epoch": 1.14, "learning_rate": 9.004998809807188e-08, "logits/chosen": 2.5144405364990234, "logits/rejected": 3.0172762870788574, "logps/chosen": -398.01959228515625, "logps/rejected": -412.0162048339844, "loss": 0.0116, "rewards/accuracies": 1.0, "rewards/chosen": 0.71649569272995, "rewards/margins": 10.736013412475586, "rewards/rejected": -10.01951789855957, "step": 1770 }, { "epoch": 1.14, "learning_rate": 8.993096881694834e-08, "logits/chosen": 2.411824941635132, "logits/rejected": 3.4212913513183594, "logps/chosen": -441.16741943359375, "logps/rejected": -410.988525390625, "loss": 0.0107, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.526915967464447, "rewards/margins": 9.955665588378906, "rewards/rejected": -9.428749084472656, "step": 1780 }, { "epoch": 1.15, "learning_rate": 8.981194953582481e-08, "logits/chosen": 2.0398449897766113, "logits/rejected": 2.990245819091797, "logps/chosen": -424.50830078125, "logps/rejected": -440.29034423828125, "loss": 0.0161, "rewards/accuracies": 1.0, "rewards/chosen": 1.1279420852661133, "rewards/margins": 10.929909706115723, "rewards/rejected": -9.801966667175293, "step": 1790 }, { "epoch": 1.16, "learning_rate": 8.969293025470126e-08, "logits/chosen": 2.0134599208831787, "logits/rejected": 3.4384913444519043, "logps/chosen": -431.4124450683594, "logps/rejected": -422.57275390625, "loss": 0.0245, "rewards/accuracies": 1.0, "rewards/chosen": 1.387521505355835, "rewards/margins": 10.490163803100586, "rewards/rejected": -9.102643013000488, "step": 1800 }, { "epoch": 1.16, "eval_logits/chosen": 1.578616738319397, "eval_logits/rejected": 2.1524062156677246, "eval_logps/chosen": -396.13555908203125, "eval_logps/rejected": -386.849365234375, "eval_loss": 0.054019615054130554, "eval_rewards/accuracies": 0.96875, "eval_rewards/chosen": 0.32059139013290405, "eval_rewards/margins": 8.444451332092285, "eval_rewards/rejected": -8.123859405517578, "eval_runtime": 78.1156, "eval_samples_per_second": 12.802, "eval_steps_per_second": 0.41, "step": 1800 }, { "epoch": 1.16, "learning_rate": 8.957391097357772e-08, "logits/chosen": 1.8505395650863647, "logits/rejected": 2.4895451068878174, "logps/chosen": -411.90081787109375, "logps/rejected": -446.5166931152344, "loss": 0.0093, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.4874722957611084, "rewards/margins": 10.799659729003906, "rewards/rejected": -9.312187194824219, "step": 1810 }, { "epoch": 1.17, "learning_rate": 8.945489169245418e-08, "logits/chosen": 2.4672083854675293, "logits/rejected": 3.3016533851623535, "logps/chosen": -407.24810791015625, "logps/rejected": -421.69207763671875, "loss": 0.0156, "rewards/accuracies": 1.0, "rewards/chosen": 0.6584242582321167, "rewards/margins": 9.77189826965332, "rewards/rejected": -9.113473892211914, "step": 1820 }, { "epoch": 1.18, "learning_rate": 8.933587241133062e-08, "logits/chosen": 2.9424712657928467, "logits/rejected": 3.1963260173797607, "logps/chosen": -393.42730712890625, "logps/rejected": -405.26776123046875, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": 1.141961693763733, "rewards/margins": 9.95053768157959, "rewards/rejected": -8.808575630187988, "step": 1830 }, { "epoch": 1.18, "learning_rate": 8.921685313020709e-08, "logits/chosen": 2.1705727577209473, "logits/rejected": 3.132054090499878, "logps/chosen": -458.5648498535156, "logps/rejected": -425.5223693847656, "loss": 0.0144, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.0647785663604736, "rewards/margins": 10.456083297729492, "rewards/rejected": -8.391304969787598, "step": 1840 }, { "epoch": 1.19, "learning_rate": 8.909783384908355e-08, "logits/chosen": 2.2793197631835938, "logits/rejected": 3.1851863861083984, "logps/chosen": -428.22161865234375, "logps/rejected": -393.50201416015625, "loss": 0.0142, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.3557255268096924, "rewards/margins": 9.879143714904785, "rewards/rejected": -8.523417472839355, "step": 1850 }, { "epoch": 1.2, "learning_rate": 8.897881456796e-08, "logits/chosen": 2.092120409011841, "logits/rejected": 3.1506097316741943, "logps/chosen": -423.780517578125, "logps/rejected": -435.0636291503906, "loss": 0.0164, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.041473150253296, "rewards/margins": 10.483617782592773, "rewards/rejected": -9.442144393920898, "step": 1860 }, { "epoch": 1.2, "learning_rate": 8.885979528683646e-08, "logits/chosen": 2.077341079711914, "logits/rejected": 2.840765953063965, "logps/chosen": -467.50335693359375, "logps/rejected": -450.4947814941406, "loss": 0.01, "rewards/accuracies": 1.0, "rewards/chosen": 1.0977306365966797, "rewards/margins": 10.936107635498047, "rewards/rejected": -9.838376998901367, "step": 1870 }, { "epoch": 1.21, "learning_rate": 8.874077600571293e-08, "logits/chosen": 2.388977527618408, "logits/rejected": 3.17130970954895, "logps/chosen": -415.8170471191406, "logps/rejected": -431.8157653808594, "loss": 0.0172, "rewards/accuracies": 1.0, "rewards/chosen": 1.1165374517440796, "rewards/margins": 11.083638191223145, "rewards/rejected": -9.967100143432617, "step": 1880 }, { "epoch": 1.21, "learning_rate": 8.862175672458938e-08, "logits/chosen": 2.3494229316711426, "logits/rejected": 3.3899810314178467, "logps/chosen": -398.8847351074219, "logps/rejected": -420.1104431152344, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": 0.9555309414863586, "rewards/margins": 9.63708209991455, "rewards/rejected": -8.681550979614258, "step": 1890 }, { "epoch": 1.22, "learning_rate": 8.850273744346584e-08, "logits/chosen": 2.377115488052368, "logits/rejected": 3.1841914653778076, "logps/chosen": -384.47821044921875, "logps/rejected": -382.5137634277344, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": 1.495150089263916, "rewards/margins": 9.905587196350098, "rewards/rejected": -8.410436630249023, "step": 1900 }, { "epoch": 1.22, "eval_logits/chosen": 1.5970555543899536, "eval_logits/rejected": 2.134828805923462, "eval_logps/chosen": -395.612060546875, "eval_logps/rejected": -388.520751953125, "eval_loss": 0.05577890947461128, "eval_rewards/accuracies": 0.96875, "eval_rewards/chosen": 0.3729451596736908, "eval_rewards/margins": 8.663945198059082, "eval_rewards/rejected": -8.290999412536621, "eval_runtime": 78.1703, "eval_samples_per_second": 12.793, "eval_steps_per_second": 0.409, "step": 1900 }, { "epoch": 1.23, "learning_rate": 8.83837181623423e-08, "logits/chosen": 2.162808895111084, "logits/rejected": 2.938401937484741, "logps/chosen": -416.3312072753906, "logps/rejected": -431.4833984375, "loss": 0.0112, "rewards/accuracies": 1.0, "rewards/chosen": 1.4468110799789429, "rewards/margins": 11.13166332244873, "rewards/rejected": -9.684852600097656, "step": 1910 }, { "epoch": 1.23, "learning_rate": 8.826469888121875e-08, "logits/chosen": 1.8704240322113037, "logits/rejected": 3.454761028289795, "logps/chosen": -394.39312744140625, "logps/rejected": -435.0038146972656, "loss": 0.0103, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.1534335613250732, "rewards/margins": 10.062942504882812, "rewards/rejected": -8.909509658813477, "step": 1920 }, { "epoch": 1.24, "learning_rate": 8.814567960009521e-08, "logits/chosen": 2.3629403114318848, "logits/rejected": 3.146951675415039, "logps/chosen": -385.4207458496094, "logps/rejected": -421.41552734375, "loss": 0.0174, "rewards/accuracies": 1.0, "rewards/chosen": 1.5849992036819458, "rewards/margins": 10.956562995910645, "rewards/rejected": -9.371562957763672, "step": 1930 }, { "epoch": 1.25, "learning_rate": 8.802666031897167e-08, "logits/chosen": 1.9549896717071533, "logits/rejected": 3.260117769241333, "logps/chosen": -438.8719177246094, "logps/rejected": -423.6161193847656, "loss": 0.0114, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8324927091598511, "rewards/margins": 9.532448768615723, "rewards/rejected": -8.699956893920898, "step": 1940 }, { "epoch": 1.25, "learning_rate": 8.790764103784812e-08, "logits/chosen": 2.376079797744751, "logits/rejected": 2.626861572265625, "logps/chosen": -420.97320556640625, "logps/rejected": -430.754638671875, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": 1.4511768817901611, "rewards/margins": 10.815112113952637, "rewards/rejected": -9.363935470581055, "step": 1950 }, { "epoch": 1.26, "learning_rate": 8.778862175672459e-08, "logits/chosen": 2.016657829284668, "logits/rejected": 3.148374557495117, "logps/chosen": -399.3986511230469, "logps/rejected": -409.0765075683594, "loss": 0.0126, "rewards/accuracies": 1.0, "rewards/chosen": 0.7621505260467529, "rewards/margins": 9.929964065551758, "rewards/rejected": -9.167813301086426, "step": 1960 }, { "epoch": 1.27, "learning_rate": 8.766960247560105e-08, "logits/chosen": 2.107110023498535, "logits/rejected": 2.8882648944854736, "logps/chosen": -467.67999267578125, "logps/rejected": -445.00054931640625, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": 1.5498336553573608, "rewards/margins": 11.310626983642578, "rewards/rejected": -9.760791778564453, "step": 1970 }, { "epoch": 1.27, "learning_rate": 8.75505831944775e-08, "logits/chosen": 2.1148109436035156, "logits/rejected": 2.921837568283081, "logps/chosen": -419.919677734375, "logps/rejected": -438.27392578125, "loss": 0.0105, "rewards/accuracies": 1.0, "rewards/chosen": 0.5840842723846436, "rewards/margins": 10.156820297241211, "rewards/rejected": -9.572736740112305, "step": 1980 }, { "epoch": 1.28, "learning_rate": 8.743156391335396e-08, "logits/chosen": 2.2187628746032715, "logits/rejected": 3.3866772651672363, "logps/chosen": -381.0819396972656, "logps/rejected": -417.1609802246094, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": 0.2810596525669098, "rewards/margins": 9.59456729888916, "rewards/rejected": -9.313508033752441, "step": 1990 }, { "epoch": 1.29, "learning_rate": 8.731254463223042e-08, "logits/chosen": 2.337141513824463, "logits/rejected": 2.984570026397705, "logps/chosen": -428.96307373046875, "logps/rejected": -433.2093811035156, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": 1.118440866470337, "rewards/margins": 10.297574043273926, "rewards/rejected": -9.179132461547852, "step": 2000 }, { "epoch": 1.29, "eval_logits/chosen": 1.633168339729309, "eval_logits/rejected": 2.174915313720703, "eval_logps/chosen": -397.74981689453125, "eval_logps/rejected": -392.74847412109375, "eval_loss": 0.0574236661195755, "eval_rewards/accuracies": 0.96875, "eval_rewards/chosen": 0.15916621685028076, "eval_rewards/margins": 8.872934341430664, "eval_rewards/rejected": -8.713767051696777, "eval_runtime": 77.9326, "eval_samples_per_second": 12.832, "eval_steps_per_second": 0.411, "step": 2000 }, { "epoch": 1.29, "learning_rate": 8.719352535110687e-08, "logits/chosen": 2.470761775970459, "logits/rejected": 3.466107130050659, "logps/chosen": -330.568115234375, "logps/rejected": -396.0406799316406, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": 0.5657675862312317, "rewards/margins": 9.64977741241455, "rewards/rejected": -9.084009170532227, "step": 2010 }, { "epoch": 1.3, "learning_rate": 8.707450606998333e-08, "logits/chosen": 2.0616042613983154, "logits/rejected": 3.1946635246276855, "logps/chosen": -374.26641845703125, "logps/rejected": -438.85565185546875, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": 0.9465324282646179, "rewards/margins": 10.824884414672852, "rewards/rejected": -9.8783540725708, "step": 2020 }, { "epoch": 1.3, "learning_rate": 8.695548678885979e-08, "logits/chosen": 2.382939100265503, "logits/rejected": 2.7525863647460938, "logps/chosen": -404.4330139160156, "logps/rejected": -403.4361572265625, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": 0.8075125813484192, "rewards/margins": 9.784753799438477, "rewards/rejected": -8.977242469787598, "step": 2030 }, { "epoch": 1.31, "learning_rate": 8.683646750773624e-08, "logits/chosen": 2.528764247894287, "logits/rejected": 3.0091147422790527, "logps/chosen": -448.67852783203125, "logps/rejected": -431.8081970214844, "loss": 0.0146, "rewards/accuracies": 1.0, "rewards/chosen": 1.1726857423782349, "rewards/margins": 11.344512939453125, "rewards/rejected": -10.17182731628418, "step": 2040 }, { "epoch": 1.32, "learning_rate": 8.671744822661271e-08, "logits/chosen": 2.7697949409484863, "logits/rejected": 3.7019259929656982, "logps/chosen": -388.6200256347656, "logps/rejected": -417.25152587890625, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": 0.27654901146888733, "rewards/margins": 9.896730422973633, "rewards/rejected": -9.620182037353516, "step": 2050 }, { "epoch": 1.32, "learning_rate": 8.659842894548917e-08, "logits/chosen": 2.3201236724853516, "logits/rejected": 3.4561610221862793, "logps/chosen": -425.83770751953125, "logps/rejected": -425.7290954589844, "loss": 0.01, "rewards/accuracies": 1.0, "rewards/chosen": 1.1046048402786255, "rewards/margins": 10.805675506591797, "rewards/rejected": -9.701070785522461, "step": 2060 }, { "epoch": 1.33, "learning_rate": 8.647940966436562e-08, "logits/chosen": 2.3291468620300293, "logits/rejected": 3.3310623168945312, "logps/chosen": -429.80987548828125, "logps/rejected": -432.98992919921875, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": 0.8949478268623352, "rewards/margins": 11.059711456298828, "rewards/rejected": -10.164762496948242, "step": 2070 }, { "epoch": 1.34, "learning_rate": 8.636039038324208e-08, "logits/chosen": 2.225471019744873, "logits/rejected": 3.3019492626190186, "logps/chosen": -387.1333923339844, "logps/rejected": -433.3799743652344, "loss": 0.0126, "rewards/accuracies": 1.0, "rewards/chosen": 0.5047739744186401, "rewards/margins": 12.810396194458008, "rewards/rejected": -12.305620193481445, "step": 2080 }, { "epoch": 1.34, "learning_rate": 8.624137110211854e-08, "logits/chosen": 2.510704517364502, "logits/rejected": 3.3782081604003906, "logps/chosen": -409.83270263671875, "logps/rejected": -438.6222229003906, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": 1.7321760654449463, "rewards/margins": 12.49064826965332, "rewards/rejected": -10.75847339630127, "step": 2090 }, { "epoch": 1.35, "learning_rate": 8.6122351820995e-08, "logits/chosen": 1.8151687383651733, "logits/rejected": 2.730388879776001, "logps/chosen": -369.6325378417969, "logps/rejected": -416.94476318359375, "loss": 0.0105, "rewards/accuracies": 1.0, "rewards/chosen": 0.7672537565231323, "rewards/margins": 10.405468940734863, "rewards/rejected": -9.638215065002441, "step": 2100 }, { "epoch": 1.35, "eval_logits/chosen": 1.7073872089385986, "eval_logits/rejected": 2.287001371383667, "eval_logps/chosen": -401.92828369140625, "eval_logps/rejected": -399.6173095703125, "eval_loss": 0.0646829605102539, "eval_rewards/accuracies": 0.96875, "eval_rewards/chosen": -0.2586807608604431, "eval_rewards/margins": 9.141975402832031, "eval_rewards/rejected": -9.400656700134277, "eval_runtime": 77.8864, "eval_samples_per_second": 12.839, "eval_steps_per_second": 0.411, "step": 2100 }, { "epoch": 1.36, "learning_rate": 8.600333253987145e-08, "logits/chosen": 2.291734218597412, "logits/rejected": 3.170888662338257, "logps/chosen": -423.4766540527344, "logps/rejected": -427.091796875, "loss": 0.0109, "rewards/accuracies": 1.0, "rewards/chosen": 1.495410680770874, "rewards/margins": 11.364627838134766, "rewards/rejected": -9.869218826293945, "step": 2110 }, { "epoch": 1.36, "learning_rate": 8.58843132587479e-08, "logits/chosen": 2.2076480388641357, "logits/rejected": 2.9256691932678223, "logps/chosen": -432.29248046875, "logps/rejected": -417.88055419921875, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": 1.4364680051803589, "rewards/margins": 11.401215553283691, "rewards/rejected": -9.964746475219727, "step": 2120 }, { "epoch": 1.37, "learning_rate": 8.576529397762438e-08, "logits/chosen": 2.705327272415161, "logits/rejected": 3.241112232208252, "logps/chosen": -381.64923095703125, "logps/rejected": -452.84588623046875, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": 0.8446656465530396, "rewards/margins": 11.237610816955566, "rewards/rejected": -10.392945289611816, "step": 2130 }, { "epoch": 1.38, "learning_rate": 8.564627469650083e-08, "logits/chosen": 2.1531457901000977, "logits/rejected": 3.033215284347534, "logps/chosen": -409.0959167480469, "logps/rejected": -468.179443359375, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": 0.9750633239746094, "rewards/margins": 11.535894393920898, "rewards/rejected": -10.560831069946289, "step": 2140 }, { "epoch": 1.38, "learning_rate": 8.552725541537729e-08, "logits/chosen": 2.3740274906158447, "logits/rejected": 3.0594983100891113, "logps/chosen": -408.73284912109375, "logps/rejected": -432.64154052734375, "loss": 0.0111, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.3420227766036987, "rewards/margins": 11.06401252746582, "rewards/rejected": -9.721988677978516, "step": 2150 }, { "epoch": 1.39, "learning_rate": 8.540823613425374e-08, "logits/chosen": 2.0250580310821533, "logits/rejected": 3.1984283924102783, "logps/chosen": -414.43865966796875, "logps/rejected": -414.484375, "loss": 0.0109, "rewards/accuracies": 1.0, "rewards/chosen": 1.4251967668533325, "rewards/margins": 11.274666786193848, "rewards/rejected": -9.849469184875488, "step": 2160 }, { "epoch": 1.39, "learning_rate": 8.528921685313021e-08, "logits/chosen": 2.57132625579834, "logits/rejected": 3.3775405883789062, "logps/chosen": -492.7275390625, "logps/rejected": -453.9833984375, "loss": 0.0129, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.017164707183838, "rewards/margins": 12.133844375610352, "rewards/rejected": -10.116679191589355, "step": 2170 }, { "epoch": 1.4, "learning_rate": 8.517019757200666e-08, "logits/chosen": 2.3586134910583496, "logits/rejected": 3.1494510173797607, "logps/chosen": -419.758056640625, "logps/rejected": -436.7438049316406, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": 1.5768086910247803, "rewards/margins": 11.310070037841797, "rewards/rejected": -9.733260154724121, "step": 2180 }, { "epoch": 1.41, "learning_rate": 8.505117829088311e-08, "logits/chosen": 2.436389684677124, "logits/rejected": 3.2424912452697754, "logps/chosen": -443.7950744628906, "logps/rejected": -466.5213317871094, "loss": 0.0116, "rewards/accuracies": 1.0, "rewards/chosen": 0.40640783309936523, "rewards/margins": 10.672332763671875, "rewards/rejected": -10.265925407409668, "step": 2190 }, { "epoch": 1.41, "learning_rate": 8.493215900975957e-08, "logits/chosen": 2.098741054534912, "logits/rejected": 2.9514975547790527, "logps/chosen": -424.73052978515625, "logps/rejected": -460.8973083496094, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": 0.6627575159072876, "rewards/margins": 12.264575004577637, "rewards/rejected": -11.60181713104248, "step": 2200 }, { "epoch": 1.41, "eval_logits/chosen": 1.8804820775985718, "eval_logits/rejected": 2.3825998306274414, "eval_logps/chosen": -409.8846130371094, "eval_logps/rejected": -406.9892578125, "eval_loss": 0.06976839900016785, "eval_rewards/accuracies": 0.9609375, "eval_rewards/chosen": -1.0543094873428345, "eval_rewards/margins": 9.083538055419922, "eval_rewards/rejected": -10.137847900390625, "eval_runtime": 78.0306, "eval_samples_per_second": 12.815, "eval_steps_per_second": 0.41, "step": 2200 }, { "epoch": 1.42, "learning_rate": 8.481313972863604e-08, "logits/chosen": 2.7101387977600098, "logits/rejected": 3.5361340045928955, "logps/chosen": -386.73162841796875, "logps/rejected": -412.102783203125, "loss": 0.01, "rewards/accuracies": 1.0, "rewards/chosen": 0.5167805552482605, "rewards/margins": 11.045450210571289, "rewards/rejected": -10.528669357299805, "step": 2210 }, { "epoch": 1.43, "learning_rate": 8.46941204475125e-08, "logits/chosen": 2.4018852710723877, "logits/rejected": 2.6615495681762695, "logps/chosen": -435.355712890625, "logps/rejected": -460.0689392089844, "loss": 0.0141, "rewards/accuracies": 1.0, "rewards/chosen": 1.7065870761871338, "rewards/margins": 11.625639915466309, "rewards/rejected": -9.919052124023438, "step": 2220 }, { "epoch": 1.43, "learning_rate": 8.457510116638895e-08, "logits/chosen": 2.355992078781128, "logits/rejected": 3.142991304397583, "logps/chosen": -436.5738830566406, "logps/rejected": -442.658447265625, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": 2.008822202682495, "rewards/margins": 11.25963020324707, "rewards/rejected": -9.250809669494629, "step": 2230 }, { "epoch": 1.44, "learning_rate": 8.445608188526541e-08, "logits/chosen": 2.113661527633667, "logits/rejected": 3.376765727996826, "logps/chosen": -363.7850036621094, "logps/rejected": -417.59130859375, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": 1.0049993991851807, "rewards/margins": 10.82304573059082, "rewards/rejected": -9.818044662475586, "step": 2240 }, { "epoch": 1.45, "learning_rate": 8.433706260414186e-08, "logits/chosen": 2.2611918449401855, "logits/rejected": 2.977774143218994, "logps/chosen": -412.16259765625, "logps/rejected": -402.7505798339844, "loss": 0.0124, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.9179121255874634, "rewards/margins": 10.67682933807373, "rewards/rejected": -8.758916854858398, "step": 2250 }, { "epoch": 1.45, "learning_rate": 8.421804332301833e-08, "logits/chosen": 2.2166171073913574, "logits/rejected": 3.1910576820373535, "logps/chosen": -415.0889587402344, "logps/rejected": -410.3279724121094, "loss": 0.0128, "rewards/accuracies": 1.0, "rewards/chosen": 2.147801399230957, "rewards/margins": 11.321202278137207, "rewards/rejected": -9.17340087890625, "step": 2260 }, { "epoch": 1.46, "learning_rate": 8.409902404189478e-08, "logits/chosen": 2.3937222957611084, "logits/rejected": 3.467958450317383, "logps/chosen": -410.2650451660156, "logps/rejected": -424.83074951171875, "loss": 0.0172, "rewards/accuracies": 1.0, "rewards/chosen": 1.3514193296432495, "rewards/margins": 11.13292121887207, "rewards/rejected": -9.781502723693848, "step": 2270 }, { "epoch": 1.47, "learning_rate": 8.398000476077123e-08, "logits/chosen": 2.627056360244751, "logits/rejected": 2.9553661346435547, "logps/chosen": -422.7137145996094, "logps/rejected": -420.8788146972656, "loss": 0.0098, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.3633685111999512, "rewards/margins": 10.64666748046875, "rewards/rejected": -9.283299446105957, "step": 2280 }, { "epoch": 1.47, "learning_rate": 8.386098547964769e-08, "logits/chosen": 2.7881388664245605, "logits/rejected": 2.7855353355407715, "logps/chosen": -463.8809509277344, "logps/rejected": -449.48004150390625, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": 2.032374382019043, "rewards/margins": 12.262084007263184, "rewards/rejected": -10.22970962524414, "step": 2290 }, { "epoch": 1.48, "learning_rate": 8.374196619852416e-08, "logits/chosen": 1.9589307308197021, "logits/rejected": 2.762117862701416, "logps/chosen": -450.49609375, "logps/rejected": -444.59149169921875, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": 1.6292638778686523, "rewards/margins": 11.14265251159668, "rewards/rejected": -9.513387680053711, "step": 2300 }, { "epoch": 1.48, "eval_logits/chosen": 1.7084078788757324, "eval_logits/rejected": 2.194572687149048, "eval_logps/chosen": -397.28363037109375, "eval_logps/rejected": -393.7378845214844, "eval_loss": 0.06439676135778427, "eval_rewards/accuracies": 0.96875, "eval_rewards/chosen": 0.20578746497631073, "eval_rewards/margins": 9.018497467041016, "eval_rewards/rejected": -8.812708854675293, "eval_runtime": 78.0234, "eval_samples_per_second": 12.817, "eval_steps_per_second": 0.41, "step": 2300 }, { "epoch": 1.48, "learning_rate": 8.362294691740062e-08, "logits/chosen": 2.6022965908050537, "logits/rejected": 3.01656436920166, "logps/chosen": -399.0681457519531, "logps/rejected": -423.8030700683594, "loss": 0.0097, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.5128016471862793, "rewards/margins": 10.698533058166504, "rewards/rejected": -9.18572998046875, "step": 2310 }, { "epoch": 1.49, "learning_rate": 8.350392763627707e-08, "logits/chosen": 2.14322566986084, "logits/rejected": 3.216325044631958, "logps/chosen": -432.8180236816406, "logps/rejected": -448.30645751953125, "loss": 0.0107, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.6714401245117188, "rewards/margins": 11.98411750793457, "rewards/rejected": -10.312677383422852, "step": 2320 }, { "epoch": 1.5, "learning_rate": 8.338490835515353e-08, "logits/chosen": 2.0054235458374023, "logits/rejected": 3.5831961631774902, "logps/chosen": -402.33892822265625, "logps/rejected": -419.5140686035156, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": 1.3268228769302368, "rewards/margins": 11.421789169311523, "rewards/rejected": -10.094966888427734, "step": 2330 }, { "epoch": 1.5, "learning_rate": 8.326588907403e-08, "logits/chosen": 2.380002498626709, "logits/rejected": 3.5054984092712402, "logps/chosen": -452.48748779296875, "logps/rejected": -460.12786865234375, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": 1.852749228477478, "rewards/margins": 12.324037551879883, "rewards/rejected": -10.471287727355957, "step": 2340 }, { "epoch": 1.51, "learning_rate": 8.314686979290645e-08, "logits/chosen": 2.2875988483428955, "logits/rejected": 3.5421195030212402, "logps/chosen": -428.4679260253906, "logps/rejected": -430.433349609375, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": 1.6448265314102173, "rewards/margins": 11.238059997558594, "rewards/rejected": -9.593233108520508, "step": 2350 }, { "epoch": 1.52, "learning_rate": 8.30278505117829e-08, "logits/chosen": 2.450359582901001, "logits/rejected": 3.1287174224853516, "logps/chosen": -432.19329833984375, "logps/rejected": -433.89111328125, "loss": 0.0104, "rewards/accuracies": 1.0, "rewards/chosen": 1.381823182106018, "rewards/margins": 12.04507064819336, "rewards/rejected": -10.663248062133789, "step": 2360 }, { "epoch": 1.52, "learning_rate": 8.290883123065935e-08, "logits/chosen": 2.806814670562744, "logits/rejected": 3.616931200027466, "logps/chosen": -421.779052734375, "logps/rejected": -449.85260009765625, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": 1.1789385080337524, "rewards/margins": 11.785478591918945, "rewards/rejected": -10.606538772583008, "step": 2370 }, { "epoch": 1.53, "learning_rate": 8.278981194953582e-08, "logits/chosen": 2.440140724182129, "logits/rejected": 3.0209367275238037, "logps/chosen": -454.8857421875, "logps/rejected": -450.9249572753906, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": 1.156050443649292, "rewards/margins": 11.445282936096191, "rewards/rejected": -10.28923225402832, "step": 2380 }, { "epoch": 1.54, "learning_rate": 8.267079266841228e-08, "logits/chosen": 2.5684947967529297, "logits/rejected": 3.107997417449951, "logps/chosen": -408.620849609375, "logps/rejected": -425.724365234375, "loss": 0.008, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.03949282318353653, "rewards/margins": 10.728104591369629, "rewards/rejected": -10.767596244812012, "step": 2390 }, { "epoch": 1.54, "learning_rate": 8.255177338728874e-08, "logits/chosen": 2.5037713050842285, "logits/rejected": 3.1614301204681396, "logps/chosen": -447.5420837402344, "logps/rejected": -419.99981689453125, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": 1.0575414896011353, "rewards/margins": 11.080565452575684, "rewards/rejected": -10.02302360534668, "step": 2400 }, { "epoch": 1.54, "eval_logits/chosen": 1.8239837884902954, "eval_logits/rejected": 2.3104217052459717, "eval_logps/chosen": -404.14984130859375, "eval_logps/rejected": -404.53826904296875, "eval_loss": 0.06748179346323013, "eval_rewards/accuracies": 0.96875, "eval_rewards/chosen": -0.4808317720890045, "eval_rewards/margins": 9.411918640136719, "eval_rewards/rejected": -9.892749786376953, "eval_runtime": 78.0282, "eval_samples_per_second": 12.816, "eval_steps_per_second": 0.41, "step": 2400 }, { "epoch": 1.55, "learning_rate": 8.243275410616519e-08, "logits/chosen": 2.3226962089538574, "logits/rejected": 3.3303439617156982, "logps/chosen": -418.3297424316406, "logps/rejected": -439.53143310546875, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": 0.6141597032546997, "rewards/margins": 10.954813003540039, "rewards/rejected": -10.340652465820312, "step": 2410 }, { "epoch": 1.56, "learning_rate": 8.231373482504166e-08, "logits/chosen": 2.913877487182617, "logits/rejected": 3.016862392425537, "logps/chosen": -340.18499755859375, "logps/rejected": -396.21722412109375, "loss": 0.0071, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.8227875232696533, "rewards/margins": 11.620365142822266, "rewards/rejected": -9.797577857971191, "step": 2420 }, { "epoch": 1.56, "learning_rate": 8.219471554391812e-08, "logits/chosen": 2.501906633377075, "logits/rejected": 2.8194351196289062, "logps/chosen": -432.50787353515625, "logps/rejected": -426.0884704589844, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": 1.8617050647735596, "rewards/margins": 11.113122940063477, "rewards/rejected": -9.251418113708496, "step": 2430 }, { "epoch": 1.57, "learning_rate": 8.207569626279457e-08, "logits/chosen": 2.374406337738037, "logits/rejected": 3.2433886528015137, "logps/chosen": -430.30352783203125, "logps/rejected": -446.37286376953125, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": 1.8586057424545288, "rewards/margins": 11.883955001831055, "rewards/rejected": -10.025348663330078, "step": 2440 }, { "epoch": 1.57, "learning_rate": 8.195667698167103e-08, "logits/chosen": 2.630288600921631, "logits/rejected": 3.096122980117798, "logps/chosen": -458.97149658203125, "logps/rejected": -464.8585510253906, "loss": 0.0133, "rewards/accuracies": 1.0, "rewards/chosen": 1.6575380563735962, "rewards/margins": 13.09735107421875, "rewards/rejected": -11.439813613891602, "step": 2450 }, { "epoch": 1.58, "learning_rate": 8.183765770054747e-08, "logits/chosen": 2.9291977882385254, "logits/rejected": 3.383643388748169, "logps/chosen": -426.6884765625, "logps/rejected": -418.20172119140625, "loss": 0.01, "rewards/accuracies": 1.0, "rewards/chosen": 1.3758227825164795, "rewards/margins": 11.36131477355957, "rewards/rejected": -9.985492706298828, "step": 2460 }, { "epoch": 1.59, "learning_rate": 8.171863841942394e-08, "logits/chosen": 2.36037278175354, "logits/rejected": 3.379338026046753, "logps/chosen": -409.152587890625, "logps/rejected": -405.7203063964844, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": 0.7479127049446106, "rewards/margins": 10.87302303314209, "rewards/rejected": -10.12511157989502, "step": 2470 }, { "epoch": 1.59, "learning_rate": 8.15996191383004e-08, "logits/chosen": 2.3077635765075684, "logits/rejected": 2.982712745666504, "logps/chosen": -440.3170471191406, "logps/rejected": -441.4231872558594, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": 1.2002421617507935, "rewards/margins": 11.192548751831055, "rewards/rejected": -9.99230670928955, "step": 2480 }, { "epoch": 1.6, "learning_rate": 8.148059985717686e-08, "logits/chosen": 2.3768467903137207, "logits/rejected": 3.2358105182647705, "logps/chosen": -450.10040283203125, "logps/rejected": -459.655517578125, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": 0.5886727571487427, "rewards/margins": 11.561334609985352, "rewards/rejected": -10.972661018371582, "step": 2490 }, { "epoch": 1.61, "learning_rate": 8.136158057605331e-08, "logits/chosen": 1.9187663793563843, "logits/rejected": 3.099363088607788, "logps/chosen": -489.19549560546875, "logps/rejected": -454.474609375, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": 1.78690505027771, "rewards/margins": 11.946157455444336, "rewards/rejected": -10.159250259399414, "step": 2500 }, { "epoch": 1.61, "eval_logits/chosen": 1.7307677268981934, "eval_logits/rejected": 2.2528133392333984, "eval_logps/chosen": -395.5159606933594, "eval_logps/rejected": -398.804443359375, "eval_loss": 0.06332825124263763, "eval_rewards/accuracies": 0.9609375, "eval_rewards/chosen": 0.38255468010902405, "eval_rewards/margins": 9.701919555664062, "eval_rewards/rejected": -9.319364547729492, "eval_runtime": 77.9753, "eval_samples_per_second": 12.825, "eval_steps_per_second": 0.41, "step": 2500 }, { "epoch": 1.61, "learning_rate": 8.124256129492978e-08, "logits/chosen": 1.937443494796753, "logits/rejected": 2.9953453540802, "logps/chosen": -452.47900390625, "logps/rejected": -460.998046875, "loss": 0.0145, "rewards/accuracies": 1.0, "rewards/chosen": 1.4172687530517578, "rewards/margins": 12.328100204467773, "rewards/rejected": -10.910831451416016, "step": 2510 }, { "epoch": 1.62, "learning_rate": 8.112354201380624e-08, "logits/chosen": 2.033159017562866, "logits/rejected": 2.923642158508301, "logps/chosen": -385.84124755859375, "logps/rejected": -427.04095458984375, "loss": 0.0114, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.8549903631210327, "rewards/margins": 11.425082206726074, "rewards/rejected": -9.57009220123291, "step": 2520 }, { "epoch": 1.63, "learning_rate": 8.10045227326827e-08, "logits/chosen": 2.1730008125305176, "logits/rejected": 2.7437562942504883, "logps/chosen": -401.44744873046875, "logps/rejected": -461.93597412109375, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": 2.0297939777374268, "rewards/margins": 13.02344799041748, "rewards/rejected": -10.993656158447266, "step": 2530 }, { "epoch": 1.63, "learning_rate": 8.088550345155915e-08, "logits/chosen": 2.1667115688323975, "logits/rejected": 2.8589937686920166, "logps/chosen": -411.73284912109375, "logps/rejected": -428.7911682128906, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 1.2299811840057373, "rewards/margins": 11.306024551391602, "rewards/rejected": -10.076042175292969, "step": 2540 }, { "epoch": 1.64, "learning_rate": 8.076648417043561e-08, "logits/chosen": 2.363246202468872, "logits/rejected": 2.9583797454833984, "logps/chosen": -473.9364318847656, "logps/rejected": -445.84625244140625, "loss": 0.0158, "rewards/accuracies": 1.0, "rewards/chosen": 2.21724534034729, "rewards/margins": 11.700216293334961, "rewards/rejected": -9.482972145080566, "step": 2550 }, { "epoch": 1.65, "learning_rate": 8.064746488931206e-08, "logits/chosen": 2.5506205558776855, "logits/rejected": 3.462920665740967, "logps/chosen": -411.14678955078125, "logps/rejected": -444.2726135253906, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": 0.6240037083625793, "rewards/margins": 11.176986694335938, "rewards/rejected": -10.552982330322266, "step": 2560 }, { "epoch": 1.65, "learning_rate": 8.052844560818852e-08, "logits/chosen": 2.1937804222106934, "logits/rejected": 3.3300259113311768, "logps/chosen": -457.53814697265625, "logps/rejected": -449.9571838378906, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": 1.528435230255127, "rewards/margins": 11.644414901733398, "rewards/rejected": -10.115981101989746, "step": 2570 }, { "epoch": 1.66, "learning_rate": 8.040942632706498e-08, "logits/chosen": 2.55604887008667, "logits/rejected": 3.0880343914031982, "logps/chosen": -420.58782958984375, "logps/rejected": -388.7262268066406, "loss": 0.0075, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.4543565511703491, "rewards/margins": 10.79311752319336, "rewards/rejected": -9.338762283325195, "step": 2580 }, { "epoch": 1.66, "learning_rate": 8.029040704594145e-08, "logits/chosen": 2.547492504119873, "logits/rejected": 3.6884007453918457, "logps/chosen": -380.635009765625, "logps/rejected": -423.63018798828125, "loss": 0.0098, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.180540680885315, "rewards/margins": 11.04011344909668, "rewards/rejected": -9.859573364257812, "step": 2590 }, { "epoch": 1.67, "learning_rate": 8.01713877648179e-08, "logits/chosen": 2.499735116958618, "logits/rejected": 3.318908214569092, "logps/chosen": -406.8338623046875, "logps/rejected": -433.6927795410156, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": 0.4649948179721832, "rewards/margins": 11.194982528686523, "rewards/rejected": -10.729987144470215, "step": 2600 }, { "epoch": 1.67, "eval_logits/chosen": 1.8153432607650757, "eval_logits/rejected": 2.3447887897491455, "eval_logps/chosen": -408.9284973144531, "eval_logps/rejected": -409.33148193359375, "eval_loss": 0.07508327066898346, "eval_rewards/accuracies": 0.9609375, "eval_rewards/chosen": -0.9586971402168274, "eval_rewards/margins": 9.413373947143555, "eval_rewards/rejected": -10.3720703125, "eval_runtime": 78.0392, "eval_samples_per_second": 12.814, "eval_steps_per_second": 0.41, "step": 2600 }, { "epoch": 1.68, "learning_rate": 8.005236848369436e-08, "logits/chosen": 2.228560447692871, "logits/rejected": 3.2750918865203857, "logps/chosen": -443.6728515625, "logps/rejected": -464.92108154296875, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": 0.657561182975769, "rewards/margins": 11.451318740844727, "rewards/rejected": -10.793758392333984, "step": 2610 }, { "epoch": 1.68, "learning_rate": 7.993334920257082e-08, "logits/chosen": 2.6553711891174316, "logits/rejected": 3.3680367469787598, "logps/chosen": -449.4634704589844, "logps/rejected": -419.002197265625, "loss": 0.0096, "rewards/accuracies": 1.0, "rewards/chosen": 0.8493421673774719, "rewards/margins": 11.447193145751953, "rewards/rejected": -10.597851753234863, "step": 2620 }, { "epoch": 1.69, "learning_rate": 7.981432992144727e-08, "logits/chosen": 2.665163278579712, "logits/rejected": 3.3812012672424316, "logps/chosen": -468.38714599609375, "logps/rejected": -457.9358825683594, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": 0.7476972341537476, "rewards/margins": 12.187990188598633, "rewards/rejected": -11.440293312072754, "step": 2630 }, { "epoch": 1.7, "learning_rate": 7.969531064032373e-08, "logits/chosen": 2.5799193382263184, "logits/rejected": 3.3646559715270996, "logps/chosen": -375.97174072265625, "logps/rejected": -422.7632751464844, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": 0.5605910420417786, "rewards/margins": 11.750158309936523, "rewards/rejected": -11.189568519592285, "step": 2640 }, { "epoch": 1.7, "learning_rate": 7.957629135920018e-08, "logits/chosen": 2.4693617820739746, "logits/rejected": 3.5580387115478516, "logps/chosen": -430.0306091308594, "logps/rejected": -447.8194274902344, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": 0.3709467649459839, "rewards/margins": 11.733332633972168, "rewards/rejected": -11.362385749816895, "step": 2650 }, { "epoch": 1.71, "learning_rate": 7.945727207807664e-08, "logits/chosen": 2.5020830631256104, "logits/rejected": 3.252861738204956, "logps/chosen": -435.47222900390625, "logps/rejected": -447.82373046875, "loss": 0.0088, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.05917937681078911, "rewards/margins": 11.433965682983398, "rewards/rejected": -11.493144035339355, "step": 2660 }, { "epoch": 1.72, "learning_rate": 7.93382527969531e-08, "logits/chosen": 2.116004705429077, "logits/rejected": 3.1918604373931885, "logps/chosen": -463.59344482421875, "logps/rejected": -442.08203125, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": 1.2367243766784668, "rewards/margins": 12.351614952087402, "rewards/rejected": -11.11489200592041, "step": 2670 }, { "epoch": 1.72, "learning_rate": 7.921923351582957e-08, "logits/chosen": 2.5514450073242188, "logits/rejected": 2.77209734916687, "logps/chosen": -384.43865966796875, "logps/rejected": -423.64892578125, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 0.14157016575336456, "rewards/margins": 11.351391792297363, "rewards/rejected": -11.209821701049805, "step": 2680 }, { "epoch": 1.73, "learning_rate": 7.910021423470602e-08, "logits/chosen": 2.9975619316101074, "logits/rejected": 2.824094295501709, "logps/chosen": -428.303466796875, "logps/rejected": -438.8485412597656, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": 0.5388720631599426, "rewards/margins": 12.036158561706543, "rewards/rejected": -11.497285842895508, "step": 2690 }, { "epoch": 1.74, "learning_rate": 7.898119495358248e-08, "logits/chosen": 2.3330235481262207, "logits/rejected": 3.021794080734253, "logps/chosen": -417.4193420410156, "logps/rejected": -474.170166015625, "loss": 0.0054, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.0910119041800499, "rewards/margins": 12.0770902633667, "rewards/rejected": -12.168102264404297, "step": 2700 }, { "epoch": 1.74, "eval_logits/chosen": 1.7441036701202393, "eval_logits/rejected": 2.28686785697937, "eval_logps/chosen": -404.1935119628906, "eval_logps/rejected": -408.94500732421875, "eval_loss": 0.0632321760058403, "eval_rewards/accuracies": 0.9609375, "eval_rewards/chosen": -0.48519980907440186, "eval_rewards/margins": 9.848224639892578, "eval_rewards/rejected": -10.333423614501953, "eval_runtime": 77.9801, "eval_samples_per_second": 12.824, "eval_steps_per_second": 0.41, "step": 2700 }, { "epoch": 1.74, "learning_rate": 7.886217567245894e-08, "logits/chosen": 2.2133631706237793, "logits/rejected": 3.3354735374450684, "logps/chosen": -428.5398864746094, "logps/rejected": -432.5113220214844, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": 0.9777836799621582, "rewards/margins": 12.670408248901367, "rewards/rejected": -11.692625045776367, "step": 2710 }, { "epoch": 1.75, "learning_rate": 7.874315639133539e-08, "logits/chosen": 2.4246087074279785, "logits/rejected": 3.052839756011963, "logps/chosen": -421.6343688964844, "logps/rejected": -421.95697021484375, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": 1.3704569339752197, "rewards/margins": 12.342972755432129, "rewards/rejected": -10.972516059875488, "step": 2720 }, { "epoch": 1.75, "learning_rate": 7.862413711021185e-08, "logits/chosen": 3.059401273727417, "logits/rejected": 3.639910936355591, "logps/chosen": -407.8876037597656, "logps/rejected": -447.3395080566406, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": 0.906929612159729, "rewards/margins": 12.296318054199219, "rewards/rejected": -11.389389991760254, "step": 2730 }, { "epoch": 1.76, "learning_rate": 7.85051178290883e-08, "logits/chosen": 2.043600082397461, "logits/rejected": 2.944366931915283, "logps/chosen": -399.25140380859375, "logps/rejected": -418.093994140625, "loss": 0.0117, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.2023608684539795, "rewards/margins": 10.754437446594238, "rewards/rejected": -10.55207633972168, "step": 2740 }, { "epoch": 1.77, "learning_rate": 7.838609854796476e-08, "logits/chosen": 2.511198043823242, "logits/rejected": 3.7167916297912598, "logps/chosen": -428.31396484375, "logps/rejected": -459.7771911621094, "loss": 0.0065, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.24688346683979034, "rewards/margins": 11.848150253295898, "rewards/rejected": -11.601266860961914, "step": 2750 }, { "epoch": 1.77, "learning_rate": 7.826707926684123e-08, "logits/chosen": 2.4253451824188232, "logits/rejected": 3.1690335273742676, "logps/chosen": -426.9501953125, "logps/rejected": -453.51531982421875, "loss": 0.0053, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7976835370063782, "rewards/margins": 12.751691818237305, "rewards/rejected": -11.954008102416992, "step": 2760 }, { "epoch": 1.78, "learning_rate": 7.814805998571769e-08, "logits/chosen": 2.323408365249634, "logits/rejected": 3.649256467819214, "logps/chosen": -410.73699951171875, "logps/rejected": -436.67669677734375, "loss": 0.0125, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.16351564228534698, "rewards/margins": 10.847258567810059, "rewards/rejected": -10.683743476867676, "step": 2770 }, { "epoch": 1.79, "learning_rate": 7.802904070459414e-08, "logits/chosen": 2.627110004425049, "logits/rejected": 3.4923622608184814, "logps/chosen": -434.0037536621094, "logps/rejected": -436.61669921875, "loss": 0.0092, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.193922996520996, "rewards/margins": 11.6904296875, "rewards/rejected": -10.496505737304688, "step": 2780 }, { "epoch": 1.79, "learning_rate": 7.79100214234706e-08, "logits/chosen": 2.3079676628112793, "logits/rejected": 3.4073116779327393, "logps/chosen": -419.6051330566406, "logps/rejected": -428.28326416015625, "loss": 0.008, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7547308802604675, "rewards/margins": 11.233491897583008, "rewards/rejected": -10.478760719299316, "step": 2790 }, { "epoch": 1.8, "learning_rate": 7.779100214234706e-08, "logits/chosen": 2.747231960296631, "logits/rejected": 3.3032188415527344, "logps/chosen": -440.6018981933594, "logps/rejected": -423.44952392578125, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": 1.2705620527267456, "rewards/margins": 11.606225967407227, "rewards/rejected": -10.335662841796875, "step": 2800 }, { "epoch": 1.8, "eval_logits/chosen": 1.7818334102630615, "eval_logits/rejected": 2.3191139698028564, "eval_logps/chosen": -398.98333740234375, "eval_logps/rejected": -405.5218811035156, "eval_loss": 0.05957724153995514, "eval_rewards/accuracies": 0.953125, "eval_rewards/chosen": 0.03581659495830536, "eval_rewards/margins": 10.026926040649414, "eval_rewards/rejected": -9.991110801696777, "eval_runtime": 77.8303, "eval_samples_per_second": 12.848, "eval_steps_per_second": 0.411, "step": 2800 }, { "epoch": 1.81, "learning_rate": 7.767198286122351e-08, "logits/chosen": 2.7316629886627197, "logits/rejected": 3.3055825233459473, "logps/chosen": -377.58599853515625, "logps/rejected": -423.8103942871094, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": 0.7823238968849182, "rewards/margins": 11.870222091674805, "rewards/rejected": -11.087898254394531, "step": 2810 }, { "epoch": 1.81, "learning_rate": 7.755296358009997e-08, "logits/chosen": 2.6006321907043457, "logits/rejected": 3.5436511039733887, "logps/chosen": -359.8816833496094, "logps/rejected": -414.5054626464844, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": 0.05306140333414078, "rewards/margins": 11.554253578186035, "rewards/rejected": -11.501191139221191, "step": 2820 }, { "epoch": 1.82, "learning_rate": 7.743394429897642e-08, "logits/chosen": 2.4547390937805176, "logits/rejected": 3.4132580757141113, "logps/chosen": -448.56866455078125, "logps/rejected": -452.8617248535156, "loss": 0.0109, "rewards/accuracies": 1.0, "rewards/chosen": 1.098752737045288, "rewards/margins": 12.525891304016113, "rewards/rejected": -11.427138328552246, "step": 2830 }, { "epoch": 1.83, "learning_rate": 7.731492501785288e-08, "logits/chosen": 2.644763469696045, "logits/rejected": 3.5032525062561035, "logps/chosen": -416.0933532714844, "logps/rejected": -436.23486328125, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": 1.0677202939987183, "rewards/margins": 11.958089828491211, "rewards/rejected": -10.890369415283203, "step": 2840 }, { "epoch": 1.83, "learning_rate": 7.719590573672935e-08, "logits/chosen": 2.381120204925537, "logits/rejected": 3.223132371902466, "logps/chosen": -429.01861572265625, "logps/rejected": -421.361328125, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": 0.6816972494125366, "rewards/margins": 11.070103645324707, "rewards/rejected": -10.388406753540039, "step": 2850 }, { "epoch": 1.84, "learning_rate": 7.70768864556058e-08, "logits/chosen": 3.1485886573791504, "logits/rejected": 3.6244475841522217, "logps/chosen": -411.9215393066406, "logps/rejected": -425.89837646484375, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": 0.4478631615638733, "rewards/margins": 11.844661712646484, "rewards/rejected": -11.39680004119873, "step": 2860 }, { "epoch": 1.84, "learning_rate": 7.695786717448226e-08, "logits/chosen": 2.1998846530914307, "logits/rejected": 3.2429378032684326, "logps/chosen": -422.6412658691406, "logps/rejected": -422.876953125, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": 1.1410973072052002, "rewards/margins": 11.749425888061523, "rewards/rejected": -10.608327865600586, "step": 2870 }, { "epoch": 1.85, "learning_rate": 7.683884789335872e-08, "logits/chosen": 2.5578694343566895, "logits/rejected": 3.4331531524658203, "logps/chosen": -455.40301513671875, "logps/rejected": -470.500732421875, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": 1.343795657157898, "rewards/margins": 12.484710693359375, "rewards/rejected": -11.140914916992188, "step": 2880 }, { "epoch": 1.86, "learning_rate": 7.671982861223519e-08, "logits/chosen": 2.706578254699707, "logits/rejected": 3.628066301345825, "logps/chosen": -389.6455383300781, "logps/rejected": -412.09686279296875, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": 1.3088476657867432, "rewards/margins": 12.140149116516113, "rewards/rejected": -10.831302642822266, "step": 2890 }, { "epoch": 1.86, "learning_rate": 7.660080933111163e-08, "logits/chosen": 2.5364797115325928, "logits/rejected": 3.7345130443573, "logps/chosen": -440.4591369628906, "logps/rejected": -508.75164794921875, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": 1.2763676643371582, "rewards/margins": 16.67254638671875, "rewards/rejected": -15.3961763381958, "step": 2900 }, { "epoch": 1.86, "eval_logits/chosen": 1.9009253978729248, "eval_logits/rejected": 2.4084951877593994, "eval_logps/chosen": -403.50592041015625, "eval_logps/rejected": -410.25238037109375, "eval_loss": 0.06930559128522873, "eval_rewards/accuracies": 0.9453125, "eval_rewards/chosen": -0.4164417088031769, "eval_rewards/margins": 10.047719955444336, "eval_rewards/rejected": -10.464160919189453, "eval_runtime": 78.1903, "eval_samples_per_second": 12.789, "eval_steps_per_second": 0.409, "step": 2900 }, { "epoch": 1.87, "learning_rate": 7.648179004998809e-08, "logits/chosen": 2.4650321006774902, "logits/rejected": 2.9449849128723145, "logps/chosen": -446.25445556640625, "logps/rejected": -468.34954833984375, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": 1.0595293045043945, "rewards/margins": 12.140164375305176, "rewards/rejected": -11.080634117126465, "step": 2910 }, { "epoch": 1.88, "learning_rate": 7.636277076886454e-08, "logits/chosen": 2.7862162590026855, "logits/rejected": 3.3674285411834717, "logps/chosen": -447.1483459472656, "logps/rejected": -483.9112854003906, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": 1.6543643474578857, "rewards/margins": 13.146313667297363, "rewards/rejected": -11.491949081420898, "step": 2920 }, { "epoch": 1.88, "learning_rate": 7.624375148774101e-08, "logits/chosen": 2.4357800483703613, "logits/rejected": 3.8338236808776855, "logps/chosen": -415.13055419921875, "logps/rejected": -419.103515625, "loss": 0.0108, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.2529149055480957, "rewards/margins": 11.368078231811523, "rewards/rejected": -10.11516284942627, "step": 2930 }, { "epoch": 1.89, "learning_rate": 7.612473220661747e-08, "logits/chosen": 3.0562214851379395, "logits/rejected": 2.8682100772857666, "logps/chosen": -413.2584533691406, "logps/rejected": -434.45428466796875, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": 0.3094024956226349, "rewards/margins": 11.557034492492676, "rewards/rejected": -11.24763298034668, "step": 2940 }, { "epoch": 1.9, "learning_rate": 7.600571292549393e-08, "logits/chosen": 2.3887839317321777, "logits/rejected": 3.3047709465026855, "logps/chosen": -442.767333984375, "logps/rejected": -466.8439025878906, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": 1.5330932140350342, "rewards/margins": 12.568510055541992, "rewards/rejected": -11.035417556762695, "step": 2950 }, { "epoch": 1.9, "learning_rate": 7.588669364437038e-08, "logits/chosen": 2.524888515472412, "logits/rejected": 3.4284675121307373, "logps/chosen": -402.7549743652344, "logps/rejected": -445.7957458496094, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": 0.39540719985961914, "rewards/margins": 11.571582794189453, "rewards/rejected": -11.176176071166992, "step": 2960 }, { "epoch": 1.91, "learning_rate": 7.576767436324685e-08, "logits/chosen": 2.311568260192871, "logits/rejected": 3.2848758697509766, "logps/chosen": -423.08416748046875, "logps/rejected": -438.42718505859375, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": 1.0096681118011475, "rewards/margins": 11.884648323059082, "rewards/rejected": -10.874979972839355, "step": 2970 }, { "epoch": 1.92, "learning_rate": 7.564865508212331e-08, "logits/chosen": 2.790484666824341, "logits/rejected": 3.298766613006592, "logps/chosen": -360.9853515625, "logps/rejected": -404.8863830566406, "loss": 0.0089, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7994908094406128, "rewards/margins": 10.370939254760742, "rewards/rejected": -9.571449279785156, "step": 2980 }, { "epoch": 1.92, "learning_rate": 7.552963580099975e-08, "logits/chosen": 2.6981308460235596, "logits/rejected": 3.2417426109313965, "logps/chosen": -399.3218994140625, "logps/rejected": -431.1190490722656, "loss": 0.0105, "rewards/accuracies": 1.0, "rewards/chosen": 1.13810133934021, "rewards/margins": 12.226466178894043, "rewards/rejected": -11.08836555480957, "step": 2990 }, { "epoch": 1.93, "learning_rate": 7.541061651987621e-08, "logits/chosen": 2.300840377807617, "logits/rejected": 3.730071544647217, "logps/chosen": -458.31536865234375, "logps/rejected": -440.55523681640625, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": 1.555953860282898, "rewards/margins": 11.838624954223633, "rewards/rejected": -10.282670974731445, "step": 3000 }, { "epoch": 1.93, "eval_logits/chosen": 1.8183174133300781, "eval_logits/rejected": 2.3427019119262695, "eval_logps/chosen": -399.7039794921875, "eval_logps/rejected": -408.1551513671875, "eval_loss": 0.059715636074543, "eval_rewards/accuracies": 0.953125, "eval_rewards/chosen": -0.03624638170003891, "eval_rewards/margins": 10.218188285827637, "eval_rewards/rejected": -10.254435539245605, "eval_runtime": 78.1202, "eval_samples_per_second": 12.801, "eval_steps_per_second": 0.41, "step": 3000 }, { "epoch": 1.93, "learning_rate": 7.529159723875268e-08, "logits/chosen": 2.5125820636749268, "logits/rejected": 2.929962635040283, "logps/chosen": -435.52734375, "logps/rejected": -480.02215576171875, "loss": 0.0104, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.0602343082427979, "rewards/margins": 12.658498764038086, "rewards/rejected": -11.59826374053955, "step": 3010 }, { "epoch": 1.94, "learning_rate": 7.517257795762913e-08, "logits/chosen": 2.5797853469848633, "logits/rejected": 3.4592716693878174, "logps/chosen": -427.9090881347656, "logps/rejected": -460.9290466308594, "loss": 0.0091, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9765796661376953, "rewards/margins": 12.438034057617188, "rewards/rejected": -11.461454391479492, "step": 3020 }, { "epoch": 1.95, "learning_rate": 7.505355867650559e-08, "logits/chosen": 2.317451000213623, "logits/rejected": 3.6136322021484375, "logps/chosen": -426.43841552734375, "logps/rejected": -431.55279541015625, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": 1.6562097072601318, "rewards/margins": 11.710824966430664, "rewards/rejected": -10.05461597442627, "step": 3030 }, { "epoch": 1.95, "learning_rate": 7.493453939538205e-08, "logits/chosen": 2.4118270874023438, "logits/rejected": 3.3411223888397217, "logps/chosen": -433.496337890625, "logps/rejected": -413.53826904296875, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": 0.49976015090942383, "rewards/margins": 10.6139554977417, "rewards/rejected": -10.114194869995117, "step": 3040 }, { "epoch": 1.96, "learning_rate": 7.48155201142585e-08, "logits/chosen": 2.3743271827697754, "logits/rejected": 3.5587539672851562, "logps/chosen": -422.71014404296875, "logps/rejected": -442.51556396484375, "loss": 0.014, "rewards/accuracies": 1.0, "rewards/chosen": 0.9052066802978516, "rewards/margins": 11.696832656860352, "rewards/rejected": -10.791627883911133, "step": 3050 }, { "epoch": 1.97, "learning_rate": 7.469650083313497e-08, "logits/chosen": 2.606020450592041, "logits/rejected": 3.443312168121338, "logps/chosen": -397.82208251953125, "logps/rejected": -423.3816833496094, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": 0.306690514087677, "rewards/margins": 11.203126907348633, "rewards/rejected": -10.896435737609863, "step": 3060 }, { "epoch": 1.97, "learning_rate": 7.457748155201143e-08, "logits/chosen": 2.916489839553833, "logits/rejected": 3.762838840484619, "logps/chosen": -384.3777770996094, "logps/rejected": -457.49041748046875, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": 0.04699459671974182, "rewards/margins": 11.909761428833008, "rewards/rejected": -11.86276626586914, "step": 3070 }, { "epoch": 1.98, "learning_rate": 7.445846227088787e-08, "logits/chosen": 2.0478157997131348, "logits/rejected": 3.478813886642456, "logps/chosen": -432.97686767578125, "logps/rejected": -439.83221435546875, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": 1.5262401103973389, "rewards/margins": 12.083934783935547, "rewards/rejected": -10.557694435119629, "step": 3080 }, { "epoch": 1.99, "learning_rate": 7.433944298976433e-08, "logits/chosen": 2.548459053039551, "logits/rejected": 3.1747069358825684, "logps/chosen": -429.4046936035156, "logps/rejected": -495.275146484375, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": 0.3433806300163269, "rewards/margins": 12.296731948852539, "rewards/rejected": -11.953351974487305, "step": 3090 }, { "epoch": 1.99, "learning_rate": 7.42204237086408e-08, "logits/chosen": 2.3155176639556885, "logits/rejected": 3.0107157230377197, "logps/chosen": -458.78216552734375, "logps/rejected": -431.2632751464844, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": 1.5119603872299194, "rewards/margins": 12.521995544433594, "rewards/rejected": -11.010034561157227, "step": 3100 }, { "epoch": 1.99, "eval_logits/chosen": 1.7790201902389526, "eval_logits/rejected": 2.328681468963623, "eval_logps/chosen": -406.3954772949219, "eval_logps/rejected": -412.999755859375, "eval_loss": 0.0663955956697464, "eval_rewards/accuracies": 0.9609375, "eval_rewards/chosen": -0.70539790391922, "eval_rewards/margins": 10.033500671386719, "eval_rewards/rejected": -10.738900184631348, "eval_runtime": 78.1747, "eval_samples_per_second": 12.792, "eval_steps_per_second": 0.409, "step": 3100 }, { "epoch": 2.0, "learning_rate": 7.410140442751725e-08, "logits/chosen": 2.371422290802002, "logits/rejected": 3.271979808807373, "logps/chosen": -402.83258056640625, "logps/rejected": -439.00079345703125, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": 0.6034157872200012, "rewards/margins": 12.108831405639648, "rewards/rejected": -11.505415916442871, "step": 3110 } ], "logging_steps": 10, "max_steps": 9336, "num_train_epochs": 6, "save_steps": 500, "total_flos": 0.0, "trial_name": null, "trial_params": null }