diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,13798 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 100, + "global_step": 8826, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.662514156285391e-10, + "logits/chosen": 2.7222177982330322, + "logits/rejected": 2.6171863079071045, + "logps/chosen": -391.45166015625, + "logps/rejected": -819.539306640625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 5.66251415628539e-09, + "logits/chosen": 1.2315014600753784, + "logits/rejected": 2.9117307662963867, + "logps/chosen": -618.143798828125, + "logps/rejected": -580.205810546875, + "loss": 0.6935, + "rewards/accuracies": 0.4861111044883728, + "rewards/chosen": 0.007726929150521755, + "rewards/margins": 0.004576317500323057, + "rewards/rejected": 0.0031506128143519163, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 1.132502831257078e-08, + "logits/chosen": 1.0855623483657837, + "logits/rejected": 3.2654030323028564, + "logps/chosen": -365.8334655761719, + "logps/rejected": -507.7064514160156, + "loss": 0.6945, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.00016126893751788884, + "rewards/margins": -0.011320212855935097, + "rewards/rejected": 0.011481483466923237, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 1.698754246885617e-08, + "logits/chosen": 1.2892698049545288, + "logits/rejected": 2.8500564098358154, + "logps/chosen": -507.72869873046875, + "logps/rejected": -473.02337646484375, + "loss": 0.6923, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.0013575742486864328, + "rewards/margins": 0.011692820116877556, + "rewards/rejected": -0.010335246101021767, + "step": 30 + }, + { + "epoch": 0.01, + "learning_rate": 2.265005662514156e-08, + "logits/chosen": 1.204493761062622, + "logits/rejected": 3.2822766304016113, + "logps/chosen": -384.868408203125, + "logps/rejected": -533.8895874023438, + "loss": 0.692, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.005484940949827433, + "rewards/margins": -0.0033146303612738848, + "rewards/rejected": -0.002170309191569686, + "step": 40 + }, + { + "epoch": 0.02, + "learning_rate": 2.8312570781426952e-08, + "logits/chosen": 0.5130025148391724, + "logits/rejected": 2.7183468341827393, + "logps/chosen": -339.9174499511719, + "logps/rejected": -595.5003051757812, + "loss": 0.6896, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.0010748256463557482, + "rewards/margins": 0.004564857110381126, + "rewards/rejected": -0.0034900312311947346, + "step": 50 + }, + { + "epoch": 0.02, + "learning_rate": 3.397508493771234e-08, + "logits/chosen": 1.5610108375549316, + "logits/rejected": 3.428831100463867, + "logps/chosen": -428.70269775390625, + "logps/rejected": -401.84478759765625, + "loss": 0.6889, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.012302838265895844, + "rewards/margins": 0.004726693034172058, + "rewards/rejected": 0.007576142903417349, + "step": 60 + }, + { + "epoch": 0.02, + "learning_rate": 3.9637599093997736e-08, + "logits/chosen": 0.7089935541152954, + "logits/rejected": 2.8489346504211426, + "logps/chosen": -394.15484619140625, + "logps/rejected": -542.0667114257812, + "loss": 0.682, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.013238462619483471, + "rewards/margins": 0.023042922839522362, + "rewards/rejected": -0.00980446022003889, + "step": 70 + }, + { + "epoch": 0.03, + "learning_rate": 4.530011325028312e-08, + "logits/chosen": 1.2423776388168335, + "logits/rejected": 3.1232666969299316, + "logps/chosen": -538.8358764648438, + "logps/rejected": -434.6361389160156, + "loss": 0.6757, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.028549503535032272, + "rewards/margins": 0.033378563821315765, + "rewards/rejected": -0.004829060286283493, + "step": 80 + }, + { + "epoch": 0.03, + "learning_rate": 5.096262740656852e-08, + "logits/chosen": 1.0223588943481445, + "logits/rejected": 3.520498752593994, + "logps/chosen": -351.19488525390625, + "logps/rejected": -328.054931640625, + "loss": 0.6678, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.04164254665374756, + "rewards/margins": 0.035227615386247635, + "rewards/rejected": 0.0064149340614676476, + "step": 90 + }, + { + "epoch": 0.03, + "learning_rate": 5.6625141562853904e-08, + "logits/chosen": 0.7637745141983032, + "logits/rejected": 3.3096764087677, + "logps/chosen": -328.73541259765625, + "logps/rejected": -459.0279235839844, + "loss": 0.6612, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.06262902915477753, + "rewards/margins": 0.08298339694738388, + "rewards/rejected": -0.02035437896847725, + "step": 100 + }, + { + "epoch": 0.03, + "eval_logits/chosen": 0.9032161235809326, + "eval_logits/rejected": 3.2378697395324707, + "eval_logps/chosen": -388.2252502441406, + "eval_logps/rejected": -502.7813415527344, + "eval_loss": 0.6499212980270386, + "eval_rewards/accuracies": 0.8299663066864014, + "eval_rewards/chosen": 0.07646423578262329, + "eval_rewards/margins": 0.0915694385766983, + "eval_rewards/rejected": -0.015105200931429863, + "eval_runtime": 263.1916, + "eval_samples_per_second": 36.095, + "eval_steps_per_second": 1.128, + "step": 100 + }, + { + "epoch": 0.04, + "learning_rate": 6.22876557191393e-08, + "logits/chosen": 0.8191145062446594, + "logits/rejected": 2.4177682399749756, + "logps/chosen": -339.58966064453125, + "logps/rejected": -722.6461181640625, + "loss": 0.6388, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.09637552499771118, + "rewards/margins": 0.11133376508951187, + "rewards/rejected": -0.014958225190639496, + "step": 110 + }, + { + "epoch": 0.04, + "learning_rate": 6.795016987542468e-08, + "logits/chosen": 1.52385675907135, + "logits/rejected": 2.9242682456970215, + "logps/chosen": -369.44244384765625, + "logps/rejected": -489.9684143066406, + "loss": 0.6228, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.15489232540130615, + "rewards/margins": 0.17702451348304749, + "rewards/rejected": -0.02213219925761223, + "step": 120 + }, + { + "epoch": 0.04, + "learning_rate": 7.361268403171007e-08, + "logits/chosen": 1.4604432582855225, + "logits/rejected": 3.0254647731781006, + "logps/chosen": -409.232421875, + "logps/rejected": -533.1013793945312, + "loss": 0.6093, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 0.16234104335308075, + "rewards/margins": 0.22091738879680634, + "rewards/rejected": -0.05857633799314499, + "step": 130 + }, + { + "epoch": 0.05, + "learning_rate": 7.927519818799547e-08, + "logits/chosen": 0.8527463674545288, + "logits/rejected": 2.8990535736083984, + "logps/chosen": -306.9147644042969, + "logps/rejected": -554.9722900390625, + "loss": 0.6016, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.14748191833496094, + "rewards/margins": 0.1829536259174347, + "rewards/rejected": -0.03547172620892525, + "step": 140 + }, + { + "epoch": 0.05, + "learning_rate": 8.493771234428086e-08, + "logits/chosen": 1.6399835348129272, + "logits/rejected": 3.005208730697632, + "logps/chosen": -345.0897521972656, + "logps/rejected": -477.846923828125, + "loss": 0.5901, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.19912423193454742, + "rewards/margins": 0.22349488735198975, + "rewards/rejected": -0.024370649829506874, + "step": 150 + }, + { + "epoch": 0.05, + "learning_rate": 9.060022650056625e-08, + "logits/chosen": 1.7255092859268188, + "logits/rejected": 3.2128493785858154, + "logps/chosen": -329.3196716308594, + "logps/rejected": -453.23236083984375, + "loss": 0.5629, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.23143677413463593, + "rewards/margins": 0.29594728350639343, + "rewards/rejected": -0.06451050937175751, + "step": 160 + }, + { + "epoch": 0.06, + "learning_rate": 9.626274065685163e-08, + "logits/chosen": 0.5160809755325317, + "logits/rejected": 3.3329761028289795, + "logps/chosen": -425.4725036621094, + "logps/rejected": -473.4015197753906, + "loss": 0.5183, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 0.3114267885684967, + "rewards/margins": 0.40139466524124146, + "rewards/rejected": -0.08996789157390594, + "step": 170 + }, + { + "epoch": 0.06, + "learning_rate": 1.0192525481313703e-07, + "logits/chosen": 1.6642049551010132, + "logits/rejected": 2.928480386734009, + "logps/chosen": -462.1224060058594, + "logps/rejected": -587.8157958984375, + "loss": 0.4853, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.41756492853164673, + "rewards/margins": 0.5190845727920532, + "rewards/rejected": -0.10151971876621246, + "step": 180 + }, + { + "epoch": 0.06, + "learning_rate": 1.0758776896942241e-07, + "logits/chosen": 1.4960415363311768, + "logits/rejected": 3.2985711097717285, + "logps/chosen": -346.6057434082031, + "logps/rejected": -505.11907958984375, + "loss": 0.4605, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.5564008951187134, + "rewards/margins": 0.6558611392974854, + "rewards/rejected": -0.09946014732122421, + "step": 190 + }, + { + "epoch": 0.07, + "learning_rate": 1.1325028312570781e-07, + "logits/chosen": 1.667295217514038, + "logits/rejected": 3.3900818824768066, + "logps/chosen": -399.5884094238281, + "logps/rejected": -394.2358703613281, + "loss": 0.4585, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.4879857897758484, + "rewards/margins": 0.5740679502487183, + "rewards/rejected": -0.08608220517635345, + "step": 200 + }, + { + "epoch": 0.07, + "eval_logits/chosen": 0.9080647230148315, + "eval_logits/rejected": 3.249399423599243, + "eval_logps/chosen": -383.7663269042969, + "eval_logps/rejected": -503.872314453125, + "eval_loss": 0.44577568769454956, + "eval_rewards/accuracies": 0.930134654045105, + "eval_rewards/chosen": 0.5223554372787476, + "eval_rewards/margins": 0.6465521454811096, + "eval_rewards/rejected": -0.12419669330120087, + "eval_runtime": 267.4864, + "eval_samples_per_second": 35.516, + "eval_steps_per_second": 1.11, + "step": 200 + }, + { + "epoch": 0.07, + "learning_rate": 1.189127972819932e-07, + "logits/chosen": 1.3309799432754517, + "logits/rejected": 2.590378522872925, + "logps/chosen": -338.12176513671875, + "logps/rejected": -641.5697631835938, + "loss": 0.4334, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.6168349385261536, + "rewards/margins": 0.7532841563224792, + "rewards/rejected": -0.13644923269748688, + "step": 210 + }, + { + "epoch": 0.07, + "learning_rate": 1.245753114382786e-07, + "logits/chosen": 0.8702503442764282, + "logits/rejected": 3.3507354259490967, + "logps/chosen": -455.60137939453125, + "logps/rejected": -447.7149963378906, + "loss": 0.4091, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6552426218986511, + "rewards/margins": 0.7988005876541138, + "rewards/rejected": -0.14355802536010742, + "step": 220 + }, + { + "epoch": 0.08, + "learning_rate": 1.3023782559456398e-07, + "logits/chosen": 1.2613376379013062, + "logits/rejected": 3.0275485515594482, + "logps/chosen": -405.063232421875, + "logps/rejected": -535.0067138671875, + "loss": 0.3977, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.672575831413269, + "rewards/margins": 0.811522364616394, + "rewards/rejected": -0.13894659280776978, + "step": 230 + }, + { + "epoch": 0.08, + "learning_rate": 1.3590033975084937e-07, + "logits/chosen": 1.183171272277832, + "logits/rejected": 2.8756091594696045, + "logps/chosen": -385.80487060546875, + "logps/rejected": -625.9861450195312, + "loss": 0.3953, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.6583465337753296, + "rewards/margins": 0.9188927412033081, + "rewards/rejected": -0.2605462074279785, + "step": 240 + }, + { + "epoch": 0.08, + "learning_rate": 1.4156285390713476e-07, + "logits/chosen": 1.2423839569091797, + "logits/rejected": 3.269233226776123, + "logps/chosen": -333.55419921875, + "logps/rejected": -476.35003662109375, + "loss": 0.3743, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7891533970832825, + "rewards/margins": 1.0434578657150269, + "rewards/rejected": -0.254304438829422, + "step": 250 + }, + { + "epoch": 0.09, + "learning_rate": 1.4722536806342014e-07, + "logits/chosen": 1.506807804107666, + "logits/rejected": 3.2091269493103027, + "logps/chosen": -398.2955017089844, + "logps/rejected": -529.6971435546875, + "loss": 0.3339, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.8295198678970337, + "rewards/margins": 1.06871497631073, + "rewards/rejected": -0.23919522762298584, + "step": 260 + }, + { + "epoch": 0.09, + "learning_rate": 1.5288788221970556e-07, + "logits/chosen": 0.5976768136024475, + "logits/rejected": 3.549748182296753, + "logps/chosen": -358.4790954589844, + "logps/rejected": -366.1632080078125, + "loss": 0.3109, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.9163538813591003, + "rewards/margins": 1.2519645690917969, + "rewards/rejected": -0.335610568523407, + "step": 270 + }, + { + "epoch": 0.1, + "learning_rate": 1.5855039637599094e-07, + "logits/chosen": 1.9247735738754272, + "logits/rejected": 3.9325199127197266, + "logps/chosen": -334.7523498535156, + "logps/rejected": -343.63299560546875, + "loss": 0.2906, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.034354567527771, + "rewards/margins": 1.4914100170135498, + "rewards/rejected": -0.45705538988113403, + "step": 280 + }, + { + "epoch": 0.1, + "learning_rate": 1.642129105322763e-07, + "logits/chosen": 0.8872585296630859, + "logits/rejected": 3.4810404777526855, + "logps/chosen": -305.9379577636719, + "logps/rejected": -500.093017578125, + "loss": 0.2794, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.9907447695732117, + "rewards/margins": 1.4527558088302612, + "rewards/rejected": -0.46201109886169434, + "step": 290 + }, + { + "epoch": 0.1, + "learning_rate": 1.6987542468856172e-07, + "logits/chosen": 1.4215636253356934, + "logits/rejected": 3.333827257156372, + "logps/chosen": -331.09088134765625, + "logps/rejected": -579.3836669921875, + "loss": 0.2519, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.1994380950927734, + "rewards/margins": 1.746361494064331, + "rewards/rejected": -0.5469235777854919, + "step": 300 + }, + { + "epoch": 0.1, + "eval_logits/chosen": 0.9126914739608765, + "eval_logits/rejected": 3.279005765914917, + "eval_logps/chosen": -376.9534606933594, + "eval_logps/rejected": -507.4445495605469, + "eval_loss": 0.253989577293396, + "eval_rewards/accuracies": 0.9469696879386902, + "eval_rewards/chosen": 1.2036420106887817, + "eval_rewards/margins": 1.6850591897964478, + "eval_rewards/rejected": -0.48141714930534363, + "eval_runtime": 268.0468, + "eval_samples_per_second": 35.442, + "eval_steps_per_second": 1.108, + "step": 300 + }, + { + "epoch": 0.11, + "learning_rate": 1.755379388448471e-07, + "logits/chosen": 1.0812231302261353, + "logits/rejected": 3.189873218536377, + "logps/chosen": -342.5466003417969, + "logps/rejected": -491.248046875, + "loss": 0.2477, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.3413209915161133, + "rewards/margins": 1.7608686685562134, + "rewards/rejected": -0.4195477068424225, + "step": 310 + }, + { + "epoch": 0.11, + "learning_rate": 1.812004530011325e-07, + "logits/chosen": 1.6338831186294556, + "logits/rejected": 3.1776280403137207, + "logps/chosen": -517.4567260742188, + "logps/rejected": -400.4752502441406, + "loss": 0.2259, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 1.4155336618423462, + "rewards/margins": 1.8963558673858643, + "rewards/rejected": -0.4808220863342285, + "step": 320 + }, + { + "epoch": 0.11, + "learning_rate": 1.868629671574179e-07, + "logits/chosen": 1.18483567237854, + "logits/rejected": 2.889453411102295, + "logps/chosen": -396.57440185546875, + "logps/rejected": -572.3213500976562, + "loss": 0.211, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.4644243717193604, + "rewards/margins": 2.263093948364258, + "rewards/rejected": -0.7986693382263184, + "step": 330 + }, + { + "epoch": 0.12, + "learning_rate": 1.9252548131370327e-07, + "logits/chosen": 1.4275858402252197, + "logits/rejected": 3.412844181060791, + "logps/chosen": -310.13433837890625, + "logps/rejected": -513.6046752929688, + "loss": 0.223, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.4905742406845093, + "rewards/margins": 2.169682025909424, + "rewards/rejected": -0.679107666015625, + "step": 340 + }, + { + "epoch": 0.12, + "learning_rate": 1.9818799546998865e-07, + "logits/chosen": 1.1891658306121826, + "logits/rejected": 2.760612964630127, + "logps/chosen": -370.5301818847656, + "logps/rejected": -568.3065185546875, + "loss": 0.2099, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5602627992630005, + "rewards/margins": 2.338247776031494, + "rewards/rejected": -0.777985155582428, + "step": 350 + }, + { + "epoch": 0.12, + "learning_rate": 2.0385050962627407e-07, + "logits/chosen": 1.4980577230453491, + "logits/rejected": 3.5479140281677246, + "logps/chosen": -308.7692565917969, + "logps/rejected": -398.22100830078125, + "loss": 0.1847, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.408210039138794, + "rewards/margins": 2.200162887573242, + "rewards/rejected": -0.7919529676437378, + "step": 360 + }, + { + "epoch": 0.13, + "learning_rate": 2.0951302378255946e-07, + "logits/chosen": 1.1414896249771118, + "logits/rejected": 2.569624423980713, + "logps/chosen": -315.5340270996094, + "logps/rejected": -732.6063232421875, + "loss": 0.1939, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.3921127319335938, + "rewards/margins": 2.1865804195404053, + "rewards/rejected": -0.7944676280021667, + "step": 370 + }, + { + "epoch": 0.13, + "learning_rate": 2.1517553793884482e-07, + "logits/chosen": 1.4118402004241943, + "logits/rejected": 2.800691604614258, + "logps/chosen": -356.115478515625, + "logps/rejected": -640.7478637695312, + "loss": 0.1899, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.7040725946426392, + "rewards/margins": 2.3913490772247314, + "rewards/rejected": -0.6872765421867371, + "step": 380 + }, + { + "epoch": 0.13, + "learning_rate": 2.2083805209513023e-07, + "logits/chosen": 1.4530861377716064, + "logits/rejected": 3.166997194290161, + "logps/chosen": -379.05584716796875, + "logps/rejected": -408.69830322265625, + "loss": 0.1978, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 1.4759045839309692, + "rewards/margins": 2.3196094036102295, + "rewards/rejected": -0.8437048196792603, + "step": 390 + }, + { + "epoch": 0.14, + "learning_rate": 2.2650056625141562e-07, + "logits/chosen": 1.2422279119491577, + "logits/rejected": 3.232043504714966, + "logps/chosen": -335.52813720703125, + "logps/rejected": -601.8195190429688, + "loss": 0.17, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.9281127452850342, + "rewards/margins": 3.019509792327881, + "rewards/rejected": -1.0913970470428467, + "step": 400 + }, + { + "epoch": 0.14, + "eval_logits/chosen": 0.9173203706741333, + "eval_logits/rejected": 3.3007280826568604, + "eval_logps/chosen": -373.1958923339844, + "eval_logps/rejected": -512.6629028320312, + "eval_loss": 0.17509011924266815, + "eval_rewards/accuracies": 0.9562289714813232, + "eval_rewards/chosen": 1.5794016122817993, + "eval_rewards/margins": 2.582660436630249, + "eval_rewards/rejected": -1.0032589435577393, + "eval_runtime": 268.2511, + "eval_samples_per_second": 35.415, + "eval_steps_per_second": 1.107, + "step": 400 + }, + { + "epoch": 0.14, + "learning_rate": 2.32163080407701e-07, + "logits/chosen": 0.40517282485961914, + "logits/rejected": 2.861290454864502, + "logps/chosen": -316.54620361328125, + "logps/rejected": -506.2539978027344, + "loss": 0.1664, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.8524707555770874, + "rewards/margins": 2.5778260231018066, + "rewards/rejected": -0.7253550291061401, + "step": 410 + }, + { + "epoch": 0.14, + "learning_rate": 2.378255945639864e-07, + "logits/chosen": 0.664097785949707, + "logits/rejected": 2.845303535461426, + "logps/chosen": -320.03125, + "logps/rejected": -667.8426513671875, + "loss": 0.1677, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.7594044208526611, + "rewards/margins": 2.9025471210479736, + "rewards/rejected": -1.143142819404602, + "step": 420 + }, + { + "epoch": 0.15, + "learning_rate": 2.434881087202718e-07, + "logits/chosen": 0.7501753568649292, + "logits/rejected": 2.8510653972625732, + "logps/chosen": -307.3896179199219, + "logps/rejected": -599.6353149414062, + "loss": 0.1465, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.7693700790405273, + "rewards/margins": 3.032701015472412, + "rewards/rejected": -1.2633306980133057, + "step": 430 + }, + { + "epoch": 0.15, + "learning_rate": 2.491506228765572e-07, + "logits/chosen": 1.5089292526245117, + "logits/rejected": 3.524827241897583, + "logps/chosen": -321.5747375488281, + "logps/rejected": -450.64276123046875, + "loss": 0.1466, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.5898950099945068, + "rewards/margins": 3.1321098804473877, + "rewards/rejected": -1.5422146320343018, + "step": 440 + }, + { + "epoch": 0.15, + "learning_rate": 2.548131370328426e-07, + "logits/chosen": 1.3345590829849243, + "logits/rejected": 3.7219035625457764, + "logps/chosen": -352.8118896484375, + "logps/rejected": -376.58013916015625, + "loss": 0.1484, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.4617602825164795, + "rewards/margins": 3.2981064319610596, + "rewards/rejected": -1.8363460302352905, + "step": 450 + }, + { + "epoch": 0.16, + "learning_rate": 2.6047565118912797e-07, + "logits/chosen": 1.7875549793243408, + "logits/rejected": 3.7318198680877686, + "logps/chosen": -387.7720947265625, + "logps/rejected": -427.65972900390625, + "loss": 0.1293, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.8360264301300049, + "rewards/margins": 3.6591992378234863, + "rewards/rejected": -1.8231725692749023, + "step": 460 + }, + { + "epoch": 0.16, + "learning_rate": 2.6613816534541335e-07, + "logits/chosen": 1.857513189315796, + "logits/rejected": 3.7900021076202393, + "logps/chosen": -471.33551025390625, + "logps/rejected": -291.6858825683594, + "loss": 0.1331, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.937786340713501, + "rewards/margins": 3.6518795490264893, + "rewards/rejected": -1.7140929698944092, + "step": 470 + }, + { + "epoch": 0.16, + "learning_rate": 2.7180067950169874e-07, + "logits/chosen": 0.5211232304573059, + "logits/rejected": 2.575617790222168, + "logps/chosen": -314.490966796875, + "logps/rejected": -596.9625244140625, + "loss": 0.1188, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.9314266443252563, + "rewards/margins": 3.5651352405548096, + "rewards/rejected": -1.6337085962295532, + "step": 480 + }, + { + "epoch": 0.17, + "learning_rate": 2.7746319365798413e-07, + "logits/chosen": 1.2549169063568115, + "logits/rejected": 3.3965110778808594, + "logps/chosen": -345.92669677734375, + "logps/rejected": -505.69659423828125, + "loss": 0.1212, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.377042531967163, + "rewards/margins": 4.275637149810791, + "rewards/rejected": -1.8985941410064697, + "step": 490 + }, + { + "epoch": 0.17, + "learning_rate": 2.831257078142695e-07, + "logits/chosen": 1.7851215600967407, + "logits/rejected": 3.7306265830993652, + "logps/chosen": -459.65802001953125, + "logps/rejected": -326.4261169433594, + "loss": 0.1179, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.8807199001312256, + "rewards/margins": 3.8889358043670654, + "rewards/rejected": -2.008216142654419, + "step": 500 + }, + { + "epoch": 0.17, + "eval_logits/chosen": 0.9103542566299438, + "eval_logits/rejected": 3.292522430419922, + "eval_logps/chosen": -370.5672607421875, + "eval_logps/rejected": -523.4216918945312, + "eval_loss": 0.12153849750757217, + "eval_rewards/accuracies": 0.9587541818618774, + "eval_rewards/chosen": 1.8422629833221436, + "eval_rewards/margins": 3.9213974475860596, + "eval_rewards/rejected": -2.079134225845337, + "eval_runtime": 268.6563, + "eval_samples_per_second": 35.361, + "eval_steps_per_second": 1.106, + "step": 500 + }, + { + "epoch": 0.17, + "learning_rate": 2.887882219705549e-07, + "logits/chosen": 2.08186936378479, + "logits/rejected": 2.738398313522339, + "logps/chosen": -426.4551696777344, + "logps/rejected": -635.5200805664062, + "loss": 0.1274, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.002917766571045, + "rewards/margins": 4.0474138259887695, + "rewards/rejected": -2.0444960594177246, + "step": 510 + }, + { + "epoch": 0.18, + "learning_rate": 2.944507361268403e-07, + "logits/chosen": 1.487585186958313, + "logits/rejected": 2.9141831398010254, + "logps/chosen": -328.9551086425781, + "logps/rejected": -544.206787109375, + "loss": 0.112, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.959592580795288, + "rewards/margins": 4.269417762756348, + "rewards/rejected": -2.3098244667053223, + "step": 520 + }, + { + "epoch": 0.18, + "learning_rate": 3.001132502831257e-07, + "logits/chosen": 1.0084563493728638, + "logits/rejected": 3.2186686992645264, + "logps/chosen": -324.2669677734375, + "logps/rejected": -610.30029296875, + "loss": 0.1176, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.2090611457824707, + "rewards/margins": 4.53490686416626, + "rewards/rejected": -2.32584547996521, + "step": 530 + }, + { + "epoch": 0.18, + "learning_rate": 3.057757644394111e-07, + "logits/chosen": 1.4372812509536743, + "logits/rejected": 3.4026083946228027, + "logps/chosen": -311.79840087890625, + "logps/rejected": -425.08453369140625, + "loss": 0.1028, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.122832775115967, + "rewards/margins": 4.550196647644043, + "rewards/rejected": -2.427363872528076, + "step": 540 + }, + { + "epoch": 0.19, + "learning_rate": 3.114382785956965e-07, + "logits/chosen": 0.9511749148368835, + "logits/rejected": 3.254910707473755, + "logps/chosen": -321.9422302246094, + "logps/rejected": -476.6572265625, + "loss": 0.1174, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.133894681930542, + "rewards/margins": 4.333982944488525, + "rewards/rejected": -2.2000882625579834, + "step": 550 + }, + { + "epoch": 0.19, + "learning_rate": 3.171007927519819e-07, + "logits/chosen": 1.9490430355072021, + "logits/rejected": 3.237525224685669, + "logps/chosen": -447.9163513183594, + "logps/rejected": -467.66162109375, + "loss": 0.1015, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 2.141505718231201, + "rewards/margins": 4.317712306976318, + "rewards/rejected": -2.17620587348938, + "step": 560 + }, + { + "epoch": 0.19, + "learning_rate": 3.227633069082673e-07, + "logits/chosen": 0.9235776662826538, + "logits/rejected": 3.400035858154297, + "logps/chosen": -471.5634765625, + "logps/rejected": -351.24774169921875, + "loss": 0.0831, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 2.087817668914795, + "rewards/margins": 4.746884346008301, + "rewards/rejected": -2.659067153930664, + "step": 570 + }, + { + "epoch": 0.2, + "learning_rate": 3.284258210645526e-07, + "logits/chosen": 1.2091686725616455, + "logits/rejected": 3.3013522624969482, + "logps/chosen": -441.1993713378906, + "logps/rejected": -504.54559326171875, + "loss": 0.1315, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.3476431369781494, + "rewards/margins": 4.925909996032715, + "rewards/rejected": -2.5782666206359863, + "step": 580 + }, + { + "epoch": 0.2, + "learning_rate": 3.34088335220838e-07, + "logits/chosen": 1.3331282138824463, + "logits/rejected": 3.4447312355041504, + "logps/chosen": -329.92047119140625, + "logps/rejected": -407.18438720703125, + "loss": 0.1286, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 2.297781467437744, + "rewards/margins": 5.266199111938477, + "rewards/rejected": -2.9684176445007324, + "step": 590 + }, + { + "epoch": 0.2, + "learning_rate": 3.3975084937712344e-07, + "logits/chosen": 1.0378614664077759, + "logits/rejected": 2.619415760040283, + "logps/chosen": -421.515625, + "logps/rejected": -664.7316284179688, + "loss": 0.1032, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.105104446411133, + "rewards/margins": 4.875788688659668, + "rewards/rejected": -2.7706847190856934, + "step": 600 + }, + { + "epoch": 0.2, + "eval_logits/chosen": 0.9179551005363464, + "eval_logits/rejected": 3.257366895675659, + "eval_logps/chosen": -368.0875549316406, + "eval_logps/rejected": -530.27734375, + "eval_loss": 0.10775981098413467, + "eval_rewards/accuracies": 0.9595959782600403, + "eval_rewards/chosen": 2.090228796005249, + "eval_rewards/margins": 4.854931831359863, + "eval_rewards/rejected": -2.7647030353546143, + "eval_runtime": 268.0718, + "eval_samples_per_second": 35.438, + "eval_steps_per_second": 1.108, + "step": 600 + }, + { + "epoch": 0.21, + "learning_rate": 3.454133635334088e-07, + "logits/chosen": 1.8005974292755127, + "logits/rejected": 3.3652682304382324, + "logps/chosen": -355.82696533203125, + "logps/rejected": -472.01287841796875, + "loss": 0.1071, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.616498351097107, + "rewards/margins": 4.316979885101318, + "rewards/rejected": -2.70048189163208, + "step": 610 + }, + { + "epoch": 0.21, + "learning_rate": 3.510758776896942e-07, + "logits/chosen": 1.010353684425354, + "logits/rejected": 2.7700181007385254, + "logps/chosen": -447.08050537109375, + "logps/rejected": -578.945556640625, + "loss": 0.1084, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 2.266951322555542, + "rewards/margins": 4.7797393798828125, + "rewards/rejected": -2.5127882957458496, + "step": 620 + }, + { + "epoch": 0.21, + "learning_rate": 3.567383918459796e-07, + "logits/chosen": 1.316406488418579, + "logits/rejected": 3.448246717453003, + "logps/chosen": -392.95458984375, + "logps/rejected": -317.03570556640625, + "loss": 0.0983, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.9485342502593994, + "rewards/margins": 5.007516384124756, + "rewards/rejected": -3.0589828491210938, + "step": 630 + }, + { + "epoch": 0.22, + "learning_rate": 3.62400906002265e-07, + "logits/chosen": 1.4376599788665771, + "logits/rejected": 3.0495493412017822, + "logps/chosen": -385.1656799316406, + "logps/rejected": -570.9940185546875, + "loss": 0.0844, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.372509479522705, + "rewards/margins": 5.976849555969238, + "rewards/rejected": -3.6043407917022705, + "step": 640 + }, + { + "epoch": 0.22, + "learning_rate": 3.6806342015855037e-07, + "logits/chosen": 1.7421470880508423, + "logits/rejected": 3.6961002349853516, + "logps/chosen": -357.5590515136719, + "logps/rejected": -390.57305908203125, + "loss": 0.0852, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.394005298614502, + "rewards/margins": 5.477439880371094, + "rewards/rejected": -3.083434820175171, + "step": 650 + }, + { + "epoch": 0.22, + "learning_rate": 3.737259343148358e-07, + "logits/chosen": 1.2248786687850952, + "logits/rejected": 3.567431926727295, + "logps/chosen": -317.44512939453125, + "logps/rejected": -538.3814697265625, + "loss": 0.0836, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 2.2307286262512207, + "rewards/margins": 5.791689872741699, + "rewards/rejected": -3.560961961746216, + "step": 660 + }, + { + "epoch": 0.23, + "learning_rate": 3.7938844847112115e-07, + "logits/chosen": 1.7818076610565186, + "logits/rejected": 3.1708149909973145, + "logps/chosen": -360.680908203125, + "logps/rejected": -613.5748901367188, + "loss": 0.0958, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.9897119998931885, + "rewards/margins": 5.3165106773376465, + "rewards/rejected": -3.326798677444458, + "step": 670 + }, + { + "epoch": 0.23, + "learning_rate": 3.8505096262740653e-07, + "logits/chosen": 1.89913010597229, + "logits/rejected": 3.2814056873321533, + "logps/chosen": -339.29925537109375, + "logps/rejected": -550.3546752929688, + "loss": 0.0629, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.3413264751434326, + "rewards/margins": 5.58084774017334, + "rewards/rejected": -3.23952054977417, + "step": 680 + }, + { + "epoch": 0.23, + "learning_rate": 3.907134767836919e-07, + "logits/chosen": 1.546518325805664, + "logits/rejected": 2.939469337463379, + "logps/chosen": -416.63336181640625, + "logps/rejected": -570.0182495117188, + "loss": 0.0906, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.569227933883667, + "rewards/margins": 6.09161901473999, + "rewards/rejected": -3.5223910808563232, + "step": 690 + }, + { + "epoch": 0.24, + "learning_rate": 3.963759909399773e-07, + "logits/chosen": 0.5802558660507202, + "logits/rejected": 2.9904799461364746, + "logps/chosen": -362.68701171875, + "logps/rejected": -586.5096435546875, + "loss": 0.0614, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8239052295684814, + "rewards/margins": 6.065105438232422, + "rewards/rejected": -3.2412002086639404, + "step": 700 + }, + { + "epoch": 0.24, + "eval_logits/chosen": 0.9190412759780884, + "eval_logits/rejected": 3.2242751121520996, + "eval_logps/chosen": -366.1594543457031, + "eval_logps/rejected": -536.8206787109375, + "eval_loss": 0.08807818591594696, + "eval_rewards/accuracies": 0.9638047218322754, + "eval_rewards/chosen": 2.28304123878479, + "eval_rewards/margins": 5.702078342437744, + "eval_rewards/rejected": -3.419036865234375, + "eval_runtime": 268.6383, + "eval_samples_per_second": 35.364, + "eval_steps_per_second": 1.106, + "step": 700 + }, + { + "epoch": 0.24, + "learning_rate": 4.0203850509626275e-07, + "logits/chosen": 1.4401183128356934, + "logits/rejected": 3.3762612342834473, + "logps/chosen": -354.66778564453125, + "logps/rejected": -428.1043395996094, + "loss": 0.0789, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.3884358406066895, + "rewards/margins": 6.028875827789307, + "rewards/rejected": -3.640439510345459, + "step": 710 + }, + { + "epoch": 0.24, + "learning_rate": 4.0770101925254814e-07, + "logits/chosen": 1.7469635009765625, + "logits/rejected": 3.5658583641052246, + "logps/chosen": -344.3053894042969, + "logps/rejected": -453.9859924316406, + "loss": 0.103, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 2.06357741355896, + "rewards/margins": 5.505257606506348, + "rewards/rejected": -3.4416797161102295, + "step": 720 + }, + { + "epoch": 0.25, + "learning_rate": 4.133635334088335e-07, + "logits/chosen": 1.3185302019119263, + "logits/rejected": 3.143467903137207, + "logps/chosen": -398.46685791015625, + "logps/rejected": -499.00762939453125, + "loss": 0.0714, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.513793706893921, + "rewards/margins": 5.737306594848633, + "rewards/rejected": -3.2235121726989746, + "step": 730 + }, + { + "epoch": 0.25, + "learning_rate": 4.190260475651189e-07, + "logits/chosen": 1.3261969089508057, + "logits/rejected": 3.325129270553589, + "logps/chosen": -395.0682678222656, + "logps/rejected": -463.11102294921875, + "loss": 0.0837, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.9389358758926392, + "rewards/margins": 5.200533866882324, + "rewards/rejected": -3.2615981101989746, + "step": 740 + }, + { + "epoch": 0.25, + "learning_rate": 4.2468856172140424e-07, + "logits/chosen": 0.8084484338760376, + "logits/rejected": 3.0004987716674805, + "logps/chosen": -308.0705871582031, + "logps/rejected": -545.3846435546875, + "loss": 0.0571, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.676938772201538, + "rewards/margins": 6.314852237701416, + "rewards/rejected": -3.637913227081299, + "step": 750 + }, + { + "epoch": 0.26, + "learning_rate": 4.3035107587768963e-07, + "logits/chosen": 1.5113528966903687, + "logits/rejected": 3.3475310802459717, + "logps/chosen": -314.2061462402344, + "logps/rejected": -437.4307556152344, + "loss": 0.0932, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.2036123275756836, + "rewards/margins": 5.510008811950684, + "rewards/rejected": -3.306396484375, + "step": 760 + }, + { + "epoch": 0.26, + "learning_rate": 4.3601359003397507e-07, + "logits/chosen": 1.2984874248504639, + "logits/rejected": 2.807908058166504, + "logps/chosen": -425.2962951660156, + "logps/rejected": -527.3756713867188, + "loss": 0.0761, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.349900722503662, + "rewards/margins": 5.7003302574157715, + "rewards/rejected": -3.350429058074951, + "step": 770 + }, + { + "epoch": 0.27, + "learning_rate": 4.4167610419026046e-07, + "logits/chosen": 0.9681515693664551, + "logits/rejected": 2.8194892406463623, + "logps/chosen": -438.4591369628906, + "logps/rejected": -627.0498046875, + "loss": 0.0831, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.592127561569214, + "rewards/margins": 6.795496940612793, + "rewards/rejected": -4.203368663787842, + "step": 780 + }, + { + "epoch": 0.27, + "learning_rate": 4.4733861834654585e-07, + "logits/chosen": 1.1889218091964722, + "logits/rejected": 3.3345398902893066, + "logps/chosen": -363.73291015625, + "logps/rejected": -372.0538635253906, + "loss": 0.0681, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.4557104110717773, + "rewards/margins": 6.361659526824951, + "rewards/rejected": -3.905949115753174, + "step": 790 + }, + { + "epoch": 0.27, + "learning_rate": 4.5300113250283123e-07, + "logits/chosen": 1.7186403274536133, + "logits/rejected": 2.917168140411377, + "logps/chosen": -373.5525207519531, + "logps/rejected": -480.1240234375, + "loss": 0.0666, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.4754912853240967, + "rewards/margins": 5.41542387008667, + "rewards/rejected": -2.939932346343994, + "step": 800 + }, + { + "epoch": 0.27, + "eval_logits/chosen": 0.9024965167045593, + "eval_logits/rejected": 3.1787774562835693, + "eval_logps/chosen": -365.29949951171875, + "eval_logps/rejected": -543.2213745117188, + "eval_loss": 0.07510381191968918, + "eval_rewards/accuracies": 0.9688552021980286, + "eval_rewards/chosen": 2.3690366744995117, + "eval_rewards/margins": 6.428139686584473, + "eval_rewards/rejected": -4.059103012084961, + "eval_runtime": 268.4853, + "eval_samples_per_second": 35.384, + "eval_steps_per_second": 1.106, + "step": 800 + }, + { + "epoch": 0.28, + "learning_rate": 4.586636466591166e-07, + "logits/chosen": 1.7102317810058594, + "logits/rejected": 3.0732152462005615, + "logps/chosen": -386.9404296875, + "logps/rejected": -596.4518432617188, + "loss": 0.0662, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.70186185836792, + "rewards/margins": 6.8812575340271, + "rewards/rejected": -4.179396152496338, + "step": 810 + }, + { + "epoch": 0.28, + "learning_rate": 4.64326160815402e-07, + "logits/chosen": 0.8180482983589172, + "logits/rejected": 3.4353580474853516, + "logps/chosen": -461.099609375, + "logps/rejected": -415.77276611328125, + "loss": 0.0646, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 2.202852249145508, + "rewards/margins": 5.79763126373291, + "rewards/rejected": -3.594778060913086, + "step": 820 + }, + { + "epoch": 0.28, + "learning_rate": 4.6998867497168745e-07, + "logits/chosen": 1.0256288051605225, + "logits/rejected": 3.0926876068115234, + "logps/chosen": -298.9940490722656, + "logps/rejected": -572.1815185546875, + "loss": 0.0808, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.5179805755615234, + "rewards/margins": 6.361120700836182, + "rewards/rejected": -3.843140125274658, + "step": 830 + }, + { + "epoch": 0.29, + "learning_rate": 4.756511891279728e-07, + "logits/chosen": 1.130673885345459, + "logits/rejected": 3.3938686847686768, + "logps/chosen": -388.12799072265625, + "logps/rejected": -495.4458923339844, + "loss": 0.0556, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9046478271484375, + "rewards/margins": 7.362362861633301, + "rewards/rejected": -4.457714557647705, + "step": 840 + }, + { + "epoch": 0.29, + "learning_rate": 4.813137032842582e-07, + "logits/chosen": 0.9663546681404114, + "logits/rejected": 2.575662612915039, + "logps/chosen": -392.9145812988281, + "logps/rejected": -729.2246704101562, + "loss": 0.065, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4775688648223877, + "rewards/margins": 6.4355573654174805, + "rewards/rejected": -3.9579882621765137, + "step": 850 + }, + { + "epoch": 0.29, + "learning_rate": 4.869762174405436e-07, + "logits/chosen": 1.4549670219421387, + "logits/rejected": 3.0449929237365723, + "logps/chosen": -364.0623779296875, + "logps/rejected": -563.6417236328125, + "loss": 0.0749, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 2.2536864280700684, + "rewards/margins": 6.307879447937012, + "rewards/rejected": -4.054192543029785, + "step": 860 + }, + { + "epoch": 0.3, + "learning_rate": 4.92638731596829e-07, + "logits/chosen": 1.299466848373413, + "logits/rejected": 2.705226421356201, + "logps/chosen": -460.9673767089844, + "logps/rejected": -606.0197143554688, + "loss": 0.0624, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.1807148456573486, + "rewards/margins": 6.8419928550720215, + "rewards/rejected": -3.6612777709960938, + "step": 870 + }, + { + "epoch": 0.3, + "learning_rate": 4.983012457531144e-07, + "logits/chosen": 0.784410834312439, + "logits/rejected": 3.4160079956054688, + "logps/chosen": -290.4139099121094, + "logps/rejected": -488.72674560546875, + "loss": 0.0613, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.693528890609741, + "rewards/margins": 7.0350165367126465, + "rewards/rejected": -4.341487884521484, + "step": 880 + }, + { + "epoch": 0.3, + "learning_rate": 4.995593604431575e-07, + "logits/chosen": 1.353736162185669, + "logits/rejected": 3.3012149333953857, + "logps/chosen": -353.3387145996094, + "logps/rejected": -380.4452819824219, + "loss": 0.0599, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.2113845348358154, + "rewards/margins": 6.058213710784912, + "rewards/rejected": -3.8468291759490967, + "step": 890 + }, + { + "epoch": 0.31, + "learning_rate": 4.989298753619539e-07, + "logits/chosen": 1.7279170751571655, + "logits/rejected": 3.2604126930236816, + "logps/chosen": -329.1844482421875, + "logps/rejected": -455.5565490722656, + "loss": 0.0706, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.6200737953186035, + "rewards/margins": 7.700595855712891, + "rewards/rejected": -5.080521583557129, + "step": 900 + }, + { + "epoch": 0.31, + "eval_logits/chosen": 0.9102387428283691, + "eval_logits/rejected": 3.162445306777954, + "eval_logps/chosen": -364.9873962402344, + "eval_logps/rejected": -547.8843383789062, + "eval_loss": 0.06624022871255875, + "eval_rewards/accuracies": 0.9722222089767456, + "eval_rewards/chosen": 2.4002487659454346, + "eval_rewards/margins": 6.92565393447876, + "eval_rewards/rejected": -4.525404453277588, + "eval_runtime": 268.0833, + "eval_samples_per_second": 35.437, + "eval_steps_per_second": 1.108, + "step": 900 + }, + { + "epoch": 0.31, + "learning_rate": 4.983003902807503e-07, + "logits/chosen": 1.7389227151870728, + "logits/rejected": 3.2229621410369873, + "logps/chosen": -427.50872802734375, + "logps/rejected": -388.01153564453125, + "loss": 0.0658, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.2974724769592285, + "rewards/margins": 6.737030029296875, + "rewards/rejected": -4.439557075500488, + "step": 910 + }, + { + "epoch": 0.31, + "learning_rate": 4.976709051995467e-07, + "logits/chosen": 1.2352038621902466, + "logits/rejected": 3.0501484870910645, + "logps/chosen": -321.5035705566406, + "logps/rejected": -480.87176513671875, + "loss": 0.0651, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.5568785667419434, + "rewards/margins": 7.235832214355469, + "rewards/rejected": -4.678953170776367, + "step": 920 + }, + { + "epoch": 0.32, + "learning_rate": 4.970414201183432e-07, + "logits/chosen": 0.8484708070755005, + "logits/rejected": 3.094775438308716, + "logps/chosen": -347.93670654296875, + "logps/rejected": -578.5217895507812, + "loss": 0.0647, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.865225076675415, + "rewards/margins": 7.946264743804932, + "rewards/rejected": -5.081039905548096, + "step": 930 + }, + { + "epoch": 0.32, + "learning_rate": 4.964119350371396e-07, + "logits/chosen": 1.2264028787612915, + "logits/rejected": 2.9254095554351807, + "logps/chosen": -314.49755859375, + "logps/rejected": -542.8404541015625, + "loss": 0.053, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.582854747772217, + "rewards/margins": 7.226229667663574, + "rewards/rejected": -4.643374443054199, + "step": 940 + }, + { + "epoch": 0.32, + "learning_rate": 4.95782449955936e-07, + "logits/chosen": 1.222130537033081, + "logits/rejected": 3.148733139038086, + "logps/chosen": -374.54144287109375, + "logps/rejected": -497.5044860839844, + "loss": 0.0576, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.738006353378296, + "rewards/margins": 7.113809108734131, + "rewards/rejected": -4.375802993774414, + "step": 950 + }, + { + "epoch": 0.33, + "learning_rate": 4.951529648747325e-07, + "logits/chosen": 1.75167715549469, + "logits/rejected": 3.0500741004943848, + "logps/chosen": -428.451416015625, + "logps/rejected": -546.074462890625, + "loss": 0.0441, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.750443935394287, + "rewards/margins": 7.7640814781188965, + "rewards/rejected": -5.013637065887451, + "step": 960 + }, + { + "epoch": 0.33, + "learning_rate": 4.945234797935289e-07, + "logits/chosen": 0.9173381924629211, + "logits/rejected": 2.5977022647857666, + "logps/chosen": -297.50433349609375, + "logps/rejected": -687.2318115234375, + "loss": 0.0657, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.5338656902313232, + "rewards/margins": 7.702880859375, + "rewards/rejected": -5.1690144538879395, + "step": 970 + }, + { + "epoch": 0.33, + "learning_rate": 4.938939947123252e-07, + "logits/chosen": 0.8320339918136597, + "logits/rejected": 2.6507489681243896, + "logps/chosen": -423.51678466796875, + "logps/rejected": -640.7545776367188, + "loss": 0.087, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 2.4527504444122314, + "rewards/margins": 7.721698760986328, + "rewards/rejected": -5.268949031829834, + "step": 980 + }, + { + "epoch": 0.34, + "learning_rate": 4.932645096311217e-07, + "logits/chosen": 1.5152180194854736, + "logits/rejected": 3.082857608795166, + "logps/chosen": -295.34088134765625, + "logps/rejected": -608.37353515625, + "loss": 0.0672, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.7915849685668945, + "rewards/margins": 7.854758262634277, + "rewards/rejected": -5.063173770904541, + "step": 990 + }, + { + "epoch": 0.34, + "learning_rate": 4.926350245499181e-07, + "logits/chosen": 1.2758054733276367, + "logits/rejected": 3.3109130859375, + "logps/chosen": -388.2832946777344, + "logps/rejected": -477.9232482910156, + "loss": 0.0711, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.6032989025115967, + "rewards/margins": 7.5214715003967285, + "rewards/rejected": -4.918172359466553, + "step": 1000 + }, + { + "epoch": 0.34, + "eval_logits/chosen": 0.9093015789985657, + "eval_logits/rejected": 3.146707534790039, + "eval_logps/chosen": -364.759765625, + "eval_logps/rejected": -551.8095703125, + "eval_loss": 0.05768350511789322, + "eval_rewards/accuracies": 0.9764309525489807, + "eval_rewards/chosen": 2.4230129718780518, + "eval_rewards/margins": 7.340935230255127, + "eval_rewards/rejected": -4.917922496795654, + "eval_runtime": 267.7884, + "eval_samples_per_second": 35.476, + "eval_steps_per_second": 1.109, + "step": 1000 + }, + { + "epoch": 0.34, + "learning_rate": 4.920055394687146e-07, + "logits/chosen": 1.5080076456069946, + "logits/rejected": 3.27386474609375, + "logps/chosen": -398.11077880859375, + "logps/rejected": -458.70404052734375, + "loss": 0.0625, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3705155849456787, + "rewards/margins": 6.589612007141113, + "rewards/rejected": -4.2190961837768555, + "step": 1010 + }, + { + "epoch": 0.35, + "learning_rate": 4.91376054387511e-07, + "logits/chosen": 0.7365673780441284, + "logits/rejected": 3.152676820755005, + "logps/chosen": -289.18304443359375, + "logps/rejected": -490.31829833984375, + "loss": 0.0608, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.4276957511901855, + "rewards/margins": 6.978020668029785, + "rewards/rejected": -4.550324440002441, + "step": 1020 + }, + { + "epoch": 0.35, + "learning_rate": 4.907465693063074e-07, + "logits/chosen": 1.0980937480926514, + "logits/rejected": 3.2199409008026123, + "logps/chosen": -302.3829040527344, + "logps/rejected": -502.25421142578125, + "loss": 0.0566, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.611393451690674, + "rewards/margins": 7.998917579650879, + "rewards/rejected": -5.387524604797363, + "step": 1030 + }, + { + "epoch": 0.35, + "learning_rate": 4.901170842251039e-07, + "logits/chosen": 1.529950737953186, + "logits/rejected": 2.4496376514434814, + "logps/chosen": -472.7461853027344, + "logps/rejected": -645.9083251953125, + "loss": 0.0607, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.9878193140029907, + "rewards/margins": 6.836523532867432, + "rewards/rejected": -4.848703861236572, + "step": 1040 + }, + { + "epoch": 0.36, + "learning_rate": 4.894875991439003e-07, + "logits/chosen": 1.2354804277420044, + "logits/rejected": 3.264209032058716, + "logps/chosen": -457.37066650390625, + "logps/rejected": -482.914306640625, + "loss": 0.06, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.3142616748809814, + "rewards/margins": 7.839533805847168, + "rewards/rejected": -5.525272369384766, + "step": 1050 + }, + { + "epoch": 0.36, + "learning_rate": 4.888581140626966e-07, + "logits/chosen": 0.9020367860794067, + "logits/rejected": 2.951887607574463, + "logps/chosen": -372.59735107421875, + "logps/rejected": -528.4854125976562, + "loss": 0.0754, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.2299747467041016, + "rewards/margins": 7.801189422607422, + "rewards/rejected": -5.57121467590332, + "step": 1060 + }, + { + "epoch": 0.36, + "learning_rate": 4.882286289814931e-07, + "logits/chosen": 1.0016891956329346, + "logits/rejected": 3.1277599334716797, + "logps/chosen": -420.5859375, + "logps/rejected": -437.7632751464844, + "loss": 0.0458, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.437075138092041, + "rewards/margins": 7.129094123840332, + "rewards/rejected": -4.692019462585449, + "step": 1070 + }, + { + "epoch": 0.37, + "learning_rate": 4.875991439002896e-07, + "logits/chosen": 1.0937416553497314, + "logits/rejected": 3.4849390983581543, + "logps/chosen": -369.060791015625, + "logps/rejected": -446.8038635253906, + "loss": 0.0509, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 2.3342339992523193, + "rewards/margins": 7.783272743225098, + "rewards/rejected": -5.449038505554199, + "step": 1080 + }, + { + "epoch": 0.37, + "learning_rate": 4.869696588190859e-07, + "logits/chosen": 2.1527936458587646, + "logits/rejected": 3.5624840259552, + "logps/chosen": -362.0032653808594, + "logps/rejected": -347.994140625, + "loss": 0.0518, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3183753490448, + "rewards/margins": 7.678530693054199, + "rewards/rejected": -5.360154628753662, + "step": 1090 + }, + { + "epoch": 0.37, + "learning_rate": 4.863401737378824e-07, + "logits/chosen": 1.1598182916641235, + "logits/rejected": 3.118835926055908, + "logps/chosen": -367.0357666015625, + "logps/rejected": -561.3696899414062, + "loss": 0.0623, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.534534215927124, + "rewards/margins": 8.346112251281738, + "rewards/rejected": -5.811577796936035, + "step": 1100 + }, + { + "epoch": 0.37, + "eval_logits/chosen": 0.901133120059967, + "eval_logits/rejected": 3.118581533432007, + "eval_logps/chosen": -364.1503601074219, + "eval_logps/rejected": -556.2498779296875, + "eval_loss": 0.05724797025322914, + "eval_rewards/accuracies": 0.9772727489471436, + "eval_rewards/chosen": 2.4839539527893066, + "eval_rewards/margins": 7.845913410186768, + "eval_rewards/rejected": -5.361959934234619, + "eval_runtime": 268.446, + "eval_samples_per_second": 35.389, + "eval_steps_per_second": 1.106, + "step": 1100 + }, + { + "epoch": 0.38, + "learning_rate": 4.857106886566788e-07, + "logits/chosen": 0.25722193717956543, + "logits/rejected": 2.8666605949401855, + "logps/chosen": -269.8269958496094, + "logps/rejected": -616.5667114257812, + "loss": 0.0638, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.2851195335388184, + "rewards/margins": 8.181058883666992, + "rewards/rejected": -5.895938873291016, + "step": 1110 + }, + { + "epoch": 0.38, + "learning_rate": 4.850812035754753e-07, + "logits/chosen": 1.0288280248641968, + "logits/rejected": 3.402256727218628, + "logps/chosen": -324.79913330078125, + "logps/rejected": -548.8817138671875, + "loss": 0.0428, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.5030677318573, + "rewards/margins": 7.847231864929199, + "rewards/rejected": -5.344164848327637, + "step": 1120 + }, + { + "epoch": 0.38, + "learning_rate": 4.844517184942716e-07, + "logits/chosen": 1.3817869424819946, + "logits/rejected": 3.4411487579345703, + "logps/chosen": -389.9842529296875, + "logps/rejected": -385.0381774902344, + "loss": 0.0403, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.078273057937622, + "rewards/margins": 7.2798590660095215, + "rewards/rejected": -5.201586723327637, + "step": 1130 + }, + { + "epoch": 0.39, + "learning_rate": 4.838222334130681e-07, + "logits/chosen": 1.1128469705581665, + "logits/rejected": 3.301898241043091, + "logps/chosen": -366.19134521484375, + "logps/rejected": -476.60205078125, + "loss": 0.051, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.424424648284912, + "rewards/margins": 8.37041187286377, + "rewards/rejected": -5.945986270904541, + "step": 1140 + }, + { + "epoch": 0.39, + "learning_rate": 4.831927483318645e-07, + "logits/chosen": 1.1458241939544678, + "logits/rejected": 3.3127903938293457, + "logps/chosen": -397.9763488769531, + "logps/rejected": -356.6136169433594, + "loss": 0.0399, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.525275230407715, + "rewards/margins": 7.989278316497803, + "rewards/rejected": -5.464003562927246, + "step": 1150 + }, + { + "epoch": 0.39, + "learning_rate": 4.82563263250661e-07, + "logits/chosen": 1.3799407482147217, + "logits/rejected": 3.288724422454834, + "logps/chosen": -439.81427001953125, + "logps/rejected": -482.54461669921875, + "loss": 0.0472, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.0920233726501465, + "rewards/margins": 7.863224983215332, + "rewards/rejected": -5.771200656890869, + "step": 1160 + }, + { + "epoch": 0.4, + "learning_rate": 4.819337781694573e-07, + "logits/chosen": 1.8935362100601196, + "logits/rejected": 3.2015902996063232, + "logps/chosen": -301.77191162109375, + "logps/rejected": -543.3331298828125, + "loss": 0.0508, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.1477038860321045, + "rewards/margins": 7.75766658782959, + "rewards/rejected": -5.609963893890381, + "step": 1170 + }, + { + "epoch": 0.4, + "learning_rate": 4.813042930882538e-07, + "logits/chosen": 1.364073634147644, + "logits/rejected": 2.3843796253204346, + "logps/chosen": -350.85760498046875, + "logps/rejected": -687.0422973632812, + "loss": 0.0436, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.2611756324768066, + "rewards/margins": 7.602535247802734, + "rewards/rejected": -5.341360092163086, + "step": 1180 + }, + { + "epoch": 0.4, + "learning_rate": 4.806748080070503e-07, + "logits/chosen": 1.4911861419677734, + "logits/rejected": 3.1939921379089355, + "logps/chosen": -389.1288146972656, + "logps/rejected": -405.0220947265625, + "loss": 0.0607, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.1533682346343994, + "rewards/margins": 7.3057379722595215, + "rewards/rejected": -5.152369022369385, + "step": 1190 + }, + { + "epoch": 0.41, + "learning_rate": 4.800453229258466e-07, + "logits/chosen": 1.3050758838653564, + "logits/rejected": 2.954599380493164, + "logps/chosen": -454.8966369628906, + "logps/rejected": -481.5711975097656, + "loss": 0.0443, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.1215577125549316, + "rewards/margins": 7.0846848487854, + "rewards/rejected": -4.963128089904785, + "step": 1200 + }, + { + "epoch": 0.41, + "eval_logits/chosen": 0.8960862159729004, + "eval_logits/rejected": 3.119628667831421, + "eval_logps/chosen": -364.7530212402344, + "eval_logps/rejected": -557.41455078125, + "eval_loss": 0.052623916417360306, + "eval_rewards/accuracies": 0.9797979593276978, + "eval_rewards/chosen": 2.4236865043640137, + "eval_rewards/margins": 7.902113437652588, + "eval_rewards/rejected": -5.478426933288574, + "eval_runtime": 267.6269, + "eval_samples_per_second": 35.497, + "eval_steps_per_second": 1.11, + "step": 1200 + }, + { + "epoch": 0.41, + "learning_rate": 4.79415837844643e-07, + "logits/chosen": 1.5837571620941162, + "logits/rejected": 3.5294737815856934, + "logps/chosen": -378.23101806640625, + "logps/rejected": -340.2025451660156, + "loss": 0.0529, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2081446647644043, + "rewards/margins": 8.476436614990234, + "rewards/rejected": -6.268291473388672, + "step": 1210 + }, + { + "epoch": 0.41, + "learning_rate": 4.787863527634395e-07, + "logits/chosen": 1.1652976274490356, + "logits/rejected": 2.906097173690796, + "logps/chosen": -320.04486083984375, + "logps/rejected": -624.504150390625, + "loss": 0.0438, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.833052396774292, + "rewards/margins": 7.845742225646973, + "rewards/rejected": -6.012689113616943, + "step": 1220 + }, + { + "epoch": 0.42, + "learning_rate": 4.781568676822359e-07, + "logits/chosen": 0.824341893196106, + "logits/rejected": 2.7557225227355957, + "logps/chosen": -301.5578308105469, + "logps/rejected": -791.7188720703125, + "loss": 0.0324, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.675546884536743, + "rewards/margins": 8.524243354797363, + "rewards/rejected": -5.848695755004883, + "step": 1230 + }, + { + "epoch": 0.42, + "learning_rate": 4.775273826010323e-07, + "logits/chosen": 1.4236054420471191, + "logits/rejected": 3.063476085662842, + "logps/chosen": -381.88348388671875, + "logps/rejected": -580.1704711914062, + "loss": 0.0398, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.311436176300049, + "rewards/margins": 8.216426849365234, + "rewards/rejected": -5.9049906730651855, + "step": 1240 + }, + { + "epoch": 0.42, + "learning_rate": 4.768978975198288e-07, + "logits/chosen": 1.1934541463851929, + "logits/rejected": 2.6553595066070557, + "logps/chosen": -375.6439208984375, + "logps/rejected": -746.5518798828125, + "loss": 0.0377, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.372680187225342, + "rewards/margins": 7.412602424621582, + "rewards/rejected": -5.03992223739624, + "step": 1250 + }, + { + "epoch": 0.43, + "learning_rate": 4.762684124386252e-07, + "logits/chosen": 1.407812237739563, + "logits/rejected": 2.5921523571014404, + "logps/chosen": -382.3465881347656, + "logps/rejected": -705.896484375, + "loss": 0.0441, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.423621654510498, + "rewards/margins": 8.631671905517578, + "rewards/rejected": -6.208049297332764, + "step": 1260 + }, + { + "epoch": 0.43, + "learning_rate": 4.756389273574216e-07, + "logits/chosen": 1.3090200424194336, + "logits/rejected": 3.249882459640503, + "logps/chosen": -420.4686584472656, + "logps/rejected": -606.3863525390625, + "loss": 0.0381, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.314528465270996, + "rewards/margins": 8.804101943969727, + "rewards/rejected": -6.489573001861572, + "step": 1270 + }, + { + "epoch": 0.44, + "learning_rate": 4.7500944227621803e-07, + "logits/chosen": 1.533039927482605, + "logits/rejected": 3.1679673194885254, + "logps/chosen": -336.41033935546875, + "logps/rejected": -435.3956604003906, + "loss": 0.0326, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.788140058517456, + "rewards/margins": 9.346943855285645, + "rewards/rejected": -6.558804512023926, + "step": 1280 + }, + { + "epoch": 0.44, + "learning_rate": 4.7437995719501445e-07, + "logits/chosen": 0.8365401029586792, + "logits/rejected": 3.102825164794922, + "logps/chosen": -311.44677734375, + "logps/rejected": -572.3267211914062, + "loss": 0.0367, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1671829223632812, + "rewards/margins": 8.971689224243164, + "rewards/rejected": -6.804505825042725, + "step": 1290 + }, + { + "epoch": 0.44, + "learning_rate": 4.737504721138109e-07, + "logits/chosen": 1.3206243515014648, + "logits/rejected": 2.648153781890869, + "logps/chosen": -361.6004333496094, + "logps/rejected": -589.2341918945312, + "loss": 0.0416, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.8718124628067017, + "rewards/margins": 7.338369846343994, + "rewards/rejected": -5.466557502746582, + "step": 1300 + }, + { + "epoch": 0.44, + "eval_logits/chosen": 0.872046709060669, + "eval_logits/rejected": 3.0682923793792725, + "eval_logps/chosen": -365.11627197265625, + "eval_logps/rejected": -564.8768310546875, + "eval_loss": 0.04774455353617668, + "eval_rewards/accuracies": 0.9823232293128967, + "eval_rewards/chosen": 2.3873627185821533, + "eval_rewards/margins": 8.612015724182129, + "eval_rewards/rejected": -6.2246527671813965, + "eval_runtime": 267.762, + "eval_samples_per_second": 35.479, + "eval_steps_per_second": 1.109, + "step": 1300 + }, + { + "epoch": 0.45, + "learning_rate": 4.7312098703260735e-07, + "logits/chosen": 1.2614483833312988, + "logits/rejected": 3.360116958618164, + "logps/chosen": -307.0997619628906, + "logps/rejected": -477.2865295410156, + "loss": 0.0437, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.121788263320923, + "rewards/margins": 8.030182838439941, + "rewards/rejected": -5.908394813537598, + "step": 1310 + }, + { + "epoch": 0.45, + "learning_rate": 4.724915019514038e-07, + "logits/chosen": 1.0011619329452515, + "logits/rejected": 2.776240348815918, + "logps/chosen": -362.7132568359375, + "logps/rejected": -661.7002563476562, + "loss": 0.0371, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4083709716796875, + "rewards/margins": 8.502340316772461, + "rewards/rejected": -6.093968391418457, + "step": 1320 + }, + { + "epoch": 0.45, + "learning_rate": 4.7186201687020014e-07, + "logits/chosen": 1.6232942342758179, + "logits/rejected": 3.438117504119873, + "logps/chosen": -356.1873474121094, + "logps/rejected": -431.338134765625, + "loss": 0.0357, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2725751399993896, + "rewards/margins": 8.91747760772705, + "rewards/rejected": -6.64490270614624, + "step": 1330 + }, + { + "epoch": 0.46, + "learning_rate": 4.7123253178899657e-07, + "logits/chosen": 0.9556019902229309, + "logits/rejected": 3.3059210777282715, + "logps/chosen": -296.5425109863281, + "logps/rejected": -507.9095764160156, + "loss": 0.0306, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0043280124664307, + "rewards/margins": 9.976794242858887, + "rewards/rejected": -6.972466945648193, + "step": 1340 + }, + { + "epoch": 0.46, + "learning_rate": 4.70603046707793e-07, + "logits/chosen": 1.2038887739181519, + "logits/rejected": 2.7011523246765137, + "logps/chosen": -311.3517150878906, + "logps/rejected": -632.9171752929688, + "loss": 0.0267, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.609565258026123, + "rewards/margins": 9.721056938171387, + "rewards/rejected": -7.111492156982422, + "step": 1350 + }, + { + "epoch": 0.46, + "learning_rate": 4.699735616265894e-07, + "logits/chosen": 0.7160284519195557, + "logits/rejected": 3.156071186065674, + "logps/chosen": -371.474365234375, + "logps/rejected": -511.2286682128906, + "loss": 0.1187, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.7736752033233643, + "rewards/margins": 8.387275695800781, + "rewards/rejected": -5.613600730895996, + "step": 1360 + }, + { + "epoch": 0.47, + "learning_rate": 4.693440765453859e-07, + "logits/chosen": 1.0091116428375244, + "logits/rejected": 2.463162660598755, + "logps/chosen": -320.53118896484375, + "logps/rejected": -748.8678588867188, + "loss": 0.1145, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.6646820306777954, + "rewards/margins": 8.154109954833984, + "rewards/rejected": -6.4894280433654785, + "step": 1370 + }, + { + "epoch": 0.47, + "learning_rate": 4.687145914641823e-07, + "logits/chosen": 2.027181625366211, + "logits/rejected": 3.0980985164642334, + "logps/chosen": -349.1291198730469, + "logps/rejected": -616.3356323242188, + "loss": 0.0332, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5397064685821533, + "rewards/margins": 9.038924217224121, + "rewards/rejected": -6.499217987060547, + "step": 1380 + }, + { + "epoch": 0.47, + "learning_rate": 4.6808510638297873e-07, + "logits/chosen": 1.762028694152832, + "logits/rejected": 2.9455161094665527, + "logps/chosen": -345.4776306152344, + "logps/rejected": -617.8087768554688, + "loss": 0.0255, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7131543159484863, + "rewards/margins": 8.694609642028809, + "rewards/rejected": -5.9814558029174805, + "step": 1390 + }, + { + "epoch": 0.48, + "learning_rate": 4.674556213017751e-07, + "logits/chosen": 1.4126743078231812, + "logits/rejected": 3.22453236579895, + "logps/chosen": -295.3870849609375, + "logps/rejected": -528.4091186523438, + "loss": 0.0365, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.1942496299743652, + "rewards/margins": 10.106409072875977, + "rewards/rejected": -7.912158966064453, + "step": 1400 + }, + { + "epoch": 0.48, + "eval_logits/chosen": 0.8667415380477905, + "eval_logits/rejected": 3.049123525619507, + "eval_logps/chosen": -366.1031494140625, + "eval_logps/rejected": -570.9899291992188, + "eval_loss": 0.04477392137050629, + "eval_rewards/accuracies": 0.9806397557258606, + "eval_rewards/chosen": 2.2886757850646973, + "eval_rewards/margins": 9.124639511108398, + "eval_rewards/rejected": -6.835964202880859, + "eval_runtime": 267.6793, + "eval_samples_per_second": 35.49, + "eval_steps_per_second": 1.11, + "step": 1400 + }, + { + "epoch": 0.48, + "learning_rate": 4.668261362205715e-07, + "logits/chosen": 1.426150918006897, + "logits/rejected": 3.05987811088562, + "logps/chosen": -397.50238037109375, + "logps/rejected": -614.9202270507812, + "loss": 0.0244, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.230624198913574, + "rewards/margins": 10.895345687866211, + "rewards/rejected": -8.664721488952637, + "step": 1410 + }, + { + "epoch": 0.48, + "learning_rate": 4.6619665113936795e-07, + "logits/chosen": 1.4373013973236084, + "logits/rejected": 2.838214159011841, + "logps/chosen": -378.0550231933594, + "logps/rejected": -618.8988037109375, + "loss": 0.0456, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.2754616737365723, + "rewards/margins": 8.592214584350586, + "rewards/rejected": -6.31675386428833, + "step": 1420 + }, + { + "epoch": 0.49, + "learning_rate": 4.6556716605816437e-07, + "logits/chosen": 1.1580406427383423, + "logits/rejected": 3.0366053581237793, + "logps/chosen": -366.52362060546875, + "logps/rejected": -564.1707153320312, + "loss": 0.0379, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.398660182952881, + "rewards/margins": 8.526018142700195, + "rewards/rejected": -6.127357006072998, + "step": 1430 + }, + { + "epoch": 0.49, + "learning_rate": 4.6493768097696085e-07, + "logits/chosen": 1.256246566772461, + "logits/rejected": 3.201850175857544, + "logps/chosen": -451.3427734375, + "logps/rejected": -479.74505615234375, + "loss": 0.0505, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0574326515197754, + "rewards/margins": 8.310302734375, + "rewards/rejected": -6.252870082855225, + "step": 1440 + }, + { + "epoch": 0.49, + "learning_rate": 4.6430819589575727e-07, + "logits/chosen": 1.2370970249176025, + "logits/rejected": 3.4503719806671143, + "logps/chosen": -319.02691650390625, + "logps/rejected": -411.275146484375, + "loss": 0.036, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.039693832397461, + "rewards/margins": 9.272503852844238, + "rewards/rejected": -7.232810020446777, + "step": 1450 + }, + { + "epoch": 0.5, + "learning_rate": 4.636787108145537e-07, + "logits/chosen": 1.3211932182312012, + "logits/rejected": 2.6074912548065186, + "logps/chosen": -429.85015869140625, + "logps/rejected": -610.8492431640625, + "loss": 0.0301, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.3383851051330566, + "rewards/margins": 8.625310897827148, + "rewards/rejected": -6.286925792694092, + "step": 1460 + }, + { + "epoch": 0.5, + "learning_rate": 4.630492257333501e-07, + "logits/chosen": 1.121718168258667, + "logits/rejected": 3.2523887157440186, + "logps/chosen": -302.6455078125, + "logps/rejected": -466.39752197265625, + "loss": 0.0288, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9783689975738525, + "rewards/margins": 8.877293586730957, + "rewards/rejected": -6.898924827575684, + "step": 1470 + }, + { + "epoch": 0.5, + "learning_rate": 4.624197406521465e-07, + "logits/chosen": 1.5639989376068115, + "logits/rejected": 3.146419048309326, + "logps/chosen": -286.6660461425781, + "logps/rejected": -513.6187744140625, + "loss": 0.0427, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.8529775142669678, + "rewards/margins": 9.158210754394531, + "rewards/rejected": -7.305233955383301, + "step": 1480 + }, + { + "epoch": 0.51, + "learning_rate": 4.617902555709429e-07, + "logits/chosen": 0.8676093816757202, + "logits/rejected": 3.324131488800049, + "logps/chosen": -344.65277099609375, + "logps/rejected": -363.31390380859375, + "loss": 0.0459, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.1465797424316406, + "rewards/margins": 8.678162574768066, + "rewards/rejected": -6.531582832336426, + "step": 1490 + }, + { + "epoch": 0.51, + "learning_rate": 4.611607704897394e-07, + "logits/chosen": 1.0654547214508057, + "logits/rejected": 3.0400424003601074, + "logps/chosen": -427.28790283203125, + "logps/rejected": -542.1488647460938, + "loss": 0.0341, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3886258602142334, + "rewards/margins": 8.62549877166748, + "rewards/rejected": -6.236873626708984, + "step": 1500 + }, + { + "epoch": 0.51, + "eval_logits/chosen": 0.8500487208366394, + "eval_logits/rejected": 3.0298707485198975, + "eval_logps/chosen": -366.1944580078125, + "eval_logps/rejected": -572.177734375, + "eval_loss": 0.04415823146700859, + "eval_rewards/accuracies": 0.9848484992980957, + "eval_rewards/chosen": 2.2795426845550537, + "eval_rewards/margins": 9.234275817871094, + "eval_rewards/rejected": -6.954732894897461, + "eval_runtime": 267.8215, + "eval_samples_per_second": 35.471, + "eval_steps_per_second": 1.109, + "step": 1500 + }, + { + "epoch": 0.51, + "learning_rate": 4.605312854085358e-07, + "logits/chosen": 1.42298424243927, + "logits/rejected": 2.857264518737793, + "logps/chosen": -413.36065673828125, + "logps/rejected": -489.81353759765625, + "loss": 0.0406, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 2.0433743000030518, + "rewards/margins": 8.779324531555176, + "rewards/rejected": -6.7359514236450195, + "step": 1510 + }, + { + "epoch": 0.52, + "learning_rate": 4.5990180032733223e-07, + "logits/chosen": 1.4577559232711792, + "logits/rejected": 2.931697368621826, + "logps/chosen": -374.41046142578125, + "logps/rejected": -628.4669799804688, + "loss": 0.0432, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.2504830360412598, + "rewards/margins": 9.084136009216309, + "rewards/rejected": -6.833653450012207, + "step": 1520 + }, + { + "epoch": 0.52, + "learning_rate": 4.5927231524612865e-07, + "logits/chosen": 1.394997000694275, + "logits/rejected": 3.512721538543701, + "logps/chosen": -348.52667236328125, + "logps/rejected": -423.2193298339844, + "loss": 0.0343, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.209585666656494, + "rewards/margins": 9.598394393920898, + "rewards/rejected": -7.388807773590088, + "step": 1530 + }, + { + "epoch": 0.52, + "learning_rate": 4.586428301649251e-07, + "logits/chosen": 0.8830512762069702, + "logits/rejected": 2.386176347732544, + "logps/chosen": -316.9195861816406, + "logps/rejected": -686.3156127929688, + "loss": 0.0385, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.390429735183716, + "rewards/margins": 7.798093318939209, + "rewards/rejected": -5.407662391662598, + "step": 1540 + }, + { + "epoch": 0.53, + "learning_rate": 4.5801334508372145e-07, + "logits/chosen": 1.4992390871047974, + "logits/rejected": 3.2531185150146484, + "logps/chosen": -381.68865966796875, + "logps/rejected": -466.6773376464844, + "loss": 0.0408, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.968583345413208, + "rewards/margins": 8.427129745483398, + "rewards/rejected": -6.4585466384887695, + "step": 1550 + }, + { + "epoch": 0.53, + "learning_rate": 4.573838600025179e-07, + "logits/chosen": 1.1836860179901123, + "logits/rejected": 2.7051196098327637, + "logps/chosen": -460.22515869140625, + "logps/rejected": -650.5518798828125, + "loss": 0.0425, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.747108817100525, + "rewards/margins": 8.638761520385742, + "rewards/rejected": -6.891653537750244, + "step": 1560 + }, + { + "epoch": 0.53, + "learning_rate": 4.5675437492131434e-07, + "logits/chosen": 1.3394988775253296, + "logits/rejected": 2.7925000190734863, + "logps/chosen": -351.8819274902344, + "logps/rejected": -588.3709106445312, + "loss": 0.0294, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.231447219848633, + "rewards/margins": 8.942026138305664, + "rewards/rejected": -6.710578918457031, + "step": 1570 + }, + { + "epoch": 0.54, + "learning_rate": 4.5612488984011077e-07, + "logits/chosen": 1.5489451885223389, + "logits/rejected": 3.2238707542419434, + "logps/chosen": -369.0827331542969, + "logps/rejected": -516.4092407226562, + "loss": 0.0426, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.270219326019287, + "rewards/margins": 8.84095287322998, + "rewards/rejected": -6.570733070373535, + "step": 1580 + }, + { + "epoch": 0.54, + "learning_rate": 4.554954047589072e-07, + "logits/chosen": 1.0998046398162842, + "logits/rejected": 3.2007484436035156, + "logps/chosen": -366.46356201171875, + "logps/rejected": -464.4613342285156, + "loss": 0.0357, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.3737833499908447, + "rewards/margins": 10.289929389953613, + "rewards/rejected": -7.916145324707031, + "step": 1590 + }, + { + "epoch": 0.54, + "learning_rate": 4.548659196777036e-07, + "logits/chosen": 1.2878637313842773, + "logits/rejected": 3.2345385551452637, + "logps/chosen": -499.41259765625, + "logps/rejected": -477.4854431152344, + "loss": 0.0406, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7783913612365723, + "rewards/margins": 8.754561424255371, + "rewards/rejected": -5.976171016693115, + "step": 1600 + }, + { + "epoch": 0.54, + "eval_logits/chosen": 0.844233512878418, + "eval_logits/rejected": 3.043703317642212, + "eval_logps/chosen": -368.0941162109375, + "eval_logps/rejected": -572.6334228515625, + "eval_loss": 0.04141601547598839, + "eval_rewards/accuracies": 0.9848484992980957, + "eval_rewards/chosen": 2.0895750522613525, + "eval_rewards/margins": 9.089882850646973, + "eval_rewards/rejected": -7.000306606292725, + "eval_runtime": 268.1349, + "eval_samples_per_second": 35.43, + "eval_steps_per_second": 1.108, + "step": 1600 + }, + { + "epoch": 0.55, + "learning_rate": 4.5423643459650003e-07, + "logits/chosen": 0.930519700050354, + "logits/rejected": 2.513514518737793, + "logps/chosen": -379.267333984375, + "logps/rejected": -638.7659912109375, + "loss": 0.0365, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.8533432483673096, + "rewards/margins": 8.584343910217285, + "rewards/rejected": -6.731001377105713, + "step": 1610 + }, + { + "epoch": 0.55, + "learning_rate": 4.536069495152965e-07, + "logits/chosen": 0.8711696863174438, + "logits/rejected": 2.6304049491882324, + "logps/chosen": -361.69207763671875, + "logps/rejected": -652.2589721679688, + "loss": 0.0412, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.895737648010254, + "rewards/margins": 9.80798625946045, + "rewards/rejected": -7.912248134613037, + "step": 1620 + }, + { + "epoch": 0.55, + "learning_rate": 4.529774644340929e-07, + "logits/chosen": 1.4021892547607422, + "logits/rejected": 3.2112183570861816, + "logps/chosen": -386.8944091796875, + "logps/rejected": -473.6744079589844, + "loss": 0.0281, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.199531078338623, + "rewards/margins": 8.601357460021973, + "rewards/rejected": -6.401825904846191, + "step": 1630 + }, + { + "epoch": 0.56, + "learning_rate": 4.523479793528893e-07, + "logits/chosen": 1.5126926898956299, + "logits/rejected": 2.907017946243286, + "logps/chosen": -352.89801025390625, + "logps/rejected": -503.09149169921875, + "loss": 0.0419, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.7960695028305054, + "rewards/margins": 9.080196380615234, + "rewards/rejected": -7.284126281738281, + "step": 1640 + }, + { + "epoch": 0.56, + "learning_rate": 4.517184942716857e-07, + "logits/chosen": 0.9331147074699402, + "logits/rejected": 2.9222187995910645, + "logps/chosen": -430.34521484375, + "logps/rejected": -444.293212890625, + "loss": 0.0345, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.6567729711532593, + "rewards/margins": 8.3241605758667, + "rewards/rejected": -6.667386531829834, + "step": 1650 + }, + { + "epoch": 0.56, + "learning_rate": 4.5108900919048215e-07, + "logits/chosen": 1.6180213689804077, + "logits/rejected": 2.923030376434326, + "logps/chosen": -390.9251403808594, + "logps/rejected": -516.4937744140625, + "loss": 0.0343, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.9237686395645142, + "rewards/margins": 9.0204496383667, + "rewards/rejected": -7.096681118011475, + "step": 1660 + }, + { + "epoch": 0.57, + "learning_rate": 4.5045952410927857e-07, + "logits/chosen": 1.0812413692474365, + "logits/rejected": 2.9793589115142822, + "logps/chosen": -359.8444519042969, + "logps/rejected": -535.794677734375, + "loss": 0.0289, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.535153865814209, + "rewards/margins": 9.450161933898926, + "rewards/rejected": -6.915008544921875, + "step": 1670 + }, + { + "epoch": 0.57, + "learning_rate": 4.4983003902807505e-07, + "logits/chosen": 1.24253249168396, + "logits/rejected": 2.6760640144348145, + "logps/chosen": -463.6806640625, + "logps/rejected": -732.2550048828125, + "loss": 0.023, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.381948947906494, + "rewards/margins": 9.9815034866333, + "rewards/rejected": -7.59955358505249, + "step": 1680 + }, + { + "epoch": 0.57, + "learning_rate": 4.4920055394687147e-07, + "logits/chosen": 1.4033453464508057, + "logits/rejected": 3.3531792163848877, + "logps/chosen": -316.72991943359375, + "logps/rejected": -479.224853515625, + "loss": 0.0209, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.9447095394134521, + "rewards/margins": 8.633768081665039, + "rewards/rejected": -6.689058780670166, + "step": 1690 + }, + { + "epoch": 0.58, + "learning_rate": 4.485710688656679e-07, + "logits/chosen": 1.282178282737732, + "logits/rejected": 2.8951597213745117, + "logps/chosen": -346.11138916015625, + "logps/rejected": -551.4237060546875, + "loss": 0.0427, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.1353259086608887, + "rewards/margins": 10.34667682647705, + "rewards/rejected": -8.21135139465332, + "step": 1700 + }, + { + "epoch": 0.58, + "eval_logits/chosen": 0.8383491039276123, + "eval_logits/rejected": 3.045766830444336, + "eval_logps/chosen": -368.6102294921875, + "eval_logps/rejected": -573.771240234375, + "eval_loss": 0.03865913301706314, + "eval_rewards/accuracies": 0.9856902360916138, + "eval_rewards/chosen": 2.037966728210449, + "eval_rewards/margins": 9.152060508728027, + "eval_rewards/rejected": -7.114094257354736, + "eval_runtime": 267.881, + "eval_samples_per_second": 35.464, + "eval_steps_per_second": 1.109, + "step": 1700 + }, + { + "epoch": 0.58, + "learning_rate": 4.4794158378446426e-07, + "logits/chosen": 0.8792635798454285, + "logits/rejected": 2.5632190704345703, + "logps/chosen": -436.8245544433594, + "logps/rejected": -714.1575927734375, + "loss": 0.0476, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.0016674995422363, + "rewards/margins": 8.103446960449219, + "rewards/rejected": -6.101779460906982, + "step": 1710 + }, + { + "epoch": 0.58, + "learning_rate": 4.473120987032607e-07, + "logits/chosen": 1.1539610624313354, + "logits/rejected": 2.0238335132598877, + "logps/chosen": -366.057861328125, + "logps/rejected": -899.9832763671875, + "loss": 0.0234, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1319379806518555, + "rewards/margins": 9.034749984741211, + "rewards/rejected": -6.902812957763672, + "step": 1720 + }, + { + "epoch": 0.59, + "learning_rate": 4.466826136220571e-07, + "logits/chosen": 1.4467999935150146, + "logits/rejected": 3.262604236602783, + "logps/chosen": -328.6813049316406, + "logps/rejected": -455.77520751953125, + "loss": 0.0315, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.3364861011505127, + "rewards/margins": 9.599719047546387, + "rewards/rejected": -7.263233184814453, + "step": 1730 + }, + { + "epoch": 0.59, + "learning_rate": 4.460531285408536e-07, + "logits/chosen": 1.053497552871704, + "logits/rejected": 2.9759325981140137, + "logps/chosen": -494.638671875, + "logps/rejected": -478.7875061035156, + "loss": 0.0333, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7100532054901123, + "rewards/margins": 9.044734954833984, + "rewards/rejected": -7.334682464599609, + "step": 1740 + }, + { + "epoch": 0.59, + "learning_rate": 4.4542364345965e-07, + "logits/chosen": 1.587790608406067, + "logits/rejected": 3.05256986618042, + "logps/chosen": -425.80120849609375, + "logps/rejected": -446.116455078125, + "loss": 0.0327, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.355698347091675, + "rewards/margins": 10.500600814819336, + "rewards/rejected": -8.144902229309082, + "step": 1750 + }, + { + "epoch": 0.6, + "learning_rate": 4.4479415837844643e-07, + "logits/chosen": 0.9711793065071106, + "logits/rejected": 3.033139705657959, + "logps/chosen": -419.15093994140625, + "logps/rejected": -527.05908203125, + "loss": 0.0184, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8909587860107422, + "rewards/margins": 9.026007652282715, + "rewards/rejected": -7.135048866271973, + "step": 1760 + }, + { + "epoch": 0.6, + "learning_rate": 4.4416467329724285e-07, + "logits/chosen": 1.252915620803833, + "logits/rejected": 2.7287914752960205, + "logps/chosen": -316.9794921875, + "logps/rejected": -696.9098510742188, + "loss": 0.025, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.2421488761901855, + "rewards/margins": 9.83061408996582, + "rewards/rejected": -7.588465213775635, + "step": 1770 + }, + { + "epoch": 0.61, + "learning_rate": 4.435351882160392e-07, + "logits/chosen": 1.7731847763061523, + "logits/rejected": 3.256561756134033, + "logps/chosen": -392.49127197265625, + "logps/rejected": -497.767822265625, + "loss": 0.0289, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7031304836273193, + "rewards/margins": 8.996789932250977, + "rewards/rejected": -7.293660640716553, + "step": 1780 + }, + { + "epoch": 0.61, + "learning_rate": 4.4290570313483564e-07, + "logits/chosen": 1.0066537857055664, + "logits/rejected": 2.923328161239624, + "logps/chosen": -497.8016662597656, + "logps/rejected": -501.8356018066406, + "loss": 0.0407, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.5872151851654053, + "rewards/margins": 10.02701187133789, + "rewards/rejected": -8.439796447753906, + "step": 1790 + }, + { + "epoch": 0.61, + "learning_rate": 4.422762180536321e-07, + "logits/chosen": 0.7579858899116516, + "logits/rejected": 2.7158029079437256, + "logps/chosen": -332.276123046875, + "logps/rejected": -624.5234375, + "loss": 0.0225, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.291430950164795, + "rewards/margins": 8.889410018920898, + "rewards/rejected": -6.597978115081787, + "step": 1800 + }, + { + "epoch": 0.61, + "eval_logits/chosen": 0.8362001776695251, + "eval_logits/rejected": 3.0442867279052734, + "eval_logps/chosen": -366.83953857421875, + "eval_logps/rejected": -573.6825561523438, + "eval_loss": 0.04210779815912247, + "eval_rewards/accuracies": 0.9890572428703308, + "eval_rewards/chosen": 2.2150368690490723, + "eval_rewards/margins": 9.32026195526123, + "eval_rewards/rejected": -7.105224609375, + "eval_runtime": 267.6481, + "eval_samples_per_second": 35.494, + "eval_steps_per_second": 1.11, + "step": 1800 + }, + { + "epoch": 0.62, + "learning_rate": 4.4164673297242854e-07, + "logits/chosen": 1.06367027759552, + "logits/rejected": 2.7651069164276123, + "logps/chosen": -433.72210693359375, + "logps/rejected": -608.6502075195312, + "loss": 0.0341, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3269495964050293, + "rewards/margins": 10.384018898010254, + "rewards/rejected": -8.057069778442383, + "step": 1810 + }, + { + "epoch": 0.62, + "learning_rate": 4.4101724789122497e-07, + "logits/chosen": 1.0821582078933716, + "logits/rejected": 2.331120491027832, + "logps/chosen": -378.4905090332031, + "logps/rejected": -800.6824340820312, + "loss": 0.026, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.1085920333862305, + "rewards/margins": 8.5978422164917, + "rewards/rejected": -6.489250183105469, + "step": 1820 + }, + { + "epoch": 0.62, + "learning_rate": 4.403877628100214e-07, + "logits/chosen": 1.1348297595977783, + "logits/rejected": 3.279541015625, + "logps/chosen": -381.3242492675781, + "logps/rejected": -529.54052734375, + "loss": 0.0213, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.102947950363159, + "rewards/margins": 8.581428527832031, + "rewards/rejected": -6.478480339050293, + "step": 1830 + }, + { + "epoch": 0.63, + "learning_rate": 4.397582777288178e-07, + "logits/chosen": 1.0020978450775146, + "logits/rejected": 2.961073875427246, + "logps/chosen": -446.6199645996094, + "logps/rejected": -650.9901123046875, + "loss": 0.0278, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3660645484924316, + "rewards/margins": 9.614389419555664, + "rewards/rejected": -7.248324394226074, + "step": 1840 + }, + { + "epoch": 0.63, + "learning_rate": 4.3912879264761423e-07, + "logits/chosen": 1.8732408285140991, + "logits/rejected": 2.957521915435791, + "logps/chosen": -387.747802734375, + "logps/rejected": -548.1691284179688, + "loss": 0.0216, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9975941181182861, + "rewards/margins": 8.945771217346191, + "rewards/rejected": -6.948177337646484, + "step": 1850 + }, + { + "epoch": 0.63, + "learning_rate": 4.3849930756641066e-07, + "logits/chosen": 0.8730059862136841, + "logits/rejected": 2.674221992492676, + "logps/chosen": -351.84716796875, + "logps/rejected": -652.6336669921875, + "loss": 0.0283, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.034841537475586, + "rewards/margins": 9.151686668395996, + "rewards/rejected": -7.116845607757568, + "step": 1860 + }, + { + "epoch": 0.64, + "learning_rate": 4.378698224852071e-07, + "logits/chosen": 1.5931994915008545, + "logits/rejected": 3.244697093963623, + "logps/chosen": -370.19097900390625, + "logps/rejected": -500.9292907714844, + "loss": 0.0333, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7228028774261475, + "rewards/margins": 9.614213943481445, + "rewards/rejected": -7.891410827636719, + "step": 1870 + }, + { + "epoch": 0.64, + "learning_rate": 4.372403374040035e-07, + "logits/chosen": 0.840854287147522, + "logits/rejected": 3.0632858276367188, + "logps/chosen": -317.24822998046875, + "logps/rejected": -540.1351928710938, + "loss": 0.0299, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6507081985473633, + "rewards/margins": 9.825498580932617, + "rewards/rejected": -8.174789428710938, + "step": 1880 + }, + { + "epoch": 0.64, + "learning_rate": 4.366108523227999e-07, + "logits/chosen": 1.8016700744628906, + "logits/rejected": 2.8221445083618164, + "logps/chosen": -406.5692138671875, + "logps/rejected": -540.0714111328125, + "loss": 0.0376, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.989933967590332, + "rewards/margins": 9.233619689941406, + "rewards/rejected": -7.243685722351074, + "step": 1890 + }, + { + "epoch": 0.65, + "learning_rate": 4.3598136724159635e-07, + "logits/chosen": 1.2036279439926147, + "logits/rejected": 3.0226616859436035, + "logps/chosen": -446.08294677734375, + "logps/rejected": -568.0128784179688, + "loss": 0.0298, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3940460681915283, + "rewards/margins": 9.725221633911133, + "rewards/rejected": -7.331175804138184, + "step": 1900 + }, + { + "epoch": 0.65, + "eval_logits/chosen": 0.8392346501350403, + "eval_logits/rejected": 3.0305709838867188, + "eval_logps/chosen": -368.1361389160156, + "eval_logps/rejected": -579.7667846679688, + "eval_loss": 0.03639867901802063, + "eval_rewards/accuracies": 0.9882155060768127, + "eval_rewards/chosen": 2.085376262664795, + "eval_rewards/margins": 9.799016952514648, + "eval_rewards/rejected": -7.713641166687012, + "eval_runtime": 267.3409, + "eval_samples_per_second": 35.535, + "eval_steps_per_second": 1.111, + "step": 1900 + }, + { + "epoch": 0.65, + "learning_rate": 4.3535188216039277e-07, + "logits/chosen": 1.0752043724060059, + "logits/rejected": 2.855588674545288, + "logps/chosen": -346.389404296875, + "logps/rejected": -544.1331787109375, + "loss": 0.0359, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.9727954864501953, + "rewards/margins": 9.62835693359375, + "rewards/rejected": -7.655562400817871, + "step": 1910 + }, + { + "epoch": 0.65, + "learning_rate": 4.3472239707918925e-07, + "logits/chosen": 0.7856771945953369, + "logits/rejected": 3.2946181297302246, + "logps/chosen": -305.7819519042969, + "logps/rejected": -429.08038330078125, + "loss": 0.0207, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1681599617004395, + "rewards/margins": 9.955987930297852, + "rewards/rejected": -7.787827968597412, + "step": 1920 + }, + { + "epoch": 0.66, + "learning_rate": 4.3409291199798567e-07, + "logits/chosen": 1.8215280771255493, + "logits/rejected": 2.743664264678955, + "logps/chosen": -331.0490417480469, + "logps/rejected": -699.108154296875, + "loss": 0.0213, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.219869375228882, + "rewards/margins": 8.896055221557617, + "rewards/rejected": -6.67618465423584, + "step": 1930 + }, + { + "epoch": 0.66, + "learning_rate": 4.3346342691678204e-07, + "logits/chosen": 1.0438158512115479, + "logits/rejected": 2.8148791790008545, + "logps/chosen": -324.4605407714844, + "logps/rejected": -731.7349853515625, + "loss": 0.0243, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9487861394882202, + "rewards/margins": 9.898294448852539, + "rewards/rejected": -7.9495062828063965, + "step": 1940 + }, + { + "epoch": 0.66, + "learning_rate": 4.3283394183557846e-07, + "logits/chosen": 1.4545763731002808, + "logits/rejected": 2.9583208560943604, + "logps/chosen": -527.7854614257812, + "logps/rejected": -516.2374877929688, + "loss": 0.0415, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.4202537536621094, + "rewards/margins": 9.624616622924805, + "rewards/rejected": -7.204362392425537, + "step": 1950 + }, + { + "epoch": 0.67, + "learning_rate": 4.322044567543749e-07, + "logits/chosen": 1.4765589237213135, + "logits/rejected": 3.1827173233032227, + "logps/chosen": -379.47216796875, + "logps/rejected": -503.27569580078125, + "loss": 0.0348, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.047214984893799, + "rewards/margins": 9.799707412719727, + "rewards/rejected": -7.752492427825928, + "step": 1960 + }, + { + "epoch": 0.67, + "learning_rate": 4.315749716731713e-07, + "logits/chosen": 1.7842438220977783, + "logits/rejected": 2.418262481689453, + "logps/chosen": -389.3029479980469, + "logps/rejected": -714.0911254882812, + "loss": 0.0325, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.008929967880249, + "rewards/margins": 8.412801742553711, + "rewards/rejected": -6.403871059417725, + "step": 1970 + }, + { + "epoch": 0.67, + "learning_rate": 4.309454865919678e-07, + "logits/chosen": 1.215798258781433, + "logits/rejected": 3.197004795074463, + "logps/chosen": -390.9176940917969, + "logps/rejected": -528.724609375, + "loss": 0.0248, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.9435479640960693, + "rewards/margins": 8.589653968811035, + "rewards/rejected": -6.6461052894592285, + "step": 1980 + }, + { + "epoch": 0.68, + "learning_rate": 4.303160015107642e-07, + "logits/chosen": 1.5803320407867432, + "logits/rejected": 3.488564968109131, + "logps/chosen": -419.08111572265625, + "logps/rejected": -409.6341247558594, + "loss": 0.0232, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.560288906097412, + "rewards/margins": 8.865260124206543, + "rewards/rejected": -7.304971218109131, + "step": 1990 + }, + { + "epoch": 0.68, + "learning_rate": 4.2968651642956063e-07, + "logits/chosen": 1.274867057800293, + "logits/rejected": 2.914517879486084, + "logps/chosen": -367.41143798828125, + "logps/rejected": -487.2129821777344, + "loss": 0.0255, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.8918441534042358, + "rewards/margins": 9.418905258178711, + "rewards/rejected": -7.527061462402344, + "step": 2000 + }, + { + "epoch": 0.68, + "eval_logits/chosen": 0.8292204737663269, + "eval_logits/rejected": 3.020355701446533, + "eval_logps/chosen": -367.63873291015625, + "eval_logps/rejected": -579.4823608398438, + "eval_loss": 0.035267189145088196, + "eval_rewards/accuracies": 0.9907407164573669, + "eval_rewards/chosen": 2.135115623474121, + "eval_rewards/margins": 9.820322036743164, + "eval_rewards/rejected": -7.685206413269043, + "eval_runtime": 267.0722, + "eval_samples_per_second": 35.571, + "eval_steps_per_second": 1.112, + "step": 2000 + }, + { + "epoch": 0.68, + "learning_rate": 4.29057031348357e-07, + "logits/chosen": 1.5710475444793701, + "logits/rejected": 3.378007411956787, + "logps/chosen": -315.57305908203125, + "logps/rejected": -575.1492919921875, + "loss": 0.0214, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9405778646469116, + "rewards/margins": 9.704689025878906, + "rewards/rejected": -7.764111518859863, + "step": 2010 + }, + { + "epoch": 0.69, + "learning_rate": 4.284275462671534e-07, + "logits/chosen": 1.243847131729126, + "logits/rejected": 3.022942304611206, + "logps/chosen": -323.0140075683594, + "logps/rejected": -645.990478515625, + "loss": 0.0257, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4514198303222656, + "rewards/margins": 10.53857135772705, + "rewards/rejected": -8.087152481079102, + "step": 2020 + }, + { + "epoch": 0.69, + "learning_rate": 4.2779806118594984e-07, + "logits/chosen": 0.9828505516052246, + "logits/rejected": 2.906207323074341, + "logps/chosen": -411.5979919433594, + "logps/rejected": -462.4324645996094, + "loss": 0.0244, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.9616676568984985, + "rewards/margins": 8.81393051147461, + "rewards/rejected": -6.852262020111084, + "step": 2030 + }, + { + "epoch": 0.69, + "learning_rate": 4.271685761047463e-07, + "logits/chosen": 0.7124877572059631, + "logits/rejected": 3.1481852531433105, + "logps/chosen": -312.6952819824219, + "logps/rejected": -522.9578857421875, + "loss": 0.0188, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.42478346824646, + "rewards/margins": 11.089437484741211, + "rewards/rejected": -8.664652824401855, + "step": 2040 + }, + { + "epoch": 0.7, + "learning_rate": 4.2653909102354274e-07, + "logits/chosen": 1.7187106609344482, + "logits/rejected": 3.0672192573547363, + "logps/chosen": -375.66827392578125, + "logps/rejected": -492.21087646484375, + "loss": 0.0222, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.80177903175354, + "rewards/margins": 9.827333450317383, + "rewards/rejected": -8.025555610656738, + "step": 2050 + }, + { + "epoch": 0.7, + "learning_rate": 4.2590960594233917e-07, + "logits/chosen": 1.6581732034683228, + "logits/rejected": 3.062495231628418, + "logps/chosen": -395.61846923828125, + "logps/rejected": -509.40985107421875, + "loss": 0.0251, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8729314804077148, + "rewards/margins": 9.967903137207031, + "rewards/rejected": -8.094970703125, + "step": 2060 + }, + { + "epoch": 0.7, + "learning_rate": 4.252801208611356e-07, + "logits/chosen": 1.193673849105835, + "logits/rejected": 3.2088775634765625, + "logps/chosen": -363.78912353515625, + "logps/rejected": -526.5405883789062, + "loss": 0.0265, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.700319290161133, + "rewards/margins": 9.979622840881348, + "rewards/rejected": -7.279304504394531, + "step": 2070 + }, + { + "epoch": 0.71, + "learning_rate": 4.24650635779932e-07, + "logits/chosen": 1.1417957544326782, + "logits/rejected": 3.5138957500457764, + "logps/chosen": -296.9524841308594, + "logps/rejected": -465.9798889160156, + "loss": 0.0216, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.034841775894165, + "rewards/margins": 11.99071979522705, + "rewards/rejected": -9.955877304077148, + "step": 2080 + }, + { + "epoch": 0.71, + "learning_rate": 4.240211506987284e-07, + "logits/chosen": 0.8898839950561523, + "logits/rejected": 2.8017234802246094, + "logps/chosen": -394.20703125, + "logps/rejected": -560.9544067382812, + "loss": 0.0232, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.06630277633667, + "rewards/margins": 11.01480770111084, + "rewards/rejected": -8.948505401611328, + "step": 2090 + }, + { + "epoch": 0.71, + "learning_rate": 4.233916656175248e-07, + "logits/chosen": 1.0146639347076416, + "logits/rejected": 2.3725476264953613, + "logps/chosen": -412.14776611328125, + "logps/rejected": -777.4273681640625, + "loss": 0.019, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.079822063446045, + "rewards/margins": 10.065254211425781, + "rewards/rejected": -7.985433101654053, + "step": 2100 + }, + { + "epoch": 0.71, + "eval_logits/chosen": 0.8412470817565918, + "eval_logits/rejected": 3.0052082538604736, + "eval_logps/chosen": -367.7745361328125, + "eval_logps/rejected": -584.4203491210938, + "eval_loss": 0.029633017256855965, + "eval_rewards/accuracies": 0.9915825128555298, + "eval_rewards/chosen": 2.1215357780456543, + "eval_rewards/margins": 10.300537109375, + "eval_rewards/rejected": -8.179000854492188, + "eval_runtime": 267.8451, + "eval_samples_per_second": 35.468, + "eval_steps_per_second": 1.109, + "step": 2100 + }, + { + "epoch": 0.72, + "learning_rate": 4.227621805363213e-07, + "logits/chosen": 1.0731689929962158, + "logits/rejected": 2.9808132648468018, + "logps/chosen": -345.23016357421875, + "logps/rejected": -500.6625061035156, + "loss": 0.0349, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.9625818729400635, + "rewards/margins": 10.356372833251953, + "rewards/rejected": -8.393791198730469, + "step": 2110 + }, + { + "epoch": 0.72, + "learning_rate": 4.221326954551177e-07, + "logits/chosen": 1.323359727859497, + "logits/rejected": 2.431626081466675, + "logps/chosen": -340.8362731933594, + "logps/rejected": -664.531494140625, + "loss": 0.026, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.480254650115967, + "rewards/margins": 10.480634689331055, + "rewards/rejected": -8.00037956237793, + "step": 2120 + }, + { + "epoch": 0.72, + "learning_rate": 4.215032103739141e-07, + "logits/chosen": 1.2426482439041138, + "logits/rejected": 2.7286458015441895, + "logps/chosen": -382.8769836425781, + "logps/rejected": -631.4144287109375, + "loss": 0.0187, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.8545036315917969, + "rewards/margins": 10.803675651550293, + "rewards/rejected": -8.949172019958496, + "step": 2130 + }, + { + "epoch": 0.73, + "learning_rate": 4.2087372529271055e-07, + "logits/chosen": 1.8994210958480835, + "logits/rejected": 3.1800224781036377, + "logps/chosen": -349.4104919433594, + "logps/rejected": -475.03564453125, + "loss": 0.0814, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8960243463516235, + "rewards/margins": 10.89116096496582, + "rewards/rejected": -8.995137214660645, + "step": 2140 + }, + { + "epoch": 0.73, + "learning_rate": 4.2024424021150697e-07, + "logits/chosen": 1.1531842947006226, + "logits/rejected": 2.8384928703308105, + "logps/chosen": -323.14703369140625, + "logps/rejected": -614.2777709960938, + "loss": 0.0295, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.258028030395508, + "rewards/margins": 10.949602127075195, + "rewards/rejected": -8.691572189331055, + "step": 2150 + }, + { + "epoch": 0.73, + "learning_rate": 4.1961475513030334e-07, + "logits/chosen": 1.6405277252197266, + "logits/rejected": 3.201380968093872, + "logps/chosen": -454.3759765625, + "logps/rejected": -358.6201171875, + "loss": 0.031, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.8849576711654663, + "rewards/margins": 10.349023818969727, + "rewards/rejected": -8.464067459106445, + "step": 2160 + }, + { + "epoch": 0.74, + "learning_rate": 4.189852700490998e-07, + "logits/chosen": 1.3615721464157104, + "logits/rejected": 3.039472818374634, + "logps/chosen": -371.7926025390625, + "logps/rejected": -482.4500427246094, + "loss": 0.0416, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9604123830795288, + "rewards/margins": 11.068208694458008, + "rewards/rejected": -9.107797622680664, + "step": 2170 + }, + { + "epoch": 0.74, + "learning_rate": 4.1835578496789624e-07, + "logits/chosen": 1.3801050186157227, + "logits/rejected": 2.936621904373169, + "logps/chosen": -394.67620849609375, + "logps/rejected": -551.686767578125, + "loss": 0.0321, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2390217781066895, + "rewards/margins": 10.892252922058105, + "rewards/rejected": -8.653233528137207, + "step": 2180 + }, + { + "epoch": 0.74, + "learning_rate": 4.1772629988669266e-07, + "logits/chosen": 1.5016125440597534, + "logits/rejected": 2.9172017574310303, + "logps/chosen": -386.29168701171875, + "logps/rejected": -602.5443725585938, + "loss": 0.0189, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.164307117462158, + "rewards/margins": 12.321807861328125, + "rewards/rejected": -10.157500267028809, + "step": 2190 + }, + { + "epoch": 0.75, + "learning_rate": 4.170968148054891e-07, + "logits/chosen": 1.2123631238937378, + "logits/rejected": 2.915442705154419, + "logps/chosen": -608.3240966796875, + "logps/rejected": -524.0863037109375, + "loss": 0.0198, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4522815942764282, + "rewards/margins": 10.55662727355957, + "rewards/rejected": -9.10434627532959, + "step": 2200 + }, + { + "epoch": 0.75, + "eval_logits/chosen": 0.8183408975601196, + "eval_logits/rejected": 2.9877638816833496, + "eval_logps/chosen": -367.7718811035156, + "eval_logps/rejected": -586.932373046875, + "eval_loss": 0.024771658703684807, + "eval_rewards/accuracies": 0.9907407164573669, + "eval_rewards/chosen": 2.12180233001709, + "eval_rewards/margins": 10.552002906799316, + "eval_rewards/rejected": -8.43019962310791, + "eval_runtime": 267.5281, + "eval_samples_per_second": 35.51, + "eval_steps_per_second": 1.11, + "step": 2200 + }, + { + "epoch": 0.75, + "learning_rate": 4.164673297242855e-07, + "logits/chosen": 1.6179412603378296, + "logits/rejected": 2.405421733856201, + "logps/chosen": -524.8575439453125, + "logps/rejected": -612.1927490234375, + "loss": 0.0239, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.593804121017456, + "rewards/margins": 9.6146240234375, + "rewards/rejected": -8.020818710327148, + "step": 2210 + }, + { + "epoch": 0.75, + "learning_rate": 4.1583784464308193e-07, + "logits/chosen": 0.659404456615448, + "logits/rejected": 2.946692943572998, + "logps/chosen": -289.25628662109375, + "logps/rejected": -510.9583435058594, + "loss": 0.0309, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.205263376235962, + "rewards/margins": 10.87122917175293, + "rewards/rejected": -8.665966033935547, + "step": 2220 + }, + { + "epoch": 0.76, + "learning_rate": 4.152083595618784e-07, + "logits/chosen": 1.4942001104354858, + "logits/rejected": 3.1428303718566895, + "logps/chosen": -299.8153991699219, + "logps/rejected": -480.1278381347656, + "loss": 0.0201, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.1747162342071533, + "rewards/margins": 10.812664985656738, + "rewards/rejected": -8.637947082519531, + "step": 2230 + }, + { + "epoch": 0.76, + "learning_rate": 4.145788744806748e-07, + "logits/chosen": 1.2512485980987549, + "logits/rejected": 2.7009167671203613, + "logps/chosen": -316.3193054199219, + "logps/rejected": -531.200439453125, + "loss": 0.0124, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.160095691680908, + "rewards/margins": 9.601961135864258, + "rewards/rejected": -7.441866397857666, + "step": 2240 + }, + { + "epoch": 0.76, + "learning_rate": 4.139493893994712e-07, + "logits/chosen": 1.755748987197876, + "logits/rejected": 3.5284976959228516, + "logps/chosen": -345.405029296875, + "logps/rejected": -425.1649475097656, + "loss": 0.0194, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.156236171722412, + "rewards/margins": 10.638303756713867, + "rewards/rejected": -8.482067108154297, + "step": 2250 + }, + { + "epoch": 0.77, + "learning_rate": 4.133199043182676e-07, + "logits/chosen": 1.370705008506775, + "logits/rejected": 3.1592979431152344, + "logps/chosen": -323.472900390625, + "logps/rejected": -570.658203125, + "loss": 0.0228, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0871829986572266, + "rewards/margins": 11.190869331359863, + "rewards/rejected": -9.10368537902832, + "step": 2260 + }, + { + "epoch": 0.77, + "learning_rate": 4.1269041923706404e-07, + "logits/chosen": 0.9058796167373657, + "logits/rejected": 3.13399076461792, + "logps/chosen": -415.1564025878906, + "logps/rejected": -534.84521484375, + "loss": 0.0214, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.332183837890625, + "rewards/margins": 11.294748306274414, + "rewards/rejected": -8.962564468383789, + "step": 2270 + }, + { + "epoch": 0.77, + "learning_rate": 4.1206093415586047e-07, + "logits/chosen": 1.493805170059204, + "logits/rejected": 2.8733553886413574, + "logps/chosen": -335.87750244140625, + "logps/rejected": -545.4400024414062, + "loss": 0.0257, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.600989580154419, + "rewards/margins": 10.510766983032227, + "rewards/rejected": -7.909777641296387, + "step": 2280 + }, + { + "epoch": 0.78, + "learning_rate": 4.1143144907465694e-07, + "logits/chosen": 1.5079195499420166, + "logits/rejected": 2.865042209625244, + "logps/chosen": -333.11895751953125, + "logps/rejected": -583.3126831054688, + "loss": 0.0405, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3659024238586426, + "rewards/margins": 10.046285629272461, + "rewards/rejected": -7.680383205413818, + "step": 2290 + }, + { + "epoch": 0.78, + "learning_rate": 4.1080196399345336e-07, + "logits/chosen": 1.1294147968292236, + "logits/rejected": 2.9211654663085938, + "logps/chosen": -356.6783447265625, + "logps/rejected": -602.4715576171875, + "loss": 0.0192, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0863723754882812, + "rewards/margins": 9.908502578735352, + "rewards/rejected": -7.822129726409912, + "step": 2300 + }, + { + "epoch": 0.78, + "eval_logits/chosen": 0.7942115664482117, + "eval_logits/rejected": 2.975782871246338, + "eval_logps/chosen": -368.0401916503906, + "eval_logps/rejected": -585.2183837890625, + "eval_loss": 0.023808879777789116, + "eval_rewards/accuracies": 0.9924242496490479, + "eval_rewards/chosen": 2.094971179962158, + "eval_rewards/margins": 10.353780746459961, + "eval_rewards/rejected": -8.258810043334961, + "eval_runtime": 268.1286, + "eval_samples_per_second": 35.431, + "eval_steps_per_second": 1.108, + "step": 2300 + }, + { + "epoch": 0.79, + "learning_rate": 4.101724789122498e-07, + "logits/chosen": 0.789573073387146, + "logits/rejected": 2.8354058265686035, + "logps/chosen": -455.05389404296875, + "logps/rejected": -584.878173828125, + "loss": 0.0255, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0611634254455566, + "rewards/margins": 9.882638931274414, + "rewards/rejected": -7.821475982666016, + "step": 2310 + }, + { + "epoch": 0.79, + "learning_rate": 4.0954299383104616e-07, + "logits/chosen": 1.5280182361602783, + "logits/rejected": 3.006089210510254, + "logps/chosen": -358.0874328613281, + "logps/rejected": -526.4508666992188, + "loss": 0.0194, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.2880072593688965, + "rewards/margins": 9.189247131347656, + "rewards/rejected": -6.901240348815918, + "step": 2320 + }, + { + "epoch": 0.79, + "learning_rate": 4.089135087498426e-07, + "logits/chosen": 1.7580665349960327, + "logits/rejected": 3.0975661277770996, + "logps/chosen": -551.3494873046875, + "logps/rejected": -511.8233947753906, + "loss": 0.0276, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.6591579914093018, + "rewards/margins": 10.34562873840332, + "rewards/rejected": -8.686470031738281, + "step": 2330 + }, + { + "epoch": 0.8, + "learning_rate": 4.08284023668639e-07, + "logits/chosen": 1.2768250703811646, + "logits/rejected": 1.9886287450790405, + "logps/chosen": -363.474853515625, + "logps/rejected": -855.5643310546875, + "loss": 0.0412, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3451056480407715, + "rewards/margins": 10.982662200927734, + "rewards/rejected": -8.637556076049805, + "step": 2340 + }, + { + "epoch": 0.8, + "learning_rate": 4.076545385874355e-07, + "logits/chosen": 1.0143083333969116, + "logits/rejected": 3.3220772743225098, + "logps/chosen": -345.3077087402344, + "logps/rejected": -444.23480224609375, + "loss": 0.0172, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.1893310546875, + "rewards/margins": 11.086088180541992, + "rewards/rejected": -8.896757125854492, + "step": 2350 + }, + { + "epoch": 0.8, + "learning_rate": 4.070250535062319e-07, + "logits/chosen": 0.9742077589035034, + "logits/rejected": 3.147322177886963, + "logps/chosen": -305.42431640625, + "logps/rejected": -496.04156494140625, + "loss": 0.0237, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.3513169288635254, + "rewards/margins": 11.449823379516602, + "rewards/rejected": -9.098505020141602, + "step": 2360 + }, + { + "epoch": 0.81, + "learning_rate": 4.063955684250283e-07, + "logits/chosen": 1.33650803565979, + "logits/rejected": 2.385652542114258, + "logps/chosen": -355.97296142578125, + "logps/rejected": -673.2279663085938, + "loss": 0.0244, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8403629064559937, + "rewards/margins": 10.767081260681152, + "rewards/rejected": -8.926717758178711, + "step": 2370 + }, + { + "epoch": 0.81, + "learning_rate": 4.0576608334382475e-07, + "logits/chosen": 1.350214958190918, + "logits/rejected": 3.0495901107788086, + "logps/chosen": -378.8364562988281, + "logps/rejected": -482.08209228515625, + "loss": 0.016, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.1247801780700684, + "rewards/margins": 11.251646041870117, + "rewards/rejected": -9.126866340637207, + "step": 2380 + }, + { + "epoch": 0.81, + "learning_rate": 4.051365982626211e-07, + "logits/chosen": 0.7518723011016846, + "logits/rejected": 2.554800271987915, + "logps/chosen": -378.2152404785156, + "logps/rejected": -664.992919921875, + "loss": 0.0191, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0884647369384766, + "rewards/margins": 10.981764793395996, + "rewards/rejected": -8.893301010131836, + "step": 2390 + }, + { + "epoch": 0.82, + "learning_rate": 4.0450711318141754e-07, + "logits/chosen": 0.7830019593238831, + "logits/rejected": 2.8458642959594727, + "logps/chosen": -339.97833251953125, + "logps/rejected": -581.98095703125, + "loss": 0.0191, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2123093605041504, + "rewards/margins": 10.689801216125488, + "rewards/rejected": -8.477492332458496, + "step": 2400 + }, + { + "epoch": 0.82, + "eval_logits/chosen": 0.8048932552337646, + "eval_logits/rejected": 2.971872091293335, + "eval_logps/chosen": -367.2885437011719, + "eval_logps/rejected": -589.0294799804688, + "eval_loss": 0.02128712832927704, + "eval_rewards/accuracies": 0.994107723236084, + "eval_rewards/chosen": 2.170135021209717, + "eval_rewards/margins": 10.810053825378418, + "eval_rewards/rejected": -8.63991928100586, + "eval_runtime": 268.8871, + "eval_samples_per_second": 35.331, + "eval_steps_per_second": 1.105, + "step": 2400 + }, + { + "epoch": 0.82, + "learning_rate": 4.03877628100214e-07, + "logits/chosen": 1.4170640707015991, + "logits/rejected": 3.0084924697875977, + "logps/chosen": -338.78497314453125, + "logps/rejected": -485.98419189453125, + "loss": 0.017, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.2607309818267822, + "rewards/margins": 10.983499526977539, + "rewards/rejected": -8.722768783569336, + "step": 2410 + }, + { + "epoch": 0.82, + "learning_rate": 4.0324814301901044e-07, + "logits/chosen": 1.5939487218856812, + "logits/rejected": 2.8795523643493652, + "logps/chosen": -412.08203125, + "logps/rejected": -499.1363220214844, + "loss": 0.0188, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.2640907764434814, + "rewards/margins": 10.028116226196289, + "rewards/rejected": -7.7640252113342285, + "step": 2420 + }, + { + "epoch": 0.83, + "learning_rate": 4.0261865793780686e-07, + "logits/chosen": 1.2058733701705933, + "logits/rejected": 2.6473612785339355, + "logps/chosen": -375.0848693847656, + "logps/rejected": -608.9097290039062, + "loss": 0.0205, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1971919536590576, + "rewards/margins": 10.215367317199707, + "rewards/rejected": -8.01817512512207, + "step": 2430 + }, + { + "epoch": 0.83, + "learning_rate": 4.019891728566033e-07, + "logits/chosen": 1.4290034770965576, + "logits/rejected": 2.5581865310668945, + "logps/chosen": -384.47479248046875, + "logps/rejected": -717.1683349609375, + "loss": 0.0169, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8649975061416626, + "rewards/margins": 11.204587936401367, + "rewards/rejected": -9.339591026306152, + "step": 2440 + }, + { + "epoch": 0.83, + "learning_rate": 4.013596877753997e-07, + "logits/chosen": 0.9973786473274231, + "logits/rejected": 2.5864202976226807, + "logps/chosen": -328.24285888671875, + "logps/rejected": -633.4420166015625, + "loss": 0.0219, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.9440271854400635, + "rewards/margins": 10.755793571472168, + "rewards/rejected": -8.81176471710205, + "step": 2450 + }, + { + "epoch": 0.84, + "learning_rate": 4.0073020269419613e-07, + "logits/chosen": 1.055772066116333, + "logits/rejected": 2.9290902614593506, + "logps/chosen": -300.1732177734375, + "logps/rejected": -595.3922119140625, + "loss": 0.0268, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.2839577198028564, + "rewards/margins": 11.178118705749512, + "rewards/rejected": -8.894161224365234, + "step": 2460 + }, + { + "epoch": 0.84, + "learning_rate": 4.0010071761299255e-07, + "logits/chosen": 1.3420003652572632, + "logits/rejected": 2.7169554233551025, + "logps/chosen": -376.03997802734375, + "logps/rejected": -562.87939453125, + "loss": 0.0239, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.4601471424102783, + "rewards/margins": 12.127756118774414, + "rewards/rejected": -9.667607307434082, + "step": 2470 + }, + { + "epoch": 0.84, + "learning_rate": 3.99471232531789e-07, + "logits/chosen": 1.2516037225723267, + "logits/rejected": 3.1634230613708496, + "logps/chosen": -407.861328125, + "logps/rejected": -549.5359497070312, + "loss": 0.0963, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.2946739196777344, + "rewards/margins": 10.591371536254883, + "rewards/rejected": -9.296697616577148, + "step": 2480 + }, + { + "epoch": 0.85, + "learning_rate": 3.988417474505854e-07, + "logits/chosen": 1.7729272842407227, + "logits/rejected": 3.2436935901641846, + "logps/chosen": -366.273681640625, + "logps/rejected": -456.9263610839844, + "loss": 0.018, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1124117374420166, + "rewards/margins": 10.958487510681152, + "rewards/rejected": -8.846076965332031, + "step": 2490 + }, + { + "epoch": 0.85, + "learning_rate": 3.982122623693818e-07, + "logits/chosen": 1.80498468875885, + "logits/rejected": 3.289989471435547, + "logps/chosen": -427.5309143066406, + "logps/rejected": -459.2511291503906, + "loss": 0.0215, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.114067792892456, + "rewards/margins": 12.101420402526855, + "rewards/rejected": -9.987353324890137, + "step": 2500 + }, + { + "epoch": 0.85, + "eval_logits/chosen": 0.7799403071403503, + "eval_logits/rejected": 2.9391396045684814, + "eval_logps/chosen": -367.76953125, + "eval_logps/rejected": -594.5902099609375, + "eval_loss": 0.022420957684516907, + "eval_rewards/accuracies": 0.9932659864425659, + "eval_rewards/chosen": 2.1220319271087646, + "eval_rewards/margins": 11.318024635314941, + "eval_rewards/rejected": -9.195991516113281, + "eval_runtime": 268.5512, + "eval_samples_per_second": 35.375, + "eval_steps_per_second": 1.106, + "step": 2500 + }, + { + "epoch": 0.85, + "learning_rate": 3.9758277728817824e-07, + "logits/chosen": 1.6378087997436523, + "logits/rejected": 2.7543601989746094, + "logps/chosen": -395.02276611328125, + "logps/rejected": -580.5901489257812, + "loss": 0.0188, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1616625785827637, + "rewards/margins": 11.545696258544922, + "rewards/rejected": -9.384033203125, + "step": 2510 + }, + { + "epoch": 0.86, + "learning_rate": 3.9695329220697467e-07, + "logits/chosen": 0.9824737310409546, + "logits/rejected": 2.4738383293151855, + "logps/chosen": -315.70977783203125, + "logps/rejected": -785.3726806640625, + "loss": 0.0189, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2579476833343506, + "rewards/margins": 11.292442321777344, + "rewards/rejected": -9.034494400024414, + "step": 2520 + }, + { + "epoch": 0.86, + "learning_rate": 3.9632380712577114e-07, + "logits/chosen": 1.0176562070846558, + "logits/rejected": 2.7639870643615723, + "logps/chosen": -386.0313415527344, + "logps/rejected": -576.0577392578125, + "loss": 0.0247, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.4897618293762207, + "rewards/margins": 10.882287979125977, + "rewards/rejected": -8.392526626586914, + "step": 2530 + }, + { + "epoch": 0.86, + "learning_rate": 3.9569432204456756e-07, + "logits/chosen": 1.5484209060668945, + "logits/rejected": 2.956909656524658, + "logps/chosen": -316.6213684082031, + "logps/rejected": -477.16070556640625, + "loss": 0.0275, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.1205673217773438, + "rewards/margins": 10.799238204956055, + "rewards/rejected": -8.678671836853027, + "step": 2540 + }, + { + "epoch": 0.87, + "learning_rate": 3.9506483696336393e-07, + "logits/chosen": 1.5496317148208618, + "logits/rejected": 2.9354426860809326, + "logps/chosen": -373.88018798828125, + "logps/rejected": -591.4130859375, + "loss": 0.018, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0508036613464355, + "rewards/margins": 11.654314041137695, + "rewards/rejected": -9.603509902954102, + "step": 2550 + }, + { + "epoch": 0.87, + "learning_rate": 3.9443535188216036e-07, + "logits/chosen": 0.9137738347053528, + "logits/rejected": 2.731851577758789, + "logps/chosen": -320.07855224609375, + "logps/rejected": -649.6725463867188, + "loss": 0.0198, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.17970609664917, + "rewards/margins": 12.163008689880371, + "rewards/rejected": -9.98330307006836, + "step": 2560 + }, + { + "epoch": 0.87, + "learning_rate": 3.938058668009568e-07, + "logits/chosen": 1.3828331232070923, + "logits/rejected": 2.774026870727539, + "logps/chosen": -388.3182067871094, + "logps/rejected": -615.1720581054688, + "loss": 0.0193, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6940110921859741, + "rewards/margins": 11.10916519165039, + "rewards/rejected": -9.415154457092285, + "step": 2570 + }, + { + "epoch": 0.88, + "learning_rate": 3.931763817197532e-07, + "logits/chosen": 1.4449328184127808, + "logits/rejected": 3.2123007774353027, + "logps/chosen": -364.09442138671875, + "logps/rejected": -411.33203125, + "loss": 0.0275, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.192070960998535, + "rewards/margins": 10.379565238952637, + "rewards/rejected": -8.187494277954102, + "step": 2580 + }, + { + "epoch": 0.88, + "learning_rate": 3.925468966385497e-07, + "logits/chosen": 1.5864359140396118, + "logits/rejected": 2.989715337753296, + "logps/chosen": -428.8077087402344, + "logps/rejected": -388.0070495605469, + "loss": 0.0184, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4646167755126953, + "rewards/margins": 11.220399856567383, + "rewards/rejected": -8.755781173706055, + "step": 2590 + }, + { + "epoch": 0.88, + "learning_rate": 3.919174115573461e-07, + "logits/chosen": 1.2315651178359985, + "logits/rejected": 2.7592966556549072, + "logps/chosen": -322.261962890625, + "logps/rejected": -582.4110717773438, + "loss": 0.0579, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.216446876525879, + "rewards/margins": 12.121309280395508, + "rewards/rejected": -9.904863357543945, + "step": 2600 + }, + { + "epoch": 0.88, + "eval_logits/chosen": 0.7932816743850708, + "eval_logits/rejected": 2.929717540740967, + "eval_logps/chosen": -368.6217346191406, + "eval_logps/rejected": -596.0587158203125, + "eval_loss": 0.019284222275018692, + "eval_rewards/accuracies": 0.9932659864425659, + "eval_rewards/chosen": 2.036813259124756, + "eval_rewards/margins": 11.379647254943848, + "eval_rewards/rejected": -9.34283447265625, + "eval_runtime": 268.0206, + "eval_samples_per_second": 35.445, + "eval_steps_per_second": 1.108, + "step": 2600 + }, + { + "epoch": 0.89, + "learning_rate": 3.912879264761425e-07, + "logits/chosen": 0.6048256158828735, + "logits/rejected": 2.5253026485443115, + "logps/chosen": -369.83587646484375, + "logps/rejected": -640.2315673828125, + "loss": 0.0182, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.063535690307617, + "rewards/margins": 10.798006057739258, + "rewards/rejected": -8.73447036743164, + "step": 2610 + }, + { + "epoch": 0.89, + "learning_rate": 3.906584413949389e-07, + "logits/chosen": 1.3932554721832275, + "logits/rejected": 2.831205368041992, + "logps/chosen": -462.0, + "logps/rejected": -515.1595458984375, + "loss": 0.0203, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8407312631607056, + "rewards/margins": 9.663253784179688, + "rewards/rejected": -7.8225226402282715, + "step": 2620 + }, + { + "epoch": 0.89, + "learning_rate": 3.900289563137353e-07, + "logits/chosen": 1.561541199684143, + "logits/rejected": 2.811318874359131, + "logps/chosen": -347.90399169921875, + "logps/rejected": -610.0369873046875, + "loss": 0.0146, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.943642020225525, + "rewards/margins": 12.20551586151123, + "rewards/rejected": -10.261874198913574, + "step": 2630 + }, + { + "epoch": 0.9, + "learning_rate": 3.8939947123253174e-07, + "logits/chosen": 1.2890950441360474, + "logits/rejected": 3.1064820289611816, + "logps/chosen": -348.0389709472656, + "logps/rejected": -499.8296813964844, + "loss": 0.0154, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4777417182922363, + "rewards/margins": 9.751089096069336, + "rewards/rejected": -8.273346900939941, + "step": 2640 + }, + { + "epoch": 0.9, + "learning_rate": 3.887699861513282e-07, + "logits/chosen": 1.138880729675293, + "logits/rejected": 3.108267068862915, + "logps/chosen": -288.1042785644531, + "logps/rejected": -444.10870361328125, + "loss": 0.0128, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9868762493133545, + "rewards/margins": 11.673996925354004, + "rewards/rejected": -9.687118530273438, + "step": 2650 + }, + { + "epoch": 0.9, + "learning_rate": 3.8814050107012464e-07, + "logits/chosen": 1.364803671836853, + "logits/rejected": 3.290356159210205, + "logps/chosen": -384.6515197753906, + "logps/rejected": -439.424072265625, + "loss": 0.0158, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.177926540374756, + "rewards/margins": 12.535216331481934, + "rewards/rejected": -10.357290267944336, + "step": 2660 + }, + { + "epoch": 0.91, + "learning_rate": 3.8751101598892106e-07, + "logits/chosen": 0.7594862580299377, + "logits/rejected": 2.463059186935425, + "logps/chosen": -334.9971008300781, + "logps/rejected": -688.7122192382812, + "loss": 0.0146, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.329500436782837, + "rewards/margins": 11.81839656829834, + "rewards/rejected": -9.488895416259766, + "step": 2670 + }, + { + "epoch": 0.91, + "learning_rate": 3.868815309077175e-07, + "logits/chosen": 1.1251529455184937, + "logits/rejected": 2.556021213531494, + "logps/chosen": -383.8106994628906, + "logps/rejected": -728.5634765625, + "loss": 0.0152, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8957366943359375, + "rewards/margins": 11.471336364746094, + "rewards/rejected": -9.575600624084473, + "step": 2680 + }, + { + "epoch": 0.91, + "learning_rate": 3.862520458265139e-07, + "logits/chosen": 1.1408156156539917, + "logits/rejected": 2.342296838760376, + "logps/chosen": -417.59320068359375, + "logps/rejected": -750.766357421875, + "loss": 0.011, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7239834070205688, + "rewards/margins": 12.186110496520996, + "rewards/rejected": -10.462127685546875, + "step": 2690 + }, + { + "epoch": 0.92, + "learning_rate": 3.856225607453103e-07, + "logits/chosen": 0.2998413145542145, + "logits/rejected": 3.279633045196533, + "logps/chosen": -267.75390625, + "logps/rejected": -492.7950134277344, + "loss": 0.0163, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8813514709472656, + "rewards/margins": 11.568644523620605, + "rewards/rejected": -9.687294006347656, + "step": 2700 + }, + { + "epoch": 0.92, + "eval_logits/chosen": 0.7627749443054199, + "eval_logits/rejected": 2.911400079727173, + "eval_logps/chosen": -369.9328308105469, + "eval_logps/rejected": -597.5867309570312, + "eval_loss": 0.017963021993637085, + "eval_rewards/accuracies": 0.994107723236084, + "eval_rewards/chosen": 1.9057058095932007, + "eval_rewards/margins": 11.401346206665039, + "eval_rewards/rejected": -9.49563980102539, + "eval_runtime": 268.4558, + "eval_samples_per_second": 35.388, + "eval_steps_per_second": 1.106, + "step": 2700 + }, + { + "epoch": 0.92, + "learning_rate": 3.8499307566410675e-07, + "logits/chosen": 1.146585464477539, + "logits/rejected": 2.8043265342712402, + "logps/chosen": -323.42303466796875, + "logps/rejected": -566.1694946289062, + "loss": 0.0192, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.6829410791397095, + "rewards/margins": 12.46528148651123, + "rewards/rejected": -10.782341003417969, + "step": 2710 + }, + { + "epoch": 0.92, + "learning_rate": 3.843635905829032e-07, + "logits/chosen": 0.9065067172050476, + "logits/rejected": 2.6599488258361816, + "logps/chosen": -466.5685119628906, + "logps/rejected": -674.6013793945312, + "loss": 0.0118, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.813049077987671, + "rewards/margins": 10.931371688842773, + "rewards/rejected": -9.118322372436523, + "step": 2720 + }, + { + "epoch": 0.93, + "learning_rate": 3.837341055016996e-07, + "logits/chosen": 1.059467077255249, + "logits/rejected": 2.8486685752868652, + "logps/chosen": -314.97503662109375, + "logps/rejected": -662.3733520507812, + "loss": 0.0159, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9159355163574219, + "rewards/margins": 10.187853813171387, + "rewards/rejected": -8.271917343139648, + "step": 2730 + }, + { + "epoch": 0.93, + "learning_rate": 3.83104620420496e-07, + "logits/chosen": 0.7870198488235474, + "logits/rejected": 2.1669564247131348, + "logps/chosen": -379.6060485839844, + "logps/rejected": -886.0685424804688, + "loss": 0.0108, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5106027126312256, + "rewards/margins": 10.739408493041992, + "rewards/rejected": -9.228806495666504, + "step": 2740 + }, + { + "epoch": 0.93, + "learning_rate": 3.8247513533929244e-07, + "logits/chosen": 0.8939107060432434, + "logits/rejected": 2.5925869941711426, + "logps/chosen": -322.8708190917969, + "logps/rejected": -716.4050903320312, + "loss": 0.058, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.69145929813385, + "rewards/margins": 11.752408981323242, + "rewards/rejected": -10.060951232910156, + "step": 2750 + }, + { + "epoch": 0.94, + "learning_rate": 3.8184565025808887e-07, + "logits/chosen": 1.5585074424743652, + "logits/rejected": 2.8523619174957275, + "logps/chosen": -417.64892578125, + "logps/rejected": -644.0786743164062, + "loss": 0.0207, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.8149226903915405, + "rewards/margins": 11.294268608093262, + "rewards/rejected": -9.47934627532959, + "step": 2760 + }, + { + "epoch": 0.94, + "learning_rate": 3.8121616517688534e-07, + "logits/chosen": 0.7556962370872498, + "logits/rejected": 2.8186545372009277, + "logps/chosen": -309.7710266113281, + "logps/rejected": -595.9371337890625, + "loss": 0.0165, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.438164234161377, + "rewards/margins": 11.598024368286133, + "rewards/rejected": -9.159860610961914, + "step": 2770 + }, + { + "epoch": 0.94, + "learning_rate": 3.805866800956817e-07, + "logits/chosen": 1.1116828918457031, + "logits/rejected": 3.601109027862549, + "logps/chosen": -313.4578552246094, + "logps/rejected": -368.9190368652344, + "loss": 0.0204, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.6321697235107422, + "rewards/margins": 11.596505165100098, + "rewards/rejected": -9.964335441589355, + "step": 2780 + }, + { + "epoch": 0.95, + "learning_rate": 3.7995719501447813e-07, + "logits/chosen": 1.1926578283309937, + "logits/rejected": 2.93463134765625, + "logps/chosen": -332.8338928222656, + "logps/rejected": -521.95361328125, + "loss": 0.0112, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0567336082458496, + "rewards/margins": 12.912908554077148, + "rewards/rejected": -10.856175422668457, + "step": 2790 + }, + { + "epoch": 0.95, + "learning_rate": 3.7932770993327456e-07, + "logits/chosen": 1.2860372066497803, + "logits/rejected": 2.5439960956573486, + "logps/chosen": -330.3837890625, + "logps/rejected": -797.3041381835938, + "loss": 0.019, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8657745122909546, + "rewards/margins": 11.384241104125977, + "rewards/rejected": -9.51846694946289, + "step": 2800 + }, + { + "epoch": 0.95, + "eval_logits/chosen": 0.7736470699310303, + "eval_logits/rejected": 2.922307014465332, + "eval_logps/chosen": -369.0752258300781, + "eval_logps/rejected": -596.8948974609375, + "eval_loss": 0.019366171211004257, + "eval_rewards/accuracies": 0.9932659864425659, + "eval_rewards/chosen": 1.9914653301239014, + "eval_rewards/margins": 11.417922973632812, + "eval_rewards/rejected": -9.426457405090332, + "eval_runtime": 268.0209, + "eval_samples_per_second": 35.445, + "eval_steps_per_second": 1.108, + "step": 2800 + }, + { + "epoch": 0.96, + "learning_rate": 3.78698224852071e-07, + "logits/chosen": 1.0611159801483154, + "logits/rejected": 3.3828582763671875, + "logps/chosen": -308.9765319824219, + "logps/rejected": -434.8480529785156, + "loss": 0.0237, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.962587594985962, + "rewards/margins": 11.504419326782227, + "rewards/rejected": -9.54183292388916, + "step": 2810 + }, + { + "epoch": 0.96, + "learning_rate": 3.780687397708674e-07, + "logits/chosen": 1.3641759157180786, + "logits/rejected": 3.2257022857666016, + "logps/chosen": -328.934814453125, + "logps/rejected": -443.7102966308594, + "loss": 0.0107, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9324061870574951, + "rewards/margins": 10.669090270996094, + "rewards/rejected": -8.736684799194336, + "step": 2820 + }, + { + "epoch": 0.96, + "learning_rate": 3.774392546896638e-07, + "logits/chosen": 1.0812848806381226, + "logits/rejected": 2.389310598373413, + "logps/chosen": -402.924072265625, + "logps/rejected": -687.3925170898438, + "loss": 0.0157, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1839425563812256, + "rewards/margins": 10.521379470825195, + "rewards/rejected": -8.33743667602539, + "step": 2830 + }, + { + "epoch": 0.97, + "learning_rate": 3.768097696084603e-07, + "logits/chosen": 1.0410970449447632, + "logits/rejected": 2.740626335144043, + "logps/chosen": -360.38861083984375, + "logps/rejected": -626.6319580078125, + "loss": 0.021, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.4108242988586426, + "rewards/margins": 13.338290214538574, + "rewards/rejected": -10.92746639251709, + "step": 2840 + }, + { + "epoch": 0.97, + "learning_rate": 3.761802845272567e-07, + "logits/chosen": 0.6179854273796082, + "logits/rejected": 2.8828322887420654, + "logps/chosen": -372.5259094238281, + "logps/rejected": -498.89093017578125, + "loss": 0.0216, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.887270212173462, + "rewards/margins": 11.659965515136719, + "rewards/rejected": -9.772695541381836, + "step": 2850 + }, + { + "epoch": 0.97, + "learning_rate": 3.755507994460531e-07, + "logits/chosen": 1.0825567245483398, + "logits/rejected": 3.1808676719665527, + "logps/chosen": -344.6096496582031, + "logps/rejected": -501.6756896972656, + "loss": 0.0148, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.016951084136963, + "rewards/margins": 11.96239185333252, + "rewards/rejected": -9.945440292358398, + "step": 2860 + }, + { + "epoch": 0.98, + "learning_rate": 3.749213143648495e-07, + "logits/chosen": 0.966810405254364, + "logits/rejected": 2.5417160987854004, + "logps/chosen": -432.48321533203125, + "logps/rejected": -644.724365234375, + "loss": 0.0302, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.2609174251556396, + "rewards/margins": 10.42978286743164, + "rewards/rejected": -8.168864250183105, + "step": 2870 + }, + { + "epoch": 0.98, + "learning_rate": 3.7429182928364594e-07, + "logits/chosen": 0.7974327206611633, + "logits/rejected": 2.231346368789673, + "logps/chosen": -384.548828125, + "logps/rejected": -807.3264770507812, + "loss": 0.0185, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.849522352218628, + "rewards/margins": 10.164338111877441, + "rewards/rejected": -8.314815521240234, + "step": 2880 + }, + { + "epoch": 0.98, + "learning_rate": 3.7366234420244236e-07, + "logits/chosen": 1.5715937614440918, + "logits/rejected": 2.6558785438537598, + "logps/chosen": -506.68157958984375, + "logps/rejected": -646.1708374023438, + "loss": 0.0135, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9481022357940674, + "rewards/margins": 11.122952461242676, + "rewards/rejected": -9.174850463867188, + "step": 2890 + }, + { + "epoch": 0.99, + "learning_rate": 3.7303285912123884e-07, + "logits/chosen": 1.315989375114441, + "logits/rejected": 2.911738157272339, + "logps/chosen": -338.9214782714844, + "logps/rejected": -540.4888916015625, + "loss": 0.0166, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.19695782661438, + "rewards/margins": 11.435901641845703, + "rewards/rejected": -9.238944053649902, + "step": 2900 + }, + { + "epoch": 0.99, + "eval_logits/chosen": 0.759222686290741, + "eval_logits/rejected": 2.918637275695801, + "eval_logps/chosen": -368.2200927734375, + "eval_logps/rejected": -594.5847778320312, + "eval_loss": 0.0181864183396101, + "eval_rewards/accuracies": 0.9957912564277649, + "eval_rewards/chosen": 2.0769829750061035, + "eval_rewards/margins": 11.272428512573242, + "eval_rewards/rejected": -9.195446014404297, + "eval_runtime": 267.5725, + "eval_samples_per_second": 35.504, + "eval_steps_per_second": 1.11, + "step": 2900 + }, + { + "epoch": 0.99, + "learning_rate": 3.7240337404003526e-07, + "logits/chosen": 1.4009299278259277, + "logits/rejected": 2.185304880142212, + "logps/chosen": -324.7784729003906, + "logps/rejected": -688.9271240234375, + "loss": 0.0132, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8561160564422607, + "rewards/margins": 10.968622207641602, + "rewards/rejected": -9.112505912780762, + "step": 2910 + }, + { + "epoch": 0.99, + "learning_rate": 3.717738889588317e-07, + "logits/chosen": 1.8506414890289307, + "logits/rejected": 2.459937572479248, + "logps/chosen": -335.97039794921875, + "logps/rejected": -724.4388427734375, + "loss": 0.0144, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7944343090057373, + "rewards/margins": 11.212178230285645, + "rewards/rejected": -9.417744636535645, + "step": 2920 + }, + { + "epoch": 1.0, + "learning_rate": 3.7114440387762805e-07, + "logits/chosen": 1.602473497390747, + "logits/rejected": 3.060894727706909, + "logps/chosen": -315.49273681640625, + "logps/rejected": -431.2628479003906, + "loss": 0.0145, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.9308958053588867, + "rewards/margins": 11.219779968261719, + "rewards/rejected": -9.288885116577148, + "step": 2930 + }, + { + "epoch": 1.0, + "learning_rate": 3.705149187964245e-07, + "logits/chosen": 1.5184214115142822, + "logits/rejected": 2.9138710498809814, + "logps/chosen": -386.93157958984375, + "logps/rejected": -530.5115966796875, + "loss": 0.0155, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.803008794784546, + "rewards/margins": 11.285566329956055, + "rewards/rejected": -9.48255729675293, + "step": 2940 + }, + { + "epoch": 1.0, + "learning_rate": 3.698854337152209e-07, + "logits/chosen": 1.690629005432129, + "logits/rejected": 3.274028778076172, + "logps/chosen": -315.9340515136719, + "logps/rejected": -507.6627502441406, + "loss": 0.0136, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0165982246398926, + "rewards/margins": 11.94556999206543, + "rewards/rejected": -9.928972244262695, + "step": 2950 + }, + { + "epoch": 1.01, + "learning_rate": 3.692559486340174e-07, + "logits/chosen": 0.7777345180511475, + "logits/rejected": 2.230090856552124, + "logps/chosen": -353.92840576171875, + "logps/rejected": -908.7879028320312, + "loss": 0.0129, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9050586223602295, + "rewards/margins": 12.09577465057373, + "rewards/rejected": -10.190717697143555, + "step": 2960 + }, + { + "epoch": 1.01, + "learning_rate": 3.686264635528138e-07, + "logits/chosen": 0.5998127460479736, + "logits/rejected": 3.0432848930358887, + "logps/chosen": -305.62451171875, + "logps/rejected": -328.70220947265625, + "loss": 0.0162, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7932469844818115, + "rewards/margins": 10.898086547851562, + "rewards/rejected": -9.104839324951172, + "step": 2970 + }, + { + "epoch": 1.01, + "learning_rate": 3.679969784716102e-07, + "logits/chosen": 0.6457049250602722, + "logits/rejected": 3.008762836456299, + "logps/chosen": -289.96527099609375, + "logps/rejected": -502.66107177734375, + "loss": 0.0164, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8909308910369873, + "rewards/margins": 11.495240211486816, + "rewards/rejected": -9.604308128356934, + "step": 2980 + }, + { + "epoch": 1.02, + "learning_rate": 3.6736749339040664e-07, + "logits/chosen": 1.1077024936676025, + "logits/rejected": 2.665431499481201, + "logps/chosen": -381.0195617675781, + "logps/rejected": -638.0323486328125, + "loss": 0.0167, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.257869005203247, + "rewards/margins": 12.202370643615723, + "rewards/rejected": -9.944503784179688, + "step": 2990 + }, + { + "epoch": 1.02, + "learning_rate": 3.6673800830920307e-07, + "logits/chosen": 1.476149320602417, + "logits/rejected": 2.7508435249328613, + "logps/chosen": -362.0536804199219, + "logps/rejected": -539.8151245117188, + "loss": 0.0121, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.177727222442627, + "rewards/margins": 10.220754623413086, + "rewards/rejected": -8.0430269241333, + "step": 3000 + }, + { + "epoch": 1.02, + "eval_logits/chosen": 0.7556570768356323, + "eval_logits/rejected": 2.895693063735962, + "eval_logps/chosen": -369.8956604003906, + "eval_logps/rejected": -597.5946655273438, + "eval_loss": 0.017979048192501068, + "eval_rewards/accuracies": 0.994107723236084, + "eval_rewards/chosen": 1.909419298171997, + "eval_rewards/margins": 11.405853271484375, + "eval_rewards/rejected": -9.496432304382324, + "eval_runtime": 268.1886, + "eval_samples_per_second": 35.423, + "eval_steps_per_second": 1.107, + "step": 3000 + }, + { + "epoch": 1.02, + "learning_rate": 3.6610852322799943e-07, + "logits/chosen": 0.7976253628730774, + "logits/rejected": 2.7230780124664307, + "logps/chosen": -431.26348876953125, + "logps/rejected": -602.6639404296875, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.011127471923828, + "rewards/margins": 11.370102882385254, + "rewards/rejected": -9.358975410461426, + "step": 3010 + }, + { + "epoch": 1.03, + "learning_rate": 3.654790381467959e-07, + "logits/chosen": 1.573567271232605, + "logits/rejected": 3.549926280975342, + "logps/chosen": -337.05633544921875, + "logps/rejected": -439.34820556640625, + "loss": 0.0138, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0311856269836426, + "rewards/margins": 12.150315284729004, + "rewards/rejected": -10.11913013458252, + "step": 3020 + }, + { + "epoch": 1.03, + "learning_rate": 3.6484955306559233e-07, + "logits/chosen": 1.0998715162277222, + "logits/rejected": 2.7142605781555176, + "logps/chosen": -445.3197326660156, + "logps/rejected": -485.9790954589844, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2856061458587646, + "rewards/margins": 11.167757034301758, + "rewards/rejected": -8.882150650024414, + "step": 3030 + }, + { + "epoch": 1.03, + "learning_rate": 3.6422006798438876e-07, + "logits/chosen": 0.7898514270782471, + "logits/rejected": 2.9154164791107178, + "logps/chosen": -340.6654357910156, + "logps/rejected": -562.8787841796875, + "loss": 0.0216, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.05285906791687, + "rewards/margins": 11.81180477142334, + "rewards/rejected": -9.758944511413574, + "step": 3040 + }, + { + "epoch": 1.04, + "learning_rate": 3.635905829031852e-07, + "logits/chosen": 0.8585756421089172, + "logits/rejected": 3.1079821586608887, + "logps/chosen": -429.490478515625, + "logps/rejected": -481.5127868652344, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7858238220214844, + "rewards/margins": 11.667909622192383, + "rewards/rejected": -9.882083892822266, + "step": 3050 + }, + { + "epoch": 1.04, + "learning_rate": 3.629610978219816e-07, + "logits/chosen": 1.2229456901550293, + "logits/rejected": 2.210904836654663, + "logps/chosen": -349.9075012207031, + "logps/rejected": -760.1993408203125, + "loss": 0.0121, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1999588012695312, + "rewards/margins": 11.409255981445312, + "rewards/rejected": -9.209296226501465, + "step": 3060 + }, + { + "epoch": 1.04, + "learning_rate": 3.62331612740778e-07, + "logits/chosen": 1.2499350309371948, + "logits/rejected": 2.8214731216430664, + "logps/chosen": -432.19989013671875, + "logps/rejected": -587.6690673828125, + "loss": 0.011, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0051960945129395, + "rewards/margins": 12.072220802307129, + "rewards/rejected": -10.067024230957031, + "step": 3070 + }, + { + "epoch": 1.05, + "learning_rate": 3.617021276595745e-07, + "logits/chosen": 1.251936912536621, + "logits/rejected": 2.393967390060425, + "logps/chosen": -538.280029296875, + "logps/rejected": -499.31427001953125, + "loss": 0.017, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7456916570663452, + "rewards/margins": 10.943967819213867, + "rewards/rejected": -9.198275566101074, + "step": 3080 + }, + { + "epoch": 1.05, + "learning_rate": 3.6107264257837087e-07, + "logits/chosen": 0.43956202268600464, + "logits/rejected": 2.4506821632385254, + "logps/chosen": -369.17193603515625, + "logps/rejected": -669.3594970703125, + "loss": 0.0104, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.8963171243667603, + "rewards/margins": 11.685440063476562, + "rewards/rejected": -9.78912353515625, + "step": 3090 + }, + { + "epoch": 1.05, + "learning_rate": 3.604431574971673e-07, + "logits/chosen": 1.2288469076156616, + "logits/rejected": 2.752350091934204, + "logps/chosen": -352.03094482421875, + "logps/rejected": -578.940673828125, + "loss": 0.011, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.6200469732284546, + "rewards/margins": 10.573201179504395, + "rewards/rejected": -8.953152656555176, + "step": 3100 + }, + { + "epoch": 1.05, + "eval_logits/chosen": 0.7294398546218872, + "eval_logits/rejected": 2.855963945388794, + "eval_logps/chosen": -368.9812316894531, + "eval_logps/rejected": -601.975830078125, + "eval_loss": 0.015040040947496891, + "eval_rewards/accuracies": 0.996632993221283, + "eval_rewards/chosen": 2.0008652210235596, + "eval_rewards/margins": 11.935413360595703, + "eval_rewards/rejected": -9.934547424316406, + "eval_runtime": 267.2447, + "eval_samples_per_second": 35.548, + "eval_steps_per_second": 1.111, + "step": 3100 + }, + { + "epoch": 1.06, + "learning_rate": 3.598136724159637e-07, + "logits/chosen": 1.2908390760421753, + "logits/rejected": 3.2633769512176514, + "logps/chosen": -426.2572326660156, + "logps/rejected": -449.2496032714844, + "loss": 0.0117, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.091630458831787, + "rewards/margins": 12.465272903442383, + "rewards/rejected": -10.37364387512207, + "step": 3110 + }, + { + "epoch": 1.06, + "learning_rate": 3.5918418733476014e-07, + "logits/chosen": 1.0254614353179932, + "logits/rejected": 2.644998788833618, + "logps/chosen": -444.62115478515625, + "logps/rejected": -557.4613037109375, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.416001081466675, + "rewards/margins": 13.090082168579102, + "rewards/rejected": -10.674080848693848, + "step": 3120 + }, + { + "epoch": 1.06, + "learning_rate": 3.5855470225355656e-07, + "logits/chosen": 0.3688656985759735, + "logits/rejected": 2.787039279937744, + "logps/chosen": -371.48199462890625, + "logps/rejected": -606.9403076171875, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.235377788543701, + "rewards/margins": 11.422839164733887, + "rewards/rejected": -9.187460899353027, + "step": 3130 + }, + { + "epoch": 1.07, + "learning_rate": 3.5792521717235304e-07, + "logits/chosen": 1.1069527864456177, + "logits/rejected": 2.7388956546783447, + "logps/chosen": -383.25970458984375, + "logps/rejected": -508.56884765625, + "loss": 0.0196, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.3463032245635986, + "rewards/margins": 13.309300422668457, + "rewards/rejected": -10.962995529174805, + "step": 3140 + }, + { + "epoch": 1.07, + "learning_rate": 3.5729573209114946e-07, + "logits/chosen": 1.341090202331543, + "logits/rejected": 3.0517337322235107, + "logps/chosen": -318.0455627441406, + "logps/rejected": -569.5386962890625, + "loss": 0.0121, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.2804172039031982, + "rewards/margins": 10.993358612060547, + "rewards/rejected": -8.71294116973877, + "step": 3150 + }, + { + "epoch": 1.07, + "learning_rate": 3.5666624700994583e-07, + "logits/chosen": 0.7650747299194336, + "logits/rejected": 2.015530824661255, + "logps/chosen": -323.05804443359375, + "logps/rejected": -855.6818237304688, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.203866481781006, + "rewards/margins": 12.014130592346191, + "rewards/rejected": -9.810262680053711, + "step": 3160 + }, + { + "epoch": 1.08, + "learning_rate": 3.5603676192874225e-07, + "logits/chosen": 0.6404491066932678, + "logits/rejected": 2.8913769721984863, + "logps/chosen": -307.3809509277344, + "logps/rejected": -477.2265625, + "loss": 0.0178, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.644552230834961, + "rewards/margins": 11.691588401794434, + "rewards/rejected": -9.047036170959473, + "step": 3170 + }, + { + "epoch": 1.08, + "learning_rate": 3.554072768475387e-07, + "logits/chosen": 1.2438385486602783, + "logits/rejected": 2.9186463356018066, + "logps/chosen": -448.6307067871094, + "logps/rejected": -547.8207397460938, + "loss": 0.0116, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0710911750793457, + "rewards/margins": 10.842884063720703, + "rewards/rejected": -8.7717924118042, + "step": 3180 + }, + { + "epoch": 1.08, + "learning_rate": 3.547777917663351e-07, + "logits/chosen": 0.6184150576591492, + "logits/rejected": 2.433743953704834, + "logps/chosen": -344.1460876464844, + "logps/rejected": -703.8804931640625, + "loss": 0.0122, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.277899742126465, + "rewards/margins": 13.781367301940918, + "rewards/rejected": -11.503466606140137, + "step": 3190 + }, + { + "epoch": 1.09, + "learning_rate": 3.5414830668513157e-07, + "logits/chosen": 0.9604349136352539, + "logits/rejected": 2.8598134517669678, + "logps/chosen": -297.17474365234375, + "logps/rejected": -472.0577697753906, + "loss": 0.0106, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.357182025909424, + "rewards/margins": 12.045616149902344, + "rewards/rejected": -9.688433647155762, + "step": 3200 + }, + { + "epoch": 1.09, + "eval_logits/chosen": 0.7070604562759399, + "eval_logits/rejected": 2.8564767837524414, + "eval_logps/chosen": -368.12896728515625, + "eval_logps/rejected": -598.782958984375, + "eval_loss": 0.01390204019844532, + "eval_rewards/accuracies": 0.996632993221283, + "eval_rewards/chosen": 2.0860908031463623, + "eval_rewards/margins": 11.701354026794434, + "eval_rewards/rejected": -9.615262985229492, + "eval_runtime": 267.9523, + "eval_samples_per_second": 35.454, + "eval_steps_per_second": 1.108, + "step": 3200 + }, + { + "epoch": 1.09, + "learning_rate": 3.53518821603928e-07, + "logits/chosen": 1.42811918258667, + "logits/rejected": 3.1818394660949707, + "logps/chosen": -312.5201110839844, + "logps/rejected": -479.3251953125, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7172324657440186, + "rewards/margins": 11.587867736816406, + "rewards/rejected": -9.870635986328125, + "step": 3210 + }, + { + "epoch": 1.09, + "learning_rate": 3.528893365227244e-07, + "logits/chosen": 1.412206768989563, + "logits/rejected": 2.872518301010132, + "logps/chosen": -452.25439453125, + "logps/rejected": -519.6819458007812, + "loss": 0.0085, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7541221380233765, + "rewards/margins": 11.772988319396973, + "rewards/rejected": -10.018865585327148, + "step": 3220 + }, + { + "epoch": 1.1, + "learning_rate": 3.5225985144152084e-07, + "logits/chosen": 0.8214865922927856, + "logits/rejected": 3.258615493774414, + "logps/chosen": -339.0177307128906, + "logps/rejected": -487.43475341796875, + "loss": 0.0173, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4326157569885254, + "rewards/margins": 13.359159469604492, + "rewards/rejected": -10.926544189453125, + "step": 3230 + }, + { + "epoch": 1.1, + "learning_rate": 3.516303663603172e-07, + "logits/chosen": 1.7511094808578491, + "logits/rejected": 3.0158190727233887, + "logps/chosen": -378.07861328125, + "logps/rejected": -562.0789794921875, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.06204891204834, + "rewards/margins": 11.8056001663208, + "rewards/rejected": -9.743552207946777, + "step": 3240 + }, + { + "epoch": 1.1, + "learning_rate": 3.5100088127911363e-07, + "logits/chosen": 0.7212003469467163, + "logits/rejected": 3.1654887199401855, + "logps/chosen": -399.91644287109375, + "logps/rejected": -462.4659118652344, + "loss": 0.009, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0864148139953613, + "rewards/margins": 13.867498397827148, + "rewards/rejected": -11.781085014343262, + "step": 3250 + }, + { + "epoch": 1.11, + "learning_rate": 3.503713961979101e-07, + "logits/chosen": 1.574752688407898, + "logits/rejected": 2.749525547027588, + "logps/chosen": -357.22637939453125, + "logps/rejected": -579.518310546875, + "loss": 0.01, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.607234001159668, + "rewards/margins": 11.814053535461426, + "rewards/rejected": -10.206819534301758, + "step": 3260 + }, + { + "epoch": 1.11, + "learning_rate": 3.4974191111670653e-07, + "logits/chosen": 1.2592524290084839, + "logits/rejected": 2.282480239868164, + "logps/chosen": -393.34625244140625, + "logps/rejected": -747.9046020507812, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0157406330108643, + "rewards/margins": 11.675127029418945, + "rewards/rejected": -9.65938663482666, + "step": 3270 + }, + { + "epoch": 1.11, + "learning_rate": 3.4911242603550296e-07, + "logits/chosen": 1.7345688343048096, + "logits/rejected": 2.5775959491729736, + "logps/chosen": -336.7502136230469, + "logps/rejected": -515.5429077148438, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4989908933639526, + "rewards/margins": 10.366995811462402, + "rewards/rejected": -8.868005752563477, + "step": 3280 + }, + { + "epoch": 1.12, + "learning_rate": 3.484829409542994e-07, + "logits/chosen": 1.371095061302185, + "logits/rejected": 2.897758960723877, + "logps/chosen": -422.10595703125, + "logps/rejected": -568.4336547851562, + "loss": 0.0094, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2941784858703613, + "rewards/margins": 13.477294921875, + "rewards/rejected": -11.183117866516113, + "step": 3290 + }, + { + "epoch": 1.12, + "learning_rate": 3.478534558730958e-07, + "logits/chosen": 1.2534377574920654, + "logits/rejected": 2.912048816680908, + "logps/chosen": -365.91363525390625, + "logps/rejected": -521.9572143554688, + "loss": 0.0095, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7988227605819702, + "rewards/margins": 11.863677024841309, + "rewards/rejected": -10.064854621887207, + "step": 3300 + }, + { + "epoch": 1.12, + "eval_logits/chosen": 0.7083035111427307, + "eval_logits/rejected": 2.828965902328491, + "eval_logps/chosen": -369.2344055175781, + "eval_logps/rejected": -606.5661010742188, + "eval_loss": 0.013400154188275337, + "eval_rewards/accuracies": 0.9957912564277649, + "eval_rewards/chosen": 1.9755483865737915, + "eval_rewards/margins": 12.369123458862305, + "eval_rewards/rejected": -10.393575668334961, + "eval_runtime": 268.3764, + "eval_samples_per_second": 35.398, + "eval_steps_per_second": 1.107, + "step": 3300 + }, + { + "epoch": 1.13, + "learning_rate": 3.4722397079189217e-07, + "logits/chosen": 0.8336647748947144, + "logits/rejected": 2.756704568862915, + "logps/chosen": -324.1752624511719, + "logps/rejected": -625.7420654296875, + "loss": 0.015, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.8257815837860107, + "rewards/margins": 12.731443405151367, + "rewards/rejected": -10.905664443969727, + "step": 3310 + }, + { + "epoch": 1.13, + "learning_rate": 3.4659448571068865e-07, + "logits/chosen": 0.3902451694011688, + "logits/rejected": 2.532710552215576, + "logps/chosen": -473.7185974121094, + "logps/rejected": -582.3638305664062, + "loss": 0.0125, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.6804431676864624, + "rewards/margins": 12.236088752746582, + "rewards/rejected": -10.555645942687988, + "step": 3320 + }, + { + "epoch": 1.13, + "learning_rate": 3.4596500062948507e-07, + "logits/chosen": 0.8588132858276367, + "logits/rejected": 2.571681261062622, + "logps/chosen": -399.33587646484375, + "logps/rejected": -604.447265625, + "loss": 0.0114, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3679261207580566, + "rewards/margins": 13.345097541809082, + "rewards/rejected": -10.97716999053955, + "step": 3330 + }, + { + "epoch": 1.14, + "learning_rate": 3.453355155482815e-07, + "logits/chosen": 0.8485054969787598, + "logits/rejected": 2.0267395973205566, + "logps/chosen": -366.140869140625, + "logps/rejected": -839.7205200195312, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0398154258728027, + "rewards/margins": 12.325020790100098, + "rewards/rejected": -10.28520393371582, + "step": 3340 + }, + { + "epoch": 1.14, + "learning_rate": 3.447060304670779e-07, + "logits/chosen": 0.7669464349746704, + "logits/rejected": 2.867802143096924, + "logps/chosen": -337.73590087890625, + "logps/rejected": -584.9771728515625, + "loss": 0.0131, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1630008220672607, + "rewards/margins": 13.043685913085938, + "rewards/rejected": -10.880684852600098, + "step": 3350 + }, + { + "epoch": 1.14, + "learning_rate": 3.4407654538587434e-07, + "logits/chosen": 0.9161348342895508, + "logits/rejected": 2.3917458057403564, + "logps/chosen": -380.1439514160156, + "logps/rejected": -691.7762451171875, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.82815420627594, + "rewards/margins": 11.48265552520752, + "rewards/rejected": -9.654500961303711, + "step": 3360 + }, + { + "epoch": 1.15, + "learning_rate": 3.4344706030467076e-07, + "logits/chosen": 0.9669869542121887, + "logits/rejected": 2.661142349243164, + "logps/chosen": -324.63262939453125, + "logps/rejected": -747.2676391601562, + "loss": 0.0104, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.553868055343628, + "rewards/margins": 11.775548934936523, + "rewards/rejected": -10.221680641174316, + "step": 3370 + }, + { + "epoch": 1.15, + "learning_rate": 3.4281757522346724e-07, + "logits/chosen": 1.1901271343231201, + "logits/rejected": 2.964195966720581, + "logps/chosen": -433.86602783203125, + "logps/rejected": -518.0977783203125, + "loss": 0.018, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0861451625823975, + "rewards/margins": 12.089399337768555, + "rewards/rejected": -10.003252029418945, + "step": 3380 + }, + { + "epoch": 1.15, + "learning_rate": 3.421880901422636e-07, + "logits/chosen": 0.620611310005188, + "logits/rejected": 2.369555950164795, + "logps/chosen": -368.126708984375, + "logps/rejected": -743.3250122070312, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.87155020236969, + "rewards/margins": 12.570436477661133, + "rewards/rejected": -10.698884963989258, + "step": 3390 + }, + { + "epoch": 1.16, + "learning_rate": 3.4155860506106003e-07, + "logits/chosen": 0.7373756170272827, + "logits/rejected": 2.399385452270508, + "logps/chosen": -397.4166564941406, + "logps/rejected": -620.8201904296875, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5980751514434814, + "rewards/margins": 11.54615306854248, + "rewards/rejected": -9.948077201843262, + "step": 3400 + }, + { + "epoch": 1.16, + "eval_logits/chosen": 0.7183800935745239, + "eval_logits/rejected": 2.821152925491333, + "eval_logps/chosen": -369.2712097167969, + "eval_logps/rejected": -606.4811401367188, + "eval_loss": 0.012870008125901222, + "eval_rewards/accuracies": 0.9949495196342468, + "eval_rewards/chosen": 1.9718691110610962, + "eval_rewards/margins": 12.356949806213379, + "eval_rewards/rejected": -10.38508129119873, + "eval_runtime": 269.4638, + "eval_samples_per_second": 35.255, + "eval_steps_per_second": 1.102, + "step": 3400 + }, + { + "epoch": 1.16, + "learning_rate": 3.4092911997985645e-07, + "logits/chosen": 1.1883623600006104, + "logits/rejected": 2.577510356903076, + "logps/chosen": -362.335205078125, + "logps/rejected": -606.2926025390625, + "loss": 0.0135, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.4554922580718994, + "rewards/margins": 11.256914138793945, + "rewards/rejected": -9.801423072814941, + "step": 3410 + }, + { + "epoch": 1.16, + "learning_rate": 3.402996348986529e-07, + "logits/chosen": 1.3287720680236816, + "logits/rejected": 2.5388364791870117, + "logps/chosen": -438.2972106933594, + "logps/rejected": -667.9464111328125, + "loss": 0.0185, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.7842785120010376, + "rewards/margins": 11.153425216674805, + "rewards/rejected": -9.369147300720215, + "step": 3420 + }, + { + "epoch": 1.17, + "learning_rate": 3.396701498174493e-07, + "logits/chosen": 1.2645375728607178, + "logits/rejected": 2.871066093444824, + "logps/chosen": -398.85028076171875, + "logps/rejected": -565.3001098632812, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0519790649414062, + "rewards/margins": 11.566902160644531, + "rewards/rejected": -9.514923095703125, + "step": 3430 + }, + { + "epoch": 1.17, + "learning_rate": 3.3904066473624577e-07, + "logits/chosen": 0.7664368152618408, + "logits/rejected": 2.894336223602295, + "logps/chosen": -297.42779541015625, + "logps/rejected": -642.1676635742188, + "loss": 0.0116, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.9993404150009155, + "rewards/margins": 12.201227188110352, + "rewards/rejected": -10.201887130737305, + "step": 3440 + }, + { + "epoch": 1.17, + "learning_rate": 3.384111796550422e-07, + "logits/chosen": 0.8389550447463989, + "logits/rejected": 2.6036429405212402, + "logps/chosen": -315.84979248046875, + "logps/rejected": -680.8143920898438, + "loss": 0.0085, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8412582874298096, + "rewards/margins": 11.332327842712402, + "rewards/rejected": -9.491069793701172, + "step": 3450 + }, + { + "epoch": 1.18, + "learning_rate": 3.377816945738386e-07, + "logits/chosen": 0.4181036353111267, + "logits/rejected": 2.9160115718841553, + "logps/chosen": -286.0548095703125, + "logps/rejected": -531.5001220703125, + "loss": 0.013, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.6251122951507568, + "rewards/margins": 11.32032585144043, + "rewards/rejected": -9.695211410522461, + "step": 3460 + }, + { + "epoch": 1.18, + "learning_rate": 3.37152209492635e-07, + "logits/chosen": 0.6207982301712036, + "logits/rejected": 2.664804458618164, + "logps/chosen": -356.7328186035156, + "logps/rejected": -650.884033203125, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8970571756362915, + "rewards/margins": 12.091269493103027, + "rewards/rejected": -10.1942138671875, + "step": 3470 + }, + { + "epoch": 1.18, + "learning_rate": 3.365227244114314e-07, + "logits/chosen": 0.7093061804771423, + "logits/rejected": 2.628840684890747, + "logps/chosen": -305.00457763671875, + "logps/rejected": -528.3023681640625, + "loss": 0.0095, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.1378726959228516, + "rewards/margins": 11.653341293334961, + "rewards/rejected": -9.515467643737793, + "step": 3480 + }, + { + "epoch": 1.19, + "learning_rate": 3.3589323933022783e-07, + "logits/chosen": 1.2648298740386963, + "logits/rejected": 2.892852783203125, + "logps/chosen": -398.9606018066406, + "logps/rejected": -603.8753662109375, + "loss": 0.0088, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.4770350456237793, + "rewards/margins": 12.576154708862305, + "rewards/rejected": -11.099119186401367, + "step": 3490 + }, + { + "epoch": 1.19, + "learning_rate": 3.3526375424902426e-07, + "logits/chosen": 1.1422218084335327, + "logits/rejected": 3.253868818283081, + "logps/chosen": -365.7401428222656, + "logps/rejected": -566.3377685546875, + "loss": 0.0152, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.4305555820465088, + "rewards/margins": 11.32600212097168, + "rewards/rejected": -9.895447731018066, + "step": 3500 + }, + { + "epoch": 1.19, + "eval_logits/chosen": 0.7139882445335388, + "eval_logits/rejected": 2.8217320442199707, + "eval_logps/chosen": -368.63287353515625, + "eval_logps/rejected": -604.761474609375, + "eval_loss": 0.012398996390402317, + "eval_rewards/accuracies": 0.9957912564277649, + "eval_rewards/chosen": 2.0357046127319336, + "eval_rewards/margins": 12.248809814453125, + "eval_rewards/rejected": -10.213105201721191, + "eval_runtime": 268.6396, + "eval_samples_per_second": 35.363, + "eval_steps_per_second": 1.106, + "step": 3500 + }, + { + "epoch": 1.19, + "learning_rate": 3.3463426916782073e-07, + "logits/chosen": 0.7617170214653015, + "logits/rejected": 2.90086030960083, + "logps/chosen": -347.78912353515625, + "logps/rejected": -474.829345703125, + "loss": 0.0091, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.9028854370117188, + "rewards/margins": 11.47684383392334, + "rewards/rejected": -9.573958396911621, + "step": 3510 + }, + { + "epoch": 1.2, + "learning_rate": 3.3400478408661716e-07, + "logits/chosen": 1.6692126989364624, + "logits/rejected": 3.0957980155944824, + "logps/chosen": -336.6679382324219, + "logps/rejected": -442.8111877441406, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1993305683135986, + "rewards/margins": 13.359231948852539, + "rewards/rejected": -11.15990161895752, + "step": 3520 + }, + { + "epoch": 1.2, + "learning_rate": 3.333752990054136e-07, + "logits/chosen": 0.9762203097343445, + "logits/rejected": 2.1863927841186523, + "logps/chosen": -366.7749938964844, + "logps/rejected": -685.2867431640625, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2956480979919434, + "rewards/margins": 13.168296813964844, + "rewards/rejected": -10.872648239135742, + "step": 3530 + }, + { + "epoch": 1.2, + "learning_rate": 3.3274581392420995e-07, + "logits/chosen": 0.9696139097213745, + "logits/rejected": 3.0056960582733154, + "logps/chosen": -297.73333740234375, + "logps/rejected": -467.8060607910156, + "loss": 0.0085, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.282499074935913, + "rewards/margins": 12.94469165802002, + "rewards/rejected": -10.662192344665527, + "step": 3540 + }, + { + "epoch": 1.21, + "learning_rate": 3.3211632884300637e-07, + "logits/chosen": 1.7081263065338135, + "logits/rejected": 2.8718748092651367, + "logps/chosen": -421.0556640625, + "logps/rejected": -577.2882080078125, + "loss": 0.006, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.122880458831787, + "rewards/margins": 12.477874755859375, + "rewards/rejected": -10.354994773864746, + "step": 3550 + }, + { + "epoch": 1.21, + "learning_rate": 3.314868437618028e-07, + "logits/chosen": 1.3022390604019165, + "logits/rejected": 2.8691840171813965, + "logps/chosen": -433.652587890625, + "logps/rejected": -585.9044189453125, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.045884609222412, + "rewards/margins": 11.981613159179688, + "rewards/rejected": -9.935728073120117, + "step": 3560 + }, + { + "epoch": 1.21, + "learning_rate": 3.3085735868059927e-07, + "logits/chosen": 0.563024640083313, + "logits/rejected": 2.9883615970611572, + "logps/chosen": -486.9850158691406, + "logps/rejected": -485.02978515625, + "loss": 0.0108, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.0081732273101807, + "rewards/margins": 13.054903984069824, + "rewards/rejected": -11.046730995178223, + "step": 3570 + }, + { + "epoch": 1.22, + "learning_rate": 3.302278735993957e-07, + "logits/chosen": 1.2381908893585205, + "logits/rejected": 2.346295118331909, + "logps/chosen": -450.12139892578125, + "logps/rejected": -788.5457763671875, + "loss": 0.0104, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7062536478042603, + "rewards/margins": 11.615732192993164, + "rewards/rejected": -9.909477233886719, + "step": 3580 + }, + { + "epoch": 1.22, + "learning_rate": 3.295983885181921e-07, + "logits/chosen": 0.8633956909179688, + "logits/rejected": 2.1898419857025146, + "logps/chosen": -524.0609741210938, + "logps/rejected": -731.7201538085938, + "loss": 0.0106, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.227761745452881, + "rewards/margins": 12.04895305633545, + "rewards/rejected": -9.821191787719727, + "step": 3590 + }, + { + "epoch": 1.22, + "learning_rate": 3.2896890343698854e-07, + "logits/chosen": 0.4451712965965271, + "logits/rejected": 2.969043254852295, + "logps/chosen": -298.8716125488281, + "logps/rejected": -563.7342529296875, + "loss": 0.01, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.909551978111267, + "rewards/margins": 11.801454544067383, + "rewards/rejected": -9.891901016235352, + "step": 3600 + }, + { + "epoch": 1.22, + "eval_logits/chosen": 0.6516625881195068, + "eval_logits/rejected": 2.758915424346924, + "eval_logps/chosen": -368.8428039550781, + "eval_logps/rejected": -611.873291015625, + "eval_loss": 0.011604116298258305, + "eval_rewards/accuracies": 0.996632993221283, + "eval_rewards/chosen": 2.0147087574005127, + "eval_rewards/margins": 12.93900203704834, + "eval_rewards/rejected": -10.924293518066406, + "eval_runtime": 268.5101, + "eval_samples_per_second": 35.38, + "eval_steps_per_second": 1.106, + "step": 3600 + }, + { + "epoch": 1.23, + "learning_rate": 3.2833941835578496e-07, + "logits/chosen": 0.9128265380859375, + "logits/rejected": 2.6559739112854004, + "logps/chosen": -323.16656494140625, + "logps/rejected": -685.8175659179688, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.358858108520508, + "rewards/margins": 15.383926391601562, + "rewards/rejected": -13.025070190429688, + "step": 3610 + }, + { + "epoch": 1.23, + "learning_rate": 3.2770993327458133e-07, + "logits/chosen": 1.0553185939788818, + "logits/rejected": 3.042325973510742, + "logps/chosen": -379.9535827636719, + "logps/rejected": -480.6560974121094, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9295809268951416, + "rewards/margins": 15.311012268066406, + "rewards/rejected": -13.381433486938477, + "step": 3620 + }, + { + "epoch": 1.23, + "learning_rate": 3.270804481933778e-07, + "logits/chosen": 1.2228381633758545, + "logits/rejected": 2.805607557296753, + "logps/chosen": -317.10711669921875, + "logps/rejected": -619.1337280273438, + "loss": 0.0158, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.014592409133911, + "rewards/margins": 12.018266677856445, + "rewards/rejected": -10.003674507141113, + "step": 3630 + }, + { + "epoch": 1.24, + "learning_rate": 3.2645096311217423e-07, + "logits/chosen": 1.6097183227539062, + "logits/rejected": 3.0599427223205566, + "logps/chosen": -386.4832763671875, + "logps/rejected": -413.746826171875, + "loss": 0.0108, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.8478138446807861, + "rewards/margins": 11.943216323852539, + "rewards/rejected": -10.095402717590332, + "step": 3640 + }, + { + "epoch": 1.24, + "learning_rate": 3.2582147803097065e-07, + "logits/chosen": 0.20550115406513214, + "logits/rejected": 2.465472459793091, + "logps/chosen": -335.13824462890625, + "logps/rejected": -604.3788452148438, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0051052570343018, + "rewards/margins": 12.726202011108398, + "rewards/rejected": -10.721096992492676, + "step": 3650 + }, + { + "epoch": 1.24, + "learning_rate": 3.251919929497671e-07, + "logits/chosen": 1.2652103900909424, + "logits/rejected": 2.926816463470459, + "logps/chosen": -407.6763610839844, + "logps/rejected": -495.2499084472656, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9935003519058228, + "rewards/margins": 12.497861862182617, + "rewards/rejected": -10.504361152648926, + "step": 3660 + }, + { + "epoch": 1.25, + "learning_rate": 3.245625078685635e-07, + "logits/chosen": 1.3886959552764893, + "logits/rejected": 2.630584478378296, + "logps/chosen": -481.4646911621094, + "logps/rejected": -748.5326538085938, + "loss": 0.008, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8012968301773071, + "rewards/margins": 12.587298393249512, + "rewards/rejected": -10.786002159118652, + "step": 3670 + }, + { + "epoch": 1.25, + "learning_rate": 3.239330227873599e-07, + "logits/chosen": 0.5694072842597961, + "logits/rejected": 2.876537322998047, + "logps/chosen": -342.1631774902344, + "logps/rejected": -513.7351684570312, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.246203899383545, + "rewards/margins": 13.117487907409668, + "rewards/rejected": -10.871283531188965, + "step": 3680 + }, + { + "epoch": 1.25, + "learning_rate": 3.233035377061564e-07, + "logits/chosen": 1.219588041305542, + "logits/rejected": 2.5020108222961426, + "logps/chosen": -361.3206787109375, + "logps/rejected": -556.8141479492188, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6606366634368896, + "rewards/margins": 12.658553123474121, + "rewards/rejected": -10.997916221618652, + "step": 3690 + }, + { + "epoch": 1.26, + "learning_rate": 3.2267405262495277e-07, + "logits/chosen": 2.0143651962280273, + "logits/rejected": 2.8961198329925537, + "logps/chosen": -376.0616149902344, + "logps/rejected": -571.3860473632812, + "loss": 0.0135, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6621707677841187, + "rewards/margins": 12.810911178588867, + "rewards/rejected": -11.148740768432617, + "step": 3700 + }, + { + "epoch": 1.26, + "eval_logits/chosen": 0.70637047290802, + "eval_logits/rejected": 2.8017258644104004, + "eval_logps/chosen": -369.4627685546875, + "eval_logps/rejected": -611.279541015625, + "eval_loss": 0.011604195460677147, + "eval_rewards/accuracies": 0.996632993221283, + "eval_rewards/chosen": 1.9527100324630737, + "eval_rewards/margins": 12.817631721496582, + "eval_rewards/rejected": -10.864921569824219, + "eval_runtime": 268.6034, + "eval_samples_per_second": 35.368, + "eval_steps_per_second": 1.106, + "step": 3700 + }, + { + "epoch": 1.26, + "learning_rate": 3.220445675437492e-07, + "logits/chosen": 1.3589471578598022, + "logits/rejected": 2.3055531978607178, + "logps/chosen": -345.74774169921875, + "logps/rejected": -734.9967041015625, + "loss": 0.0108, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7902826070785522, + "rewards/margins": 13.758868217468262, + "rewards/rejected": -11.968585968017578, + "step": 3710 + }, + { + "epoch": 1.26, + "learning_rate": 3.214150824625456e-07, + "logits/chosen": 0.9244592785835266, + "logits/rejected": 2.705847978591919, + "logps/chosen": -314.8524475097656, + "logps/rejected": -643.5977783203125, + "loss": 0.0089, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7707325220108032, + "rewards/margins": 12.588135719299316, + "rewards/rejected": -10.817402839660645, + "step": 3720 + }, + { + "epoch": 1.27, + "learning_rate": 3.2078559738134203e-07, + "logits/chosen": 1.0180917978286743, + "logits/rejected": 2.3311429023742676, + "logps/chosen": -413.64862060546875, + "logps/rejected": -681.5325927734375, + "loss": 0.0108, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2779762744903564, + "rewards/margins": 13.67229175567627, + "rewards/rejected": -12.394315719604492, + "step": 3730 + }, + { + "epoch": 1.27, + "learning_rate": 3.2015611230013846e-07, + "logits/chosen": 1.3414030075073242, + "logits/rejected": 2.343313217163086, + "logps/chosen": -337.6605529785156, + "logps/rejected": -741.5252075195312, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.429905891418457, + "rewards/margins": 12.461353302001953, + "rewards/rejected": -10.031448364257812, + "step": 3740 + }, + { + "epoch": 1.27, + "learning_rate": 3.1952662721893493e-07, + "logits/chosen": 0.8920739889144897, + "logits/rejected": 2.75960111618042, + "logps/chosen": -292.0826416015625, + "logps/rejected": -628.2627563476562, + "loss": 0.0137, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.9123607873916626, + "rewards/margins": 12.0170316696167, + "rewards/rejected": -10.104671478271484, + "step": 3750 + }, + { + "epoch": 1.28, + "learning_rate": 3.1889714213773135e-07, + "logits/chosen": 1.1719636917114258, + "logits/rejected": 2.649860382080078, + "logps/chosen": -366.6575927734375, + "logps/rejected": -607.0701293945312, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.313616991043091, + "rewards/margins": 12.286029815673828, + "rewards/rejected": -9.972412109375, + "step": 3760 + }, + { + "epoch": 1.28, + "learning_rate": 3.182676570565277e-07, + "logits/chosen": 0.8673251867294312, + "logits/rejected": 2.7998039722442627, + "logps/chosen": -315.17510986328125, + "logps/rejected": -605.2402954101562, + "loss": 0.0099, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8954683542251587, + "rewards/margins": 14.503629684448242, + "rewards/rejected": -12.608160018920898, + "step": 3770 + }, + { + "epoch": 1.28, + "learning_rate": 3.1763817197532415e-07, + "logits/chosen": 1.0487251281738281, + "logits/rejected": 2.9428367614746094, + "logps/chosen": -364.67401123046875, + "logps/rejected": -538.7274169921875, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0998024940490723, + "rewards/margins": 13.77617359161377, + "rewards/rejected": -11.676373481750488, + "step": 3780 + }, + { + "epoch": 1.29, + "learning_rate": 3.1700868689412057e-07, + "logits/chosen": 1.02610182762146, + "logits/rejected": 2.5808565616607666, + "logps/chosen": -461.45526123046875, + "logps/rejected": -686.8546142578125, + "loss": 0.0093, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.335012912750244, + "rewards/margins": 14.393170356750488, + "rewards/rejected": -12.05815601348877, + "step": 3790 + }, + { + "epoch": 1.29, + "learning_rate": 3.16379201812917e-07, + "logits/chosen": 0.750059962272644, + "logits/rejected": 2.3385863304138184, + "logps/chosen": -291.5013732910156, + "logps/rejected": -727.7017822265625, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.783456563949585, + "rewards/margins": 12.742868423461914, + "rewards/rejected": -10.959412574768066, + "step": 3800 + }, + { + "epoch": 1.29, + "eval_logits/chosen": 0.6878785490989685, + "eval_logits/rejected": 2.7622649669647217, + "eval_logps/chosen": -371.6280517578125, + "eval_logps/rejected": -618.2280883789062, + "eval_loss": 0.011163265444338322, + "eval_rewards/accuracies": 0.996632993221283, + "eval_rewards/chosen": 1.7361834049224854, + "eval_rewards/margins": 13.295957565307617, + "eval_rewards/rejected": -11.559774398803711, + "eval_runtime": 268.3977, + "eval_samples_per_second": 35.395, + "eval_steps_per_second": 1.107, + "step": 3800 + }, + { + "epoch": 1.3, + "learning_rate": 3.1574971673171347e-07, + "logits/chosen": 0.9575430750846863, + "logits/rejected": 2.964996814727783, + "logps/chosen": -424.96856689453125, + "logps/rejected": -482.3935546875, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4636036157608032, + "rewards/margins": 14.330609321594238, + "rewards/rejected": -12.867006301879883, + "step": 3810 + }, + { + "epoch": 1.3, + "learning_rate": 3.151202316505099e-07, + "logits/chosen": 0.723203182220459, + "logits/rejected": 2.6531193256378174, + "logps/chosen": -339.8141784667969, + "logps/rejected": -731.8880615234375, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1962404251098633, + "rewards/margins": 15.106730461120605, + "rewards/rejected": -12.910490036010742, + "step": 3820 + }, + { + "epoch": 1.3, + "learning_rate": 3.144907465693063e-07, + "logits/chosen": 0.9935673475265503, + "logits/rejected": 2.505452871322632, + "logps/chosen": -361.5821838378906, + "logps/rejected": -588.7732543945312, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2631385326385498, + "rewards/margins": 11.31373405456543, + "rewards/rejected": -10.050596237182617, + "step": 3830 + }, + { + "epoch": 1.31, + "learning_rate": 3.1386126148810274e-07, + "logits/chosen": 1.2363111972808838, + "logits/rejected": 3.0512309074401855, + "logps/chosen": -489.4908752441406, + "logps/rejected": -554.0120239257812, + "loss": 0.0107, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5856379270553589, + "rewards/margins": 13.790130615234375, + "rewards/rejected": -12.204492568969727, + "step": 3840 + }, + { + "epoch": 1.31, + "learning_rate": 3.132317764068991e-07, + "logits/chosen": 1.086814045906067, + "logits/rejected": 2.504542112350464, + "logps/chosen": -333.9427185058594, + "logps/rejected": -730.4444580078125, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.555995225906372, + "rewards/margins": 12.766497611999512, + "rewards/rejected": -11.210501670837402, + "step": 3850 + }, + { + "epoch": 1.31, + "learning_rate": 3.1260229132569553e-07, + "logits/chosen": 1.3077691793441772, + "logits/rejected": 2.9703030586242676, + "logps/chosen": -427.02716064453125, + "logps/rejected": -514.3189697265625, + "loss": 0.0131, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2280203104019165, + "rewards/margins": 14.560205459594727, + "rewards/rejected": -13.332185745239258, + "step": 3860 + }, + { + "epoch": 1.32, + "learning_rate": 3.11972806244492e-07, + "logits/chosen": 1.1198952198028564, + "logits/rejected": 2.6473917961120605, + "logps/chosen": -377.2010498046875, + "logps/rejected": -621.55322265625, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3931939601898193, + "rewards/margins": 12.233184814453125, + "rewards/rejected": -10.839990615844727, + "step": 3870 + }, + { + "epoch": 1.32, + "learning_rate": 3.1134332116328843e-07, + "logits/chosen": 0.918321430683136, + "logits/rejected": 2.5897388458251953, + "logps/chosen": -374.2272033691406, + "logps/rejected": -711.2734375, + "loss": 0.008, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9573631286621094, + "rewards/margins": 13.664125442504883, + "rewards/rejected": -11.70676040649414, + "step": 3880 + }, + { + "epoch": 1.32, + "learning_rate": 3.1071383608208485e-07, + "logits/chosen": 1.1157448291778564, + "logits/rejected": 2.8496346473693848, + "logps/chosen": -408.95294189453125, + "logps/rejected": -447.25384521484375, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7573410272598267, + "rewards/margins": 14.136436462402344, + "rewards/rejected": -12.379095077514648, + "step": 3890 + }, + { + "epoch": 1.33, + "learning_rate": 3.1008435100088127e-07, + "logits/chosen": 1.2130072116851807, + "logits/rejected": 2.2291102409362793, + "logps/chosen": -332.9510192871094, + "logps/rejected": -833.26025390625, + "loss": 0.0114, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.7235133647918701, + "rewards/margins": 15.338064193725586, + "rewards/rejected": -13.614550590515137, + "step": 3900 + }, + { + "epoch": 1.33, + "eval_logits/chosen": 0.672809898853302, + "eval_logits/rejected": 2.7616159915924072, + "eval_logps/chosen": -370.6764831542969, + "eval_logps/rejected": -616.296875, + "eval_loss": 0.01061132363975048, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 1.8313409090042114, + "eval_rewards/margins": 13.197998046875, + "eval_rewards/rejected": -11.366658210754395, + "eval_runtime": 267.1258, + "eval_samples_per_second": 35.564, + "eval_steps_per_second": 1.112, + "step": 3900 + }, + { + "epoch": 1.33, + "learning_rate": 3.094548659196777e-07, + "logits/chosen": 1.6785743236541748, + "logits/rejected": 2.301844358444214, + "logps/chosen": -524.2055053710938, + "logps/rejected": -739.4368286132812, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7645349502563477, + "rewards/margins": 12.248418807983398, + "rewards/rejected": -10.483884811401367, + "step": 3910 + }, + { + "epoch": 1.33, + "learning_rate": 3.0882538083847407e-07, + "logits/chosen": 1.4525395631790161, + "logits/rejected": 2.9662253856658936, + "logps/chosen": -405.4144592285156, + "logps/rejected": -511.0747985839844, + "loss": 0.0056, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.7320349216461182, + "rewards/margins": 12.92414665222168, + "rewards/rejected": -11.192111015319824, + "step": 3920 + }, + { + "epoch": 1.34, + "learning_rate": 3.0819589575727054e-07, + "logits/chosen": 1.3361444473266602, + "logits/rejected": 2.703052520751953, + "logps/chosen": -534.5265502929688, + "logps/rejected": -433.0003967285156, + "loss": 0.0114, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.5415256023406982, + "rewards/margins": 12.53891372680664, + "rewards/rejected": -10.99738883972168, + "step": 3930 + }, + { + "epoch": 1.34, + "learning_rate": 3.0756641067606696e-07, + "logits/chosen": 1.4039337635040283, + "logits/rejected": 2.760995388031006, + "logps/chosen": -337.70904541015625, + "logps/rejected": -551.9937744140625, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6379754543304443, + "rewards/margins": 12.422950744628906, + "rewards/rejected": -10.7849760055542, + "step": 3940 + }, + { + "epoch": 1.34, + "learning_rate": 3.069369255948634e-07, + "logits/chosen": 1.0360331535339355, + "logits/rejected": 2.4552958011627197, + "logps/chosen": -329.26422119140625, + "logps/rejected": -655.5435791015625, + "loss": 0.0113, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.4171547889709473, + "rewards/margins": 15.259183883666992, + "rewards/rejected": -12.84202766418457, + "step": 3950 + }, + { + "epoch": 1.35, + "learning_rate": 3.063074405136598e-07, + "logits/chosen": 0.8999547958374023, + "logits/rejected": 2.6917226314544678, + "logps/chosen": -338.5829162597656, + "logps/rejected": -592.2154541015625, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.708500385284424, + "rewards/margins": 14.965312004089355, + "rewards/rejected": -12.256811141967773, + "step": 3960 + }, + { + "epoch": 1.35, + "learning_rate": 3.0567795543245623e-07, + "logits/chosen": 0.8781677484512329, + "logits/rejected": 2.3897573947906494, + "logps/chosen": -380.5794372558594, + "logps/rejected": -688.5714721679688, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.071242094039917, + "rewards/margins": 14.585981369018555, + "rewards/rejected": -12.514738082885742, + "step": 3970 + }, + { + "epoch": 1.35, + "learning_rate": 3.0504847035125266e-07, + "logits/chosen": 1.2194125652313232, + "logits/rejected": 2.6145756244659424, + "logps/chosen": -408.1900939941406, + "logps/rejected": -582.1531372070312, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0091769695281982, + "rewards/margins": 13.346948623657227, + "rewards/rejected": -11.337770462036133, + "step": 3980 + }, + { + "epoch": 1.36, + "learning_rate": 3.0441898527004913e-07, + "logits/chosen": 1.122051477432251, + "logits/rejected": 2.9787373542785645, + "logps/chosen": -309.53192138671875, + "logps/rejected": -560.3518676757812, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.181328296661377, + "rewards/margins": 13.925869941711426, + "rewards/rejected": -11.74454116821289, + "step": 3990 + }, + { + "epoch": 1.36, + "learning_rate": 3.037895001888455e-07, + "logits/chosen": 0.9208853840827942, + "logits/rejected": 2.722747325897217, + "logps/chosen": -386.92669677734375, + "logps/rejected": -594.4363403320312, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3495748043060303, + "rewards/margins": 13.152547836303711, + "rewards/rejected": -10.802971839904785, + "step": 4000 + }, + { + "epoch": 1.36, + "eval_logits/chosen": 0.6694169640541077, + "eval_logits/rejected": 2.7534499168395996, + "eval_logps/chosen": -369.82958984375, + "eval_logps/rejected": -618.1146850585938, + "eval_loss": 0.010111239738762379, + "eval_rewards/accuracies": 0.9991582632064819, + "eval_rewards/chosen": 1.9160256385803223, + "eval_rewards/margins": 13.464466094970703, + "eval_rewards/rejected": -11.548439979553223, + "eval_runtime": 267.2091, + "eval_samples_per_second": 35.553, + "eval_steps_per_second": 1.111, + "step": 4000 + }, + { + "epoch": 1.36, + "learning_rate": 3.031600151076419e-07, + "logits/chosen": 1.105791449546814, + "logits/rejected": 2.6624977588653564, + "logps/chosen": -329.08905029296875, + "logps/rejected": -635.0185546875, + "loss": 0.0111, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.752402901649475, + "rewards/margins": 14.056722640991211, + "rewards/rejected": -12.304319381713867, + "step": 4010 + }, + { + "epoch": 1.37, + "learning_rate": 3.0253053002643835e-07, + "logits/chosen": 1.1525877714157104, + "logits/rejected": 2.8269333839416504, + "logps/chosen": -328.6435546875, + "logps/rejected": -626.5671997070312, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0284876823425293, + "rewards/margins": 15.253156661987305, + "rewards/rejected": -13.224668502807617, + "step": 4020 + }, + { + "epoch": 1.37, + "learning_rate": 3.0190104494523477e-07, + "logits/chosen": 1.3486201763153076, + "logits/rejected": 2.1660232543945312, + "logps/chosen": -320.6233215332031, + "logps/rejected": -756.1076049804688, + "loss": 0.0112, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2695963382720947, + "rewards/margins": 11.98314380645752, + "rewards/rejected": -10.71354866027832, + "step": 4030 + }, + { + "epoch": 1.37, + "learning_rate": 3.012715598640312e-07, + "logits/chosen": 1.2375625371932983, + "logits/rejected": 2.5896694660186768, + "logps/chosen": -448.31658935546875, + "logps/rejected": -623.9146728515625, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4820594787597656, + "rewards/margins": 15.964773178100586, + "rewards/rejected": -13.482714653015137, + "step": 4040 + }, + { + "epoch": 1.38, + "learning_rate": 3.0064207478282767e-07, + "logits/chosen": 1.529763102531433, + "logits/rejected": 2.7555646896362305, + "logps/chosen": -449.84686279296875, + "logps/rejected": -620.9822998046875, + "loss": 0.0103, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.5289344787597656, + "rewards/margins": 13.241473197937012, + "rewards/rejected": -11.712538719177246, + "step": 4050 + }, + { + "epoch": 1.38, + "learning_rate": 3.000125897016241e-07, + "logits/chosen": 1.3951947689056396, + "logits/rejected": 2.7864015102386475, + "logps/chosen": -348.7859802246094, + "logps/rejected": -626.5861206054688, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3628493547439575, + "rewards/margins": 13.327616691589355, + "rewards/rejected": -11.964765548706055, + "step": 4060 + }, + { + "epoch": 1.38, + "learning_rate": 2.993831046204205e-07, + "logits/chosen": 0.9522676467895508, + "logits/rejected": 2.8154313564300537, + "logps/chosen": -476.47412109375, + "logps/rejected": -513.9017333984375, + "loss": 0.0094, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.842169165611267, + "rewards/margins": 13.47119426727295, + "rewards/rejected": -11.62902545928955, + "step": 4070 + }, + { + "epoch": 1.39, + "learning_rate": 2.987536195392169e-07, + "logits/chosen": 1.5384814739227295, + "logits/rejected": 2.9053680896759033, + "logps/chosen": -417.6114807128906, + "logps/rejected": -534.3853759765625, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0078911781311035, + "rewards/margins": 14.087924003601074, + "rewards/rejected": -12.080032348632812, + "step": 4080 + }, + { + "epoch": 1.39, + "learning_rate": 2.981241344580133e-07, + "logits/chosen": 1.3096532821655273, + "logits/rejected": 2.828274965286255, + "logps/chosen": -393.8643493652344, + "logps/rejected": -567.3588256835938, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1737160682678223, + "rewards/margins": 13.32548999786377, + "rewards/rejected": -11.151773452758789, + "step": 4090 + }, + { + "epoch": 1.39, + "learning_rate": 2.9749464937680973e-07, + "logits/chosen": 1.554233193397522, + "logits/rejected": 2.9938759803771973, + "logps/chosen": -345.17071533203125, + "logps/rejected": -388.05328369140625, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1033272743225098, + "rewards/margins": 13.004033088684082, + "rewards/rejected": -10.900705337524414, + "step": 4100 + }, + { + "epoch": 1.39, + "eval_logits/chosen": 0.6617211103439331, + "eval_logits/rejected": 2.755298376083374, + "eval_logps/chosen": -370.09149169921875, + "eval_logps/rejected": -615.8171997070312, + "eval_loss": 0.009758265689015388, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 1.8898385763168335, + "eval_rewards/margins": 13.208524703979492, + "eval_rewards/rejected": -11.318686485290527, + "eval_runtime": 267.8618, + "eval_samples_per_second": 35.466, + "eval_steps_per_second": 1.109, + "step": 4100 + }, + { + "epoch": 1.4, + "learning_rate": 2.968651642956062e-07, + "logits/chosen": 0.6344437003135681, + "logits/rejected": 2.64906644821167, + "logps/chosen": -306.6768798828125, + "logps/rejected": -594.3409423828125, + "loss": 0.0085, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.623979926109314, + "rewards/margins": 12.556502342224121, + "rewards/rejected": -10.932523727416992, + "step": 4110 + }, + { + "epoch": 1.4, + "learning_rate": 2.9623567921440263e-07, + "logits/chosen": 0.9452501535415649, + "logits/rejected": 2.6657590866088867, + "logps/chosen": -320.43115234375, + "logps/rejected": -478.85504150390625, + "loss": 0.0109, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9362001419067383, + "rewards/margins": 12.062414169311523, + "rewards/rejected": -10.126212120056152, + "step": 4120 + }, + { + "epoch": 1.4, + "learning_rate": 2.9560619413319905e-07, + "logits/chosen": 1.2213366031646729, + "logits/rejected": 2.3294577598571777, + "logps/chosen": -538.0032958984375, + "logps/rejected": -738.4307861328125, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5684618949890137, + "rewards/margins": 13.005151748657227, + "rewards/rejected": -11.436688423156738, + "step": 4130 + }, + { + "epoch": 1.41, + "learning_rate": 2.9497670905199547e-07, + "logits/chosen": 1.44203782081604, + "logits/rejected": 2.5604937076568604, + "logps/chosen": -520.1005249023438, + "logps/rejected": -640.8972778320312, + "loss": 0.0095, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.205566644668579, + "rewards/margins": 13.87377643585205, + "rewards/rejected": -11.66821002960205, + "step": 4140 + }, + { + "epoch": 1.41, + "learning_rate": 2.9434722397079184e-07, + "logits/chosen": 1.284224271774292, + "logits/rejected": 2.3379929065704346, + "logps/chosen": -426.13909912109375, + "logps/rejected": -782.0924072265625, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9227638244628906, + "rewards/margins": 13.643468856811523, + "rewards/rejected": -11.72070598602295, + "step": 4150 + }, + { + "epoch": 1.41, + "learning_rate": 2.9371773888958827e-07, + "logits/chosen": 1.3185603618621826, + "logits/rejected": 2.7050728797912598, + "logps/chosen": -348.4053649902344, + "logps/rejected": -583.4497680664062, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2634260654449463, + "rewards/margins": 14.19212818145752, + "rewards/rejected": -11.92870044708252, + "step": 4160 + }, + { + "epoch": 1.42, + "learning_rate": 2.9308825380838474e-07, + "logits/chosen": 1.6224581003189087, + "logits/rejected": 2.830731153488159, + "logps/chosen": -413.7862854003906, + "logps/rejected": -518.830322265625, + "loss": 0.0089, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2098381519317627, + "rewards/margins": 14.230868339538574, + "rewards/rejected": -12.021029472351074, + "step": 4170 + }, + { + "epoch": 1.42, + "learning_rate": 2.9245876872718116e-07, + "logits/chosen": 1.1401679515838623, + "logits/rejected": 2.6897270679473877, + "logps/chosen": -365.6610107421875, + "logps/rejected": -596.2195434570312, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.139878749847412, + "rewards/margins": 13.742103576660156, + "rewards/rejected": -11.602225303649902, + "step": 4180 + }, + { + "epoch": 1.42, + "learning_rate": 2.918292836459776e-07, + "logits/chosen": 0.6644213795661926, + "logits/rejected": 1.8898484706878662, + "logps/chosen": -343.75909423828125, + "logps/rejected": -814.2130737304688, + "loss": 0.0119, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0662014484405518, + "rewards/margins": 13.086583137512207, + "rewards/rejected": -11.020383834838867, + "step": 4190 + }, + { + "epoch": 1.43, + "learning_rate": 2.91199798564774e-07, + "logits/chosen": 0.5162476301193237, + "logits/rejected": 2.710484266281128, + "logps/chosen": -294.71026611328125, + "logps/rejected": -589.5906982421875, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.821197509765625, + "rewards/margins": 13.46678352355957, + "rewards/rejected": -11.645587921142578, + "step": 4200 + }, + { + "epoch": 1.43, + "eval_logits/chosen": 0.6265316009521484, + "eval_logits/rejected": 2.7234370708465576, + "eval_logps/chosen": -368.2688903808594, + "eval_logps/rejected": -619.3782348632812, + "eval_loss": 0.009091639891266823, + "eval_rewards/accuracies": 0.9991582632064819, + "eval_rewards/chosen": 2.072103977203369, + "eval_rewards/margins": 13.746898651123047, + "eval_rewards/rejected": -11.674796104431152, + "eval_runtime": 268.1863, + "eval_samples_per_second": 35.423, + "eval_steps_per_second": 1.107, + "step": 4200 + }, + { + "epoch": 1.43, + "learning_rate": 2.9057031348357043e-07, + "logits/chosen": 0.5974520444869995, + "logits/rejected": 2.310032844543457, + "logps/chosen": -325.3004150390625, + "logps/rejected": -749.6110229492188, + "loss": 0.0093, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4108071327209473, + "rewards/margins": 14.243759155273438, + "rewards/rejected": -11.832952499389648, + "step": 4210 + }, + { + "epoch": 1.43, + "learning_rate": 2.8994082840236686e-07, + "logits/chosen": 0.5631410479545593, + "logits/rejected": 2.089045286178589, + "logps/chosen": -426.6354064941406, + "logps/rejected": -681.105712890625, + "loss": 0.0086, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.776482343673706, + "rewards/margins": 13.32958698272705, + "rewards/rejected": -11.553104400634766, + "step": 4220 + }, + { + "epoch": 1.44, + "learning_rate": 2.893113433211632e-07, + "logits/chosen": 0.693230926990509, + "logits/rejected": 2.8969295024871826, + "logps/chosen": -341.26263427734375, + "logps/rejected": -527.8047485351562, + "loss": 0.0146, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.348053216934204, + "rewards/margins": 13.852984428405762, + "rewards/rejected": -11.504932403564453, + "step": 4230 + }, + { + "epoch": 1.44, + "learning_rate": 2.886818582399597e-07, + "logits/chosen": 1.3099323511123657, + "logits/rejected": 2.9541726112365723, + "logps/chosen": -371.8271179199219, + "logps/rejected": -486.86260986328125, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.005937099456787, + "rewards/margins": 12.468096733093262, + "rewards/rejected": -11.462160110473633, + "step": 4240 + }, + { + "epoch": 1.44, + "learning_rate": 2.880523731587561e-07, + "logits/chosen": 0.6574260592460632, + "logits/rejected": 1.5284206867218018, + "logps/chosen": -478.7499084472656, + "logps/rejected": -949.9332275390625, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.561853051185608, + "rewards/margins": 12.824320793151855, + "rewards/rejected": -11.262468338012695, + "step": 4250 + }, + { + "epoch": 1.45, + "learning_rate": 2.8742288807755255e-07, + "logits/chosen": 0.3190085291862488, + "logits/rejected": 2.2693285942077637, + "logps/chosen": -366.81036376953125, + "logps/rejected": -740.80322265625, + "loss": 0.0072, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.0779354572296143, + "rewards/margins": 12.947741508483887, + "rewards/rejected": -10.869806289672852, + "step": 4260 + }, + { + "epoch": 1.45, + "learning_rate": 2.8679340299634897e-07, + "logits/chosen": 0.3005984425544739, + "logits/rejected": 1.9869167804718018, + "logps/chosen": -310.21075439453125, + "logps/rejected": -915.5408325195312, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2902274131774902, + "rewards/margins": 12.991412162780762, + "rewards/rejected": -10.701186180114746, + "step": 4270 + }, + { + "epoch": 1.45, + "learning_rate": 2.861639179151454e-07, + "logits/chosen": 1.2294288873672485, + "logits/rejected": 2.8581976890563965, + "logps/chosen": -345.51202392578125, + "logps/rejected": -665.0847778320312, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7379974126815796, + "rewards/margins": 14.367947578430176, + "rewards/rejected": -12.629948616027832, + "step": 4280 + }, + { + "epoch": 1.46, + "learning_rate": 2.855344328339418e-07, + "logits/chosen": 1.3199851512908936, + "logits/rejected": 2.684046745300293, + "logps/chosen": -319.9084777832031, + "logps/rejected": -645.8856201171875, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1213021278381348, + "rewards/margins": 14.135685920715332, + "rewards/rejected": -12.014384269714355, + "step": 4290 + }, + { + "epoch": 1.46, + "learning_rate": 2.849049477527383e-07, + "logits/chosen": 1.4351789951324463, + "logits/rejected": 3.1019484996795654, + "logps/chosen": -329.2397155761719, + "logps/rejected": -460.0499572753906, + "loss": 0.006, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.255934715270996, + "rewards/margins": 14.838006973266602, + "rewards/rejected": -12.582071304321289, + "step": 4300 + }, + { + "epoch": 1.46, + "eval_logits/chosen": 0.6225088834762573, + "eval_logits/rejected": 2.705798864364624, + "eval_logps/chosen": -370.5738525390625, + "eval_logps/rejected": -624.5148315429688, + "eval_loss": 0.008837219327688217, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 1.841599941253662, + "eval_rewards/margins": 14.030046463012695, + "eval_rewards/rejected": -12.188445091247559, + "eval_runtime": 268.1565, + "eval_samples_per_second": 35.427, + "eval_steps_per_second": 1.108, + "step": 4300 + }, + { + "epoch": 1.46, + "learning_rate": 2.8427546267153466e-07, + "logits/chosen": 0.8465960621833801, + "logits/rejected": 2.358778476715088, + "logps/chosen": -333.47857666015625, + "logps/rejected": -711.8942260742188, + "loss": 0.0124, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.4469887018203735, + "rewards/margins": 14.328710556030273, + "rewards/rejected": -12.881719589233398, + "step": 4310 + }, + { + "epoch": 1.47, + "learning_rate": 2.836459775903311e-07, + "logits/chosen": 0.7382937669754028, + "logits/rejected": 2.9191064834594727, + "logps/chosen": -427.72296142578125, + "logps/rejected": -541.3267822265625, + "loss": 0.0182, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5231516361236572, + "rewards/margins": 13.919703483581543, + "rewards/rejected": -12.396551132202148, + "step": 4320 + }, + { + "epoch": 1.47, + "learning_rate": 2.830164925091275e-07, + "logits/chosen": 1.1782336235046387, + "logits/rejected": 2.832035541534424, + "logps/chosen": -327.30865478515625, + "logps/rejected": -554.0316162109375, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5803128480911255, + "rewards/margins": 13.803190231323242, + "rewards/rejected": -12.222879409790039, + "step": 4330 + }, + { + "epoch": 1.48, + "learning_rate": 2.8238700742792393e-07, + "logits/chosen": 0.7898053526878357, + "logits/rejected": 3.143005847930908, + "logps/chosen": -373.89959716796875, + "logps/rejected": -453.019775390625, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9527397155761719, + "rewards/margins": 14.160921096801758, + "rewards/rejected": -12.208181381225586, + "step": 4340 + }, + { + "epoch": 1.48, + "learning_rate": 2.8175752234672035e-07, + "logits/chosen": 1.5171926021575928, + "logits/rejected": 2.6992270946502686, + "logps/chosen": -432.3185119628906, + "logps/rejected": -558.4224243164062, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.192749261856079, + "rewards/margins": 12.624958992004395, + "rewards/rejected": -10.432210922241211, + "step": 4350 + }, + { + "epoch": 1.48, + "learning_rate": 2.8112803726551683e-07, + "logits/chosen": 1.8292335271835327, + "logits/rejected": 3.312920093536377, + "logps/chosen": -466.8578186035156, + "logps/rejected": -418.66680908203125, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5297528505325317, + "rewards/margins": 10.969106674194336, + "rewards/rejected": -9.439353942871094, + "step": 4360 + }, + { + "epoch": 1.49, + "learning_rate": 2.8049855218431325e-07, + "logits/chosen": 1.0215450525283813, + "logits/rejected": 2.6072983741760254, + "logps/chosen": -337.31915283203125, + "logps/rejected": -533.4041748046875, + "loss": 0.0107, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3369078636169434, + "rewards/margins": 12.58057689666748, + "rewards/rejected": -10.243669509887695, + "step": 4370 + }, + { + "epoch": 1.49, + "learning_rate": 2.7986906710310967e-07, + "logits/chosen": 1.2219336032867432, + "logits/rejected": 1.8741023540496826, + "logps/chosen": -400.1982727050781, + "logps/rejected": -858.0445556640625, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0328240394592285, + "rewards/margins": 14.159416198730469, + "rewards/rejected": -12.126591682434082, + "step": 4380 + }, + { + "epoch": 1.49, + "learning_rate": 2.7923958202190604e-07, + "logits/chosen": 0.6931953430175781, + "logits/rejected": 2.141230344772339, + "logps/chosen": -295.72308349609375, + "logps/rejected": -860.4769287109375, + "loss": 0.0097, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.6996008157730103, + "rewards/margins": 14.573939323425293, + "rewards/rejected": -12.87433910369873, + "step": 4390 + }, + { + "epoch": 1.5, + "learning_rate": 2.7861009694070247e-07, + "logits/chosen": 1.1773998737335205, + "logits/rejected": 3.001295566558838, + "logps/chosen": -404.4131774902344, + "logps/rejected": -484.2671813964844, + "loss": 0.0071, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.9020360708236694, + "rewards/margins": 13.961771011352539, + "rewards/rejected": -12.059735298156738, + "step": 4400 + }, + { + "epoch": 1.5, + "eval_logits/chosen": 0.6230810880661011, + "eval_logits/rejected": 2.71238374710083, + "eval_logps/chosen": -368.8385925292969, + "eval_logps/rejected": -620.0233154296875, + "eval_loss": 0.008326222188770771, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 2.0151307582855225, + "eval_rewards/margins": 13.754433631896973, + "eval_rewards/rejected": -11.739301681518555, + "eval_runtime": 268.8898, + "eval_samples_per_second": 35.33, + "eval_steps_per_second": 1.105, + "step": 4400 + }, + { + "epoch": 1.5, + "learning_rate": 2.779806118594989e-07, + "logits/chosen": 0.8335103988647461, + "logits/rejected": 2.1839518547058105, + "logps/chosen": -392.66632080078125, + "logps/rejected": -686.5694580078125, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3597278594970703, + "rewards/margins": 13.914251327514648, + "rewards/rejected": -11.554522514343262, + "step": 4410 + }, + { + "epoch": 1.5, + "learning_rate": 2.7735112677829536e-07, + "logits/chosen": 0.4120512902736664, + "logits/rejected": 2.686591625213623, + "logps/chosen": -308.36334228515625, + "logps/rejected": -576.0289306640625, + "loss": 0.019, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1544272899627686, + "rewards/margins": 13.063441276550293, + "rewards/rejected": -10.909013748168945, + "step": 4420 + }, + { + "epoch": 1.51, + "learning_rate": 2.767216416970918e-07, + "logits/chosen": 0.680091917514801, + "logits/rejected": 2.9787096977233887, + "logps/chosen": -311.6202087402344, + "logps/rejected": -484.0185546875, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9193477630615234, + "rewards/margins": 14.170591354370117, + "rewards/rejected": -12.251241683959961, + "step": 4430 + }, + { + "epoch": 1.51, + "learning_rate": 2.760921566158882e-07, + "logits/chosen": 0.24553918838500977, + "logits/rejected": 2.527836322784424, + "logps/chosen": -356.7384948730469, + "logps/rejected": -613.1387939453125, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.243509531021118, + "rewards/margins": 12.485934257507324, + "rewards/rejected": -10.242424964904785, + "step": 4440 + }, + { + "epoch": 1.51, + "learning_rate": 2.7546267153468463e-07, + "logits/chosen": 1.4051676988601685, + "logits/rejected": 2.4445412158966064, + "logps/chosen": -537.8873291015625, + "logps/rejected": -562.5985107421875, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1561007499694824, + "rewards/margins": 13.01380729675293, + "rewards/rejected": -10.857707023620605, + "step": 4450 + }, + { + "epoch": 1.52, + "learning_rate": 2.74833186453481e-07, + "logits/chosen": 1.222497582435608, + "logits/rejected": 2.6060967445373535, + "logps/chosen": -446.00848388671875, + "logps/rejected": -619.6156616210938, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.138929843902588, + "rewards/margins": 13.503074645996094, + "rewards/rejected": -11.364145278930664, + "step": 4460 + }, + { + "epoch": 1.52, + "learning_rate": 2.742037013722774e-07, + "logits/chosen": 1.1242297887802124, + "logits/rejected": 2.2714600563049316, + "logps/chosen": -385.5606994628906, + "logps/rejected": -600.1583251953125, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1965062618255615, + "rewards/margins": 13.274136543273926, + "rewards/rejected": -11.077630996704102, + "step": 4470 + }, + { + "epoch": 1.52, + "learning_rate": 2.735742162910739e-07, + "logits/chosen": 1.6765056848526, + "logits/rejected": 3.165526866912842, + "logps/chosen": -424.5562438964844, + "logps/rejected": -477.0459899902344, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1084578037261963, + "rewards/margins": 13.339756965637207, + "rewards/rejected": -11.23129940032959, + "step": 4480 + }, + { + "epoch": 1.53, + "learning_rate": 2.729447312098703e-07, + "logits/chosen": 0.5628236532211304, + "logits/rejected": 2.7374966144561768, + "logps/chosen": -480.1666564941406, + "logps/rejected": -688.9700317382812, + "loss": 0.0109, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3280537128448486, + "rewards/margins": 13.522354125976562, + "rewards/rejected": -11.194300651550293, + "step": 4490 + }, + { + "epoch": 1.53, + "learning_rate": 2.7231524612866675e-07, + "logits/chosen": 0.8318904042243958, + "logits/rejected": 2.778688907623291, + "logps/chosen": -321.25396728515625, + "logps/rejected": -660.5576171875, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0317749977111816, + "rewards/margins": 12.968721389770508, + "rewards/rejected": -10.936944961547852, + "step": 4500 + }, + { + "epoch": 1.53, + "eval_logits/chosen": 0.6407039761543274, + "eval_logits/rejected": 2.7205677032470703, + "eval_logps/chosen": -368.1263732910156, + "eval_logps/rejected": -617.782958984375, + "eval_loss": 0.008253191597759724, + "eval_rewards/accuracies": 0.9991582632064819, + "eval_rewards/chosen": 2.086351156234741, + "eval_rewards/margins": 13.601614952087402, + "eval_rewards/rejected": -11.515264511108398, + "eval_runtime": 269.3071, + "eval_samples_per_second": 35.276, + "eval_steps_per_second": 1.103, + "step": 4500 + }, + { + "epoch": 1.53, + "learning_rate": 2.7168576104746317e-07, + "logits/chosen": 0.6592813730239868, + "logits/rejected": 2.668743848800659, + "logps/chosen": -340.2607116699219, + "logps/rejected": -683.1118774414062, + "loss": 0.011, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.177586555480957, + "rewards/margins": 14.914339065551758, + "rewards/rejected": -12.736750602722168, + "step": 4510 + }, + { + "epoch": 1.54, + "learning_rate": 2.710562759662596e-07, + "logits/chosen": 0.7379637956619263, + "logits/rejected": 2.5914015769958496, + "logps/chosen": -390.2478942871094, + "logps/rejected": -640.7365112304688, + "loss": 0.0057, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.2496771812438965, + "rewards/margins": 13.077062606811523, + "rewards/rejected": -10.827384948730469, + "step": 4520 + }, + { + "epoch": 1.54, + "learning_rate": 2.70426790885056e-07, + "logits/chosen": 0.9463948011398315, + "logits/rejected": 2.068814754486084, + "logps/chosen": -361.550048828125, + "logps/rejected": -774.712158203125, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1057486534118652, + "rewards/margins": 15.126858711242676, + "rewards/rejected": -13.021112442016602, + "step": 4530 + }, + { + "epoch": 1.54, + "learning_rate": 2.6979730580385244e-07, + "logits/chosen": 1.441889762878418, + "logits/rejected": 2.6717936992645264, + "logps/chosen": -447.90020751953125, + "logps/rejected": -669.8250732421875, + "loss": 0.0139, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.5580735206604004, + "rewards/margins": 12.329019546508789, + "rewards/rejected": -10.770946502685547, + "step": 4540 + }, + { + "epoch": 1.55, + "learning_rate": 2.6916782072264886e-07, + "logits/chosen": 0.7209632396697998, + "logits/rejected": 2.7818751335144043, + "logps/chosen": -372.5408630371094, + "logps/rejected": -507.08544921875, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4891321659088135, + "rewards/margins": 13.217214584350586, + "rewards/rejected": -10.728082656860352, + "step": 4550 + }, + { + "epoch": 1.55, + "learning_rate": 2.685383356414453e-07, + "logits/chosen": 1.1761372089385986, + "logits/rejected": 2.7067036628723145, + "logps/chosen": -387.07568359375, + "logps/rejected": -529.3436279296875, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7527261972427368, + "rewards/margins": 14.56840705871582, + "rewards/rejected": -12.815679550170898, + "step": 4560 + }, + { + "epoch": 1.55, + "learning_rate": 2.679088505602417e-07, + "logits/chosen": 0.8417810201644897, + "logits/rejected": 2.6834988594055176, + "logps/chosen": -416.536865234375, + "logps/rejected": -719.8585205078125, + "loss": 0.0086, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.182037115097046, + "rewards/margins": 14.083086967468262, + "rewards/rejected": -11.901049613952637, + "step": 4570 + }, + { + "epoch": 1.56, + "learning_rate": 2.6727936547903813e-07, + "logits/chosen": 0.8248815536499023, + "logits/rejected": 2.0804429054260254, + "logps/chosen": -317.86749267578125, + "logps/rejected": -642.9418334960938, + "loss": 0.0093, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9505695104599, + "rewards/margins": 12.340843200683594, + "rewards/rejected": -10.390274047851562, + "step": 4580 + }, + { + "epoch": 1.56, + "learning_rate": 2.6664988039783455e-07, + "logits/chosen": 1.028735876083374, + "logits/rejected": 3.0316011905670166, + "logps/chosen": -300.3289794921875, + "logps/rejected": -475.13543701171875, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5529836416244507, + "rewards/margins": 12.989659309387207, + "rewards/rejected": -11.436675071716309, + "step": 4590 + }, + { + "epoch": 1.56, + "learning_rate": 2.66020395316631e-07, + "logits/chosen": 1.1963709592819214, + "logits/rejected": 3.01017689704895, + "logps/chosen": -311.89825439453125, + "logps/rejected": -495.96893310546875, + "loss": 0.0054, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.5140174627304077, + "rewards/margins": 13.036733627319336, + "rewards/rejected": -11.522716522216797, + "step": 4600 + }, + { + "epoch": 1.56, + "eval_logits/chosen": 0.6099021434783936, + "eval_logits/rejected": 2.7246458530426025, + "eval_logps/chosen": -369.0596618652344, + "eval_logps/rejected": -616.05419921875, + "eval_loss": 0.008312725462019444, + "eval_rewards/accuracies": 0.997474730014801, + "eval_rewards/chosen": 1.9930243492126465, + "eval_rewards/margins": 13.335411071777344, + "eval_rewards/rejected": -11.342387199401855, + "eval_runtime": 268.6015, + "eval_samples_per_second": 35.368, + "eval_steps_per_second": 1.106, + "step": 4600 + }, + { + "epoch": 1.57, + "learning_rate": 2.6539091023542745e-07, + "logits/chosen": 1.0765217542648315, + "logits/rejected": 2.4071598052978516, + "logps/chosen": -315.82122802734375, + "logps/rejected": -714.0283203125, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.855968952178955, + "rewards/margins": 15.264370918273926, + "rewards/rejected": -12.408401489257812, + "step": 4610 + }, + { + "epoch": 1.57, + "learning_rate": 2.647614251542238e-07, + "logits/chosen": 1.5065996646881104, + "logits/rejected": 2.300039052963257, + "logps/chosen": -351.83367919921875, + "logps/rejected": -720.9744873046875, + "loss": 0.013, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7324097156524658, + "rewards/margins": 13.242719650268555, + "rewards/rejected": -11.510311126708984, + "step": 4620 + }, + { + "epoch": 1.57, + "learning_rate": 2.6413194007302024e-07, + "logits/chosen": 1.575552225112915, + "logits/rejected": 2.6150310039520264, + "logps/chosen": -331.12310791015625, + "logps/rejected": -626.9486694335938, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5365428924560547, + "rewards/margins": 11.85051441192627, + "rewards/rejected": -10.313970565795898, + "step": 4630 + }, + { + "epoch": 1.58, + "learning_rate": 2.6350245499181666e-07, + "logits/chosen": 1.4718683958053589, + "logits/rejected": 2.4823708534240723, + "logps/chosen": -396.23748779296875, + "logps/rejected": -616.1155395507812, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2771496772766113, + "rewards/margins": 13.974538803100586, + "rewards/rejected": -11.697389602661133, + "step": 4640 + }, + { + "epoch": 1.58, + "learning_rate": 2.628729699106131e-07, + "logits/chosen": 0.8800075650215149, + "logits/rejected": 3.012760877609253, + "logps/chosen": -323.5577087402344, + "logps/rejected": -583.1554565429688, + "loss": 0.0095, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.246222496032715, + "rewards/margins": 14.201080322265625, + "rewards/rejected": -11.95485782623291, + "step": 4650 + }, + { + "epoch": 1.58, + "learning_rate": 2.6224348482940956e-07, + "logits/chosen": 0.7633311152458191, + "logits/rejected": 2.2152318954467773, + "logps/chosen": -466.2403869628906, + "logps/rejected": -722.082763671875, + "loss": 0.0159, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.236987829208374, + "rewards/margins": 13.460619926452637, + "rewards/rejected": -11.2236328125, + "step": 4660 + }, + { + "epoch": 1.59, + "learning_rate": 2.61613999748206e-07, + "logits/chosen": 1.5745208263397217, + "logits/rejected": 3.095592975616455, + "logps/chosen": -404.6357116699219, + "logps/rejected": -453.40130615234375, + "loss": 0.0079, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.7302166223526, + "rewards/margins": 13.1705961227417, + "rewards/rejected": -11.44037914276123, + "step": 4670 + }, + { + "epoch": 1.59, + "learning_rate": 2.609845146670024e-07, + "logits/chosen": 0.7493244409561157, + "logits/rejected": 2.8759987354278564, + "logps/chosen": -296.1628112792969, + "logps/rejected": -457.446044921875, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.227241277694702, + "rewards/margins": 13.656562805175781, + "rewards/rejected": -11.4293212890625, + "step": 4680 + }, + { + "epoch": 1.59, + "learning_rate": 2.603550295857988e-07, + "logits/chosen": 0.9803324937820435, + "logits/rejected": 2.721207618713379, + "logps/chosen": -336.9967346191406, + "logps/rejected": -632.5643920898438, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8605949878692627, + "rewards/margins": 12.091894149780273, + "rewards/rejected": -10.23129940032959, + "step": 4690 + }, + { + "epoch": 1.6, + "learning_rate": 2.597255445045952e-07, + "logits/chosen": 1.1940078735351562, + "logits/rejected": 3.1348912715911865, + "logps/chosen": -424.3228454589844, + "logps/rejected": -489.9722595214844, + "loss": 0.0116, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4344812631607056, + "rewards/margins": 13.698277473449707, + "rewards/rejected": -12.263795852661133, + "step": 4700 + }, + { + "epoch": 1.6, + "eval_logits/chosen": 0.6007880568504333, + "eval_logits/rejected": 2.7200093269348145, + "eval_logps/chosen": -369.6922607421875, + "eval_logps/rejected": -615.797119140625, + "eval_loss": 0.008013113401830196, + "eval_rewards/accuracies": 0.997474730014801, + "eval_rewards/chosen": 1.929763913154602, + "eval_rewards/margins": 13.246444702148438, + "eval_rewards/rejected": -11.316682815551758, + "eval_runtime": 268.0276, + "eval_samples_per_second": 35.444, + "eval_steps_per_second": 1.108, + "step": 4700 + }, + { + "epoch": 1.6, + "learning_rate": 2.590960594233916e-07, + "logits/chosen": 1.1567070484161377, + "logits/rejected": 2.7277112007141113, + "logps/chosen": -341.4814147949219, + "logps/rejected": -638.3734130859375, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9156405925750732, + "rewards/margins": 12.364795684814453, + "rewards/rejected": -10.449155807495117, + "step": 4710 + }, + { + "epoch": 1.6, + "learning_rate": 2.584665743421881e-07, + "logits/chosen": 0.7439510822296143, + "logits/rejected": 3.0841832160949707, + "logps/chosen": -308.150634765625, + "logps/rejected": -454.5015563964844, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.00886869430542, + "rewards/margins": 13.200039863586426, + "rewards/rejected": -11.191170692443848, + "step": 4720 + }, + { + "epoch": 1.61, + "learning_rate": 2.578370892609845e-07, + "logits/chosen": 1.3363820314407349, + "logits/rejected": 2.9260199069976807, + "logps/chosen": -362.2496032714844, + "logps/rejected": -518.860107421875, + "loss": 0.0057, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.637078046798706, + "rewards/margins": 13.22400188446045, + "rewards/rejected": -11.58692455291748, + "step": 4730 + }, + { + "epoch": 1.61, + "learning_rate": 2.5720760417978095e-07, + "logits/chosen": 1.194959282875061, + "logits/rejected": 2.6666481494903564, + "logps/chosen": -426.196533203125, + "logps/rejected": -553.5843505859375, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5456111431121826, + "rewards/margins": 11.892793655395508, + "rewards/rejected": -10.34718132019043, + "step": 4740 + }, + { + "epoch": 1.61, + "learning_rate": 2.5657811909857737e-07, + "logits/chosen": 1.7758777141571045, + "logits/rejected": 2.655397653579712, + "logps/chosen": -411.11578369140625, + "logps/rejected": -534.5379028320312, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7537777423858643, + "rewards/margins": 11.740898132324219, + "rewards/rejected": -9.987119674682617, + "step": 4750 + }, + { + "epoch": 1.62, + "learning_rate": 2.559486340173738e-07, + "logits/chosen": 0.6086001396179199, + "logits/rejected": 2.924757242202759, + "logps/chosen": -326.00897216796875, + "logps/rejected": -520.2870483398438, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8088676929473877, + "rewards/margins": 14.248144149780273, + "rewards/rejected": -12.439276695251465, + "step": 4760 + }, + { + "epoch": 1.62, + "learning_rate": 2.5531914893617016e-07, + "logits/chosen": 1.1811004877090454, + "logits/rejected": 2.159785032272339, + "logps/chosen": -328.2763977050781, + "logps/rejected": -801.7093505859375, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4686853885650635, + "rewards/margins": 12.940330505371094, + "rewards/rejected": -11.471644401550293, + "step": 4770 + }, + { + "epoch": 1.62, + "learning_rate": 2.5468966385496664e-07, + "logits/chosen": 1.3045234680175781, + "logits/rejected": 2.996737241744995, + "logps/chosen": -318.265625, + "logps/rejected": -461.1734313964844, + "loss": 0.0084, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.1198580265045166, + "rewards/margins": 13.443967819213867, + "rewards/rejected": -11.32411003112793, + "step": 4780 + }, + { + "epoch": 1.63, + "learning_rate": 2.5406017877376306e-07, + "logits/chosen": 1.559531807899475, + "logits/rejected": 2.582435131072998, + "logps/chosen": -372.0412902832031, + "logps/rejected": -553.0614624023438, + "loss": 0.0092, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4875469207763672, + "rewards/margins": 13.850107192993164, + "rewards/rejected": -12.362558364868164, + "step": 4790 + }, + { + "epoch": 1.63, + "learning_rate": 2.534306936925595e-07, + "logits/chosen": 1.2863677740097046, + "logits/rejected": 2.7517781257629395, + "logps/chosen": -382.712890625, + "logps/rejected": -572.390869140625, + "loss": 0.0116, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2039921283721924, + "rewards/margins": 13.863197326660156, + "rewards/rejected": -11.659204483032227, + "step": 4800 + }, + { + "epoch": 1.63, + "eval_logits/chosen": 0.5697704553604126, + "eval_logits/rejected": 2.6917316913604736, + "eval_logps/chosen": -370.18133544921875, + "eval_logps/rejected": -617.3153686523438, + "eval_loss": 0.007416225038468838, + "eval_rewards/accuracies": 0.997474730014801, + "eval_rewards/chosen": 1.8808573484420776, + "eval_rewards/margins": 13.349367141723633, + "eval_rewards/rejected": -11.46850872039795, + "eval_runtime": 267.3405, + "eval_samples_per_second": 35.535, + "eval_steps_per_second": 1.111, + "step": 4800 + }, + { + "epoch": 1.63, + "learning_rate": 2.528012086113559e-07, + "logits/chosen": 1.2882909774780273, + "logits/rejected": 2.163443088531494, + "logps/chosen": -332.18695068359375, + "logps/rejected": -742.5114135742188, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9539035558700562, + "rewards/margins": 12.516752243041992, + "rewards/rejected": -10.562848091125488, + "step": 4810 + }, + { + "epoch": 1.64, + "learning_rate": 2.5217172353015233e-07, + "logits/chosen": 0.6577471494674683, + "logits/rejected": 2.275991916656494, + "logps/chosen": -381.43731689453125, + "logps/rejected": -725.3045043945312, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.548079252243042, + "rewards/margins": 16.103429794311523, + "rewards/rejected": -13.555349349975586, + "step": 4820 + }, + { + "epoch": 1.64, + "learning_rate": 2.5154223844894875e-07, + "logits/chosen": 1.2324239015579224, + "logits/rejected": 3.194679021835327, + "logps/chosen": -413.9017639160156, + "logps/rejected": -476.4483947753906, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6373205184936523, + "rewards/margins": 12.455896377563477, + "rewards/rejected": -10.818574905395508, + "step": 4830 + }, + { + "epoch": 1.65, + "learning_rate": 2.509127533677452e-07, + "logits/chosen": 0.9907386898994446, + "logits/rejected": 2.874156951904297, + "logps/chosen": -389.5166320800781, + "logps/rejected": -418.999267578125, + "loss": 0.013, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1311802864074707, + "rewards/margins": 13.799692153930664, + "rewards/rejected": -11.668512344360352, + "step": 4840 + }, + { + "epoch": 1.65, + "learning_rate": 2.502832682865416e-07, + "logits/chosen": 0.8386165499687195, + "logits/rejected": 2.858229160308838, + "logps/chosen": -393.86126708984375, + "logps/rejected": -599.6715698242188, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1954259872436523, + "rewards/margins": 12.808425903320312, + "rewards/rejected": -10.613000869750977, + "step": 4850 + }, + { + "epoch": 1.65, + "learning_rate": 2.49653783205338e-07, + "logits/chosen": 1.4464737176895142, + "logits/rejected": 2.7185616493225098, + "logps/chosen": -326.30816650390625, + "logps/rejected": -562.0698852539062, + "loss": 0.0074, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.9854040145874023, + "rewards/margins": 13.104391098022461, + "rewards/rejected": -11.118986129760742, + "step": 4860 + }, + { + "epoch": 1.66, + "learning_rate": 2.4902429812413444e-07, + "logits/chosen": 0.9704988598823547, + "logits/rejected": 2.861062526702881, + "logps/chosen": -322.04351806640625, + "logps/rejected": -590.7166137695312, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.773932695388794, + "rewards/margins": 13.489224433898926, + "rewards/rejected": -11.715291976928711, + "step": 4870 + }, + { + "epoch": 1.66, + "learning_rate": 2.4839481304293086e-07, + "logits/chosen": 0.7275189161300659, + "logits/rejected": 2.4044909477233887, + "logps/chosen": -359.2979431152344, + "logps/rejected": -687.754150390625, + "loss": 0.0175, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7108738422393799, + "rewards/margins": 13.818089485168457, + "rewards/rejected": -12.107215881347656, + "step": 4880 + }, + { + "epoch": 1.66, + "learning_rate": 2.477653279617273e-07, + "logits/chosen": 1.3394114971160889, + "logits/rejected": 2.08496356010437, + "logps/chosen": -431.34332275390625, + "logps/rejected": -859.7326049804688, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0375823974609375, + "rewards/margins": 14.49980354309082, + "rewards/rejected": -12.462221145629883, + "step": 4890 + }, + { + "epoch": 1.67, + "learning_rate": 2.471358428805237e-07, + "logits/chosen": 1.1097261905670166, + "logits/rejected": 2.604459285736084, + "logps/chosen": -395.5567321777344, + "logps/rejected": -643.832763671875, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0858750343322754, + "rewards/margins": 13.381146430969238, + "rewards/rejected": -11.295271873474121, + "step": 4900 + }, + { + "epoch": 1.67, + "eval_logits/chosen": 0.5797955989837646, + "eval_logits/rejected": 2.6860883235931396, + "eval_logps/chosen": -369.99676513671875, + "eval_logps/rejected": -621.4749145507812, + "eval_loss": 0.007327604573220015, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 1.8993133306503296, + "eval_rewards/margins": 13.783766746520996, + "eval_rewards/rejected": -11.884452819824219, + "eval_runtime": 267.4053, + "eval_samples_per_second": 35.527, + "eval_steps_per_second": 1.111, + "step": 4900 + }, + { + "epoch": 1.67, + "learning_rate": 2.4650635779932013e-07, + "logits/chosen": 1.3302220106124878, + "logits/rejected": 2.934269666671753, + "logps/chosen": -337.15960693359375, + "logps/rejected": -503.8389587402344, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.697217345237732, + "rewards/margins": 14.28808879852295, + "rewards/rejected": -12.59087085723877, + "step": 4910 + }, + { + "epoch": 1.67, + "learning_rate": 2.4587687271811656e-07, + "logits/chosen": 1.211080551147461, + "logits/rejected": 2.21518874168396, + "logps/chosen": -342.48504638671875, + "logps/rejected": -734.8573608398438, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1308863162994385, + "rewards/margins": 13.767776489257812, + "rewards/rejected": -11.63688850402832, + "step": 4920 + }, + { + "epoch": 1.68, + "learning_rate": 2.45247387636913e-07, + "logits/chosen": 0.7533133625984192, + "logits/rejected": 2.5293025970458984, + "logps/chosen": -349.9305114746094, + "logps/rejected": -584.0184936523438, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0276365280151367, + "rewards/margins": 13.492365837097168, + "rewards/rejected": -11.464729309082031, + "step": 4930 + }, + { + "epoch": 1.68, + "learning_rate": 2.446179025557094e-07, + "logits/chosen": 1.0548173189163208, + "logits/rejected": 2.1759402751922607, + "logps/chosen": -493.15142822265625, + "logps/rejected": -702.56591796875, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9155597686767578, + "rewards/margins": 13.1171236038208, + "rewards/rejected": -11.201563835144043, + "step": 4940 + }, + { + "epoch": 1.68, + "learning_rate": 2.439884174745059e-07, + "logits/chosen": 0.9858635663986206, + "logits/rejected": 2.9378695487976074, + "logps/chosen": -381.10833740234375, + "logps/rejected": -520.8736572265625, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3787133693695068, + "rewards/margins": 13.84961986541748, + "rewards/rejected": -12.470907211303711, + "step": 4950 + }, + { + "epoch": 1.69, + "learning_rate": 2.4335893239330225e-07, + "logits/chosen": 1.2095704078674316, + "logits/rejected": 2.5496010780334473, + "logps/chosen": -418.56024169921875, + "logps/rejected": -585.3426513671875, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7939611673355103, + "rewards/margins": 12.407074928283691, + "rewards/rejected": -10.613114356994629, + "step": 4960 + }, + { + "epoch": 1.69, + "learning_rate": 2.4272944731209867e-07, + "logits/chosen": 0.9482321739196777, + "logits/rejected": 2.6138076782226562, + "logps/chosen": -396.30633544921875, + "logps/rejected": -573.6236572265625, + "loss": 0.0091, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.186265230178833, + "rewards/margins": 15.410557746887207, + "rewards/rejected": -13.224291801452637, + "step": 4970 + }, + { + "epoch": 1.69, + "learning_rate": 2.4209996223089514e-07, + "logits/chosen": 1.3051884174346924, + "logits/rejected": 2.48286771774292, + "logps/chosen": -393.8180847167969, + "logps/rejected": -603.3321533203125, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6906706094741821, + "rewards/margins": 13.082013130187988, + "rewards/rejected": -11.39134407043457, + "step": 4980 + }, + { + "epoch": 1.7, + "learning_rate": 2.4147047714969157e-07, + "logits/chosen": 0.8893558382987976, + "logits/rejected": 2.5944838523864746, + "logps/chosen": -335.33447265625, + "logps/rejected": -661.49755859375, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2299575805664062, + "rewards/margins": 12.682855606079102, + "rewards/rejected": -10.452896118164062, + "step": 4990 + }, + { + "epoch": 1.7, + "learning_rate": 2.4084099206848794e-07, + "logits/chosen": 1.099838137626648, + "logits/rejected": 2.7704567909240723, + "logps/chosen": -321.14569091796875, + "logps/rejected": -554.14697265625, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2605462074279785, + "rewards/margins": 13.753137588500977, + "rewards/rejected": -11.492591857910156, + "step": 5000 + }, + { + "epoch": 1.7, + "eval_logits/chosen": 0.5411103367805481, + "eval_logits/rejected": 2.6435320377349854, + "eval_logps/chosen": -370.23480224609375, + "eval_logps/rejected": -625.662353515625, + "eval_loss": 0.007171071134507656, + "eval_rewards/accuracies": 0.997474730014801, + "eval_rewards/chosen": 1.8755117654800415, + "eval_rewards/margins": 14.178715705871582, + "eval_rewards/rejected": -12.303203582763672, + "eval_runtime": 268.4746, + "eval_samples_per_second": 35.385, + "eval_steps_per_second": 1.106, + "step": 5000 + }, + { + "epoch": 1.7, + "learning_rate": 2.402115069872844e-07, + "logits/chosen": 1.7221934795379639, + "logits/rejected": 2.8443284034729004, + "logps/chosen": -355.40570068359375, + "logps/rejected": -497.22705078125, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.781616449356079, + "rewards/margins": 15.23070240020752, + "rewards/rejected": -13.44908618927002, + "step": 5010 + }, + { + "epoch": 1.71, + "learning_rate": 2.3958202190608084e-07, + "logits/chosen": 0.4256567358970642, + "logits/rejected": 2.852240800857544, + "logps/chosen": -388.9455261230469, + "logps/rejected": -534.2367553710938, + "loss": 0.01, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.100231170654297, + "rewards/margins": 13.96013355255127, + "rewards/rejected": -11.859903335571289, + "step": 5020 + }, + { + "epoch": 1.71, + "learning_rate": 2.3895253682487726e-07, + "logits/chosen": 1.6765273809432983, + "logits/rejected": 2.6882596015930176, + "logps/chosen": -330.9751281738281, + "logps/rejected": -621.0322265625, + "loss": 0.017, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1029813289642334, + "rewards/margins": 12.23882007598877, + "rewards/rejected": -10.13583755493164, + "step": 5030 + }, + { + "epoch": 1.71, + "learning_rate": 2.3832305174367368e-07, + "logits/chosen": 1.4308533668518066, + "logits/rejected": 2.5526671409606934, + "logps/chosen": -383.99676513671875, + "logps/rejected": -571.560791015625, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.778713583946228, + "rewards/margins": 14.228243827819824, + "rewards/rejected": -12.449530601501465, + "step": 5040 + }, + { + "epoch": 1.72, + "learning_rate": 2.3769356666247008e-07, + "logits/chosen": 0.7223809361457825, + "logits/rejected": 2.534702777862549, + "logps/chosen": -369.4510803222656, + "logps/rejected": -601.6087646484375, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5519216060638428, + "rewards/margins": 13.02763557434082, + "rewards/rejected": -11.475714683532715, + "step": 5050 + }, + { + "epoch": 1.72, + "learning_rate": 2.370640815812665e-07, + "logits/chosen": 1.0404523611068726, + "logits/rejected": 2.5804059505462646, + "logps/chosen": -396.8551330566406, + "logps/rejected": -581.9253540039062, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.147939682006836, + "rewards/margins": 14.98511028289795, + "rewards/rejected": -12.837170600891113, + "step": 5060 + }, + { + "epoch": 1.72, + "learning_rate": 2.3643459650006295e-07, + "logits/chosen": 0.8765469789505005, + "logits/rejected": 3.199450969696045, + "logps/chosen": -391.029296875, + "logps/rejected": -462.06317138671875, + "loss": 0.0097, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0410709381103516, + "rewards/margins": 13.092700004577637, + "rewards/rejected": -11.051628112792969, + "step": 5070 + }, + { + "epoch": 1.73, + "learning_rate": 2.3580511141885937e-07, + "logits/chosen": 1.0518453121185303, + "logits/rejected": 2.7533373832702637, + "logps/chosen": -371.9391174316406, + "logps/rejected": -500.344482421875, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1165008544921875, + "rewards/margins": 14.174306869506836, + "rewards/rejected": -12.057806015014648, + "step": 5080 + }, + { + "epoch": 1.73, + "learning_rate": 2.3517562633765577e-07, + "logits/chosen": 1.240299940109253, + "logits/rejected": 2.9455935955047607, + "logps/chosen": -300.67706298828125, + "logps/rejected": -484.98553466796875, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8077754974365234, + "rewards/margins": 13.667498588562012, + "rewards/rejected": -11.859723091125488, + "step": 5090 + }, + { + "epoch": 1.73, + "learning_rate": 2.3454614125645222e-07, + "logits/chosen": 0.7239997982978821, + "logits/rejected": 2.22920823097229, + "logps/chosen": -339.1505432128906, + "logps/rejected": -665.8603515625, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.847394347190857, + "rewards/margins": 13.439477920532227, + "rewards/rejected": -11.592084884643555, + "step": 5100 + }, + { + "epoch": 1.73, + "eval_logits/chosen": 0.5392878651618958, + "eval_logits/rejected": 2.6526615619659424, + "eval_logps/chosen": -369.70660400390625, + "eval_logps/rejected": -621.6979370117188, + "eval_loss": 0.0076078129932284355, + "eval_rewards/accuracies": 0.9957912564277649, + "eval_rewards/chosen": 1.9283288717269897, + "eval_rewards/margins": 13.835088729858398, + "eval_rewards/rejected": -11.906759262084961, + "eval_runtime": 267.4123, + "eval_samples_per_second": 35.526, + "eval_steps_per_second": 1.111, + "step": 5100 + }, + { + "epoch": 1.74, + "learning_rate": 2.3391665617524864e-07, + "logits/chosen": 0.5402101278305054, + "logits/rejected": 2.0428051948547363, + "logps/chosen": -446.457763671875, + "logps/rejected": -828.9158325195312, + "loss": 0.008, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1444571018218994, + "rewards/margins": 14.292704582214355, + "rewards/rejected": -12.148245811462402, + "step": 5110 + }, + { + "epoch": 1.74, + "learning_rate": 2.3328717109404506e-07, + "logits/chosen": 1.1088473796844482, + "logits/rejected": 3.142504930496216, + "logps/chosen": -327.00152587890625, + "logps/rejected": -501.23016357421875, + "loss": 0.0099, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.154792070388794, + "rewards/margins": 14.1039457321167, + "rewards/rejected": -11.9491548538208, + "step": 5120 + }, + { + "epoch": 1.74, + "learning_rate": 2.3265768601284149e-07, + "logits/chosen": 1.1828131675720215, + "logits/rejected": 2.618335247039795, + "logps/chosen": -461.40521240234375, + "logps/rejected": -630.9039916992188, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.291642904281616, + "rewards/margins": 13.117799758911133, + "rewards/rejected": -10.826155662536621, + "step": 5130 + }, + { + "epoch": 1.75, + "learning_rate": 2.320282009316379e-07, + "logits/chosen": 0.49540406465530396, + "logits/rejected": 2.015763282775879, + "logps/chosen": -457.4979553222656, + "logps/rejected": -764.1224975585938, + "loss": 0.0066, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.3074951171875, + "rewards/margins": 13.635294914245605, + "rewards/rejected": -11.327799797058105, + "step": 5140 + }, + { + "epoch": 1.75, + "learning_rate": 2.3139871585043433e-07, + "logits/chosen": 1.3483718633651733, + "logits/rejected": 2.9165961742401123, + "logps/chosen": -384.2827453613281, + "logps/rejected": -517.1640625, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0184836387634277, + "rewards/margins": 14.222018241882324, + "rewards/rejected": -12.203533172607422, + "step": 5150 + }, + { + "epoch": 1.75, + "learning_rate": 2.3076923076923078e-07, + "logits/chosen": 0.9851890802383423, + "logits/rejected": 2.5453128814697266, + "logps/chosen": -472.58721923828125, + "logps/rejected": -554.181884765625, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1495094299316406, + "rewards/margins": 14.743972778320312, + "rewards/rejected": -12.594462394714355, + "step": 5160 + }, + { + "epoch": 1.76, + "learning_rate": 2.3013974568802718e-07, + "logits/chosen": 1.0264804363250732, + "logits/rejected": 2.838040590286255, + "logps/chosen": -378.90606689453125, + "logps/rejected": -520.4061279296875, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.653881788253784, + "rewards/margins": 14.299891471862793, + "rewards/rejected": -11.64600944519043, + "step": 5170 + }, + { + "epoch": 1.76, + "learning_rate": 2.295102606068236e-07, + "logits/chosen": 0.6610434055328369, + "logits/rejected": 2.055659055709839, + "logps/chosen": -336.66229248046875, + "logps/rejected": -809.7601318359375, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5673797130584717, + "rewards/margins": 14.333930969238281, + "rewards/rejected": -12.766549110412598, + "step": 5180 + }, + { + "epoch": 1.76, + "learning_rate": 2.2888077552562005e-07, + "logits/chosen": 0.7072926163673401, + "logits/rejected": 2.731778621673584, + "logps/chosen": -338.4165954589844, + "logps/rejected": -631.5391845703125, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0703439712524414, + "rewards/margins": 13.521017074584961, + "rewards/rejected": -11.450674057006836, + "step": 5190 + }, + { + "epoch": 1.77, + "learning_rate": 2.2825129044441647e-07, + "logits/chosen": 0.7150042653083801, + "logits/rejected": 3.074216842651367, + "logps/chosen": -312.5461120605469, + "logps/rejected": -556.0842895507812, + "loss": 0.0065, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.9952480792999268, + "rewards/margins": 13.600128173828125, + "rewards/rejected": -11.604883193969727, + "step": 5200 + }, + { + "epoch": 1.77, + "eval_logits/chosen": 0.5763381123542786, + "eval_logits/rejected": 2.6790342330932617, + "eval_logps/chosen": -369.119873046875, + "eval_logps/rejected": -621.7357177734375, + "eval_loss": 0.007353052031248808, + "eval_rewards/accuracies": 0.9949495196342468, + "eval_rewards/chosen": 1.9870065450668335, + "eval_rewards/margins": 13.897536277770996, + "eval_rewards/rejected": -11.910529136657715, + "eval_runtime": 267.7726, + "eval_samples_per_second": 35.478, + "eval_steps_per_second": 1.109, + "step": 5200 + }, + { + "epoch": 1.77, + "learning_rate": 2.2762180536321287e-07, + "logits/chosen": 1.0697767734527588, + "logits/rejected": 2.543888807296753, + "logps/chosen": -324.13116455078125, + "logps/rejected": -668.995361328125, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9455013275146484, + "rewards/margins": 12.813148498535156, + "rewards/rejected": -10.867646217346191, + "step": 5210 + }, + { + "epoch": 1.77, + "learning_rate": 2.2699232028200932e-07, + "logits/chosen": 1.0008208751678467, + "logits/rejected": 2.202432155609131, + "logps/chosen": -388.22332763671875, + "logps/rejected": -730.78564453125, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0636024475097656, + "rewards/margins": 12.839284896850586, + "rewards/rejected": -10.77568531036377, + "step": 5220 + }, + { + "epoch": 1.78, + "learning_rate": 2.2636283520080574e-07, + "logits/chosen": 1.6768519878387451, + "logits/rejected": 2.8521502017974854, + "logps/chosen": -432.42132568359375, + "logps/rejected": -569.5505981445312, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9164817333221436, + "rewards/margins": 13.948224067687988, + "rewards/rejected": -12.031743049621582, + "step": 5230 + }, + { + "epoch": 1.78, + "learning_rate": 2.2573335011960216e-07, + "logits/chosen": 0.578073263168335, + "logits/rejected": 2.474010467529297, + "logps/chosen": -319.8635559082031, + "logps/rejected": -753.2205200195312, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.47343111038208, + "rewards/margins": 15.181828498840332, + "rewards/rejected": -12.708395957946777, + "step": 5240 + }, + { + "epoch": 1.78, + "learning_rate": 2.2510386503839856e-07, + "logits/chosen": 1.2054836750030518, + "logits/rejected": 2.4689555168151855, + "logps/chosen": -330.71197509765625, + "logps/rejected": -639.7576293945312, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0893828868865967, + "rewards/margins": 14.316856384277344, + "rewards/rejected": -12.227472305297852, + "step": 5250 + }, + { + "epoch": 1.79, + "learning_rate": 2.24474379957195e-07, + "logits/chosen": 0.8659073710441589, + "logits/rejected": 2.069164514541626, + "logps/chosen": -472.73992919921875, + "logps/rejected": -751.3345336914062, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6958650350570679, + "rewards/margins": 14.968708992004395, + "rewards/rejected": -13.272845268249512, + "step": 5260 + }, + { + "epoch": 1.79, + "learning_rate": 2.2384489487599143e-07, + "logits/chosen": 0.7720758318901062, + "logits/rejected": 2.473846912384033, + "logps/chosen": -453.7737731933594, + "logps/rejected": -785.3414306640625, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9351928234100342, + "rewards/margins": 15.376632690429688, + "rewards/rejected": -13.441439628601074, + "step": 5270 + }, + { + "epoch": 1.79, + "learning_rate": 2.2321540979478783e-07, + "logits/chosen": 0.5396759510040283, + "logits/rejected": 2.619581699371338, + "logps/chosen": -372.71575927734375, + "logps/rejected": -683.4382934570312, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3975188732147217, + "rewards/margins": 15.780477523803711, + "rewards/rejected": -14.382959365844727, + "step": 5280 + }, + { + "epoch": 1.8, + "learning_rate": 2.2258592471358428e-07, + "logits/chosen": 1.0776684284210205, + "logits/rejected": 2.364051342010498, + "logps/chosen": -315.5631103515625, + "logps/rejected": -682.8671875, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5904780626296997, + "rewards/margins": 13.19941520690918, + "rewards/rejected": -11.608936309814453, + "step": 5290 + }, + { + "epoch": 1.8, + "learning_rate": 2.219564396323807e-07, + "logits/chosen": 1.0182130336761475, + "logits/rejected": 2.3595683574676514, + "logps/chosen": -320.3334045410156, + "logps/rejected": -739.2911376953125, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6678212881088257, + "rewards/margins": 14.77850341796875, + "rewards/rejected": -13.110682487487793, + "step": 5300 + }, + { + "epoch": 1.8, + "eval_logits/chosen": 0.5392746329307556, + "eval_logits/rejected": 2.6264493465423584, + "eval_logps/chosen": -370.9958801269531, + "eval_logps/rejected": -627.23095703125, + "eval_loss": 0.006811817176640034, + "eval_rewards/accuracies": 0.9957912564277649, + "eval_rewards/chosen": 1.799401879310608, + "eval_rewards/margins": 14.259466171264648, + "eval_rewards/rejected": -12.460063934326172, + "eval_runtime": 268.9812, + "eval_samples_per_second": 35.318, + "eval_steps_per_second": 1.104, + "step": 5300 + }, + { + "epoch": 1.8, + "learning_rate": 2.2132695455117712e-07, + "logits/chosen": 1.2998424768447876, + "logits/rejected": 3.0192737579345703, + "logps/chosen": -341.30340576171875, + "logps/rejected": -509.3768005371094, + "loss": 0.005, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.990985631942749, + "rewards/margins": 14.92119026184082, + "rewards/rejected": -12.930203437805176, + "step": 5310 + }, + { + "epoch": 1.81, + "learning_rate": 2.2069746946997355e-07, + "logits/chosen": 0.29328662157058716, + "logits/rejected": 3.271939516067505, + "logps/chosen": -281.95684814453125, + "logps/rejected": -424.2947692871094, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4099323749542236, + "rewards/margins": 13.121679306030273, + "rewards/rejected": -11.711746215820312, + "step": 5320 + }, + { + "epoch": 1.81, + "learning_rate": 2.2006798438876997e-07, + "logits/chosen": 1.244208574295044, + "logits/rejected": 2.4942610263824463, + "logps/chosen": -451.87091064453125, + "logps/rejected": -706.4019165039062, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5382120609283447, + "rewards/margins": 12.513381004333496, + "rewards/rejected": -10.97516918182373, + "step": 5330 + }, + { + "epoch": 1.82, + "learning_rate": 2.194384993075664e-07, + "logits/chosen": 0.7272036075592041, + "logits/rejected": 2.511995792388916, + "logps/chosen": -298.5057678222656, + "logps/rejected": -636.6715087890625, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.117550849914551, + "rewards/margins": 14.677660942077637, + "rewards/rejected": -12.560112953186035, + "step": 5340 + }, + { + "epoch": 1.82, + "learning_rate": 2.1880901422636284e-07, + "logits/chosen": 1.2972975969314575, + "logits/rejected": 2.9249444007873535, + "logps/chosen": -325.5199279785156, + "logps/rejected": -565.7622680664062, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2052669525146484, + "rewards/margins": 14.870798110961914, + "rewards/rejected": -12.665529251098633, + "step": 5350 + }, + { + "epoch": 1.82, + "learning_rate": 2.1817952914515924e-07, + "logits/chosen": 0.94163978099823, + "logits/rejected": 2.520163059234619, + "logps/chosen": -321.620361328125, + "logps/rejected": -606.1392822265625, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9230222702026367, + "rewards/margins": 13.822883605957031, + "rewards/rejected": -11.899862289428711, + "step": 5360 + }, + { + "epoch": 1.83, + "learning_rate": 2.1755004406395566e-07, + "logits/chosen": 0.6572908163070679, + "logits/rejected": 2.8326666355133057, + "logps/chosen": -399.23150634765625, + "logps/rejected": -500.59326171875, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4918336868286133, + "rewards/margins": 17.32343292236328, + "rewards/rejected": -14.8316011428833, + "step": 5370 + }, + { + "epoch": 1.83, + "learning_rate": 2.169205589827521e-07, + "logits/chosen": 0.7684445381164551, + "logits/rejected": 2.7367186546325684, + "logps/chosen": -334.71453857421875, + "logps/rejected": -572.8629150390625, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8657491207122803, + "rewards/margins": 16.919837951660156, + "rewards/rejected": -15.054089546203613, + "step": 5380 + }, + { + "epoch": 1.83, + "learning_rate": 2.1629107390154853e-07, + "logits/chosen": 0.946051299571991, + "logits/rejected": 2.517101287841797, + "logps/chosen": -322.4632568359375, + "logps/rejected": -652.5506591796875, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.803297996520996, + "rewards/margins": 14.255006790161133, + "rewards/rejected": -12.451708793640137, + "step": 5390 + }, + { + "epoch": 1.84, + "learning_rate": 2.1566158882034493e-07, + "logits/chosen": 0.7960838675498962, + "logits/rejected": 2.9010050296783447, + "logps/chosen": -310.89794921875, + "logps/rejected": -486.76348876953125, + "loss": 0.0076, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.1863255500793457, + "rewards/margins": 14.510612487792969, + "rewards/rejected": -12.324285507202148, + "step": 5400 + }, + { + "epoch": 1.84, + "eval_logits/chosen": 0.5465123057365417, + "eval_logits/rejected": 2.6408891677856445, + "eval_logps/chosen": -368.5406799316406, + "eval_logps/rejected": -624.6871337890625, + "eval_loss": 0.006404118612408638, + "eval_rewards/accuracies": 0.996632993221283, + "eval_rewards/chosen": 2.04492449760437, + "eval_rewards/margins": 14.250606536865234, + "eval_rewards/rejected": -12.205682754516602, + "eval_runtime": 267.8158, + "eval_samples_per_second": 35.472, + "eval_steps_per_second": 1.109, + "step": 5400 + }, + { + "epoch": 1.84, + "learning_rate": 2.1503210373914138e-07, + "logits/chosen": 1.2992589473724365, + "logits/rejected": 2.140784978866577, + "logps/chosen": -386.3299255371094, + "logps/rejected": -728.1055908203125, + "loss": 0.0063, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.298143148422241, + "rewards/margins": 13.360751152038574, + "rewards/rejected": -11.062607765197754, + "step": 5410 + }, + { + "epoch": 1.84, + "learning_rate": 2.144026186579378e-07, + "logits/chosen": 0.944604218006134, + "logits/rejected": 2.4234421253204346, + "logps/chosen": -299.80853271484375, + "logps/rejected": -729.2354736328125, + "loss": 0.0085, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.149277448654175, + "rewards/margins": 15.382696151733398, + "rewards/rejected": -13.233418464660645, + "step": 5420 + }, + { + "epoch": 1.85, + "learning_rate": 2.1377313357673422e-07, + "logits/chosen": 1.1390790939331055, + "logits/rejected": 2.4424569606781006, + "logps/chosen": -372.9580383300781, + "logps/rejected": -661.5415649414062, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.522296667098999, + "rewards/margins": 13.513280868530273, + "rewards/rejected": -11.990983963012695, + "step": 5430 + }, + { + "epoch": 1.85, + "learning_rate": 2.1314364849553065e-07, + "logits/chosen": 0.9248983263969421, + "logits/rejected": 2.6572163105010986, + "logps/chosen": -358.212646484375, + "logps/rejected": -549.5049438476562, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6326959133148193, + "rewards/margins": 14.595819473266602, + "rewards/rejected": -12.96312427520752, + "step": 5440 + }, + { + "epoch": 1.85, + "learning_rate": 2.1251416341432707e-07, + "logits/chosen": 0.878515899181366, + "logits/rejected": 2.608353614807129, + "logps/chosen": -400.9839782714844, + "logps/rejected": -705.0433349609375, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.141390800476074, + "rewards/margins": 14.705162048339844, + "rewards/rejected": -12.56377124786377, + "step": 5450 + }, + { + "epoch": 1.86, + "learning_rate": 2.118846783331235e-07, + "logits/chosen": 0.9696270227432251, + "logits/rejected": 2.727121114730835, + "logps/chosen": -410.68963623046875, + "logps/rejected": -601.6516723632812, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.737865686416626, + "rewards/margins": 14.875020027160645, + "rewards/rejected": -13.137155532836914, + "step": 5460 + }, + { + "epoch": 1.86, + "learning_rate": 2.1125519325191994e-07, + "logits/chosen": 1.1417778730392456, + "logits/rejected": 2.4671216011047363, + "logps/chosen": -338.8183288574219, + "logps/rejected": -724.9601440429688, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.960103988647461, + "rewards/margins": 15.837699890136719, + "rewards/rejected": -13.877595901489258, + "step": 5470 + }, + { + "epoch": 1.86, + "learning_rate": 2.1062570817071634e-07, + "logits/chosen": 1.0455242395401, + "logits/rejected": 2.3731436729431152, + "logps/chosen": -434.44024658203125, + "logps/rejected": -633.2236938476562, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8375475406646729, + "rewards/margins": 12.737271308898926, + "rewards/rejected": -10.899724960327148, + "step": 5480 + }, + { + "epoch": 1.87, + "learning_rate": 2.0999622308951276e-07, + "logits/chosen": 1.118044137954712, + "logits/rejected": 2.3109071254730225, + "logps/chosen": -375.8702087402344, + "logps/rejected": -674.3856811523438, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1463160514831543, + "rewards/margins": 15.210006713867188, + "rewards/rejected": -13.063692092895508, + "step": 5490 + }, + { + "epoch": 1.87, + "learning_rate": 2.093667380083092e-07, + "logits/chosen": 0.9572780728340149, + "logits/rejected": 2.776047945022583, + "logps/chosen": -329.673583984375, + "logps/rejected": -557.1834716796875, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6945956945419312, + "rewards/margins": 14.333562850952148, + "rewards/rejected": -12.638967514038086, + "step": 5500 + }, + { + "epoch": 1.87, + "eval_logits/chosen": 0.5432767271995544, + "eval_logits/rejected": 2.6332035064697266, + "eval_logps/chosen": -369.0491027832031, + "eval_logps/rejected": -627.0294799804688, + "eval_loss": 0.006227751262485981, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 1.9940775632858276, + "eval_rewards/margins": 14.433989524841309, + "eval_rewards/rejected": -12.439913749694824, + "eval_runtime": 268.031, + "eval_samples_per_second": 35.444, + "eval_steps_per_second": 1.108, + "step": 5500 + }, + { + "epoch": 1.87, + "learning_rate": 2.087372529271056e-07, + "logits/chosen": 0.9330072402954102, + "logits/rejected": 2.2522757053375244, + "logps/chosen": -447.468994140625, + "logps/rejected": -697.0299072265625, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3345236778259277, + "rewards/margins": 15.280471801757812, + "rewards/rejected": -12.945945739746094, + "step": 5510 + }, + { + "epoch": 1.88, + "learning_rate": 2.0810776784590203e-07, + "logits/chosen": 1.5043458938598633, + "logits/rejected": 3.0579121112823486, + "logps/chosen": -326.2335205078125, + "logps/rejected": -600.5350341796875, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7955917119979858, + "rewards/margins": 15.176149368286133, + "rewards/rejected": -13.380559921264648, + "step": 5520 + }, + { + "epoch": 1.88, + "learning_rate": 2.0747828276469848e-07, + "logits/chosen": 0.7315915822982788, + "logits/rejected": 1.9385446310043335, + "logps/chosen": -429.71746826171875, + "logps/rejected": -748.8201904296875, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.13820219039917, + "rewards/margins": 13.658441543579102, + "rewards/rejected": -11.520238876342773, + "step": 5530 + }, + { + "epoch": 1.88, + "learning_rate": 2.068487976834949e-07, + "logits/chosen": 1.217410683631897, + "logits/rejected": 2.797996997833252, + "logps/chosen": -355.1609802246094, + "logps/rejected": -616.5965576171875, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3493545055389404, + "rewards/margins": 15.734540939331055, + "rewards/rejected": -13.385187149047852, + "step": 5540 + }, + { + "epoch": 1.89, + "learning_rate": 2.062193126022913e-07, + "logits/chosen": 0.6537594795227051, + "logits/rejected": 2.7036337852478027, + "logps/chosen": -308.1028747558594, + "logps/rejected": -716.1016235351562, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7219940423965454, + "rewards/margins": 14.988980293273926, + "rewards/rejected": -13.266984939575195, + "step": 5550 + }, + { + "epoch": 1.89, + "learning_rate": 2.0558982752108775e-07, + "logits/chosen": 1.0059354305267334, + "logits/rejected": 2.405097484588623, + "logps/chosen": -354.9513854980469, + "logps/rejected": -728.5120849609375, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8416436910629272, + "rewards/margins": 15.623313903808594, + "rewards/rejected": -13.781671524047852, + "step": 5560 + }, + { + "epoch": 1.89, + "learning_rate": 2.0496034243988417e-07, + "logits/chosen": 0.7288120985031128, + "logits/rejected": 2.4463515281677246, + "logps/chosen": -386.6265869140625, + "logps/rejected": -694.5880737304688, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.276694893836975, + "rewards/margins": 13.237451553344727, + "rewards/rejected": -11.960756301879883, + "step": 5570 + }, + { + "epoch": 1.9, + "learning_rate": 2.043308573586806e-07, + "logits/chosen": 0.9864422082901001, + "logits/rejected": 2.416912794113159, + "logps/chosen": -442.6741638183594, + "logps/rejected": -745.8394775390625, + "loss": 0.0141, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6303446292877197, + "rewards/margins": 13.753680229187012, + "rewards/rejected": -12.123335838317871, + "step": 5580 + }, + { + "epoch": 1.9, + "learning_rate": 2.0370137227747701e-07, + "logits/chosen": 1.0159143209457397, + "logits/rejected": 2.404919147491455, + "logps/chosen": -403.14892578125, + "logps/rejected": -700.0591430664062, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6275684833526611, + "rewards/margins": 14.735166549682617, + "rewards/rejected": -13.107600212097168, + "step": 5590 + }, + { + "epoch": 1.9, + "learning_rate": 2.0307188719627344e-07, + "logits/chosen": 0.8741198778152466, + "logits/rejected": 2.7800064086914062, + "logps/chosen": -452.5263671875, + "logps/rejected": -546.5322265625, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4914939403533936, + "rewards/margins": 13.270769119262695, + "rewards/rejected": -11.779274940490723, + "step": 5600 + }, + { + "epoch": 1.9, + "eval_logits/chosen": 0.5377217531204224, + "eval_logits/rejected": 2.6299867630004883, + "eval_logps/chosen": -369.87109375, + "eval_logps/rejected": -626.6300048828125, + "eval_loss": 0.006109884940087795, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 1.9118820428848267, + "eval_rewards/margins": 14.311848640441895, + "eval_rewards/rejected": -12.399968147277832, + "eval_runtime": 268.6218, + "eval_samples_per_second": 35.366, + "eval_steps_per_second": 1.106, + "step": 5600 + }, + { + "epoch": 1.91, + "learning_rate": 2.0244240211506986e-07, + "logits/chosen": 1.6756540536880493, + "logits/rejected": 2.825979709625244, + "logps/chosen": -355.1505126953125, + "logps/rejected": -458.744140625, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7418196201324463, + "rewards/margins": 14.322778701782227, + "rewards/rejected": -12.580958366394043, + "step": 5610 + }, + { + "epoch": 1.91, + "learning_rate": 2.018129170338663e-07, + "logits/chosen": 1.3055442571640015, + "logits/rejected": 2.628654956817627, + "logps/chosen": -510.27685546875, + "logps/rejected": -572.4608764648438, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.037275791168213, + "rewards/margins": 15.664937019348145, + "rewards/rejected": -13.627660751342773, + "step": 5620 + }, + { + "epoch": 1.91, + "learning_rate": 2.011834319526627e-07, + "logits/chosen": 0.9477977752685547, + "logits/rejected": 2.4301483631134033, + "logps/chosen": -390.4095458984375, + "logps/rejected": -582.1488647460938, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3422436714172363, + "rewards/margins": 14.721003532409668, + "rewards/rejected": -12.378759384155273, + "step": 5630 + }, + { + "epoch": 1.92, + "learning_rate": 2.0055394687145913e-07, + "logits/chosen": 0.8035489916801453, + "logits/rejected": 1.7046220302581787, + "logps/chosen": -377.4579162597656, + "logps/rejected": -827.44580078125, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7721195220947266, + "rewards/margins": 13.13145637512207, + "rewards/rejected": -11.359336853027344, + "step": 5640 + }, + { + "epoch": 1.92, + "learning_rate": 1.9992446179025558e-07, + "logits/chosen": 0.450077623128891, + "logits/rejected": 2.0916006565093994, + "logps/chosen": -318.74053955078125, + "logps/rejected": -677.7642822265625, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5363391637802124, + "rewards/margins": 13.124608993530273, + "rewards/rejected": -11.588269233703613, + "step": 5650 + }, + { + "epoch": 1.92, + "learning_rate": 1.99294976709052e-07, + "logits/chosen": 1.2795765399932861, + "logits/rejected": 2.624028444290161, + "logps/chosen": -337.2548828125, + "logps/rejected": -615.2337036132812, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5114516019821167, + "rewards/margins": 14.363494873046875, + "rewards/rejected": -12.852045059204102, + "step": 5660 + }, + { + "epoch": 1.93, + "learning_rate": 1.986654916278484e-07, + "logits/chosen": 1.387978434562683, + "logits/rejected": 2.4463891983032227, + "logps/chosen": -324.6473693847656, + "logps/rejected": -653.7659301757812, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8007307052612305, + "rewards/margins": 13.293085098266602, + "rewards/rejected": -11.492353439331055, + "step": 5670 + }, + { + "epoch": 1.93, + "learning_rate": 1.9803600654664484e-07, + "logits/chosen": 0.9867205619812012, + "logits/rejected": 1.7832437753677368, + "logps/chosen": -331.4938049316406, + "logps/rejected": -868.81591796875, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.198737382888794, + "rewards/margins": 15.122095108032227, + "rewards/rejected": -12.923357009887695, + "step": 5680 + }, + { + "epoch": 1.93, + "learning_rate": 1.9740652146544127e-07, + "logits/chosen": 0.9721837043762207, + "logits/rejected": 2.441796064376831, + "logps/chosen": -407.02264404296875, + "logps/rejected": -610.9381103515625, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.146698474884033, + "rewards/margins": 13.958742141723633, + "rewards/rejected": -11.812042236328125, + "step": 5690 + }, + { + "epoch": 1.94, + "learning_rate": 1.9677703638423766e-07, + "logits/chosen": 1.1788890361785889, + "logits/rejected": 2.8577880859375, + "logps/chosen": -334.38848876953125, + "logps/rejected": -539.4073486328125, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0800750255584717, + "rewards/margins": 15.230023384094238, + "rewards/rejected": -13.149948120117188, + "step": 5700 + }, + { + "epoch": 1.94, + "eval_logits/chosen": 0.5288003087043762, + "eval_logits/rejected": 2.624809980392456, + "eval_logps/chosen": -368.4456787109375, + "eval_logps/rejected": -624.3119506835938, + "eval_loss": 0.006175518035888672, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 2.0544216632843018, + "eval_rewards/margins": 14.222590446472168, + "eval_rewards/rejected": -12.168168067932129, + "eval_runtime": 266.7992, + "eval_samples_per_second": 35.607, + "eval_steps_per_second": 1.113, + "step": 5700 + }, + { + "epoch": 1.94, + "learning_rate": 1.961475513030341e-07, + "logits/chosen": 1.1513116359710693, + "logits/rejected": 2.2623372077941895, + "logps/chosen": -439.71942138671875, + "logps/rejected": -656.05224609375, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9392688274383545, + "rewards/margins": 13.81745433807373, + "rewards/rejected": -11.878186225891113, + "step": 5710 + }, + { + "epoch": 1.94, + "learning_rate": 1.9551806622183054e-07, + "logits/chosen": 0.9612258672714233, + "logits/rejected": 2.3724660873413086, + "logps/chosen": -312.3752136230469, + "logps/rejected": -670.6126708984375, + "loss": 0.0081, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.9849334955215454, + "rewards/margins": 14.504766464233398, + "rewards/rejected": -12.5198335647583, + "step": 5720 + }, + { + "epoch": 1.95, + "learning_rate": 1.9488858114062696e-07, + "logits/chosen": 1.6316070556640625, + "logits/rejected": 2.7541658878326416, + "logps/chosen": -328.0393981933594, + "logps/rejected": -498.2681579589844, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.378713846206665, + "rewards/margins": 15.276588439941406, + "rewards/rejected": -12.89787483215332, + "step": 5730 + }, + { + "epoch": 1.95, + "learning_rate": 1.9425909605942338e-07, + "logits/chosen": 0.5457426905632019, + "logits/rejected": 2.7625725269317627, + "logps/chosen": -515.2962646484375, + "logps/rejected": -626.9390869140625, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7357877492904663, + "rewards/margins": 13.215678215026855, + "rewards/rejected": -11.479890823364258, + "step": 5740 + }, + { + "epoch": 1.95, + "learning_rate": 1.936296109782198e-07, + "logits/chosen": 0.8939191102981567, + "logits/rejected": 1.777658462524414, + "logps/chosen": -351.0246276855469, + "logps/rejected": -832.22705078125, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2989163398742676, + "rewards/margins": 16.514678955078125, + "rewards/rejected": -14.2157621383667, + "step": 5750 + }, + { + "epoch": 1.96, + "learning_rate": 1.9300012589701623e-07, + "logits/chosen": 0.9846128225326538, + "logits/rejected": 1.9890168905258179, + "logps/chosen": -417.22454833984375, + "logps/rejected": -747.7069702148438, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.978386640548706, + "rewards/margins": 13.060282707214355, + "rewards/rejected": -11.08189582824707, + "step": 5760 + }, + { + "epoch": 1.96, + "learning_rate": 1.9237064081581268e-07, + "logits/chosen": 0.5852854251861572, + "logits/rejected": 2.476715087890625, + "logps/chosen": -331.2423095703125, + "logps/rejected": -701.3380126953125, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2032055854797363, + "rewards/margins": 15.220135688781738, + "rewards/rejected": -13.016927719116211, + "step": 5770 + }, + { + "epoch": 1.96, + "learning_rate": 1.9174115573460907e-07, + "logits/chosen": 0.7734403610229492, + "logits/rejected": 2.3022501468658447, + "logps/chosen": -400.611328125, + "logps/rejected": -658.062744140625, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9118320941925049, + "rewards/margins": 15.38043212890625, + "rewards/rejected": -13.468599319458008, + "step": 5780 + }, + { + "epoch": 1.97, + "learning_rate": 1.911116706534055e-07, + "logits/chosen": 1.4819624423980713, + "logits/rejected": 2.5294859409332275, + "logps/chosen": -466.876708984375, + "logps/rejected": -613.1146240234375, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.820359230041504, + "rewards/margins": 13.872444152832031, + "rewards/rejected": -12.052085876464844, + "step": 5790 + }, + { + "epoch": 1.97, + "learning_rate": 1.9048218557220194e-07, + "logits/chosen": 0.6445799469947815, + "logits/rejected": 3.186220407485962, + "logps/chosen": -382.71282958984375, + "logps/rejected": -496.46221923828125, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8192764520645142, + "rewards/margins": 14.626768112182617, + "rewards/rejected": -12.807493209838867, + "step": 5800 + }, + { + "epoch": 1.97, + "eval_logits/chosen": 0.5421663522720337, + "eval_logits/rejected": 2.624814510345459, + "eval_logps/chosen": -368.0468444824219, + "eval_logps/rejected": -625.33251953125, + "eval_loss": 0.006073611788451672, + "eval_rewards/accuracies": 0.997474730014801, + "eval_rewards/chosen": 2.0943007469177246, + "eval_rewards/margins": 14.364514350891113, + "eval_rewards/rejected": -12.270215034484863, + "eval_runtime": 267.5626, + "eval_samples_per_second": 35.506, + "eval_steps_per_second": 1.11, + "step": 5800 + }, + { + "epoch": 1.97, + "learning_rate": 1.8985270049099837e-07, + "logits/chosen": 1.1447639465332031, + "logits/rejected": 2.342881441116333, + "logps/chosen": -317.5584411621094, + "logps/rejected": -704.2658081054688, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8849990367889404, + "rewards/margins": 13.785212516784668, + "rewards/rejected": -11.900214195251465, + "step": 5810 + }, + { + "epoch": 1.98, + "learning_rate": 1.8922321540979476e-07, + "logits/chosen": 1.2115243673324585, + "logits/rejected": 2.874825954437256, + "logps/chosen": -470.36224365234375, + "logps/rejected": -491.6048278808594, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.872240662574768, + "rewards/margins": 15.113815307617188, + "rewards/rejected": -13.241575241088867, + "step": 5820 + }, + { + "epoch": 1.98, + "learning_rate": 1.885937303285912e-07, + "logits/chosen": 0.2891542315483093, + "logits/rejected": 2.481403350830078, + "logps/chosen": -302.67083740234375, + "logps/rejected": -679.9458618164062, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0729470252990723, + "rewards/margins": 15.368230819702148, + "rewards/rejected": -13.29528522491455, + "step": 5830 + }, + { + "epoch": 1.99, + "learning_rate": 1.8796424524738764e-07, + "logits/chosen": 0.47113484144210815, + "logits/rejected": 2.459672451019287, + "logps/chosen": -395.92315673828125, + "logps/rejected": -567.0223388671875, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.37471866607666, + "rewards/margins": 14.61865234375, + "rewards/rejected": -12.243932723999023, + "step": 5840 + }, + { + "epoch": 1.99, + "learning_rate": 1.8733476016618406e-07, + "logits/chosen": 0.5824601054191589, + "logits/rejected": 2.3544952869415283, + "logps/chosen": -366.3118591308594, + "logps/rejected": -600.1126708984375, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0960936546325684, + "rewards/margins": 14.047538757324219, + "rewards/rejected": -11.951444625854492, + "step": 5850 + }, + { + "epoch": 1.99, + "learning_rate": 1.8670527508498048e-07, + "logits/chosen": 0.888663649559021, + "logits/rejected": 2.1462912559509277, + "logps/chosen": -315.76434326171875, + "logps/rejected": -801.0203857421875, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9007564783096313, + "rewards/margins": 15.119722366333008, + "rewards/rejected": -13.218966484069824, + "step": 5860 + }, + { + "epoch": 2.0, + "learning_rate": 1.860757900037769e-07, + "logits/chosen": 0.42253509163856506, + "logits/rejected": 2.834648609161377, + "logps/chosen": -320.17999267578125, + "logps/rejected": -539.3760375976562, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2486395835876465, + "rewards/margins": 15.82959270477295, + "rewards/rejected": -13.580953598022461, + "step": 5870 + }, + { + "epoch": 2.0, + "learning_rate": 1.8544630492257333e-07, + "logits/chosen": 0.5946919322013855, + "logits/rejected": 2.0946545600891113, + "logps/chosen": -306.47149658203125, + "logps/rejected": -675.6048583984375, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.677959680557251, + "rewards/margins": 14.716924667358398, + "rewards/rejected": -13.038965225219727, + "step": 5880 + }, + { + "epoch": 2.0, + "learning_rate": 1.8481681984136978e-07, + "logits/chosen": 1.366878867149353, + "logits/rejected": 2.7167229652404785, + "logps/chosen": -336.2856140136719, + "logps/rejected": -511.2284240722656, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7026660442352295, + "rewards/margins": 14.071836471557617, + "rewards/rejected": -12.369170188903809, + "step": 5890 + }, + { + "epoch": 2.01, + "learning_rate": 1.8418733476016617e-07, + "logits/chosen": 1.1604326963424683, + "logits/rejected": 2.527015209197998, + "logps/chosen": -380.0445251464844, + "logps/rejected": -559.8151245117188, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2937469482421875, + "rewards/margins": 13.332115173339844, + "rewards/rejected": -11.038368225097656, + "step": 5900 + }, + { + "epoch": 2.01, + "eval_logits/chosen": 0.518649697303772, + "eval_logits/rejected": 2.571194648742676, + "eval_logps/chosen": -369.7945556640625, + "eval_logps/rejected": -631.9784545898438, + "eval_loss": 0.005714269354939461, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 1.919531226158142, + "eval_rewards/margins": 14.854342460632324, + "eval_rewards/rejected": -12.934809684753418, + "eval_runtime": 268.1608, + "eval_samples_per_second": 35.427, + "eval_steps_per_second": 1.108, + "step": 5900 + }, + { + "epoch": 2.01, + "learning_rate": 1.835578496789626e-07, + "logits/chosen": 0.981165885925293, + "logits/rejected": 2.4159669876098633, + "logps/chosen": -390.6913757324219, + "logps/rejected": -670.6868286132812, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1235125064849854, + "rewards/margins": 14.490254402160645, + "rewards/rejected": -12.366741180419922, + "step": 5910 + }, + { + "epoch": 2.01, + "learning_rate": 1.8292836459775904e-07, + "logits/chosen": 1.076453447341919, + "logits/rejected": 2.0716986656188965, + "logps/chosen": -440.6607360839844, + "logps/rejected": -772.1173706054688, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.22540020942688, + "rewards/margins": 14.935269355773926, + "rewards/rejected": -12.709869384765625, + "step": 5920 + }, + { + "epoch": 2.02, + "learning_rate": 1.8229887951655544e-07, + "logits/chosen": 1.0230543613433838, + "logits/rejected": 2.3830807209014893, + "logps/chosen": -298.8233947753906, + "logps/rejected": -630.2376098632812, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1085221767425537, + "rewards/margins": 14.758903503417969, + "rewards/rejected": -12.650381088256836, + "step": 5930 + }, + { + "epoch": 2.02, + "learning_rate": 1.8166939443535186e-07, + "logits/chosen": 1.2830191850662231, + "logits/rejected": 2.8751425743103027, + "logps/chosen": -410.10662841796875, + "logps/rejected": -473.4539489746094, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3019919395446777, + "rewards/margins": 15.205785751342773, + "rewards/rejected": -12.903793334960938, + "step": 5940 + }, + { + "epoch": 2.02, + "learning_rate": 1.8103990935414829e-07, + "logits/chosen": 1.0351749658584595, + "logits/rejected": 2.504169464111328, + "logps/chosen": -318.38153076171875, + "logps/rejected": -625.9144287109375, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.910672903060913, + "rewards/margins": 13.210138320922852, + "rewards/rejected": -11.299463272094727, + "step": 5950 + }, + { + "epoch": 2.03, + "learning_rate": 1.8041042427294474e-07, + "logits/chosen": 1.23222815990448, + "logits/rejected": 2.7579433917999268, + "logps/chosen": -383.14959716796875, + "logps/rejected": -414.42578125, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.964050531387329, + "rewards/margins": 16.147647857666016, + "rewards/rejected": -14.183601379394531, + "step": 5960 + }, + { + "epoch": 2.03, + "learning_rate": 1.7978093919174113e-07, + "logits/chosen": 0.6595891714096069, + "logits/rejected": 2.0642571449279785, + "logps/chosen": -358.0821838378906, + "logps/rejected": -768.7884521484375, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2330410480499268, + "rewards/margins": 14.898645401000977, + "rewards/rejected": -13.665603637695312, + "step": 5970 + }, + { + "epoch": 2.03, + "learning_rate": 1.7915145411053755e-07, + "logits/chosen": 0.7389932870864868, + "logits/rejected": 1.9643020629882812, + "logps/chosen": -365.7792663574219, + "logps/rejected": -685.234619140625, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6057651042938232, + "rewards/margins": 13.977490425109863, + "rewards/rejected": -12.371726036071777, + "step": 5980 + }, + { + "epoch": 2.04, + "learning_rate": 1.78521969029334e-07, + "logits/chosen": 1.0801981687545776, + "logits/rejected": 2.0700907707214355, + "logps/chosen": -374.2262268066406, + "logps/rejected": -765.7568969726562, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5652799606323242, + "rewards/margins": 14.630142211914062, + "rewards/rejected": -13.064863204956055, + "step": 5990 + }, + { + "epoch": 2.04, + "learning_rate": 1.7789248394813043e-07, + "logits/chosen": 1.2110086679458618, + "logits/rejected": 2.5913567543029785, + "logps/chosen": -442.3192443847656, + "logps/rejected": -512.6320190429688, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.126986265182495, + "rewards/margins": 16.897563934326172, + "rewards/rejected": -14.770574569702148, + "step": 6000 + }, + { + "epoch": 2.04, + "eval_logits/chosen": 0.49600929021835327, + "eval_logits/rejected": 2.540468215942383, + "eval_logps/chosen": -370.60565185546875, + "eval_logps/rejected": -636.5339965820312, + "eval_loss": 0.005733425263315439, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 1.838423728942871, + "eval_rewards/margins": 15.228790283203125, + "eval_rewards/rejected": -13.390366554260254, + "eval_runtime": 267.8718, + "eval_samples_per_second": 35.465, + "eval_steps_per_second": 1.109, + "step": 6000 + }, + { + "epoch": 2.04, + "learning_rate": 1.7726299886692682e-07, + "logits/chosen": 0.9540095329284668, + "logits/rejected": 2.4917612075805664, + "logps/chosen": -344.8664245605469, + "logps/rejected": -551.3218994140625, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.751217246055603, + "rewards/margins": 13.663978576660156, + "rewards/rejected": -11.912760734558105, + "step": 6010 + }, + { + "epoch": 2.05, + "learning_rate": 1.7663351378572327e-07, + "logits/chosen": 0.5778986215591431, + "logits/rejected": 2.036611795425415, + "logps/chosen": -324.55914306640625, + "logps/rejected": -816.7264404296875, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4912192821502686, + "rewards/margins": 14.945416450500488, + "rewards/rejected": -13.454197883605957, + "step": 6020 + }, + { + "epoch": 2.05, + "learning_rate": 1.760040287045197e-07, + "logits/chosen": 0.8682858347892761, + "logits/rejected": 2.310798168182373, + "logps/chosen": -511.52667236328125, + "logps/rejected": -588.7957153320312, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5532984733581543, + "rewards/margins": 14.495959281921387, + "rewards/rejected": -12.942662239074707, + "step": 6030 + }, + { + "epoch": 2.05, + "learning_rate": 1.7537454362331612e-07, + "logits/chosen": 1.4959813356399536, + "logits/rejected": 2.511289119720459, + "logps/chosen": -370.686767578125, + "logps/rejected": -587.7115478515625, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.48617684841156, + "rewards/margins": 15.93817138671875, + "rewards/rejected": -14.451992988586426, + "step": 6040 + }, + { + "epoch": 2.06, + "learning_rate": 1.7474505854211254e-07, + "logits/chosen": 1.0743986368179321, + "logits/rejected": 2.4483957290649414, + "logps/chosen": -504.873779296875, + "logps/rejected": -595.0511474609375, + "loss": 0.0128, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.765019178390503, + "rewards/margins": 15.832463264465332, + "rewards/rejected": -14.06744384765625, + "step": 6050 + }, + { + "epoch": 2.06, + "learning_rate": 1.7411557346090896e-07, + "logits/chosen": 1.304214596748352, + "logits/rejected": 2.794304132461548, + "logps/chosen": -357.6822204589844, + "logps/rejected": -523.1092529296875, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2659730911254883, + "rewards/margins": 15.043255805969238, + "rewards/rejected": -13.777284622192383, + "step": 6060 + }, + { + "epoch": 2.06, + "learning_rate": 1.7348608837970539e-07, + "logits/chosen": 0.8975250124931335, + "logits/rejected": 2.321444511413574, + "logps/chosen": -377.947998046875, + "logps/rejected": -687.5652465820312, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.609063744544983, + "rewards/margins": 16.110820770263672, + "rewards/rejected": -14.501757621765137, + "step": 6070 + }, + { + "epoch": 2.07, + "learning_rate": 1.7285660329850184e-07, + "logits/chosen": 0.9074400663375854, + "logits/rejected": 2.431576728820801, + "logps/chosen": -313.75970458984375, + "logps/rejected": -636.7111206054688, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8432601690292358, + "rewards/margins": 17.6544246673584, + "rewards/rejected": -15.811162948608398, + "step": 6080 + }, + { + "epoch": 2.07, + "learning_rate": 1.7222711821729823e-07, + "logits/chosen": 1.2238930463790894, + "logits/rejected": 2.038734197616577, + "logps/chosen": -304.6903381347656, + "logps/rejected": -741.0390625, + "loss": 0.0061, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.5827157497406006, + "rewards/margins": 14.606298446655273, + "rewards/rejected": -13.023582458496094, + "step": 6090 + }, + { + "epoch": 2.07, + "learning_rate": 1.7159763313609465e-07, + "logits/chosen": 1.0854783058166504, + "logits/rejected": 2.659773111343384, + "logps/chosen": -392.56781005859375, + "logps/rejected": -585.4364013671875, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6525754928588867, + "rewards/margins": 17.150650024414062, + "rewards/rejected": -15.498072624206543, + "step": 6100 + }, + { + "epoch": 2.07, + "eval_logits/chosen": 0.44153305888175964, + "eval_logits/rejected": 2.471759557723999, + "eval_logps/chosen": -372.8395080566406, + "eval_logps/rejected": -645.4886474609375, + "eval_loss": 0.005611285101622343, + "eval_rewards/accuracies": 0.997474730014801, + "eval_rewards/chosen": 1.6150366067886353, + "eval_rewards/margins": 15.900870323181152, + "eval_rewards/rejected": -14.285835266113281, + "eval_runtime": 267.9573, + "eval_samples_per_second": 35.453, + "eval_steps_per_second": 1.108, + "step": 6100 + }, + { + "epoch": 2.08, + "learning_rate": 1.709681480548911e-07, + "logits/chosen": 0.7842377424240112, + "logits/rejected": 2.7997422218322754, + "logps/chosen": -299.3103942871094, + "logps/rejected": -551.2344970703125, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5577961206436157, + "rewards/margins": 16.915258407592773, + "rewards/rejected": -15.357464790344238, + "step": 6110 + }, + { + "epoch": 2.08, + "learning_rate": 1.7033866297368753e-07, + "logits/chosen": 0.46014171838760376, + "logits/rejected": 2.373453378677368, + "logps/chosen": -398.37518310546875, + "logps/rejected": -673.5498046875, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.542041301727295, + "rewards/margins": 17.008686065673828, + "rewards/rejected": -14.466644287109375, + "step": 6120 + }, + { + "epoch": 2.08, + "learning_rate": 1.6970917789248392e-07, + "logits/chosen": 1.0954312086105347, + "logits/rejected": 2.1974737644195557, + "logps/chosen": -378.20831298828125, + "logps/rejected": -785.3609008789062, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8533457517623901, + "rewards/margins": 16.01312255859375, + "rewards/rejected": -15.15977668762207, + "step": 6130 + }, + { + "epoch": 2.09, + "learning_rate": 1.6907969281128037e-07, + "logits/chosen": 1.350953459739685, + "logits/rejected": 3.0693161487579346, + "logps/chosen": -473.63397216796875, + "logps/rejected": -589.5533447265625, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7659844756126404, + "rewards/margins": 14.892280578613281, + "rewards/rejected": -14.126296997070312, + "step": 6140 + }, + { + "epoch": 2.09, + "learning_rate": 1.684502077300768e-07, + "logits/chosen": 0.7171489000320435, + "logits/rejected": 2.2262418270111084, + "logps/chosen": -406.3450012207031, + "logps/rejected": -637.3649291992188, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3386101722717285, + "rewards/margins": 15.8351411819458, + "rewards/rejected": -14.49653148651123, + "step": 6150 + }, + { + "epoch": 2.09, + "learning_rate": 1.678207226488732e-07, + "logits/chosen": 0.7702849507331848, + "logits/rejected": 2.560065746307373, + "logps/chosen": -375.14044189453125, + "logps/rejected": -543.9547119140625, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9042383432388306, + "rewards/margins": 14.497563362121582, + "rewards/rejected": -12.5933256149292, + "step": 6160 + }, + { + "epoch": 2.1, + "learning_rate": 1.6719123756766964e-07, + "logits/chosen": 0.8514900207519531, + "logits/rejected": 1.924505591392517, + "logps/chosen": -399.55474853515625, + "logps/rejected": -754.6709594726562, + "loss": 0.0036, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.312821626663208, + "rewards/margins": 17.12882423400879, + "rewards/rejected": -14.816003799438477, + "step": 6170 + }, + { + "epoch": 2.1, + "learning_rate": 1.6656175248646606e-07, + "logits/chosen": 1.1173584461212158, + "logits/rejected": 2.6232028007507324, + "logps/chosen": -346.4198913574219, + "logps/rejected": -619.3607177734375, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4097492694854736, + "rewards/margins": 16.980304718017578, + "rewards/rejected": -15.570554733276367, + "step": 6180 + }, + { + "epoch": 2.1, + "learning_rate": 1.6593226740526249e-07, + "logits/chosen": 0.5098138451576233, + "logits/rejected": 3.2658188343048096, + "logps/chosen": -288.40509033203125, + "logps/rejected": -499.8128356933594, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9584379196166992, + "rewards/margins": 16.69192886352539, + "rewards/rejected": -14.733491897583008, + "step": 6190 + }, + { + "epoch": 2.11, + "learning_rate": 1.653027823240589e-07, + "logits/chosen": 0.5227378010749817, + "logits/rejected": 2.2451419830322266, + "logps/chosen": -312.66033935546875, + "logps/rejected": -674.2329711914062, + "loss": 0.0053, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.3434746265411377, + "rewards/margins": 15.002885818481445, + "rewards/rejected": -13.65941047668457, + "step": 6200 + }, + { + "epoch": 2.11, + "eval_logits/chosen": 0.45763009786605835, + "eval_logits/rejected": 2.4921276569366455, + "eval_logps/chosen": -370.72222900390625, + "eval_logps/rejected": -642.0590209960938, + "eval_loss": 0.00525397714227438, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 1.8267639875411987, + "eval_rewards/margins": 15.769631385803223, + "eval_rewards/rejected": -13.942869186401367, + "eval_runtime": 266.8449, + "eval_samples_per_second": 35.601, + "eval_steps_per_second": 1.113, + "step": 6200 + }, + { + "epoch": 2.11, + "learning_rate": 1.6467329724285533e-07, + "logits/chosen": 0.7600902915000916, + "logits/rejected": 2.226156234741211, + "logps/chosen": -328.57281494140625, + "logps/rejected": -729.6641235351562, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5251786708831787, + "rewards/margins": 14.667379379272461, + "rewards/rejected": -13.14220142364502, + "step": 6210 + }, + { + "epoch": 2.11, + "learning_rate": 1.6404381216165175e-07, + "logits/chosen": 0.9222061038017273, + "logits/rejected": 2.259605884552002, + "logps/chosen": -317.9562072753906, + "logps/rejected": -591.8802490234375, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.16776704788208, + "rewards/margins": 17.08612060546875, + "rewards/rejected": -14.918353080749512, + "step": 6220 + }, + { + "epoch": 2.12, + "learning_rate": 1.634143270804482e-07, + "logits/chosen": 0.06134549900889397, + "logits/rejected": 2.560732126235962, + "logps/chosen": -352.6381530761719, + "logps/rejected": -583.1201171875, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7368847131729126, + "rewards/margins": 15.019735336303711, + "rewards/rejected": -13.282852172851562, + "step": 6230 + }, + { + "epoch": 2.12, + "learning_rate": 1.627848419992446e-07, + "logits/chosen": 1.1012592315673828, + "logits/rejected": 1.9502675533294678, + "logps/chosen": -429.45751953125, + "logps/rejected": -795.875244140625, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9872167110443115, + "rewards/margins": 16.9797420501709, + "rewards/rejected": -14.992526054382324, + "step": 6240 + }, + { + "epoch": 2.12, + "learning_rate": 1.6215535691804102e-07, + "logits/chosen": 1.477541208267212, + "logits/rejected": 2.3593482971191406, + "logps/chosen": -375.87860107421875, + "logps/rejected": -531.9874267578125, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8726123571395874, + "rewards/margins": 14.83423137664795, + "rewards/rejected": -12.96161937713623, + "step": 6250 + }, + { + "epoch": 2.13, + "learning_rate": 1.6152587183683747e-07, + "logits/chosen": 0.6256676316261292, + "logits/rejected": 2.4760072231292725, + "logps/chosen": -315.6788024902344, + "logps/rejected": -704.2340698242188, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.721914291381836, + "rewards/margins": 16.100740432739258, + "rewards/rejected": -14.378824234008789, + "step": 6260 + }, + { + "epoch": 2.13, + "learning_rate": 1.608963867556339e-07, + "logits/chosen": 0.6289627552032471, + "logits/rejected": 3.0689284801483154, + "logps/chosen": -312.2763366699219, + "logps/rejected": -525.0361938476562, + "loss": 0.0071, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.7409350872039795, + "rewards/margins": 14.935763359069824, + "rewards/rejected": -13.194829940795898, + "step": 6270 + }, + { + "epoch": 2.13, + "learning_rate": 1.602669016744303e-07, + "logits/chosen": 1.0302293300628662, + "logits/rejected": 1.7600761651992798, + "logps/chosen": -456.14898681640625, + "logps/rejected": -906.0274658203125, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.196922540664673, + "rewards/margins": 15.269792556762695, + "rewards/rejected": -13.072870254516602, + "step": 6280 + }, + { + "epoch": 2.14, + "learning_rate": 1.5963741659322674e-07, + "logits/chosen": 1.2055107355117798, + "logits/rejected": 2.174511432647705, + "logps/chosen": -343.38134765625, + "logps/rejected": -685.1343994140625, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0983285903930664, + "rewards/margins": 15.914003372192383, + "rewards/rejected": -13.815675735473633, + "step": 6290 + }, + { + "epoch": 2.14, + "learning_rate": 1.5900793151202316e-07, + "logits/chosen": 0.9002892374992371, + "logits/rejected": 2.5622847080230713, + "logps/chosen": -310.75067138671875, + "logps/rejected": -680.5726318359375, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9516767263412476, + "rewards/margins": 14.449353218078613, + "rewards/rejected": -12.497675895690918, + "step": 6300 + }, + { + "epoch": 2.14, + "eval_logits/chosen": 0.470478355884552, + "eval_logits/rejected": 2.507901906967163, + "eval_logps/chosen": -369.54644775390625, + "eval_logps/rejected": -640.7470092773438, + "eval_loss": 0.005234770942479372, + "eval_rewards/accuracies": 0.997474730014801, + "eval_rewards/chosen": 1.9443472623825073, + "eval_rewards/margins": 15.756011962890625, + "eval_rewards/rejected": -13.811664581298828, + "eval_runtime": 268.1768, + "eval_samples_per_second": 35.424, + "eval_steps_per_second": 1.107, + "step": 6300 + }, + { + "epoch": 2.14, + "learning_rate": 1.5837844643081959e-07, + "logits/chosen": 0.47189411520957947, + "logits/rejected": 2.19954776763916, + "logps/chosen": -294.6665954589844, + "logps/rejected": -810.7994995117188, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0598158836364746, + "rewards/margins": 16.0883846282959, + "rewards/rejected": -14.028569221496582, + "step": 6310 + }, + { + "epoch": 2.15, + "learning_rate": 1.57748961349616e-07, + "logits/chosen": 0.6236362457275391, + "logits/rejected": 2.104513168334961, + "logps/chosen": -388.9352111816406, + "logps/rejected": -850.1434326171875, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.145151376724243, + "rewards/margins": 16.31357192993164, + "rewards/rejected": -14.168418884277344, + "step": 6320 + }, + { + "epoch": 2.15, + "learning_rate": 1.5711947626841243e-07, + "logits/chosen": 1.140409231185913, + "logits/rejected": 2.624236822128296, + "logps/chosen": -472.2935485839844, + "logps/rejected": -509.56182861328125, + "loss": 0.0065, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.7926658391952515, + "rewards/margins": 14.899357795715332, + "rewards/rejected": -13.106691360473633, + "step": 6330 + }, + { + "epoch": 2.15, + "learning_rate": 1.5648999118720885e-07, + "logits/chosen": 1.207027554512024, + "logits/rejected": 2.677910566329956, + "logps/chosen": -399.31866455078125, + "logps/rejected": -542.0875244140625, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7966368198394775, + "rewards/margins": 17.7134952545166, + "rewards/rejected": -14.916857719421387, + "step": 6340 + }, + { + "epoch": 2.16, + "learning_rate": 1.558605061060053e-07, + "logits/chosen": 1.527717113494873, + "logits/rejected": 2.574253559112549, + "logps/chosen": -340.0840148925781, + "logps/rejected": -657.302978515625, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.499192953109741, + "rewards/margins": 17.460071563720703, + "rewards/rejected": -14.9608793258667, + "step": 6350 + }, + { + "epoch": 2.16, + "learning_rate": 1.552310210248017e-07, + "logits/chosen": 1.2479125261306763, + "logits/rejected": 2.030150890350342, + "logps/chosen": -445.0252990722656, + "logps/rejected": -781.627197265625, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9407612085342407, + "rewards/margins": 16.6387996673584, + "rewards/rejected": -14.698038101196289, + "step": 6360 + }, + { + "epoch": 2.17, + "learning_rate": 1.5460153594359812e-07, + "logits/chosen": 1.2388721704483032, + "logits/rejected": 2.1668925285339355, + "logps/chosen": -449.60107421875, + "logps/rejected": -652.534423828125, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8397868871688843, + "rewards/margins": 16.469173431396484, + "rewards/rejected": -14.629384994506836, + "step": 6370 + }, + { + "epoch": 2.17, + "learning_rate": 1.5397205086239457e-07, + "logits/chosen": 0.750697135925293, + "logits/rejected": 2.147480010986328, + "logps/chosen": -381.23333740234375, + "logps/rejected": -724.5010986328125, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.091524124145508, + "rewards/margins": 16.812833786010742, + "rewards/rejected": -14.72131061553955, + "step": 6380 + }, + { + "epoch": 2.17, + "learning_rate": 1.5334256578119097e-07, + "logits/chosen": 1.033517599105835, + "logits/rejected": 2.331897735595703, + "logps/chosen": -393.97259521484375, + "logps/rejected": -627.6727905273438, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.801288604736328, + "rewards/margins": 15.796916007995605, + "rewards/rejected": -12.995626449584961, + "step": 6390 + }, + { + "epoch": 2.18, + "learning_rate": 1.527130806999874e-07, + "logits/chosen": 1.0533195734024048, + "logits/rejected": 2.7393133640289307, + "logps/chosen": -460.53857421875, + "logps/rejected": -512.9381103515625, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.899174690246582, + "rewards/margins": 15.319025039672852, + "rewards/rejected": -13.419851303100586, + "step": 6400 + }, + { + "epoch": 2.18, + "eval_logits/chosen": 0.4822961986064911, + "eval_logits/rejected": 2.5138680934906006, + "eval_logps/chosen": -368.5343017578125, + "eval_logps/rejected": -640.0852661132812, + "eval_loss": 0.005253070965409279, + "eval_rewards/accuracies": 0.997474730014801, + "eval_rewards/chosen": 2.045560121536255, + "eval_rewards/margins": 15.791056632995605, + "eval_rewards/rejected": -13.745494842529297, + "eval_runtime": 268.0763, + "eval_samples_per_second": 35.438, + "eval_steps_per_second": 1.108, + "step": 6400 + }, + { + "epoch": 2.18, + "learning_rate": 1.5208359561878384e-07, + "logits/chosen": 1.100904107093811, + "logits/rejected": 2.4689745903015137, + "logps/chosen": -373.98675537109375, + "logps/rejected": -577.3536987304688, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8111152648925781, + "rewards/margins": 15.343942642211914, + "rewards/rejected": -13.53282642364502, + "step": 6410 + }, + { + "epoch": 2.18, + "learning_rate": 1.5145411053758026e-07, + "logits/chosen": 1.3738195896148682, + "logits/rejected": 2.6699304580688477, + "logps/chosen": -375.9863586425781, + "logps/rejected": -624.0731201171875, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6937096118927002, + "rewards/margins": 17.86076545715332, + "rewards/rejected": -16.167057037353516, + "step": 6420 + }, + { + "epoch": 2.19, + "learning_rate": 1.5082462545637666e-07, + "logits/chosen": 0.3751014173030853, + "logits/rejected": 2.8960976600646973, + "logps/chosen": -321.5099792480469, + "logps/rejected": -479.81268310546875, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7421153783798218, + "rewards/margins": 16.647037506103516, + "rewards/rejected": -14.904919624328613, + "step": 6430 + }, + { + "epoch": 2.19, + "learning_rate": 1.501951403751731e-07, + "logits/chosen": 1.0028069019317627, + "logits/rejected": 2.3100218772888184, + "logps/chosen": -312.1414794921875, + "logps/rejected": -645.2931518554688, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.828717589378357, + "rewards/margins": 17.236507415771484, + "rewards/rejected": -15.407788276672363, + "step": 6440 + }, + { + "epoch": 2.19, + "learning_rate": 1.4956565529396953e-07, + "logits/chosen": 1.4677902460098267, + "logits/rejected": 2.6699881553649902, + "logps/chosen": -346.54742431640625, + "logps/rejected": -606.3338623046875, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.090097665786743, + "rewards/margins": 18.220773696899414, + "rewards/rejected": -16.13067626953125, + "step": 6450 + }, + { + "epoch": 2.2, + "learning_rate": 1.4893617021276595e-07, + "logits/chosen": 1.2526359558105469, + "logits/rejected": 2.1189627647399902, + "logps/chosen": -328.7201232910156, + "logps/rejected": -692.3485107421875, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1243045330047607, + "rewards/margins": 17.358163833618164, + "rewards/rejected": -15.233858108520508, + "step": 6460 + }, + { + "epoch": 2.2, + "learning_rate": 1.4830668513156238e-07, + "logits/chosen": 0.1407167762517929, + "logits/rejected": 3.113206148147583, + "logps/chosen": -287.0426330566406, + "logps/rejected": -430.408935546875, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6383129358291626, + "rewards/margins": 17.28408432006836, + "rewards/rejected": -15.645770072937012, + "step": 6470 + }, + { + "epoch": 2.2, + "learning_rate": 1.476772000503588e-07, + "logits/chosen": 0.8808428645133972, + "logits/rejected": 2.396165132522583, + "logps/chosen": -320.5928649902344, + "logps/rejected": -543.3441772460938, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2573113441467285, + "rewards/margins": 16.72433853149414, + "rewards/rejected": -14.467025756835938, + "step": 6480 + }, + { + "epoch": 2.21, + "learning_rate": 1.4704771496915522e-07, + "logits/chosen": 1.4290237426757812, + "logits/rejected": 2.7337679862976074, + "logps/chosen": -343.0905456542969, + "logps/rejected": -558.6626586914062, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.977564811706543, + "rewards/margins": 18.43790054321289, + "rewards/rejected": -16.4603328704834, + "step": 6490 + }, + { + "epoch": 2.21, + "learning_rate": 1.4641822988795167e-07, + "logits/chosen": 1.210451364517212, + "logits/rejected": 2.4949653148651123, + "logps/chosen": -313.0892639160156, + "logps/rejected": -633.3006591796875, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7542457580566406, + "rewards/margins": 14.538752555847168, + "rewards/rejected": -12.784505844116211, + "step": 6500 + }, + { + "epoch": 2.21, + "eval_logits/chosen": 0.48234444856643677, + "eval_logits/rejected": 2.513512372970581, + "eval_logps/chosen": -368.9617919921875, + "eval_logps/rejected": -639.1259765625, + "eval_loss": 0.005004484672099352, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 2.002810478210449, + "eval_rewards/margins": 15.652373313903809, + "eval_rewards/rejected": -13.649561882019043, + "eval_runtime": 268.1176, + "eval_samples_per_second": 35.432, + "eval_steps_per_second": 1.108, + "step": 6500 + }, + { + "epoch": 2.21, + "learning_rate": 1.4578874480674807e-07, + "logits/chosen": 0.5491858124732971, + "logits/rejected": 2.644583225250244, + "logps/chosen": -417.05865478515625, + "logps/rejected": -580.2625732421875, + "loss": 0.0052, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.1090285778045654, + "rewards/margins": 15.84637451171875, + "rewards/rejected": -13.737344741821289, + "step": 6510 + }, + { + "epoch": 2.22, + "learning_rate": 1.451592597255445e-07, + "logits/chosen": 1.1235544681549072, + "logits/rejected": 2.5771431922912598, + "logps/chosen": -411.65399169921875, + "logps/rejected": -593.0504760742188, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7683136463165283, + "rewards/margins": 15.734085083007812, + "rewards/rejected": -13.96576976776123, + "step": 6520 + }, + { + "epoch": 2.22, + "learning_rate": 1.4452977464434094e-07, + "logits/chosen": 0.9458175897598267, + "logits/rejected": 2.292898654937744, + "logps/chosen": -356.336669921875, + "logps/rejected": -608.5050048828125, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6671857833862305, + "rewards/margins": 15.176393508911133, + "rewards/rejected": -13.509206771850586, + "step": 6530 + }, + { + "epoch": 2.22, + "learning_rate": 1.4390028956313736e-07, + "logits/chosen": 0.9157311320304871, + "logits/rejected": 2.6453652381896973, + "logps/chosen": -327.6547546386719, + "logps/rejected": -647.6570434570312, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3297417163848877, + "rewards/margins": 17.928281784057617, + "rewards/rejected": -15.598539352416992, + "step": 6540 + }, + { + "epoch": 2.23, + "learning_rate": 1.4327080448193376e-07, + "logits/chosen": 0.9368341565132141, + "logits/rejected": 2.830165386199951, + "logps/chosen": -491.01385498046875, + "logps/rejected": -593.498291015625, + "loss": 0.0045, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.416783094406128, + "rewards/margins": 17.794885635375977, + "rewards/rejected": -16.378103256225586, + "step": 6550 + }, + { + "epoch": 2.23, + "learning_rate": 1.426413194007302e-07, + "logits/chosen": 0.7394753098487854, + "logits/rejected": 2.6355130672454834, + "logps/chosen": -335.5205078125, + "logps/rejected": -617.947265625, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2752904891967773, + "rewards/margins": 17.116397857666016, + "rewards/rejected": -14.841107368469238, + "step": 6560 + }, + { + "epoch": 2.23, + "learning_rate": 1.4201183431952663e-07, + "logits/chosen": 0.645321249961853, + "logits/rejected": 2.001403331756592, + "logps/chosen": -383.0551452636719, + "logps/rejected": -834.9183349609375, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.033926010131836, + "rewards/margins": 16.158061981201172, + "rewards/rejected": -14.12413501739502, + "step": 6570 + }, + { + "epoch": 2.24, + "learning_rate": 1.4138234923832303e-07, + "logits/chosen": 1.1049634218215942, + "logits/rejected": 2.1733741760253906, + "logps/chosen": -383.75164794921875, + "logps/rejected": -690.3191528320312, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4952940940856934, + "rewards/margins": 17.292388916015625, + "rewards/rejected": -14.797096252441406, + "step": 6580 + }, + { + "epoch": 2.24, + "learning_rate": 1.4075286415711948e-07, + "logits/chosen": 0.8609091639518738, + "logits/rejected": 2.836751937866211, + "logps/chosen": -399.3436279296875, + "logps/rejected": -577.36474609375, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5047794580459595, + "rewards/margins": 16.921968460083008, + "rewards/rejected": -15.417187690734863, + "step": 6590 + }, + { + "epoch": 2.24, + "learning_rate": 1.401233790759159e-07, + "logits/chosen": 0.3242916464805603, + "logits/rejected": 2.3490653038024902, + "logps/chosen": -412.41607666015625, + "logps/rejected": -621.5950927734375, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4823118448257446, + "rewards/margins": 14.215707778930664, + "rewards/rejected": -12.733394622802734, + "step": 6600 + }, + { + "epoch": 2.24, + "eval_logits/chosen": 0.4459000825881958, + "eval_logits/rejected": 2.4828238487243652, + "eval_logps/chosen": -370.1336669921875, + "eval_logps/rejected": -640.5563354492188, + "eval_loss": 0.004967730492353439, + "eval_rewards/accuracies": 0.997474730014801, + "eval_rewards/chosen": 1.8856240510940552, + "eval_rewards/margins": 15.678227424621582, + "eval_rewards/rejected": -13.792603492736816, + "eval_runtime": 268.4797, + "eval_samples_per_second": 35.384, + "eval_steps_per_second": 1.106, + "step": 6600 + }, + { + "epoch": 2.25, + "learning_rate": 1.3949389399471232e-07, + "logits/chosen": 0.537019670009613, + "logits/rejected": 2.592452049255371, + "logps/chosen": -317.28826904296875, + "logps/rejected": -648.7763671875, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9283174276351929, + "rewards/margins": 16.98746681213379, + "rewards/rejected": -15.059147834777832, + "step": 6610 + }, + { + "epoch": 2.25, + "learning_rate": 1.3886440891350874e-07, + "logits/chosen": 1.3698558807373047, + "logits/rejected": 3.0117099285125732, + "logps/chosen": -396.42108154296875, + "logps/rejected": -505.86810302734375, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7210032939910889, + "rewards/margins": 17.65418815612793, + "rewards/rejected": -15.933184623718262, + "step": 6620 + }, + { + "epoch": 2.25, + "learning_rate": 1.3823492383230517e-07, + "logits/chosen": 0.719806432723999, + "logits/rejected": 2.455599546432495, + "logps/chosen": -329.3707275390625, + "logps/rejected": -708.4951782226562, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1283833980560303, + "rewards/margins": 17.73508071899414, + "rewards/rejected": -15.606698989868164, + "step": 6630 + }, + { + "epoch": 2.26, + "learning_rate": 1.376054387511016e-07, + "logits/chosen": 1.4004559516906738, + "logits/rejected": 2.5099542140960693, + "logps/chosen": -344.36944580078125, + "logps/rejected": -640.68017578125, + "loss": 0.0031, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.6813615560531616, + "rewards/margins": 16.262908935546875, + "rewards/rejected": -14.58154582977295, + "step": 6640 + }, + { + "epoch": 2.26, + "learning_rate": 1.36975953669898e-07, + "logits/chosen": 1.0476455688476562, + "logits/rejected": 2.747251033782959, + "logps/chosen": -369.5597229003906, + "logps/rejected": -523.4691162109375, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7639362812042236, + "rewards/margins": 15.755114555358887, + "rewards/rejected": -13.991180419921875, + "step": 6650 + }, + { + "epoch": 2.26, + "learning_rate": 1.3634646858869444e-07, + "logits/chosen": 1.08604097366333, + "logits/rejected": 2.80775785446167, + "logps/chosen": -343.06317138671875, + "logps/rejected": -491.69091796875, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5290716886520386, + "rewards/margins": 15.872842788696289, + "rewards/rejected": -14.343768119812012, + "step": 6660 + }, + { + "epoch": 2.27, + "learning_rate": 1.3571698350749086e-07, + "logits/chosen": 0.8772619366645813, + "logits/rejected": 2.6861140727996826, + "logps/chosen": -364.5007629394531, + "logps/rejected": -608.0274658203125, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6243185997009277, + "rewards/margins": 15.49360179901123, + "rewards/rejected": -12.869282722473145, + "step": 6670 + }, + { + "epoch": 2.27, + "learning_rate": 1.3508749842628728e-07, + "logits/chosen": 0.7491368055343628, + "logits/rejected": 2.350585460662842, + "logps/chosen": -357.9722900390625, + "logps/rejected": -625.5552978515625, + "loss": 0.0057, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.264483690261841, + "rewards/margins": 14.255470275878906, + "rewards/rejected": -11.990984916687012, + "step": 6680 + }, + { + "epoch": 2.27, + "learning_rate": 1.3445801334508373e-07, + "logits/chosen": 1.0448821783065796, + "logits/rejected": 2.7055163383483887, + "logps/chosen": -420.7792053222656, + "logps/rejected": -438.79913330078125, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5239304304122925, + "rewards/margins": 17.5579776763916, + "rewards/rejected": -16.034046173095703, + "step": 6690 + }, + { + "epoch": 2.28, + "learning_rate": 1.3382852826388013e-07, + "logits/chosen": 1.200180172920227, + "logits/rejected": 2.6250221729278564, + "logps/chosen": -337.2734069824219, + "logps/rejected": -551.2679443359375, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7806466817855835, + "rewards/margins": 16.01031494140625, + "rewards/rejected": -14.229669570922852, + "step": 6700 + }, + { + "epoch": 2.28, + "eval_logits/chosen": 0.4470595121383667, + "eval_logits/rejected": 2.469754457473755, + "eval_logps/chosen": -369.5677795410156, + "eval_logps/rejected": -643.3903198242188, + "eval_loss": 0.00491339759901166, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 1.9422091245651245, + "eval_rewards/margins": 16.018211364746094, + "eval_rewards/rejected": -14.07600212097168, + "eval_runtime": 268.6508, + "eval_samples_per_second": 35.362, + "eval_steps_per_second": 1.106, + "step": 6700 + }, + { + "epoch": 2.28, + "learning_rate": 1.3319904318267655e-07, + "logits/chosen": 1.0116021633148193, + "logits/rejected": 2.23539662361145, + "logps/chosen": -368.14630126953125, + "logps/rejected": -677.6047973632812, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9957377910614014, + "rewards/margins": 15.763692855834961, + "rewards/rejected": -13.76795482635498, + "step": 6710 + }, + { + "epoch": 2.28, + "learning_rate": 1.32569558101473e-07, + "logits/chosen": 1.3611485958099365, + "logits/rejected": 2.6158175468444824, + "logps/chosen": -343.1318359375, + "logps/rejected": -639.2994384765625, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.975050926208496, + "rewards/margins": 16.714651107788086, + "rewards/rejected": -14.739601135253906, + "step": 6720 + }, + { + "epoch": 2.29, + "learning_rate": 1.3194007302026942e-07, + "logits/chosen": 1.00726318359375, + "logits/rejected": 2.7914206981658936, + "logps/chosen": -364.0087585449219, + "logps/rejected": -535.8576049804688, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7007040977478027, + "rewards/margins": 17.216846466064453, + "rewards/rejected": -14.516143798828125, + "step": 6730 + }, + { + "epoch": 2.29, + "learning_rate": 1.3131058793906582e-07, + "logits/chosen": 1.64545476436615, + "logits/rejected": 2.4137086868286133, + "logps/chosen": -433.92205810546875, + "logps/rejected": -574.2499389648438, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9041860103607178, + "rewards/margins": 14.816378593444824, + "rewards/rejected": -12.912193298339844, + "step": 6740 + }, + { + "epoch": 2.29, + "learning_rate": 1.3068110285786227e-07, + "logits/chosen": 1.1354012489318848, + "logits/rejected": 2.3269338607788086, + "logps/chosen": -381.748046875, + "logps/rejected": -552.319580078125, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7223217487335205, + "rewards/margins": 15.645159721374512, + "rewards/rejected": -13.922839164733887, + "step": 6750 + }, + { + "epoch": 2.3, + "learning_rate": 1.300516177766587e-07, + "logits/chosen": 0.7011697292327881, + "logits/rejected": 2.4489026069641113, + "logps/chosen": -365.9461669921875, + "logps/rejected": -589.5831298828125, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.280221939086914, + "rewards/margins": 16.866567611694336, + "rewards/rejected": -14.586347579956055, + "step": 6760 + }, + { + "epoch": 2.3, + "learning_rate": 1.294221326954551e-07, + "logits/chosen": 0.6479487419128418, + "logits/rejected": 2.078508138656616, + "logps/chosen": -388.99041748046875, + "logps/rejected": -782.4954833984375, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5865205526351929, + "rewards/margins": 15.948519706726074, + "rewards/rejected": -14.361997604370117, + "step": 6770 + }, + { + "epoch": 2.3, + "learning_rate": 1.2879264761425154e-07, + "logits/chosen": 1.262549638748169, + "logits/rejected": 2.834839344024658, + "logps/chosen": -341.15655517578125, + "logps/rejected": -506.4090270996094, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.049922466278076, + "rewards/margins": 17.553844451904297, + "rewards/rejected": -15.503921508789062, + "step": 6780 + }, + { + "epoch": 2.31, + "learning_rate": 1.2816316253304796e-07, + "logits/chosen": 0.232079416513443, + "logits/rejected": 2.5545456409454346, + "logps/chosen": -283.21435546875, + "logps/rejected": -561.6546630859375, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9523422718048096, + "rewards/margins": 15.473034858703613, + "rewards/rejected": -13.520692825317383, + "step": 6790 + }, + { + "epoch": 2.31, + "learning_rate": 1.2753367745184438e-07, + "logits/chosen": 1.0746533870697021, + "logits/rejected": 2.7360405921936035, + "logps/chosen": -378.8026123046875, + "logps/rejected": -479.0779724121094, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9980930089950562, + "rewards/margins": 16.039087295532227, + "rewards/rejected": -14.040995597839355, + "step": 6800 + }, + { + "epoch": 2.31, + "eval_logits/chosen": 0.4561528265476227, + "eval_logits/rejected": 2.4646434783935547, + "eval_logps/chosen": -370.35699462890625, + "eval_logps/rejected": -647.2789916992188, + "eval_loss": 0.0048464760184288025, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 1.863288402557373, + "eval_rewards/margins": 16.328153610229492, + "eval_rewards/rejected": -14.464864730834961, + "eval_runtime": 268.3678, + "eval_samples_per_second": 35.399, + "eval_steps_per_second": 1.107, + "step": 6800 + }, + { + "epoch": 2.31, + "learning_rate": 1.2690419237064083e-07, + "logits/chosen": 0.6407972574234009, + "logits/rejected": 2.2658538818359375, + "logps/chosen": -491.4794006347656, + "logps/rejected": -666.0134887695312, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.881895661354065, + "rewards/margins": 15.709803581237793, + "rewards/rejected": -13.827908515930176, + "step": 6810 + }, + { + "epoch": 2.32, + "learning_rate": 1.2627470728943723e-07, + "logits/chosen": 0.7811010479927063, + "logits/rejected": 2.0729830265045166, + "logps/chosen": -472.3502502441406, + "logps/rejected": -726.8089599609375, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.061863899230957, + "rewards/margins": 17.083171844482422, + "rewards/rejected": -15.021306991577148, + "step": 6820 + }, + { + "epoch": 2.32, + "learning_rate": 1.2564522220823365e-07, + "logits/chosen": 0.7845171689987183, + "logits/rejected": 2.2855148315429688, + "logps/chosen": -386.7796325683594, + "logps/rejected": -688.0648803710938, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.226625919342041, + "rewards/margins": 16.80321502685547, + "rewards/rejected": -14.57658863067627, + "step": 6830 + }, + { + "epoch": 2.32, + "learning_rate": 1.250157371270301e-07, + "logits/chosen": 0.9920900464057922, + "logits/rejected": 2.7270851135253906, + "logps/chosen": -387.8033142089844, + "logps/rejected": -535.3179931640625, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0011491775512695, + "rewards/margins": 17.804561614990234, + "rewards/rejected": -15.803411483764648, + "step": 6840 + }, + { + "epoch": 2.33, + "learning_rate": 1.243862520458265e-07, + "logits/chosen": 0.44733184576034546, + "logits/rejected": 2.3163790702819824, + "logps/chosen": -367.1181335449219, + "logps/rejected": -533.33203125, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8264821767807007, + "rewards/margins": 17.479494094848633, + "rewards/rejected": -15.6530122756958, + "step": 6850 + }, + { + "epoch": 2.33, + "learning_rate": 1.2375676696462294e-07, + "logits/chosen": 1.0215935707092285, + "logits/rejected": 2.1925833225250244, + "logps/chosen": -402.8360595703125, + "logps/rejected": -763.5743408203125, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4659521579742432, + "rewards/margins": 15.616668701171875, + "rewards/rejected": -14.150716781616211, + "step": 6860 + }, + { + "epoch": 2.34, + "learning_rate": 1.2312728188341934e-07, + "logits/chosen": 1.325761079788208, + "logits/rejected": 2.252635955810547, + "logps/chosen": -348.18145751953125, + "logps/rejected": -673.8773193359375, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1558871269226074, + "rewards/margins": 18.09050941467285, + "rewards/rejected": -15.93462085723877, + "step": 6870 + }, + { + "epoch": 2.34, + "learning_rate": 1.224977968022158e-07, + "logits/chosen": 0.948559582233429, + "logits/rejected": 2.580723762512207, + "logps/chosen": -438.7425231933594, + "logps/rejected": -598.1914672851562, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4692356586456299, + "rewards/margins": 15.825752258300781, + "rewards/rejected": -14.356515884399414, + "step": 6880 + }, + { + "epoch": 2.34, + "learning_rate": 1.218683117210122e-07, + "logits/chosen": 0.8626810908317566, + "logits/rejected": 2.2737300395965576, + "logps/chosen": -320.7707214355469, + "logps/rejected": -713.5247192382812, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.643897771835327, + "rewards/margins": 17.64309310913086, + "rewards/rejected": -14.999194145202637, + "step": 6890 + }, + { + "epoch": 2.35, + "learning_rate": 1.2123882663980863e-07, + "logits/chosen": 1.4227879047393799, + "logits/rejected": 2.165592670440674, + "logps/chosen": -405.59130859375, + "logps/rejected": -746.8988647460938, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1885743141174316, + "rewards/margins": 14.486371994018555, + "rewards/rejected": -12.297799110412598, + "step": 6900 + }, + { + "epoch": 2.35, + "eval_logits/chosen": 0.4292427599430084, + "eval_logits/rejected": 2.427537202835083, + "eval_logps/chosen": -370.9051208496094, + "eval_logps/rejected": -651.1427001953125, + "eval_loss": 0.004910214804112911, + "eval_rewards/accuracies": 0.997474730014801, + "eval_rewards/chosen": 1.8084813356399536, + "eval_rewards/margins": 16.659719467163086, + "eval_rewards/rejected": -14.851237297058105, + "eval_runtime": 267.9814, + "eval_samples_per_second": 35.45, + "eval_steps_per_second": 1.108, + "step": 6900 + }, + { + "epoch": 2.35, + "learning_rate": 1.2060934155860506e-07, + "logits/chosen": 1.6376726627349854, + "logits/rejected": 2.818448543548584, + "logps/chosen": -344.49267578125, + "logps/rejected": -584.34130859375, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.967023253440857, + "rewards/margins": 17.524118423461914, + "rewards/rejected": -15.557093620300293, + "step": 6910 + }, + { + "epoch": 2.35, + "learning_rate": 1.1997985647740148e-07, + "logits/chosen": 0.698118269443512, + "logits/rejected": 2.1906166076660156, + "logps/chosen": -346.07928466796875, + "logps/rejected": -632.543212890625, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3526246547698975, + "rewards/margins": 16.90323829650879, + "rewards/rejected": -14.55061149597168, + "step": 6920 + }, + { + "epoch": 2.36, + "learning_rate": 1.193503713961979e-07, + "logits/chosen": 0.7867122888565063, + "logits/rejected": 1.766857385635376, + "logps/chosen": -498.2786560058594, + "logps/rejected": -871.0455322265625, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9483602046966553, + "rewards/margins": 16.372447967529297, + "rewards/rejected": -14.42408561706543, + "step": 6930 + }, + { + "epoch": 2.36, + "learning_rate": 1.1872088631499433e-07, + "logits/chosen": 0.10261068493127823, + "logits/rejected": 2.4081060886383057, + "logps/chosen": -343.291748046875, + "logps/rejected": -690.6204833984375, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3851890563964844, + "rewards/margins": 16.051509857177734, + "rewards/rejected": -14.666322708129883, + "step": 6940 + }, + { + "epoch": 2.36, + "learning_rate": 1.1809140123379076e-07, + "logits/chosen": 0.6996985673904419, + "logits/rejected": 1.9651216268539429, + "logps/chosen": -345.29193115234375, + "logps/rejected": -695.8432006835938, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2995944023132324, + "rewards/margins": 15.940165519714355, + "rewards/rejected": -13.640571594238281, + "step": 6950 + }, + { + "epoch": 2.37, + "learning_rate": 1.1746191615258717e-07, + "logits/chosen": 1.2867244482040405, + "logits/rejected": 2.4483482837677, + "logps/chosen": -334.0912170410156, + "logps/rejected": -690.3890991210938, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.22255802154541, + "rewards/margins": 19.06466293334961, + "rewards/rejected": -16.842105865478516, + "step": 6960 + }, + { + "epoch": 2.37, + "learning_rate": 1.1683243107138361e-07, + "logits/chosen": 1.11300528049469, + "logits/rejected": 2.5207343101501465, + "logps/chosen": -405.8424377441406, + "logps/rejected": -613.1907958984375, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.195126533508301, + "rewards/margins": 18.852861404418945, + "rewards/rejected": -16.65773582458496, + "step": 6970 + }, + { + "epoch": 2.37, + "learning_rate": 1.1620294599018003e-07, + "logits/chosen": 1.2959927320480347, + "logits/rejected": 2.5358402729034424, + "logps/chosen": -329.21905517578125, + "logps/rejected": -549.5939331054688, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7862987518310547, + "rewards/margins": 15.796490669250488, + "rewards/rejected": -14.01019287109375, + "step": 6980 + }, + { + "epoch": 2.38, + "learning_rate": 1.1557346090897645e-07, + "logits/chosen": 0.9604352712631226, + "logits/rejected": 2.0390219688415527, + "logps/chosen": -319.5994873046875, + "logps/rejected": -810.9552001953125, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9633973836898804, + "rewards/margins": 17.46154022216797, + "rewards/rejected": -15.498143196105957, + "step": 6990 + }, + { + "epoch": 2.38, + "learning_rate": 1.1494397582777288e-07, + "logits/chosen": 0.10136137902736664, + "logits/rejected": 2.167189836502075, + "logps/chosen": -305.06890869140625, + "logps/rejected": -681.5853881835938, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6132564544677734, + "rewards/margins": 15.553873062133789, + "rewards/rejected": -13.9406156539917, + "step": 7000 + }, + { + "epoch": 2.38, + "eval_logits/chosen": 0.44245606660842896, + "eval_logits/rejected": 2.438732624053955, + "eval_logps/chosen": -369.9842224121094, + "eval_logps/rejected": -648.9703369140625, + "eval_loss": 0.004800071474164724, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 1.9005694389343262, + "eval_rewards/margins": 16.534570693969727, + "eval_rewards/rejected": -14.634000778198242, + "eval_runtime": 267.9901, + "eval_samples_per_second": 35.449, + "eval_steps_per_second": 1.108, + "step": 7000 + }, + { + "epoch": 2.38, + "learning_rate": 1.1431449074656931e-07, + "logits/chosen": 1.2209926843643188, + "logits/rejected": 2.4831814765930176, + "logps/chosen": -508.8880920410156, + "logps/rejected": -517.89794921875, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8793928623199463, + "rewards/margins": 17.297401428222656, + "rewards/rejected": -15.418006896972656, + "step": 7010 + }, + { + "epoch": 2.39, + "learning_rate": 1.1368500566536572e-07, + "logits/chosen": 0.8192776441574097, + "logits/rejected": 1.7947208881378174, + "logps/chosen": -453.9606018066406, + "logps/rejected": -689.9783325195312, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.200059175491333, + "rewards/margins": 15.344413757324219, + "rewards/rejected": -13.144353866577148, + "step": 7020 + }, + { + "epoch": 2.39, + "learning_rate": 1.1305552058416214e-07, + "logits/chosen": 0.6392263174057007, + "logits/rejected": 2.1331827640533447, + "logps/chosen": -336.352294921875, + "logps/rejected": -658.5803833007812, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2526023387908936, + "rewards/margins": 18.084606170654297, + "rewards/rejected": -15.832002639770508, + "step": 7030 + }, + { + "epoch": 2.39, + "learning_rate": 1.1242603550295858e-07, + "logits/chosen": 0.29846692085266113, + "logits/rejected": 2.1883413791656494, + "logps/chosen": -304.7272033691406, + "logps/rejected": -715.2191772460938, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2608134746551514, + "rewards/margins": 18.64798927307129, + "rewards/rejected": -16.387174606323242, + "step": 7040 + }, + { + "epoch": 2.4, + "learning_rate": 1.1179655042175499e-07, + "logits/chosen": 1.5515304803848267, + "logits/rejected": 2.8250606060028076, + "logps/chosen": -342.275634765625, + "logps/rejected": -492.2900390625, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4785754680633545, + "rewards/margins": 15.609323501586914, + "rewards/rejected": -14.13074779510498, + "step": 7050 + }, + { + "epoch": 2.4, + "learning_rate": 1.1116706534055143e-07, + "logits/chosen": 0.6783769726753235, + "logits/rejected": 2.544703722000122, + "logps/chosen": -383.35443115234375, + "logps/rejected": -615.2085571289062, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3915653228759766, + "rewards/margins": 19.474990844726562, + "rewards/rejected": -17.083423614501953, + "step": 7060 + }, + { + "epoch": 2.4, + "learning_rate": 1.1053758025934785e-07, + "logits/chosen": 0.6153481006622314, + "logits/rejected": 1.9820592403411865, + "logps/chosen": -319.31768798828125, + "logps/rejected": -786.171875, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.718113899230957, + "rewards/margins": 15.51591682434082, + "rewards/rejected": -13.797802925109863, + "step": 7070 + }, + { + "epoch": 2.41, + "learning_rate": 1.0990809517814427e-07, + "logits/chosen": 1.1642529964447021, + "logits/rejected": 2.6903514862060547, + "logps/chosen": -388.19854736328125, + "logps/rejected": -527.0130004882812, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.783219337463379, + "rewards/margins": 16.403539657592773, + "rewards/rejected": -14.620318412780762, + "step": 7080 + }, + { + "epoch": 2.41, + "learning_rate": 1.092786100969407e-07, + "logits/chosen": 1.280145287513733, + "logits/rejected": 2.317441463470459, + "logps/chosen": -338.6867370605469, + "logps/rejected": -638.913330078125, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0096359252929688, + "rewards/margins": 15.663032531738281, + "rewards/rejected": -13.653398513793945, + "step": 7090 + }, + { + "epoch": 2.41, + "learning_rate": 1.0864912501573713e-07, + "logits/chosen": 0.9446396827697754, + "logits/rejected": 2.444380283355713, + "logps/chosen": -313.5970458984375, + "logps/rejected": -557.501953125, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4033139944076538, + "rewards/margins": 12.766210556030273, + "rewards/rejected": -11.362897872924805, + "step": 7100 + }, + { + "epoch": 2.41, + "eval_logits/chosen": 0.42960870265960693, + "eval_logits/rejected": 2.4152705669403076, + "eval_logps/chosen": -370.7745666503906, + "eval_logps/rejected": -653.0066528320312, + "eval_loss": 0.00468032481148839, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 1.8215343952178955, + "eval_rewards/margins": 16.85917091369629, + "eval_rewards/rejected": -15.037636756896973, + "eval_runtime": 267.6607, + "eval_samples_per_second": 35.493, + "eval_steps_per_second": 1.11, + "step": 7100 + }, + { + "epoch": 2.42, + "learning_rate": 1.0801963993453354e-07, + "logits/chosen": 0.5551157593727112, + "logits/rejected": 2.690716028213501, + "logps/chosen": -337.18963623046875, + "logps/rejected": -474.79754638671875, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5211597681045532, + "rewards/margins": 16.70118522644043, + "rewards/rejected": -15.180026054382324, + "step": 7110 + }, + { + "epoch": 2.42, + "learning_rate": 1.0739015485332998e-07, + "logits/chosen": 0.7350689768791199, + "logits/rejected": 2.314960479736328, + "logps/chosen": -413.591796875, + "logps/rejected": -702.0711059570312, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.877844214439392, + "rewards/margins": 15.394256591796875, + "rewards/rejected": -13.516412734985352, + "step": 7120 + }, + { + "epoch": 2.42, + "learning_rate": 1.067606697721264e-07, + "logits/chosen": 1.090998888015747, + "logits/rejected": 2.3243186473846436, + "logps/chosen": -394.4823913574219, + "logps/rejected": -571.0720825195312, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2767682075500488, + "rewards/margins": 15.756521224975586, + "rewards/rejected": -14.479754447937012, + "step": 7130 + }, + { + "epoch": 2.43, + "learning_rate": 1.0613118469092282e-07, + "logits/chosen": 0.48929041624069214, + "logits/rejected": 1.5767260789871216, + "logps/chosen": -382.69842529296875, + "logps/rejected": -966.6769409179688, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.252427577972412, + "rewards/margins": 15.286422729492188, + "rewards/rejected": -13.033994674682617, + "step": 7140 + }, + { + "epoch": 2.43, + "learning_rate": 1.0550169960971924e-07, + "logits/chosen": 0.6126815676689148, + "logits/rejected": 2.0698182582855225, + "logps/chosen": -432.7958068847656, + "logps/rejected": -761.7425537109375, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.890511155128479, + "rewards/margins": 16.206470489501953, + "rewards/rejected": -14.315958023071289, + "step": 7150 + }, + { + "epoch": 2.43, + "learning_rate": 1.0487221452851568e-07, + "logits/chosen": 0.43376216292381287, + "logits/rejected": 2.1499080657958984, + "logps/chosen": -360.62725830078125, + "logps/rejected": -782.2716674804688, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4807064533233643, + "rewards/margins": 14.774923324584961, + "rewards/rejected": -13.294217109680176, + "step": 7160 + }, + { + "epoch": 2.44, + "learning_rate": 1.0424272944731209e-07, + "logits/chosen": 1.1980317831039429, + "logits/rejected": 2.1737492084503174, + "logps/chosen": -398.87933349609375, + "logps/rejected": -765.1239013671875, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9391682147979736, + "rewards/margins": 17.53460121154785, + "rewards/rejected": -15.595433235168457, + "step": 7170 + }, + { + "epoch": 2.44, + "learning_rate": 1.0361324436610853e-07, + "logits/chosen": 1.1051430702209473, + "logits/rejected": 2.751847743988037, + "logps/chosen": -397.548095703125, + "logps/rejected": -588.2581787109375, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.978877067565918, + "rewards/margins": 19.947650909423828, + "rewards/rejected": -17.968774795532227, + "step": 7180 + }, + { + "epoch": 2.44, + "learning_rate": 1.0298375928490494e-07, + "logits/chosen": 1.0062482357025146, + "logits/rejected": 2.347341299057007, + "logps/chosen": -315.4201965332031, + "logps/rejected": -673.93896484375, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7003400325775146, + "rewards/margins": 18.57695770263672, + "rewards/rejected": -16.876617431640625, + "step": 7190 + }, + { + "epoch": 2.45, + "learning_rate": 1.0235427420370137e-07, + "logits/chosen": 1.1636916399002075, + "logits/rejected": 2.218010425567627, + "logps/chosen": -342.746826171875, + "logps/rejected": -663.3795166015625, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.88577139377594, + "rewards/margins": 18.186866760253906, + "rewards/rejected": -16.301095962524414, + "step": 7200 + }, + { + "epoch": 2.45, + "eval_logits/chosen": 0.42482060194015503, + "eval_logits/rejected": 2.4152801036834717, + "eval_logps/chosen": -370.7949523925781, + "eval_logps/rejected": -652.7421875, + "eval_loss": 0.004596411250531673, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 1.8194925785064697, + "eval_rewards/margins": 16.83067512512207, + "eval_rewards/rejected": -15.01118278503418, + "eval_runtime": 268.6365, + "eval_samples_per_second": 35.364, + "eval_steps_per_second": 1.106, + "step": 7200 + }, + { + "epoch": 2.45, + "learning_rate": 1.017247891224978e-07, + "logits/chosen": 0.6930907964706421, + "logits/rejected": 3.015477418899536, + "logps/chosen": -323.17840576171875, + "logps/rejected": -471.4164123535156, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.021176338195801, + "rewards/margins": 17.877004623413086, + "rewards/rejected": -15.855827331542969, + "step": 7210 + }, + { + "epoch": 2.45, + "learning_rate": 1.0109530404129422e-07, + "logits/chosen": 1.0820677280426025, + "logits/rejected": 2.5256545543670654, + "logps/chosen": -402.204833984375, + "logps/rejected": -561.5673217773438, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3005762100219727, + "rewards/margins": 18.47879409790039, + "rewards/rejected": -17.178218841552734, + "step": 7220 + }, + { + "epoch": 2.46, + "learning_rate": 1.0046581896009064e-07, + "logits/chosen": 1.2416799068450928, + "logits/rejected": 2.559627056121826, + "logps/chosen": -369.97821044921875, + "logps/rejected": -608.08251953125, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5119627714157104, + "rewards/margins": 18.328359603881836, + "rewards/rejected": -16.81639862060547, + "step": 7230 + }, + { + "epoch": 2.46, + "learning_rate": 9.983633387888708e-08, + "logits/chosen": 0.5990116000175476, + "logits/rejected": 1.6854044198989868, + "logps/chosen": -371.9095764160156, + "logps/rejected": -837.9762573242188, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8408832550048828, + "rewards/margins": 17.786670684814453, + "rewards/rejected": -15.945785522460938, + "step": 7240 + }, + { + "epoch": 2.46, + "learning_rate": 9.920684879768348e-08, + "logits/chosen": 1.354547381401062, + "logits/rejected": 2.4100263118743896, + "logps/chosen": -451.39874267578125, + "logps/rejected": -672.2733154296875, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4303014278411865, + "rewards/margins": 15.943695068359375, + "rewards/rejected": -13.513392448425293, + "step": 7250 + }, + { + "epoch": 2.47, + "learning_rate": 9.857736371647991e-08, + "logits/chosen": 0.8520407676696777, + "logits/rejected": 2.337068557739258, + "logps/chosen": -460.9368591308594, + "logps/rejected": -715.7650146484375, + "loss": 0.0053, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.073951482772827, + "rewards/margins": 16.552248001098633, + "rewards/rejected": -14.478296279907227, + "step": 7260 + }, + { + "epoch": 2.47, + "learning_rate": 9.794787863527634e-08, + "logits/chosen": 1.0755064487457275, + "logits/rejected": 2.564039707183838, + "logps/chosen": -459.77691650390625, + "logps/rejected": -473.631591796875, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.273218870162964, + "rewards/margins": 18.604473114013672, + "rewards/rejected": -16.331254959106445, + "step": 7270 + }, + { + "epoch": 2.47, + "learning_rate": 9.731839355407275e-08, + "logits/chosen": 0.769875705242157, + "logits/rejected": 2.427485942840576, + "logps/chosen": -370.8426208496094, + "logps/rejected": -486.0166015625, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9036086797714233, + "rewards/margins": 14.996641159057617, + "rewards/rejected": -13.09303092956543, + "step": 7280 + }, + { + "epoch": 2.48, + "learning_rate": 9.668890847286919e-08, + "logits/chosen": 1.0000903606414795, + "logits/rejected": 2.689542770385742, + "logps/chosen": -446.7685546875, + "logps/rejected": -504.4189453125, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9139716625213623, + "rewards/margins": 16.287158966064453, + "rewards/rejected": -14.373187065124512, + "step": 7290 + }, + { + "epoch": 2.48, + "learning_rate": 9.605942339166561e-08, + "logits/chosen": 0.08227036148309708, + "logits/rejected": 1.5266690254211426, + "logps/chosen": -370.2149658203125, + "logps/rejected": -883.3233642578125, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4059269428253174, + "rewards/margins": 16.193233489990234, + "rewards/rejected": -13.787304878234863, + "step": 7300 + }, + { + "epoch": 2.48, + "eval_logits/chosen": 0.4234156608581543, + "eval_logits/rejected": 2.4336464405059814, + "eval_logps/chosen": -370.06939697265625, + "eval_logps/rejected": -646.7868041992188, + "eval_loss": 0.004505614284425974, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 1.8920491933822632, + "eval_rewards/margins": 16.307695388793945, + "eval_rewards/rejected": -14.415645599365234, + "eval_runtime": 267.5767, + "eval_samples_per_second": 35.504, + "eval_steps_per_second": 1.11, + "step": 7300 + }, + { + "epoch": 2.48, + "learning_rate": 9.542993831046203e-08, + "logits/chosen": 1.335982084274292, + "logits/rejected": 2.717874050140381, + "logps/chosen": -380.83026123046875, + "logps/rejected": -566.5546264648438, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8207553625106812, + "rewards/margins": 15.22517204284668, + "rewards/rejected": -13.40441608428955, + "step": 7310 + }, + { + "epoch": 2.49, + "learning_rate": 9.480045322925846e-08, + "logits/chosen": 0.7195658683776855, + "logits/rejected": 2.657860279083252, + "logps/chosen": -369.21490478515625, + "logps/rejected": -649.6099853515625, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9588321447372437, + "rewards/margins": 16.215194702148438, + "rewards/rejected": -14.256364822387695, + "step": 7320 + }, + { + "epoch": 2.49, + "learning_rate": 9.41709681480549e-08, + "logits/chosen": 1.2745463848114014, + "logits/rejected": 2.756723165512085, + "logps/chosen": -319.5630187988281, + "logps/rejected": -561.6343383789062, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1544394493103027, + "rewards/margins": 16.32255744934082, + "rewards/rejected": -14.168116569519043, + "step": 7330 + }, + { + "epoch": 2.49, + "learning_rate": 9.35414830668513e-08, + "logits/chosen": 0.10680651664733887, + "logits/rejected": 2.273437023162842, + "logps/chosen": -281.9779968261719, + "logps/rejected": -660.8433837890625, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5733397006988525, + "rewards/margins": 15.828512191772461, + "rewards/rejected": -14.25517463684082, + "step": 7340 + }, + { + "epoch": 2.5, + "learning_rate": 9.291199798564774e-08, + "logits/chosen": 0.892861545085907, + "logits/rejected": 2.3723952770233154, + "logps/chosen": -346.60345458984375, + "logps/rejected": -658.56689453125, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2672369480133057, + "rewards/margins": 16.97647476196289, + "rewards/rejected": -14.70923900604248, + "step": 7350 + }, + { + "epoch": 2.5, + "learning_rate": 9.228251290444416e-08, + "logits/chosen": 0.6463754773139954, + "logits/rejected": 1.9796451330184937, + "logps/chosen": -372.3808288574219, + "logps/rejected": -662.8326416015625, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2259275913238525, + "rewards/margins": 15.783831596374512, + "rewards/rejected": -13.557904243469238, + "step": 7360 + }, + { + "epoch": 2.51, + "learning_rate": 9.165302782324058e-08, + "logits/chosen": 1.0133006572723389, + "logits/rejected": 2.180471897125244, + "logps/chosen": -337.8099060058594, + "logps/rejected": -624.0775146484375, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.715206503868103, + "rewards/margins": 16.924972534179688, + "rewards/rejected": -15.209765434265137, + "step": 7370 + }, + { + "epoch": 2.51, + "learning_rate": 9.102354274203701e-08, + "logits/chosen": 0.553676962852478, + "logits/rejected": 2.322350263595581, + "logps/chosen": -360.35675048828125, + "logps/rejected": -678.6683349609375, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9695672988891602, + "rewards/margins": 17.130569458007812, + "rewards/rejected": -15.161005020141602, + "step": 7380 + }, + { + "epoch": 2.51, + "learning_rate": 9.039405766083344e-08, + "logits/chosen": 1.1079872846603394, + "logits/rejected": 1.9017117023468018, + "logps/chosen": -328.49505615234375, + "logps/rejected": -786.6644287109375, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4413007497787476, + "rewards/margins": 14.29881763458252, + "rewards/rejected": -12.857516288757324, + "step": 7390 + }, + { + "epoch": 2.52, + "learning_rate": 8.976457257962985e-08, + "logits/chosen": 1.005599021911621, + "logits/rejected": 2.1879096031188965, + "logps/chosen": -335.03521728515625, + "logps/rejected": -697.0205688476562, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6808414459228516, + "rewards/margins": 15.54613208770752, + "rewards/rejected": -13.865290641784668, + "step": 7400 + }, + { + "epoch": 2.52, + "eval_logits/chosen": 0.41171300411224365, + "eval_logits/rejected": 2.4101154804229736, + "eval_logps/chosen": -371.1637878417969, + "eval_logps/rejected": -649.152587890625, + "eval_loss": 0.004430423025041819, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 1.7826143503189087, + "eval_rewards/margins": 16.434837341308594, + "eval_rewards/rejected": -14.652223587036133, + "eval_runtime": 268.1809, + "eval_samples_per_second": 35.424, + "eval_steps_per_second": 1.107, + "step": 7400 + }, + { + "epoch": 2.52, + "learning_rate": 8.913508749842629e-08, + "logits/chosen": 0.9914455413818359, + "logits/rejected": 2.459635019302368, + "logps/chosen": -328.3199768066406, + "logps/rejected": -625.0198974609375, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8556253910064697, + "rewards/margins": 16.197139739990234, + "rewards/rejected": -14.341516494750977, + "step": 7410 + }, + { + "epoch": 2.52, + "learning_rate": 8.850560241722271e-08, + "logits/chosen": 1.4172303676605225, + "logits/rejected": 2.113933563232422, + "logps/chosen": -391.9002990722656, + "logps/rejected": -634.8341064453125, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3552289009094238, + "rewards/margins": 14.91827392578125, + "rewards/rejected": -13.563047409057617, + "step": 7420 + }, + { + "epoch": 2.53, + "learning_rate": 8.787611733601913e-08, + "logits/chosen": 1.3201543092727661, + "logits/rejected": 2.8121328353881836, + "logps/chosen": -495.87701416015625, + "logps/rejected": -456.218017578125, + "loss": 0.0042, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.4872987270355225, + "rewards/margins": 16.895530700683594, + "rewards/rejected": -15.408231735229492, + "step": 7430 + }, + { + "epoch": 2.53, + "learning_rate": 8.724663225481556e-08, + "logits/chosen": 0.6333423852920532, + "logits/rejected": 2.0092949867248535, + "logps/chosen": -378.5120849609375, + "logps/rejected": -762.9464111328125, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.560328960418701, + "rewards/margins": 19.02144432067871, + "rewards/rejected": -16.461116790771484, + "step": 7440 + }, + { + "epoch": 2.53, + "learning_rate": 8.6617147173612e-08, + "logits/chosen": 1.240923285484314, + "logits/rejected": 2.028729200363159, + "logps/chosen": -340.8464660644531, + "logps/rejected": -809.6685180664062, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5228168964385986, + "rewards/margins": 17.846059799194336, + "rewards/rejected": -16.3232421875, + "step": 7450 + }, + { + "epoch": 2.54, + "learning_rate": 8.59876620924084e-08, + "logits/chosen": 0.6715523600578308, + "logits/rejected": 2.028759717941284, + "logps/chosen": -512.22705078125, + "logps/rejected": -689.6047973632812, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.257208824157715, + "rewards/margins": 17.234987258911133, + "rewards/rejected": -14.977777481079102, + "step": 7460 + }, + { + "epoch": 2.54, + "learning_rate": 8.535817701120483e-08, + "logits/chosen": 0.9357270002365112, + "logits/rejected": 1.8091167211532593, + "logps/chosen": -338.4635925292969, + "logps/rejected": -813.4832763671875, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0143938064575195, + "rewards/margins": 15.582258224487305, + "rewards/rejected": -13.567866325378418, + "step": 7470 + }, + { + "epoch": 2.54, + "learning_rate": 8.472869193000126e-08, + "logits/chosen": 0.7832736968994141, + "logits/rejected": 2.180711269378662, + "logps/chosen": -516.1185913085938, + "logps/rejected": -668.5208740234375, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3803958892822266, + "rewards/margins": 17.23041534423828, + "rewards/rejected": -15.850018501281738, + "step": 7480 + }, + { + "epoch": 2.55, + "learning_rate": 8.409920684879767e-08, + "logits/chosen": 1.112322449684143, + "logits/rejected": 2.341900110244751, + "logps/chosen": -383.68701171875, + "logps/rejected": -630.36279296875, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8344999551773071, + "rewards/margins": 16.628482818603516, + "rewards/rejected": -14.793981552124023, + "step": 7490 + }, + { + "epoch": 2.55, + "learning_rate": 8.346972176759411e-08, + "logits/chosen": 0.7441210746765137, + "logits/rejected": 1.9885940551757812, + "logps/chosen": -481.265869140625, + "logps/rejected": -694.01416015625, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1450316905975342, + "rewards/margins": 17.184598922729492, + "rewards/rejected": -16.039567947387695, + "step": 7500 + }, + { + "epoch": 2.55, + "eval_logits/chosen": 0.40691351890563965, + "eval_logits/rejected": 2.4040136337280273, + "eval_logps/chosen": -370.7875061035156, + "eval_logps/rejected": -649.6731567382812, + "eval_loss": 0.004426932893693447, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 1.8202381134033203, + "eval_rewards/margins": 16.524518966674805, + "eval_rewards/rejected": -14.704280853271484, + "eval_runtime": 270.7903, + "eval_samples_per_second": 35.083, + "eval_steps_per_second": 1.097, + "step": 7500 + }, + { + "epoch": 2.55, + "learning_rate": 8.284023668639053e-08, + "logits/chosen": 0.9541034698486328, + "logits/rejected": 2.3612122535705566, + "logps/chosen": -464.33599853515625, + "logps/rejected": -559.6075439453125, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3160302639007568, + "rewards/margins": 14.19543743133545, + "rewards/rejected": -12.879406929016113, + "step": 7510 + }, + { + "epoch": 2.56, + "learning_rate": 8.221075160518695e-08, + "logits/chosen": 0.6179080009460449, + "logits/rejected": 2.113412380218506, + "logps/chosen": -405.10809326171875, + "logps/rejected": -635.8704833984375, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.670983910560608, + "rewards/margins": 15.743799209594727, + "rewards/rejected": -14.072813034057617, + "step": 7520 + }, + { + "epoch": 2.56, + "learning_rate": 8.158126652398338e-08, + "logits/chosen": 0.9145607948303223, + "logits/rejected": 2.355287790298462, + "logps/chosen": -437.69085693359375, + "logps/rejected": -519.5291748046875, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.611969232559204, + "rewards/margins": 17.811548233032227, + "rewards/rejected": -15.19958209991455, + "step": 7530 + }, + { + "epoch": 2.56, + "learning_rate": 8.09517814427798e-08, + "logits/chosen": 0.7780656814575195, + "logits/rejected": 2.4244775772094727, + "logps/chosen": -307.7718200683594, + "logps/rejected": -620.1722412109375, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0296103954315186, + "rewards/margins": 17.512142181396484, + "rewards/rejected": -15.48253059387207, + "step": 7540 + }, + { + "epoch": 2.57, + "learning_rate": 8.032229636157622e-08, + "logits/chosen": 0.618104100227356, + "logits/rejected": 1.8979343175888062, + "logps/chosen": -301.73431396484375, + "logps/rejected": -839.3603515625, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6576316356658936, + "rewards/margins": 17.44778823852539, + "rewards/rejected": -15.790155410766602, + "step": 7550 + }, + { + "epoch": 2.57, + "learning_rate": 7.969281128037266e-08, + "logits/chosen": 0.9188628196716309, + "logits/rejected": 2.692812919616699, + "logps/chosen": -330.3370361328125, + "logps/rejected": -587.1077880859375, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.176570415496826, + "rewards/margins": 16.620418548583984, + "rewards/rejected": -14.44384765625, + "step": 7560 + }, + { + "epoch": 2.57, + "learning_rate": 7.906332619916907e-08, + "logits/chosen": 0.35936424136161804, + "logits/rejected": 2.5122036933898926, + "logps/chosen": -386.1283874511719, + "logps/rejected": -658.3817138671875, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.331057071685791, + "rewards/margins": 15.813084602355957, + "rewards/rejected": -14.482028007507324, + "step": 7570 + }, + { + "epoch": 2.58, + "learning_rate": 7.84338411179655e-08, + "logits/chosen": 1.2575255632400513, + "logits/rejected": 2.4196267127990723, + "logps/chosen": -432.21771240234375, + "logps/rejected": -659.4260864257812, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.846247673034668, + "rewards/margins": 18.127437591552734, + "rewards/rejected": -16.281190872192383, + "step": 7580 + }, + { + "epoch": 2.58, + "learning_rate": 7.780435603676193e-08, + "logits/chosen": 0.49740689992904663, + "logits/rejected": 2.428440809249878, + "logps/chosen": -310.4542541503906, + "logps/rejected": -639.364990234375, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8841445446014404, + "rewards/margins": 18.3008975982666, + "rewards/rejected": -16.416751861572266, + "step": 7590 + }, + { + "epoch": 2.58, + "learning_rate": 7.717487095555835e-08, + "logits/chosen": 1.0477213859558105, + "logits/rejected": 2.2913241386413574, + "logps/chosen": -339.94976806640625, + "logps/rejected": -717.8651123046875, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7267446517944336, + "rewards/margins": 17.785921096801758, + "rewards/rejected": -16.05917739868164, + "step": 7600 + }, + { + "epoch": 2.58, + "eval_logits/chosen": 0.40868476033210754, + "eval_logits/rejected": 2.4019243717193604, + "eval_logps/chosen": -370.2781677246094, + "eval_logps/rejected": -650.1920776367188, + "eval_loss": 0.004376308061182499, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 1.8711698055267334, + "eval_rewards/margins": 16.627342224121094, + "eval_rewards/rejected": -14.756170272827148, + "eval_runtime": 269.9357, + "eval_samples_per_second": 35.194, + "eval_steps_per_second": 1.1, + "step": 7600 + }, + { + "epoch": 2.59, + "learning_rate": 7.654538587435477e-08, + "logits/chosen": 0.8364641070365906, + "logits/rejected": 2.4752678871154785, + "logps/chosen": -344.04510498046875, + "logps/rejected": -617.2227172851562, + "loss": 0.0034, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.264724016189575, + "rewards/margins": 17.439924240112305, + "rewards/rejected": -15.175201416015625, + "step": 7610 + }, + { + "epoch": 2.59, + "learning_rate": 7.591590079315121e-08, + "logits/chosen": 0.7828987836837769, + "logits/rejected": 2.141719341278076, + "logps/chosen": -364.89471435546875, + "logps/rejected": -642.4763793945312, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7357105016708374, + "rewards/margins": 15.866876602172852, + "rewards/rejected": -14.1311674118042, + "step": 7620 + }, + { + "epoch": 2.59, + "learning_rate": 7.528641571194762e-08, + "logits/chosen": 0.6405404806137085, + "logits/rejected": 1.9401140213012695, + "logps/chosen": -305.2798156738281, + "logps/rejected": -775.0665283203125, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.271310329437256, + "rewards/margins": 16.512285232543945, + "rewards/rejected": -14.240976333618164, + "step": 7630 + }, + { + "epoch": 2.6, + "learning_rate": 7.465693063074405e-08, + "logits/chosen": 1.0794910192489624, + "logits/rejected": 2.674074172973633, + "logps/chosen": -394.71795654296875, + "logps/rejected": -641.1694946289062, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7397727966308594, + "rewards/margins": 17.497838973999023, + "rewards/rejected": -15.758066177368164, + "step": 7640 + }, + { + "epoch": 2.6, + "learning_rate": 7.402744554954048e-08, + "logits/chosen": 1.1082245111465454, + "logits/rejected": 2.569019317626953, + "logps/chosen": -394.66278076171875, + "logps/rejected": -548.3233642578125, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.965540885925293, + "rewards/margins": 15.515264511108398, + "rewards/rejected": -13.549725532531738, + "step": 7650 + }, + { + "epoch": 2.6, + "learning_rate": 7.33979604683369e-08, + "logits/chosen": 1.0259652137756348, + "logits/rejected": 2.4335434436798096, + "logps/chosen": -407.17694091796875, + "logps/rejected": -562.71826171875, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1270415782928467, + "rewards/margins": 15.186243057250977, + "rewards/rejected": -13.05920124053955, + "step": 7660 + }, + { + "epoch": 2.61, + "learning_rate": 7.276847538713332e-08, + "logits/chosen": 0.868080735206604, + "logits/rejected": 2.634049892425537, + "logps/chosen": -417.909423828125, + "logps/rejected": -581.9749145507812, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.856361746788025, + "rewards/margins": 17.1064395904541, + "rewards/rejected": -15.250079154968262, + "step": 7670 + }, + { + "epoch": 2.61, + "learning_rate": 7.213899030592976e-08, + "logits/chosen": 1.3671529293060303, + "logits/rejected": 2.712268829345703, + "logps/chosen": -409.2546081542969, + "logps/rejected": -549.1846313476562, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8779876232147217, + "rewards/margins": 15.200727462768555, + "rewards/rejected": -13.322738647460938, + "step": 7680 + }, + { + "epoch": 2.61, + "learning_rate": 7.150950522472617e-08, + "logits/chosen": 0.3646644949913025, + "logits/rejected": 2.458853244781494, + "logps/chosen": -398.7442932128906, + "logps/rejected": -553.7511596679688, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1193220615386963, + "rewards/margins": 18.095134735107422, + "rewards/rejected": -15.975812911987305, + "step": 7690 + }, + { + "epoch": 2.62, + "learning_rate": 7.088002014352259e-08, + "logits/chosen": 1.1231930255889893, + "logits/rejected": 2.2001609802246094, + "logps/chosen": -459.5314025878906, + "logps/rejected": -600.494384765625, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8814414739608765, + "rewards/margins": 16.648889541625977, + "rewards/rejected": -14.767448425292969, + "step": 7700 + }, + { + "epoch": 2.62, + "eval_logits/chosen": 0.4114474952220917, + "eval_logits/rejected": 2.3995602130889893, + "eval_logps/chosen": -370.58355712890625, + "eval_logps/rejected": -651.2406616210938, + "eval_loss": 0.004253820516169071, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 1.8406320810317993, + "eval_rewards/margins": 16.70166778564453, + "eval_rewards/rejected": -14.861037254333496, + "eval_runtime": 271.0204, + "eval_samples_per_second": 35.053, + "eval_steps_per_second": 1.096, + "step": 7700 + }, + { + "epoch": 2.62, + "learning_rate": 7.025053506231903e-08, + "logits/chosen": 0.5934748649597168, + "logits/rejected": 2.5598092079162598, + "logps/chosen": -348.83203125, + "logps/rejected": -445.65087890625, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9641309976577759, + "rewards/margins": 17.064300537109375, + "rewards/rejected": -15.10016918182373, + "step": 7710 + }, + { + "epoch": 2.62, + "learning_rate": 6.962104998111543e-08, + "logits/chosen": 1.384304404258728, + "logits/rejected": 2.5968146324157715, + "logps/chosen": -346.9498596191406, + "logps/rejected": -583.7931518554688, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3904762268066406, + "rewards/margins": 17.099462509155273, + "rewards/rejected": -14.708989143371582, + "step": 7720 + }, + { + "epoch": 2.63, + "learning_rate": 6.899156489991187e-08, + "logits/chosen": 0.6948369145393372, + "logits/rejected": 2.5097079277038574, + "logps/chosen": -368.5804138183594, + "logps/rejected": -614.776611328125, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0649654865264893, + "rewards/margins": 18.139123916625977, + "rewards/rejected": -16.074155807495117, + "step": 7730 + }, + { + "epoch": 2.63, + "learning_rate": 6.83620798187083e-08, + "logits/chosen": 0.633045494556427, + "logits/rejected": 2.577214002609253, + "logps/chosen": -442.4937438964844, + "logps/rejected": -551.2289428710938, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.669154405593872, + "rewards/margins": 16.923871994018555, + "rewards/rejected": -15.254716873168945, + "step": 7740 + }, + { + "epoch": 2.63, + "learning_rate": 6.773259473750472e-08, + "logits/chosen": 0.40775442123413086, + "logits/rejected": 2.2701003551483154, + "logps/chosen": -307.6568908691406, + "logps/rejected": -603.6043701171875, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4765632152557373, + "rewards/margins": 16.266742706298828, + "rewards/rejected": -13.790179252624512, + "step": 7750 + }, + { + "epoch": 2.64, + "learning_rate": 6.710310965630114e-08, + "logits/chosen": 1.0287370681762695, + "logits/rejected": 1.9805313348770142, + "logps/chosen": -553.20068359375, + "logps/rejected": -668.2408447265625, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.096820831298828, + "rewards/margins": 16.469932556152344, + "rewards/rejected": -14.373109817504883, + "step": 7760 + }, + { + "epoch": 2.64, + "learning_rate": 6.647362457509758e-08, + "logits/chosen": 0.6259018182754517, + "logits/rejected": 2.5759027004241943, + "logps/chosen": -360.9418029785156, + "logps/rejected": -528.69384765625, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0843729972839355, + "rewards/margins": 16.262943267822266, + "rewards/rejected": -14.178568840026855, + "step": 7770 + }, + { + "epoch": 2.64, + "learning_rate": 6.584413949389398e-08, + "logits/chosen": 0.9427730441093445, + "logits/rejected": 2.1246232986450195, + "logps/chosen": -437.2471618652344, + "logps/rejected": -693.9963989257812, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5401811599731445, + "rewards/margins": 15.073539733886719, + "rewards/rejected": -13.533358573913574, + "step": 7780 + }, + { + "epoch": 2.65, + "learning_rate": 6.521465441269042e-08, + "logits/chosen": 0.3761066794395447, + "logits/rejected": 2.0467357635498047, + "logps/chosen": -427.61212158203125, + "logps/rejected": -732.0155029296875, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0713422298431396, + "rewards/margins": 16.06171226501465, + "rewards/rejected": -13.99036979675293, + "step": 7790 + }, + { + "epoch": 2.65, + "learning_rate": 6.458516933148684e-08, + "logits/chosen": 0.5202646851539612, + "logits/rejected": 2.003371238708496, + "logps/chosen": -361.00372314453125, + "logps/rejected": -708.7975463867188, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5457887649536133, + "rewards/margins": 16.85226821899414, + "rewards/rejected": -15.306478500366211, + "step": 7800 + }, + { + "epoch": 2.65, + "eval_logits/chosen": 0.41474515199661255, + "eval_logits/rejected": 2.3936104774475098, + "eval_logps/chosen": -370.9483642578125, + "eval_logps/rejected": -653.4503173828125, + "eval_loss": 0.004259427078068256, + "eval_rewards/accuracies": 0.9991582632064819, + "eval_rewards/chosen": 1.8041552305221558, + "eval_rewards/margins": 16.886152267456055, + "eval_rewards/rejected": -15.08199691772461, + "eval_runtime": 270.6896, + "eval_samples_per_second": 35.096, + "eval_steps_per_second": 1.097, + "step": 7800 + }, + { + "epoch": 2.65, + "learning_rate": 6.395568425028327e-08, + "logits/chosen": 0.537198543548584, + "logits/rejected": 1.7283565998077393, + "logps/chosen": -301.32806396484375, + "logps/rejected": -793.250244140625, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0168819427490234, + "rewards/margins": 16.878694534301758, + "rewards/rejected": -14.861811637878418, + "step": 7810 + }, + { + "epoch": 2.66, + "learning_rate": 6.332619916907969e-08, + "logits/chosen": 0.9977883100509644, + "logits/rejected": 2.5318446159362793, + "logps/chosen": -313.09185791015625, + "logps/rejected": -679.9685668945312, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.932676076889038, + "rewards/margins": 17.820940017700195, + "rewards/rejected": -15.888264656066895, + "step": 7820 + }, + { + "epoch": 2.66, + "learning_rate": 6.269671408787612e-08, + "logits/chosen": 0.7894630432128906, + "logits/rejected": 2.1625990867614746, + "logps/chosen": -347.5708312988281, + "logps/rejected": -755.1859130859375, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6550207138061523, + "rewards/margins": 16.34111213684082, + "rewards/rejected": -14.686091423034668, + "step": 7830 + }, + { + "epoch": 2.66, + "learning_rate": 6.206722900667253e-08, + "logits/chosen": 0.3686595857143402, + "logits/rejected": 1.8407968282699585, + "logps/chosen": -387.30096435546875, + "logps/rejected": -841.7135009765625, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3304541110992432, + "rewards/margins": 15.88392448425293, + "rewards/rejected": -14.553471565246582, + "step": 7840 + }, + { + "epoch": 2.67, + "learning_rate": 6.143774392546897e-08, + "logits/chosen": 0.9401399493217468, + "logits/rejected": 2.118645191192627, + "logps/chosen": -467.0404357910156, + "logps/rejected": -527.9627685546875, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6599762439727783, + "rewards/margins": 16.71710968017578, + "rewards/rejected": -14.057136535644531, + "step": 7850 + }, + { + "epoch": 2.67, + "learning_rate": 6.080825884426539e-08, + "logits/chosen": 1.330041527748108, + "logits/rejected": 2.9794445037841797, + "logps/chosen": -391.0253601074219, + "logps/rejected": -560.0770874023438, + "loss": 0.0033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.5674159526824951, + "rewards/margins": 17.092731475830078, + "rewards/rejected": -15.525317192077637, + "step": 7860 + }, + { + "epoch": 2.68, + "learning_rate": 6.017877376306182e-08, + "logits/chosen": 0.6090242266654968, + "logits/rejected": 2.478968381881714, + "logps/chosen": -394.5640563964844, + "logps/rejected": -586.6937255859375, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5743919610977173, + "rewards/margins": 19.863183975219727, + "rewards/rejected": -18.28879165649414, + "step": 7870 + }, + { + "epoch": 2.68, + "learning_rate": 5.954928868185824e-08, + "logits/chosen": 1.0315090417861938, + "logits/rejected": 1.8467708826065063, + "logps/chosen": -454.67071533203125, + "logps/rejected": -845.1062622070312, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9140666723251343, + "rewards/margins": 15.519999504089355, + "rewards/rejected": -13.605932235717773, + "step": 7880 + }, + { + "epoch": 2.68, + "learning_rate": 5.891980360065466e-08, + "logits/chosen": 1.0535707473754883, + "logits/rejected": 2.7900028228759766, + "logps/chosen": -421.4190368652344, + "logps/rejected": -460.0867614746094, + "loss": 0.0047, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.1875920295715332, + "rewards/margins": 16.28582763671875, + "rewards/rejected": -15.098236083984375, + "step": 7890 + }, + { + "epoch": 2.69, + "learning_rate": 5.8290318519451084e-08, + "logits/chosen": 0.5873435139656067, + "logits/rejected": 2.6181395053863525, + "logps/chosen": -313.67864990234375, + "logps/rejected": -563.8099365234375, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6179535388946533, + "rewards/margins": 15.549154281616211, + "rewards/rejected": -13.93120002746582, + "step": 7900 + }, + { + "epoch": 2.69, + "eval_logits/chosen": 0.39933204650878906, + "eval_logits/rejected": 2.3757200241088867, + "eval_logps/chosen": -370.9472351074219, + "eval_logps/rejected": -655.620361328125, + "eval_loss": 0.004205311182886362, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 1.8042665719985962, + "eval_rewards/margins": 17.1032657623291, + "eval_rewards/rejected": -15.298998832702637, + "eval_runtime": 270.2927, + "eval_samples_per_second": 35.147, + "eval_steps_per_second": 1.099, + "step": 7900 + }, + { + "epoch": 2.69, + "learning_rate": 5.7660833438247514e-08, + "logits/chosen": 0.811974048614502, + "logits/rejected": 2.5337677001953125, + "logps/chosen": -392.13482666015625, + "logps/rejected": -626.2708740234375, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0893046855926514, + "rewards/margins": 17.98527717590332, + "rewards/rejected": -15.895971298217773, + "step": 7910 + }, + { + "epoch": 2.69, + "learning_rate": 5.7031348357043937e-08, + "logits/chosen": 0.7521673440933228, + "logits/rejected": 1.7344005107879639, + "logps/chosen": -343.30523681640625, + "logps/rejected": -839.4095458984375, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6643670797348022, + "rewards/margins": 17.1582088470459, + "rewards/rejected": -15.493840217590332, + "step": 7920 + }, + { + "epoch": 2.7, + "learning_rate": 5.640186327584036e-08, + "logits/chosen": 0.5547946095466614, + "logits/rejected": 2.3363373279571533, + "logps/chosen": -346.8823547363281, + "logps/rejected": -543.2503662109375, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.493767499923706, + "rewards/margins": 17.63357925415039, + "rewards/rejected": -16.139812469482422, + "step": 7930 + }, + { + "epoch": 2.7, + "learning_rate": 5.577237819463679e-08, + "logits/chosen": 1.02409827709198, + "logits/rejected": 2.826772451400757, + "logps/chosen": -388.55462646484375, + "logps/rejected": -478.57049560546875, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4650802612304688, + "rewards/margins": 19.572603225708008, + "rewards/rejected": -18.107524871826172, + "step": 7940 + }, + { + "epoch": 2.7, + "learning_rate": 5.514289311343321e-08, + "logits/chosen": 1.1151471138000488, + "logits/rejected": 2.1015961170196533, + "logps/chosen": -437.498291015625, + "logps/rejected": -720.4129028320312, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7610559463500977, + "rewards/margins": 18.574838638305664, + "rewards/rejected": -16.813783645629883, + "step": 7950 + }, + { + "epoch": 2.71, + "learning_rate": 5.4513408032229634e-08, + "logits/chosen": 0.18950991332530975, + "logits/rejected": 1.766977071762085, + "logps/chosen": -432.38104248046875, + "logps/rejected": -803.8573608398438, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5149455070495605, + "rewards/margins": 17.260913848876953, + "rewards/rejected": -15.74596881866455, + "step": 7960 + }, + { + "epoch": 2.71, + "learning_rate": 5.388392295102606e-08, + "logits/chosen": 1.041501760482788, + "logits/rejected": 2.8908143043518066, + "logps/chosen": -372.1784362792969, + "logps/rejected": -482.1966247558594, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4550552368164062, + "rewards/margins": 16.327838897705078, + "rewards/rejected": -13.872782707214355, + "step": 7970 + }, + { + "epoch": 2.71, + "learning_rate": 5.3254437869822486e-08, + "logits/chosen": 0.7389962673187256, + "logits/rejected": 2.473978281021118, + "logps/chosen": -325.01739501953125, + "logps/rejected": -567.4298095703125, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4190046787261963, + "rewards/margins": 18.693714141845703, + "rewards/rejected": -16.27471160888672, + "step": 7980 + }, + { + "epoch": 2.72, + "learning_rate": 5.262495278861891e-08, + "logits/chosen": 0.8442951440811157, + "logits/rejected": 2.530892848968506, + "logps/chosen": -294.4164123535156, + "logps/rejected": -458.73583984375, + "loss": 0.0039, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.5132577419281006, + "rewards/margins": 15.422558784484863, + "rewards/rejected": -13.90929889678955, + "step": 7990 + }, + { + "epoch": 2.72, + "learning_rate": 5.199546770741533e-08, + "logits/chosen": 0.31688395142555237, + "logits/rejected": 2.428586483001709, + "logps/chosen": -317.95941162109375, + "logps/rejected": -623.4954223632812, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7186012268066406, + "rewards/margins": 16.314571380615234, + "rewards/rejected": -14.595973014831543, + "step": 8000 + }, + { + "epoch": 2.72, + "eval_logits/chosen": 0.38528308272361755, + "eval_logits/rejected": 2.363403558731079, + "eval_logps/chosen": -370.7011413574219, + "eval_logps/rejected": -655.7273559570312, + "eval_loss": 0.004197005648165941, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 1.8288747072219849, + "eval_rewards/margins": 17.13857650756836, + "eval_rewards/rejected": -15.309701919555664, + "eval_runtime": 271.0924, + "eval_samples_per_second": 35.043, + "eval_steps_per_second": 1.096, + "step": 8000 + }, + { + "epoch": 2.72, + "learning_rate": 5.136598262621176e-08, + "logits/chosen": 0.016552647575736046, + "logits/rejected": 2.558486223220825, + "logps/chosen": -297.2393798828125, + "logps/rejected": -554.3638305664062, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.095482349395752, + "rewards/margins": 19.343822479248047, + "rewards/rejected": -17.248340606689453, + "step": 8010 + }, + { + "epoch": 2.73, + "learning_rate": 5.073649754500818e-08, + "logits/chosen": 0.5093849897384644, + "logits/rejected": 2.8169074058532715, + "logps/chosen": -334.86981201171875, + "logps/rejected": -522.8923950195312, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.054229259490967, + "rewards/margins": 17.593975067138672, + "rewards/rejected": -15.53974437713623, + "step": 8020 + }, + { + "epoch": 2.73, + "learning_rate": 5.01070124638046e-08, + "logits/chosen": 1.1472276449203491, + "logits/rejected": 2.4982829093933105, + "logps/chosen": -412.49615478515625, + "logps/rejected": -652.8995361328125, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8879344463348389, + "rewards/margins": 18.21976661682129, + "rewards/rejected": -16.331830978393555, + "step": 8030 + }, + { + "epoch": 2.73, + "learning_rate": 4.947752738260103e-08, + "logits/chosen": 1.0422632694244385, + "logits/rejected": 2.024174213409424, + "logps/chosen": -416.4352111816406, + "logps/rejected": -743.892578125, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8411505222320557, + "rewards/margins": 18.881122589111328, + "rewards/rejected": -17.03997230529785, + "step": 8040 + }, + { + "epoch": 2.74, + "learning_rate": 4.884804230139745e-08, + "logits/chosen": 0.41295504570007324, + "logits/rejected": 1.4202079772949219, + "logps/chosen": -311.0360412597656, + "logps/rejected": -888.4528198242188, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1342692375183105, + "rewards/margins": 17.17261505126953, + "rewards/rejected": -15.038345336914062, + "step": 8050 + }, + { + "epoch": 2.74, + "learning_rate": 4.8218557220193875e-08, + "logits/chosen": 0.8595215678215027, + "logits/rejected": 2.0355379581451416, + "logps/chosen": -602.7960205078125, + "logps/rejected": -667.9617309570312, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5357165336608887, + "rewards/margins": 15.661369323730469, + "rewards/rejected": -14.125653266906738, + "step": 8060 + }, + { + "epoch": 2.74, + "learning_rate": 4.7589072138990305e-08, + "logits/chosen": 0.7463659048080444, + "logits/rejected": 2.516136646270752, + "logps/chosen": -367.22406005859375, + "logps/rejected": -535.6273193359375, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6163028478622437, + "rewards/margins": 17.277376174926758, + "rewards/rejected": -15.6610746383667, + "step": 8070 + }, + { + "epoch": 2.75, + "learning_rate": 4.695958705778673e-08, + "logits/chosen": 0.5243152379989624, + "logits/rejected": 2.4038941860198975, + "logps/chosen": -496.75860595703125, + "logps/rejected": -419.94354248046875, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7436059713363647, + "rewards/margins": 16.353389739990234, + "rewards/rejected": -14.609784126281738, + "step": 8080 + }, + { + "epoch": 2.75, + "learning_rate": 4.633010197658315e-08, + "logits/chosen": 0.44321316480636597, + "logits/rejected": 2.27062726020813, + "logps/chosen": -394.5532531738281, + "logps/rejected": -574.9754638671875, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7736930847167969, + "rewards/margins": 15.879064559936523, + "rewards/rejected": -14.105372428894043, + "step": 8090 + }, + { + "epoch": 2.75, + "learning_rate": 4.570061689537958e-08, + "logits/chosen": 1.1519994735717773, + "logits/rejected": 2.0376980304718018, + "logps/chosen": -527.7027587890625, + "logps/rejected": -629.1424560546875, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9384416341781616, + "rewards/margins": 17.458274841308594, + "rewards/rejected": -15.519834518432617, + "step": 8100 + }, + { + "epoch": 2.75, + "eval_logits/chosen": 0.3778843879699707, + "eval_logits/rejected": 2.361894130706787, + "eval_logps/chosen": -370.99468994140625, + "eval_logps/rejected": -655.0099487304688, + "eval_loss": 0.004115123767405748, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 1.7995245456695557, + "eval_rewards/margins": 17.03748321533203, + "eval_rewards/rejected": -15.237958908081055, + "eval_runtime": 270.8923, + "eval_samples_per_second": 35.069, + "eval_steps_per_second": 1.096, + "step": 8100 + }, + { + "epoch": 2.76, + "learning_rate": 4.5071131814176e-08, + "logits/chosen": 0.33518487215042114, + "logits/rejected": 1.9586979150772095, + "logps/chosen": -440.5299377441406, + "logps/rejected": -661.0430297851562, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8130890130996704, + "rewards/margins": 15.587263107299805, + "rewards/rejected": -13.77417278289795, + "step": 8110 + }, + { + "epoch": 2.76, + "learning_rate": 4.4441646732972425e-08, + "logits/chosen": 1.0590837001800537, + "logits/rejected": 2.1017818450927734, + "logps/chosen": -324.1203918457031, + "logps/rejected": -874.7774658203125, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9442323446273804, + "rewards/margins": 18.850128173828125, + "rewards/rejected": -16.905895233154297, + "step": 8120 + }, + { + "epoch": 2.76, + "learning_rate": 4.3812161651768855e-08, + "logits/chosen": 0.7944759726524353, + "logits/rejected": 2.6383702754974365, + "logps/chosen": -334.22174072265625, + "logps/rejected": -609.9515991210938, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.351963520050049, + "rewards/margins": 18.316631317138672, + "rewards/rejected": -15.964668273925781, + "step": 8130 + }, + { + "epoch": 2.77, + "learning_rate": 4.318267657056528e-08, + "logits/chosen": 1.4045004844665527, + "logits/rejected": 2.415086269378662, + "logps/chosen": -409.2455139160156, + "logps/rejected": -547.939453125, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6006851196289062, + "rewards/margins": 14.521757125854492, + "rewards/rejected": -12.921072006225586, + "step": 8140 + }, + { + "epoch": 2.77, + "learning_rate": 4.25531914893617e-08, + "logits/chosen": 0.8221235275268555, + "logits/rejected": 2.161207675933838, + "logps/chosen": -342.333251953125, + "logps/rejected": -702.6590576171875, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9118928909301758, + "rewards/margins": 16.797853469848633, + "rewards/rejected": -14.885961532592773, + "step": 8150 + }, + { + "epoch": 2.77, + "learning_rate": 4.192370640815812e-08, + "logits/chosen": 1.462494134902954, + "logits/rejected": 2.605686664581299, + "logps/chosen": -357.91046142578125, + "logps/rejected": -549.108642578125, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6379854679107666, + "rewards/margins": 17.190990447998047, + "rewards/rejected": -15.553003311157227, + "step": 8160 + }, + { + "epoch": 2.78, + "learning_rate": 4.129422132695455e-08, + "logits/chosen": 0.5041385889053345, + "logits/rejected": 2.6867682933807373, + "logps/chosen": -368.87445068359375, + "logps/rejected": -542.053955078125, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8026316165924072, + "rewards/margins": 15.136019706726074, + "rewards/rejected": -13.333389282226562, + "step": 8170 + }, + { + "epoch": 2.78, + "learning_rate": 4.0664736245750975e-08, + "logits/chosen": 0.9905563592910767, + "logits/rejected": 2.741023302078247, + "logps/chosen": -330.87286376953125, + "logps/rejected": -528.2855224609375, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6960986852645874, + "rewards/margins": 17.097124099731445, + "rewards/rejected": -15.401025772094727, + "step": 8180 + }, + { + "epoch": 2.78, + "learning_rate": 4.00352511645474e-08, + "logits/chosen": 1.4236377477645874, + "logits/rejected": 2.461068630218506, + "logps/chosen": -409.6502990722656, + "logps/rejected": -636.2116088867188, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5380514860153198, + "rewards/margins": 17.385438919067383, + "rewards/rejected": -15.847389221191406, + "step": 8190 + }, + { + "epoch": 2.79, + "learning_rate": 3.940576608334383e-08, + "logits/chosen": 0.491749107837677, + "logits/rejected": 2.350903034210205, + "logps/chosen": -398.1735534667969, + "logps/rejected": -623.8147583007812, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1699795722961426, + "rewards/margins": 16.77878761291504, + "rewards/rejected": -14.608807563781738, + "step": 8200 + }, + { + "epoch": 2.79, + "eval_logits/chosen": 0.3826829195022583, + "eval_logits/rejected": 2.3667984008789062, + "eval_logps/chosen": -370.9769287109375, + "eval_logps/rejected": -655.0703125, + "eval_loss": 0.004031539428979158, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 1.8012951612472534, + "eval_rewards/margins": 17.0452880859375, + "eval_rewards/rejected": -15.243992805480957, + "eval_runtime": 270.261, + "eval_samples_per_second": 35.151, + "eval_steps_per_second": 1.099, + "step": 8200 + }, + { + "epoch": 2.79, + "learning_rate": 3.877628100214025e-08, + "logits/chosen": 1.0121322870254517, + "logits/rejected": 2.2878799438476562, + "logps/chosen": -380.6305236816406, + "logps/rejected": -651.130859375, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8424866199493408, + "rewards/margins": 17.168813705444336, + "rewards/rejected": -15.326327323913574, + "step": 8210 + }, + { + "epoch": 2.79, + "learning_rate": 3.814679592093667e-08, + "logits/chosen": 0.3881423771381378, + "logits/rejected": 1.7822697162628174, + "logps/chosen": -427.33660888671875, + "logps/rejected": -833.1677856445312, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.228376626968384, + "rewards/margins": 19.179027557373047, + "rewards/rejected": -16.95064926147461, + "step": 8220 + }, + { + "epoch": 2.8, + "learning_rate": 3.75173108397331e-08, + "logits/chosen": 0.7444356083869934, + "logits/rejected": 1.9901525974273682, + "logps/chosen": -360.2849426269531, + "logps/rejected": -703.2940673828125, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.849193811416626, + "rewards/margins": 18.03685188293457, + "rewards/rejected": -16.18765640258789, + "step": 8230 + }, + { + "epoch": 2.8, + "learning_rate": 3.688782575852952e-08, + "logits/chosen": 0.6894891858100891, + "logits/rejected": 2.414947986602783, + "logps/chosen": -296.0263671875, + "logps/rejected": -487.558349609375, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.251554012298584, + "rewards/margins": 14.990411758422852, + "rewards/rejected": -12.73885726928711, + "step": 8240 + }, + { + "epoch": 2.8, + "learning_rate": 3.625834067732594e-08, + "logits/chosen": 0.8461538553237915, + "logits/rejected": 2.1160616874694824, + "logps/chosen": -449.3296813964844, + "logps/rejected": -682.245849609375, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6871671676635742, + "rewards/margins": 17.47844123840332, + "rewards/rejected": -15.79127311706543, + "step": 8250 + }, + { + "epoch": 2.81, + "learning_rate": 3.562885559612237e-08, + "logits/chosen": 0.7242621779441833, + "logits/rejected": 2.284646511077881, + "logps/chosen": -413.39923095703125, + "logps/rejected": -709.4559326171875, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.405566453933716, + "rewards/margins": 16.431108474731445, + "rewards/rejected": -14.025540351867676, + "step": 8260 + }, + { + "epoch": 2.81, + "learning_rate": 3.499937051491879e-08, + "logits/chosen": 0.5801094174385071, + "logits/rejected": 2.4470572471618652, + "logps/chosen": -316.2228698730469, + "logps/rejected": -567.92626953125, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7270088195800781, + "rewards/margins": 18.105016708374023, + "rewards/rejected": -16.378009796142578, + "step": 8270 + }, + { + "epoch": 2.81, + "learning_rate": 3.4369885433715216e-08, + "logits/chosen": 0.23344922065734863, + "logits/rejected": 2.7615671157836914, + "logps/chosen": -292.5202331542969, + "logps/rejected": -507.83056640625, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3381764888763428, + "rewards/margins": 16.836694717407227, + "rewards/rejected": -14.498517036437988, + "step": 8280 + }, + { + "epoch": 2.82, + "learning_rate": 3.3740400352511645e-08, + "logits/chosen": 0.16733908653259277, + "logits/rejected": 1.8548532724380493, + "logps/chosen": -291.5389709472656, + "logps/rejected": -880.97705078125, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7905704975128174, + "rewards/margins": 17.14309310913086, + "rewards/rejected": -15.35252571105957, + "step": 8290 + }, + { + "epoch": 2.82, + "learning_rate": 3.311091527130807e-08, + "logits/chosen": 1.012211799621582, + "logits/rejected": 2.654669761657715, + "logps/chosen": -351.44061279296875, + "logps/rejected": -489.4246520996094, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4640682935714722, + "rewards/margins": 16.484954833984375, + "rewards/rejected": -15.020886421203613, + "step": 8300 + }, + { + "epoch": 2.82, + "eval_logits/chosen": 0.38338133692741394, + "eval_logits/rejected": 2.3660054206848145, + "eval_logps/chosen": -370.94989013671875, + "eval_logps/rejected": -654.731689453125, + "eval_loss": 0.0040396335534751415, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 1.803999662399292, + "eval_rewards/margins": 17.01413917541504, + "eval_rewards/rejected": -15.210142135620117, + "eval_runtime": 271.0037, + "eval_samples_per_second": 35.055, + "eval_steps_per_second": 1.096, + "step": 8300 + }, + { + "epoch": 2.82, + "learning_rate": 3.248143019010449e-08, + "logits/chosen": 0.5455238819122314, + "logits/rejected": 2.2325034141540527, + "logps/chosen": -296.95709228515625, + "logps/rejected": -712.0809936523438, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8315194845199585, + "rewards/margins": 16.148942947387695, + "rewards/rejected": -14.317422866821289, + "step": 8310 + }, + { + "epoch": 2.83, + "learning_rate": 3.1851945108900914e-08, + "logits/chosen": 0.4286075532436371, + "logits/rejected": 2.1023879051208496, + "logps/chosen": -386.9786376953125, + "logps/rejected": -722.5575561523438, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6828031539916992, + "rewards/margins": 17.364538192749023, + "rewards/rejected": -15.681735038757324, + "step": 8320 + }, + { + "epoch": 2.83, + "learning_rate": 3.122246002769734e-08, + "logits/chosen": 0.19908392429351807, + "logits/rejected": 2.4876885414123535, + "logps/chosen": -299.66400146484375, + "logps/rejected": -569.06201171875, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1454601287841797, + "rewards/margins": 18.544597625732422, + "rewards/rejected": -16.39913558959961, + "step": 8330 + }, + { + "epoch": 2.83, + "learning_rate": 3.0592974946493766e-08, + "logits/chosen": 1.0194957256317139, + "logits/rejected": 2.586066484451294, + "logps/chosen": -370.0921325683594, + "logps/rejected": -541.5203857421875, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3220067024230957, + "rewards/margins": 17.71945571899414, + "rewards/rejected": -15.39744758605957, + "step": 8340 + }, + { + "epoch": 2.84, + "learning_rate": 2.996348986529019e-08, + "logits/chosen": 1.2256660461425781, + "logits/rejected": 2.4342265129089355, + "logps/chosen": -465.48046875, + "logps/rejected": -524.5305786132812, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7561149597167969, + "rewards/margins": 16.329362869262695, + "rewards/rejected": -14.573247909545898, + "step": 8350 + }, + { + "epoch": 2.84, + "learning_rate": 2.9334004784086618e-08, + "logits/chosen": 1.0791127681732178, + "logits/rejected": 2.0485925674438477, + "logps/chosen": -338.21142578125, + "logps/rejected": -673.5530395507812, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2146098613739014, + "rewards/margins": 17.91824722290039, + "rewards/rejected": -15.703636169433594, + "step": 8360 + }, + { + "epoch": 2.85, + "learning_rate": 2.870451970288304e-08, + "logits/chosen": 0.8204256892204285, + "logits/rejected": 2.081756353378296, + "logps/chosen": -440.00341796875, + "logps/rejected": -725.962890625, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.794062614440918, + "rewards/margins": 16.563032150268555, + "rewards/rejected": -14.768969535827637, + "step": 8370 + }, + { + "epoch": 2.85, + "learning_rate": 2.8075034621679467e-08, + "logits/chosen": 0.86540687084198, + "logits/rejected": 2.551528215408325, + "logps/chosen": -502.77386474609375, + "logps/rejected": -431.47930908203125, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.017047643661499, + "rewards/margins": 15.298959732055664, + "rewards/rejected": -13.28191089630127, + "step": 8380 + }, + { + "epoch": 2.85, + "learning_rate": 2.744554954047589e-08, + "logits/chosen": 0.7401953935623169, + "logits/rejected": 2.2178268432617188, + "logps/chosen": -313.5390319824219, + "logps/rejected": -635.0584716796875, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3623398542404175, + "rewards/margins": 17.631855010986328, + "rewards/rejected": -16.269515991210938, + "step": 8390 + }, + { + "epoch": 2.86, + "learning_rate": 2.6816064459272312e-08, + "logits/chosen": 1.069883942604065, + "logits/rejected": 2.098914623260498, + "logps/chosen": -505.76971435546875, + "logps/rejected": -673.7338256835938, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7513952255249023, + "rewards/margins": 16.28409767150879, + "rewards/rejected": -14.53270149230957, + "step": 8400 + }, + { + "epoch": 2.86, + "eval_logits/chosen": 0.3679710328578949, + "eval_logits/rejected": 2.349754571914673, + "eval_logps/chosen": -371.5492858886719, + "eval_logps/rejected": -655.7620849609375, + "eval_loss": 0.004049401730298996, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 1.7440600395202637, + "eval_rewards/margins": 17.057235717773438, + "eval_rewards/rejected": -15.3131742477417, + "eval_runtime": 270.8521, + "eval_samples_per_second": 35.074, + "eval_steps_per_second": 1.097, + "step": 8400 + }, + { + "epoch": 2.86, + "learning_rate": 2.618657937806874e-08, + "logits/chosen": 0.24509386718273163, + "logits/rejected": 2.538390636444092, + "logps/chosen": -384.0797424316406, + "logps/rejected": -612.6265869140625, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4948558807373047, + "rewards/margins": 16.53910255432129, + "rewards/rejected": -15.044245719909668, + "step": 8410 + }, + { + "epoch": 2.86, + "learning_rate": 2.555709429686516e-08, + "logits/chosen": 0.545474112033844, + "logits/rejected": 2.173983097076416, + "logps/chosen": -460.26141357421875, + "logps/rejected": -642.0891723632812, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5278213024139404, + "rewards/margins": 17.008352279663086, + "rewards/rejected": -14.480531692504883, + "step": 8420 + }, + { + "epoch": 2.87, + "learning_rate": 2.4927609215661587e-08, + "logits/chosen": 0.6959326863288879, + "logits/rejected": 2.6381914615631104, + "logps/chosen": -317.1097717285156, + "logps/rejected": -466.00372314453125, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5947946310043335, + "rewards/margins": 17.090538024902344, + "rewards/rejected": -15.49574089050293, + "step": 8430 + }, + { + "epoch": 2.87, + "learning_rate": 2.4298124134458013e-08, + "logits/chosen": 0.34631219506263733, + "logits/rejected": 2.274941921234131, + "logps/chosen": -319.5626525878906, + "logps/rejected": -699.2388305664062, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.376375675201416, + "rewards/margins": 16.615434646606445, + "rewards/rejected": -15.239057540893555, + "step": 8440 + }, + { + "epoch": 2.87, + "learning_rate": 2.3668639053254436e-08, + "logits/chosen": 1.167719841003418, + "logits/rejected": 2.4808707237243652, + "logps/chosen": -312.66412353515625, + "logps/rejected": -496.24755859375, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2384330034255981, + "rewards/margins": 15.005212783813477, + "rewards/rejected": -13.766778945922852, + "step": 8450 + }, + { + "epoch": 2.88, + "learning_rate": 2.3039153972050862e-08, + "logits/chosen": 0.39031848311424255, + "logits/rejected": 1.9357246160507202, + "logps/chosen": -392.21319580078125, + "logps/rejected": -832.5721435546875, + "loss": 0.0048, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.8006553649902344, + "rewards/margins": 18.051868438720703, + "rewards/rejected": -16.251216888427734, + "step": 8460 + }, + { + "epoch": 2.88, + "learning_rate": 2.2409668890847285e-08, + "logits/chosen": 0.9093269109725952, + "logits/rejected": 2.1801865100860596, + "logps/chosen": -328.8011779785156, + "logps/rejected": -662.0884399414062, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8631995916366577, + "rewards/margins": 18.073341369628906, + "rewards/rejected": -16.210142135620117, + "step": 8470 + }, + { + "epoch": 2.88, + "learning_rate": 2.178018380964371e-08, + "logits/chosen": 0.610144317150116, + "logits/rejected": 2.2706358432769775, + "logps/chosen": -394.10028076171875, + "logps/rejected": -649.7537841796875, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4115517139434814, + "rewards/margins": 18.226654052734375, + "rewards/rejected": -16.815099716186523, + "step": 8480 + }, + { + "epoch": 2.89, + "learning_rate": 2.1150698728440137e-08, + "logits/chosen": 0.09516476094722748, + "logits/rejected": 2.2865424156188965, + "logps/chosen": -359.79754638671875, + "logps/rejected": -595.9285888671875, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8083724975585938, + "rewards/margins": 17.95835304260254, + "rewards/rejected": -16.149978637695312, + "step": 8490 + }, + { + "epoch": 2.89, + "learning_rate": 2.052121364723656e-08, + "logits/chosen": 0.7596250176429749, + "logits/rejected": 2.644648551940918, + "logps/chosen": -317.4564514160156, + "logps/rejected": -603.7373046875, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4471447467803955, + "rewards/margins": 15.757558822631836, + "rewards/rejected": -14.31041431427002, + "step": 8500 + }, + { + "epoch": 2.89, + "eval_logits/chosen": 0.3713549077510834, + "eval_logits/rejected": 2.350857734680176, + "eval_logps/chosen": -371.4393310546875, + "eval_logps/rejected": -655.9080200195312, + "eval_loss": 0.004041966050863266, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 1.7550534009933472, + "eval_rewards/margins": 17.08282470703125, + "eval_rewards/rejected": -15.327771186828613, + "eval_runtime": 269.8288, + "eval_samples_per_second": 35.208, + "eval_steps_per_second": 1.101, + "step": 8500 + }, + { + "epoch": 2.89, + "learning_rate": 1.9891728566032983e-08, + "logits/chosen": 1.1884275674819946, + "logits/rejected": 2.863093614578247, + "logps/chosen": -388.61187744140625, + "logps/rejected": -472.49200439453125, + "loss": 0.0097, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.506639838218689, + "rewards/margins": 17.00218391418457, + "rewards/rejected": -15.495542526245117, + "step": 8510 + }, + { + "epoch": 2.9, + "learning_rate": 1.926224348482941e-08, + "logits/chosen": 0.4562016427516937, + "logits/rejected": 2.073929786682129, + "logps/chosen": -438.0223083496094, + "logps/rejected": -584.0213623046875, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8205589056015015, + "rewards/margins": 17.869468688964844, + "rewards/rejected": -16.04891014099121, + "step": 8520 + }, + { + "epoch": 2.9, + "learning_rate": 1.863275840362583e-08, + "logits/chosen": 1.0397640466690063, + "logits/rejected": 2.006401538848877, + "logps/chosen": -317.5547790527344, + "logps/rejected": -775.80322265625, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4246604442596436, + "rewards/margins": 18.71263313293457, + "rewards/rejected": -17.2879695892334, + "step": 8530 + }, + { + "epoch": 2.9, + "learning_rate": 1.8003273322422258e-08, + "logits/chosen": 1.233269453048706, + "logits/rejected": 2.318169355392456, + "logps/chosen": -346.78643798828125, + "logps/rejected": -649.5104370117188, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6148817539215088, + "rewards/margins": 16.13572883605957, + "rewards/rejected": -14.520846366882324, + "step": 8540 + }, + { + "epoch": 2.91, + "learning_rate": 1.737378824121868e-08, + "logits/chosen": 1.0876624584197998, + "logits/rejected": 2.3624844551086426, + "logps/chosen": -310.1029052734375, + "logps/rejected": -627.22802734375, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.419732689857483, + "rewards/margins": 16.585172653198242, + "rewards/rejected": -15.165440559387207, + "step": 8550 + }, + { + "epoch": 2.91, + "learning_rate": 1.6744303160015107e-08, + "logits/chosen": 0.5178649425506592, + "logits/rejected": 2.642382860183716, + "logps/chosen": -383.968505859375, + "logps/rejected": -616.5057983398438, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3818258047103882, + "rewards/margins": 15.679946899414062, + "rewards/rejected": -14.298121452331543, + "step": 8560 + }, + { + "epoch": 2.91, + "learning_rate": 1.6114818078811533e-08, + "logits/chosen": 0.7347376942634583, + "logits/rejected": 1.8428224325180054, + "logps/chosen": -402.1912841796875, + "logps/rejected": -741.1703491210938, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5274848937988281, + "rewards/margins": 15.618868827819824, + "rewards/rejected": -14.09138298034668, + "step": 8570 + }, + { + "epoch": 2.92, + "learning_rate": 1.5485332997607955e-08, + "logits/chosen": 0.6257011890411377, + "logits/rejected": 2.5201776027679443, + "logps/chosen": -305.34344482421875, + "logps/rejected": -632.3414306640625, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6069772243499756, + "rewards/margins": 18.388032913208008, + "rewards/rejected": -16.781055450439453, + "step": 8580 + }, + { + "epoch": 2.92, + "learning_rate": 1.485584791640438e-08, + "logits/chosen": 0.6669371128082275, + "logits/rejected": 2.31857967376709, + "logps/chosen": -358.6900634765625, + "logps/rejected": -674.6116943359375, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.514675498008728, + "rewards/margins": 17.45252227783203, + "rewards/rejected": -15.937848091125488, + "step": 8590 + }, + { + "epoch": 2.92, + "learning_rate": 1.4226362835200804e-08, + "logits/chosen": 0.5116016864776611, + "logits/rejected": 2.267575740814209, + "logps/chosen": -319.94268798828125, + "logps/rejected": -688.0324096679688, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.873579978942871, + "rewards/margins": 16.022626876831055, + "rewards/rejected": -14.1490478515625, + "step": 8600 + }, + { + "epoch": 2.92, + "eval_logits/chosen": 0.3700520396232605, + "eval_logits/rejected": 2.351830005645752, + "eval_logps/chosen": -371.48968505859375, + "eval_logps/rejected": -655.9204711914062, + "eval_loss": 0.004031510092318058, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 1.750023603439331, + "eval_rewards/margins": 17.079038619995117, + "eval_rewards/rejected": -15.329015731811523, + "eval_runtime": 269.9511, + "eval_samples_per_second": 35.192, + "eval_steps_per_second": 1.1, + "step": 8600 + }, + { + "epoch": 2.93, + "learning_rate": 1.3596877753997229e-08, + "logits/chosen": 0.7066696882247925, + "logits/rejected": 2.703484058380127, + "logps/chosen": -447.2012634277344, + "logps/rejected": -508.0310974121094, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8840053081512451, + "rewards/margins": 18.700672149658203, + "rewards/rejected": -16.816667556762695, + "step": 8610 + }, + { + "epoch": 2.93, + "learning_rate": 1.2967392672793655e-08, + "logits/chosen": 0.5371155738830566, + "logits/rejected": 1.6587845087051392, + "logps/chosen": -340.7304992675781, + "logps/rejected": -915.28173828125, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9751720428466797, + "rewards/margins": 17.403154373168945, + "rewards/rejected": -15.42798137664795, + "step": 8620 + }, + { + "epoch": 2.93, + "learning_rate": 1.233790759159008e-08, + "logits/chosen": 1.0137431621551514, + "logits/rejected": 2.462019443511963, + "logps/chosen": -357.55096435546875, + "logps/rejected": -623.3336791992188, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7592010498046875, + "rewards/margins": 16.47138023376465, + "rewards/rejected": -14.712181091308594, + "step": 8630 + }, + { + "epoch": 2.94, + "learning_rate": 1.1708422510386504e-08, + "logits/chosen": 0.3507612347602844, + "logits/rejected": 2.4512226581573486, + "logps/chosen": -425.46441650390625, + "logps/rejected": -606.0345458984375, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.872849464416504, + "rewards/margins": 17.761226654052734, + "rewards/rejected": -15.88837718963623, + "step": 8640 + }, + { + "epoch": 2.94, + "learning_rate": 1.1078937429182926e-08, + "logits/chosen": 0.7619954943656921, + "logits/rejected": 2.1373696327209473, + "logps/chosen": -490.814697265625, + "logps/rejected": -750.1331787109375, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4002864360809326, + "rewards/margins": 15.68773365020752, + "rewards/rejected": -14.287447929382324, + "step": 8650 + }, + { + "epoch": 2.94, + "learning_rate": 1.0449452347979353e-08, + "logits/chosen": 1.328615665435791, + "logits/rejected": 2.1859679222106934, + "logps/chosen": -433.30377197265625, + "logps/rejected": -648.45947265625, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.117361068725586, + "rewards/margins": 17.00444221496582, + "rewards/rejected": -14.887080192565918, + "step": 8660 + }, + { + "epoch": 2.95, + "learning_rate": 9.819967266775777e-09, + "logits/chosen": 0.5871554017066956, + "logits/rejected": 1.835679054260254, + "logps/chosen": -314.6351013183594, + "logps/rejected": -679.0066528320312, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2678003311157227, + "rewards/margins": 16.66663932800293, + "rewards/rejected": -15.398837089538574, + "step": 8670 + }, + { + "epoch": 2.95, + "learning_rate": 9.190482185572201e-09, + "logits/chosen": 0.7893953323364258, + "logits/rejected": 1.7981361150741577, + "logps/chosen": -427.78192138671875, + "logps/rejected": -786.5863647460938, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6110270023345947, + "rewards/margins": 16.643098831176758, + "rewards/rejected": -15.032072067260742, + "step": 8680 + }, + { + "epoch": 2.95, + "learning_rate": 8.560997104368626e-09, + "logits/chosen": 1.0635652542114258, + "logits/rejected": 2.400646686553955, + "logps/chosen": -365.33770751953125, + "logps/rejected": -511.2415466308594, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2030410766601562, + "rewards/margins": 16.05733871459961, + "rewards/rejected": -13.854296684265137, + "step": 8690 + }, + { + "epoch": 2.96, + "learning_rate": 7.931512023165052e-09, + "logits/chosen": 1.1826716661453247, + "logits/rejected": 2.120217800140381, + "logps/chosen": -392.803955078125, + "logps/rejected": -609.3681640625, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2204407453536987, + "rewards/margins": 15.291590690612793, + "rewards/rejected": -14.071149826049805, + "step": 8700 + }, + { + "epoch": 2.96, + "eval_logits/chosen": 0.3659575581550598, + "eval_logits/rejected": 2.3477611541748047, + "eval_logps/chosen": -371.695556640625, + "eval_logps/rejected": -656.2755737304688, + "eval_loss": 0.004046812187880278, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 1.7294360399246216, + "eval_rewards/margins": 17.093955993652344, + "eval_rewards/rejected": -15.364519119262695, + "eval_runtime": 269.7042, + "eval_samples_per_second": 35.224, + "eval_steps_per_second": 1.101, + "step": 8700 + }, + { + "epoch": 2.96, + "learning_rate": 7.3020269419614755e-09, + "logits/chosen": 0.9278692007064819, + "logits/rejected": 1.7400996685028076, + "logps/chosen": -324.8660583496094, + "logps/rejected": -832.1110229492188, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9335384368896484, + "rewards/margins": 16.6754207611084, + "rewards/rejected": -14.74188232421875, + "step": 8710 + }, + { + "epoch": 2.96, + "learning_rate": 6.6725418607579e-09, + "logits/chosen": 0.9633650779724121, + "logits/rejected": 2.232921838760376, + "logps/chosen": -408.4643249511719, + "logps/rejected": -730.5870361328125, + "loss": 0.0036, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.055891513824463, + "rewards/margins": 19.569120407104492, + "rewards/rejected": -17.513227462768555, + "step": 8720 + }, + { + "epoch": 2.97, + "learning_rate": 6.043056779554324e-09, + "logits/chosen": 0.22244560718536377, + "logits/rejected": 1.6365598440170288, + "logps/chosen": -327.47320556640625, + "logps/rejected": -794.9049072265625, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.042677402496338, + "rewards/margins": 14.997645378112793, + "rewards/rejected": -12.954968452453613, + "step": 8730 + }, + { + "epoch": 2.97, + "learning_rate": 5.41357169835075e-09, + "logits/chosen": 1.1285948753356934, + "logits/rejected": 2.2873001098632812, + "logps/chosen": -411.16046142578125, + "logps/rejected": -647.3367919921875, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.571076512336731, + "rewards/margins": 20.033849716186523, + "rewards/rejected": -18.4627742767334, + "step": 8740 + }, + { + "epoch": 2.97, + "learning_rate": 4.784086617147173e-09, + "logits/chosen": 0.8112198114395142, + "logits/rejected": 2.69305419921875, + "logps/chosen": -504.293701171875, + "logps/rejected": -551.0088500976562, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9363229274749756, + "rewards/margins": 15.2676362991333, + "rewards/rejected": -13.331314086914062, + "step": 8750 + }, + { + "epoch": 2.98, + "learning_rate": 4.1546015359435984e-09, + "logits/chosen": 0.9000552296638489, + "logits/rejected": 2.208092212677002, + "logps/chosen": -448.109375, + "logps/rejected": -644.7918090820312, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6445732116699219, + "rewards/margins": 17.73623275756836, + "rewards/rejected": -16.091657638549805, + "step": 8760 + }, + { + "epoch": 2.98, + "learning_rate": 3.5251164547400225e-09, + "logits/chosen": 1.073515772819519, + "logits/rejected": 2.285383462905884, + "logps/chosen": -423.59234619140625, + "logps/rejected": -647.4371948242188, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7608124017715454, + "rewards/margins": 16.25401496887207, + "rewards/rejected": -14.493202209472656, + "step": 8770 + }, + { + "epoch": 2.98, + "learning_rate": 2.895631373536447e-09, + "logits/chosen": 1.2739975452423096, + "logits/rejected": 2.3136279582977295, + "logps/chosen": -397.5133361816406, + "logps/rejected": -485.2103576660156, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.536682367324829, + "rewards/margins": 16.215782165527344, + "rewards/rejected": -14.679100036621094, + "step": 8780 + }, + { + "epoch": 2.99, + "learning_rate": 2.2661462923328713e-09, + "logits/chosen": 0.6144751310348511, + "logits/rejected": 2.3888256549835205, + "logps/chosen": -377.27337646484375, + "logps/rejected": -669.4237060546875, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5046803951263428, + "rewards/margins": 16.96630096435547, + "rewards/rejected": -15.461621284484863, + "step": 8790 + }, + { + "epoch": 2.99, + "learning_rate": 1.6366612111292962e-09, + "logits/chosen": 1.018761396408081, + "logits/rejected": 2.47017765045166, + "logps/chosen": -323.1969299316406, + "logps/rejected": -533.069091796875, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4490963220596313, + "rewards/margins": 17.300067901611328, + "rewards/rejected": -15.850972175598145, + "step": 8800 + }, + { + "epoch": 2.99, + "eval_logits/chosen": 0.36471885442733765, + "eval_logits/rejected": 2.3464467525482178, + "eval_logps/chosen": -371.68450927734375, + "eval_logps/rejected": -656.2389526367188, + "eval_loss": 0.004039596766233444, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 1.730539083480835, + "eval_rewards/margins": 17.091402053833008, + "eval_rewards/rejected": -15.360862731933594, + "eval_runtime": 269.8964, + "eval_samples_per_second": 35.199, + "eval_steps_per_second": 1.1, + "step": 8800 + }, + { + "epoch": 2.99, + "learning_rate": 1.0071761299257208e-09, + "logits/chosen": 0.8230178952217102, + "logits/rejected": 2.178173780441284, + "logps/chosen": -360.5575256347656, + "logps/rejected": -679.1911010742188, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.107916831970215, + "rewards/margins": 18.536773681640625, + "rewards/rejected": -16.428855895996094, + "step": 8810 + }, + { + "epoch": 3.0, + "learning_rate": 3.7769104872214527e-10, + "logits/chosen": 0.9183434247970581, + "logits/rejected": 2.830625057220459, + "logps/chosen": -334.22283935546875, + "logps/rejected": -536.5045776367188, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4037619829177856, + "rewards/margins": 16.705692291259766, + "rewards/rejected": -15.301928520202637, + "step": 8820 + }, + { + "epoch": 3.0, + "step": 8826, + "total_flos": 0.0, + "train_loss": 0.03725933715852631, + "train_runtime": 48940.1741, + "train_samples_per_second": 11.542, + "train_steps_per_second": 0.18 + } + ], + "logging_steps": 10, + "max_steps": 8826, + "num_train_epochs": 3, + "save_steps": 500, + "total_flos": 0.0, + "trial_name": null, + "trial_params": null +}