diff --git "a/checkpoint-1000/trainer_state.json" "b/checkpoint-1000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1000/trainer_state.json" @@ -0,0 +1,14021 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.3086864060199574, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.000000000000001e-07, + "logits/chosen": -1.9473017454147339, + "logits/rejected": -1.9154374599456787, + "logps/chosen": -178.9344940185547, + "logps/rejected": -157.74179077148438, + "loss": 0.6983, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0024076940026134253, + "rewards/margins": -0.008672237396240234, + "rewards/rejected": 0.011079930700361729, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 1.0000000000000002e-06, + "logits/chosen": -1.6428608894348145, + "logits/rejected": -1.720033884048462, + "logps/chosen": -144.77987670898438, + "logps/rejected": -144.8594207763672, + "loss": 0.6891, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.010233569890260696, + "rewards/margins": 0.00934591330587864, + "rewards/rejected": 0.0008876564679667354, + "step": 2 + }, + { + "epoch": 0.0, + "learning_rate": 1.5e-06, + "logits/chosen": -2.0388426780700684, + "logits/rejected": -1.988499402999878, + "logps/chosen": -212.5189208984375, + "logps/rejected": -199.8199462890625, + "loss": 0.6912, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.010333014652132988, + "rewards/margins": 0.005366659723222256, + "rewards/rejected": 0.004966353997588158, + "step": 3 + }, + { + "epoch": 0.01, + "learning_rate": 2.0000000000000003e-06, + "logits/chosen": -1.9381685256958008, + "logits/rejected": -1.9115777015686035, + "logps/chosen": -141.91700744628906, + "logps/rejected": -141.35037231445312, + "loss": 0.6876, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.006317043211311102, + "rewards/margins": 0.01223981473594904, + "rewards/rejected": -0.005922770127654076, + "step": 4 + }, + { + "epoch": 0.01, + "learning_rate": 2.5e-06, + "logits/chosen": -2.0682923793792725, + "logits/rejected": -2.116490364074707, + "logps/chosen": -144.48883056640625, + "logps/rejected": -136.4881591796875, + "loss": 0.6905, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.004738474264740944, + "rewards/margins": 0.006069636438041925, + "rewards/rejected": -0.0013311614748090506, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 3e-06, + "logits/chosen": -1.8117401599884033, + "logits/rejected": -1.763953447341919, + "logps/chosen": -151.67367553710938, + "logps/rejected": -137.02761840820312, + "loss": 0.6984, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.004828071687370539, + "rewards/margins": -0.008975815027952194, + "rewards/rejected": 0.00414774427190423, + "step": 6 + }, + { + "epoch": 0.01, + "learning_rate": 3.5000000000000004e-06, + "logits/chosen": -2.0915303230285645, + "logits/rejected": -2.1096673011779785, + "logps/chosen": -146.31625366210938, + "logps/rejected": -154.5668487548828, + "loss": 0.6955, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.002810073085129261, + "rewards/margins": -0.004074001684784889, + "rewards/rejected": 0.001263928133994341, + "step": 7 + }, + { + "epoch": 0.01, + "learning_rate": 4.000000000000001e-06, + "logits/chosen": -2.126337766647339, + "logits/rejected": -2.126678228378296, + "logps/chosen": -204.9049072265625, + "logps/rejected": -203.91268920898438, + "loss": 0.6962, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.008610963821411133, + "rewards/margins": -0.005686474964022636, + "rewards/rejected": 0.014297439716756344, + "step": 8 + }, + { + "epoch": 0.01, + "learning_rate": 4.5e-06, + "logits/chosen": -2.2768051624298096, + "logits/rejected": -2.281789541244507, + "logps/chosen": -153.13116455078125, + "logps/rejected": -150.92642211914062, + "loss": 0.6874, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.01126334723085165, + "rewards/margins": 0.012444520369172096, + "rewards/rejected": -0.0011811736039817333, + "step": 9 + }, + { + "epoch": 0.01, + "learning_rate": 5e-06, + "logits/chosen": -1.9441958665847778, + "logits/rejected": -1.964548110961914, + "logps/chosen": -149.23391723632812, + "logps/rejected": -137.14862060546875, + "loss": 0.6881, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.020777558907866478, + "rewards/margins": 0.011875724419951439, + "rewards/rejected": 0.008901833556592464, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 5.500000000000001e-06, + "logits/chosen": -2.147075653076172, + "logits/rejected": -2.1675617694854736, + "logps/chosen": -174.22434997558594, + "logps/rejected": -178.1639404296875, + "loss": 0.6947, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.013791371136903763, + "rewards/margins": -0.0025409706868231297, + "rewards/rejected": 0.016332341358065605, + "step": 11 + }, + { + "epoch": 0.02, + "learning_rate": 6e-06, + "logits/chosen": -1.9246153831481934, + "logits/rejected": -1.9109158515930176, + "logps/chosen": -150.50784301757812, + "logps/rejected": -180.81753540039062, + "loss": 0.7001, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.012552834115922451, + "rewards/margins": -0.012693023309111595, + "rewards/rejected": 0.00014019012451171875, + "step": 12 + }, + { + "epoch": 0.02, + "learning_rate": 6.5000000000000004e-06, + "logits/chosen": -1.9389597177505493, + "logits/rejected": -1.9608503580093384, + "logps/chosen": -205.6465606689453, + "logps/rejected": -201.75946044921875, + "loss": 0.6865, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.014348938129842281, + "rewards/margins": 0.013899493962526321, + "rewards/rejected": 0.00044944253750145435, + "step": 13 + }, + { + "epoch": 0.02, + "learning_rate": 7.000000000000001e-06, + "logits/chosen": -1.9243704080581665, + "logits/rejected": -1.9085144996643066, + "logps/chosen": -201.07188415527344, + "logps/rejected": -221.47982788085938, + "loss": 0.6869, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0006500715389847755, + "rewards/margins": 0.013952446170151234, + "rewards/rejected": -0.01460251584649086, + "step": 14 + }, + { + "epoch": 0.02, + "learning_rate": 7.5e-06, + "logits/chosen": -2.1744699478149414, + "logits/rejected": -2.13222599029541, + "logps/chosen": -204.4735107421875, + "logps/rejected": -200.4049072265625, + "loss": 0.6807, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.017023086547851562, + "rewards/margins": 0.02616300620138645, + "rewards/rejected": -0.009139918722212315, + "step": 15 + }, + { + "epoch": 0.02, + "learning_rate": 8.000000000000001e-06, + "logits/chosen": -2.1755266189575195, + "logits/rejected": -2.150144338607788, + "logps/chosen": -160.6802215576172, + "logps/rejected": -157.8507080078125, + "loss": 0.6891, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.007808923255652189, + "rewards/margins": 0.009649563580751419, + "rewards/rejected": -0.0018406407907605171, + "step": 16 + }, + { + "epoch": 0.02, + "learning_rate": 8.500000000000002e-06, + "logits/chosen": -2.1014106273651123, + "logits/rejected": -2.065537452697754, + "logps/chosen": -137.4163360595703, + "logps/rejected": -137.72653198242188, + "loss": 0.6789, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.019019413739442825, + "rewards/margins": 0.030721498653292656, + "rewards/rejected": -0.01170208491384983, + "step": 17 + }, + { + "epoch": 0.02, + "learning_rate": 9e-06, + "logits/chosen": -2.126314401626587, + "logits/rejected": -2.1292574405670166, + "logps/chosen": -157.85987854003906, + "logps/rejected": -175.0936279296875, + "loss": 0.69, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.008974838070571423, + "rewards/margins": 0.007031917572021484, + "rewards/rejected": 0.0019429202657192945, + "step": 18 + }, + { + "epoch": 0.02, + "learning_rate": 9.5e-06, + "logits/chosen": -1.821614384651184, + "logits/rejected": -1.8386234045028687, + "logps/chosen": -157.74339294433594, + "logps/rejected": -173.5108642578125, + "loss": 0.6854, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.0003479006700217724, + "rewards/margins": 0.016933869570493698, + "rewards/rejected": -0.016585970297455788, + "step": 19 + }, + { + "epoch": 0.03, + "learning_rate": 1e-05, + "logits/chosen": -2.3475935459136963, + "logits/rejected": -2.3527560234069824, + "logps/chosen": -198.3360137939453, + "logps/rejected": -197.68064880371094, + "loss": 0.7084, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.014697913080453873, + "rewards/margins": -0.029072880744934082, + "rewards/rejected": 0.014374972321093082, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 1.05e-05, + "logits/chosen": -1.9768266677856445, + "logits/rejected": -1.979588508605957, + "logps/chosen": -138.13516235351562, + "logps/rejected": -137.23497009277344, + "loss": 0.7008, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.008711148053407669, + "rewards/margins": -0.014900517649948597, + "rewards/rejected": 0.006189371459186077, + "step": 21 + }, + { + "epoch": 0.03, + "learning_rate": 1.1000000000000001e-05, + "logits/chosen": -2.020932197570801, + "logits/rejected": -2.0543789863586426, + "logps/chosen": -143.21487426757812, + "logps/rejected": -145.47467041015625, + "loss": 0.7039, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0006606103852391243, + "rewards/margins": -0.019345475360751152, + "rewards/rejected": 0.0200060848146677, + "step": 22 + }, + { + "epoch": 0.03, + "learning_rate": 1.1500000000000002e-05, + "logits/chosen": -1.9338953495025635, + "logits/rejected": -1.8950414657592773, + "logps/chosen": -151.0517578125, + "logps/rejected": -144.9283447265625, + "loss": 0.6729, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.0207887664437294, + "rewards/margins": 0.0420105941593647, + "rewards/rejected": -0.02122182957828045, + "step": 23 + }, + { + "epoch": 0.03, + "learning_rate": 1.2e-05, + "logits/chosen": -2.1229336261749268, + "logits/rejected": -2.1787612438201904, + "logps/chosen": -143.4667510986328, + "logps/rejected": -151.78887939453125, + "loss": 0.6944, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0076406970620155334, + "rewards/margins": -0.001663590781390667, + "rewards/rejected": -0.0059771062806248665, + "step": 24 + }, + { + "epoch": 0.03, + "learning_rate": 1.25e-05, + "logits/chosen": -1.9385018348693848, + "logits/rejected": -1.8521825075149536, + "logps/chosen": -136.98797607421875, + "logps/rejected": -123.29915618896484, + "loss": 0.6874, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.016515588387846947, + "rewards/margins": 0.013585926033556461, + "rewards/rejected": 0.0029296651482582092, + "step": 25 + }, + { + "epoch": 0.03, + "learning_rate": 1.3000000000000001e-05, + "logits/chosen": -2.0494840145111084, + "logits/rejected": -2.0260462760925293, + "logps/chosen": -206.23294067382812, + "logps/rejected": -204.59170532226562, + "loss": 0.7052, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.005298185162246227, + "rewards/margins": -0.022758912295103073, + "rewards/rejected": 0.01746072620153427, + "step": 26 + }, + { + "epoch": 0.04, + "learning_rate": 1.3500000000000001e-05, + "logits/chosen": -2.185777425765991, + "logits/rejected": -2.16872501373291, + "logps/chosen": -166.58811950683594, + "logps/rejected": -168.48207092285156, + "loss": 0.6994, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.02519994042813778, + "rewards/margins": -0.011203411035239697, + "rewards/rejected": -0.013996529392898083, + "step": 27 + }, + { + "epoch": 0.04, + "learning_rate": 1.4000000000000001e-05, + "logits/chosen": -2.1363608837127686, + "logits/rejected": -2.1508541107177734, + "logps/chosen": -174.48770141601562, + "logps/rejected": -169.1255645751953, + "loss": 0.7026, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.008833074010908604, + "rewards/margins": -0.01709108054637909, + "rewards/rejected": 0.008258008398115635, + "step": 28 + }, + { + "epoch": 0.04, + "learning_rate": 1.45e-05, + "logits/chosen": -2.1649487018585205, + "logits/rejected": -2.170478582382202, + "logps/chosen": -166.38059997558594, + "logps/rejected": -170.26541137695312, + "loss": 0.7105, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.028479862958192825, + "rewards/margins": -0.0317809097468853, + "rewards/rejected": 0.0033010481856763363, + "step": 29 + }, + { + "epoch": 0.04, + "learning_rate": 1.5e-05, + "logits/chosen": -2.190495014190674, + "logits/rejected": -2.2020809650421143, + "logps/chosen": -167.83895874023438, + "logps/rejected": -171.6207275390625, + "loss": 0.6924, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.007887793704867363, + "rewards/margins": 0.0026388168334960938, + "rewards/rejected": 0.005248976871371269, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 1.55e-05, + "logits/chosen": -2.0097718238830566, + "logits/rejected": -1.9927585124969482, + "logps/chosen": -130.2247314453125, + "logps/rejected": -149.3783416748047, + "loss": 0.6896, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.010196828283369541, + "rewards/margins": 0.008201027289032936, + "rewards/rejected": 0.0019958019256591797, + "step": 31 + }, + { + "epoch": 0.04, + "learning_rate": 1.6000000000000003e-05, + "logits/chosen": -1.9905472993850708, + "logits/rejected": -2.0040249824523926, + "logps/chosen": -176.70541381835938, + "logps/rejected": -200.47962951660156, + "loss": 0.7047, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0031967416871339083, + "rewards/margins": -0.021031878888607025, + "rewards/rejected": 0.024228623136878014, + "step": 32 + }, + { + "epoch": 0.04, + "learning_rate": 1.65e-05, + "logits/chosen": -2.0395946502685547, + "logits/rejected": -2.019467353820801, + "logps/chosen": -136.9028778076172, + "logps/rejected": -118.68156433105469, + "loss": 0.6872, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.02108776569366455, + "rewards/margins": 0.013590503484010696, + "rewards/rejected": 0.007497262209653854, + "step": 33 + }, + { + "epoch": 0.04, + "learning_rate": 1.7000000000000003e-05, + "logits/chosen": -2.0771450996398926, + "logits/rejected": -2.1549808979034424, + "logps/chosen": -133.3074493408203, + "logps/rejected": -141.6013946533203, + "loss": 0.6851, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.01747903786599636, + "rewards/margins": 0.017702102661132812, + "rewards/rejected": -0.00022306526079773903, + "step": 34 + }, + { + "epoch": 0.05, + "learning_rate": 1.75e-05, + "logits/chosen": -2.1276822090148926, + "logits/rejected": -2.078389883041382, + "logps/chosen": -154.18087768554688, + "logps/rejected": -135.57997131347656, + "loss": 0.7035, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.009335566312074661, + "rewards/margins": -0.019415616989135742, + "rewards/rejected": 0.028751183301210403, + "step": 35 + }, + { + "epoch": 0.05, + "learning_rate": 1.8e-05, + "logits/chosen": -2.0852906703948975, + "logits/rejected": -2.04500675201416, + "logps/chosen": -167.43182373046875, + "logps/rejected": -164.1446533203125, + "loss": 0.6959, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.02079620398581028, + "rewards/margins": -0.0038356767036020756, + "rewards/rejected": 0.024631882086396217, + "step": 36 + }, + { + "epoch": 0.05, + "learning_rate": 1.85e-05, + "logits/chosen": -1.9053428173065186, + "logits/rejected": -1.8860125541687012, + "logps/chosen": -186.79791259765625, + "logps/rejected": -183.456298828125, + "loss": 0.7025, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.009571265429258347, + "rewards/margins": -0.01751232147216797, + "rewards/rejected": 0.027083586901426315, + "step": 37 + }, + { + "epoch": 0.05, + "learning_rate": 1.9e-05, + "logits/chosen": -1.9760335683822632, + "logits/rejected": -2.062265634536743, + "logps/chosen": -131.81793212890625, + "logps/rejected": -126.02755737304688, + "loss": 0.6897, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.006855441257357597, + "rewards/margins": 0.009298227727413177, + "rewards/rejected": -0.016153670847415924, + "step": 38 + }, + { + "epoch": 0.05, + "learning_rate": 1.9500000000000003e-05, + "logits/chosen": -1.9339022636413574, + "logits/rejected": -1.9163322448730469, + "logps/chosen": -154.1663818359375, + "logps/rejected": -155.90472412109375, + "loss": 0.6856, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.028609374538064003, + "rewards/margins": 0.01638217084109783, + "rewards/rejected": 0.012227201834321022, + "step": 39 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "logits/chosen": -1.9966247081756592, + "logits/rejected": -1.949063777923584, + "logps/chosen": -167.77857971191406, + "logps/rejected": -169.52732849121094, + "loss": 0.6966, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.09298010170459747, + "rewards/margins": -0.005213452503085136, + "rewards/rejected": 0.09819354861974716, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 2.05e-05, + "logits/chosen": -2.129456043243408, + "logits/rejected": -2.1342248916625977, + "logps/chosen": -133.3363494873047, + "logps/rejected": -137.60580444335938, + "loss": 0.6937, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.060414716601371765, + "rewards/margins": 0.00036644982174038887, + "rewards/rejected": 0.060048267245292664, + "step": 41 + }, + { + "epoch": 0.05, + "learning_rate": 2.1e-05, + "logits/chosen": -2.113492012023926, + "logits/rejected": -2.149386167526245, + "logps/chosen": -158.78790283203125, + "logps/rejected": -151.22894287109375, + "loss": 0.6959, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.042578794062137604, + "rewards/margins": -0.003336430061608553, + "rewards/rejected": 0.04591522365808487, + "step": 42 + }, + { + "epoch": 0.06, + "learning_rate": 2.15e-05, + "logits/chosen": -2.14461088180542, + "logits/rejected": -2.127142906188965, + "logps/chosen": -161.65231323242188, + "logps/rejected": -193.1698760986328, + "loss": 0.7047, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.04428081959486008, + "rewards/margins": -0.021442033350467682, + "rewards/rejected": 0.06572284549474716, + "step": 43 + }, + { + "epoch": 0.06, + "learning_rate": 2.2000000000000003e-05, + "logits/chosen": -1.9628074169158936, + "logits/rejected": -1.9140665531158447, + "logps/chosen": -165.8412628173828, + "logps/rejected": -159.8250274658203, + "loss": 0.699, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.01591024361550808, + "rewards/margins": -0.009422186762094498, + "rewards/rejected": -0.006488058716058731, + "step": 44 + }, + { + "epoch": 0.06, + "learning_rate": 2.25e-05, + "logits/chosen": -2.009124517440796, + "logits/rejected": -2.051636219024658, + "logps/chosen": -157.3372039794922, + "logps/rejected": -168.30613708496094, + "loss": 0.6646, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.06606419384479523, + "rewards/margins": 0.060197923332452774, + "rewards/rejected": 0.0058662667870521545, + "step": 45 + }, + { + "epoch": 0.06, + "learning_rate": 2.3000000000000003e-05, + "logits/chosen": -1.9960308074951172, + "logits/rejected": -2.013731002807617, + "logps/chosen": -152.45262145996094, + "logps/rejected": -152.74990844726562, + "loss": 0.6924, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.041900113224983215, + "rewards/margins": 0.005311486776918173, + "rewards/rejected": 0.036588624119758606, + "step": 46 + }, + { + "epoch": 0.06, + "learning_rate": 2.35e-05, + "logits/chosen": -2.084881544113159, + "logits/rejected": -2.051604747772217, + "logps/chosen": -147.22457885742188, + "logps/rejected": -130.90673828125, + "loss": 0.7003, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.027727343142032623, + "rewards/margins": -0.011356806382536888, + "rewards/rejected": 0.03908415138721466, + "step": 47 + }, + { + "epoch": 0.06, + "learning_rate": 2.4e-05, + "logits/chosen": -2.172280788421631, + "logits/rejected": -2.161738872528076, + "logps/chosen": -169.7745819091797, + "logps/rejected": -169.87936401367188, + "loss": 0.6954, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.026907825842499733, + "rewards/margins": -0.0014351843856275082, + "rewards/rejected": 0.02834300883114338, + "step": 48 + }, + { + "epoch": 0.06, + "learning_rate": 2.45e-05, + "logits/chosen": -2.127919912338257, + "logits/rejected": -2.0433998107910156, + "logps/chosen": -170.56634521484375, + "logps/rejected": -158.96981811523438, + "loss": 0.7064, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.04663754254579544, + "rewards/margins": -0.023569582030177116, + "rewards/rejected": 0.07020711898803711, + "step": 49 + }, + { + "epoch": 0.07, + "learning_rate": 2.5e-05, + "logits/chosen": -2.0349974632263184, + "logits/rejected": -1.9954936504364014, + "logps/chosen": -198.20065307617188, + "logps/rejected": -184.9116973876953, + "loss": 0.6746, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.031090188771486282, + "rewards/margins": 0.039133429527282715, + "rewards/rejected": -0.008043241687119007, + "step": 50 + }, + { + "epoch": 0.07, + "learning_rate": 2.5500000000000003e-05, + "logits/chosen": -2.1735117435455322, + "logits/rejected": -2.1993043422698975, + "logps/chosen": -152.16355895996094, + "logps/rejected": -152.68006896972656, + "loss": 0.7076, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.03591465950012207, + "rewards/margins": -0.024077631533145905, + "rewards/rejected": 0.05999229848384857, + "step": 51 + }, + { + "epoch": 0.07, + "learning_rate": 2.6000000000000002e-05, + "logits/chosen": -1.999128818511963, + "logits/rejected": -2.059635877609253, + "logps/chosen": -201.5443878173828, + "logps/rejected": -227.005859375, + "loss": 0.6924, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.02559967152774334, + "rewards/margins": 0.004609701223671436, + "rewards/rejected": 0.02098996937274933, + "step": 52 + }, + { + "epoch": 0.07, + "learning_rate": 2.6500000000000004e-05, + "logits/chosen": -2.162651538848877, + "logits/rejected": -2.1881916522979736, + "logps/chosen": -124.34934997558594, + "logps/rejected": -130.11549377441406, + "loss": 0.6881, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.08756237477064133, + "rewards/margins": 0.012067937292158604, + "rewards/rejected": 0.0754944384098053, + "step": 53 + }, + { + "epoch": 0.07, + "learning_rate": 2.7000000000000002e-05, + "logits/chosen": -1.9938299655914307, + "logits/rejected": -2.034883499145508, + "logps/chosen": -167.63168334960938, + "logps/rejected": -171.09104919433594, + "loss": 0.6914, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0470796562731266, + "rewards/margins": 0.00963954720646143, + "rewards/rejected": 0.03744010999798775, + "step": 54 + }, + { + "epoch": 0.07, + "learning_rate": 2.7500000000000004e-05, + "logits/chosen": -2.0790414810180664, + "logits/rejected": -2.1538212299346924, + "logps/chosen": -155.3251953125, + "logps/rejected": -174.05441284179688, + "loss": 0.701, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.041025254875421524, + "rewards/margins": -0.013429548591375351, + "rewards/rejected": 0.054454803466796875, + "step": 55 + }, + { + "epoch": 0.07, + "learning_rate": 2.8000000000000003e-05, + "logits/chosen": -2.1940407752990723, + "logits/rejected": -2.2308764457702637, + "logps/chosen": -187.58314514160156, + "logps/rejected": -175.3198699951172, + "loss": 0.663, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.09865675121545792, + "rewards/margins": 0.06462635844945908, + "rewards/rejected": 0.03403039276599884, + "step": 56 + }, + { + "epoch": 0.07, + "learning_rate": 2.8499999999999998e-05, + "logits/chosen": -2.0591015815734863, + "logits/rejected": -2.084754705429077, + "logps/chosen": -170.54522705078125, + "logps/rejected": -157.01759338378906, + "loss": 0.6995, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.030951734632253647, + "rewards/margins": -0.004660461097955704, + "rewards/rejected": 0.03561220318078995, + "step": 57 + }, + { + "epoch": 0.08, + "learning_rate": 2.9e-05, + "logits/chosen": -1.9737329483032227, + "logits/rejected": -1.965449333190918, + "logps/chosen": -149.2818603515625, + "logps/rejected": -140.09564208984375, + "loss": 0.6789, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07900600135326385, + "rewards/margins": 0.030826283618807793, + "rewards/rejected": 0.04817971587181091, + "step": 58 + }, + { + "epoch": 0.08, + "learning_rate": 2.95e-05, + "logits/chosen": -1.9386465549468994, + "logits/rejected": -1.9863896369934082, + "logps/chosen": -148.0929718017578, + "logps/rejected": -152.614990234375, + "loss": 0.6867, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06615930050611496, + "rewards/margins": 0.014939931221306324, + "rewards/rejected": 0.05121936649084091, + "step": 59 + }, + { + "epoch": 0.08, + "learning_rate": 3e-05, + "logits/chosen": -2.1406030654907227, + "logits/rejected": -2.140972852706909, + "logps/chosen": -164.55569458007812, + "logps/rejected": -160.45802307128906, + "loss": 0.6848, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.07306110858917236, + "rewards/margins": 0.02018454112112522, + "rewards/rejected": 0.05287656933069229, + "step": 60 + }, + { + "epoch": 0.08, + "learning_rate": 3.05e-05, + "logits/chosen": -2.1821627616882324, + "logits/rejected": -2.23848819732666, + "logps/chosen": -154.3368682861328, + "logps/rejected": -153.51547241210938, + "loss": 0.6941, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.07242944836616516, + "rewards/margins": 9.055249392986298e-05, + "rewards/rejected": 0.07233888655900955, + "step": 61 + }, + { + "epoch": 0.08, + "learning_rate": 3.1e-05, + "logits/chosen": -2.1834821701049805, + "logits/rejected": -2.132404327392578, + "logps/chosen": -148.86456298828125, + "logps/rejected": -149.2091522216797, + "loss": 0.7058, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.09177599102258682, + "rewards/margins": -0.021642468869686127, + "rewards/rejected": 0.11341846734285355, + "step": 62 + }, + { + "epoch": 0.08, + "learning_rate": 3.15e-05, + "logits/chosen": -2.037621259689331, + "logits/rejected": -2.0467352867126465, + "logps/chosen": -151.49147033691406, + "logps/rejected": -165.01246643066406, + "loss": 0.7098, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.1084907054901123, + "rewards/margins": -0.023577161133289337, + "rewards/rejected": 0.13206787407398224, + "step": 63 + }, + { + "epoch": 0.08, + "learning_rate": 3.2000000000000005e-05, + "logits/chosen": -2.062051773071289, + "logits/rejected": -1.9661951065063477, + "logps/chosen": -139.70193481445312, + "logps/rejected": -155.814697265625, + "loss": 0.6917, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0038987197913229465, + "rewards/margins": 0.0076565514318645, + "rewards/rejected": -0.0037578358314931393, + "step": 64 + }, + { + "epoch": 0.09, + "learning_rate": 3.2500000000000004e-05, + "logits/chosen": -2.132296562194824, + "logits/rejected": -2.091379404067993, + "logps/chosen": -148.2380828857422, + "logps/rejected": -155.97186279296875, + "loss": 0.6824, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.061218734830617905, + "rewards/margins": 0.02413906902074814, + "rewards/rejected": 0.037079669535160065, + "step": 65 + }, + { + "epoch": 0.09, + "learning_rate": 3.3e-05, + "logits/chosen": -1.8095799684524536, + "logits/rejected": -1.8316328525543213, + "logps/chosen": -168.23243713378906, + "logps/rejected": -163.02633666992188, + "loss": 0.7047, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.09200361371040344, + "rewards/margins": -0.017515014857053757, + "rewards/rejected": 0.1095186173915863, + "step": 66 + }, + { + "epoch": 0.09, + "learning_rate": 3.35e-05, + "logits/chosen": -1.8896549940109253, + "logits/rejected": -1.9064245223999023, + "logps/chosen": -178.73973083496094, + "logps/rejected": -180.1619873046875, + "loss": 0.7115, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.024672793224453926, + "rewards/margins": -0.03270921856164932, + "rewards/rejected": 0.0573820136487484, + "step": 67 + }, + { + "epoch": 0.09, + "learning_rate": 3.4000000000000007e-05, + "logits/chosen": -1.7941502332687378, + "logits/rejected": -1.8412476778030396, + "logps/chosen": -134.13589477539062, + "logps/rejected": -135.673828125, + "loss": 0.6453, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.13328862190246582, + "rewards/margins": 0.10187779366970062, + "rewards/rejected": 0.03141083940863609, + "step": 68 + }, + { + "epoch": 0.09, + "learning_rate": 3.45e-05, + "logits/chosen": -2.1372315883636475, + "logits/rejected": -2.158930778503418, + "logps/chosen": -142.1901397705078, + "logps/rejected": -140.1024169921875, + "loss": 0.7298, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.04432840272784233, + "rewards/margins": -0.06238814443349838, + "rewards/rejected": 0.10671653598546982, + "step": 69 + }, + { + "epoch": 0.09, + "learning_rate": 3.5e-05, + "logits/chosen": -2.2218027114868164, + "logits/rejected": -2.2194578647613525, + "logps/chosen": -172.63040161132812, + "logps/rejected": -171.96815490722656, + "loss": 0.7005, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.033330656588077545, + "rewards/margins": -0.012100504711270332, + "rewards/rejected": 0.04543116316199303, + "step": 70 + }, + { + "epoch": 0.09, + "learning_rate": 3.55e-05, + "logits/chosen": -2.012197971343994, + "logits/rejected": -2.0255186557769775, + "logps/chosen": -217.9222869873047, + "logps/rejected": -226.47398376464844, + "loss": 0.6984, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.058617155998945236, + "rewards/margins": -0.005790230818092823, + "rewards/rejected": 0.06440739333629608, + "step": 71 + }, + { + "epoch": 0.09, + "learning_rate": 3.6e-05, + "logits/chosen": -2.14078688621521, + "logits/rejected": -2.111306667327881, + "logps/chosen": -160.2899169921875, + "logps/rejected": -154.1366424560547, + "loss": 0.6881, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.074866384267807, + "rewards/margins": 0.01636476442217827, + "rewards/rejected": 0.05850161984562874, + "step": 72 + }, + { + "epoch": 0.1, + "learning_rate": 3.65e-05, + "logits/chosen": -1.9476792812347412, + "logits/rejected": -1.9206452369689941, + "logps/chosen": -156.6016845703125, + "logps/rejected": -155.9908905029297, + "loss": 0.6852, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.004470587708055973, + "rewards/margins": 0.024280693382024765, + "rewards/rejected": -0.019810102880001068, + "step": 73 + }, + { + "epoch": 0.1, + "learning_rate": 3.7e-05, + "logits/chosen": -2.07848858833313, + "logits/rejected": -2.1027286052703857, + "logps/chosen": -153.7880096435547, + "logps/rejected": -151.38841247558594, + "loss": 0.7136, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.049511026591062546, + "rewards/margins": -0.0270721185952425, + "rewards/rejected": 0.0765831470489502, + "step": 74 + }, + { + "epoch": 0.1, + "learning_rate": 3.7500000000000003e-05, + "logits/chosen": -2.0560519695281982, + "logits/rejected": -2.0237274169921875, + "logps/chosen": -168.32672119140625, + "logps/rejected": -166.328125, + "loss": 0.6885, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.01440761424601078, + "rewards/margins": 0.0161895789206028, + "rewards/rejected": -0.0017819646745920181, + "step": 75 + }, + { + "epoch": 0.1, + "learning_rate": 3.8e-05, + "logits/chosen": -1.9260778427124023, + "logits/rejected": -1.9472447633743286, + "logps/chosen": -145.28759765625, + "logps/rejected": -149.98504638671875, + "loss": 0.701, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.034238480031490326, + "rewards/margins": -0.005065919831395149, + "rewards/rejected": 0.039304401725530624, + "step": 76 + }, + { + "epoch": 0.1, + "learning_rate": 3.85e-05, + "logits/chosen": -1.6790459156036377, + "logits/rejected": -1.6938380002975464, + "logps/chosen": -169.58250427246094, + "logps/rejected": -171.97097778320312, + "loss": 0.7036, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12939399480819702, + "rewards/margins": -0.008996442891657352, + "rewards/rejected": 0.1383904367685318, + "step": 77 + }, + { + "epoch": 0.1, + "learning_rate": 3.9000000000000006e-05, + "logits/chosen": -2.2361156940460205, + "logits/rejected": -2.1365509033203125, + "logps/chosen": -207.76535034179688, + "logps/rejected": -197.6790313720703, + "loss": 0.6763, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.10514166951179504, + "rewards/margins": 0.03811817243695259, + "rewards/rejected": 0.06702349334955215, + "step": 78 + }, + { + "epoch": 0.1, + "learning_rate": 3.9500000000000005e-05, + "logits/chosen": -2.0133023262023926, + "logits/rejected": -2.0518369674682617, + "logps/chosen": -143.17681884765625, + "logps/rejected": -136.48175048828125, + "loss": 0.6976, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.05453691631555557, + "rewards/margins": -0.002915834542363882, + "rewards/rejected": 0.05745274946093559, + "step": 79 + }, + { + "epoch": 0.1, + "learning_rate": 4e-05, + "logits/chosen": -2.024761438369751, + "logits/rejected": -2.054973840713501, + "logps/chosen": -153.51858520507812, + "logps/rejected": -142.02432250976562, + "loss": 0.6967, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.01629617251455784, + "rewards/margins": -0.0009380597621202469, + "rewards/rejected": 0.017234232276678085, + "step": 80 + }, + { + "epoch": 0.11, + "learning_rate": 4.05e-05, + "logits/chosen": -2.0025839805603027, + "logits/rejected": -2.0151402950286865, + "logps/chosen": -159.1929931640625, + "logps/rejected": -168.1624755859375, + "loss": 0.687, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.06722717732191086, + "rewards/margins": 0.03626187518239021, + "rewards/rejected": 0.030965294688940048, + "step": 81 + }, + { + "epoch": 0.11, + "learning_rate": 4.1e-05, + "logits/chosen": -1.9327080249786377, + "logits/rejected": -1.947874665260315, + "logps/chosen": -215.408935546875, + "logps/rejected": -199.43902587890625, + "loss": 0.7457, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.1100463718175888, + "rewards/margins": -0.09321331977844238, + "rewards/rejected": -0.016833044588565826, + "step": 82 + }, + { + "epoch": 0.11, + "learning_rate": 4.15e-05, + "logits/chosen": -1.9191949367523193, + "logits/rejected": -1.9307305812835693, + "logps/chosen": -170.3133544921875, + "logps/rejected": -154.84254455566406, + "loss": 0.7279, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.03949739784002304, + "rewards/margins": -0.06344561278820038, + "rewards/rejected": 0.023948216810822487, + "step": 83 + }, + { + "epoch": 0.11, + "learning_rate": 4.2e-05, + "logits/chosen": -1.8360068798065186, + "logits/rejected": -1.8425829410552979, + "logps/chosen": -165.39141845703125, + "logps/rejected": -161.62539672851562, + "loss": 0.7216, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07253064215183258, + "rewards/margins": -0.03998390585184097, + "rewards/rejected": 0.11251455545425415, + "step": 84 + }, + { + "epoch": 0.11, + "learning_rate": 4.25e-05, + "logits/chosen": -1.9030508995056152, + "logits/rejected": -1.9330065250396729, + "logps/chosen": -209.78103637695312, + "logps/rejected": -220.98043823242188, + "loss": 0.6309, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.0014497279189527035, + "rewards/margins": 0.13583050668239594, + "rewards/rejected": -0.1343807578086853, + "step": 85 + }, + { + "epoch": 0.11, + "learning_rate": 4.3e-05, + "logits/chosen": -1.9742060899734497, + "logits/rejected": -1.9866974353790283, + "logps/chosen": -171.7286376953125, + "logps/rejected": -168.8912811279297, + "loss": 0.7484, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.025174710899591446, + "rewards/margins": -0.09477367997169495, + "rewards/rejected": 0.1199483871459961, + "step": 86 + }, + { + "epoch": 0.11, + "learning_rate": 4.35e-05, + "logits/chosen": -2.1695969104766846, + "logits/rejected": -2.252450704574585, + "logps/chosen": -125.7203598022461, + "logps/rejected": -158.59054565429688, + "loss": 0.7255, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.010992627590894699, + "rewards/margins": -0.052416570484638214, + "rewards/rejected": 0.04142393916845322, + "step": 87 + }, + { + "epoch": 0.12, + "learning_rate": 4.4000000000000006e-05, + "logits/chosen": -1.9488649368286133, + "logits/rejected": -1.9779549837112427, + "logps/chosen": -182.26705932617188, + "logps/rejected": -173.61148071289062, + "loss": 0.7215, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.07614026218652725, + "rewards/margins": -0.042962007224559784, + "rewards/rejected": 0.11910226941108704, + "step": 88 + }, + { + "epoch": 0.12, + "learning_rate": 4.4500000000000004e-05, + "logits/chosen": -1.9029545783996582, + "logits/rejected": -1.9423730373382568, + "logps/chosen": -200.17457580566406, + "logps/rejected": -173.20709228515625, + "loss": 0.733, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.07196169346570969, + "rewards/margins": -0.0681467056274414, + "rewards/rejected": 0.1401083916425705, + "step": 89 + }, + { + "epoch": 0.12, + "learning_rate": 4.5e-05, + "logits/chosen": -2.3139305114746094, + "logits/rejected": -2.351571798324585, + "logps/chosen": -146.32611083984375, + "logps/rejected": -156.42579650878906, + "loss": 0.7095, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.0665954127907753, + "rewards/margins": -0.01309509202837944, + "rewards/rejected": -0.05350032076239586, + "step": 90 + }, + { + "epoch": 0.12, + "learning_rate": 4.55e-05, + "logits/chosen": -1.9338881969451904, + "logits/rejected": -1.9347970485687256, + "logps/chosen": -143.15924072265625, + "logps/rejected": -152.71804809570312, + "loss": 0.7129, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.010173199698328972, + "rewards/margins": -0.03304898738861084, + "rewards/rejected": 0.022875778377056122, + "step": 91 + }, + { + "epoch": 0.12, + "learning_rate": 4.600000000000001e-05, + "logits/chosen": -1.7374138832092285, + "logits/rejected": -1.7747021913528442, + "logps/chosen": -187.83636474609375, + "logps/rejected": -160.97560119628906, + "loss": 0.682, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.06837701797485352, + "rewards/margins": 0.042838774621486664, + "rewards/rejected": 0.02553824707865715, + "step": 92 + }, + { + "epoch": 0.12, + "learning_rate": 4.6500000000000005e-05, + "logits/chosen": -2.21986722946167, + "logits/rejected": -2.2679433822631836, + "logps/chosen": -147.17849731445312, + "logps/rejected": -152.95431518554688, + "loss": 0.6438, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.08736428618431091, + "rewards/margins": 0.10906204581260681, + "rewards/rejected": -0.021697763353586197, + "step": 93 + }, + { + "epoch": 0.12, + "learning_rate": 4.7e-05, + "logits/chosen": -2.187103271484375, + "logits/rejected": -2.239633083343506, + "logps/chosen": -124.42755126953125, + "logps/rejected": -125.82307434082031, + "loss": 0.6312, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03520822525024414, + "rewards/margins": 0.14293043315410614, + "rewards/rejected": -0.1077222228050232, + "step": 94 + }, + { + "epoch": 0.12, + "learning_rate": 4.75e-05, + "logits/chosen": -2.1484622955322266, + "logits/rejected": -2.167619228363037, + "logps/chosen": -139.31021118164062, + "logps/rejected": -148.8551483154297, + "loss": 0.6474, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.11308951675891876, + "rewards/margins": 0.10877098888158798, + "rewards/rejected": 0.004318520426750183, + "step": 95 + }, + { + "epoch": 0.13, + "learning_rate": 4.8e-05, + "logits/chosen": -2.1004369258880615, + "logits/rejected": -2.0368094444274902, + "logps/chosen": -142.34259033203125, + "logps/rejected": -131.41265869140625, + "loss": 0.6511, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.06739182770252228, + "rewards/margins": 0.0982145369052887, + "rewards/rejected": -0.03082270734012127, + "step": 96 + }, + { + "epoch": 0.13, + "learning_rate": 4.85e-05, + "logits/chosen": -2.269822597503662, + "logits/rejected": -2.3615293502807617, + "logps/chosen": -159.7416534423828, + "logps/rejected": -158.27255249023438, + "loss": 0.7067, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.042380258440971375, + "rewards/margins": -0.016107436269521713, + "rewards/rejected": -0.02627282217144966, + "step": 97 + }, + { + "epoch": 0.13, + "learning_rate": 4.9e-05, + "logits/chosen": -2.1764469146728516, + "logits/rejected": -2.1454200744628906, + "logps/chosen": -142.2655029296875, + "logps/rejected": -153.2463836669922, + "loss": 0.7423, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.0025094300508499146, + "rewards/margins": -0.08575483411550522, + "rewards/rejected": 0.08324538916349411, + "step": 98 + }, + { + "epoch": 0.13, + "learning_rate": 4.9500000000000004e-05, + "logits/chosen": -2.193749189376831, + "logits/rejected": -2.1985344886779785, + "logps/chosen": -187.168701171875, + "logps/rejected": -197.92352294921875, + "loss": 0.6663, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.06072089821100235, + "rewards/margins": 0.07174615561962128, + "rewards/rejected": -0.011025259271264076, + "step": 99 + }, + { + "epoch": 0.13, + "learning_rate": 5e-05, + "logits/chosen": -2.068103551864624, + "logits/rejected": -2.0740597248077393, + "logps/chosen": -144.1133270263672, + "logps/rejected": -148.28744506835938, + "loss": 0.7095, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.04808714985847473, + "rewards/margins": -0.004095175303518772, + "rewards/rejected": 0.05218231678009033, + "step": 100 + }, + { + "epoch": 0.13, + "learning_rate": 4.999997432392803e-05, + "logits/chosen": -1.8877215385437012, + "logits/rejected": -1.8843252658843994, + "logps/chosen": -212.08737182617188, + "logps/rejected": -159.77037048339844, + "loss": 0.6442, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.05051098391413689, + "rewards/margins": 0.11454086750745773, + "rewards/rejected": -0.06402988731861115, + "step": 101 + }, + { + "epoch": 0.13, + "learning_rate": 4.9999897295764844e-05, + "logits/chosen": -2.040144205093384, + "logits/rejected": -2.0494396686553955, + "logps/chosen": -160.35443115234375, + "logps/rejected": -169.1414794921875, + "loss": 0.8033, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.07908819615840912, + "rewards/margins": -0.19221995770931244, + "rewards/rejected": 0.11313176155090332, + "step": 102 + }, + { + "epoch": 0.13, + "learning_rate": 4.9999768915668665e-05, + "logits/chosen": -2.148070812225342, + "logits/rejected": -2.1513681411743164, + "logps/chosen": -145.71995544433594, + "logps/rejected": -140.0707550048828, + "loss": 0.7564, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.12240414321422577, + "rewards/margins": -0.10564970970153809, + "rewards/rejected": -0.01675444096326828, + "step": 103 + }, + { + "epoch": 0.14, + "learning_rate": 4.999958918390321e-05, + "logits/chosen": -2.0417237281799316, + "logits/rejected": -1.9771349430084229, + "logps/chosen": -146.751708984375, + "logps/rejected": -139.89523315429688, + "loss": 0.7258, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.053674325346946716, + "rewards/margins": -0.05639982223510742, + "rewards/rejected": 0.0027255089953541756, + "step": 104 + }, + { + "epoch": 0.14, + "learning_rate": 4.999935810083766e-05, + "logits/chosen": -2.003383159637451, + "logits/rejected": -1.9574511051177979, + "logps/chosen": -173.7794647216797, + "logps/rejected": -171.2220916748047, + "loss": 0.7666, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.051795199513435364, + "rewards/margins": -0.1156022921204567, + "rewards/rejected": 0.06380710750818253, + "step": 105 + }, + { + "epoch": 0.14, + "learning_rate": 4.999907566694667e-05, + "logits/chosen": -2.0609560012817383, + "logits/rejected": -2.101062536239624, + "logps/chosen": -143.58428955078125, + "logps/rejected": -162.1249237060547, + "loss": 0.7173, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.04189272224903107, + "rewards/margins": -0.02848353236913681, + "rewards/rejected": -0.013409186154603958, + "step": 106 + }, + { + "epoch": 0.14, + "learning_rate": 4.9998741882810384e-05, + "logits/chosen": -1.862243890762329, + "logits/rejected": -1.8240137100219727, + "logps/chosen": -177.3428192138672, + "logps/rejected": -179.76573181152344, + "loss": 0.7098, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.03088255040347576, + "rewards/margins": -0.002100745216012001, + "rewards/rejected": -0.028781799599528313, + "step": 107 + }, + { + "epoch": 0.14, + "learning_rate": 4.999835674911443e-05, + "logits/chosen": -1.7740706205368042, + "logits/rejected": -1.810904860496521, + "logps/chosen": -203.03497314453125, + "logps/rejected": -209.43789672851562, + "loss": 0.66, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06684975326061249, + "rewards/margins": 0.08228196948766708, + "rewards/rejected": -0.015432218089699745, + "step": 108 + }, + { + "epoch": 0.14, + "learning_rate": 4.999792026664991e-05, + "logits/chosen": -1.946571707725525, + "logits/rejected": -1.9524767398834229, + "logps/chosen": -160.1234130859375, + "logps/rejected": -150.89892578125, + "loss": 0.6772, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.01813810132443905, + "rewards/margins": 0.042000893503427505, + "rewards/rejected": -0.06013898551464081, + "step": 109 + }, + { + "epoch": 0.14, + "learning_rate": 4.9997432436313384e-05, + "logits/chosen": -2.24985933303833, + "logits/rejected": -2.2947096824645996, + "logps/chosen": -147.19674682617188, + "logps/rejected": -147.4495086669922, + "loss": 0.7232, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.14067719876766205, + "rewards/margins": -0.044903866946697235, + "rewards/rejected": -0.09577332437038422, + "step": 110 + }, + { + "epoch": 0.15, + "learning_rate": 4.99968932591069e-05, + "logits/chosen": -2.289724826812744, + "logits/rejected": -2.1994781494140625, + "logps/chosen": -170.31321716308594, + "logps/rejected": -153.826904296875, + "loss": 0.7918, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1584378331899643, + "rewards/margins": -0.16181893646717072, + "rewards/rejected": 0.0033811070024967194, + "step": 111 + }, + { + "epoch": 0.15, + "learning_rate": 4.999630273613799e-05, + "logits/chosen": -1.6163581609725952, + "logits/rejected": -1.6122593879699707, + "logps/chosen": -168.62997436523438, + "logps/rejected": -222.83038330078125, + "loss": 0.6669, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.057224374264478683, + "rewards/margins": 0.09167467057704926, + "rewards/rejected": -0.03445029631257057, + "step": 112 + }, + { + "epoch": 0.15, + "learning_rate": 4.999566086861961e-05, + "logits/chosen": -2.1440043449401855, + "logits/rejected": -2.1370463371276855, + "logps/chosen": -133.85556030273438, + "logps/rejected": -146.81748962402344, + "loss": 0.7124, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06403251737356186, + "rewards/margins": -0.009389929473400116, + "rewards/rejected": -0.054642580449581146, + "step": 113 + }, + { + "epoch": 0.15, + "learning_rate": 4.999496765787024e-05, + "logits/chosen": -2.1842727661132812, + "logits/rejected": -2.2154009342193604, + "logps/chosen": -151.34469604492188, + "logps/rejected": -153.56678771972656, + "loss": 0.6244, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.009856656193733215, + "rewards/margins": 0.15723924338817596, + "rewards/rejected": -0.14738260209560394, + "step": 114 + }, + { + "epoch": 0.15, + "learning_rate": 4.9994223105313774e-05, + "logits/chosen": -2.212733745574951, + "logits/rejected": -2.2484166622161865, + "logps/chosen": -129.7595672607422, + "logps/rejected": -129.0322265625, + "loss": 0.6389, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0931883379817009, + "rewards/margins": 0.12833790481090546, + "rewards/rejected": -0.035149574279785156, + "step": 115 + }, + { + "epoch": 0.15, + "learning_rate": 4.9993427212479606e-05, + "logits/chosen": -2.1278915405273438, + "logits/rejected": -2.1659746170043945, + "logps/chosen": -166.4802703857422, + "logps/rejected": -177.78651428222656, + "loss": 0.7355, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.13353124260902405, + "rewards/margins": -0.06453972309827805, + "rewards/rejected": -0.068991519510746, + "step": 116 + }, + { + "epoch": 0.15, + "learning_rate": 4.999257998100254e-05, + "logits/chosen": -2.3552539348602295, + "logits/rejected": -2.4131429195404053, + "logps/chosen": -167.33251953125, + "logps/rejected": -171.97781372070312, + "loss": 0.7883, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.13030581176280975, + "rewards/margins": -0.1426038295030594, + "rewards/rejected": 0.012298017740249634, + "step": 117 + }, + { + "epoch": 0.15, + "learning_rate": 4.999168141262289e-05, + "logits/chosen": -2.211704969406128, + "logits/rejected": -2.204983711242676, + "logps/chosen": -206.1199188232422, + "logps/rejected": -197.96646118164062, + "loss": 0.7095, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04005058482289314, + "rewards/margins": -0.022744348272681236, + "rewards/rejected": -0.017306234687566757, + "step": 118 + }, + { + "epoch": 0.16, + "learning_rate": 4.9990731509186376e-05, + "logits/chosen": -2.1870293617248535, + "logits/rejected": -2.1281723976135254, + "logps/chosen": -155.93276977539062, + "logps/rejected": -145.3558807373047, + "loss": 0.6718, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.005868477746844292, + "rewards/margins": 0.055788375437259674, + "rewards/rejected": -0.049919892102479935, + "step": 119 + }, + { + "epoch": 0.16, + "learning_rate": 4.998973027264419e-05, + "logits/chosen": -1.9813153743743896, + "logits/rejected": -1.9183677434921265, + "logps/chosen": -150.78817749023438, + "logps/rejected": -147.15357971191406, + "loss": 0.7039, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.17385157942771912, + "rewards/margins": -0.001015951856970787, + "rewards/rejected": -0.1728356033563614, + "step": 120 + }, + { + "epoch": 0.16, + "learning_rate": 4.998867770505295e-05, + "logits/chosen": -2.391598701477051, + "logits/rejected": -2.3418595790863037, + "logps/chosen": -167.2589111328125, + "logps/rejected": -178.03309631347656, + "loss": 0.6671, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0277772955596447, + "rewards/margins": 0.0589999184012413, + "rewards/rejected": -0.0867772102355957, + "step": 121 + }, + { + "epoch": 0.16, + "learning_rate": 4.9987573808574726e-05, + "logits/chosen": -1.9197032451629639, + "logits/rejected": -2.0230495929718018, + "logps/chosen": -114.3215560913086, + "logps/rejected": -125.48574829101562, + "loss": 0.6731, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.02883918769657612, + "rewards/margins": 0.04838750883936882, + "rewards/rejected": -0.0772266834974289, + "step": 122 + }, + { + "epoch": 0.16, + "learning_rate": 4.9986418585477016e-05, + "logits/chosen": -2.1796233654022217, + "logits/rejected": -2.1268649101257324, + "logps/chosen": -144.93719482421875, + "logps/rejected": -125.82449340820312, + "loss": 0.7403, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0862671434879303, + "rewards/margins": -0.07632093131542206, + "rewards/rejected": -0.009946208447217941, + "step": 123 + }, + { + "epoch": 0.16, + "learning_rate": 4.998521203813274e-05, + "logits/chosen": -1.9971880912780762, + "logits/rejected": -2.0406086444854736, + "logps/chosen": -154.99465942382812, + "logps/rejected": -154.3177947998047, + "loss": 0.753, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.1506948173046112, + "rewards/margins": -0.05945264920592308, + "rewards/rejected": -0.09124217927455902, + "step": 124 + }, + { + "epoch": 0.16, + "learning_rate": 4.9983954169020256e-05, + "logits/chosen": -2.0880544185638428, + "logits/rejected": -2.123401165008545, + "logps/chosen": -169.77288818359375, + "logps/rejected": -179.76673889160156, + "loss": 0.6491, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16007837653160095, + "rewards/margins": 0.12425337731838226, + "rewards/rejected": -0.284331738948822, + "step": 125 + }, + { + "epoch": 0.16, + "learning_rate": 4.9982644980723334e-05, + "logits/chosen": -2.232131242752075, + "logits/rejected": -2.2934720516204834, + "logps/chosen": -159.07867431640625, + "logps/rejected": -156.19357299804688, + "loss": 0.7076, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1424141526222229, + "rewards/margins": 0.0020025279372930527, + "rewards/rejected": -0.1444166749715805, + "step": 126 + }, + { + "epoch": 0.17, + "learning_rate": 4.998128447593117e-05, + "logits/chosen": -2.07320499420166, + "logits/rejected": -2.0934879779815674, + "logps/chosen": -173.55859375, + "logps/rejected": -171.46405029296875, + "loss": 0.6545, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.15888014435768127, + "rewards/margins": 0.14077186584472656, + "rewards/rejected": -0.29965201020240784, + "step": 127 + }, + { + "epoch": 0.17, + "learning_rate": 4.997987265743834e-05, + "logits/chosen": -2.186350107192993, + "logits/rejected": -2.153662919998169, + "logps/chosen": -137.228271484375, + "logps/rejected": -150.18634033203125, + "loss": 0.6362, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.06281783431768417, + "rewards/margins": 0.16370470821857452, + "rewards/rejected": -0.2265225499868393, + "step": 128 + }, + { + "epoch": 0.17, + "learning_rate": 4.997840952814484e-05, + "logits/chosen": -2.0112736225128174, + "logits/rejected": -2.0177900791168213, + "logps/chosen": -194.06292724609375, + "logps/rejected": -184.00973510742188, + "loss": 0.7179, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.24062253534793854, + "rewards/margins": -0.023746546357870102, + "rewards/rejected": -0.21687600016593933, + "step": 129 + }, + { + "epoch": 0.17, + "learning_rate": 4.9976895091056075e-05, + "logits/chosen": -2.066251277923584, + "logits/rejected": -2.070246458053589, + "logps/chosen": -136.72486877441406, + "logps/rejected": -135.44017028808594, + "loss": 0.6315, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02474498748779297, + "rewards/margins": 0.1634208709001541, + "rewards/rejected": -0.13867586851119995, + "step": 130 + }, + { + "epoch": 0.17, + "learning_rate": 4.9975329349282826e-05, + "logits/chosen": -1.9598355293273926, + "logits/rejected": -2.0056591033935547, + "logps/chosen": -123.45001220703125, + "logps/rejected": -134.2187042236328, + "loss": 0.6282, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.014127110131084919, + "rewards/margins": 0.1591949462890625, + "rewards/rejected": -0.17332205176353455, + "step": 131 + }, + { + "epoch": 0.17, + "learning_rate": 4.9973712306041256e-05, + "logits/chosen": -2.207669734954834, + "logits/rejected": -2.261643886566162, + "logps/chosen": -143.61402893066406, + "logps/rejected": -150.94285583496094, + "loss": 0.7032, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08026294410228729, + "rewards/margins": -0.0010632979683578014, + "rewards/rejected": -0.0791996419429779, + "step": 132 + }, + { + "epoch": 0.17, + "learning_rate": 4.997204396465292e-05, + "logits/chosen": -2.2536189556121826, + "logits/rejected": -2.234713554382324, + "logps/chosen": -166.07725524902344, + "logps/rejected": -153.78976440429688, + "loss": 0.7129, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.194389209151268, + "rewards/margins": -0.020291997119784355, + "rewards/rejected": -0.1740972250699997, + "step": 133 + }, + { + "epoch": 0.18, + "learning_rate": 4.997032432854472e-05, + "logits/chosen": -2.1735360622406006, + "logits/rejected": -2.2178611755371094, + "logps/chosen": -196.8380126953125, + "logps/rejected": -206.2840118408203, + "loss": 0.7158, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.007880209013819695, + "rewards/margins": -0.0179959274828434, + "rewards/rejected": 0.010115718469023705, + "step": 134 + }, + { + "epoch": 0.18, + "learning_rate": 4.996855340124894e-05, + "logits/chosen": -2.069317579269409, + "logits/rejected": -2.0932023525238037, + "logps/chosen": -135.76324462890625, + "logps/rejected": -132.4375, + "loss": 0.7887, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.13397419452667236, + "rewards/margins": -0.15053679049015045, + "rewards/rejected": 0.016562584787607193, + "step": 135 + }, + { + "epoch": 0.18, + "learning_rate": 4.996673118640323e-05, + "logits/chosen": -2.2751736640930176, + "logits/rejected": -2.262275218963623, + "logps/chosen": -144.22671508789062, + "logps/rejected": -146.7581787109375, + "loss": 0.6953, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.08663605898618698, + "rewards/margins": 0.018158910796046257, + "rewards/rejected": -0.10479498654603958, + "step": 136 + }, + { + "epoch": 0.18, + "learning_rate": 4.996485768775055e-05, + "logits/chosen": -2.041444778442383, + "logits/rejected": -2.091449022293091, + "logps/chosen": -143.69158935546875, + "logps/rejected": -157.0668182373047, + "loss": 0.7353, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.209483340382576, + "rewards/margins": -0.06017661839723587, + "rewards/rejected": -0.14930672943592072, + "step": 137 + }, + { + "epoch": 0.18, + "learning_rate": 4.996293290913926e-05, + "logits/chosen": -1.9326127767562866, + "logits/rejected": -2.0368032455444336, + "logps/chosen": -147.84547424316406, + "logps/rejected": -203.94833374023438, + "loss": 0.7451, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20097234845161438, + "rewards/margins": -0.03583552688360214, + "rewards/rejected": -0.16513679921627045, + "step": 138 + }, + { + "epoch": 0.18, + "learning_rate": 4.9960956854522986e-05, + "logits/chosen": -2.267336368560791, + "logits/rejected": -2.2412469387054443, + "logps/chosen": -134.1835174560547, + "logps/rejected": -136.10162353515625, + "loss": 0.7371, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.18377897143363953, + "rewards/margins": -0.07501955330371857, + "rewards/rejected": -0.10875942558050156, + "step": 139 + }, + { + "epoch": 0.18, + "learning_rate": 4.995892952796074e-05, + "logits/chosen": -2.3473498821258545, + "logits/rejected": -2.2889528274536133, + "logps/chosen": -179.09548950195312, + "logps/rejected": -176.6771697998047, + "loss": 0.6946, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.167083278298378, + "rewards/margins": 0.044769808650016785, + "rewards/rejected": -0.21185307204723358, + "step": 140 + }, + { + "epoch": 0.18, + "learning_rate": 4.995685093361682e-05, + "logits/chosen": -2.046065330505371, + "logits/rejected": -2.105637311935425, + "logps/chosen": -156.4433135986328, + "logps/rejected": -193.69764709472656, + "loss": 0.6491, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.10867753624916077, + "rewards/margins": 0.12942247092723846, + "rewards/rejected": -0.23810002207756042, + "step": 141 + }, + { + "epoch": 0.19, + "learning_rate": 4.9954721075760824e-05, + "logits/chosen": -2.135222911834717, + "logits/rejected": -2.0551419258117676, + "logps/chosen": -160.54461669921875, + "logps/rejected": -154.21876525878906, + "loss": 0.7472, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3932434618473053, + "rewards/margins": -0.07500467449426651, + "rewards/rejected": -0.318238765001297, + "step": 142 + }, + { + "epoch": 0.19, + "learning_rate": 4.995253995876767e-05, + "logits/chosen": -2.36671781539917, + "logits/rejected": -2.3601675033569336, + "logps/chosen": -161.03819274902344, + "logps/rejected": -143.7089385986328, + "loss": 0.811, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.37680506706237793, + "rewards/margins": -0.192378431558609, + "rewards/rejected": -0.18442663550376892, + "step": 143 + }, + { + "epoch": 0.19, + "learning_rate": 4.995030758711756e-05, + "logits/chosen": -1.7832953929901123, + "logits/rejected": -1.7478176355361938, + "logps/chosen": -143.82127380371094, + "logps/rejected": -146.1196746826172, + "loss": 0.7393, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.32953861355781555, + "rewards/margins": -0.02949490211904049, + "rewards/rejected": -0.3000437021255493, + "step": 144 + }, + { + "epoch": 0.19, + "learning_rate": 4.994802396539598e-05, + "logits/chosen": -1.5314689874649048, + "logits/rejected": -1.5684059858322144, + "logps/chosen": -239.08958435058594, + "logps/rejected": -253.18838500976562, + "loss": 0.6769, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.31847676634788513, + "rewards/margins": 0.07425765693187714, + "rewards/rejected": -0.3927344083786011, + "step": 145 + }, + { + "epoch": 0.19, + "learning_rate": 4.994568909829368e-05, + "logits/chosen": -2.0874621868133545, + "logits/rejected": -2.074917793273926, + "logps/chosen": -161.60157775878906, + "logps/rejected": -152.967529296875, + "loss": 0.7313, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.36861109733581543, + "rewards/margins": -0.05294780433177948, + "rewards/rejected": -0.31566327810287476, + "step": 146 + }, + { + "epoch": 0.19, + "learning_rate": 4.9943302990606684e-05, + "logits/chosen": -1.996845006942749, + "logits/rejected": -1.9692479372024536, + "logps/chosen": -157.77352905273438, + "logps/rejected": -132.0946044921875, + "loss": 0.8205, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.32108697295188904, + "rewards/margins": -0.1798648089170456, + "rewards/rejected": -0.14122214913368225, + "step": 147 + }, + { + "epoch": 0.19, + "learning_rate": 4.994086564723626e-05, + "logits/chosen": -2.1677422523498535, + "logits/rejected": -2.18229341506958, + "logps/chosen": -160.99119567871094, + "logps/rejected": -180.38742065429688, + "loss": 0.6763, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5339494943618774, + "rewards/margins": 0.05808330327272415, + "rewards/rejected": -0.592032790184021, + "step": 148 + }, + { + "epoch": 0.19, + "learning_rate": 4.9938377073188905e-05, + "logits/chosen": -2.201998233795166, + "logits/rejected": -2.1696887016296387, + "logps/chosen": -155.02134704589844, + "logps/rejected": -140.8427734375, + "loss": 0.7243, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.29963669180870056, + "rewards/margins": -0.020382262766361237, + "rewards/rejected": -0.2792544364929199, + "step": 149 + }, + { + "epoch": 0.2, + "learning_rate": 4.993583727357638e-05, + "logits/chosen": -2.0553178787231445, + "logits/rejected": -2.0267903804779053, + "logps/chosen": -148.95738220214844, + "logps/rejected": -158.5399932861328, + "loss": 0.6162, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2186848223209381, + "rewards/margins": 0.20554782450199127, + "rewards/rejected": -0.4242326319217682, + "step": 150 + }, + { + "epoch": 0.2, + "learning_rate": 4.993324625361565e-05, + "logits/chosen": -2.2440099716186523, + "logits/rejected": -2.1926894187927246, + "logps/chosen": -185.78314208984375, + "logps/rejected": -184.54876708984375, + "loss": 0.6397, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.2549251317977905, + "rewards/margins": 0.24536924064159393, + "rewards/rejected": -0.5002943873405457, + "step": 151 + }, + { + "epoch": 0.2, + "learning_rate": 4.993060401862888e-05, + "logits/chosen": -1.9377892017364502, + "logits/rejected": -1.967207908630371, + "logps/chosen": -145.74510192871094, + "logps/rejected": -142.37782287597656, + "loss": 0.7606, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.3630293905735016, + "rewards/margins": -0.08116517215967178, + "rewards/rejected": -0.281864196062088, + "step": 152 + }, + { + "epoch": 0.2, + "learning_rate": 4.9927910574043465e-05, + "logits/chosen": -2.01324462890625, + "logits/rejected": -2.029172658920288, + "logps/chosen": -123.41838836669922, + "logps/rejected": -119.00106048583984, + "loss": 0.7249, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.20013180375099182, + "rewards/margins": -0.028365857899188995, + "rewards/rejected": -0.17176595330238342, + "step": 153 + }, + { + "epoch": 0.2, + "learning_rate": 4.992516592539196e-05, + "logits/chosen": -1.8588542938232422, + "logits/rejected": -1.865120530128479, + "logps/chosen": -193.2555389404297, + "logps/rejected": -200.77032470703125, + "loss": 0.7573, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.4952069818973541, + "rewards/margins": -0.08547386527061462, + "rewards/rejected": -0.4097330868244171, + "step": 154 + }, + { + "epoch": 0.2, + "learning_rate": 4.9922370078312105e-05, + "logits/chosen": -2.0865638256073, + "logits/rejected": -2.0685646533966064, + "logps/chosen": -156.38755798339844, + "logps/rejected": -151.69525146484375, + "loss": 0.7112, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.34599804878234863, + "rewards/margins": -0.008161775767803192, + "rewards/rejected": -0.33783626556396484, + "step": 155 + }, + { + "epoch": 0.2, + "learning_rate": 4.991952303854682e-05, + "logits/chosen": -2.1627612113952637, + "logits/rejected": -2.19240665435791, + "logps/chosen": -149.98419189453125, + "logps/rejected": -151.00982666015625, + "loss": 0.646, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3006647229194641, + "rewards/margins": 0.12499865889549255, + "rewards/rejected": -0.42566338181495667, + "step": 156 + }, + { + "epoch": 0.21, + "learning_rate": 4.9916624811944175e-05, + "logits/chosen": -1.9495761394500732, + "logits/rejected": -2.0748660564422607, + "logps/chosen": -159.08567810058594, + "logps/rejected": -180.21310424804688, + "loss": 0.707, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.39853742718696594, + "rewards/margins": 0.021911904215812683, + "rewards/rejected": -0.42044931650161743, + "step": 157 + }, + { + "epoch": 0.21, + "learning_rate": 4.991367540445735e-05, + "logits/chosen": -2.0296339988708496, + "logits/rejected": -1.9863965511322021, + "logps/chosen": -164.78256225585938, + "logps/rejected": -165.06405639648438, + "loss": 0.6399, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.22124487161636353, + "rewards/margins": 0.14064420759677887, + "rewards/rejected": -0.3618890643119812, + "step": 158 + }, + { + "epoch": 0.21, + "learning_rate": 4.991067482214471e-05, + "logits/chosen": -1.9162077903747559, + "logits/rejected": -1.9429746866226196, + "logps/chosen": -177.34408569335938, + "logps/rejected": -191.58206176757812, + "loss": 0.6414, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5006837844848633, + "rewards/margins": 0.1467215120792389, + "rewards/rejected": -0.6474053263664246, + "step": 159 + }, + { + "epoch": 0.21, + "learning_rate": 4.9907623071169686e-05, + "logits/chosen": -1.9484643936157227, + "logits/rejected": -1.9940898418426514, + "logps/chosen": -135.9336700439453, + "logps/rejected": -147.29998779296875, + "loss": 0.6828, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2519877552986145, + "rewards/margins": 0.03291063383221626, + "rewards/rejected": -0.28489840030670166, + "step": 160 + }, + { + "epoch": 0.21, + "learning_rate": 4.990452015780085e-05, + "logits/chosen": -1.9855961799621582, + "logits/rejected": -2.01076340675354, + "logps/chosen": -140.24473571777344, + "logps/rejected": -147.60067749023438, + "loss": 0.6506, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.20326684415340424, + "rewards/margins": 0.11952166259288788, + "rewards/rejected": -0.3227885365486145, + "step": 161 + }, + { + "epoch": 0.21, + "learning_rate": 4.9901366088411846e-05, + "logits/chosen": -1.8572781085968018, + "logits/rejected": -1.886858582496643, + "logps/chosen": -159.31396484375, + "logps/rejected": -150.73434448242188, + "loss": 0.6815, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.3573068380355835, + "rewards/margins": 0.08972961455583572, + "rewards/rejected": -0.447036474943161, + "step": 162 + }, + { + "epoch": 0.21, + "learning_rate": 4.98981608694814e-05, + "logits/chosen": -2.1316683292388916, + "logits/rejected": -2.108705759048462, + "logps/chosen": -166.07952880859375, + "logps/rejected": -165.3894805908203, + "loss": 0.7855, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.4258570075035095, + "rewards/margins": -0.1352803260087967, + "rewards/rejected": -0.29057663679122925, + "step": 163 + }, + { + "epoch": 0.21, + "learning_rate": 4.9894904507593316e-05, + "logits/chosen": -2.1435980796813965, + "logits/rejected": -2.0952234268188477, + "logps/chosen": -179.2945098876953, + "logps/rejected": -177.13780212402344, + "loss": 0.8496, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.6829274892807007, + "rewards/margins": -0.2457571029663086, + "rewards/rejected": -0.4371703267097473, + "step": 164 + }, + { + "epoch": 0.22, + "learning_rate": 4.989159700943643e-05, + "logits/chosen": -2.087015151977539, + "logits/rejected": -2.0333518981933594, + "logps/chosen": -156.10824584960938, + "logps/rejected": -135.26223754882812, + "loss": 0.7424, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5498830676078796, + "rewards/margins": -0.06063465029001236, + "rewards/rejected": -0.4892484247684479, + "step": 165 + }, + { + "epoch": 0.22, + "learning_rate": 4.988823838180464e-05, + "logits/chosen": -2.076122999191284, + "logits/rejected": -1.985759973526001, + "logps/chosen": -157.4197540283203, + "logps/rejected": -160.52862548828125, + "loss": 0.6492, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21956738829612732, + "rewards/margins": 0.1264350414276123, + "rewards/rejected": -0.3460024297237396, + "step": 166 + }, + { + "epoch": 0.22, + "learning_rate": 4.988482863159684e-05, + "logits/chosen": -1.8897199630737305, + "logits/rejected": -1.9176274538040161, + "logps/chosen": -185.94778442382812, + "logps/rejected": -197.8125, + "loss": 0.7095, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.23862342536449432, + "rewards/margins": 0.02010211907327175, + "rewards/rejected": -0.2587255537509918, + "step": 167 + }, + { + "epoch": 0.22, + "learning_rate": 4.988136776581696e-05, + "logits/chosen": -2.0815958976745605, + "logits/rejected": -2.0887794494628906, + "logps/chosen": -173.06820678710938, + "logps/rejected": -177.52330017089844, + "loss": 0.6666, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.41561299562454224, + "rewards/margins": 0.10110354423522949, + "rewards/rejected": -0.5167165994644165, + "step": 168 + }, + { + "epoch": 0.22, + "learning_rate": 4.9877855791573915e-05, + "logits/chosen": -1.8423527479171753, + "logits/rejected": -1.9063888788223267, + "logps/chosen": -123.87655639648438, + "logps/rejected": -143.618408203125, + "loss": 0.659, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08650312572717667, + "rewards/margins": 0.08998794108629227, + "rewards/rejected": -0.17649102210998535, + "step": 169 + }, + { + "epoch": 0.22, + "learning_rate": 4.9874292716081595e-05, + "logits/chosen": -1.977996587753296, + "logits/rejected": -2.031097650527954, + "logps/chosen": -147.41297912597656, + "logps/rejected": -145.47198486328125, + "loss": 0.65, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.14451590180397034, + "rewards/margins": 0.10963588207960129, + "rewards/rejected": -0.2541517913341522, + "step": 170 + }, + { + "epoch": 0.22, + "learning_rate": 4.9870678546658865e-05, + "logits/chosen": -1.904013752937317, + "logits/rejected": -1.9502118825912476, + "logps/chosen": -149.9442596435547, + "logps/rejected": -156.0302734375, + "loss": 0.6812, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.31452393531799316, + "rewards/margins": 0.06542593240737915, + "rewards/rejected": -0.3799498677253723, + "step": 171 + }, + { + "epoch": 0.23, + "learning_rate": 4.9867013290729535e-05, + "logits/chosen": -1.8673038482666016, + "logits/rejected": -1.8840340375900269, + "logps/chosen": -136.29080200195312, + "logps/rejected": -140.873046875, + "loss": 0.7688, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3597078323364258, + "rewards/margins": -0.09261485189199448, + "rewards/rejected": -0.2670930027961731, + "step": 172 + }, + { + "epoch": 0.23, + "learning_rate": 4.986329695582237e-05, + "logits/chosen": -1.965585708618164, + "logits/rejected": -1.9712939262390137, + "logps/chosen": -170.4312286376953, + "logps/rejected": -177.6762237548828, + "loss": 0.6966, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.2586846947669983, + "rewards/margins": 0.04440504312515259, + "rewards/rejected": -0.3030897378921509, + "step": 173 + }, + { + "epoch": 0.23, + "learning_rate": 4.985952954957103e-05, + "logits/chosen": -1.9653695821762085, + "logits/rejected": -1.9432342052459717, + "logps/chosen": -145.86233520507812, + "logps/rejected": -127.70154571533203, + "loss": 0.7328, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.14063119888305664, + "rewards/margins": -0.0184502974152565, + "rewards/rejected": -0.12218090891838074, + "step": 174 + }, + { + "epoch": 0.23, + "learning_rate": 4.985571107971408e-05, + "logits/chosen": -2.048206090927124, + "logits/rejected": -2.040398359298706, + "logps/chosen": -150.42953491210938, + "logps/rejected": -161.55064392089844, + "loss": 0.6729, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.23834238946437836, + "rewards/margins": 0.059960223734378815, + "rewards/rejected": -0.29830265045166016, + "step": 175 + }, + { + "epoch": 0.23, + "learning_rate": 4.9851841554095e-05, + "logits/chosen": -2.0315334796905518, + "logits/rejected": -2.008814811706543, + "logps/chosen": -171.3214569091797, + "logps/rejected": -145.38833618164062, + "loss": 0.664, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2100733369588852, + "rewards/margins": 0.09207681566476822, + "rewards/rejected": -0.3021501302719116, + "step": 176 + }, + { + "epoch": 0.23, + "learning_rate": 4.9847920980662134e-05, + "logits/chosen": -2.062809467315674, + "logits/rejected": -2.053818941116333, + "logps/chosen": -139.9019775390625, + "logps/rejected": -137.94232177734375, + "loss": 0.6947, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2767728269100189, + "rewards/margins": 0.011704735457897186, + "rewards/rejected": -0.2884775400161743, + "step": 177 + }, + { + "epoch": 0.23, + "learning_rate": 4.984394936746865e-05, + "logits/chosen": -2.2114696502685547, + "logits/rejected": -2.3026535511016846, + "logps/chosen": -155.966552734375, + "logps/rejected": -169.798583984375, + "loss": 0.786, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.33644577860832214, + "rewards/margins": -0.15074411034584045, + "rewards/rejected": -0.1857016235589981, + "step": 178 + }, + { + "epoch": 0.23, + "learning_rate": 4.98399267226726e-05, + "logits/chosen": -2.354398727416992, + "logits/rejected": -2.3184590339660645, + "logps/chosen": -175.42050170898438, + "logps/rejected": -143.98519897460938, + "loss": 0.7219, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3392341136932373, + "rewards/margins": -0.028377681970596313, + "rewards/rejected": -0.3108564615249634, + "step": 179 + }, + { + "epoch": 0.24, + "learning_rate": 4.9835853054536846e-05, + "logits/chosen": -1.9018394947052002, + "logits/rejected": -1.85094153881073, + "logps/chosen": -199.07778930664062, + "logps/rejected": -232.95819091796875, + "loss": 0.7405, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.3375914692878723, + "rewards/margins": -0.021947531029582024, + "rewards/rejected": -0.31564390659332275, + "step": 180 + }, + { + "epoch": 0.24, + "learning_rate": 4.9831728371429046e-05, + "logits/chosen": -2.243464231491089, + "logits/rejected": -2.2446773052215576, + "logps/chosen": -133.00210571289062, + "logps/rejected": -138.46485900878906, + "loss": 0.7021, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.23482827842235565, + "rewards/margins": 0.007431086152791977, + "rewards/rejected": -0.24225935339927673, + "step": 181 + }, + { + "epoch": 0.24, + "learning_rate": 4.982755268182164e-05, + "logits/chosen": -2.26414155960083, + "logits/rejected": -2.140368700027466, + "logps/chosen": -149.4698028564453, + "logps/rejected": -123.35488891601562, + "loss": 0.8467, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.3118973970413208, + "rewards/margins": -0.22676469385623932, + "rewards/rejected": -0.0851326733827591, + "step": 182 + }, + { + "epoch": 0.24, + "learning_rate": 4.982332599429187e-05, + "logits/chosen": -2.044692039489746, + "logits/rejected": -1.9895075559616089, + "logps/chosen": -143.49749755859375, + "logps/rejected": -170.93624877929688, + "loss": 0.6813, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.05278599262237549, + "rewards/margins": 0.0477883517742157, + "rewards/rejected": -0.10057434439659119, + "step": 183 + }, + { + "epoch": 0.24, + "learning_rate": 4.981904831752171e-05, + "logits/chosen": -2.3891890048980713, + "logits/rejected": -2.4430036544799805, + "logps/chosen": -135.30807495117188, + "logps/rejected": -145.2589874267578, + "loss": 0.7079, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.05791349709033966, + "rewards/margins": 0.005677835550159216, + "rewards/rejected": -0.06359133124351501, + "step": 184 + }, + { + "epoch": 0.24, + "learning_rate": 4.981471966029787e-05, + "logits/chosen": -2.2154502868652344, + "logits/rejected": -2.205904006958008, + "logps/chosen": -185.5220947265625, + "logps/rejected": -211.70501708984375, + "loss": 0.7164, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.21913209557533264, + "rewards/margins": 0.028864163905382156, + "rewards/rejected": -0.24799621105194092, + "step": 185 + }, + { + "epoch": 0.24, + "learning_rate": 4.981034003151178e-05, + "logits/chosen": -2.2102999687194824, + "logits/rejected": -2.261056900024414, + "logps/chosen": -200.81346130371094, + "logps/rejected": -215.66226196289062, + "loss": 0.663, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.18210916221141815, + "rewards/margins": 0.13367104530334473, + "rewards/rejected": -0.3157802224159241, + "step": 186 + }, + { + "epoch": 0.24, + "learning_rate": 4.980590944015958e-05, + "logits/chosen": -2.360344648361206, + "logits/rejected": -2.3027474880218506, + "logps/chosen": -161.29354858398438, + "logps/rejected": -146.9613037109375, + "loss": 0.7363, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.11364208161830902, + "rewards/margins": -0.03879079967737198, + "rewards/rejected": -0.07485126703977585, + "step": 187 + }, + { + "epoch": 0.25, + "learning_rate": 4.98014278953421e-05, + "logits/chosen": -2.300621747970581, + "logits/rejected": -2.2820940017700195, + "logps/chosen": -166.09739685058594, + "logps/rejected": -163.3979034423828, + "loss": 0.6378, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.017802832648158073, + "rewards/margins": 0.2077583521604538, + "rewards/rejected": -0.18995548784732819, + "step": 188 + }, + { + "epoch": 0.25, + "learning_rate": 4.979689540626479e-05, + "logits/chosen": -2.056394577026367, + "logits/rejected": -2.069194793701172, + "logps/chosen": -138.0771942138672, + "logps/rejected": -125.778076171875, + "loss": 0.7441, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.13161520659923553, + "rewards/margins": -0.06547226756811142, + "rewards/rejected": -0.06614293903112411, + "step": 189 + }, + { + "epoch": 0.25, + "learning_rate": 4.9792311982237774e-05, + "logits/chosen": -1.7692331075668335, + "logits/rejected": -1.78682541847229, + "logps/chosen": -184.9560546875, + "logps/rejected": -196.3815155029297, + "loss": 0.7329, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.2540504038333893, + "rewards/margins": -0.049651261419057846, + "rewards/rejected": -0.20439916849136353, + "step": 190 + }, + { + "epoch": 0.25, + "learning_rate": 4.9787677632675825e-05, + "logits/chosen": -2.1416497230529785, + "logits/rejected": -2.1960196495056152, + "logps/chosen": -161.20358276367188, + "logps/rejected": -173.38148498535156, + "loss": 0.7561, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.16931605339050293, + "rewards/margins": -0.0933171734213829, + "rewards/rejected": -0.07599887996912003, + "step": 191 + }, + { + "epoch": 0.25, + "learning_rate": 4.978299236709826e-05, + "logits/chosen": -2.149383544921875, + "logits/rejected": -2.1642093658447266, + "logps/chosen": -184.3213348388672, + "logps/rejected": -146.78817749023438, + "loss": 0.712, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.02038545347750187, + "rewards/margins": 0.031157677993178368, + "rewards/rejected": -0.051543138921260834, + "step": 192 + }, + { + "epoch": 0.25, + "learning_rate": 4.977825619512904e-05, + "logits/chosen": -2.1460795402526855, + "logits/rejected": -2.201239824295044, + "logps/chosen": -147.57321166992188, + "logps/rejected": -139.89291381835938, + "loss": 0.7989, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.1287820041179657, + "rewards/margins": -0.1620349884033203, + "rewards/rejected": 0.033253006637096405, + "step": 193 + }, + { + "epoch": 0.25, + "learning_rate": 4.977346912649666e-05, + "logits/chosen": -2.237581729888916, + "logits/rejected": -2.2825090885162354, + "logps/chosen": -138.7524871826172, + "logps/rejected": -174.55783081054688, + "loss": 0.6347, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.032103441655635834, + "rewards/margins": 0.143732950091362, + "rewards/rejected": -0.17583641409873962, + "step": 194 + }, + { + "epoch": 0.26, + "learning_rate": 4.9768631171034175e-05, + "logits/chosen": -2.133234739303589, + "logits/rejected": -2.1712846755981445, + "logps/chosen": -168.8503875732422, + "logps/rejected": -162.818603515625, + "loss": 0.8217, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.11681299656629562, + "rewards/margins": -0.1929503083229065, + "rewards/rejected": 0.07613730430603027, + "step": 195 + }, + { + "epoch": 0.26, + "learning_rate": 4.9763742338679145e-05, + "logits/chosen": -2.1170601844787598, + "logits/rejected": -2.1072490215301514, + "logps/chosen": -161.21279907226562, + "logps/rejected": -178.30638122558594, + "loss": 0.6846, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.10452104359865189, + "rewards/margins": 0.06742212921380997, + "rewards/rejected": -0.17194317281246185, + "step": 196 + }, + { + "epoch": 0.26, + "learning_rate": 4.975880263947367e-05, + "logits/chosen": -2.262809991836548, + "logits/rejected": -2.249521017074585, + "logps/chosen": -142.46505737304688, + "logps/rejected": -142.214599609375, + "loss": 0.7602, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.10644787549972534, + "rewards/margins": -0.08289748430252075, + "rewards/rejected": -0.02355036698281765, + "step": 197 + }, + { + "epoch": 0.26, + "learning_rate": 4.9753812083564304e-05, + "logits/chosen": -2.270545482635498, + "logits/rejected": -2.2995519638061523, + "logps/chosen": -180.8372039794922, + "logps/rejected": -184.7505340576172, + "loss": 0.693, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.05116385966539383, + "rewards/margins": 0.05982666835188866, + "rewards/rejected": -0.008662798441946507, + "step": 198 + }, + { + "epoch": 0.26, + "learning_rate": 4.974877068120208e-05, + "logits/chosen": -1.9270200729370117, + "logits/rejected": -1.962209701538086, + "logps/chosen": -186.33587646484375, + "logps/rejected": -227.984619140625, + "loss": 0.7574, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.28816187381744385, + "rewards/margins": -0.04220404103398323, + "rewards/rejected": -0.24595780670642853, + "step": 199 + }, + { + "epoch": 0.26, + "learning_rate": 4.974367844274248e-05, + "logits/chosen": -2.477126121520996, + "logits/rejected": -2.4897990226745605, + "logps/chosen": -120.29115295410156, + "logps/rejected": -115.61780548095703, + "loss": 0.6899, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.002930285409092903, + "rewards/margins": 0.039310526102781296, + "rewards/rejected": -0.04224081337451935, + "step": 200 + }, + { + "epoch": 0.26, + "learning_rate": 4.973853537864538e-05, + "logits/chosen": -2.2658283710479736, + "logits/rejected": -2.26222562789917, + "logps/chosen": -129.32443237304688, + "logps/rejected": -130.61666870117188, + "loss": 0.6441, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06606045365333557, + "rewards/margins": 0.13228707015514374, + "rewards/rejected": -0.06622661650180817, + "step": 201 + }, + { + "epoch": 0.26, + "learning_rate": 4.973334149947508e-05, + "logits/chosen": -2.133833408355713, + "logits/rejected": -2.12565279006958, + "logps/chosen": -169.78045654296875, + "logps/rejected": -154.434814453125, + "loss": 0.695, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02759992890059948, + "rewards/margins": 0.031831562519073486, + "rewards/rejected": -0.0042316243052482605, + "step": 202 + }, + { + "epoch": 0.27, + "learning_rate": 4.972809681590026e-05, + "logits/chosen": -2.2754271030426025, + "logits/rejected": -2.2876760959625244, + "logps/chosen": -195.24818420410156, + "logps/rejected": -206.7046661376953, + "loss": 0.7125, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.03773926943540573, + "rewards/margins": -0.024865150451660156, + "rewards/rejected": 0.06260443478822708, + "step": 203 + }, + { + "epoch": 0.27, + "learning_rate": 4.972280133869396e-05, + "logits/chosen": -2.323434829711914, + "logits/rejected": -2.3305745124816895, + "logps/chosen": -160.62684631347656, + "logps/rejected": -162.4305419921875, + "loss": 0.7052, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.10412123799324036, + "rewards/margins": -0.009756050072610378, + "rewards/rejected": -0.0943651795387268, + "step": 204 + }, + { + "epoch": 0.27, + "learning_rate": 4.971745507873352e-05, + "logits/chosen": -2.3126134872436523, + "logits/rejected": -2.3449103832244873, + "logps/chosen": -164.61329650878906, + "logps/rejected": -156.84730529785156, + "loss": 0.6569, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07602532207965851, + "rewards/margins": 0.11013013869524002, + "rewards/rejected": -0.03410482034087181, + "step": 205 + }, + { + "epoch": 0.27, + "learning_rate": 4.971205804700063e-05, + "logits/chosen": -1.984785556793213, + "logits/rejected": -2.0369019508361816, + "logps/chosen": -160.10220336914062, + "logps/rejected": -179.91224670410156, + "loss": 0.8494, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.1521720588207245, + "rewards/margins": -0.21498528122901917, + "rewards/rejected": 0.06281323730945587, + "step": 206 + }, + { + "epoch": 0.27, + "learning_rate": 4.970661025458125e-05, + "logits/chosen": -2.309006452560425, + "logits/rejected": -2.3352603912353516, + "logps/chosen": -161.5703887939453, + "logps/rejected": -163.7167205810547, + "loss": 0.6492, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.05860462412238121, + "rewards/margins": 0.11608318984508514, + "rewards/rejected": -0.05747856944799423, + "step": 207 + }, + { + "epoch": 0.27, + "learning_rate": 4.9701111712665625e-05, + "logits/chosen": -2.227006673812866, + "logits/rejected": -2.1848740577697754, + "logps/chosen": -177.54269409179688, + "logps/rejected": -171.7179718017578, + "loss": 0.7767, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.17108993232250214, + "rewards/margins": -0.11503319442272186, + "rewards/rejected": -0.05605673789978027, + "step": 208 + }, + { + "epoch": 0.27, + "learning_rate": 4.969556243254822e-05, + "logits/chosen": -2.226109743118286, + "logits/rejected": -2.2309632301330566, + "logps/chosen": -126.68124389648438, + "logps/rejected": -135.05209350585938, + "loss": 0.6762, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.12173338234424591, + "rewards/margins": 0.05880265310406685, + "rewards/rejected": 0.06293072551488876, + "step": 209 + }, + { + "epoch": 0.27, + "learning_rate": 4.968996242562774e-05, + "logits/chosen": -2.1414482593536377, + "logits/rejected": -2.112384796142578, + "logps/chosen": -162.0722198486328, + "logps/rejected": -151.81146240234375, + "loss": 0.8236, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.04249238967895508, + "rewards/margins": -0.19530200958251953, + "rewards/rejected": 0.2377944141626358, + "step": 210 + }, + { + "epoch": 0.28, + "learning_rate": 4.968431170340706e-05, + "logits/chosen": -2.2804222106933594, + "logits/rejected": -2.297311782836914, + "logps/chosen": -130.9635772705078, + "logps/rejected": -132.71075439453125, + "loss": 0.7031, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05266380310058594, + "rewards/margins": 0.007896373979747295, + "rewards/rejected": -0.06056017801165581, + "step": 211 + }, + { + "epoch": 0.28, + "learning_rate": 4.9678610277493275e-05, + "logits/chosen": -2.324260711669922, + "logits/rejected": -2.287720203399658, + "logps/chosen": -144.84410095214844, + "logps/rejected": -141.16293334960938, + "loss": 0.693, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.13754111528396606, + "rewards/margins": 0.018452219665050507, + "rewards/rejected": 0.11908888816833496, + "step": 212 + }, + { + "epoch": 0.28, + "learning_rate": 4.967285815959759e-05, + "logits/chosen": -2.1252338886260986, + "logits/rejected": -2.181086778640747, + "logps/chosen": -165.26145935058594, + "logps/rejected": -185.48065185546875, + "loss": 0.6365, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.05688684061169624, + "rewards/margins": 0.2325521856546402, + "rewards/rejected": -0.17566533386707306, + "step": 213 + }, + { + "epoch": 0.28, + "learning_rate": 4.9667055361535354e-05, + "logits/chosen": -2.0358633995056152, + "logits/rejected": -2.037306308746338, + "logps/chosen": -158.7928466796875, + "logps/rejected": -174.2408905029297, + "loss": 0.7384, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20100197196006775, + "rewards/margins": -0.03293357789516449, + "rewards/rejected": -0.16806840896606445, + "step": 214 + }, + { + "epoch": 0.28, + "learning_rate": 4.9661201895226e-05, + "logits/chosen": -2.1538403034210205, + "logits/rejected": -2.2139194011688232, + "logps/chosen": -131.5411376953125, + "logps/rejected": -151.45513916015625, + "loss": 0.6839, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.008547332137823105, + "rewards/margins": 0.0680898129940033, + "rewards/rejected": -0.059542469680309296, + "step": 215 + }, + { + "epoch": 0.28, + "learning_rate": 4.965529777269306e-05, + "logits/chosen": -2.2412662506103516, + "logits/rejected": -2.2856147289276123, + "logps/chosen": -117.49397277832031, + "logps/rejected": -120.62272644042969, + "loss": 0.6965, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.228101447224617, + "rewards/margins": 0.008968396112322807, + "rewards/rejected": -0.23706983029842377, + "step": 216 + }, + { + "epoch": 0.28, + "learning_rate": 4.964934300606411e-05, + "logits/chosen": -2.066718816757202, + "logits/rejected": -2.138875722885132, + "logps/chosen": -153.00021362304688, + "logps/rejected": -164.0558624267578, + "loss": 0.6836, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.10881123691797256, + "rewards/margins": 0.06324261426925659, + "rewards/rejected": -0.17205384373664856, + "step": 217 + }, + { + "epoch": 0.29, + "learning_rate": 4.964333760757074e-05, + "logits/chosen": -2.3143880367279053, + "logits/rejected": -2.184577703475952, + "logps/chosen": -161.26583862304688, + "logps/rejected": -178.37884521484375, + "loss": 0.6808, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.09615574032068253, + "rewards/margins": 0.0893949344754219, + "rewards/rejected": -0.18555067479610443, + "step": 218 + }, + { + "epoch": 0.29, + "learning_rate": 4.963728158954856e-05, + "logits/chosen": -2.3663442134857178, + "logits/rejected": -2.404465675354004, + "logps/chosen": -140.4896240234375, + "logps/rejected": -160.0462188720703, + "loss": 0.6943, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.03150591999292374, + "rewards/margins": 0.02258572168648243, + "rewards/rejected": -0.05409163981676102, + "step": 219 + }, + { + "epoch": 0.29, + "learning_rate": 4.963117496443715e-05, + "logits/chosen": -2.264538526535034, + "logits/rejected": -2.2544782161712646, + "logps/chosen": -180.01846313476562, + "logps/rejected": -177.05178833007812, + "loss": 0.7132, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.13371305167675018, + "rewards/margins": -0.008653441444039345, + "rewards/rejected": -0.1250596046447754, + "step": 220 + }, + { + "epoch": 0.29, + "learning_rate": 4.9625017744780045e-05, + "logits/chosen": -2.1741209030151367, + "logits/rejected": -2.1468567848205566, + "logps/chosen": -164.97640991210938, + "logps/rejected": -169.90350341796875, + "loss": 0.6725, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.10341596603393555, + "rewards/margins": 0.06766539812088013, + "rewards/rejected": -0.17108136415481567, + "step": 221 + }, + { + "epoch": 0.29, + "learning_rate": 4.96188099432247e-05, + "logits/chosen": -2.02036714553833, + "logits/rejected": -2.0622000694274902, + "logps/chosen": -196.87257385253906, + "logps/rejected": -192.37307739257812, + "loss": 0.692, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.21568839251995087, + "rewards/margins": 0.0388670451939106, + "rewards/rejected": -0.25455543398857117, + "step": 222 + }, + { + "epoch": 0.29, + "learning_rate": 4.9612551572522464e-05, + "logits/chosen": -2.2688815593719482, + "logits/rejected": -2.305769920349121, + "logps/chosen": -143.64569091796875, + "logps/rejected": -154.7953338623047, + "loss": 0.6345, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16745011508464813, + "rewards/margins": 0.1968405842781067, + "rewards/rejected": -0.36429068446159363, + "step": 223 + }, + { + "epoch": 0.29, + "learning_rate": 4.960624264552858e-05, + "logits/chosen": -2.271151065826416, + "logits/rejected": -2.328986167907715, + "logps/chosen": -133.5254364013672, + "logps/rejected": -172.62962341308594, + "loss": 0.7838, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.2519568204879761, + "rewards/margins": -0.10307664424180984, + "rewards/rejected": -0.14888018369674683, + "step": 224 + }, + { + "epoch": 0.29, + "learning_rate": 4.9599883175202124e-05, + "logits/chosen": -2.3042290210723877, + "logits/rejected": -2.285111427307129, + "logps/chosen": -114.66839599609375, + "logps/rejected": -120.16878509521484, + "loss": 0.6911, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0025178715586662292, + "rewards/margins": 0.055260900408029556, + "rewards/rejected": -0.052743006497621536, + "step": 225 + }, + { + "epoch": 0.3, + "learning_rate": 4.9593473174605974e-05, + "logits/chosen": -2.4031026363372803, + "logits/rejected": -2.3971669673919678, + "logps/chosen": -180.76190185546875, + "logps/rejected": -189.8751220703125, + "loss": 0.7679, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2216603308916092, + "rewards/margins": -0.037253011018037796, + "rewards/rejected": -0.1844073235988617, + "step": 226 + }, + { + "epoch": 0.3, + "learning_rate": 4.958701265690685e-05, + "logits/chosen": -2.2369589805603027, + "logits/rejected": -2.213113784790039, + "logps/chosen": -163.1219482421875, + "logps/rejected": -163.7172393798828, + "loss": 0.7187, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.29494044184684753, + "rewards/margins": -0.002360118553042412, + "rewards/rejected": -0.29258033633232117, + "step": 227 + }, + { + "epoch": 0.3, + "learning_rate": 4.958050163537519e-05, + "logits/chosen": -2.4455294609069824, + "logits/rejected": -2.393801212310791, + "logps/chosen": -143.6052703857422, + "logps/rejected": -128.57278442382812, + "loss": 0.7247, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2546530067920685, + "rewards/margins": -0.035662561655044556, + "rewards/rejected": -0.2189904749393463, + "step": 228 + }, + { + "epoch": 0.3, + "learning_rate": 4.957394012338519e-05, + "logits/chosen": -2.242324113845825, + "logits/rejected": -2.276301383972168, + "logps/chosen": -154.13751220703125, + "logps/rejected": -155.56005859375, + "loss": 0.75, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.3772943615913391, + "rewards/margins": -0.08221397548913956, + "rewards/rejected": -0.29508039355278015, + "step": 229 + }, + { + "epoch": 0.3, + "learning_rate": 4.956732813441477e-05, + "logits/chosen": -2.4516844749450684, + "logits/rejected": -2.397124767303467, + "logps/chosen": -142.58702087402344, + "logps/rejected": -139.663330078125, + "loss": 0.8001, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2911987900733948, + "rewards/margins": -0.15551666915416718, + "rewards/rejected": -0.1356821358203888, + "step": 230 + }, + { + "epoch": 0.3, + "learning_rate": 4.956066568204552e-05, + "logits/chosen": -2.039741039276123, + "logits/rejected": -2.023181200027466, + "logps/chosen": -154.21315002441406, + "logps/rejected": -135.28794860839844, + "loss": 0.6237, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0628127008676529, + "rewards/margins": 0.22161367535591125, + "rewards/rejected": -0.28442639112472534, + "step": 231 + }, + { + "epoch": 0.3, + "learning_rate": 4.955395277996268e-05, + "logits/chosen": -2.2360501289367676, + "logits/rejected": -2.2908437252044678, + "logps/chosen": -184.1702423095703, + "logps/rejected": -185.9180145263672, + "loss": 0.6513, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20668905973434448, + "rewards/margins": 0.1489761769771576, + "rewards/rejected": -0.3556652069091797, + "step": 232 + }, + { + "epoch": 0.3, + "learning_rate": 4.954718944195512e-05, + "logits/chosen": -2.2392733097076416, + "logits/rejected": -2.256279706954956, + "logps/chosen": -140.51260375976562, + "logps/rejected": -144.2230682373047, + "loss": 0.6254, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18107274174690247, + "rewards/margins": 0.16339734196662903, + "rewards/rejected": -0.3444700539112091, + "step": 233 + }, + { + "epoch": 0.31, + "learning_rate": 4.954037568191534e-05, + "logits/chosen": -2.2857413291931152, + "logits/rejected": -2.2823734283447266, + "logps/chosen": -139.91226196289062, + "logps/rejected": -140.5911407470703, + "loss": 0.6551, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.10547606647014618, + "rewards/margins": 0.11846562474966049, + "rewards/rejected": -0.22394171357154846, + "step": 234 + }, + { + "epoch": 0.31, + "learning_rate": 4.9533511513839384e-05, + "logits/chosen": -2.052267551422119, + "logits/rejected": -2.02097487449646, + "logps/chosen": -135.19448852539062, + "logps/rejected": -150.51913452148438, + "loss": 0.678, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1995100975036621, + "rewards/margins": 0.07375679910182953, + "rewards/rejected": -0.27326688170433044, + "step": 235 + }, + { + "epoch": 0.31, + "learning_rate": 4.9526596951826824e-05, + "logits/chosen": -2.1060378551483154, + "logits/rejected": -2.1363437175750732, + "logps/chosen": -189.6680145263672, + "logps/rejected": -194.1062774658203, + "loss": 0.7673, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.20259276032447815, + "rewards/margins": -0.09232301265001297, + "rewards/rejected": -0.11026974767446518, + "step": 236 + }, + { + "epoch": 0.31, + "learning_rate": 4.951963201008076e-05, + "logits/chosen": -2.146965265274048, + "logits/rejected": -2.1230671405792236, + "logps/chosen": -161.82183837890625, + "logps/rejected": -152.2267303466797, + "loss": 0.785, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.17847128212451935, + "rewards/margins": -0.1518484354019165, + "rewards/rejected": -0.026622820645570755, + "step": 237 + }, + { + "epoch": 0.31, + "learning_rate": 4.951261670290781e-05, + "logits/chosen": -2.0150346755981445, + "logits/rejected": -2.04280424118042, + "logps/chosen": -204.8814239501953, + "logps/rejected": -211.47410583496094, + "loss": 0.8185, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.35267120599746704, + "rewards/margins": -0.18207934498786926, + "rewards/rejected": -0.17059186100959778, + "step": 238 + }, + { + "epoch": 0.31, + "learning_rate": 4.950555104471799e-05, + "logits/chosen": -2.0696749687194824, + "logits/rejected": -2.0690653324127197, + "logps/chosen": -180.72787475585938, + "logps/rejected": -167.14132690429688, + "loss": 0.701, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.1277678906917572, + "rewards/margins": 0.01966879889369011, + "rewards/rejected": -0.14743672311306, + "step": 239 + }, + { + "epoch": 0.31, + "learning_rate": 4.949843505002477e-05, + "logits/chosen": -2.276556968688965, + "logits/rejected": -2.307004928588867, + "logps/chosen": -139.89166259765625, + "logps/rejected": -149.29598999023438, + "loss": 0.5977, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.11064109206199646, + "rewards/margins": 0.22020389139652252, + "rewards/rejected": -0.10956278443336487, + "step": 240 + }, + { + "epoch": 0.32, + "learning_rate": 4.9491268733445034e-05, + "logits/chosen": -2.3263444900512695, + "logits/rejected": -2.216820240020752, + "logps/chosen": -176.23760986328125, + "logps/rejected": -181.00119018554688, + "loss": 0.5842, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14714036881923676, + "rewards/margins": 0.2764008939266205, + "rewards/rejected": -0.4235413074493408, + "step": 241 + }, + { + "epoch": 0.32, + "learning_rate": 4.9484052109698984e-05, + "logits/chosen": -2.2672414779663086, + "logits/rejected": -2.29109525680542, + "logps/chosen": -146.66981506347656, + "logps/rejected": -148.75990295410156, + "loss": 0.6706, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.010637687519192696, + "rewards/margins": 0.07865148037672043, + "rewards/rejected": -0.08928915858268738, + "step": 242 + }, + { + "epoch": 0.32, + "learning_rate": 4.947678519361021e-05, + "logits/chosen": -2.173220634460449, + "logits/rejected": -2.1617910861968994, + "logps/chosen": -157.75787353515625, + "logps/rejected": -161.2376251220703, + "loss": 0.7178, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.1290948987007141, + "rewards/margins": 0.04089619964361191, + "rewards/rejected": -0.16999109089374542, + "step": 243 + }, + { + "epoch": 0.32, + "learning_rate": 4.946946800010556e-05, + "logits/chosen": -2.089167833328247, + "logits/rejected": -2.098552942276001, + "logps/chosen": -167.84510803222656, + "logps/rejected": -177.17660522460938, + "loss": 0.7749, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.23455895483493805, + "rewards/margins": -0.11604375392198563, + "rewards/rejected": -0.11851520091295242, + "step": 244 + }, + { + "epoch": 0.32, + "learning_rate": 4.946210054421518e-05, + "logits/chosen": -2.3614742755889893, + "logits/rejected": -2.367582321166992, + "logps/chosen": -138.63021850585938, + "logps/rejected": -138.73843383789062, + "loss": 0.7186, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.0001854933798313141, + "rewards/margins": -0.002179570496082306, + "rewards/rejected": 0.001994088292121887, + "step": 245 + }, + { + "epoch": 0.32, + "learning_rate": 4.945468284107246e-05, + "logits/chosen": -2.033447265625, + "logits/rejected": -2.047463893890381, + "logps/chosen": -163.36306762695312, + "logps/rejected": -178.01649475097656, + "loss": 0.6706, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.026612814515829086, + "rewards/margins": 0.12761405110359192, + "rewards/rejected": -0.10100121051073074, + "step": 246 + }, + { + "epoch": 0.32, + "learning_rate": 4.944721490591401e-05, + "logits/chosen": -2.2349605560302734, + "logits/rejected": -2.2344400882720947, + "logps/chosen": -153.67425537109375, + "logps/rejected": -140.24844360351562, + "loss": 0.7657, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.08484932780265808, + "rewards/margins": -0.11917441338300705, + "rewards/rejected": 0.034325070679187775, + "step": 247 + }, + { + "epoch": 0.32, + "learning_rate": 4.9439696754079595e-05, + "logits/chosen": -2.026840925216675, + "logits/rejected": -2.0905795097351074, + "logps/chosen": -153.32894897460938, + "logps/rejected": -157.39907836914062, + "loss": 0.609, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.1143321692943573, + "rewards/margins": 0.21959424018859863, + "rewards/rejected": -0.10526210069656372, + "step": 248 + }, + { + "epoch": 0.33, + "learning_rate": 4.9432128401012144e-05, + "logits/chosen": -2.2150955200195312, + "logits/rejected": -2.1640408039093018, + "logps/chosen": -176.57774353027344, + "logps/rejected": -177.27870178222656, + "loss": 0.8383, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.25025683641433716, + "rewards/margins": -0.19921274483203888, + "rewards/rejected": -0.05104408413171768, + "step": 249 + }, + { + "epoch": 0.33, + "learning_rate": 4.9424509862257706e-05, + "logits/chosen": -1.9792041778564453, + "logits/rejected": -1.9765446186065674, + "logps/chosen": -134.66946411132812, + "logps/rejected": -144.38739013671875, + "loss": 0.6723, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0672139897942543, + "rewards/margins": 0.08019405603408813, + "rewards/rejected": -0.14740802347660065, + "step": 250 + }, + { + "epoch": 0.33, + "learning_rate": 4.941684115346541e-05, + "logits/chosen": -2.319556474685669, + "logits/rejected": -2.351644277572632, + "logps/chosen": -173.3380126953125, + "logps/rejected": -170.59963989257812, + "loss": 0.7212, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.019015267491340637, + "rewards/margins": -0.03376225382089615, + "rewards/rejected": 0.05277752876281738, + "step": 251 + }, + { + "epoch": 0.33, + "learning_rate": 4.940912229038745e-05, + "logits/chosen": -2.0435500144958496, + "logits/rejected": -2.022526502609253, + "logps/chosen": -135.12265014648438, + "logps/rejected": -125.80049133300781, + "loss": 0.7075, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.021306686103343964, + "rewards/margins": 0.02372751012444496, + "rewards/rejected": -0.045034196227788925, + "step": 252 + }, + { + "epoch": 0.33, + "learning_rate": 4.9401353288879024e-05, + "logits/chosen": -1.973745584487915, + "logits/rejected": -1.964308738708496, + "logps/chosen": -148.7356719970703, + "logps/rejected": -140.4853515625, + "loss": 0.9136, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.35221704840660095, + "rewards/margins": -0.27745121717453003, + "rewards/rejected": -0.07476583123207092, + "step": 253 + }, + { + "epoch": 0.33, + "learning_rate": 4.9393534164898335e-05, + "logits/chosen": -2.2897889614105225, + "logits/rejected": -2.2667198181152344, + "logps/chosen": -169.73867797851562, + "logps/rejected": -154.4757080078125, + "loss": 0.7961, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.01284090057015419, + "rewards/margins": -0.14840884506702423, + "rewards/rejected": 0.13556794822216034, + "step": 254 + }, + { + "epoch": 0.33, + "learning_rate": 4.9385664934506526e-05, + "logits/chosen": -2.320432186126709, + "logits/rejected": -2.2988882064819336, + "logps/chosen": -144.0506591796875, + "logps/rejected": -145.1551055908203, + "loss": 0.8377, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.11610221862792969, + "rewards/margins": -0.22257395088672638, + "rewards/rejected": 0.33867618441581726, + "step": 255 + }, + { + "epoch": 0.34, + "learning_rate": 4.937774561386768e-05, + "logits/chosen": -2.1970014572143555, + "logits/rejected": -2.222613573074341, + "logps/chosen": -138.17433166503906, + "logps/rejected": -143.5152130126953, + "loss": 0.7075, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0024851299822330475, + "rewards/margins": 0.022421889007091522, + "rewards/rejected": -0.01993674784898758, + "step": 256 + }, + { + "epoch": 0.34, + "learning_rate": 4.936977621924875e-05, + "logits/chosen": -2.1020843982696533, + "logits/rejected": -2.0904481410980225, + "logps/chosen": -148.5655517578125, + "logps/rejected": -153.73941040039062, + "loss": 0.6322, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.003152729943394661, + "rewards/margins": 0.18040181696414948, + "rewards/rejected": -0.17724908888339996, + "step": 257 + }, + { + "epoch": 0.34, + "learning_rate": 4.9361756767019564e-05, + "logits/chosen": -1.8264960050582886, + "logits/rejected": -1.792884349822998, + "logps/chosen": -168.45526123046875, + "logps/rejected": -160.93218994140625, + "loss": 0.7427, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.034992799162864685, + "rewards/margins": -0.04541083052754402, + "rewards/rejected": 0.010418036952614784, + "step": 258 + }, + { + "epoch": 0.34, + "learning_rate": 4.935368727365276e-05, + "logits/chosen": -2.033273458480835, + "logits/rejected": -2.0581772327423096, + "logps/chosen": -151.52731323242188, + "logps/rejected": -166.76388549804688, + "loss": 0.7695, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.07278600335121155, + "rewards/margins": -0.08771771937608719, + "rewards/rejected": 0.014931721612811089, + "step": 259 + }, + { + "epoch": 0.34, + "learning_rate": 4.934556775572377e-05, + "logits/chosen": -2.1122307777404785, + "logits/rejected": -2.1686928272247314, + "logps/chosen": -158.33444213867188, + "logps/rejected": -155.8374481201172, + "loss": 0.7004, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11077702045440674, + "rewards/margins": 0.048048801720142365, + "rewards/rejected": -0.1588258296251297, + "step": 260 + }, + { + "epoch": 0.34, + "learning_rate": 4.9337398229910784e-05, + "logits/chosen": -2.0639896392822266, + "logits/rejected": -2.0493862628936768, + "logps/chosen": -140.65463256835938, + "logps/rejected": -143.4046173095703, + "loss": 0.6137, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.332168847322464, + "rewards/margins": 0.21151331067085266, + "rewards/rejected": 0.12065552175045013, + "step": 261 + }, + { + "epoch": 0.34, + "learning_rate": 4.932917871299471e-05, + "logits/chosen": -2.1873834133148193, + "logits/rejected": -2.1482584476470947, + "logps/chosen": -156.8861846923828, + "logps/rejected": -152.4180450439453, + "loss": 0.7677, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.044190216809511185, + "rewards/margins": -0.10401745140552521, + "rewards/rejected": 0.1482076644897461, + "step": 262 + }, + { + "epoch": 0.34, + "learning_rate": 4.9320909221859134e-05, + "logits/chosen": -2.1388399600982666, + "logits/rejected": -2.1468729972839355, + "logps/chosen": -150.1500701904297, + "logps/rejected": -152.03726196289062, + "loss": 0.7504, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.08211887627840042, + "rewards/margins": -0.07361260801553726, + "rewards/rejected": 0.15573148429393768, + "step": 263 + }, + { + "epoch": 0.35, + "learning_rate": 4.9312589773490304e-05, + "logits/chosen": -1.996654987335205, + "logits/rejected": -1.9872137308120728, + "logps/chosen": -150.1443328857422, + "logps/rejected": -155.907958984375, + "loss": 0.7484, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.17925025522708893, + "rewards/margins": -0.05667828768491745, + "rewards/rejected": 0.23592855036258698, + "step": 264 + }, + { + "epoch": 0.35, + "learning_rate": 4.930422038497708e-05, + "logits/chosen": -2.2004756927490234, + "logits/rejected": -2.2537848949432373, + "logps/chosen": -150.181640625, + "logps/rejected": -177.4576416015625, + "loss": 0.6491, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.28161391615867615, + "rewards/margins": 0.11887946724891663, + "rewards/rejected": 0.16273444890975952, + "step": 265 + }, + { + "epoch": 0.35, + "learning_rate": 4.92958010735109e-05, + "logits/chosen": -2.1067912578582764, + "logits/rejected": -2.1667635440826416, + "logps/chosen": -122.0634765625, + "logps/rejected": -125.26715850830078, + "loss": 0.7781, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.10816986113786697, + "rewards/margins": -0.11183619499206543, + "rewards/rejected": 0.2200060486793518, + "step": 266 + }, + { + "epoch": 0.35, + "learning_rate": 4.928733185638575e-05, + "logits/chosen": -2.1284282207489014, + "logits/rejected": -2.1388614177703857, + "logps/chosen": -145.78765869140625, + "logps/rejected": -164.8605194091797, + "loss": 0.694, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06616201996803284, + "rewards/margins": 0.05039716139435768, + "rewards/rejected": 0.01576484926044941, + "step": 267 + }, + { + "epoch": 0.35, + "learning_rate": 4.927881275099815e-05, + "logits/chosen": -2.2518081665039062, + "logits/rejected": -2.2290408611297607, + "logps/chosen": -153.83200073242188, + "logps/rejected": -160.59776306152344, + "loss": 0.7557, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.1310650259256363, + "rewards/margins": -0.07667630910873413, + "rewards/rejected": 0.20774134993553162, + "step": 268 + }, + { + "epoch": 0.35, + "learning_rate": 4.927024377484705e-05, + "logits/chosen": -2.0690414905548096, + "logits/rejected": -2.096713066101074, + "logps/chosen": -168.7005157470703, + "logps/rejected": -168.45872497558594, + "loss": 0.6711, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.16334614157676697, + "rewards/margins": 0.09711476415395737, + "rewards/rejected": 0.06623139977455139, + "step": 269 + }, + { + "epoch": 0.35, + "learning_rate": 4.9261624945533855e-05, + "logits/chosen": -1.6682740449905396, + "logits/rejected": -1.611401915550232, + "logps/chosen": -170.30328369140625, + "logps/rejected": -174.10769653320312, + "loss": 0.8147, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.10478762537240982, + "rewards/margins": -0.1818379908800125, + "rewards/rejected": 0.07705036550760269, + "step": 270 + }, + { + "epoch": 0.35, + "learning_rate": 4.925295628076241e-05, + "logits/chosen": -1.9349985122680664, + "logits/rejected": -1.9530787467956543, + "logps/chosen": -130.31883239746094, + "logps/rejected": -152.9124755859375, + "loss": 0.8018, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0537966787815094, + "rewards/margins": -0.1305672526359558, + "rewards/rejected": 0.1843639612197876, + "step": 271 + }, + { + "epoch": 0.36, + "learning_rate": 4.9244237798338866e-05, + "logits/chosen": -2.0011062622070312, + "logits/rejected": -1.9965519905090332, + "logps/chosen": -164.77838134765625, + "logps/rejected": -192.95458984375, + "loss": 0.7462, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0018954463303089142, + "rewards/margins": -0.058521248400211334, + "rewards/rejected": 0.056625787168741226, + "step": 272 + }, + { + "epoch": 0.36, + "learning_rate": 4.923546951617175e-05, + "logits/chosen": -2.117265224456787, + "logits/rejected": -2.1309285163879395, + "logps/chosen": -157.73707580566406, + "logps/rejected": -156.6586456298828, + "loss": 0.7046, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.09399585425853729, + "rewards/margins": 0.057814061641693115, + "rewards/rejected": 0.03618178144097328, + "step": 273 + }, + { + "epoch": 0.36, + "learning_rate": 4.922665145227187e-05, + "logits/chosen": -2.2057220935821533, + "logits/rejected": -2.1393046379089355, + "logps/chosen": -182.5475311279297, + "logps/rejected": -164.41293334960938, + "loss": 0.8501, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.0016194283962249756, + "rewards/margins": -0.25813543796539307, + "rewards/rejected": 0.2565160095691681, + "step": 274 + }, + { + "epoch": 0.36, + "learning_rate": 4.9217783624752266e-05, + "logits/chosen": -2.2684426307678223, + "logits/rejected": -2.243990659713745, + "logps/chosen": -128.5909881591797, + "logps/rejected": -125.42144775390625, + "loss": 0.6339, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.07768978923559189, + "rewards/margins": 0.20721395313739777, + "rewards/rejected": -0.12952415645122528, + "step": 275 + }, + { + "epoch": 0.36, + "learning_rate": 4.920886605182823e-05, + "logits/chosen": -2.2352826595306396, + "logits/rejected": -2.317643404006958, + "logps/chosen": -147.9606170654297, + "logps/rejected": -151.64022827148438, + "loss": 0.7825, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.1962936520576477, + "rewards/margins": -0.0966653823852539, + "rewards/rejected": 0.2929590344429016, + "step": 276 + }, + { + "epoch": 0.36, + "learning_rate": 4.919989875181722e-05, + "logits/chosen": -2.0506582260131836, + "logits/rejected": -2.056596517562866, + "logps/chosen": -133.77655029296875, + "logps/rejected": -142.32601928710938, + "loss": 0.698, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.06665589660406113, + "rewards/margins": 0.017916321754455566, + "rewards/rejected": 0.04873957484960556, + "step": 277 + }, + { + "epoch": 0.36, + "learning_rate": 4.919088174313884e-05, + "logits/chosen": -2.1315078735351562, + "logits/rejected": -2.1690430641174316, + "logps/chosen": -166.51275634765625, + "logps/rejected": -171.87623596191406, + "loss": 0.6526, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.17828050255775452, + "rewards/margins": 0.1265142858028412, + "rewards/rejected": 0.05176621302962303, + "step": 278 + }, + { + "epoch": 0.37, + "learning_rate": 4.91818150443148e-05, + "logits/chosen": -2.316528797149658, + "logits/rejected": -2.3489573001861572, + "logps/chosen": -158.4711456298828, + "logps/rejected": -156.23777770996094, + "loss": 0.6217, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.19824568927288055, + "rewards/margins": 0.19765932857990265, + "rewards/rejected": 0.0005863434635102749, + "step": 279 + }, + { + "epoch": 0.37, + "learning_rate": 4.917269867396886e-05, + "logits/chosen": -2.1244661808013916, + "logits/rejected": -2.0779430866241455, + "logps/chosen": -164.66851806640625, + "logps/rejected": -145.93212890625, + "loss": 0.6832, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0432266928255558, + "rewards/margins": 0.07876121997833252, + "rewards/rejected": -0.03553451970219612, + "step": 280 + }, + { + "epoch": 0.37, + "learning_rate": 4.916353265082686e-05, + "logits/chosen": -2.397843837738037, + "logits/rejected": -2.4166927337646484, + "logps/chosen": -214.45030212402344, + "logps/rejected": -215.8428497314453, + "loss": 0.7811, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.009777067229151726, + "rewards/margins": -0.09677901118993759, + "rewards/rejected": 0.10655608028173447, + "step": 281 + }, + { + "epoch": 0.37, + "learning_rate": 4.9154316993716565e-05, + "logits/chosen": -2.245692491531372, + "logits/rejected": -2.245835781097412, + "logps/chosen": -156.67173767089844, + "logps/rejected": -149.71917724609375, + "loss": 0.7229, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.05182276666164398, + "rewards/margins": -0.009018277749419212, + "rewards/rejected": 0.06084103882312775, + "step": 282 + }, + { + "epoch": 0.37, + "learning_rate": 4.9145051721567734e-05, + "logits/chosen": -2.2843127250671387, + "logits/rejected": -2.2395739555358887, + "logps/chosen": -167.3928985595703, + "logps/rejected": -162.44593811035156, + "loss": 0.7401, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.057207878679037094, + "rewards/margins": -0.031176520511507988, + "rewards/rejected": 0.08838438242673874, + "step": 283 + }, + { + "epoch": 0.37, + "learning_rate": 4.913573685341205e-05, + "logits/chosen": -2.0592617988586426, + "logits/rejected": -2.108931064605713, + "logps/chosen": -189.07752990722656, + "logps/rejected": -181.4488983154297, + "loss": 0.8664, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.3183850646018982, + "rewards/margins": -0.25120943784713745, + "rewards/rejected": -0.06717558205127716, + "step": 284 + }, + { + "epoch": 0.37, + "learning_rate": 4.9126372408383025e-05, + "logits/chosen": -1.9963423013687134, + "logits/rejected": -1.9505279064178467, + "logps/chosen": -149.78964233398438, + "logps/rejected": -148.13140869140625, + "loss": 0.7801, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.006365638226270676, + "rewards/margins": -0.0903141051530838, + "rewards/rejected": 0.09667973965406418, + "step": 285 + }, + { + "epoch": 0.37, + "learning_rate": 4.911695840571605e-05, + "logits/chosen": -2.259364604949951, + "logits/rejected": -2.2460901737213135, + "logps/chosen": -173.76356506347656, + "logps/rejected": -162.274169921875, + "loss": 0.7328, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.26215800642967224, + "rewards/margins": -0.027749449014663696, + "rewards/rejected": -0.23440855741500854, + "step": 286 + }, + { + "epoch": 0.38, + "learning_rate": 4.910749486474828e-05, + "logits/chosen": -2.1809744834899902, + "logits/rejected": -2.1365909576416016, + "logps/chosen": -173.26553344726562, + "logps/rejected": -168.60324096679688, + "loss": 0.6541, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.13890515267848969, + "rewards/margins": 0.1398504674434662, + "rewards/rejected": -0.2787555754184723, + "step": 287 + }, + { + "epoch": 0.38, + "learning_rate": 4.909798180491865e-05, + "logits/chosen": -2.2078680992126465, + "logits/rejected": -2.202317714691162, + "logps/chosen": -161.5787353515625, + "logps/rejected": -148.63571166992188, + "loss": 0.6944, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0784807801246643, + "rewards/margins": 0.04119132459163666, + "rewards/rejected": 0.037289444357156754, + "step": 288 + }, + { + "epoch": 0.38, + "learning_rate": 4.9088419245767803e-05, + "logits/chosen": -2.323357582092285, + "logits/rejected": -2.331782817840576, + "logps/chosen": -154.3722686767578, + "logps/rejected": -142.60247802734375, + "loss": 0.7108, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.03877534344792366, + "rewards/margins": 0.031510498374700546, + "rewards/rejected": 0.007264849729835987, + "step": 289 + }, + { + "epoch": 0.38, + "learning_rate": 4.907880720693804e-05, + "logits/chosen": -2.3022680282592773, + "logits/rejected": -2.242792844772339, + "logps/chosen": -143.08497619628906, + "logps/rejected": -142.6212158203125, + "loss": 0.6595, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.19559280574321747, + "rewards/margins": 0.14293703436851501, + "rewards/rejected": 0.052655745297670364, + "step": 290 + }, + { + "epoch": 0.38, + "learning_rate": 4.9069145708173324e-05, + "logits/chosen": -2.24276065826416, + "logits/rejected": -2.2772743701934814, + "logps/chosen": -141.13607788085938, + "logps/rejected": -143.68528747558594, + "loss": 0.749, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.13126200437545776, + "rewards/margins": -0.08278003334999084, + "rewards/rejected": -0.04848198592662811, + "step": 291 + }, + { + "epoch": 0.38, + "learning_rate": 4.9059434769319205e-05, + "logits/chosen": -1.9449436664581299, + "logits/rejected": -1.8548604249954224, + "logps/chosen": -182.99716186523438, + "logps/rejected": -157.7788848876953, + "loss": 0.7759, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.11397290229797363, + "rewards/margins": -0.0827551856637001, + "rewards/rejected": -0.031217724084854126, + "step": 292 + }, + { + "epoch": 0.38, + "learning_rate": 4.904967441032278e-05, + "logits/chosen": -2.3559162616729736, + "logits/rejected": -2.351862907409668, + "logps/chosen": -156.9031982421875, + "logps/rejected": -151.17758178710938, + "loss": 0.7666, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.030413679778575897, + "rewards/margins": -0.07400794327259064, + "rewards/rejected": 0.04359426349401474, + "step": 293 + }, + { + "epoch": 0.38, + "learning_rate": 4.903986465123266e-05, + "logits/chosen": -2.013231039047241, + "logits/rejected": -2.1264541149139404, + "logps/chosen": -142.36181640625, + "logps/rejected": -151.9750213623047, + "loss": 0.7971, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.08235807716846466, + "rewards/margins": -0.12663468718528748, + "rewards/rejected": 0.04427662491798401, + "step": 294 + }, + { + "epoch": 0.39, + "learning_rate": 4.903000551219894e-05, + "logits/chosen": -2.1607303619384766, + "logits/rejected": -2.1577768325805664, + "logps/chosen": -180.66165161132812, + "logps/rejected": -193.0799102783203, + "loss": 0.6103, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.06296667456626892, + "rewards/margins": 0.2222498655319214, + "rewards/rejected": -0.15928319096565247, + "step": 295 + }, + { + "epoch": 0.39, + "learning_rate": 4.902009701347313e-05, + "logits/chosen": -2.039292335510254, + "logits/rejected": -2.075709342956543, + "logps/chosen": -139.63912963867188, + "logps/rejected": -145.09732055664062, + "loss": 0.7106, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06535654515028, + "rewards/margins": 0.01611308380961418, + "rewards/rejected": -0.08146963268518448, + "step": 296 + }, + { + "epoch": 0.39, + "learning_rate": 4.901013917540814e-05, + "logits/chosen": -2.17149019241333, + "logits/rejected": -2.1316046714782715, + "logps/chosen": -149.57943725585938, + "logps/rejected": -135.62716674804688, + "loss": 0.6599, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.09369993209838867, + "rewards/margins": 0.16279715299606323, + "rewards/rejected": -0.06909724324941635, + "step": 297 + }, + { + "epoch": 0.39, + "learning_rate": 4.900013201845821e-05, + "logits/chosen": -2.245067834854126, + "logits/rejected": -2.3142921924591064, + "logps/chosen": -249.42477416992188, + "logps/rejected": -244.9073486328125, + "loss": 0.6299, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.03764531761407852, + "rewards/margins": 0.16476468741893768, + "rewards/rejected": -0.2024100124835968, + "step": 298 + }, + { + "epoch": 0.39, + "learning_rate": 4.899007556317893e-05, + "logits/chosen": -2.1344449520111084, + "logits/rejected": -2.114758014678955, + "logps/chosen": -149.18663024902344, + "logps/rejected": -157.50579833984375, + "loss": 0.7001, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.05027259886264801, + "rewards/margins": 0.06326065212488174, + "rewards/rejected": -0.11353327333927155, + "step": 299 + }, + { + "epoch": 0.39, + "learning_rate": 4.8979969830227086e-05, + "logits/chosen": -2.040646553039551, + "logits/rejected": -2.045273780822754, + "logps/chosen": -202.0292510986328, + "logps/rejected": -214.07327270507812, + "loss": 0.6409, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.050717130303382874, + "rewards/margins": 0.17365548014640808, + "rewards/rejected": -0.22437259554862976, + "step": 300 + }, + { + "epoch": 0.39, + "learning_rate": 4.896981484036074e-05, + "logits/chosen": -2.013162612915039, + "logits/rejected": -1.9795409440994263, + "logps/chosen": -146.5740509033203, + "logps/rejected": -149.60594177246094, + "loss": 0.6182, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.060083091259002686, + "rewards/margins": 0.2053542286157608, + "rewards/rejected": -0.2654373347759247, + "step": 301 + }, + { + "epoch": 0.4, + "learning_rate": 4.895961061443911e-05, + "logits/chosen": -2.287144184112549, + "logits/rejected": -2.292527914047241, + "logps/chosen": -157.79879760742188, + "logps/rejected": -156.17396545410156, + "loss": 0.693, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.018628746271133423, + "rewards/margins": 0.05455555394291878, + "rewards/rejected": -0.035926803946495056, + "step": 302 + }, + { + "epoch": 0.4, + "learning_rate": 4.894935717342255e-05, + "logits/chosen": -1.8966100215911865, + "logits/rejected": -1.9237083196640015, + "logps/chosen": -139.45045471191406, + "logps/rejected": -163.8885955810547, + "loss": 0.7204, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2020660787820816, + "rewards/margins": 0.042556531727313995, + "rewards/rejected": -0.2446226328611374, + "step": 303 + }, + { + "epoch": 0.4, + "learning_rate": 4.8939054538372496e-05, + "logits/chosen": -2.121617317199707, + "logits/rejected": -2.1448612213134766, + "logps/chosen": -237.39845275878906, + "logps/rejected": -254.28359985351562, + "loss": 0.7159, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0699370950460434, + "rewards/margins": 0.008224628865718842, + "rewards/rejected": 0.06171245500445366, + "step": 304 + }, + { + "epoch": 0.4, + "learning_rate": 4.8928702730451456e-05, + "logits/chosen": -1.949974775314331, + "logits/rejected": -1.870169997215271, + "logps/chosen": -153.4409942626953, + "logps/rejected": -156.4421844482422, + "loss": 0.7495, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.12804728746414185, + "rewards/margins": -0.07111337780952454, + "rewards/rejected": -0.0569339245557785, + "step": 305 + }, + { + "epoch": 0.4, + "learning_rate": 4.891830177092294e-05, + "logits/chosen": -2.206598997116089, + "logits/rejected": -2.1638033390045166, + "logps/chosen": -151.28504943847656, + "logps/rejected": -148.6924285888672, + "loss": 0.6044, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.030383877456188202, + "rewards/margins": 0.2497837245464325, + "rewards/rejected": -0.2801675796508789, + "step": 306 + }, + { + "epoch": 0.4, + "learning_rate": 4.8907851681151396e-05, + "logits/chosen": -1.9466733932495117, + "logits/rejected": -2.0422286987304688, + "logps/chosen": -147.58941650390625, + "logps/rejected": -163.70912170410156, + "loss": 0.7529, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.18631082773208618, + "rewards/margins": -0.07437282055616379, + "rewards/rejected": -0.1119379922747612, + "step": 307 + }, + { + "epoch": 0.4, + "learning_rate": 4.889735248260221e-05, + "logits/chosen": -1.9457015991210938, + "logits/rejected": -1.9883947372436523, + "logps/chosen": -168.67608642578125, + "logps/rejected": -175.51678466796875, + "loss": 0.6777, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1923048198223114, + "rewards/margins": 0.08353283256292343, + "rewards/rejected": -0.27583765983581543, + "step": 308 + }, + { + "epoch": 0.4, + "learning_rate": 4.8886804196841626e-05, + "logits/chosen": -2.105576515197754, + "logits/rejected": -2.0363874435424805, + "logps/chosen": -154.8956756591797, + "logps/rejected": -155.22312927246094, + "loss": 0.7898, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08425075560808182, + "rewards/margins": -0.13373690843582153, + "rewards/rejected": 0.04948614165186882, + "step": 309 + }, + { + "epoch": 0.41, + "learning_rate": 4.887620684553674e-05, + "logits/chosen": -2.1795973777770996, + "logits/rejected": -2.2052454948425293, + "logps/chosen": -132.4536895751953, + "logps/rejected": -134.7356719970703, + "loss": 0.8224, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.002498343586921692, + "rewards/margins": -0.15511931478977203, + "rewards/rejected": 0.15761765837669373, + "step": 310 + }, + { + "epoch": 0.41, + "learning_rate": 4.886556045045542e-05, + "logits/chosen": -2.068981409072876, + "logits/rejected": -2.0851194858551025, + "logps/chosen": -146.88949584960938, + "logps/rejected": -162.4698028564453, + "loss": 0.7558, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.15774422883987427, + "rewards/margins": -0.06368841230869293, + "rewards/rejected": -0.09405580163002014, + "step": 311 + }, + { + "epoch": 0.41, + "learning_rate": 4.8854865033466275e-05, + "logits/chosen": -1.7522428035736084, + "logits/rejected": -1.729603886604309, + "logps/chosen": -217.58309936523438, + "logps/rejected": -244.42282104492188, + "loss": 0.7803, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.23889069259166718, + "rewards/margins": -0.09420409798622131, + "rewards/rejected": -0.14468660950660706, + "step": 312 + }, + { + "epoch": 0.41, + "learning_rate": 4.88441206165386e-05, + "logits/chosen": -2.1225035190582275, + "logits/rejected": -2.0949532985687256, + "logps/chosen": -177.24856567382812, + "logps/rejected": -175.7102813720703, + "loss": 0.7095, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.12725457549095154, + "rewards/margins": 0.04284512996673584, + "rewards/rejected": -0.17009973526000977, + "step": 313 + }, + { + "epoch": 0.41, + "learning_rate": 4.8833327221742356e-05, + "logits/chosen": -2.0727334022521973, + "logits/rejected": -2.134239435195923, + "logps/chosen": -131.3041534423828, + "logps/rejected": -136.77574157714844, + "loss": 0.6715, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.037539076060056686, + "rewards/margins": 0.08883160352706909, + "rewards/rejected": -0.0512925386428833, + "step": 314 + }, + { + "epoch": 0.41, + "learning_rate": 4.88224848712481e-05, + "logits/chosen": -1.953855276107788, + "logits/rejected": -1.9414265155792236, + "logps/chosen": -187.20455932617188, + "logps/rejected": -181.2246551513672, + "loss": 0.5929, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.1832740604877472, + "rewards/margins": 0.2624707818031311, + "rewards/rejected": -0.07919671386480331, + "step": 315 + }, + { + "epoch": 0.41, + "learning_rate": 4.881159358732694e-05, + "logits/chosen": -1.8854598999023438, + "logits/rejected": -1.8628448247909546, + "logps/chosen": -156.7054443359375, + "logps/rejected": -158.3913116455078, + "loss": 0.6956, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.11108584702014923, + "rewards/margins": 0.05867990851402283, + "rewards/rejected": -0.16976574063301086, + "step": 316 + }, + { + "epoch": 0.41, + "learning_rate": 4.8800653392350526e-05, + "logits/chosen": -1.9753170013427734, + "logits/rejected": -2.036269426345825, + "logps/chosen": -162.7784423828125, + "logps/rejected": -176.72535705566406, + "loss": 0.6218, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21797724068164825, + "rewards/margins": 0.17236095666885376, + "rewards/rejected": -0.3903381824493408, + "step": 317 + }, + { + "epoch": 0.42, + "learning_rate": 4.8789664308790936e-05, + "logits/chosen": -1.8070260286331177, + "logits/rejected": -1.8477225303649902, + "logps/chosen": -221.89990234375, + "logps/rejected": -199.6925048828125, + "loss": 0.7534, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5042496919631958, + "rewards/margins": 0.0008268915116786957, + "rewards/rejected": -0.5050765872001648, + "step": 318 + }, + { + "epoch": 0.42, + "learning_rate": 4.8778626359220715e-05, + "logits/chosen": -1.9517831802368164, + "logits/rejected": -1.9943265914916992, + "logps/chosen": -157.50552368164062, + "logps/rejected": -189.5386962890625, + "loss": 0.7944, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.07389702647924423, + "rewards/margins": -0.1338617205619812, + "rewards/rejected": 0.059964705258607864, + "step": 319 + }, + { + "epoch": 0.42, + "learning_rate": 4.8767539566312734e-05, + "logits/chosen": -2.0427169799804688, + "logits/rejected": -1.9722158908843994, + "logps/chosen": -153.33877563476562, + "logps/rejected": -152.97161865234375, + "loss": 0.9077, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.18528270721435547, + "rewards/margins": -0.3350525200366974, + "rewards/rejected": 0.14976979792118073, + "step": 320 + }, + { + "epoch": 0.42, + "learning_rate": 4.875640395284023e-05, + "logits/chosen": -2.2405035495758057, + "logits/rejected": -2.180358648300171, + "logps/chosen": -149.2640838623047, + "logps/rejected": -144.21060180664062, + "loss": 0.6886, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.029363110661506653, + "rewards/margins": 0.07899504899978638, + "rewards/rejected": -0.10835815221071243, + "step": 321 + }, + { + "epoch": 0.42, + "learning_rate": 4.874521954167671e-05, + "logits/chosen": -2.171722888946533, + "logits/rejected": -2.134547710418701, + "logps/chosen": -142.7288818359375, + "logps/rejected": -133.72573852539062, + "loss": 0.7838, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.31318625807762146, + "rewards/margins": -0.14739766716957092, + "rewards/rejected": -0.16578857600688934, + "step": 322 + }, + { + "epoch": 0.42, + "learning_rate": 4.8733986355795905e-05, + "logits/chosen": -1.8786143064498901, + "logits/rejected": -1.8554753065109253, + "logps/chosen": -202.17709350585938, + "logps/rejected": -205.63951110839844, + "loss": 0.6038, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.08190246671438217, + "rewards/margins": 0.2607104182243347, + "rewards/rejected": -0.17880797386169434, + "step": 323 + }, + { + "epoch": 0.42, + "learning_rate": 4.8722704418271745e-05, + "logits/chosen": -2.1331138610839844, + "logits/rejected": -2.1795654296875, + "logps/chosen": -140.4710693359375, + "logps/rejected": -143.41455078125, + "loss": 0.6648, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.11869597434997559, + "rewards/margins": 0.0922723338007927, + "rewards/rejected": -0.21096831560134888, + "step": 324 + }, + { + "epoch": 0.43, + "learning_rate": 4.871137375227829e-05, + "logits/chosen": -1.9935777187347412, + "logits/rejected": -1.9692142009735107, + "logps/chosen": -297.49334716796875, + "logps/rejected": -299.84173583984375, + "loss": 0.7695, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.12770779430866241, + "rewards/margins": -0.043505311012268066, + "rewards/rejected": -0.08420247584581375, + "step": 325 + }, + { + "epoch": 0.43, + "learning_rate": 4.869999438108971e-05, + "logits/chosen": -2.244154691696167, + "logits/rejected": -2.2609543800354004, + "logps/chosen": -148.76113891601562, + "logps/rejected": -137.73040771484375, + "loss": 0.7324, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.16916662454605103, + "rewards/margins": -0.04308079555630684, + "rewards/rejected": -0.12608584761619568, + "step": 326 + }, + { + "epoch": 0.43, + "learning_rate": 4.8688566328080215e-05, + "logits/chosen": -2.1407310962677, + "logits/rejected": -2.119994640350342, + "logps/chosen": -167.45089721679688, + "logps/rejected": -159.93060302734375, + "loss": 0.7649, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.3510248363018036, + "rewards/margins": -0.09942013025283813, + "rewards/rejected": -0.25160470604896545, + "step": 327 + }, + { + "epoch": 0.43, + "learning_rate": 4.867708961672399e-05, + "logits/chosen": -2.185452699661255, + "logits/rejected": -2.1928515434265137, + "logps/chosen": -186.91505432128906, + "logps/rejected": -185.78733825683594, + "loss": 0.7766, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.09309504181146622, + "rewards/margins": -0.11240511387586594, + "rewards/rejected": 0.01931007206439972, + "step": 328 + }, + { + "epoch": 0.43, + "learning_rate": 4.866556427059519e-05, + "logits/chosen": -2.1376357078552246, + "logits/rejected": -2.2185373306274414, + "logps/chosen": -166.6249542236328, + "logps/rejected": -159.41290283203125, + "loss": 0.8817, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.1689298152923584, + "rewards/margins": -0.18555757403373718, + "rewards/rejected": 0.016627788543701172, + "step": 329 + }, + { + "epoch": 0.43, + "learning_rate": 4.865399031336787e-05, + "logits/chosen": -1.9547293186187744, + "logits/rejected": -2.0602641105651855, + "logps/chosen": -159.6194610595703, + "logps/rejected": -187.00814819335938, + "loss": 0.5742, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.08318576216697693, + "rewards/margins": 0.3107747733592987, + "rewards/rejected": -0.22758902609348297, + "step": 330 + }, + { + "epoch": 0.43, + "learning_rate": 4.8642367768815936e-05, + "logits/chosen": -2.290693998336792, + "logits/rejected": -2.2612967491149902, + "logps/chosen": -204.46893310546875, + "logps/rejected": -196.72003173828125, + "loss": 0.6234, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.20541870594024658, + "rewards/margins": 0.22441859543323517, + "rewards/rejected": -0.42983728647232056, + "step": 331 + }, + { + "epoch": 0.43, + "learning_rate": 4.863069666081307e-05, + "logits/chosen": -2.1638312339782715, + "logits/rejected": -2.109403133392334, + "logps/chosen": -138.51168823242188, + "logps/rejected": -137.55712890625, + "loss": 0.7573, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.020162537693977356, + "rewards/margins": -0.048720985651016235, + "rewards/rejected": 0.028558451682329178, + "step": 332 + }, + { + "epoch": 0.44, + "learning_rate": 4.861897701333274e-05, + "logits/chosen": -2.2542800903320312, + "logits/rejected": -2.258378744125366, + "logps/chosen": -134.4961700439453, + "logps/rejected": -148.88816833496094, + "loss": 0.7321, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.294142484664917, + "rewards/margins": -0.008809719234704971, + "rewards/rejected": -0.2853327691555023, + "step": 333 + }, + { + "epoch": 0.44, + "learning_rate": 4.86072088504481e-05, + "logits/chosen": -2.152188539505005, + "logits/rejected": -2.193638324737549, + "logps/chosen": -138.4483184814453, + "logps/rejected": -143.88937377929688, + "loss": 0.7268, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.020661018788814545, + "rewards/margins": 0.026195645332336426, + "rewards/rejected": -0.00553460419178009, + "step": 334 + }, + { + "epoch": 0.44, + "learning_rate": 4.859539219633199e-05, + "logits/chosen": -2.293026924133301, + "logits/rejected": -2.278402805328369, + "logps/chosen": -163.4123077392578, + "logps/rejected": -180.30471801757812, + "loss": 0.7216, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1497456282377243, + "rewards/margins": -0.013850819319486618, + "rewards/rejected": -0.1358948051929474, + "step": 335 + }, + { + "epoch": 0.44, + "learning_rate": 4.8583527075256804e-05, + "logits/chosen": -2.0941126346588135, + "logits/rejected": -2.131237745285034, + "logps/chosen": -153.41043090820312, + "logps/rejected": -145.23020935058594, + "loss": 0.6753, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.00170879065990448, + "rewards/margins": 0.09639380127191544, + "rewards/rejected": -0.09810256958007812, + "step": 336 + }, + { + "epoch": 0.44, + "learning_rate": 4.857161351159454e-05, + "logits/chosen": -1.8922992944717407, + "logits/rejected": -1.7540662288665771, + "logps/chosen": -142.38389587402344, + "logps/rejected": -148.71514892578125, + "loss": 0.6864, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.02964305877685547, + "rewards/margins": 0.07973094284534454, + "rewards/rejected": -0.10937398672103882, + "step": 337 + }, + { + "epoch": 0.44, + "learning_rate": 4.8559651529816664e-05, + "logits/chosen": -2.170006513595581, + "logits/rejected": -2.202960252761841, + "logps/chosen": -144.66648864746094, + "logps/rejected": -136.111083984375, + "loss": 0.7541, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.14397282898426056, + "rewards/margins": -0.059857327491045, + "rewards/rejected": -0.08411550521850586, + "step": 338 + }, + { + "epoch": 0.44, + "learning_rate": 4.854764115449411e-05, + "logits/chosen": -2.098104238510132, + "logits/rejected": -2.05627179145813, + "logps/chosen": -138.97332763671875, + "logps/rejected": -143.80482482910156, + "loss": 0.7632, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.13671807944774628, + "rewards/margins": -0.11267251521348953, + "rewards/rejected": -0.024045560508966446, + "step": 339 + }, + { + "epoch": 0.44, + "learning_rate": 4.853558241029723e-05, + "logits/chosen": -2.045684337615967, + "logits/rejected": -2.010531425476074, + "logps/chosen": -158.4129180908203, + "logps/rejected": -169.3193817138672, + "loss": 0.7891, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.13485963642597198, + "rewards/margins": -0.10890495777130127, + "rewards/rejected": -0.025954678654670715, + "step": 340 + }, + { + "epoch": 0.45, + "learning_rate": 4.8523475321995715e-05, + "logits/chosen": -2.199786901473999, + "logits/rejected": -2.0181658267974854, + "logps/chosen": -189.12388610839844, + "logps/rejected": -139.44427490234375, + "loss": 0.7936, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.2502678632736206, + "rewards/margins": -0.15687265992164612, + "rewards/rejected": -0.09339523315429688, + "step": 341 + }, + { + "epoch": 0.45, + "learning_rate": 4.8511319914458555e-05, + "logits/chosen": -2.1136443614959717, + "logits/rejected": -2.1628942489624023, + "logps/chosen": -153.02317810058594, + "logps/rejected": -147.45071411132812, + "loss": 0.7746, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.15984635055065155, + "rewards/margins": -0.07732124626636505, + "rewards/rejected": -0.0825251117348671, + "step": 342 + }, + { + "epoch": 0.45, + "learning_rate": 4.849911621265401e-05, + "logits/chosen": -2.0306332111358643, + "logits/rejected": -2.0360593795776367, + "logps/chosen": -153.94784545898438, + "logps/rejected": -162.53622436523438, + "loss": 0.7162, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1451529562473297, + "rewards/margins": 0.013205336406826973, + "rewards/rejected": -0.15835829079151154, + "step": 343 + }, + { + "epoch": 0.45, + "learning_rate": 4.848686424164953e-05, + "logits/chosen": -2.1170244216918945, + "logits/rejected": -2.0833611488342285, + "logps/chosen": -140.57786560058594, + "logps/rejected": -133.22264099121094, + "loss": 0.6097, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.15281759202480316, + "rewards/margins": 0.23962977528572083, + "rewards/rejected": -0.08681218326091766, + "step": 344 + }, + { + "epoch": 0.45, + "learning_rate": 4.84745640266117e-05, + "logits/chosen": -2.1526424884796143, + "logits/rejected": -2.1702723503112793, + "logps/chosen": -152.15927124023438, + "logps/rejected": -158.22642517089844, + "loss": 0.6938, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.09753294289112091, + "rewards/margins": 0.03131475672125816, + "rewards/rejected": -0.12884768843650818, + "step": 345 + }, + { + "epoch": 0.45, + "learning_rate": 4.846221559280624e-05, + "logits/chosen": -2.025162696838379, + "logits/rejected": -2.069690704345703, + "logps/chosen": -147.05039978027344, + "logps/rejected": -162.94961547851562, + "loss": 0.6505, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.15327642858028412, + "rewards/margins": 0.11415009945631027, + "rewards/rejected": 0.03912632539868355, + "step": 346 + }, + { + "epoch": 0.45, + "learning_rate": 4.844981896559787e-05, + "logits/chosen": -2.208263635635376, + "logits/rejected": -2.176539659500122, + "logps/chosen": -212.233154296875, + "logps/rejected": -209.1885223388672, + "loss": 0.6771, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.31684011220932007, + "rewards/margins": 0.08533424139022827, + "rewards/rejected": -0.40217435359954834, + "step": 347 + }, + { + "epoch": 0.46, + "learning_rate": 4.8437374170450344e-05, + "logits/chosen": -2.2429494857788086, + "logits/rejected": -2.2173945903778076, + "logps/chosen": -199.77957153320312, + "logps/rejected": -156.09060668945312, + "loss": 0.6945, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.056611090898513794, + "rewards/margins": 0.11540088802576065, + "rewards/rejected": -0.17201198637485504, + "step": 348 + }, + { + "epoch": 0.46, + "learning_rate": 4.842488123292632e-05, + "logits/chosen": -2.088970422744751, + "logits/rejected": -2.0711379051208496, + "logps/chosen": -161.3831787109375, + "logps/rejected": -153.05728149414062, + "loss": 0.6521, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.25492504239082336, + "rewards/margins": 0.13332128524780273, + "rewards/rejected": 0.12160372734069824, + "step": 349 + }, + { + "epoch": 0.46, + "learning_rate": 4.8412340178687374e-05, + "logits/chosen": -2.0223379135131836, + "logits/rejected": -2.0152769088745117, + "logps/chosen": -131.5534210205078, + "logps/rejected": -137.33941650390625, + "loss": 0.6658, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.1717926263809204, + "rewards/margins": 0.12639762461185455, + "rewards/rejected": -0.29819023609161377, + "step": 350 + }, + { + "epoch": 0.46, + "learning_rate": 4.839975103349391e-05, + "logits/chosen": -2.299764633178711, + "logits/rejected": -2.2954838275909424, + "logps/chosen": -177.5594024658203, + "logps/rejected": -162.41683959960938, + "loss": 0.6371, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.12795095145702362, + "rewards/margins": 0.15119165182113647, + "rewards/rejected": -0.2791425883769989, + "step": 351 + }, + { + "epoch": 0.46, + "learning_rate": 4.8387113823205096e-05, + "logits/chosen": -2.0409903526306152, + "logits/rejected": -2.0102832317352295, + "logps/chosen": -171.60342407226562, + "logps/rejected": -162.114013671875, + "loss": 0.7369, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04189925640821457, + "rewards/margins": 0.03296327590942383, + "rewards/rejected": -0.074862539768219, + "step": 352 + }, + { + "epoch": 0.46, + "learning_rate": 4.8374428573778864e-05, + "logits/chosen": -2.1743125915527344, + "logits/rejected": -2.2494208812713623, + "logps/chosen": -154.51788330078125, + "logps/rejected": -161.42127990722656, + "loss": 0.7557, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.04235090687870979, + "rewards/margins": -0.0858534649014473, + "rewards/rejected": 0.1282043755054474, + "step": 353 + }, + { + "epoch": 0.46, + "learning_rate": 4.8361695311271795e-05, + "logits/chosen": -1.9694074392318726, + "logits/rejected": -1.9777883291244507, + "logps/chosen": -172.72686767578125, + "logps/rejected": -187.39076232910156, + "loss": 0.6227, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.23133322596549988, + "rewards/margins": 0.19938969612121582, + "rewards/rejected": -0.4307229220867157, + "step": 354 + }, + { + "epoch": 0.46, + "learning_rate": 4.83489140618391e-05, + "logits/chosen": -2.125094175338745, + "logits/rejected": -2.069443464279175, + "logps/chosen": -177.20260620117188, + "logps/rejected": -166.99986267089844, + "loss": 0.681, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07136122137308121, + "rewards/margins": 0.09348535537719727, + "rewards/rejected": -0.02212415263056755, + "step": 355 + }, + { + "epoch": 0.47, + "learning_rate": 4.833608485173457e-05, + "logits/chosen": -2.26041841506958, + "logits/rejected": -2.3254926204681396, + "logps/chosen": -142.5237274169922, + "logps/rejected": -150.29586791992188, + "loss": 0.7873, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.08850278705358505, + "rewards/margins": -0.12702789902687073, + "rewards/rejected": 0.03852510452270508, + "step": 356 + }, + { + "epoch": 0.47, + "learning_rate": 4.8323207707310496e-05, + "logits/chosen": -2.112971782684326, + "logits/rejected": -2.1613588333129883, + "logps/chosen": -170.9886932373047, + "logps/rejected": -180.46363830566406, + "loss": 0.68, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.04462575912475586, + "rewards/margins": 0.08605735003948212, + "rewards/rejected": -0.13068309426307678, + "step": 357 + }, + { + "epoch": 0.47, + "learning_rate": 4.831028265501764e-05, + "logits/chosen": -1.8144406080245972, + "logits/rejected": -1.8706724643707275, + "logps/chosen": -172.77764892578125, + "logps/rejected": -187.60598754882812, + "loss": 0.7243, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.18522275984287262, + "rewards/margins": 0.02290777862071991, + "rewards/rejected": -0.20813053846359253, + "step": 358 + }, + { + "epoch": 0.47, + "learning_rate": 4.829730972140517e-05, + "logits/chosen": -2.0918076038360596, + "logits/rejected": -2.0828185081481934, + "logps/chosen": -133.16778564453125, + "logps/rejected": -139.5255889892578, + "loss": 0.6419, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.05682823061943054, + "rewards/margins": 0.1471494734287262, + "rewards/rejected": -0.09032122790813446, + "step": 359 + }, + { + "epoch": 0.47, + "learning_rate": 4.8284288933120594e-05, + "logits/chosen": -2.0551576614379883, + "logits/rejected": -2.0017828941345215, + "logps/chosen": -173.2173309326172, + "logps/rejected": -187.19105529785156, + "loss": 0.6964, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.13149763643741608, + "rewards/margins": 0.09333821386098862, + "rewards/rejected": -0.2248358577489853, + "step": 360 + }, + { + "epoch": 0.47, + "learning_rate": 4.8271220316909735e-05, + "logits/chosen": -2.1928532123565674, + "logits/rejected": -2.184141159057617, + "logps/chosen": -200.8461456298828, + "logps/rejected": -204.56504821777344, + "loss": 0.7748, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.022504782304167747, + "rewards/margins": -0.08254070580005646, + "rewards/rejected": 0.06003589183092117, + "step": 361 + }, + { + "epoch": 0.47, + "learning_rate": 4.825810389961666e-05, + "logits/chosen": -2.254242181777954, + "logits/rejected": -2.261145830154419, + "logps/chosen": -160.71571350097656, + "logps/rejected": -142.36376953125, + "loss": 0.6519, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.024668443948030472, + "rewards/margins": 0.13744822144508362, + "rewards/rejected": -0.16211667656898499, + "step": 362 + }, + { + "epoch": 0.48, + "learning_rate": 4.8244939708183596e-05, + "logits/chosen": -2.1350438594818115, + "logits/rejected": -2.1070897579193115, + "logps/chosen": -160.7988739013672, + "logps/rejected": -157.40176391601562, + "loss": 0.7235, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.01498755719512701, + "rewards/margins": -0.015977520495653152, + "rewards/rejected": 0.03096509724855423, + "step": 363 + }, + { + "epoch": 0.48, + "learning_rate": 4.823172776965094e-05, + "logits/chosen": -2.2463290691375732, + "logits/rejected": -2.2258195877075195, + "logps/chosen": -136.88392639160156, + "logps/rejected": -128.96087646484375, + "loss": 0.6831, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.055554769933223724, + "rewards/margins": 0.06438815593719482, + "rewards/rejected": -0.11994291841983795, + "step": 364 + }, + { + "epoch": 0.48, + "learning_rate": 4.821846811115713e-05, + "logits/chosen": -1.9229159355163574, + "logits/rejected": -1.839210867881775, + "logps/chosen": -173.51405334472656, + "logps/rejected": -153.61341857910156, + "loss": 0.7675, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1366410106420517, + "rewards/margins": -0.06360301375389099, + "rewards/rejected": 0.20024403929710388, + "step": 365 + }, + { + "epoch": 0.48, + "learning_rate": 4.820516075993865e-05, + "logits/chosen": -2.0157222747802734, + "logits/rejected": -2.0540072917938232, + "logps/chosen": -141.3604278564453, + "logps/rejected": -146.56663513183594, + "loss": 0.7431, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.15194594860076904, + "rewards/margins": 0.032469067722558975, + "rewards/rejected": -0.1844150424003601, + "step": 366 + }, + { + "epoch": 0.48, + "learning_rate": 4.819180574332994e-05, + "logits/chosen": -2.1887617111206055, + "logits/rejected": -2.1101431846618652, + "logps/chosen": -164.18893432617188, + "logps/rejected": -160.06475830078125, + "loss": 0.6856, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.02116868458688259, + "rewards/margins": 0.028199315071105957, + "rewards/rejected": -0.0493679977953434, + "step": 367 + }, + { + "epoch": 0.48, + "learning_rate": 4.8178403088763355e-05, + "logits/chosen": -2.2744359970092773, + "logits/rejected": -2.291210651397705, + "logps/chosen": -168.8428955078125, + "logps/rejected": -171.43356323242188, + "loss": 0.8289, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.059717368334531784, + "rewards/margins": -0.18386751413345337, + "rewards/rejected": 0.12415014207363129, + "step": 368 + }, + { + "epoch": 0.48, + "learning_rate": 4.8164952823769085e-05, + "logits/chosen": -1.8740739822387695, + "logits/rejected": -1.8469749689102173, + "logps/chosen": -135.71493530273438, + "logps/rejected": -149.68069458007812, + "loss": 0.6854, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.029844455420970917, + "rewards/margins": 0.11711962521076202, + "rewards/rejected": -0.14696410298347473, + "step": 369 + }, + { + "epoch": 0.48, + "learning_rate": 4.815145497597514e-05, + "logits/chosen": -2.102431297302246, + "logits/rejected": -2.032944917678833, + "logps/chosen": -173.45106506347656, + "logps/rejected": -159.73878479003906, + "loss": 0.9183, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.022377396002411842, + "rewards/margins": -0.20098015666007996, + "rewards/rejected": 0.22335757315158844, + "step": 370 + }, + { + "epoch": 0.49, + "learning_rate": 4.8137909573107246e-05, + "logits/chosen": -2.3416833877563477, + "logits/rejected": -2.3538575172424316, + "logps/chosen": -167.69911193847656, + "logps/rejected": -157.8156280517578, + "loss": 0.7045, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.004629455506801605, + "rewards/margins": 0.09061210602521896, + "rewards/rejected": -0.08598263561725616, + "step": 371 + }, + { + "epoch": 0.49, + "learning_rate": 4.812431664298883e-05, + "logits/chosen": -2.17607045173645, + "logits/rejected": -2.1720361709594727, + "logps/chosen": -166.7891845703125, + "logps/rejected": -166.55804443359375, + "loss": 0.7444, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.178691565990448, + "rewards/margins": -0.029032886028289795, + "rewards/rejected": -0.1496586799621582, + "step": 372 + }, + { + "epoch": 0.49, + "learning_rate": 4.811067621354094e-05, + "logits/chosen": -2.0844247341156006, + "logits/rejected": -2.150310516357422, + "logps/chosen": -160.33216857910156, + "logps/rejected": -183.65805053710938, + "loss": 0.738, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.019554704427719116, + "rewards/margins": -0.00408715195953846, + "rewards/rejected": -0.015467546880245209, + "step": 373 + }, + { + "epoch": 0.49, + "learning_rate": 4.8096988312782174e-05, + "logits/chosen": -2.148946762084961, + "logits/rejected": -2.091033458709717, + "logps/chosen": -163.8697052001953, + "logps/rejected": -177.90403747558594, + "loss": 0.7519, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.020494531840085983, + "rewards/margins": -0.09285805374383926, + "rewards/rejected": 0.11335259675979614, + "step": 374 + }, + { + "epoch": 0.49, + "learning_rate": 4.8083252968828665e-05, + "logits/chosen": -2.034929037094116, + "logits/rejected": -2.0719597339630127, + "logps/chosen": -154.4242401123047, + "logps/rejected": -157.4151611328125, + "loss": 0.7685, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.11203078925609589, + "rewards/margins": -0.1181333065032959, + "rewards/rejected": 0.006102517247200012, + "step": 375 + }, + { + "epoch": 0.49, + "learning_rate": 4.8069470209893974e-05, + "logits/chosen": -2.1990954875946045, + "logits/rejected": -2.1845507621765137, + "logps/chosen": -167.10202026367188, + "logps/rejected": -173.89694213867188, + "loss": 0.7338, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.048778437077999115, + "rewards/margins": -0.013265417888760567, + "rewards/rejected": -0.0355130136013031, + "step": 376 + }, + { + "epoch": 0.49, + "learning_rate": 4.8055640064289086e-05, + "logits/chosen": -2.0750880241394043, + "logits/rejected": -2.0607268810272217, + "logps/chosen": -145.49095153808594, + "logps/rejected": -146.54730224609375, + "loss": 0.7064, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.12515582144260406, + "rewards/margins": 0.004622337408363819, + "rewards/rejected": -0.1297781616449356, + "step": 377 + }, + { + "epoch": 0.49, + "learning_rate": 4.80417625604223e-05, + "logits/chosen": -2.065021276473999, + "logits/rejected": -1.985723853111267, + "logps/chosen": -168.58168029785156, + "logps/rejected": -154.1855010986328, + "loss": 0.7135, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.2825557291507721, + "rewards/margins": -0.0009878575801849365, + "rewards/rejected": 0.28354358673095703, + "step": 378 + }, + { + "epoch": 0.5, + "learning_rate": 4.8027837726799205e-05, + "logits/chosen": -2.0231850147247314, + "logits/rejected": -2.0219273567199707, + "logps/chosen": -177.43508911132812, + "logps/rejected": -180.77137756347656, + "loss": 0.6861, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.008570343255996704, + "rewards/margins": 0.08320408314466476, + "rewards/rejected": -0.07463373243808746, + "step": 379 + }, + { + "epoch": 0.5, + "learning_rate": 4.801386559202259e-05, + "logits/chosen": -2.098315477371216, + "logits/rejected": -2.0270769596099854, + "logps/chosen": -174.91799926757812, + "logps/rejected": -150.36114501953125, + "loss": 0.7361, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.12160081416368484, + "rewards/margins": 0.009557720273733139, + "rewards/rejected": 0.112043097615242, + "step": 380 + }, + { + "epoch": 0.5, + "learning_rate": 4.799984618479242e-05, + "logits/chosen": -1.9548274278640747, + "logits/rejected": -1.9769983291625977, + "logps/chosen": -142.3773193359375, + "logps/rejected": -138.9167938232422, + "loss": 0.8095, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.1921553909778595, + "rewards/margins": -0.17860561609268188, + "rewards/rejected": -0.01354978233575821, + "step": 381 + }, + { + "epoch": 0.5, + "learning_rate": 4.798577953390577e-05, + "logits/chosen": -2.0205070972442627, + "logits/rejected": -2.027376413345337, + "logps/chosen": -195.26025390625, + "logps/rejected": -219.80625915527344, + "loss": 0.7086, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.27936846017837524, + "rewards/margins": 0.01577301323413849, + "rewards/rejected": -0.29514145851135254, + "step": 382 + }, + { + "epoch": 0.5, + "learning_rate": 4.797166566825675e-05, + "logits/chosen": -2.071674346923828, + "logits/rejected": -2.0323328971862793, + "logps/chosen": -151.1609344482422, + "logps/rejected": -169.61581420898438, + "loss": 0.6817, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.14004096388816833, + "rewards/margins": 0.09525477886199951, + "rewards/rejected": 0.04478616267442703, + "step": 383 + }, + { + "epoch": 0.5, + "learning_rate": 4.795750461683644e-05, + "logits/chosen": -2.1325645446777344, + "logits/rejected": -2.059990882873535, + "logps/chosen": -154.6941680908203, + "logps/rejected": -157.33233642578125, + "loss": 0.6819, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.07734552025794983, + "rewards/margins": 0.0484672486782074, + "rewards/rejected": 0.028878264129161835, + "step": 384 + }, + { + "epoch": 0.5, + "learning_rate": 4.794329640873285e-05, + "logits/chosen": -2.094820737838745, + "logits/rejected": -2.146373987197876, + "logps/chosen": -145.1197509765625, + "logps/rejected": -144.53829956054688, + "loss": 0.5279, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.18119190633296967, + "rewards/margins": 0.4195486307144165, + "rewards/rejected": -0.23835672438144684, + "step": 385 + }, + { + "epoch": 0.51, + "learning_rate": 4.7929041073130867e-05, + "logits/chosen": -2.0487372875213623, + "logits/rejected": -1.9535651206970215, + "logps/chosen": -146.88800048828125, + "logps/rejected": -146.015380859375, + "loss": 0.5709, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05707988515496254, + "rewards/margins": 0.39438337087631226, + "rewards/rejected": -0.33730348944664, + "step": 386 + }, + { + "epoch": 0.51, + "learning_rate": 4.7914738639312165e-05, + "logits/chosen": -1.9662736654281616, + "logits/rejected": -2.0507540702819824, + "logps/chosen": -159.54833984375, + "logps/rejected": -181.677978515625, + "loss": 0.8444, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.1504884660243988, + "rewards/margins": -0.18733000755310059, + "rewards/rejected": 0.036841537803411484, + "step": 387 + }, + { + "epoch": 0.51, + "learning_rate": 4.790038913665519e-05, + "logits/chosen": -1.8254969120025635, + "logits/rejected": -1.8466860055923462, + "logps/chosen": -121.59519958496094, + "logps/rejected": -131.51698303222656, + "loss": 0.7607, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.16594046354293823, + "rewards/margins": -0.08683046698570251, + "rewards/rejected": 0.25277090072631836, + "step": 388 + }, + { + "epoch": 0.51, + "learning_rate": 4.788599259463502e-05, + "logits/chosen": -2.0490996837615967, + "logits/rejected": -2.088806390762329, + "logps/chosen": -144.5177459716797, + "logps/rejected": -151.15444946289062, + "loss": 0.7034, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.16910819709300995, + "rewards/margins": 0.07333025336265564, + "rewards/rejected": 0.09577794373035431, + "step": 389 + }, + { + "epoch": 0.51, + "learning_rate": 4.787154904282341e-05, + "logits/chosen": -2.2665562629699707, + "logits/rejected": -2.2620351314544678, + "logps/chosen": -133.22760009765625, + "logps/rejected": -147.43667602539062, + "loss": 0.7657, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.18194647133350372, + "rewards/margins": -0.09383340179920197, + "rewards/rejected": -0.08811306953430176, + "step": 390 + }, + { + "epoch": 0.51, + "learning_rate": 4.7857058510888645e-05, + "logits/chosen": -2.2096052169799805, + "logits/rejected": -2.216580867767334, + "logps/chosen": -155.67662048339844, + "logps/rejected": -169.5216064453125, + "loss": 0.7002, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0033222604542970657, + "rewards/margins": 0.02931857667863369, + "rewards/rejected": -0.02599630132317543, + "step": 391 + }, + { + "epoch": 0.51, + "learning_rate": 4.7842521028595526e-05, + "logits/chosen": -2.1179494857788086, + "logits/rejected": -2.172715187072754, + "logps/chosen": -139.75497436523438, + "logps/rejected": -149.2156982421875, + "loss": 0.7134, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.20826546847820282, + "rewards/margins": 0.03352481871843338, + "rewards/rejected": -0.2417902648448944, + "step": 392 + }, + { + "epoch": 0.51, + "learning_rate": 4.7827936625805284e-05, + "logits/chosen": -2.1453120708465576, + "logits/rejected": -2.2276957035064697, + "logps/chosen": -160.8773651123047, + "logps/rejected": -182.9378204345703, + "loss": 0.715, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.20616355538368225, + "rewards/margins": 0.011941194534301758, + "rewards/rejected": -0.21810473501682281, + "step": 393 + }, + { + "epoch": 0.52, + "learning_rate": 4.7813305332475535e-05, + "logits/chosen": -1.5362883806228638, + "logits/rejected": -1.4986250400543213, + "logps/chosen": -256.8396301269531, + "logps/rejected": -269.427001953125, + "loss": 0.6733, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.024325193837285042, + "rewards/margins": 0.11790871620178223, + "rewards/rejected": -0.14223390817642212, + "step": 394 + }, + { + "epoch": 0.52, + "learning_rate": 4.77986271786602e-05, + "logits/chosen": -2.1201870441436768, + "logits/rejected": -2.150663137435913, + "logps/chosen": -144.35189819335938, + "logps/rejected": -152.51849365234375, + "loss": 0.605, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.19814497232437134, + "rewards/margins": 0.2016732394695282, + "rewards/rejected": -0.39981821179389954, + "step": 395 + }, + { + "epoch": 0.52, + "learning_rate": 4.778390219450949e-05, + "logits/chosen": -1.9574625492095947, + "logits/rejected": -1.9414085149765015, + "logps/chosen": -206.0396728515625, + "logps/rejected": -230.67552185058594, + "loss": 0.5939, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.0024340637028217316, + "rewards/margins": 0.2473735809326172, + "rewards/rejected": -0.24493952095508575, + "step": 396 + }, + { + "epoch": 0.52, + "learning_rate": 4.776913041026976e-05, + "logits/chosen": -2.2131030559539795, + "logits/rejected": -2.1374146938323975, + "logps/chosen": -157.73483276367188, + "logps/rejected": -147.059326171875, + "loss": 0.6997, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.1409614086151123, + "rewards/margins": 0.018417831510305405, + "rewards/rejected": -0.15937921404838562, + "step": 397 + }, + { + "epoch": 0.52, + "learning_rate": 4.775431185628353e-05, + "logits/chosen": -1.8067915439605713, + "logits/rejected": -1.7414895296096802, + "logps/chosen": -192.5210418701172, + "logps/rejected": -151.17227172851562, + "loss": 0.8156, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.41330477595329285, + "rewards/margins": -0.11488999426364899, + "rewards/rejected": -0.29841476678848267, + "step": 398 + }, + { + "epoch": 0.52, + "learning_rate": 4.7739446562989384e-05, + "logits/chosen": -2.2105062007904053, + "logits/rejected": -2.2545359134674072, + "logps/chosen": -171.10385131835938, + "logps/rejected": -176.2754364013672, + "loss": 0.6499, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5258097648620605, + "rewards/margins": 0.18083709478378296, + "rewards/rejected": -0.7066469192504883, + "step": 399 + }, + { + "epoch": 0.52, + "learning_rate": 4.772453456092191e-05, + "logits/chosen": -2.2050392627716064, + "logits/rejected": -2.1882925033569336, + "logps/chosen": -181.8564453125, + "logps/rejected": -180.79759216308594, + "loss": 0.7227, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.48925238847732544, + "rewards/margins": -0.004873424768447876, + "rewards/rejected": -0.4843789339065552, + "step": 400 + }, + { + "epoch": 0.52, + "learning_rate": 4.7709575880711634e-05, + "logits/chosen": -2.2256252765655518, + "logits/rejected": -2.2812557220458984, + "logps/chosen": -113.94125366210938, + "logps/rejected": -123.2302474975586, + "loss": 0.7755, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.23328812420368195, + "rewards/margins": -0.1003262847661972, + "rewards/rejected": -0.13296185433864594, + "step": 401 + }, + { + "epoch": 0.53, + "learning_rate": 4.769457055308497e-05, + "logits/chosen": -2.215670108795166, + "logits/rejected": -2.2150087356567383, + "logps/chosen": -166.19100952148438, + "logps/rejected": -174.61366271972656, + "loss": 0.7388, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3156086504459381, + "rewards/margins": -0.022609539330005646, + "rewards/rejected": -0.29299911856651306, + "step": 402 + }, + { + "epoch": 0.53, + "learning_rate": 4.767951860886415e-05, + "logits/chosen": -1.9413666725158691, + "logits/rejected": -2.0344936847686768, + "logps/chosen": -140.91476440429688, + "logps/rejected": -164.15542602539062, + "loss": 0.7518, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.37001854181289673, + "rewards/margins": -0.07773025333881378, + "rewards/rejected": -0.29228830337524414, + "step": 403 + }, + { + "epoch": 0.53, + "learning_rate": 4.766442007896715e-05, + "logits/chosen": -2.185791254043579, + "logits/rejected": -2.158402681350708, + "logps/chosen": -174.4066925048828, + "logps/rejected": -174.22225952148438, + "loss": 0.6677, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.426675409078598, + "rewards/margins": 0.14762084186077118, + "rewards/rejected": -0.5742962956428528, + "step": 404 + }, + { + "epoch": 0.53, + "learning_rate": 4.764927499440767e-05, + "logits/chosen": -1.982418417930603, + "logits/rejected": -1.9350054264068604, + "logps/chosen": -178.34976196289062, + "logps/rejected": -180.20494079589844, + "loss": 0.6274, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3919566869735718, + "rewards/margins": 0.19768588244915009, + "rewards/rejected": -0.5896425247192383, + "step": 405 + }, + { + "epoch": 0.53, + "learning_rate": 4.763408338629498e-05, + "logits/chosen": -2.2213985919952393, + "logits/rejected": -2.2709920406341553, + "logps/chosen": -143.23550415039062, + "logps/rejected": -147.50332641601562, + "loss": 0.6232, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20936697721481323, + "rewards/margins": 0.1896706074476242, + "rewards/rejected": -0.39903756976127625, + "step": 406 + }, + { + "epoch": 0.53, + "learning_rate": 4.761884528583396e-05, + "logits/chosen": -2.2714898586273193, + "logits/rejected": -2.1939728260040283, + "logps/chosen": -182.6309356689453, + "logps/rejected": -193.048583984375, + "loss": 0.7323, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.4604118764400482, + "rewards/margins": 0.03731643036007881, + "rewards/rejected": -0.49772825837135315, + "step": 407 + }, + { + "epoch": 0.53, + "learning_rate": 4.760356072432498e-05, + "logits/chosen": -2.160900115966797, + "logits/rejected": -2.0733065605163574, + "logps/chosen": -184.3874053955078, + "logps/rejected": -181.5076904296875, + "loss": 0.6753, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.27709901332855225, + "rewards/margins": 0.06253170967102051, + "rewards/rejected": -0.33963072299957275, + "step": 408 + }, + { + "epoch": 0.54, + "learning_rate": 4.7588229733163834e-05, + "logits/chosen": -1.7874306440353394, + "logits/rejected": -1.8620976209640503, + "logps/chosen": -207.32083129882812, + "logps/rejected": -205.32540893554688, + "loss": 0.6809, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5504447817802429, + "rewards/margins": 0.07099004089832306, + "rewards/rejected": -0.6214348077774048, + "step": 409 + }, + { + "epoch": 0.54, + "learning_rate": 4.757285234384169e-05, + "logits/chosen": -2.2311081886291504, + "logits/rejected": -2.1636242866516113, + "logps/chosen": -165.6267852783203, + "logps/rejected": -168.37962341308594, + "loss": 0.7484, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8211495280265808, + "rewards/margins": -0.0740201398730278, + "rewards/rejected": -0.7471294403076172, + "step": 410 + }, + { + "epoch": 0.54, + "learning_rate": 4.755742858794503e-05, + "logits/chosen": -2.26678729057312, + "logits/rejected": -2.2344679832458496, + "logps/chosen": -157.74423217773438, + "logps/rejected": -152.1922149658203, + "loss": 0.7029, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.18589326739311218, + "rewards/margins": 0.01804957166314125, + "rewards/rejected": -0.20394286513328552, + "step": 411 + }, + { + "epoch": 0.54, + "learning_rate": 4.754195849715557e-05, + "logits/chosen": -2.143720865249634, + "logits/rejected": -2.176121711730957, + "logps/chosen": -171.01498413085938, + "logps/rejected": -174.25668334960938, + "loss": 0.7835, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.34361717104911804, + "rewards/margins": -0.003480616956949234, + "rewards/rejected": -0.3401365578174591, + "step": 412 + }, + { + "epoch": 0.54, + "learning_rate": 4.75264421032502e-05, + "logits/chosen": -2.2774815559387207, + "logits/rejected": -2.2418551445007324, + "logps/chosen": -183.46746826171875, + "logps/rejected": -172.81829833984375, + "loss": 0.6842, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.40849414467811584, + "rewards/margins": 0.09351891279220581, + "rewards/rejected": -0.5020129680633545, + "step": 413 + }, + { + "epoch": 0.54, + "learning_rate": 4.751087943810093e-05, + "logits/chosen": -1.9202462434768677, + "logits/rejected": -1.9200413227081299, + "logps/chosen": -178.16827392578125, + "logps/rejected": -163.49049377441406, + "loss": 0.7876, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.6583839058876038, + "rewards/margins": -0.0725487470626831, + "rewards/rejected": -0.5858351588249207, + "step": 414 + }, + { + "epoch": 0.54, + "learning_rate": 4.749527053367481e-05, + "logits/chosen": -2.3440024852752686, + "logits/rejected": -2.306640386581421, + "logps/chosen": -204.37374877929688, + "logps/rejected": -196.37139892578125, + "loss": 0.7817, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.3725470006465912, + "rewards/margins": -0.10262566059827805, + "rewards/rejected": -0.26992136240005493, + "step": 415 + }, + { + "epoch": 0.54, + "learning_rate": 4.747961542203386e-05, + "logits/chosen": -2.206644296646118, + "logits/rejected": -2.216529607772827, + "logps/chosen": -145.55783081054688, + "logps/rejected": -149.01528930664062, + "loss": 0.651, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.29852521419525146, + "rewards/margins": 0.12249046564102173, + "rewards/rejected": -0.42101573944091797, + "step": 416 + }, + { + "epoch": 0.55, + "learning_rate": 4.746391413533503e-05, + "logits/chosen": -1.9940528869628906, + "logits/rejected": -1.959697961807251, + "logps/chosen": -162.80947875976562, + "logps/rejected": -181.39344787597656, + "loss": 0.6996, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5685576796531677, + "rewards/margins": 0.06293636560440063, + "rewards/rejected": -0.6314940452575684, + "step": 417 + }, + { + "epoch": 0.55, + "learning_rate": 4.74481667058301e-05, + "logits/chosen": -2.064054250717163, + "logits/rejected": -2.1215896606445312, + "logps/chosen": -154.61865234375, + "logps/rejected": -155.92892456054688, + "loss": 0.6675, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4008215069770813, + "rewards/margins": 0.12303955852985382, + "rewards/rejected": -0.5238610506057739, + "step": 418 + }, + { + "epoch": 0.55, + "learning_rate": 4.743237316586564e-05, + "logits/chosen": -2.2184903621673584, + "logits/rejected": -2.3070037364959717, + "logps/chosen": -149.61441040039062, + "logps/rejected": -151.4566650390625, + "loss": 0.7559, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.3738159239292145, + "rewards/margins": -0.02443253993988037, + "rewards/rejected": -0.3493833541870117, + "step": 419 + }, + { + "epoch": 0.55, + "learning_rate": 4.741653354788295e-05, + "logits/chosen": -2.168128252029419, + "logits/rejected": -2.1225526332855225, + "logps/chosen": -160.09083557128906, + "logps/rejected": -169.74386596679688, + "loss": 0.7091, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5666425228118896, + "rewards/margins": 0.06661619246006012, + "rewards/rejected": -0.6332587003707886, + "step": 420 + }, + { + "epoch": 0.55, + "learning_rate": 4.7400647884417956e-05, + "logits/chosen": -1.9577604532241821, + "logits/rejected": -1.9241951704025269, + "logps/chosen": -207.59442138671875, + "logps/rejected": -217.36038208007812, + "loss": 0.6921, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.37186557054519653, + "rewards/margins": 0.10641665011644363, + "rewards/rejected": -0.47828227281570435, + "step": 421 + }, + { + "epoch": 0.55, + "learning_rate": 4.7384716208101166e-05, + "logits/chosen": -2.064387559890747, + "logits/rejected": -2.022218942642212, + "logps/chosen": -189.0228271484375, + "logps/rejected": -185.25689697265625, + "loss": 0.6958, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.4377724528312683, + "rewards/margins": 0.09676932543516159, + "rewards/rejected": -0.5345417857170105, + "step": 422 + }, + { + "epoch": 0.55, + "learning_rate": 4.736873855165762e-05, + "logits/chosen": -2.2576003074645996, + "logits/rejected": -2.309368848800659, + "logps/chosen": -180.17724609375, + "logps/rejected": -186.60797119140625, + "loss": 0.7273, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.19769221544265747, + "rewards/margins": 0.021774642169475555, + "rewards/rejected": -0.21946686506271362, + "step": 423 + }, + { + "epoch": 0.55, + "learning_rate": 4.735271494790678e-05, + "logits/chosen": -2.0674479007720947, + "logits/rejected": -2.043943166732788, + "logps/chosen": -189.95338439941406, + "logps/rejected": -188.52682495117188, + "loss": 0.8214, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.30729472637176514, + "rewards/margins": -0.16690713167190552, + "rewards/rejected": -0.14038759469985962, + "step": 424 + }, + { + "epoch": 0.56, + "learning_rate": 4.733664542976253e-05, + "logits/chosen": -2.2818522453308105, + "logits/rejected": -2.243852138519287, + "logps/chosen": -139.22390747070312, + "logps/rejected": -138.7141876220703, + "loss": 0.6405, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1333819478750229, + "rewards/margins": 0.24985717236995697, + "rewards/rejected": -0.38323909044265747, + "step": 425 + }, + { + "epoch": 0.56, + "learning_rate": 4.732053003023301e-05, + "logits/chosen": -2.3318722248077393, + "logits/rejected": -2.331906795501709, + "logps/chosen": -170.11148071289062, + "logps/rejected": -154.28684997558594, + "loss": 0.8989, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.4540347456932068, + "rewards/margins": -0.20150336623191833, + "rewards/rejected": -0.25253134965896606, + "step": 426 + }, + { + "epoch": 0.56, + "learning_rate": 4.730436878242064e-05, + "logits/chosen": -2.0526726245880127, + "logits/rejected": -2.094531536102295, + "logps/chosen": -148.9898681640625, + "logps/rejected": -150.51864624023438, + "loss": 0.6562, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.34212395548820496, + "rewards/margins": 0.1897253841161728, + "rewards/rejected": -0.5318493247032166, + "step": 427 + }, + { + "epoch": 0.56, + "learning_rate": 4.7288161719522016e-05, + "logits/chosen": -2.161623954772949, + "logits/rejected": -2.1211910247802734, + "logps/chosen": -151.83523559570312, + "logps/rejected": -140.38848876953125, + "loss": 0.7175, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.545448899269104, + "rewards/margins": 0.03246283903717995, + "rewards/rejected": -0.5779117345809937, + "step": 428 + }, + { + "epoch": 0.56, + "learning_rate": 4.727190887482783e-05, + "logits/chosen": -2.1172046661376953, + "logits/rejected": -2.05790376663208, + "logps/chosen": -141.56634521484375, + "logps/rejected": -152.59849548339844, + "loss": 0.7776, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.29310688376426697, + "rewards/margins": 0.007600661367177963, + "rewards/rejected": -0.30070751905441284, + "step": 429 + }, + { + "epoch": 0.56, + "learning_rate": 4.725561028172282e-05, + "logits/chosen": -2.2756776809692383, + "logits/rejected": -2.261565685272217, + "logps/chosen": -121.21385955810547, + "logps/rejected": -120.08661651611328, + "loss": 0.6775, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.21489115059375763, + "rewards/margins": 0.12781137228012085, + "rewards/rejected": -0.3427025377750397, + "step": 430 + }, + { + "epoch": 0.56, + "learning_rate": 4.7239265973685696e-05, + "logits/chosen": -2.1066219806671143, + "logits/rejected": -2.201887845993042, + "logps/chosen": -129.29530334472656, + "logps/rejected": -147.51333618164062, + "loss": 0.7801, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6134562492370605, + "rewards/margins": -0.030932441353797913, + "rewards/rejected": -0.5825238227844238, + "step": 431 + }, + { + "epoch": 0.57, + "learning_rate": 4.722287598428907e-05, + "logits/chosen": -2.0897793769836426, + "logits/rejected": -2.091312885284424, + "logps/chosen": -150.09129333496094, + "logps/rejected": -157.86312866210938, + "loss": 0.7574, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5798418521881104, + "rewards/margins": -0.0995514839887619, + "rewards/rejected": -0.48029035329818726, + "step": 432 + }, + { + "epoch": 0.57, + "learning_rate": 4.720644034719938e-05, + "logits/chosen": -2.1701114177703857, + "logits/rejected": -2.149559259414673, + "logps/chosen": -178.9230499267578, + "logps/rejected": -163.4266815185547, + "loss": 0.7959, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5811534523963928, + "rewards/margins": -0.08480577170848846, + "rewards/rejected": -0.49634772539138794, + "step": 433 + }, + { + "epoch": 0.57, + "learning_rate": 4.7189959096176825e-05, + "logits/chosen": -2.404374837875366, + "logits/rejected": -2.407968521118164, + "logps/chosen": -149.89866638183594, + "logps/rejected": -149.10494995117188, + "loss": 0.7978, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.4539230465888977, + "rewards/margins": -0.16405440866947174, + "rewards/rejected": -0.2898685932159424, + "step": 434 + }, + { + "epoch": 0.57, + "learning_rate": 4.7173432265075334e-05, + "logits/chosen": -2.3623311519622803, + "logits/rejected": -2.344980239868164, + "logps/chosen": -233.85638427734375, + "logps/rejected": -233.85716247558594, + "loss": 0.7299, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4674505591392517, + "rewards/margins": 0.010146500542759895, + "rewards/rejected": -0.47759705781936646, + "step": 435 + }, + { + "epoch": 0.57, + "learning_rate": 4.7156859887842416e-05, + "logits/chosen": -2.444455623626709, + "logits/rejected": -2.4147419929504395, + "logps/chosen": -157.481201171875, + "logps/rejected": -155.94912719726562, + "loss": 0.8272, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4809521436691284, + "rewards/margins": -0.1707230508327484, + "rewards/rejected": -0.3102290630340576, + "step": 436 + }, + { + "epoch": 0.57, + "learning_rate": 4.714024199851915e-05, + "logits/chosen": -2.13613224029541, + "logits/rejected": -2.11737060546875, + "logps/chosen": -196.46791076660156, + "logps/rejected": -206.36705017089844, + "loss": 0.6643, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.41040414571762085, + "rewards/margins": 0.20290464162826538, + "rewards/rejected": -0.613308846950531, + "step": 437 + }, + { + "epoch": 0.57, + "learning_rate": 4.712357863124013e-05, + "logits/chosen": -2.302917003631592, + "logits/rejected": -2.30071759223938, + "logps/chosen": -153.2189483642578, + "logps/rejected": -148.35671997070312, + "loss": 0.8639, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5300933122634888, + "rewards/margins": -0.23618030548095703, + "rewards/rejected": -0.29391294717788696, + "step": 438 + }, + { + "epoch": 0.57, + "learning_rate": 4.710686982023332e-05, + "logits/chosen": -2.1037135124206543, + "logits/rejected": -2.084882974624634, + "logps/chosen": -171.69317626953125, + "logps/rejected": -184.76455688476562, + "loss": 0.8317, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.7027316093444824, + "rewards/margins": -0.20845447480678558, + "rewards/rejected": -0.49427708983421326, + "step": 439 + }, + { + "epoch": 0.58, + "learning_rate": 4.709011559982006e-05, + "logits/chosen": -2.315164566040039, + "logits/rejected": -2.35860538482666, + "logps/chosen": -142.0930938720703, + "logps/rejected": -144.05953979492188, + "loss": 0.538, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2587888836860657, + "rewards/margins": 0.43140506744384766, + "rewards/rejected": -0.6901938915252686, + "step": 440 + }, + { + "epoch": 0.58, + "learning_rate": 4.707331600441495e-05, + "logits/chosen": -2.025707483291626, + "logits/rejected": -2.1452269554138184, + "logps/chosen": -135.1129608154297, + "logps/rejected": -180.877685546875, + "loss": 0.7269, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.48439598083496094, + "rewards/margins": 0.12395285069942474, + "rewards/rejected": -0.6083488464355469, + "step": 441 + }, + { + "epoch": 0.58, + "learning_rate": 4.705647106852581e-05, + "logits/chosen": -2.1537985801696777, + "logits/rejected": -2.1138670444488525, + "logps/chosen": -161.801025390625, + "logps/rejected": -152.15338134765625, + "loss": 0.6527, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5599794983863831, + "rewards/margins": 0.14886192977428436, + "rewards/rejected": -0.7088414430618286, + "step": 442 + }, + { + "epoch": 0.58, + "learning_rate": 4.7039580826753564e-05, + "logits/chosen": -1.9758260250091553, + "logits/rejected": -2.015881299972534, + "logps/chosen": -184.20703125, + "logps/rejected": -191.9273681640625, + "loss": 0.6208, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.572418749332428, + "rewards/margins": 0.2520812749862671, + "rewards/rejected": -0.8244999647140503, + "step": 443 + }, + { + "epoch": 0.58, + "learning_rate": 4.7022645313792235e-05, + "logits/chosen": -2.2235710620880127, + "logits/rejected": -2.2551536560058594, + "logps/chosen": -152.44741821289062, + "logps/rejected": -160.27525329589844, + "loss": 0.792, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.53495192527771, + "rewards/margins": -0.10131815075874329, + "rewards/rejected": -0.4336337447166443, + "step": 444 + }, + { + "epoch": 0.58, + "learning_rate": 4.700566456442882e-05, + "logits/chosen": -2.0654869079589844, + "logits/rejected": -2.1601076126098633, + "logps/chosen": -129.15142822265625, + "logps/rejected": -140.05062866210938, + "loss": 0.7985, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.4286402463912964, + "rewards/margins": -0.08634166419506073, + "rewards/rejected": -0.3422985374927521, + "step": 445 + }, + { + "epoch": 0.58, + "learning_rate": 4.6988638613543216e-05, + "logits/chosen": -2.106463670730591, + "logits/rejected": -2.1899609565734863, + "logps/chosen": -207.76812744140625, + "logps/rejected": -217.55172729492188, + "loss": 0.7031, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.49319323897361755, + "rewards/margins": 0.05841163173317909, + "rewards/rejected": -0.5516048669815063, + "step": 446 + }, + { + "epoch": 0.58, + "learning_rate": 4.6971567496108206e-05, + "logits/chosen": -2.287020683288574, + "logits/rejected": -2.302515745162964, + "logps/chosen": -146.24276733398438, + "logps/rejected": -157.62596130371094, + "loss": 0.7239, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.35381338000297546, + "rewards/margins": -0.004093277268111706, + "rewards/rejected": -0.3497200906276703, + "step": 447 + }, + { + "epoch": 0.59, + "learning_rate": 4.695445124718931e-05, + "logits/chosen": -2.316082000732422, + "logits/rejected": -2.2804858684539795, + "logps/chosen": -172.0008087158203, + "logps/rejected": -177.33901977539062, + "loss": 0.7188, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4350382685661316, + "rewards/margins": -0.01686153933405876, + "rewards/rejected": -0.41817668080329895, + "step": 448 + }, + { + "epoch": 0.59, + "learning_rate": 4.693728990194479e-05, + "logits/chosen": -2.356797695159912, + "logits/rejected": -2.373162269592285, + "logps/chosen": -187.43911743164062, + "logps/rejected": -201.7132568359375, + "loss": 0.6999, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3604944050312042, + "rewards/margins": 0.06350287795066833, + "rewards/rejected": -0.42399728298187256, + "step": 449 + }, + { + "epoch": 0.59, + "learning_rate": 4.692008349562551e-05, + "logits/chosen": -2.3371644020080566, + "logits/rejected": -2.244069814682007, + "logps/chosen": -157.56373596191406, + "logps/rejected": -158.41690063476562, + "loss": 0.7745, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.7513232827186584, + "rewards/margins": -0.11284235119819641, + "rewards/rejected": -0.6384809613227844, + "step": 450 + }, + { + "epoch": 0.59, + "learning_rate": 4.690283206357491e-05, + "logits/chosen": -1.9031703472137451, + "logits/rejected": -1.8866480588912964, + "logps/chosen": -137.91941833496094, + "logps/rejected": -149.28627014160156, + "loss": 0.6434, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.44801104068756104, + "rewards/margins": 0.23882272839546204, + "rewards/rejected": -0.6868337392807007, + "step": 451 + }, + { + "epoch": 0.59, + "learning_rate": 4.6885535641228904e-05, + "logits/chosen": -2.1832618713378906, + "logits/rejected": -2.121121406555176, + "logps/chosen": -158.70449829101562, + "logps/rejected": -139.44195556640625, + "loss": 0.7216, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.542538046836853, + "rewards/margins": -0.03218982741236687, + "rewards/rejected": -0.5103481411933899, + "step": 452 + }, + { + "epoch": 0.59, + "learning_rate": 4.6868194264115833e-05, + "logits/chosen": -2.2379095554351807, + "logits/rejected": -2.124793291091919, + "logps/chosen": -171.65277099609375, + "logps/rejected": -158.50015258789062, + "loss": 0.7441, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5649640560150146, + "rewards/margins": -0.034692008048295975, + "rewards/rejected": -0.5302720665931702, + "step": 453 + }, + { + "epoch": 0.59, + "learning_rate": 4.685080796785637e-05, + "logits/chosen": -2.2778732776641846, + "logits/rejected": -2.2594337463378906, + "logps/chosen": -151.48411560058594, + "logps/rejected": -149.89930725097656, + "loss": 0.6605, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.48167526721954346, + "rewards/margins": 0.17855703830718994, + "rewards/rejected": -0.6602323651313782, + "step": 454 + }, + { + "epoch": 0.6, + "learning_rate": 4.683337678816345e-05, + "logits/chosen": -2.148597478866577, + "logits/rejected": -2.1484172344207764, + "logps/chosen": -143.36769104003906, + "logps/rejected": -158.61419677734375, + "loss": 0.6751, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5770382881164551, + "rewards/margins": 0.10789009928703308, + "rewards/rejected": -0.6849284172058105, + "step": 455 + }, + { + "epoch": 0.6, + "learning_rate": 4.6815900760842236e-05, + "logits/chosen": -2.4137473106384277, + "logits/rejected": -2.4021241664886475, + "logps/chosen": -154.04502868652344, + "logps/rejected": -157.95596313476562, + "loss": 0.5879, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4364089369773865, + "rewards/margins": 0.2789525091648102, + "rewards/rejected": -0.715361475944519, + "step": 456 + }, + { + "epoch": 0.6, + "learning_rate": 4.679837992178996e-05, + "logits/chosen": -2.3135428428649902, + "logits/rejected": -2.2878835201263428, + "logps/chosen": -181.33885192871094, + "logps/rejected": -168.310546875, + "loss": 0.8835, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.7887184619903564, + "rewards/margins": -0.2950814366340637, + "rewards/rejected": -0.4936370253562927, + "step": 457 + }, + { + "epoch": 0.6, + "learning_rate": 4.678081430699594e-05, + "logits/chosen": -2.263319492340088, + "logits/rejected": -2.260213613510132, + "logps/chosen": -159.97354125976562, + "logps/rejected": -173.87554931640625, + "loss": 0.7764, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6154264211654663, + "rewards/margins": 0.04768490046262741, + "rewards/rejected": -0.6631112694740295, + "step": 458 + }, + { + "epoch": 0.6, + "learning_rate": 4.676320395254146e-05, + "logits/chosen": -2.1689982414245605, + "logits/rejected": -2.0917975902557373, + "logps/chosen": -199.61489868164062, + "logps/rejected": -203.17724609375, + "loss": 0.684, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5080626606941223, + "rewards/margins": 0.14702026546001434, + "rewards/rejected": -0.6550828218460083, + "step": 459 + }, + { + "epoch": 0.6, + "learning_rate": 4.674554889459968e-05, + "logits/chosen": -2.080754518508911, + "logits/rejected": -2.030247449874878, + "logps/chosen": -142.29052734375, + "logps/rejected": -165.71142578125, + "loss": 0.7408, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3904760479927063, + "rewards/margins": 0.15203596651554108, + "rewards/rejected": -0.542512059211731, + "step": 460 + }, + { + "epoch": 0.6, + "learning_rate": 4.672784916943562e-05, + "logits/chosen": -1.9848957061767578, + "logits/rejected": -1.9429978132247925, + "logps/chosen": -186.40225219726562, + "logps/rejected": -179.22129821777344, + "loss": 0.7617, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.8649554252624512, + "rewards/margins": -0.06034373492002487, + "rewards/rejected": -0.8046116828918457, + "step": 461 + }, + { + "epoch": 0.6, + "learning_rate": 4.6710104813406034e-05, + "logits/chosen": -2.0926647186279297, + "logits/rejected": -2.121751070022583, + "logps/chosen": -170.80023193359375, + "logps/rejected": -175.02835083007812, + "loss": 0.7499, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.61098712682724, + "rewards/margins": 0.014241974800825119, + "rewards/rejected": -0.6252290606498718, + "step": 462 + }, + { + "epoch": 0.61, + "learning_rate": 4.669231586295934e-05, + "logits/chosen": -1.9556676149368286, + "logits/rejected": -1.9728052616119385, + "logps/chosen": -213.68606567382812, + "logps/rejected": -174.6243896484375, + "loss": 0.8219, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.6639366149902344, + "rewards/margins": -0.16740846633911133, + "rewards/rejected": -0.49652814865112305, + "step": 463 + }, + { + "epoch": 0.61, + "learning_rate": 4.667448235463557e-05, + "logits/chosen": -2.060239553451538, + "logits/rejected": -2.061102867126465, + "logps/chosen": -137.50906372070312, + "logps/rejected": -128.2222137451172, + "loss": 0.818, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.2569347321987152, + "rewards/margins": -0.15142616629600525, + "rewards/rejected": -0.10550854355096817, + "step": 464 + }, + { + "epoch": 0.61, + "learning_rate": 4.665660432506629e-05, + "logits/chosen": -2.1114845275878906, + "logits/rejected": -2.12737774848938, + "logps/chosen": -169.5026397705078, + "logps/rejected": -141.35501098632812, + "loss": 0.7919, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.30224573612213135, + "rewards/margins": -0.09046731889247894, + "rewards/rejected": -0.2117784172296524, + "step": 465 + }, + { + "epoch": 0.61, + "learning_rate": 4.6638681810974496e-05, + "logits/chosen": -2.0212507247924805, + "logits/rejected": -2.1288084983825684, + "logps/chosen": -147.18545532226562, + "logps/rejected": -166.57452392578125, + "loss": 0.7137, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.31484949588775635, + "rewards/margins": 0.05684095248579979, + "rewards/rejected": -0.37169045209884644, + "step": 466 + }, + { + "epoch": 0.61, + "learning_rate": 4.6620714849174576e-05, + "logits/chosen": -2.2373719215393066, + "logits/rejected": -2.2259066104888916, + "logps/chosen": -221.53643798828125, + "logps/rejected": -209.2789306640625, + "loss": 0.6621, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.18473269045352936, + "rewards/margins": 0.11960664391517639, + "rewards/rejected": -0.30433934926986694, + "step": 467 + }, + { + "epoch": 0.61, + "learning_rate": 4.660270347657219e-05, + "logits/chosen": -2.235313892364502, + "logits/rejected": -2.207859992980957, + "logps/chosen": -145.51107788085938, + "logps/rejected": -132.99411010742188, + "loss": 0.8067, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.4317854642868042, + "rewards/margins": -0.16359470784664154, + "rewards/rejected": -0.26819074153900146, + "step": 468 + }, + { + "epoch": 0.61, + "learning_rate": 4.658464773016428e-05, + "logits/chosen": -2.1742892265319824, + "logits/rejected": -2.266177177429199, + "logps/chosen": -142.66419982910156, + "logps/rejected": -159.65928649902344, + "loss": 0.6783, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.25551578402519226, + "rewards/margins": 0.11744444072246552, + "rewards/rejected": -0.3729602098464966, + "step": 469 + }, + { + "epoch": 0.62, + "learning_rate": 4.6566547647038864e-05, + "logits/chosen": -2.1665616035461426, + "logits/rejected": -2.1410114765167236, + "logps/chosen": -172.94570922851562, + "logps/rejected": -167.9188995361328, + "loss": 0.7223, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.34934595227241516, + "rewards/margins": 0.025624670088291168, + "rewards/rejected": -0.37497058510780334, + "step": 470 + }, + { + "epoch": 0.62, + "learning_rate": 4.6548403264375074e-05, + "logits/chosen": -2.055755615234375, + "logits/rejected": -2.020624876022339, + "logps/chosen": -150.6020965576172, + "logps/rejected": -185.5800323486328, + "loss": 0.6847, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5740702748298645, + "rewards/margins": 0.09851770102977753, + "rewards/rejected": -0.6725879907608032, + "step": 471 + }, + { + "epoch": 0.62, + "learning_rate": 4.6530214619443037e-05, + "logits/chosen": -2.0328664779663086, + "logits/rejected": -2.0425925254821777, + "logps/chosen": -167.3757781982422, + "logps/rejected": -173.4803466796875, + "loss": 0.715, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.4150983989238739, + "rewards/margins": 0.005986988544464111, + "rewards/rejected": -0.421085387468338, + "step": 472 + }, + { + "epoch": 0.62, + "learning_rate": 4.6511981749603775e-05, + "logits/chosen": -1.9052200317382812, + "logits/rejected": -1.998968243598938, + "logps/chosen": -123.46515655517578, + "logps/rejected": -141.20614624023438, + "loss": 0.6627, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.01827160082757473, + "rewards/margins": 0.1650063842535019, + "rewards/rejected": -0.14673477411270142, + "step": 473 + }, + { + "epoch": 0.62, + "learning_rate": 4.6493704692309175e-05, + "logits/chosen": -2.1721608638763428, + "logits/rejected": -2.181016445159912, + "logps/chosen": -169.97547912597656, + "logps/rejected": -174.20501708984375, + "loss": 0.6978, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.34234559535980225, + "rewards/margins": 0.039180606603622437, + "rewards/rejected": -0.3815262019634247, + "step": 474 + }, + { + "epoch": 0.62, + "learning_rate": 4.647538348510189e-05, + "logits/chosen": -2.042267084121704, + "logits/rejected": -2.0465734004974365, + "logps/chosen": -141.28273010253906, + "logps/rejected": -137.6270294189453, + "loss": 0.7715, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.45353037118911743, + "rewards/margins": -0.10618551820516586, + "rewards/rejected": -0.34734484553337097, + "step": 475 + }, + { + "epoch": 0.62, + "learning_rate": 4.645701816561523e-05, + "logits/chosen": -2.080418825149536, + "logits/rejected": -2.0591931343078613, + "logps/chosen": -156.5729217529297, + "logps/rejected": -160.69155883789062, + "loss": 0.8914, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.3119348883628845, + "rewards/margins": -0.33930426836013794, + "rewards/rejected": 0.027369357645511627, + "step": 476 + }, + { + "epoch": 0.62, + "learning_rate": 4.643860877157314e-05, + "logits/chosen": -1.639522671699524, + "logits/rejected": -1.6502141952514648, + "logps/chosen": -178.24334716796875, + "logps/rejected": -169.12493896484375, + "loss": 0.7049, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.44444751739501953, + "rewards/margins": 0.014572631567716599, + "rewards/rejected": -0.45902013778686523, + "step": 477 + }, + { + "epoch": 0.63, + "learning_rate": 4.642015534079012e-05, + "logits/chosen": -2.047992467880249, + "logits/rejected": -2.0363173484802246, + "logps/chosen": -134.27391052246094, + "logps/rejected": -123.03015899658203, + "loss": 0.7044, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.21198835968971252, + "rewards/margins": 0.025383614003658295, + "rewards/rejected": -0.23737198114395142, + "step": 478 + }, + { + "epoch": 0.63, + "learning_rate": 4.640165791117106e-05, + "logits/chosen": -2.0618531703948975, + "logits/rejected": -2.0514118671417236, + "logps/chosen": -152.69686889648438, + "logps/rejected": -135.8447265625, + "loss": 0.6722, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.04251031577587128, + "rewards/margins": 0.09286715090274811, + "rewards/rejected": -0.13537748157978058, + "step": 479 + }, + { + "epoch": 0.63, + "learning_rate": 4.63831165207113e-05, + "logits/chosen": -2.015552282333374, + "logits/rejected": -1.9658578634262085, + "logps/chosen": -143.47520446777344, + "logps/rejected": -146.58863830566406, + "loss": 0.8025, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.14033322036266327, + "rewards/margins": -0.1578764021396637, + "rewards/rejected": 0.01754317432641983, + "step": 480 + }, + { + "epoch": 0.63, + "learning_rate": 4.6364531207496426e-05, + "logits/chosen": -2.1612448692321777, + "logits/rejected": -2.099830389022827, + "logps/chosen": -161.23220825195312, + "logps/rejected": -143.65408325195312, + "loss": 0.7213, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.29882240295410156, + "rewards/margins": -0.008122054859995842, + "rewards/rejected": -0.2907003164291382, + "step": 481 + }, + { + "epoch": 0.63, + "learning_rate": 4.634590200970227e-05, + "logits/chosen": -1.8700077533721924, + "logits/rejected": -1.9340232610702515, + "logps/chosen": -160.71038818359375, + "logps/rejected": -151.01939392089844, + "loss": 0.7197, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3910747170448303, + "rewards/margins": 0.08369296789169312, + "rewards/rejected": -0.47476768493652344, + "step": 482 + }, + { + "epoch": 0.63, + "learning_rate": 4.632722896559481e-05, + "logits/chosen": -2.1195974349975586, + "logits/rejected": -2.1336312294006348, + "logps/chosen": -151.23971557617188, + "logps/rejected": -160.65672302246094, + "loss": 0.78, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.08127432316541672, + "rewards/margins": -0.11872512847185135, + "rewards/rejected": 0.19999945163726807, + "step": 483 + }, + { + "epoch": 0.63, + "learning_rate": 4.630851211353007e-05, + "logits/chosen": -2.0612592697143555, + "logits/rejected": -2.0779013633728027, + "logps/chosen": -131.81069946289062, + "logps/rejected": -129.8345489501953, + "loss": 0.689, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.03394623100757599, + "rewards/margins": 0.08058298379182816, + "rewards/rejected": -0.11452920734882355, + "step": 484 + }, + { + "epoch": 0.63, + "learning_rate": 4.628975149195407e-05, + "logits/chosen": -1.9617186784744263, + "logits/rejected": -1.9468779563903809, + "logps/chosen": -130.2001495361328, + "logps/rejected": -125.31912994384766, + "loss": 0.8841, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.41198205947875977, + "rewards/margins": -0.24151001870632172, + "rewards/rejected": -0.17047205567359924, + "step": 485 + }, + { + "epoch": 0.64, + "learning_rate": 4.6270947139402744e-05, + "logits/chosen": -2.0373647212982178, + "logits/rejected": -2.0272598266601562, + "logps/chosen": -161.21817016601562, + "logps/rejected": -160.80905151367188, + "loss": 0.5759, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.23811860382556915, + "rewards/margins": 0.3550880253314972, + "rewards/rejected": -0.5932066440582275, + "step": 486 + }, + { + "epoch": 0.64, + "learning_rate": 4.6252099094501834e-05, + "logits/chosen": -2.263495922088623, + "logits/rejected": -2.220659017562866, + "logps/chosen": -150.18930053710938, + "logps/rejected": -147.73287963867188, + "loss": 0.6553, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.039855338633060455, + "rewards/margins": 0.12497195601463318, + "rewards/rejected": -0.16482731699943542, + "step": 487 + }, + { + "epoch": 0.64, + "learning_rate": 4.623320739596685e-05, + "logits/chosen": -2.1021156311035156, + "logits/rejected": -2.120116949081421, + "logps/chosen": -153.40098571777344, + "logps/rejected": -157.4463653564453, + "loss": 0.7669, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09802110493183136, + "rewards/margins": -0.008432537317276001, + "rewards/rejected": -0.08958857506513596, + "step": 488 + }, + { + "epoch": 0.64, + "learning_rate": 4.621427208260296e-05, + "logits/chosen": -1.9562160968780518, + "logits/rejected": -2.006755828857422, + "logps/chosen": -165.9380340576172, + "logps/rejected": -173.2657928466797, + "loss": 0.7826, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6031227111816406, + "rewards/margins": -0.10150633007287979, + "rewards/rejected": -0.501616358757019, + "step": 489 + }, + { + "epoch": 0.64, + "learning_rate": 4.6195293193304915e-05, + "logits/chosen": -1.9626574516296387, + "logits/rejected": -1.9933884143829346, + "logps/chosen": -187.50564575195312, + "logps/rejected": -184.9365234375, + "loss": 0.7809, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.05860882252454758, + "rewards/margins": -0.11062480509281158, + "rewards/rejected": 0.05201598256826401, + "step": 490 + }, + { + "epoch": 0.64, + "learning_rate": 4.6176270767056976e-05, + "logits/chosen": -2.0054826736450195, + "logits/rejected": -2.002007484436035, + "logps/chosen": -172.8408966064453, + "logps/rejected": -165.1022491455078, + "loss": 0.7222, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1988084763288498, + "rewards/margins": -0.013871286064386368, + "rewards/rejected": -0.18493719398975372, + "step": 491 + }, + { + "epoch": 0.64, + "learning_rate": 4.615720484293286e-05, + "logits/chosen": -2.0947346687316895, + "logits/rejected": -2.138517379760742, + "logps/chosen": -148.93450927734375, + "logps/rejected": -144.9428253173828, + "loss": 0.6561, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.12516680359840393, + "rewards/margins": 0.15067778527736664, + "rewards/rejected": -0.2758445739746094, + "step": 492 + }, + { + "epoch": 0.65, + "learning_rate": 4.613809546009558e-05, + "logits/chosen": -2.115553140640259, + "logits/rejected": -2.149724006652832, + "logps/chosen": -153.2530059814453, + "logps/rejected": -160.34811401367188, + "loss": 0.6297, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14684545993804932, + "rewards/margins": 0.21833759546279907, + "rewards/rejected": -0.365183025598526, + "step": 493 + }, + { + "epoch": 0.65, + "learning_rate": 4.611894265779748e-05, + "logits/chosen": -2.068711280822754, + "logits/rejected": -2.126939058303833, + "logps/chosen": -139.7198486328125, + "logps/rejected": -151.35397338867188, + "loss": 0.6535, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08141559362411499, + "rewards/margins": 0.1623344123363495, + "rewards/rejected": -0.24375002086162567, + "step": 494 + }, + { + "epoch": 0.65, + "learning_rate": 4.609974647538003e-05, + "logits/chosen": -2.225661516189575, + "logits/rejected": -2.150395393371582, + "logps/chosen": -132.438232421875, + "logps/rejected": -128.7378692626953, + "loss": 0.7277, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.009231429547071457, + "rewards/margins": -0.0193635243922472, + "rewards/rejected": 0.02859494648873806, + "step": 495 + }, + { + "epoch": 0.65, + "learning_rate": 4.608050695227385e-05, + "logits/chosen": -2.0139918327331543, + "logits/rejected": -2.0279946327209473, + "logps/chosen": -163.32350158691406, + "logps/rejected": -148.53485107421875, + "loss": 0.7111, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04781512916088104, + "rewards/margins": 0.12081319838762283, + "rewards/rejected": -0.16862833499908447, + "step": 496 + }, + { + "epoch": 0.65, + "learning_rate": 4.606122412799857e-05, + "logits/chosen": -2.3113937377929688, + "logits/rejected": -2.31269907951355, + "logps/chosen": -147.7089080810547, + "logps/rejected": -147.32814025878906, + "loss": 0.6941, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2833206355571747, + "rewards/margins": 0.048162560909986496, + "rewards/rejected": -0.3314831852912903, + "step": 497 + }, + { + "epoch": 0.65, + "learning_rate": 4.6041898042162764e-05, + "logits/chosen": -2.3534443378448486, + "logits/rejected": -2.3750576972961426, + "logps/chosen": -179.6420440673828, + "logps/rejected": -186.58526611328125, + "loss": 0.8517, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.15785303711891174, + "rewards/margins": -0.20781296491622925, + "rewards/rejected": 0.049959927797317505, + "step": 498 + }, + { + "epoch": 0.65, + "learning_rate": 4.602252873446386e-05, + "logits/chosen": -1.7936934232711792, + "logits/rejected": -1.7282207012176514, + "logps/chosen": -171.0880584716797, + "logps/rejected": -180.4699249267578, + "loss": 0.7747, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.37305474281311035, + "rewards/margins": 0.08523893356323242, + "rewards/rejected": -0.45829373598098755, + "step": 499 + }, + { + "epoch": 0.65, + "learning_rate": 4.60031162446881e-05, + "logits/chosen": -2.154116153717041, + "logits/rejected": -2.1634771823883057, + "logps/chosen": -142.67947387695312, + "logps/rejected": -141.00306701660156, + "loss": 0.651, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04277772828936577, + "rewards/margins": 0.16890141367912292, + "rewards/rejected": -0.12612366676330566, + "step": 500 + }, + { + "epoch": 0.66, + "learning_rate": 4.5983660612710365e-05, + "logits/chosen": -2.0763444900512695, + "logits/rejected": -2.0053396224975586, + "logps/chosen": -151.91763305664062, + "logps/rejected": -144.047607421875, + "loss": 0.8691, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.48068270087242126, + "rewards/margins": -0.29285871982574463, + "rewards/rejected": -0.18782396614551544, + "step": 501 + }, + { + "epoch": 0.66, + "learning_rate": 4.596416187849423e-05, + "logits/chosen": -2.0705764293670654, + "logits/rejected": -2.0982978343963623, + "logps/chosen": -132.6533203125, + "logps/rejected": -136.65093994140625, + "loss": 0.5925, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.2096860408782959, + "rewards/margins": 0.35339686274528503, + "rewards/rejected": -0.14371080696582794, + "step": 502 + }, + { + "epoch": 0.66, + "learning_rate": 4.5944620082091745e-05, + "logits/chosen": -1.9696085453033447, + "logits/rejected": -2.015946865081787, + "logps/chosen": -253.59783935546875, + "logps/rejected": -241.3708038330078, + "loss": 0.7486, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.22850322723388672, + "rewards/margins": -0.06344452500343323, + "rewards/rejected": -0.16505871713161469, + "step": 503 + }, + { + "epoch": 0.66, + "learning_rate": 4.5925035263643444e-05, + "logits/chosen": -2.2930264472961426, + "logits/rejected": -2.2867448329925537, + "logps/chosen": -167.2974090576172, + "logps/rejected": -155.6397247314453, + "loss": 0.7972, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.23387077450752258, + "rewards/margins": -0.13073894381523132, + "rewards/rejected": -0.10313184559345245, + "step": 504 + }, + { + "epoch": 0.66, + "learning_rate": 4.5905407463378225e-05, + "logits/chosen": -2.336209774017334, + "logits/rejected": -2.3158388137817383, + "logps/chosen": -153.33377075195312, + "logps/rejected": -149.57073974609375, + "loss": 0.6428, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.29021593928337097, + "rewards/margins": 0.20943422615528107, + "rewards/rejected": 0.08078169822692871, + "step": 505 + }, + { + "epoch": 0.66, + "learning_rate": 4.588573672161326e-05, + "logits/chosen": -1.9931033849716187, + "logits/rejected": -1.9577481746673584, + "logps/chosen": -165.14727783203125, + "logps/rejected": -179.18228149414062, + "loss": 0.7458, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.25228357315063477, + "rewards/margins": -0.07357550412416458, + "rewards/rejected": -0.1787080615758896, + "step": 506 + }, + { + "epoch": 0.66, + "learning_rate": 4.586602307875396e-05, + "logits/chosen": -2.2065775394439697, + "logits/rejected": -2.18172550201416, + "logps/chosen": -171.14437866210938, + "logps/rejected": -185.22750854492188, + "loss": 0.799, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.23644599318504333, + "rewards/margins": -0.07532497495412827, + "rewards/rejected": -0.16112102568149567, + "step": 507 + }, + { + "epoch": 0.66, + "learning_rate": 4.5846266575293816e-05, + "logits/chosen": -2.1658072471618652, + "logits/rejected": -2.1649296283721924, + "logps/chosen": -180.37255859375, + "logps/rejected": -158.17051696777344, + "loss": 0.7103, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.09764609485864639, + "rewards/margins": 0.07636680454015732, + "rewards/rejected": -0.1740128993988037, + "step": 508 + }, + { + "epoch": 0.67, + "learning_rate": 4.582646725181441e-05, + "logits/chosen": -1.7266638278961182, + "logits/rejected": -1.8396620750427246, + "logps/chosen": -129.93885803222656, + "logps/rejected": -139.12106323242188, + "loss": 0.6701, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.10675990581512451, + "rewards/margins": 0.09391005337238312, + "rewards/rejected": -0.20066994428634644, + "step": 509 + }, + { + "epoch": 0.67, + "learning_rate": 4.580662514898522e-05, + "logits/chosen": -2.210435628890991, + "logits/rejected": -2.176440477371216, + "logps/chosen": -161.60562133789062, + "logps/rejected": -143.5146484375, + "loss": 0.6319, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.23548352718353271, + "rewards/margins": 0.1876177191734314, + "rewards/rejected": -0.4231012463569641, + "step": 510 + }, + { + "epoch": 0.67, + "learning_rate": 4.5786740307563636e-05, + "logits/chosen": -2.292863130569458, + "logits/rejected": -2.286449432373047, + "logps/chosen": -149.38560485839844, + "logps/rejected": -152.18753051757812, + "loss": 0.694, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.1346094161272049, + "rewards/margins": 0.04607778787612915, + "rewards/rejected": -0.18068721890449524, + "step": 511 + }, + { + "epoch": 0.67, + "learning_rate": 4.576681276839483e-05, + "logits/chosen": -2.000340461730957, + "logits/rejected": -1.9955222606658936, + "logps/chosen": -169.0735321044922, + "logps/rejected": -171.15048217773438, + "loss": 0.7291, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.1468675434589386, + "rewards/margins": 0.021292299032211304, + "rewards/rejected": -0.1681598424911499, + "step": 512 + }, + { + "epoch": 0.67, + "learning_rate": 4.574684257241168e-05, + "logits/chosen": -1.7635064125061035, + "logits/rejected": -1.7762371301651, + "logps/chosen": -179.9252471923828, + "logps/rejected": -179.78445434570312, + "loss": 0.7814, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.15447980165481567, + "rewards/margins": -0.054616428911685944, + "rewards/rejected": -0.09986338019371033, + "step": 513 + }, + { + "epoch": 0.67, + "learning_rate": 4.572682976063468e-05, + "logits/chosen": -2.17221736907959, + "logits/rejected": -2.205554485321045, + "logps/chosen": -124.07170104980469, + "logps/rejected": -144.3170928955078, + "loss": 0.835, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.1368752419948578, + "rewards/margins": -0.1436615288257599, + "rewards/rejected": 0.2805367410182953, + "step": 514 + }, + { + "epoch": 0.67, + "learning_rate": 4.5706774374171854e-05, + "logits/chosen": -2.003601551055908, + "logits/rejected": -2.0354526042938232, + "logps/chosen": -201.1471710205078, + "logps/rejected": -220.0992431640625, + "loss": 0.8975, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.33305513858795166, + "rewards/margins": -0.3173186182975769, + "rewards/rejected": -0.01573648676276207, + "step": 515 + }, + { + "epoch": 0.68, + "learning_rate": 4.56866764542187e-05, + "logits/chosen": -2.144970417022705, + "logits/rejected": -2.187018394470215, + "logps/chosen": -156.69705200195312, + "logps/rejected": -157.2908477783203, + "loss": 0.6765, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.24758291244506836, + "rewards/margins": 0.09244808554649353, + "rewards/rejected": 0.15513482689857483, + "step": 516 + }, + { + "epoch": 0.68, + "learning_rate": 4.566653604205805e-05, + "logits/chosen": -2.15089750289917, + "logits/rejected": -2.1836040019989014, + "logps/chosen": -166.6836395263672, + "logps/rejected": -181.80368041992188, + "loss": 0.7843, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2800065279006958, + "rewards/margins": -0.08594866096973419, + "rewards/rejected": 0.3659551739692688, + "step": 517 + }, + { + "epoch": 0.68, + "learning_rate": 4.5646353179060057e-05, + "logits/chosen": -2.251128673553467, + "logits/rejected": -2.24637770652771, + "logps/chosen": -137.0120849609375, + "logps/rejected": -135.96041870117188, + "loss": 0.6631, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.20585085451602936, + "rewards/margins": 0.15107136964797974, + "rewards/rejected": 0.054779477417469025, + "step": 518 + }, + { + "epoch": 0.68, + "learning_rate": 4.562612790668204e-05, + "logits/chosen": -1.813207983970642, + "logits/rejected": -1.8710843324661255, + "logps/chosen": -126.45897674560547, + "logps/rejected": -127.25151062011719, + "loss": 0.6969, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2428382784128189, + "rewards/margins": 0.050096381455659866, + "rewards/rejected": 0.19274193048477173, + "step": 519 + }, + { + "epoch": 0.68, + "learning_rate": 4.560586026646845e-05, + "logits/chosen": -2.1233975887298584, + "logits/rejected": -2.1502273082733154, + "logps/chosen": -141.46368408203125, + "logps/rejected": -143.70501708984375, + "loss": 0.6942, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.05234923213720322, + "rewards/margins": 0.039612069725990295, + "rewards/rejected": 0.012737158685922623, + "step": 520 + }, + { + "epoch": 0.68, + "learning_rate": 4.558555030005075e-05, + "logits/chosen": -1.9741415977478027, + "logits/rejected": -1.9955835342407227, + "logps/chosen": -144.7069549560547, + "logps/rejected": -167.02902221679688, + "loss": 0.7312, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05080757290124893, + "rewards/margins": -0.03744647279381752, + "rewards/rejected": 0.08825404942035675, + "step": 521 + }, + { + "epoch": 0.68, + "learning_rate": 4.556519804914736e-05, + "logits/chosen": -2.0981431007385254, + "logits/rejected": -2.073946714401245, + "logps/chosen": -225.3271942138672, + "logps/rejected": -232.33668518066406, + "loss": 0.7173, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.12561050057411194, + "rewards/margins": 0.03423985838890076, + "rewards/rejected": -0.15985038876533508, + "step": 522 + }, + { + "epoch": 0.68, + "learning_rate": 4.554480355556354e-05, + "logits/chosen": -1.8198878765106201, + "logits/rejected": -1.7702445983886719, + "logps/chosen": -148.35325622558594, + "logps/rejected": -151.1799774169922, + "loss": 0.5943, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.26486697793006897, + "rewards/margins": 0.25646913051605225, + "rewards/rejected": 0.00839783065021038, + "step": 523 + }, + { + "epoch": 0.69, + "learning_rate": 4.552436686119134e-05, + "logits/chosen": -1.8180818557739258, + "logits/rejected": -1.764922857284546, + "logps/chosen": -175.82241821289062, + "logps/rejected": -188.15457153320312, + "loss": 0.6774, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.026587432250380516, + "rewards/margins": 0.06728048622608185, + "rewards/rejected": -0.0938679426908493, + "step": 524 + }, + { + "epoch": 0.69, + "learning_rate": 4.550388800800948e-05, + "logits/chosen": -1.7546145915985107, + "logits/rejected": -1.7742228507995605, + "logps/chosen": -152.79208374023438, + "logps/rejected": -140.30728149414062, + "loss": 0.7556, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.02463456243276596, + "rewards/margins": -0.02168111503124237, + "rewards/rejected": 0.046315666288137436, + "step": 525 + }, + { + "epoch": 0.69, + "learning_rate": 4.548336703808328e-05, + "logits/chosen": -1.8469312191009521, + "logits/rejected": -1.8778002262115479, + "logps/chosen": -126.43699645996094, + "logps/rejected": -146.99752807617188, + "loss": 0.7736, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.17384478449821472, + "rewards/margins": -0.11481676995754242, + "rewards/rejected": -0.059028007090091705, + "step": 526 + }, + { + "epoch": 0.69, + "learning_rate": 4.546280399356457e-05, + "logits/chosen": -1.8530418872833252, + "logits/rejected": -1.927619218826294, + "logps/chosen": -175.30499267578125, + "logps/rejected": -165.12033081054688, + "loss": 0.8325, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.1547343134880066, + "rewards/margins": -0.2065506875514984, + "rewards/rejected": 0.05181637406349182, + "step": 527 + }, + { + "epoch": 0.69, + "learning_rate": 4.54421989166916e-05, + "logits/chosen": -2.190769910812378, + "logits/rejected": -2.173021078109741, + "logps/chosen": -148.19883728027344, + "logps/rejected": -149.60968017578125, + "loss": 0.6945, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.1685972660779953, + "rewards/margins": 0.083544060587883, + "rewards/rejected": 0.0850532054901123, + "step": 528 + }, + { + "epoch": 0.69, + "learning_rate": 4.542155184978898e-05, + "logits/chosen": -2.227403402328491, + "logits/rejected": -2.2359485626220703, + "logps/chosen": -129.83944702148438, + "logps/rejected": -130.6429443359375, + "loss": 0.6671, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.20258253812789917, + "rewards/margins": 0.09699730575084686, + "rewards/rejected": 0.1055852472782135, + "step": 529 + }, + { + "epoch": 0.69, + "learning_rate": 4.540086283526754e-05, + "logits/chosen": -2.176431179046631, + "logits/rejected": -2.1221935749053955, + "logps/chosen": -148.75912475585938, + "logps/rejected": -121.70980834960938, + "loss": 0.6792, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.11940637975931168, + "rewards/margins": 0.08864616602659225, + "rewards/rejected": 0.030760202556848526, + "step": 530 + }, + { + "epoch": 0.69, + "learning_rate": 4.538013191562431e-05, + "logits/chosen": -2.2914586067199707, + "logits/rejected": -2.2601258754730225, + "logps/chosen": -185.4001922607422, + "logps/rejected": -187.08941650390625, + "loss": 0.6852, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.008667275309562683, + "rewards/margins": 0.06690745055675507, + "rewards/rejected": -0.05824017524719238, + "step": 531 + }, + { + "epoch": 0.7, + "learning_rate": 4.5359359133442356e-05, + "logits/chosen": -2.1340489387512207, + "logits/rejected": -2.1484596729278564, + "logps/chosen": -145.6934814453125, + "logps/rejected": -157.48910522460938, + "loss": 0.6541, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.03526604175567627, + "rewards/margins": 0.22745643556118011, + "rewards/rejected": -0.19219039380550385, + "step": 532 + }, + { + "epoch": 0.7, + "learning_rate": 4.533854453139077e-05, + "logits/chosen": -2.163139581680298, + "logits/rejected": -2.1732969284057617, + "logps/chosen": -157.28152465820312, + "logps/rejected": -164.52548217773438, + "loss": 0.5849, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14357003569602966, + "rewards/margins": 0.33174028992652893, + "rewards/rejected": -0.18817025423049927, + "step": 533 + }, + { + "epoch": 0.7, + "learning_rate": 4.5317688152224515e-05, + "logits/chosen": -2.232987642288208, + "logits/rejected": -2.2299559116363525, + "logps/chosen": -200.51223754882812, + "logps/rejected": -193.10623168945312, + "loss": 0.7663, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.08180832862854004, + "rewards/margins": -0.07060222327709198, + "rewards/rejected": 0.1524105668067932, + "step": 534 + }, + { + "epoch": 0.7, + "learning_rate": 4.52967900387844e-05, + "logits/chosen": -2.139064073562622, + "logits/rejected": -2.153592586517334, + "logps/chosen": -150.04501342773438, + "logps/rejected": -153.50726318359375, + "loss": 0.8413, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.004394152667373419, + "rewards/margins": -0.2386518269777298, + "rewards/rejected": 0.24304598569869995, + "step": 535 + }, + { + "epoch": 0.7, + "learning_rate": 4.5275850233996925e-05, + "logits/chosen": -2.081514835357666, + "logits/rejected": -2.029918670654297, + "logps/chosen": -152.26541137695312, + "logps/rejected": -168.11135864257812, + "loss": 0.6554, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.15842419862747192, + "rewards/margins": 0.11571840941905975, + "rewards/rejected": 0.042705781757831573, + "step": 536 + }, + { + "epoch": 0.7, + "learning_rate": 4.525486878087426e-05, + "logits/chosen": -2.0249204635620117, + "logits/rejected": -1.9895412921905518, + "logps/chosen": -148.14431762695312, + "logps/rejected": -138.2922821044922, + "loss": 0.7594, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12210666388273239, + "rewards/margins": -0.07970865815877914, + "rewards/rejected": 0.20181530714035034, + "step": 537 + }, + { + "epoch": 0.7, + "learning_rate": 4.523384572251409e-05, + "logits/chosen": -2.079011917114258, + "logits/rejected": -2.151048421859741, + "logps/chosen": -172.8366241455078, + "logps/rejected": -196.3922576904297, + "loss": 0.7753, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.05615053325891495, + "rewards/margins": -0.0960412472486496, + "rewards/rejected": 0.15219178795814514, + "step": 538 + }, + { + "epoch": 0.71, + "learning_rate": 4.52127811020996e-05, + "logits/chosen": -2.2182424068450928, + "logits/rejected": -2.2047057151794434, + "logps/chosen": -143.86557006835938, + "logps/rejected": -155.3209686279297, + "loss": 0.6784, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.018200304359197617, + "rewards/margins": 0.08800992369651794, + "rewards/rejected": -0.10621023923158646, + "step": 539 + }, + { + "epoch": 0.71, + "learning_rate": 4.5191674962899314e-05, + "logits/chosen": -1.9326967000961304, + "logits/rejected": -1.9407259225845337, + "logps/chosen": -166.248291015625, + "logps/rejected": -184.2652130126953, + "loss": 0.7111, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.07094278186559677, + "rewards/margins": 0.1123301088809967, + "rewards/rejected": -0.18327289819717407, + "step": 540 + }, + { + "epoch": 0.71, + "learning_rate": 4.5170527348267054e-05, + "logits/chosen": -2.1330301761627197, + "logits/rejected": -2.218416690826416, + "logps/chosen": -138.27850341796875, + "logps/rejected": -139.3997344970703, + "loss": 0.6661, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.176968052983284, + "rewards/margins": 0.11333853006362915, + "rewards/rejected": -0.29030656814575195, + "step": 541 + }, + { + "epoch": 0.71, + "learning_rate": 4.5149338301641845e-05, + "logits/chosen": -2.203751564025879, + "logits/rejected": -2.177703380584717, + "logps/chosen": -153.626708984375, + "logps/rejected": -145.74542236328125, + "loss": 0.8742, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.1450812965631485, + "rewards/margins": -0.26044121384620667, + "rewards/rejected": 0.11535991728305817, + "step": 542 + }, + { + "epoch": 0.71, + "learning_rate": 4.512810786654779e-05, + "logits/chosen": -2.061079502105713, + "logits/rejected": -2.138723611831665, + "logps/chosen": -163.50033569335938, + "logps/rejected": -169.90225219726562, + "loss": 0.7113, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.12472327053546906, + "rewards/margins": 0.01421796903014183, + "rewards/rejected": -0.13894124329090118, + "step": 543 + }, + { + "epoch": 0.71, + "learning_rate": 4.510683608659403e-05, + "logits/chosen": -1.9471606016159058, + "logits/rejected": -1.951507806777954, + "logps/chosen": -153.19105529785156, + "logps/rejected": -183.1653289794922, + "loss": 0.7854, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.2256731539964676, + "rewards/margins": -0.1194252148270607, + "rewards/rejected": -0.1062479168176651, + "step": 544 + }, + { + "epoch": 0.71, + "learning_rate": 4.508552300547463e-05, + "logits/chosen": -1.9512101411819458, + "logits/rejected": -2.0366945266723633, + "logps/chosen": -162.8254852294922, + "logps/rejected": -170.9732666015625, + "loss": 0.6963, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13340815901756287, + "rewards/margins": 0.08452873677015305, + "rewards/rejected": -0.2179369032382965, + "step": 545 + }, + { + "epoch": 0.71, + "learning_rate": 4.506416866696848e-05, + "logits/chosen": -2.29795503616333, + "logits/rejected": -2.2391650676727295, + "logps/chosen": -157.7142791748047, + "logps/rejected": -160.1687469482422, + "loss": 0.7126, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1038050726056099, + "rewards/margins": 0.07325511425733566, + "rewards/rejected": 0.030549969524145126, + "step": 546 + }, + { + "epoch": 0.72, + "learning_rate": 4.504277311493922e-05, + "logits/chosen": -2.0046989917755127, + "logits/rejected": -2.0597028732299805, + "logps/chosen": -161.10891723632812, + "logps/rejected": -154.01663208007812, + "loss": 0.6706, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.03068912774324417, + "rewards/margins": 0.09607706218957901, + "rewards/rejected": -0.126766175031662, + "step": 547 + }, + { + "epoch": 0.72, + "learning_rate": 4.502133639333516e-05, + "logits/chosen": -2.2434914112091064, + "logits/rejected": -2.2011220455169678, + "logps/chosen": -166.30824279785156, + "logps/rejected": -166.396240234375, + "loss": 0.7998, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.23776453733444214, + "rewards/margins": -0.1319459229707718, + "rewards/rejected": -0.10581861436367035, + "step": 548 + }, + { + "epoch": 0.72, + "learning_rate": 4.499985854618915e-05, + "logits/chosen": -2.00533127784729, + "logits/rejected": -1.9807076454162598, + "logps/chosen": -171.72430419921875, + "logps/rejected": -176.17196655273438, + "loss": 0.7283, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1258818507194519, + "rewards/margins": -0.023614590987563133, + "rewards/rejected": -0.10226726531982422, + "step": 549 + }, + { + "epoch": 0.72, + "learning_rate": 4.497833961761855e-05, + "logits/chosen": -1.9163005352020264, + "logits/rejected": -1.9533987045288086, + "logps/chosen": -187.05703735351562, + "logps/rejected": -160.68228149414062, + "loss": 0.7811, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.18283602595329285, + "rewards/margins": -0.12604478001594543, + "rewards/rejected": -0.05679125338792801, + "step": 550 + }, + { + "epoch": 0.72, + "learning_rate": 4.495677965182506e-05, + "logits/chosen": -1.7719175815582275, + "logits/rejected": -1.7788423299789429, + "logps/chosen": -149.80589294433594, + "logps/rejected": -148.5959014892578, + "loss": 0.7087, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0498424731194973, + "rewards/margins": 0.04123033583164215, + "rewards/rejected": 0.00861213356256485, + "step": 551 + }, + { + "epoch": 0.72, + "learning_rate": 4.4935178693094714e-05, + "logits/chosen": -1.8553767204284668, + "logits/rejected": -1.8368808031082153, + "logps/chosen": -179.26145935058594, + "logps/rejected": -164.9579620361328, + "loss": 0.8264, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5210792422294617, + "rewards/margins": -0.1817505955696106, + "rewards/rejected": -0.3393285870552063, + "step": 552 + }, + { + "epoch": 0.72, + "learning_rate": 4.491353678579774e-05, + "logits/chosen": -2.287044048309326, + "logits/rejected": -2.296315908432007, + "logps/chosen": -154.74530029296875, + "logps/rejected": -161.77780151367188, + "loss": 0.7811, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.222887322306633, + "rewards/margins": -0.1348755806684494, + "rewards/rejected": -0.0880117416381836, + "step": 553 + }, + { + "epoch": 0.73, + "learning_rate": 4.489185397438845e-05, + "logits/chosen": -2.1219000816345215, + "logits/rejected": -2.1018307209014893, + "logps/chosen": -152.45147705078125, + "logps/rejected": -163.04336547851562, + "loss": 0.7666, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.2625100910663605, + "rewards/margins": -0.10655740648508072, + "rewards/rejected": -0.15595270693302155, + "step": 554 + }, + { + "epoch": 0.73, + "learning_rate": 4.4870130303405214e-05, + "logits/chosen": -2.108427047729492, + "logits/rejected": -2.1353774070739746, + "logps/chosen": -117.01795196533203, + "logps/rejected": -126.68714141845703, + "loss": 0.6582, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.13066193461418152, + "rewards/margins": 0.10399821400642395, + "rewards/rejected": 0.026663724333047867, + "step": 555 + }, + { + "epoch": 0.73, + "learning_rate": 4.484836581747032e-05, + "logits/chosen": -2.193756580352783, + "logits/rejected": -2.219733953475952, + "logps/chosen": -152.94854736328125, + "logps/rejected": -157.80963134765625, + "loss": 0.6522, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.12146392464637756, + "rewards/margins": 0.1416078507900238, + "rewards/rejected": -0.26307177543640137, + "step": 556 + }, + { + "epoch": 0.73, + "learning_rate": 4.4826560561289865e-05, + "logits/chosen": -2.087712526321411, + "logits/rejected": -2.0898258686065674, + "logps/chosen": -172.42575073242188, + "logps/rejected": -163.4795379638672, + "loss": 0.628, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.22759903967380524, + "rewards/margins": 0.20539291203022003, + "rewards/rejected": -0.4329919219017029, + "step": 557 + }, + { + "epoch": 0.73, + "learning_rate": 4.4804714579653736e-05, + "logits/chosen": -2.2692031860351562, + "logits/rejected": -2.326690435409546, + "logps/chosen": -161.824951171875, + "logps/rejected": -154.09632873535156, + "loss": 0.7904, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.4080825448036194, + "rewards/margins": -0.13219794631004333, + "rewards/rejected": -0.27588459849357605, + "step": 558 + }, + { + "epoch": 0.73, + "learning_rate": 4.4782827917435454e-05, + "logits/chosen": -1.4946506023406982, + "logits/rejected": -1.4943220615386963, + "logps/chosen": -169.450927734375, + "logps/rejected": -177.3900146484375, + "loss": 0.715, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.09592101722955704, + "rewards/margins": -0.032054100185632706, + "rewards/rejected": -0.06386692076921463, + "step": 559 + }, + { + "epoch": 0.73, + "learning_rate": 4.4760900619592085e-05, + "logits/chosen": -2.043163537979126, + "logits/rejected": -2.1061294078826904, + "logps/chosen": -146.00006103515625, + "logps/rejected": -167.94851684570312, + "loss": 0.7728, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.056870993226766586, + "rewards/margins": -0.011001009494066238, + "rewards/rejected": -0.045869968831539154, + "step": 560 + }, + { + "epoch": 0.73, + "learning_rate": 4.4738932731164194e-05, + "logits/chosen": -2.10860538482666, + "logits/rejected": -2.0873773097991943, + "logps/chosen": -147.17306518554688, + "logps/rejected": -149.69338989257812, + "loss": 0.673, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.06803455948829651, + "rewards/margins": 0.09854596853256226, + "rewards/rejected": -0.16658052802085876, + "step": 561 + }, + { + "epoch": 0.74, + "learning_rate": 4.47169242972757e-05, + "logits/chosen": -2.1042397022247314, + "logits/rejected": -2.107679605484009, + "logps/chosen": -206.35293579101562, + "logps/rejected": -179.4486541748047, + "loss": 0.7299, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.17581839859485626, + "rewards/margins": -0.013735771179199219, + "rewards/rejected": -0.16208262741565704, + "step": 562 + }, + { + "epoch": 0.74, + "learning_rate": 4.469487536313381e-05, + "logits/chosen": -1.9570105075836182, + "logits/rejected": -1.9454288482666016, + "logps/chosen": -165.0225067138672, + "logps/rejected": -182.94992065429688, + "loss": 0.6866, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.18598608672618866, + "rewards/margins": 0.06062725931406021, + "rewards/rejected": -0.24661336839199066, + "step": 563 + }, + { + "epoch": 0.74, + "learning_rate": 4.467278597402894e-05, + "logits/chosen": -2.0907437801361084, + "logits/rejected": -2.0541324615478516, + "logps/chosen": -186.67782592773438, + "logps/rejected": -194.32305908203125, + "loss": 0.7212, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.26842382550239563, + "rewards/margins": 0.008616073057055473, + "rewards/rejected": -0.27703991532325745, + "step": 564 + }, + { + "epoch": 0.74, + "learning_rate": 4.465065617533457e-05, + "logits/chosen": -2.0557780265808105, + "logits/rejected": -2.066563129425049, + "logps/chosen": -146.12057495117188, + "logps/rejected": -144.6167449951172, + "loss": 0.9087, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.5873019695281982, + "rewards/margins": -0.3313911557197571, + "rewards/rejected": -0.25591081380844116, + "step": 565 + }, + { + "epoch": 0.74, + "learning_rate": 4.462848601250722e-05, + "logits/chosen": -2.1823415756225586, + "logits/rejected": -2.1932756900787354, + "logps/chosen": -151.8548126220703, + "logps/rejected": -147.4888916015625, + "loss": 0.6847, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.24860316514968872, + "rewards/margins": 0.11341544985771179, + "rewards/rejected": -0.3620185852050781, + "step": 566 + }, + { + "epoch": 0.74, + "learning_rate": 4.4606275531086295e-05, + "logits/chosen": -1.800735354423523, + "logits/rejected": -1.7712013721466064, + "logps/chosen": -145.9650115966797, + "logps/rejected": -163.93515014648438, + "loss": 0.7801, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.49170562624931335, + "rewards/margins": -0.12783843278884888, + "rewards/rejected": -0.3638671636581421, + "step": 567 + }, + { + "epoch": 0.74, + "learning_rate": 4.4584024776694035e-05, + "logits/chosen": -1.8931790590286255, + "logits/rejected": -1.8744131326675415, + "logps/chosen": -208.36671447753906, + "logps/rejected": -222.00416564941406, + "loss": 0.6506, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.040707264095544815, + "rewards/margins": 0.14632365107536316, + "rewards/rejected": -0.18703092634677887, + "step": 568 + }, + { + "epoch": 0.74, + "learning_rate": 4.45617337950354e-05, + "logits/chosen": -1.761922836303711, + "logits/rejected": -1.7843868732452393, + "logps/chosen": -153.74264526367188, + "logps/rejected": -178.4879150390625, + "loss": 0.7251, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.10153071582317352, + "rewards/margins": 0.11845846474170685, + "rewards/rejected": -0.21998921036720276, + "step": 569 + }, + { + "epoch": 0.75, + "learning_rate": 4.453940263189797e-05, + "logits/chosen": -2.2519378662109375, + "logits/rejected": -2.1770126819610596, + "logps/chosen": -142.41217041015625, + "logps/rejected": -133.3799285888672, + "loss": 0.709, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19167102873325348, + "rewards/margins": 0.01131703332066536, + "rewards/rejected": -0.20298807322978973, + "step": 570 + }, + { + "epoch": 0.75, + "learning_rate": 4.4517031333151874e-05, + "logits/chosen": -2.078045606613159, + "logits/rejected": -2.046724557876587, + "logps/chosen": -137.99807739257812, + "logps/rejected": -137.1576385498047, + "loss": 0.8089, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5403813123703003, + "rewards/margins": -0.16598506271839142, + "rewards/rejected": -0.37439629435539246, + "step": 571 + }, + { + "epoch": 0.75, + "learning_rate": 4.449461994474968e-05, + "logits/chosen": -1.9291788339614868, + "logits/rejected": -1.961287260055542, + "logps/chosen": -190.75672912597656, + "logps/rejected": -210.29931640625, + "loss": 0.7873, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.06664823740720749, + "rewards/margins": -0.008556842803955078, + "rewards/rejected": -0.05809139460325241, + "step": 572 + }, + { + "epoch": 0.75, + "learning_rate": 4.44721685127263e-05, + "logits/chosen": -2.2334282398223877, + "logits/rejected": -2.1782572269439697, + "logps/chosen": -164.24710083007812, + "logps/rejected": -161.57275390625, + "loss": 0.8631, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.24856142699718475, + "rewards/margins": -0.2524217367172241, + "rewards/rejected": 0.003860296681523323, + "step": 573 + }, + { + "epoch": 0.75, + "learning_rate": 4.4449677083198896e-05, + "logits/chosen": -2.198751211166382, + "logits/rejected": -2.1961047649383545, + "logps/chosen": -139.06393432617188, + "logps/rejected": -136.85275268554688, + "loss": 0.6852, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.24132420122623444, + "rewards/margins": 0.04328273981809616, + "rewards/rejected": -0.2846069633960724, + "step": 574 + }, + { + "epoch": 0.75, + "learning_rate": 4.4427145702366804e-05, + "logits/chosen": -2.167025327682495, + "logits/rejected": -2.1032488346099854, + "logps/chosen": -150.00331115722656, + "logps/rejected": -143.5449981689453, + "loss": 0.805, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.3796214461326599, + "rewards/margins": -0.16332601010799408, + "rewards/rejected": -0.21629548072814941, + "step": 575 + }, + { + "epoch": 0.75, + "learning_rate": 4.440457441651139e-05, + "logits/chosen": -2.0747876167297363, + "logits/rejected": -2.0561654567718506, + "logps/chosen": -189.77783203125, + "logps/rejected": -190.18592834472656, + "loss": 0.6855, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.1576140820980072, + "rewards/margins": 0.06496872752904892, + "rewards/rejected": -0.2225828319787979, + "step": 576 + }, + { + "epoch": 0.76, + "learning_rate": 4.4381963271996044e-05, + "logits/chosen": -1.8249603509902954, + "logits/rejected": -1.7742782831192017, + "logps/chosen": -165.38795471191406, + "logps/rejected": -166.17652893066406, + "loss": 0.757, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.25819915533065796, + "rewards/margins": 0.011954933404922485, + "rewards/rejected": -0.27015408873558044, + "step": 577 + }, + { + "epoch": 0.76, + "learning_rate": 4.435931231526597e-05, + "logits/chosen": -2.0663070678710938, + "logits/rejected": -2.095902919769287, + "logps/chosen": -168.3959197998047, + "logps/rejected": -168.69862365722656, + "loss": 0.8164, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5112836360931396, + "rewards/margins": -0.09502200782299042, + "rewards/rejected": -0.4162616431713104, + "step": 578 + }, + { + "epoch": 0.76, + "learning_rate": 4.433662159284818e-05, + "logits/chosen": -1.9458318948745728, + "logits/rejected": -1.9755809307098389, + "logps/chosen": -158.89151000976562, + "logps/rejected": -181.25265502929688, + "loss": 0.6813, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.40285855531692505, + "rewards/margins": 0.05986136570572853, + "rewards/rejected": -0.4627199172973633, + "step": 579 + }, + { + "epoch": 0.76, + "learning_rate": 4.4313891151351375e-05, + "logits/chosen": -1.8997300863265991, + "logits/rejected": -1.9370012283325195, + "logps/chosen": -189.06930541992188, + "logps/rejected": -198.09793090820312, + "loss": 0.725, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.23561373353004456, + "rewards/margins": 0.04797305539250374, + "rewards/rejected": -0.2835868000984192, + "step": 580 + }, + { + "epoch": 0.76, + "learning_rate": 4.429112103746582e-05, + "logits/chosen": -2.2178287506103516, + "logits/rejected": -2.218778133392334, + "logps/chosen": -225.8673095703125, + "logps/rejected": -211.37353515625, + "loss": 0.6963, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.43705853819847107, + "rewards/margins": 0.05578887462615967, + "rewards/rejected": -0.49284741282463074, + "step": 581 + }, + { + "epoch": 0.76, + "learning_rate": 4.4268311297963295e-05, + "logits/chosen": -1.8671170473098755, + "logits/rejected": -1.928563117980957, + "logps/chosen": -168.80242919921875, + "logps/rejected": -170.85487365722656, + "loss": 0.9331, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6543461084365845, + "rewards/margins": -0.21871261298656464, + "rewards/rejected": -0.43563348054885864, + "step": 582 + }, + { + "epoch": 0.76, + "learning_rate": 4.4245461979696937e-05, + "logits/chosen": -2.1317386627197266, + "logits/rejected": -2.069695234298706, + "logps/chosen": -170.15565490722656, + "logps/rejected": -172.81195068359375, + "loss": 0.7135, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.37158751487731934, + "rewards/margins": 0.09176269173622131, + "rewards/rejected": -0.46335020661354065, + "step": 583 + }, + { + "epoch": 0.76, + "learning_rate": 4.422257312960123e-05, + "logits/chosen": -2.1730875968933105, + "logits/rejected": -2.2167327404022217, + "logps/chosen": -165.11517333984375, + "logps/rejected": -160.7394256591797, + "loss": 0.666, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11963924765586853, + "rewards/margins": 0.12844307720661163, + "rewards/rejected": -0.24808233976364136, + "step": 584 + }, + { + "epoch": 0.77, + "learning_rate": 4.419964479469182e-05, + "logits/chosen": -2.059325933456421, + "logits/rejected": -2.017867088317871, + "logps/chosen": -155.38075256347656, + "logps/rejected": -140.58778381347656, + "loss": 0.5993, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.31128260493278503, + "rewards/margins": 0.24781286716461182, + "rewards/rejected": -0.5590954422950745, + "step": 585 + }, + { + "epoch": 0.77, + "learning_rate": 4.417667702206548e-05, + "logits/chosen": -1.8881521224975586, + "logits/rejected": -1.952433705329895, + "logps/chosen": -161.21728515625, + "logps/rejected": -170.68008422851562, + "loss": 0.7572, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.32663464546203613, + "rewards/margins": -0.015847772359848022, + "rewards/rejected": -0.3107869029045105, + "step": 586 + }, + { + "epoch": 0.77, + "learning_rate": 4.415366985889998e-05, + "logits/chosen": -2.0811541080474854, + "logits/rejected": -2.0584499835968018, + "logps/chosen": -153.13113403320312, + "logps/rejected": -144.1329803466797, + "loss": 0.7128, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.24523815512657166, + "rewards/margins": 0.0006711352616548538, + "rewards/rejected": -0.24590928852558136, + "step": 587 + }, + { + "epoch": 0.77, + "learning_rate": 4.413062335245402e-05, + "logits/chosen": -2.009754180908203, + "logits/rejected": -2.03741717338562, + "logps/chosen": -178.30477905273438, + "logps/rejected": -169.07321166992188, + "loss": 0.6164, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.011600993573665619, + "rewards/margins": 0.23952721059322357, + "rewards/rejected": -0.251128226518631, + "step": 588 + }, + { + "epoch": 0.77, + "learning_rate": 4.410753755006708e-05, + "logits/chosen": -2.0812978744506836, + "logits/rejected": -2.0661728382110596, + "logps/chosen": -205.08026123046875, + "logps/rejected": -189.98153686523438, + "loss": 0.6828, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.22344040870666504, + "rewards/margins": 0.08454579859972, + "rewards/rejected": -0.30798622965812683, + "step": 589 + }, + { + "epoch": 0.77, + "learning_rate": 4.408441249915938e-05, + "logits/chosen": -2.2237741947174072, + "logits/rejected": -2.2071006298065186, + "logps/chosen": -145.8521728515625, + "logps/rejected": -153.61093139648438, + "loss": 0.6731, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.031120114028453827, + "rewards/margins": 0.07498609274625778, + "rewards/rejected": -0.10610620677471161, + "step": 590 + }, + { + "epoch": 0.77, + "learning_rate": 4.4061248247231776e-05, + "logits/chosen": -2.0926318168640137, + "logits/rejected": -2.064544916152954, + "logps/chosen": -200.65924072265625, + "logps/rejected": -201.64376831054688, + "loss": 0.7178, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2164732813835144, + "rewards/margins": 0.056919313967227936, + "rewards/rejected": -0.27339258790016174, + "step": 591 + }, + { + "epoch": 0.77, + "learning_rate": 4.4038044841865614e-05, + "logits/chosen": -1.9554996490478516, + "logits/rejected": -1.9424071311950684, + "logps/chosen": -165.02134704589844, + "logps/rejected": -169.86578369140625, + "loss": 0.6787, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.12406764924526215, + "rewards/margins": 0.09511314332485199, + "rewards/rejected": -0.21918082237243652, + "step": 592 + }, + { + "epoch": 0.78, + "learning_rate": 4.401480233072268e-05, + "logits/chosen": -2.166558265686035, + "logits/rejected": -2.2012734413146973, + "logps/chosen": -203.3728485107422, + "logps/rejected": -207.2845001220703, + "loss": 0.7391, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.14014330506324768, + "rewards/margins": -0.0016260165721178055, + "rewards/rejected": -0.13851726055145264, + "step": 593 + }, + { + "epoch": 0.78, + "learning_rate": 4.399152076154509e-05, + "logits/chosen": -1.908893346786499, + "logits/rejected": -1.9617127180099487, + "logps/chosen": -154.35598754882812, + "logps/rejected": -156.9701690673828, + "loss": 0.775, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.09893524646759033, + "rewards/margins": -0.09441002458333969, + "rewards/rejected": -0.004525229334831238, + "step": 594 + }, + { + "epoch": 0.78, + "learning_rate": 4.396820018215518e-05, + "logits/chosen": -1.9621710777282715, + "logits/rejected": -2.016408920288086, + "logps/chosen": -178.7270965576172, + "logps/rejected": -200.99447631835938, + "loss": 0.6945, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.3592956066131592, + "rewards/margins": 0.08869167417287827, + "rewards/rejected": -0.44798728823661804, + "step": 595 + }, + { + "epoch": 0.78, + "learning_rate": 4.394484064045542e-05, + "logits/chosen": -2.2477211952209473, + "logits/rejected": -2.2041501998901367, + "logps/chosen": -233.4781494140625, + "logps/rejected": -218.37103271484375, + "loss": 0.8302, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.027775531634688377, + "rewards/margins": -0.17972132563591003, + "rewards/rejected": 0.1519457995891571, + "step": 596 + }, + { + "epoch": 0.78, + "learning_rate": 4.392144218442831e-05, + "logits/chosen": -1.885699987411499, + "logits/rejected": -1.8928428888320923, + "logps/chosen": -147.02767944335938, + "logps/rejected": -141.90377807617188, + "loss": 0.7493, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.34831514954566956, + "rewards/margins": -0.05675073340535164, + "rewards/rejected": -0.291564404964447, + "step": 597 + }, + { + "epoch": 0.78, + "learning_rate": 4.3898004862136286e-05, + "logits/chosen": -2.138289213180542, + "logits/rejected": -2.1273224353790283, + "logps/chosen": -173.62033081054688, + "logps/rejected": -163.26397705078125, + "loss": 0.8012, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.24764633178710938, + "rewards/margins": -0.08084635436534882, + "rewards/rejected": -0.16679996252059937, + "step": 598 + }, + { + "epoch": 0.78, + "learning_rate": 4.3874528721721624e-05, + "logits/chosen": -2.150501012802124, + "logits/rejected": -2.1304874420166016, + "logps/chosen": -167.4502716064453, + "logps/rejected": -207.66793823242188, + "loss": 0.859, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.3565550744533539, + "rewards/margins": -0.1929093897342682, + "rewards/rejected": -0.1636456847190857, + "step": 599 + }, + { + "epoch": 0.79, + "learning_rate": 4.385101381140633e-05, + "logits/chosen": -2.0221786499023438, + "logits/rejected": -2.1972155570983887, + "logps/chosen": -174.52090454101562, + "logps/rejected": -189.63348388671875, + "loss": 0.6853, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1927257478237152, + "rewards/margins": 0.07324501872062683, + "rewards/rejected": -0.26597073674201965, + "step": 600 + }, + { + "epoch": 0.79, + "learning_rate": 4.382746017949203e-05, + "logits/chosen": -2.0691418647766113, + "logits/rejected": -2.0149199962615967, + "logps/chosen": -247.23388671875, + "logps/rejected": -234.80191040039062, + "loss": 0.8027, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.552363395690918, + "rewards/margins": -0.18797653913497925, + "rewards/rejected": -0.36438679695129395, + "step": 601 + }, + { + "epoch": 0.79, + "learning_rate": 4.380386787435992e-05, + "logits/chosen": -2.1596169471740723, + "logits/rejected": -2.1461353302001953, + "logps/chosen": -171.2084503173828, + "logps/rejected": -213.65097045898438, + "loss": 0.7502, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5347078442573547, + "rewards/margins": -0.039867326617240906, + "rewards/rejected": -0.49484050273895264, + "step": 602 + }, + { + "epoch": 0.79, + "learning_rate": 4.378023694447061e-05, + "logits/chosen": -2.121032953262329, + "logits/rejected": -2.1503822803497314, + "logps/chosen": -164.81271362304688, + "logps/rejected": -170.76075744628906, + "loss": 0.6693, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.01932806894183159, + "rewards/margins": 0.08725270628929138, + "rewards/rejected": -0.06792464852333069, + "step": 603 + }, + { + "epoch": 0.79, + "learning_rate": 4.375656743836407e-05, + "logits/chosen": -2.059199094772339, + "logits/rejected": -2.071756601333618, + "logps/chosen": -164.81121826171875, + "logps/rejected": -161.324951171875, + "loss": 0.6746, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.22563201189041138, + "rewards/margins": 0.145117849111557, + "rewards/rejected": -0.37074992060661316, + "step": 604 + }, + { + "epoch": 0.79, + "learning_rate": 4.373285940465948e-05, + "logits/chosen": -2.096938371658325, + "logits/rejected": -2.2146358489990234, + "logps/chosen": -149.6543731689453, + "logps/rejected": -172.9806671142578, + "loss": 0.7894, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.2511146366596222, + "rewards/margins": 0.0032073892652988434, + "rewards/rejected": -0.2543220520019531, + "step": 605 + }, + { + "epoch": 0.79, + "learning_rate": 4.370911289205518e-05, + "logits/chosen": -2.0443522930145264, + "logits/rejected": -2.012732982635498, + "logps/chosen": -181.68507385253906, + "logps/rejected": -182.77114868164062, + "loss": 0.6579, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.19644834101200104, + "rewards/margins": 0.14906372129917145, + "rewards/rejected": -0.3455120623111725, + "step": 606 + }, + { + "epoch": 0.79, + "learning_rate": 4.368532794932854e-05, + "logits/chosen": -2.0499818325042725, + "logits/rejected": -2.0729148387908936, + "logps/chosen": -156.1490020751953, + "logps/rejected": -156.59902954101562, + "loss": 0.7566, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.12633728981018066, + "rewards/margins": -0.09627077728509903, + "rewards/rejected": -0.030066512525081635, + "step": 607 + }, + { + "epoch": 0.8, + "learning_rate": 4.366150462533588e-05, + "logits/chosen": -2.1601145267486572, + "logits/rejected": -2.153270959854126, + "logps/chosen": -130.52975463867188, + "logps/rejected": -131.89930725097656, + "loss": 0.8018, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.30306872725486755, + "rewards/margins": -0.10493332147598267, + "rewards/rejected": -0.19813542068004608, + "step": 608 + }, + { + "epoch": 0.8, + "learning_rate": 4.363764296901234e-05, + "logits/chosen": -2.1186208724975586, + "logits/rejected": -2.134394645690918, + "logps/chosen": -183.10049438476562, + "logps/rejected": -175.95849609375, + "loss": 0.8474, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.47242993116378784, + "rewards/margins": -0.21657848358154297, + "rewards/rejected": -0.25585147738456726, + "step": 609 + }, + { + "epoch": 0.8, + "learning_rate": 4.361374302937182e-05, + "logits/chosen": -2.0684666633605957, + "logits/rejected": -2.0428881645202637, + "logps/chosen": -207.9459686279297, + "logps/rejected": -229.56698608398438, + "loss": 0.8513, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.3699374198913574, + "rewards/margins": -0.2574125826358795, + "rewards/rejected": -0.11252487450838089, + "step": 610 + }, + { + "epoch": 0.8, + "learning_rate": 4.358980485550683e-05, + "logits/chosen": -1.949844479560852, + "logits/rejected": -1.9453861713409424, + "logps/chosen": -143.38499450683594, + "logps/rejected": -150.81947326660156, + "loss": 0.705, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5936511754989624, + "rewards/margins": 0.01353779248893261, + "rewards/rejected": -0.6071889400482178, + "step": 611 + }, + { + "epoch": 0.8, + "learning_rate": 4.356582849658845e-05, + "logits/chosen": -1.9948573112487793, + "logits/rejected": -2.0161683559417725, + "logps/chosen": -140.34910583496094, + "logps/rejected": -141.26271057128906, + "loss": 0.5736, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19039779901504517, + "rewards/margins": 0.31717830896377563, + "rewards/rejected": -0.5075761079788208, + "step": 612 + }, + { + "epoch": 0.8, + "learning_rate": 4.354181400186617e-05, + "logits/chosen": -1.9521052837371826, + "logits/rejected": -1.9075651168823242, + "logps/chosen": -137.3512420654297, + "logps/rejected": -136.6510467529297, + "loss": 0.7616, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.45448195934295654, + "rewards/margins": -0.034489963203668594, + "rewards/rejected": -0.41999197006225586, + "step": 613 + }, + { + "epoch": 0.8, + "learning_rate": 4.351776142066782e-05, + "logits/chosen": -2.138842821121216, + "logits/rejected": -2.1373250484466553, + "logps/chosen": -193.3833465576172, + "logps/rejected": -193.51292419433594, + "loss": 0.6409, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2606329023838043, + "rewards/margins": 0.1631646752357483, + "rewards/rejected": -0.423797607421875, + "step": 614 + }, + { + "epoch": 0.8, + "learning_rate": 4.349367080239946e-05, + "logits/chosen": -2.118267774581909, + "logits/rejected": -2.135613441467285, + "logps/chosen": -250.55662536621094, + "logps/rejected": -243.0400390625, + "loss": 0.7276, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7016344666481018, + "rewards/margins": -0.04274986684322357, + "rewards/rejected": -0.6588846445083618, + "step": 615 + }, + { + "epoch": 0.81, + "learning_rate": 4.34695421965453e-05, + "logits/chosen": -2.00390362739563, + "logits/rejected": -1.9568325281143188, + "logps/chosen": -203.2637176513672, + "logps/rejected": -215.809326171875, + "loss": 0.7331, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3516182005405426, + "rewards/margins": -0.01222594827413559, + "rewards/rejected": -0.3393922746181488, + "step": 616 + }, + { + "epoch": 0.81, + "learning_rate": 4.344537565266755e-05, + "logits/chosen": -2.201554536819458, + "logits/rejected": -2.2679710388183594, + "logps/chosen": -168.6824188232422, + "logps/rejected": -193.9791259765625, + "loss": 0.6135, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3374904990196228, + "rewards/margins": 0.24233487248420715, + "rewards/rejected": -0.5798253417015076, + "step": 617 + }, + { + "epoch": 0.81, + "learning_rate": 4.342117122040637e-05, + "logits/chosen": -1.8600742816925049, + "logits/rejected": -2.029170274734497, + "logps/chosen": -151.89068603515625, + "logps/rejected": -183.169189453125, + "loss": 0.5888, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.1524229496717453, + "rewards/margins": 0.2849957346916199, + "rewards/rejected": -0.43741869926452637, + "step": 618 + }, + { + "epoch": 0.81, + "learning_rate": 4.339692894947974e-05, + "logits/chosen": -2.1574344635009766, + "logits/rejected": -2.1366348266601562, + "logps/chosen": -168.2608642578125, + "logps/rejected": -169.64785766601562, + "loss": 0.748, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5251599550247192, + "rewards/margins": -0.011976517736911774, + "rewards/rejected": -0.5131835341453552, + "step": 619 + }, + { + "epoch": 0.81, + "learning_rate": 4.3372648889683364e-05, + "logits/chosen": -2.0989270210266113, + "logits/rejected": -2.053802490234375, + "logps/chosen": -147.33477783203125, + "logps/rejected": -148.05438232421875, + "loss": 0.6681, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18421587347984314, + "rewards/margins": 0.19642779231071472, + "rewards/rejected": -0.38064366579055786, + "step": 620 + }, + { + "epoch": 0.81, + "learning_rate": 4.334833109089057e-05, + "logits/chosen": -2.010010242462158, + "logits/rejected": -2.007097005844116, + "logps/chosen": -162.26063537597656, + "logps/rejected": -156.50814819335938, + "loss": 0.6747, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4700685143470764, + "rewards/margins": 0.10477405786514282, + "rewards/rejected": -0.5748425126075745, + "step": 621 + }, + { + "epoch": 0.81, + "learning_rate": 4.33239756030522e-05, + "logits/chosen": -1.7842457294464111, + "logits/rejected": -1.7858459949493408, + "logps/chosen": -133.84458923339844, + "logps/rejected": -175.0372314453125, + "loss": 0.6192, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2971382737159729, + "rewards/margins": 0.2653331458568573, + "rewards/rejected": -0.5624713897705078, + "step": 622 + }, + { + "epoch": 0.82, + "learning_rate": 4.329958247619651e-05, + "logits/chosen": -2.1888294219970703, + "logits/rejected": -2.1918630599975586, + "logps/chosen": -153.9444580078125, + "logps/rejected": -150.8408966064453, + "loss": 0.6443, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15923231840133667, + "rewards/margins": 0.16449183225631714, + "rewards/rejected": -0.3237241506576538, + "step": 623 + }, + { + "epoch": 0.82, + "learning_rate": 4.3275151760429075e-05, + "logits/chosen": -2.0962228775024414, + "logits/rejected": -2.1192712783813477, + "logps/chosen": -155.5863494873047, + "logps/rejected": -150.6765594482422, + "loss": 0.5493, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19339366257190704, + "rewards/margins": 0.4053744971752167, + "rewards/rejected": -0.5987681150436401, + "step": 624 + }, + { + "epoch": 0.82, + "learning_rate": 4.325068350593268e-05, + "logits/chosen": -2.095531702041626, + "logits/rejected": -2.180366039276123, + "logps/chosen": -187.6971893310547, + "logps/rejected": -229.3963165283203, + "loss": 0.7277, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.45450133085250854, + "rewards/margins": 0.05496390163898468, + "rewards/rejected": -0.509465217590332, + "step": 625 + }, + { + "epoch": 0.82, + "learning_rate": 4.322617776296723e-05, + "logits/chosen": -2.116663694381714, + "logits/rejected": -2.131500244140625, + "logps/chosen": -147.40220642089844, + "logps/rejected": -160.78799438476562, + "loss": 0.7796, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3981628715991974, + "rewards/margins": -0.08542328327894211, + "rewards/rejected": -0.3127395808696747, + "step": 626 + }, + { + "epoch": 0.82, + "learning_rate": 4.320163458186961e-05, + "logits/chosen": -1.8610426187515259, + "logits/rejected": -1.8280242681503296, + "logps/chosen": -142.5250244140625, + "logps/rejected": -167.79307556152344, + "loss": 0.6001, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.26573097705841064, + "rewards/margins": 0.3146597445011139, + "rewards/rejected": -0.5803906917572021, + "step": 627 + }, + { + "epoch": 0.82, + "learning_rate": 4.317705401305362e-05, + "logits/chosen": -2.2919328212738037, + "logits/rejected": -2.295372486114502, + "logps/chosen": -182.9703369140625, + "logps/rejected": -184.76123046875, + "loss": 0.6323, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2809330224990845, + "rewards/margins": 0.18110422790050507, + "rewards/rejected": -0.46203726530075073, + "step": 628 + }, + { + "epoch": 0.82, + "learning_rate": 4.315243610700986e-05, + "logits/chosen": -1.962925910949707, + "logits/rejected": -1.932532787322998, + "logps/chosen": -135.61175537109375, + "logps/rejected": -145.57708740234375, + "loss": 0.5795, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2901972532272339, + "rewards/margins": 0.3114909827709198, + "rewards/rejected": -0.6016882658004761, + "step": 629 + }, + { + "epoch": 0.82, + "learning_rate": 4.312778091430563e-05, + "logits/chosen": -2.0856752395629883, + "logits/rejected": -2.060380697250366, + "logps/chosen": -161.23788452148438, + "logps/rejected": -174.86415100097656, + "loss": 0.685, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3572569191455841, + "rewards/margins": 0.08018150925636292, + "rewards/rejected": -0.437438428401947, + "step": 630 + }, + { + "epoch": 0.83, + "learning_rate": 4.310308848558479e-05, + "logits/chosen": -1.967565894126892, + "logits/rejected": -2.0015602111816406, + "logps/chosen": -158.85549926757812, + "logps/rejected": -165.99868774414062, + "loss": 0.643, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.298388808965683, + "rewards/margins": 0.15071076154708862, + "rewards/rejected": -0.4490995407104492, + "step": 631 + }, + { + "epoch": 0.83, + "learning_rate": 4.3078358871567706e-05, + "logits/chosen": -1.9660686254501343, + "logits/rejected": -2.092501163482666, + "logps/chosen": -134.13931274414062, + "logps/rejected": -151.00088500976562, + "loss": 0.6324, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.28502339124679565, + "rewards/margins": 0.22240550816059113, + "rewards/rejected": -0.5074288845062256, + "step": 632 + }, + { + "epoch": 0.83, + "learning_rate": 4.305359212305115e-05, + "logits/chosen": -2.0704879760742188, + "logits/rejected": -2.108783721923828, + "logps/chosen": -151.75942993164062, + "logps/rejected": -169.45274353027344, + "loss": 0.6533, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2645598351955414, + "rewards/margins": 0.18161676824092865, + "rewards/rejected": -0.44617658853530884, + "step": 633 + }, + { + "epoch": 0.83, + "learning_rate": 4.302878829090813e-05, + "logits/chosen": -2.2878875732421875, + "logits/rejected": -2.275372266769409, + "logps/chosen": -171.14463806152344, + "logps/rejected": -161.89524841308594, + "loss": 0.721, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.572907567024231, + "rewards/margins": 0.08118201792240143, + "rewards/rejected": -0.6540895700454712, + "step": 634 + }, + { + "epoch": 0.83, + "learning_rate": 4.300394742608784e-05, + "logits/chosen": -2.080106019973755, + "logits/rejected": -2.0426576137542725, + "logps/chosen": -151.9457550048828, + "logps/rejected": -161.84259033203125, + "loss": 0.6613, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22103726863861084, + "rewards/margins": 0.21995976567268372, + "rewards/rejected": -0.44099700450897217, + "step": 635 + }, + { + "epoch": 0.83, + "learning_rate": 4.2979069579615564e-05, + "logits/chosen": -2.027768850326538, + "logits/rejected": -2.0777463912963867, + "logps/chosen": -177.09764099121094, + "logps/rejected": -151.2614288330078, + "loss": 0.8068, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6577314734458923, + "rewards/margins": -0.037055857479572296, + "rewards/rejected": -0.6206756234169006, + "step": 636 + }, + { + "epoch": 0.83, + "learning_rate": 4.2954154802592514e-05, + "logits/chosen": -2.1159207820892334, + "logits/rejected": -2.081036329269409, + "logps/chosen": -177.8500213623047, + "logps/rejected": -154.96888732910156, + "loss": 0.8062, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.4962155818939209, + "rewards/margins": -0.15886664390563965, + "rewards/rejected": -0.33734893798828125, + "step": 637 + }, + { + "epoch": 0.83, + "learning_rate": 4.292920314619578e-05, + "logits/chosen": -2.139322519302368, + "logits/rejected": -2.1055707931518555, + "logps/chosen": -135.03768920898438, + "logps/rejected": -146.54347229003906, + "loss": 0.7386, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4281715750694275, + "rewards/margins": 0.03442011773586273, + "rewards/rejected": -0.4625917077064514, + "step": 638 + }, + { + "epoch": 0.84, + "learning_rate": 4.290421466167822e-05, + "logits/chosen": -2.173722743988037, + "logits/rejected": -2.145287036895752, + "logps/chosen": -188.748779296875, + "logps/rejected": -167.95037841796875, + "loss": 0.7986, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.45367497205734253, + "rewards/margins": -0.16028070449829102, + "rewards/rejected": -0.2933943271636963, + "step": 639 + }, + { + "epoch": 0.84, + "learning_rate": 4.2879189400368314e-05, + "logits/chosen": -1.828595757484436, + "logits/rejected": -1.8133007287979126, + "logps/chosen": -171.7655792236328, + "logps/rejected": -181.7562255859375, + "loss": 0.637, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15220165252685547, + "rewards/margins": 0.2077586054801941, + "rewards/rejected": -0.35996025800704956, + "step": 640 + }, + { + "epoch": 0.84, + "learning_rate": 4.2854127413670096e-05, + "logits/chosen": -2.1601057052612305, + "logits/rejected": -2.1688709259033203, + "logps/chosen": -140.67388916015625, + "logps/rejected": -159.5793914794922, + "loss": 0.6556, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.19438128173351288, + "rewards/margins": 0.14893627166748047, + "rewards/rejected": -0.34331756830215454, + "step": 641 + }, + { + "epoch": 0.84, + "learning_rate": 4.282902875306304e-05, + "logits/chosen": -2.2678325176239014, + "logits/rejected": -2.235651969909668, + "logps/chosen": -206.64051818847656, + "logps/rejected": -193.34620666503906, + "loss": 0.7065, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3568398952484131, + "rewards/margins": 0.062468186020851135, + "rewards/rejected": -0.419308066368103, + "step": 642 + }, + { + "epoch": 0.84, + "learning_rate": 4.280389347010194e-05, + "logits/chosen": -2.0710811614990234, + "logits/rejected": -2.140387535095215, + "logps/chosen": -148.68856811523438, + "logps/rejected": -162.29550170898438, + "loss": 0.7149, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.37908732891082764, + "rewards/margins": 0.052500005811452866, + "rewards/rejected": -0.4315873384475708, + "step": 643 + }, + { + "epoch": 0.84, + "learning_rate": 4.277872161641682e-05, + "logits/chosen": -2.075469493865967, + "logits/rejected": -2.0515270233154297, + "logps/chosen": -154.94790649414062, + "logps/rejected": -143.33999633789062, + "loss": 0.8673, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.636205792427063, + "rewards/margins": -0.23466622829437256, + "rewards/rejected": -0.40153956413269043, + "step": 644 + }, + { + "epoch": 0.84, + "learning_rate": 4.275351324371283e-05, + "logits/chosen": -2.034024953842163, + "logits/rejected": -2.0730140209198, + "logps/chosen": -165.8710479736328, + "logps/rejected": -167.48043823242188, + "loss": 0.6052, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4217650890350342, + "rewards/margins": 0.24776552617549896, + "rewards/rejected": -0.6695306301116943, + "step": 645 + }, + { + "epoch": 0.85, + "learning_rate": 4.2728268403770145e-05, + "logits/chosen": -2.018975019454956, + "logits/rejected": -2.0665860176086426, + "logps/chosen": -168.4711456298828, + "logps/rejected": -167.21612548828125, + "loss": 0.719, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.43721261620521545, + "rewards/margins": 0.02011800743639469, + "rewards/rejected": -0.4573306143283844, + "step": 646 + }, + { + "epoch": 0.85, + "learning_rate": 4.270298714844381e-05, + "logits/chosen": -2.073598623275757, + "logits/rejected": -2.032815456390381, + "logps/chosen": -168.9475860595703, + "logps/rejected": -157.1396942138672, + "loss": 0.7293, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.24852538108825684, + "rewards/margins": -0.01923494040966034, + "rewards/rejected": -0.2292904555797577, + "step": 647 + }, + { + "epoch": 0.85, + "learning_rate": 4.267766952966369e-05, + "logits/chosen": -1.8729684352874756, + "logits/rejected": -1.8412991762161255, + "logps/chosen": -157.169677734375, + "logps/rejected": -138.6353302001953, + "loss": 0.7324, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.472010999917984, + "rewards/margins": 0.06063162162899971, + "rewards/rejected": -0.5326426029205322, + "step": 648 + }, + { + "epoch": 0.85, + "learning_rate": 4.2652315599434354e-05, + "logits/chosen": -1.9037437438964844, + "logits/rejected": -1.9179785251617432, + "logps/chosen": -149.9954071044922, + "logps/rejected": -150.65032958984375, + "loss": 0.7982, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.272285133600235, + "rewards/margins": -0.07758139818906784, + "rewards/rejected": -0.19470372796058655, + "step": 649 + }, + { + "epoch": 0.85, + "learning_rate": 4.262692540983496e-05, + "logits/chosen": -2.205641746520996, + "logits/rejected": -2.2717151641845703, + "logps/chosen": -150.75364685058594, + "logps/rejected": -156.90733337402344, + "loss": 0.8112, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3120537996292114, + "rewards/margins": -0.1661502569913864, + "rewards/rejected": -0.1459035575389862, + "step": 650 + }, + { + "epoch": 0.85, + "learning_rate": 4.2601499013019126e-05, + "logits/chosen": -1.9804850816726685, + "logits/rejected": -2.0665392875671387, + "logps/chosen": -148.04840087890625, + "logps/rejected": -172.39447021484375, + "loss": 0.6469, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.41176366806030273, + "rewards/margins": 0.12474896013736725, + "rewards/rejected": -0.5365126132965088, + "step": 651 + }, + { + "epoch": 0.85, + "learning_rate": 4.257603646121484e-05, + "logits/chosen": -1.8488759994506836, + "logits/rejected": -1.9345241785049438, + "logps/chosen": -150.47325134277344, + "logps/rejected": -180.80213928222656, + "loss": 0.7586, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.2809077203273773, + "rewards/margins": -0.0009739026427268982, + "rewards/rejected": -0.2799338400363922, + "step": 652 + }, + { + "epoch": 0.85, + "learning_rate": 4.2550537806724384e-05, + "logits/chosen": -2.0807509422302246, + "logits/rejected": -2.0452427864074707, + "logps/chosen": -152.01800537109375, + "logps/rejected": -177.10731506347656, + "loss": 0.6403, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19310708343982697, + "rewards/margins": 0.18130317330360413, + "rewards/rejected": -0.3744102120399475, + "step": 653 + }, + { + "epoch": 0.86, + "learning_rate": 4.2525003101924164e-05, + "logits/chosen": -1.9181900024414062, + "logits/rejected": -1.9261229038238525, + "logps/chosen": -140.71975708007812, + "logps/rejected": -151.97988891601562, + "loss": 0.6749, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.08997268974781036, + "rewards/margins": 0.10172852873802185, + "rewards/rejected": -0.1917012333869934, + "step": 654 + }, + { + "epoch": 0.86, + "learning_rate": 4.249943239926467e-05, + "logits/chosen": -2.0650923252105713, + "logits/rejected": -1.9644784927368164, + "logps/chosen": -176.4941864013672, + "logps/rejected": -159.2071533203125, + "loss": 0.7129, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6318143010139465, + "rewards/margins": 0.05239555612206459, + "rewards/rejected": -0.6842098832130432, + "step": 655 + }, + { + "epoch": 0.86, + "learning_rate": 4.247382575127031e-05, + "logits/chosen": -2.0065248012542725, + "logits/rejected": -2.021730661392212, + "logps/chosen": -215.60427856445312, + "logps/rejected": -202.29327392578125, + "loss": 0.6533, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.05739132687449455, + "rewards/margins": 0.12948471307754517, + "rewards/rejected": -0.18687602877616882, + "step": 656 + }, + { + "epoch": 0.86, + "learning_rate": 4.2448183210539334e-05, + "logits/chosen": -2.0068016052246094, + "logits/rejected": -1.9955005645751953, + "logps/chosen": -178.9858856201172, + "logps/rejected": -138.95423889160156, + "loss": 0.7054, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.005308074876666069, + "rewards/margins": 0.04537278786301613, + "rewards/rejected": -0.040064722299575806, + "step": 657 + }, + { + "epoch": 0.86, + "learning_rate": 4.2422504829743724e-05, + "logits/chosen": -2.07608699798584, + "logits/rejected": -2.0257248878479004, + "logps/chosen": -175.4975128173828, + "logps/rejected": -152.2142333984375, + "loss": 0.6871, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.006427427753806114, + "rewards/margins": 0.06741438060998917, + "rewards/rejected": -0.07384180277585983, + "step": 658 + }, + { + "epoch": 0.86, + "learning_rate": 4.239679066162907e-05, + "logits/chosen": -1.834554672241211, + "logits/rejected": -1.9117746353149414, + "logps/chosen": -152.05563354492188, + "logps/rejected": -168.4638671875, + "loss": 0.6393, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.16257554292678833, + "rewards/margins": 0.14442546665668488, + "rewards/rejected": -0.3070010244846344, + "step": 659 + }, + { + "epoch": 0.86, + "learning_rate": 4.237104075901449e-05, + "logits/chosen": -1.856727123260498, + "logits/rejected": -1.8436062335968018, + "logps/chosen": -166.12420654296875, + "logps/rejected": -169.3896484375, + "loss": 0.8167, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.6063433289527893, + "rewards/margins": -0.16782012581825256, + "rewards/rejected": -0.4385232627391815, + "step": 660 + }, + { + "epoch": 0.87, + "learning_rate": 4.234525517479248e-05, + "logits/chosen": -1.8753873109817505, + "logits/rejected": -1.8495755195617676, + "logps/chosen": -149.54005432128906, + "logps/rejected": -163.44752502441406, + "loss": 0.5914, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15266531705856323, + "rewards/margins": 0.2658059000968933, + "rewards/rejected": -0.41847115755081177, + "step": 661 + }, + { + "epoch": 0.87, + "learning_rate": 4.2319433961928844e-05, + "logits/chosen": -2.13688063621521, + "logits/rejected": -2.1577110290527344, + "logps/chosen": -189.70277404785156, + "logps/rejected": -197.83218383789062, + "loss": 0.7608, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.2501004934310913, + "rewards/margins": -0.03931330889463425, + "rewards/rejected": -0.21078716218471527, + "step": 662 + }, + { + "epoch": 0.87, + "learning_rate": 4.229357717346257e-05, + "logits/chosen": -1.8148566484451294, + "logits/rejected": -1.7939611673355103, + "logps/chosen": -164.1804656982422, + "logps/rejected": -168.58926391601562, + "loss": 0.7507, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.46129778027534485, + "rewards/margins": -0.07121908664703369, + "rewards/rejected": -0.39007866382598877, + "step": 663 + }, + { + "epoch": 0.87, + "learning_rate": 4.226768486250572e-05, + "logits/chosen": -2.0947425365448, + "logits/rejected": -2.138808250427246, + "logps/chosen": -150.54244995117188, + "logps/rejected": -168.19464111328125, + "loss": 0.6282, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.030175644904375076, + "rewards/margins": 0.19879963994026184, + "rewards/rejected": -0.2289752960205078, + "step": 664 + }, + { + "epoch": 0.87, + "learning_rate": 4.224175708224332e-05, + "logits/chosen": -2.044949769973755, + "logits/rejected": -2.0898711681365967, + "logps/chosen": -167.0792999267578, + "logps/rejected": -173.0589599609375, + "loss": 0.7429, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.28365832567214966, + "rewards/margins": -0.027682170271873474, + "rewards/rejected": -0.255976140499115, + "step": 665 + }, + { + "epoch": 0.87, + "learning_rate": 4.221579388593326e-05, + "logits/chosen": -2.167201042175293, + "logits/rejected": -2.199467658996582, + "logps/chosen": -139.47561645507812, + "logps/rejected": -147.73927307128906, + "loss": 0.6791, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2867812514305115, + "rewards/margins": 0.12948867678642273, + "rewards/rejected": -0.4162698984146118, + "step": 666 + }, + { + "epoch": 0.87, + "learning_rate": 4.218979532690616e-05, + "logits/chosen": -2.078275680541992, + "logits/rejected": -2.125436305999756, + "logps/chosen": -132.52284240722656, + "logps/rejected": -142.66737365722656, + "loss": 0.9048, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.31447887420654297, + "rewards/margins": -0.27543795108795166, + "rewards/rejected": -0.0390409454703331, + "step": 667 + }, + { + "epoch": 0.87, + "learning_rate": 4.216376145856529e-05, + "logits/chosen": -1.9571113586425781, + "logits/rejected": -1.9044986963272095, + "logps/chosen": -149.46768188476562, + "logps/rejected": -135.63412475585938, + "loss": 0.5922, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15536966919898987, + "rewards/margins": 0.27842316031455994, + "rewards/rejected": -0.4337928295135498, + "step": 668 + }, + { + "epoch": 0.88, + "learning_rate": 4.213769233438646e-05, + "logits/chosen": -2.0616254806518555, + "logits/rejected": -2.054438352584839, + "logps/chosen": -171.1319122314453, + "logps/rejected": -173.5122528076172, + "loss": 0.8061, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.3872271478176117, + "rewards/margins": -0.13944876194000244, + "rewards/rejected": -0.24777841567993164, + "step": 669 + }, + { + "epoch": 0.88, + "learning_rate": 4.211158800791788e-05, + "logits/chosen": -2.0811805725097656, + "logits/rejected": -2.0651638507843018, + "logps/chosen": -134.8317413330078, + "logps/rejected": -159.145263671875, + "loss": 0.6551, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16378769278526306, + "rewards/margins": 0.17533215880393982, + "rewards/rejected": -0.33911988139152527, + "step": 670 + }, + { + "epoch": 0.88, + "learning_rate": 4.208544853278008e-05, + "logits/chosen": -1.8314135074615479, + "logits/rejected": -1.7468427419662476, + "logps/chosen": -162.07394409179688, + "logps/rejected": -195.4473876953125, + "loss": 0.9136, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5376362800598145, + "rewards/margins": -0.3255177438259125, + "rewards/rejected": -0.21211856603622437, + "step": 671 + }, + { + "epoch": 0.88, + "learning_rate": 4.205927396266577e-05, + "logits/chosen": -2.237910270690918, + "logits/rejected": -2.2668380737304688, + "logps/chosen": -190.4712677001953, + "logps/rejected": -189.41978454589844, + "loss": 0.7084, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.310894250869751, + "rewards/margins": 0.13236525654792786, + "rewards/rejected": -0.44325950741767883, + "step": 672 + }, + { + "epoch": 0.88, + "learning_rate": 4.203306435133978e-05, + "logits/chosen": -2.1241772174835205, + "logits/rejected": -2.1544597148895264, + "logps/chosen": -197.74461364746094, + "logps/rejected": -223.18495178222656, + "loss": 0.7411, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5014972686767578, + "rewards/margins": 0.0350608304142952, + "rewards/rejected": -0.5365581512451172, + "step": 673 + }, + { + "epoch": 0.88, + "learning_rate": 4.200681975263888e-05, + "logits/chosen": -1.9514243602752686, + "logits/rejected": -1.9178723096847534, + "logps/chosen": -148.33969116210938, + "logps/rejected": -155.63035583496094, + "loss": 0.8101, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.4579694867134094, + "rewards/margins": -0.15268799662590027, + "rewards/rejected": -0.30528151988983154, + "step": 674 + }, + { + "epoch": 0.88, + "learning_rate": 4.1980540220471744e-05, + "logits/chosen": -2.1476829051971436, + "logits/rejected": -2.1371512413024902, + "logps/chosen": -148.34912109375, + "logps/rejected": -143.85255432128906, + "loss": 0.7207, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.2867552638053894, + "rewards/margins": 0.0834084004163742, + "rewards/rejected": -0.3701636791229248, + "step": 675 + }, + { + "epoch": 0.88, + "learning_rate": 4.195422580881878e-05, + "logits/chosen": -2.039616346359253, + "logits/rejected": -2.122281551361084, + "logps/chosen": -146.25778198242188, + "logps/rejected": -152.91786193847656, + "loss": 0.6984, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.27001526951789856, + "rewards/margins": 0.08569072186946869, + "rewards/rejected": -0.35570600628852844, + "step": 676 + }, + { + "epoch": 0.89, + "learning_rate": 4.192787657173204e-05, + "logits/chosen": -2.0761122703552246, + "logits/rejected": -2.03676438331604, + "logps/chosen": -260.5777587890625, + "logps/rejected": -249.19309997558594, + "loss": 0.7267, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06880796700716019, + "rewards/margins": -0.00840199738740921, + "rewards/rejected": -0.06040596961975098, + "step": 677 + }, + { + "epoch": 0.89, + "learning_rate": 4.1901492563335115e-05, + "logits/chosen": -1.779012680053711, + "logits/rejected": -1.868876576423645, + "logps/chosen": -171.57522583007812, + "logps/rejected": -179.360107421875, + "loss": 0.7206, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.45824557542800903, + "rewards/margins": 0.07630608975887299, + "rewards/rejected": -0.534551739692688, + "step": 678 + }, + { + "epoch": 0.89, + "learning_rate": 4.187507383782303e-05, + "logits/chosen": -1.95026695728302, + "logits/rejected": -2.003551959991455, + "logps/chosen": -152.73373413085938, + "logps/rejected": -153.47882080078125, + "loss": 0.742, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.15587645769119263, + "rewards/margins": 0.019024118781089783, + "rewards/rejected": -0.1749005764722824, + "step": 679 + }, + { + "epoch": 0.89, + "learning_rate": 4.1848620449462115e-05, + "logits/chosen": -2.1446237564086914, + "logits/rejected": -2.1463828086853027, + "logps/chosen": -155.42047119140625, + "logps/rejected": -152.02236938476562, + "loss": 0.7138, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.29967939853668213, + "rewards/margins": 0.058842338621616364, + "rewards/rejected": -0.3585217297077179, + "step": 680 + }, + { + "epoch": 0.89, + "learning_rate": 4.1822132452589885e-05, + "logits/chosen": -1.958126425743103, + "logits/rejected": -1.9775969982147217, + "logps/chosen": -147.2157745361328, + "logps/rejected": -177.09339904785156, + "loss": 0.7635, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3013852834701538, + "rewards/margins": -0.06730309128761292, + "rewards/rejected": -0.2340822070837021, + "step": 681 + }, + { + "epoch": 0.89, + "learning_rate": 4.1795609901614966e-05, + "logits/chosen": -2.1244254112243652, + "logits/rejected": -2.1423747539520264, + "logps/chosen": -154.98565673828125, + "logps/rejected": -160.26119995117188, + "loss": 0.7795, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.2978401184082031, + "rewards/margins": -0.14463642239570618, + "rewards/rejected": -0.15320369601249695, + "step": 682 + }, + { + "epoch": 0.89, + "learning_rate": 4.176905285101695e-05, + "logits/chosen": -2.1043314933776855, + "logits/rejected": -2.0934536457061768, + "logps/chosen": -168.54006958007812, + "logps/rejected": -175.0818634033203, + "loss": 0.7897, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6096583008766174, + "rewards/margins": -0.020460113883018494, + "rewards/rejected": -0.5891982316970825, + "step": 683 + }, + { + "epoch": 0.9, + "learning_rate": 4.17424613553463e-05, + "logits/chosen": -2.0135533809661865, + "logits/rejected": -2.057865858078003, + "logps/chosen": -165.50119018554688, + "logps/rejected": -158.755859375, + "loss": 0.7246, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.37735715508461, + "rewards/margins": -0.000978667289018631, + "rewards/rejected": -0.37637850642204285, + "step": 684 + }, + { + "epoch": 0.9, + "learning_rate": 4.171583546922423e-05, + "logits/chosen": -1.9394524097442627, + "logits/rejected": -1.9916284084320068, + "logps/chosen": -149.19032287597656, + "logps/rejected": -168.09466552734375, + "loss": 0.7205, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.17173324525356293, + "rewards/margins": 0.10954613983631134, + "rewards/rejected": -0.2812793552875519, + "step": 685 + }, + { + "epoch": 0.9, + "learning_rate": 4.1689175247342584e-05, + "logits/chosen": -2.00577712059021, + "logits/rejected": -1.9910097122192383, + "logps/chosen": -181.58145141601562, + "logps/rejected": -168.53257751464844, + "loss": 0.7636, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5042218565940857, + "rewards/margins": -0.05149242654442787, + "rewards/rejected": -0.4527294337749481, + "step": 686 + }, + { + "epoch": 0.9, + "learning_rate": 4.1662480744463744e-05, + "logits/chosen": -1.9668911695480347, + "logits/rejected": -2.0537397861480713, + "logps/chosen": -146.09317016601562, + "logps/rejected": -173.7940673828125, + "loss": 0.5798, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5452170968055725, + "rewards/margins": 0.28624922037124634, + "rewards/rejected": -0.8314663767814636, + "step": 687 + }, + { + "epoch": 0.9, + "learning_rate": 4.163575201542052e-05, + "logits/chosen": -1.8841594457626343, + "logits/rejected": -1.8778733015060425, + "logps/chosen": -161.64395141601562, + "logps/rejected": -167.02130126953125, + "loss": 0.6612, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2581769526004791, + "rewards/margins": 0.09805278480052948, + "rewards/rejected": -0.3562297224998474, + "step": 688 + }, + { + "epoch": 0.9, + "learning_rate": 4.1608989115116e-05, + "logits/chosen": -2.224517583847046, + "logits/rejected": -2.1719701290130615, + "logps/chosen": -157.36236572265625, + "logps/rejected": -159.3743438720703, + "loss": 0.7116, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5532806515693665, + "rewards/margins": 0.07318402081727982, + "rewards/rejected": -0.6264646649360657, + "step": 689 + }, + { + "epoch": 0.9, + "learning_rate": 4.158219209852349e-05, + "logits/chosen": -2.2228987216949463, + "logits/rejected": -2.118849277496338, + "logps/chosen": -157.40219116210938, + "logps/rejected": -158.98670959472656, + "loss": 0.769, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.47785037755966187, + "rewards/margins": -0.026296239346265793, + "rewards/rejected": -0.45155417919158936, + "step": 690 + }, + { + "epoch": 0.9, + "learning_rate": 4.155536102068636e-05, + "logits/chosen": -2.170536756515503, + "logits/rejected": -2.1680848598480225, + "logps/chosen": -187.9390869140625, + "logps/rejected": -193.16246032714844, + "loss": 0.7623, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.00898093730211258, + "rewards/margins": -0.09791917353868484, + "rewards/rejected": 0.10690011084079742, + "step": 691 + }, + { + "epoch": 0.91, + "learning_rate": 4.152849593671793e-05, + "logits/chosen": -2.271533727645874, + "logits/rejected": -2.3052127361297607, + "logps/chosen": -158.00819396972656, + "logps/rejected": -174.9010467529297, + "loss": 0.8109, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4787377119064331, + "rewards/margins": -0.16133984923362732, + "rewards/rejected": -0.3173978626728058, + "step": 692 + }, + { + "epoch": 0.91, + "learning_rate": 4.1501596901801384e-05, + "logits/chosen": -2.1939315795898438, + "logits/rejected": -2.177248954772949, + "logps/chosen": -142.37559509277344, + "logps/rejected": -153.59266662597656, + "loss": 0.7123, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2645539939403534, + "rewards/margins": 0.06362085044384003, + "rewards/rejected": -0.3281748294830322, + "step": 693 + }, + { + "epoch": 0.91, + "learning_rate": 4.147466397118968e-05, + "logits/chosen": -2.2683045864105225, + "logits/rejected": -2.251051187515259, + "logps/chosen": -157.3998565673828, + "logps/rejected": -158.64883422851562, + "loss": 0.7921, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.3830302655696869, + "rewards/margins": -0.15283122658729553, + "rewards/rejected": -0.23019905388355255, + "step": 694 + }, + { + "epoch": 0.91, + "learning_rate": 4.144769720020533e-05, + "logits/chosen": -2.172579050064087, + "logits/rejected": -2.182398796081543, + "logps/chosen": -166.13075256347656, + "logps/rejected": -157.43788146972656, + "loss": 0.8064, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5050195455551147, + "rewards/margins": -0.07979856431484222, + "rewards/rejected": -0.42522096633911133, + "step": 695 + }, + { + "epoch": 0.91, + "learning_rate": 4.142069664424041e-05, + "logits/chosen": -2.117938280105591, + "logits/rejected": -2.1075210571289062, + "logps/chosen": -161.34828186035156, + "logps/rejected": -152.29771423339844, + "loss": 0.7732, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.18459086120128632, + "rewards/margins": -0.009315093979239464, + "rewards/rejected": -0.1752757579088211, + "step": 696 + }, + { + "epoch": 0.91, + "learning_rate": 4.139366235875637e-05, + "logits/chosen": -1.6788434982299805, + "logits/rejected": -1.7080583572387695, + "logps/chosen": -161.94412231445312, + "logps/rejected": -151.36595153808594, + "loss": 0.6239, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19800153374671936, + "rewards/margins": 0.2967408299446106, + "rewards/rejected": -0.49474233388900757, + "step": 697 + }, + { + "epoch": 0.91, + "learning_rate": 4.136659439928397e-05, + "logits/chosen": -1.9581472873687744, + "logits/rejected": -2.0028040409088135, + "logps/chosen": -155.76913452148438, + "logps/rejected": -160.2482452392578, + "loss": 0.7087, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.3814413845539093, + "rewards/margins": 0.04772702604532242, + "rewards/rejected": -0.4291684031486511, + "step": 698 + }, + { + "epoch": 0.91, + "learning_rate": 4.13394928214231e-05, + "logits/chosen": -2.0003318786621094, + "logits/rejected": -2.0119810104370117, + "logps/chosen": -163.40011596679688, + "logps/rejected": -171.38818359375, + "loss": 0.6004, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.04564451798796654, + "rewards/margins": 0.31329357624053955, + "rewards/rejected": -0.3589380979537964, + "step": 699 + }, + { + "epoch": 0.92, + "learning_rate": 4.1312357680842735e-05, + "logits/chosen": -2.158578395843506, + "logits/rejected": -2.1833291053771973, + "logps/chosen": -147.8467254638672, + "logps/rejected": -146.0576171875, + "loss": 0.9128, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.12403154373168945, + "rewards/margins": -0.2896636128425598, + "rewards/rejected": 0.16563208401203156, + "step": 700 + }, + { + "epoch": 0.92, + "learning_rate": 4.128518903328078e-05, + "logits/chosen": -2.157421350479126, + "logits/rejected": -2.1012282371520996, + "logps/chosen": -165.8155517578125, + "logps/rejected": -168.45291137695312, + "loss": 0.9046, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.408125638961792, + "rewards/margins": -0.2868077754974365, + "rewards/rejected": -0.12131791561841965, + "step": 701 + }, + { + "epoch": 0.92, + "learning_rate": 4.125798693454396e-05, + "logits/chosen": -2.077611207962036, + "logits/rejected": -2.026050567626953, + "logps/chosen": -142.8761749267578, + "logps/rejected": -118.22064208984375, + "loss": 0.7815, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.11512994021177292, + "rewards/margins": -0.07021620869636536, + "rewards/rejected": -0.044913746416568756, + "step": 702 + }, + { + "epoch": 0.92, + "learning_rate": 4.123075144050772e-05, + "logits/chosen": -1.8848479986190796, + "logits/rejected": -1.810481309890747, + "logps/chosen": -165.26451110839844, + "logps/rejected": -153.42242431640625, + "loss": 0.7349, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6419331431388855, + "rewards/margins": -0.010387040674686432, + "rewards/rejected": -0.6315460801124573, + "step": 703 + }, + { + "epoch": 0.92, + "learning_rate": 4.120348260711611e-05, + "logits/chosen": -2.085160255432129, + "logits/rejected": -2.0904765129089355, + "logps/chosen": -236.3929443359375, + "logps/rejected": -242.44451904296875, + "loss": 0.7578, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.37019020318984985, + "rewards/margins": -0.07077489048242569, + "rewards/rejected": -0.2994152903556824, + "step": 704 + }, + { + "epoch": 0.92, + "learning_rate": 4.117618049038165e-05, + "logits/chosen": -2.1439950466156006, + "logits/rejected": -2.22857666015625, + "logps/chosen": -144.78225708007812, + "logps/rejected": -181.41627502441406, + "loss": 0.7095, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.03200235217809677, + "rewards/margins": 0.057953305542469025, + "rewards/rejected": -0.02595096081495285, + "step": 705 + }, + { + "epoch": 0.92, + "learning_rate": 4.1148845146385214e-05, + "logits/chosen": -2.0723843574523926, + "logits/rejected": -2.1756937503814697, + "logps/chosen": -138.92593383789062, + "logps/rejected": -126.64840698242188, + "loss": 0.8782, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6072953939437866, + "rewards/margins": -0.19861473143100739, + "rewards/rejected": -0.40868061780929565, + "step": 706 + }, + { + "epoch": 0.93, + "learning_rate": 4.112147663127596e-05, + "logits/chosen": -2.0885486602783203, + "logits/rejected": -2.033440113067627, + "logps/chosen": -152.20188903808594, + "logps/rejected": -152.0369415283203, + "loss": 0.6951, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3034742772579193, + "rewards/margins": 0.04874863103032112, + "rewards/rejected": -0.35222291946411133, + "step": 707 + }, + { + "epoch": 0.93, + "learning_rate": 4.109407500127116e-05, + "logits/chosen": -1.9431442022323608, + "logits/rejected": -1.9316291809082031, + "logps/chosen": -170.7488555908203, + "logps/rejected": -174.8116455078125, + "loss": 0.6792, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.2055988609790802, + "rewards/margins": 0.07953611761331558, + "rewards/rejected": 0.1260627806186676, + "step": 708 + }, + { + "epoch": 0.93, + "learning_rate": 4.106664031265611e-05, + "logits/chosen": -2.0817761421203613, + "logits/rejected": -2.1072683334350586, + "logps/chosen": -163.31427001953125, + "logps/rejected": -174.4534912109375, + "loss": 0.5775, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2241707593202591, + "rewards/margins": 0.33371949195861816, + "rewards/rejected": -0.10954873263835907, + "step": 709 + }, + { + "epoch": 0.93, + "learning_rate": 4.103917262178402e-05, + "logits/chosen": -2.1241044998168945, + "logits/rejected": -2.0995607376098633, + "logps/chosen": -144.4809112548828, + "logps/rejected": -157.41078186035156, + "loss": 0.7782, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.273907333612442, + "rewards/margins": -0.07136461138725281, + "rewards/rejected": -0.20254270732402802, + "step": 710 + }, + { + "epoch": 0.93, + "learning_rate": 4.1011671985075865e-05, + "logits/chosen": -1.778105616569519, + "logits/rejected": -1.807044267654419, + "logps/chosen": -171.52398681640625, + "logps/rejected": -175.2049102783203, + "loss": 0.9113, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.46856945753097534, + "rewards/margins": -0.3406550884246826, + "rewards/rejected": -0.12791432440280914, + "step": 711 + }, + { + "epoch": 0.93, + "learning_rate": 4.098413845902033e-05, + "logits/chosen": -1.969050645828247, + "logits/rejected": -1.9264585971832275, + "logps/chosen": -175.43978881835938, + "logps/rejected": -158.25021362304688, + "loss": 0.8706, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.41940295696258545, + "rewards/margins": -0.2438211590051651, + "rewards/rejected": -0.17558182775974274, + "step": 712 + }, + { + "epoch": 0.93, + "learning_rate": 4.095657210017364e-05, + "logits/chosen": -2.143251895904541, + "logits/rejected": -2.111879587173462, + "logps/chosen": -162.4940185546875, + "logps/rejected": -155.96981811523438, + "loss": 0.6832, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04569421708583832, + "rewards/margins": 0.11248306930065155, + "rewards/rejected": -0.15817728638648987, + "step": 713 + }, + { + "epoch": 0.93, + "learning_rate": 4.092897296515944e-05, + "logits/chosen": -2.0187065601348877, + "logits/rejected": -1.9244728088378906, + "logps/chosen": -164.12301635742188, + "logps/rejected": -184.9077606201172, + "loss": 0.6384, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.13283205032348633, + "rewards/margins": 0.15339265763759613, + "rewards/rejected": -0.28622472286224365, + "step": 714 + }, + { + "epoch": 0.94, + "learning_rate": 4.090134111066874e-05, + "logits/chosen": -1.8528801202774048, + "logits/rejected": -1.8440043926239014, + "logps/chosen": -179.0013427734375, + "logps/rejected": -188.64830017089844, + "loss": 0.7665, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.24608290195465088, + "rewards/margins": -0.08188387751579285, + "rewards/rejected": -0.16419902443885803, + "step": 715 + }, + { + "epoch": 0.94, + "learning_rate": 4.0873676593459725e-05, + "logits/chosen": -1.9546207189559937, + "logits/rejected": -1.875700831413269, + "logps/chosen": -135.4862060546875, + "logps/rejected": -141.05101013183594, + "loss": 0.6278, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.23251035809516907, + "rewards/margins": 0.21048937737941742, + "rewards/rejected": -0.4429997205734253, + "step": 716 + }, + { + "epoch": 0.94, + "learning_rate": 4.08459794703577e-05, + "logits/chosen": -2.0966317653656006, + "logits/rejected": -2.0341391563415527, + "logps/chosen": -151.62359619140625, + "logps/rejected": -155.7061767578125, + "loss": 0.6931, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.1692691296339035, + "rewards/margins": 0.05348172038793564, + "rewards/rejected": -0.22275087237358093, + "step": 717 + }, + { + "epoch": 0.94, + "learning_rate": 4.081824979825492e-05, + "logits/chosen": -2.116243362426758, + "logits/rejected": -2.13222074508667, + "logps/chosen": -147.01651000976562, + "logps/rejected": -149.95855712890625, + "loss": 0.7723, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.2274257391691208, + "rewards/margins": -0.09935702383518219, + "rewards/rejected": -0.128068745136261, + "step": 718 + }, + { + "epoch": 0.94, + "learning_rate": 4.07904876341105e-05, + "logits/chosen": -2.0546305179595947, + "logits/rejected": -2.1076295375823975, + "logps/chosen": -152.6702117919922, + "logps/rejected": -143.9462127685547, + "loss": 0.7773, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.06750668585300446, + "rewards/margins": -0.091024249792099, + "rewards/rejected": 0.023517563939094543, + "step": 719 + }, + { + "epoch": 0.94, + "learning_rate": 4.076269303495033e-05, + "logits/chosen": -2.0860443115234375, + "logits/rejected": -2.146270751953125, + "logps/chosen": -143.63839721679688, + "logps/rejected": -160.30484008789062, + "loss": 0.6651, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.12404690682888031, + "rewards/margins": 0.13515986502170563, + "rewards/rejected": -0.2592068016529083, + "step": 720 + }, + { + "epoch": 0.94, + "learning_rate": 4.073486605786689e-05, + "logits/chosen": -2.288916826248169, + "logits/rejected": -2.24888277053833, + "logps/chosen": -195.5283203125, + "logps/rejected": -183.82054138183594, + "loss": 0.712, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.2654836177825928, + "rewards/margins": -0.004374176263809204, + "rewards/rejected": -0.26110947132110596, + "step": 721 + }, + { + "epoch": 0.94, + "learning_rate": 4.0707006760019175e-05, + "logits/chosen": -2.06491756439209, + "logits/rejected": -2.0922203063964844, + "logps/chosen": -151.57093811035156, + "logps/rejected": -156.29843139648438, + "loss": 0.7023, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3546985387802124, + "rewards/margins": 0.029046528041362762, + "rewards/rejected": -0.38374507427215576, + "step": 722 + }, + { + "epoch": 0.95, + "learning_rate": 4.067911519863257e-05, + "logits/chosen": -2.074341058731079, + "logits/rejected": -2.069526433944702, + "logps/chosen": -144.95079040527344, + "logps/rejected": -143.27981567382812, + "loss": 0.9193, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.13518384099006653, + "rewards/margins": -0.2904050052165985, + "rewards/rejected": 0.15522116422653198, + "step": 723 + }, + { + "epoch": 0.95, + "learning_rate": 4.065119143099874e-05, + "logits/chosen": -2.0665669441223145, + "logits/rejected": -2.0500869750976562, + "logps/chosen": -129.72299194335938, + "logps/rejected": -123.09598541259766, + "loss": 0.6954, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.1962793618440628, + "rewards/margins": 0.07191440463066101, + "rewards/rejected": -0.268193781375885, + "step": 724 + }, + { + "epoch": 0.95, + "learning_rate": 4.062323551447549e-05, + "logits/chosen": -1.8880443572998047, + "logits/rejected": -1.9188331365585327, + "logps/chosen": -197.51145935058594, + "logps/rejected": -205.3161163330078, + "loss": 0.7255, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.244761124253273, + "rewards/margins": 0.003908276557922363, + "rewards/rejected": -0.24866940081119537, + "step": 725 + }, + { + "epoch": 0.95, + "learning_rate": 4.059524750648668e-05, + "logits/chosen": -2.0474305152893066, + "logits/rejected": -2.0502195358276367, + "logps/chosen": -146.49232482910156, + "logps/rejected": -169.22877502441406, + "loss": 0.8212, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.12031140178442001, + "rewards/margins": -0.1856772005558014, + "rewards/rejected": 0.06536579132080078, + "step": 726 + }, + { + "epoch": 0.95, + "learning_rate": 4.056722746452207e-05, + "logits/chosen": -1.802725076675415, + "logits/rejected": -1.8228119611740112, + "logps/chosen": -155.632080078125, + "logps/rejected": -176.6995849609375, + "loss": 0.9494, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.5103746652603149, + "rewards/margins": -0.398013174533844, + "rewards/rejected": -0.11236148327589035, + "step": 727 + }, + { + "epoch": 0.95, + "learning_rate": 4.053917544613723e-05, + "logits/chosen": -2.0305285453796387, + "logits/rejected": -1.9622552394866943, + "logps/chosen": -155.14163208007812, + "logps/rejected": -141.12828063964844, + "loss": 0.7736, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.26194462180137634, + "rewards/margins": -0.11262011528015137, + "rewards/rejected": -0.1493244767189026, + "step": 728 + }, + { + "epoch": 0.95, + "learning_rate": 4.051109150895343e-05, + "logits/chosen": -2.125661611557007, + "logits/rejected": -2.060500144958496, + "logps/chosen": -136.96627807617188, + "logps/rejected": -152.24871826171875, + "loss": 0.6332, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.14833050966262817, + "rewards/margins": 0.16227081418037415, + "rewards/rejected": -0.3106013536453247, + "step": 729 + }, + { + "epoch": 0.96, + "learning_rate": 4.0482975710657455e-05, + "logits/chosen": -2.14182710647583, + "logits/rejected": -2.084892511367798, + "logps/chosen": -175.79473876953125, + "logps/rejected": -168.12074279785156, + "loss": 0.8449, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.2244684100151062, + "rewards/margins": -0.2297031730413437, + "rewards/rejected": 0.0052347928285598755, + "step": 730 + }, + { + "epoch": 0.96, + "learning_rate": 4.045482810900159e-05, + "logits/chosen": -2.0067529678344727, + "logits/rejected": -2.0317680835723877, + "logps/chosen": -168.8496856689453, + "logps/rejected": -163.8263397216797, + "loss": 0.6518, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19057075679302216, + "rewards/margins": 0.1778174489736557, + "rewards/rejected": -0.36838820576667786, + "step": 731 + }, + { + "epoch": 0.96, + "learning_rate": 4.042664876180341e-05, + "logits/chosen": -2.1688036918640137, + "logits/rejected": -2.1718666553497314, + "logps/chosen": -170.41114807128906, + "logps/rejected": -169.48265075683594, + "loss": 0.8418, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.15957602858543396, + "rewards/margins": -0.18063148856163025, + "rewards/rejected": 0.021055400371551514, + "step": 732 + }, + { + "epoch": 0.96, + "learning_rate": 4.0398437726945716e-05, + "logits/chosen": -1.8246833086013794, + "logits/rejected": -1.7999447584152222, + "logps/chosen": -142.14801025390625, + "logps/rejected": -152.80712890625, + "loss": 0.8179, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.20929935574531555, + "rewards/margins": -0.17062106728553772, + "rewards/rejected": -0.03867826238274574, + "step": 733 + }, + { + "epoch": 0.96, + "learning_rate": 4.037019506237638e-05, + "logits/chosen": -1.6244423389434814, + "logits/rejected": -1.6328479051589966, + "logps/chosen": -172.3773956298828, + "logps/rejected": -212.7267303466797, + "loss": 0.5395, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.04836279898881912, + "rewards/margins": 0.5529451370239258, + "rewards/rejected": -0.6013079881668091, + "step": 734 + }, + { + "epoch": 0.96, + "learning_rate": 4.034192082610828e-05, + "logits/chosen": -2.0388615131378174, + "logits/rejected": -2.085139513015747, + "logps/chosen": -141.1918182373047, + "logps/rejected": -139.84365844726562, + "loss": 0.6897, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.08627957850694656, + "rewards/margins": 0.07983030378818512, + "rewards/rejected": 0.006449267268180847, + "step": 735 + }, + { + "epoch": 0.96, + "learning_rate": 4.031361507621911e-05, + "logits/chosen": -2.163036584854126, + "logits/rejected": -2.172485589981079, + "logps/chosen": -149.1195068359375, + "logps/rejected": -136.47434997558594, + "loss": 0.7033, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.05051477998495102, + "rewards/margins": 0.08880610764026642, + "rewards/rejected": -0.0382913202047348, + "step": 736 + }, + { + "epoch": 0.96, + "learning_rate": 4.02852778708513e-05, + "logits/chosen": -1.6721755266189575, + "logits/rejected": -1.6523504257202148, + "logps/chosen": -180.20445251464844, + "logps/rejected": -192.36607360839844, + "loss": 0.7664, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.17255781590938568, + "rewards/margins": -0.05452100187540054, + "rewards/rejected": -0.11803679168224335, + "step": 737 + }, + { + "epoch": 0.97, + "learning_rate": 4.0256909268211914e-05, + "logits/chosen": -2.0831446647644043, + "logits/rejected": -2.0978503227233887, + "logps/chosen": -121.0216064453125, + "logps/rejected": -125.146484375, + "loss": 0.7649, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.14153949916362762, + "rewards/margins": -0.12446232885122299, + "rewards/rejected": -0.017077183350920677, + "step": 738 + }, + { + "epoch": 0.97, + "learning_rate": 4.0228509326572496e-05, + "logits/chosen": -1.6901588439941406, + "logits/rejected": -1.721649408340454, + "logps/chosen": -134.27215576171875, + "logps/rejected": -136.54830932617188, + "loss": 0.6938, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.012902306392788887, + "rewards/margins": 0.09037284553050995, + "rewards/rejected": -0.07747054100036621, + "step": 739 + }, + { + "epoch": 0.97, + "learning_rate": 4.0200078104268944e-05, + "logits/chosen": -2.149341344833374, + "logits/rejected": -2.117537021636963, + "logps/chosen": -161.55764770507812, + "logps/rejected": -142.29367065429688, + "loss": 0.8426, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.18234218657016754, + "rewards/margins": -0.15283283591270447, + "rewards/rejected": -0.02950935624539852, + "step": 740 + }, + { + "epoch": 0.97, + "learning_rate": 4.017161565970144e-05, + "logits/chosen": -1.7856075763702393, + "logits/rejected": -1.7834945917129517, + "logps/chosen": -158.6090545654297, + "logps/rejected": -158.350341796875, + "loss": 0.7749, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.17489013075828552, + "rewards/margins": -0.10388021171092987, + "rewards/rejected": -0.07100993394851685, + "step": 741 + }, + { + "epoch": 0.97, + "learning_rate": 4.014312205133428e-05, + "logits/chosen": -2.026144027709961, + "logits/rejected": -2.0260369777679443, + "logps/chosen": -169.1302490234375, + "logps/rejected": -125.34761047363281, + "loss": 0.778, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.043028444051742554, + "rewards/margins": -0.08100062608718872, + "rewards/rejected": 0.12402907758951187, + "step": 742 + }, + { + "epoch": 0.97, + "learning_rate": 4.011459733769579e-05, + "logits/chosen": -1.9844965934753418, + "logits/rejected": -2.031277656555176, + "logps/chosen": -139.288818359375, + "logps/rejected": -152.935791015625, + "loss": 0.8053, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.28731998801231384, + "rewards/margins": -0.16818708181381226, + "rewards/rejected": -0.11913290619850159, + "step": 743 + }, + { + "epoch": 0.97, + "learning_rate": 4.0086041577378166e-05, + "logits/chosen": -1.9718449115753174, + "logits/rejected": -2.001786708831787, + "logps/chosen": -113.56793212890625, + "logps/rejected": -115.19658660888672, + "loss": 0.6771, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.10746096074581146, + "rewards/margins": 0.0731014758348465, + "rewards/rejected": -0.18056243658065796, + "step": 744 + }, + { + "epoch": 0.97, + "learning_rate": 4.005745482903739e-05, + "logits/chosen": -1.9297832250595093, + "logits/rejected": -1.8905633687973022, + "logps/chosen": -189.31784057617188, + "logps/rejected": -230.98753356933594, + "loss": 0.6559, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.15024004876613617, + "rewards/margins": 0.3164423704147339, + "rewards/rejected": -0.466682493686676, + "step": 745 + }, + { + "epoch": 0.98, + "learning_rate": 4.002883715139309e-05, + "logits/chosen": -1.7887859344482422, + "logits/rejected": -1.82424795627594, + "logps/chosen": -208.4186553955078, + "logps/rejected": -213.05670166015625, + "loss": 0.6253, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.026865793392062187, + "rewards/margins": 0.1876009702682495, + "rewards/rejected": -0.16073518991470337, + "step": 746 + }, + { + "epoch": 0.98, + "learning_rate": 4.000018860322845e-05, + "logits/chosen": -2.2344820499420166, + "logits/rejected": -2.233741283416748, + "logps/chosen": -183.4325408935547, + "logps/rejected": -181.87393188476562, + "loss": 0.5643, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.1357138305902481, + "rewards/margins": 0.355926513671875, + "rewards/rejected": -0.22021271288394928, + "step": 747 + }, + { + "epoch": 0.98, + "learning_rate": 3.9971509243390025e-05, + "logits/chosen": -2.194760799407959, + "logits/rejected": -2.2292375564575195, + "logps/chosen": -156.87254333496094, + "logps/rejected": -168.62025451660156, + "loss": 0.7486, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.046510808169841766, + "rewards/margins": 0.025812923908233643, + "rewards/rejected": -0.07232370227575302, + "step": 748 + }, + { + "epoch": 0.98, + "learning_rate": 3.99427991307877e-05, + "logits/chosen": -1.8178576231002808, + "logits/rejected": -1.8639076948165894, + "logps/chosen": -148.9747314453125, + "logps/rejected": -161.0830078125, + "loss": 0.6006, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.05070438235998154, + "rewards/margins": 0.26105034351348877, + "rewards/rejected": -0.21034595370292664, + "step": 749 + }, + { + "epoch": 0.98, + "learning_rate": 3.9914058324394486e-05, + "logits/chosen": -2.137637138366699, + "logits/rejected": -2.1838955879211426, + "logps/chosen": -171.7498016357422, + "logps/rejected": -182.58238220214844, + "loss": 0.628, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.053341031074523926, + "rewards/margins": 0.19571509957313538, + "rewards/rejected": -0.14237406849861145, + "step": 750 + }, + { + "epoch": 0.98, + "learning_rate": 3.9885286883246476e-05, + "logits/chosen": -1.965986728668213, + "logits/rejected": -2.001103162765503, + "logps/chosen": -151.49935913085938, + "logps/rejected": -180.28414916992188, + "loss": 0.7626, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.19454632699489594, + "rewards/margins": -0.05682283639907837, + "rewards/rejected": -0.13772347569465637, + "step": 751 + }, + { + "epoch": 0.98, + "learning_rate": 3.985648486644267e-05, + "logits/chosen": -2.0965442657470703, + "logits/rejected": -2.115654706954956, + "logps/chosen": -153.47987365722656, + "logps/rejected": -161.52932739257812, + "loss": 0.8653, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.03188393637537956, + "rewards/margins": -0.20181989669799805, + "rewards/rejected": 0.169935941696167, + "step": 752 + }, + { + "epoch": 0.99, + "learning_rate": 3.982765233314489e-05, + "logits/chosen": -1.8265806436538696, + "logits/rejected": -1.8221160173416138, + "logps/chosen": -201.52655029296875, + "logps/rejected": -200.19342041015625, + "loss": 0.7038, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.10736799240112305, + "rewards/margins": 0.025161506608128548, + "rewards/rejected": -0.13252949714660645, + "step": 753 + }, + { + "epoch": 0.99, + "learning_rate": 3.979878934257762e-05, + "logits/chosen": -1.97774076461792, + "logits/rejected": -1.9710618257522583, + "logps/chosen": -161.500732421875, + "logps/rejected": -155.46812438964844, + "loss": 0.7226, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.40990006923675537, + "rewards/margins": 0.09453297406435013, + "rewards/rejected": -0.5044330954551697, + "step": 754 + }, + { + "epoch": 0.99, + "learning_rate": 3.976989595402793e-05, + "logits/chosen": -1.9539647102355957, + "logits/rejected": -1.8984624147415161, + "logps/chosen": -156.4872283935547, + "logps/rejected": -157.50094604492188, + "loss": 1.0053, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.43122974038124084, + "rewards/margins": -0.45546600222587585, + "rewards/rejected": 0.024236250668764114, + "step": 755 + }, + { + "epoch": 0.99, + "learning_rate": 3.974097222684532e-05, + "logits/chosen": -2.0011062622070312, + "logits/rejected": -2.0779013633728027, + "logps/chosen": -142.67002868652344, + "logps/rejected": -146.49705505371094, + "loss": 0.7767, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.021236799657344818, + "rewards/margins": -0.09352488815784454, + "rewards/rejected": 0.07228809595108032, + "step": 756 + }, + { + "epoch": 0.99, + "learning_rate": 3.9712018220441596e-05, + "logits/chosen": -2.1120223999023438, + "logits/rejected": -2.0883326530456543, + "logps/chosen": -150.02957153320312, + "logps/rejected": -154.8041534423828, + "loss": 0.8471, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.38133108615875244, + "rewards/margins": -0.2452768087387085, + "rewards/rejected": -0.13605426251888275, + "step": 757 + }, + { + "epoch": 0.99, + "learning_rate": 3.9683033994290767e-05, + "logits/chosen": -1.9895920753479004, + "logits/rejected": -2.0780937671661377, + "logps/chosen": -122.83441925048828, + "logps/rejected": -135.78131103515625, + "loss": 0.7101, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.042702965438365936, + "rewards/margins": 0.10642305016517639, + "rewards/rejected": -0.14912599325180054, + "step": 758 + }, + { + "epoch": 0.99, + "learning_rate": 3.965401960792894e-05, + "logits/chosen": -1.9442851543426514, + "logits/rejected": -1.9203828573226929, + "logps/chosen": -163.14080810546875, + "logps/rejected": -158.33706665039062, + "loss": 0.7844, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.21191173791885376, + "rewards/margins": -0.12215844541788101, + "rewards/rejected": -0.08975328505039215, + "step": 759 + }, + { + "epoch": 0.99, + "learning_rate": 3.962497512095412e-05, + "logits/chosen": -2.0754759311676025, + "logits/rejected": -2.033245801925659, + "logps/chosen": -209.28817749023438, + "logps/rejected": -203.22897338867188, + "loss": 0.8654, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.20608650147914886, + "rewards/margins": -0.2308453768491745, + "rewards/rejected": 0.02475885860621929, + "step": 760 + }, + { + "epoch": 1.0, + "learning_rate": 3.95959005930262e-05, + "logits/chosen": -1.9952623844146729, + "logits/rejected": -1.9730302095413208, + "logps/chosen": -192.40951538085938, + "logps/rejected": -196.0675506591797, + "loss": 0.7612, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.10101304203271866, + "rewards/margins": -0.0519864559173584, + "rewards/rejected": -0.04902658611536026, + "step": 761 + }, + { + "epoch": 1.0, + "learning_rate": 3.9566796083866756e-05, + "logits/chosen": -1.8687723875045776, + "logits/rejected": -1.8516573905944824, + "logps/chosen": -163.812744140625, + "logps/rejected": -167.4736328125, + "loss": 0.6654, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.08760890364646912, + "rewards/margins": 0.10946011543273926, + "rewards/rejected": -0.02185121551156044, + "step": 762 + }, + { + "epoch": 1.0, + "learning_rate": 3.953766165325892e-05, + "logits/chosen": -1.9016671180725098, + "logits/rejected": -1.8528916835784912, + "logps/chosen": -122.68069458007812, + "logps/rejected": -166.39230346679688, + "loss": 0.8049, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.1457635462284088, + "rewards/margins": -0.134870246052742, + "rewards/rejected": -0.010893300175666809, + "step": 763 + }, + { + "epoch": 1.0, + "learning_rate": 3.9508497361047334e-05, + "logits/chosen": -2.126763105392456, + "logits/rejected": -2.0734221935272217, + "logps/chosen": -153.5342254638672, + "logps/rejected": -124.99877166748047, + "loss": 0.7467, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09060416370630264, + "rewards/margins": -0.037557389587163925, + "rewards/rejected": -0.05304676294326782, + "step": 764 + }, + { + "epoch": 1.0, + "learning_rate": 3.9479303267137944e-05, + "logits/chosen": -2.080320358276367, + "logits/rejected": -2.0969486236572266, + "logps/chosen": -181.7772216796875, + "logps/rejected": -209.83665466308594, + "loss": 0.1747, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.5646958351135254, + "rewards/margins": 3.972540855407715, + "rewards/rejected": -2.4078450202941895, + "step": 765 + }, + { + "epoch": 1.0, + "learning_rate": 3.9450079431497936e-05, + "logits/chosen": -2.0887792110443115, + "logits/rejected": -2.027618169784546, + "logps/chosen": -161.34857177734375, + "logps/rejected": -186.8290252685547, + "loss": 0.0359, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.125178098678589, + "rewards/margins": 5.42678165435791, + "rewards/rejected": -3.301603317260742, + "step": 766 + }, + { + "epoch": 1.0, + "learning_rate": 3.9420825914155554e-05, + "logits/chosen": -2.104077100753784, + "logits/rejected": -2.1051535606384277, + "logps/chosen": -139.18788146972656, + "logps/rejected": -189.3860321044922, + "loss": 0.124, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.9335505962371826, + "rewards/margins": 4.618389129638672, + "rewards/rejected": -2.6848387718200684, + "step": 767 + }, + { + "epoch": 1.01, + "learning_rate": 3.939154277520006e-05, + "logits/chosen": -2.1465070247650146, + "logits/rejected": -2.0535991191864014, + "logps/chosen": -140.7772979736328, + "logps/rejected": -178.20533752441406, + "loss": 0.0353, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0950372219085693, + "rewards/margins": 4.902841091156006, + "rewards/rejected": -2.8078043460845947, + "step": 768 + }, + { + "epoch": 1.01, + "learning_rate": 3.9362230074781506e-05, + "logits/chosen": -2.032116413116455, + "logits/rejected": -2.0662102699279785, + "logps/chosen": -163.39682006835938, + "logps/rejected": -208.9725341796875, + "loss": 0.1048, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.8455992937088013, + "rewards/margins": 4.063425064086914, + "rewards/rejected": -2.2178261280059814, + "step": 769 + }, + { + "epoch": 1.01, + "learning_rate": 3.9332887873110695e-05, + "logits/chosen": -1.9861818552017212, + "logits/rejected": -2.0381555557250977, + "logps/chosen": -171.25347900390625, + "logps/rejected": -219.2564239501953, + "loss": 0.0385, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1809895038604736, + "rewards/margins": 6.388942718505859, + "rewards/rejected": -4.207952976226807, + "step": 770 + }, + { + "epoch": 1.01, + "learning_rate": 3.9303516230459035e-05, + "logits/chosen": -2.1124377250671387, + "logits/rejected": -2.0706801414489746, + "logps/chosen": -129.31185913085938, + "logps/rejected": -167.8997039794922, + "loss": 0.137, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.6835626363754272, + "rewards/margins": 4.556663990020752, + "rewards/rejected": -2.8731014728546143, + "step": 771 + }, + { + "epoch": 1.01, + "learning_rate": 3.92741152071584e-05, + "logits/chosen": -1.8337211608886719, + "logits/rejected": -1.8092454671859741, + "logps/chosen": -136.34719848632812, + "logps/rejected": -187.0718994140625, + "loss": 0.1101, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5944790840148926, + "rewards/margins": 4.034244537353516, + "rewards/rejected": -2.439765214920044, + "step": 772 + }, + { + "epoch": 1.01, + "learning_rate": 3.924468486360101e-05, + "logits/chosen": -1.8283705711364746, + "logits/rejected": -1.7894047498703003, + "logps/chosen": -245.77256774902344, + "logps/rejected": -281.06781005859375, + "loss": 0.1771, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.18741774559021, + "rewards/margins": 3.2372922897338867, + "rewards/rejected": -2.0498743057250977, + "step": 773 + }, + { + "epoch": 1.01, + "learning_rate": 3.921522526023931e-05, + "logits/chosen": -1.9555280208587646, + "logits/rejected": -2.059631586074829, + "logps/chosen": -144.19586181640625, + "logps/rejected": -203.6154327392578, + "loss": 0.0749, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2664604187011719, + "rewards/margins": 3.9877490997314453, + "rewards/rejected": -2.7212884426116943, + "step": 774 + }, + { + "epoch": 1.01, + "learning_rate": 3.918573645758586e-05, + "logits/chosen": -2.0166711807250977, + "logits/rejected": -2.075800657272339, + "logps/chosen": -155.22332763671875, + "logps/rejected": -199.52328491210938, + "loss": 0.1121, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.6841697692871094, + "rewards/margins": 4.346258640289307, + "rewards/rejected": -2.6620891094207764, + "step": 775 + }, + { + "epoch": 1.02, + "learning_rate": 3.915621851621318e-05, + "logits/chosen": -1.8551626205444336, + "logits/rejected": -1.8971370458602905, + "logps/chosen": -144.870849609375, + "logps/rejected": -215.3363037109375, + "loss": 0.0826, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.5322911739349365, + "rewards/margins": 4.17916202545166, + "rewards/rejected": -2.6468706130981445, + "step": 776 + }, + { + "epoch": 1.02, + "learning_rate": 3.9126671496753666e-05, + "logits/chosen": -1.8935290575027466, + "logits/rejected": -1.8119558095932007, + "logps/chosen": -175.02459716796875, + "logps/rejected": -182.73126220703125, + "loss": 0.1091, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.9930130243301392, + "rewards/margins": 4.418476581573486, + "rewards/rejected": -2.4254636764526367, + "step": 777 + }, + { + "epoch": 1.02, + "learning_rate": 3.909709545989942e-05, + "logits/chosen": -1.8846909999847412, + "logits/rejected": -1.8772252798080444, + "logps/chosen": -123.37010192871094, + "logps/rejected": -165.76968383789062, + "loss": 0.0891, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5085937976837158, + "rewards/margins": 4.403595924377441, + "rewards/rejected": -2.8950023651123047, + "step": 778 + }, + { + "epoch": 1.02, + "learning_rate": 3.9067490466402156e-05, + "logits/chosen": -1.924761414527893, + "logits/rejected": -1.9515211582183838, + "logps/chosen": -119.23345947265625, + "logps/rejected": -164.5015869140625, + "loss": 0.178, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.1522871255874634, + "rewards/margins": 3.376887083053589, + "rewards/rejected": -2.224600076675415, + "step": 779 + }, + { + "epoch": 1.02, + "learning_rate": 3.903785657707307e-05, + "logits/chosen": -1.7551360130310059, + "logits/rejected": -1.8034405708312988, + "logps/chosen": -132.70225524902344, + "logps/rejected": -188.23854064941406, + "loss": 0.1239, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.4596197605133057, + "rewards/margins": 5.111968994140625, + "rewards/rejected": -3.6523497104644775, + "step": 780 + }, + { + "epoch": 1.02, + "learning_rate": 3.9008193852782733e-05, + "logits/chosen": -1.9345866441726685, + "logits/rejected": -2.0259900093078613, + "logps/chosen": -147.717529296875, + "logps/rejected": -198.76608276367188, + "loss": 0.1756, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.160006046295166, + "rewards/margins": 3.809837579727173, + "rewards/rejected": -2.649831533432007, + "step": 781 + }, + { + "epoch": 1.02, + "learning_rate": 3.897850235446089e-05, + "logits/chosen": -1.8931665420532227, + "logits/rejected": -1.8667347431182861, + "logps/chosen": -144.32681274414062, + "logps/rejected": -176.2711181640625, + "loss": 0.0913, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.2157329320907593, + "rewards/margins": 4.00248908996582, + "rewards/rejected": -2.7867562770843506, + "step": 782 + }, + { + "epoch": 1.02, + "learning_rate": 3.894878214309645e-05, + "logits/chosen": -1.8950620889663696, + "logits/rejected": -1.7776212692260742, + "logps/chosen": -153.48272705078125, + "logps/rejected": -185.09901428222656, + "loss": 0.0948, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7234764099121094, + "rewards/margins": 4.474335193634033, + "rewards/rejected": -2.7508585453033447, + "step": 783 + }, + { + "epoch": 1.03, + "learning_rate": 3.8919033279737274e-05, + "logits/chosen": -1.9167214632034302, + "logits/rejected": -1.955217719078064, + "logps/chosen": -149.79092407226562, + "logps/rejected": -209.3562469482422, + "loss": 0.1909, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.6991943717002869, + "rewards/margins": 3.995697021484375, + "rewards/rejected": -3.2965028285980225, + "step": 784 + }, + { + "epoch": 1.03, + "learning_rate": 3.888925582549006e-05, + "logits/chosen": -1.7585481405258179, + "logits/rejected": -1.7582013607025146, + "logps/chosen": -161.952392578125, + "logps/rejected": -223.1368408203125, + "loss": 0.0444, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2704546451568604, + "rewards/margins": 5.138402462005615, + "rewards/rejected": -3.867948055267334, + "step": 785 + }, + { + "epoch": 1.03, + "learning_rate": 3.885944984152027e-05, + "logits/chosen": -1.7177354097366333, + "logits/rejected": -1.7072687149047852, + "logps/chosen": -141.16017150878906, + "logps/rejected": -197.3968963623047, + "loss": 0.0631, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3352869749069214, + "rewards/margins": 4.8079514503479, + "rewards/rejected": -3.4726650714874268, + "step": 786 + }, + { + "epoch": 1.03, + "learning_rate": 3.882961538905194e-05, + "logits/chosen": -1.9421478509902954, + "logits/rejected": -1.9236348867416382, + "logps/chosen": -156.8341827392578, + "logps/rejected": -201.7722625732422, + "loss": 0.0734, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3398832082748413, + "rewards/margins": 4.717867851257324, + "rewards/rejected": -3.3779852390289307, + "step": 787 + }, + { + "epoch": 1.03, + "learning_rate": 3.879975252936761e-05, + "logits/chosen": -1.918500542640686, + "logits/rejected": -1.9371358156204224, + "logps/chosen": -200.98944091796875, + "logps/rejected": -276.0814514160156, + "loss": 0.0714, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.0912672281265259, + "rewards/margins": 5.926272392272949, + "rewards/rejected": -4.835005283355713, + "step": 788 + }, + { + "epoch": 1.03, + "learning_rate": 3.876986132380814e-05, + "logits/chosen": -1.894209623336792, + "logits/rejected": -1.9814263582229614, + "logps/chosen": -127.55168151855469, + "logps/rejected": -186.98684692382812, + "loss": 0.0765, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.0201938152313232, + "rewards/margins": 4.509597301483154, + "rewards/rejected": -3.489403247833252, + "step": 789 + }, + { + "epoch": 1.03, + "learning_rate": 3.8739941833772643e-05, + "logits/chosen": -1.743139386177063, + "logits/rejected": -1.7789626121520996, + "logps/chosen": -135.5447235107422, + "logps/rejected": -201.73886108398438, + "loss": 0.1591, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7540079951286316, + "rewards/margins": 5.139564514160156, + "rewards/rejected": -4.385556697845459, + "step": 790 + }, + { + "epoch": 1.04, + "learning_rate": 3.870999412071829e-05, + "logits/chosen": -1.9243521690368652, + "logits/rejected": -1.907222867012024, + "logps/chosen": -132.49636840820312, + "logps/rejected": -192.06646728515625, + "loss": 0.1116, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.1499325037002563, + "rewards/margins": 5.0751800537109375, + "rewards/rejected": -3.92524790763855, + "step": 791 + }, + { + "epoch": 1.04, + "learning_rate": 3.8680018246160295e-05, + "logits/chosen": -1.6514546871185303, + "logits/rejected": -1.6387147903442383, + "logps/chosen": -132.7026824951172, + "logps/rejected": -164.41177368164062, + "loss": 0.112, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4314553737640381, + "rewards/margins": 3.427611827850342, + "rewards/rejected": -2.9961564540863037, + "step": 792 + }, + { + "epoch": 1.04, + "learning_rate": 3.865001427167164e-05, + "logits/chosen": -1.8426374197006226, + "logits/rejected": -1.8846511840820312, + "logps/chosen": -136.38165283203125, + "logps/rejected": -199.7681121826172, + "loss": 0.0956, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.0450799465179443, + "rewards/margins": 5.463000297546387, + "rewards/rejected": -4.4179205894470215, + "step": 793 + }, + { + "epoch": 1.04, + "learning_rate": 3.861998225888307e-05, + "logits/chosen": -1.8042436838150024, + "logits/rejected": -1.8148198127746582, + "logps/chosen": -142.37742614746094, + "logps/rejected": -197.7374267578125, + "loss": 0.0785, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.9900847673416138, + "rewards/margins": 5.896683692932129, + "rewards/rejected": -4.906599044799805, + "step": 794 + }, + { + "epoch": 1.04, + "learning_rate": 3.8589922269482924e-05, + "logits/chosen": -1.9162298440933228, + "logits/rejected": -1.9350849390029907, + "logps/chosen": -147.20562744140625, + "logps/rejected": -187.5604705810547, + "loss": 0.0646, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.3646960258483887, + "rewards/margins": 4.906591415405273, + "rewards/rejected": -3.5418953895568848, + "step": 795 + }, + { + "epoch": 1.04, + "learning_rate": 3.855983436521699e-05, + "logits/chosen": -1.5755800008773804, + "logits/rejected": -1.5610840320587158, + "logps/chosen": -128.9073028564453, + "logps/rejected": -180.30923461914062, + "loss": 0.1132, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7019076943397522, + "rewards/margins": 4.735352039337158, + "rewards/rejected": -4.033443927764893, + "step": 796 + }, + { + "epoch": 1.04, + "learning_rate": 3.8529718607888394e-05, + "logits/chosen": -2.063124656677246, + "logits/rejected": -2.062586545944214, + "logps/chosen": -145.98892211914062, + "logps/rejected": -188.89068603515625, + "loss": 0.1308, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6006640791893005, + "rewards/margins": 4.575399875640869, + "rewards/rejected": -3.974735975265503, + "step": 797 + }, + { + "epoch": 1.04, + "learning_rate": 3.8499575059357506e-05, + "logits/chosen": -2.017670154571533, + "logits/rejected": -1.987682819366455, + "logps/chosen": -162.8350830078125, + "logps/rejected": -233.1547393798828, + "loss": 0.0839, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6563400030136108, + "rewards/margins": 4.883859157562256, + "rewards/rejected": -4.227519512176514, + "step": 798 + }, + { + "epoch": 1.05, + "learning_rate": 3.8469403781541745e-05, + "logits/chosen": -1.8146461248397827, + "logits/rejected": -1.7456778287887573, + "logps/chosen": -230.2406005859375, + "logps/rejected": -258.3357849121094, + "loss": 0.129, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0335506796836853, + "rewards/margins": 4.429178714752197, + "rewards/rejected": -4.395627975463867, + "step": 799 + }, + { + "epoch": 1.05, + "learning_rate": 3.843920483641551e-05, + "logits/chosen": -1.9329719543457031, + "logits/rejected": -1.9156296253204346, + "logps/chosen": -136.6041717529297, + "logps/rejected": -176.88087463378906, + "loss": 0.0892, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.7322125434875488, + "rewards/margins": 5.226524353027344, + "rewards/rejected": -4.494311332702637, + "step": 800 + }, + { + "epoch": 1.05, + "learning_rate": 3.840897828601002e-05, + "logits/chosen": -1.6023508310317993, + "logits/rejected": -1.6873928308486938, + "logps/chosen": -116.11083984375, + "logps/rejected": -189.20831298828125, + "loss": 0.0576, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5114001035690308, + "rewards/margins": 5.1929192543029785, + "rewards/rejected": -4.6815185546875, + "step": 801 + }, + { + "epoch": 1.05, + "learning_rate": 3.83787241924132e-05, + "logits/chosen": -1.798394799232483, + "logits/rejected": -1.797690987586975, + "logps/chosen": -149.56710815429688, + "logps/rejected": -228.49143981933594, + "loss": 0.0177, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9114323854446411, + "rewards/margins": 6.009431838989258, + "rewards/rejected": -5.097999572753906, + "step": 802 + }, + { + "epoch": 1.05, + "learning_rate": 3.8348442617769564e-05, + "logits/chosen": -1.9224333763122559, + "logits/rejected": -1.9393596649169922, + "logps/chosen": -152.3304443359375, + "logps/rejected": -205.50289916992188, + "loss": 0.0952, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3278241455554962, + "rewards/margins": 5.112401485443115, + "rewards/rejected": -4.784577369689941, + "step": 803 + }, + { + "epoch": 1.05, + "learning_rate": 3.831813362428005e-05, + "logits/chosen": -1.937591552734375, + "logits/rejected": -1.858033299446106, + "logps/chosen": -138.99546813964844, + "logps/rejected": -170.10922241210938, + "loss": 0.1423, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07943601906299591, + "rewards/margins": 3.7533488273620605, + "rewards/rejected": -3.673912763595581, + "step": 804 + }, + { + "epoch": 1.05, + "learning_rate": 3.8287797274201934e-05, + "logits/chosen": -1.9712400436401367, + "logits/rejected": -2.0466256141662598, + "logps/chosen": -146.24937438964844, + "logps/rejected": -204.24668884277344, + "loss": 0.0868, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24946361780166626, + "rewards/margins": 4.4833879470825195, + "rewards/rejected": -4.23392391204834, + "step": 805 + }, + { + "epoch": 1.05, + "learning_rate": 3.825743362984868e-05, + "logits/chosen": -1.6642037630081177, + "logits/rejected": -1.7018256187438965, + "logps/chosen": -148.9640655517578, + "logps/rejected": -202.5922088623047, + "loss": 0.1762, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22627446055412292, + "rewards/margins": 5.173376560211182, + "rewards/rejected": -4.9471025466918945, + "step": 806 + }, + { + "epoch": 1.06, + "learning_rate": 3.8227042753589824e-05, + "logits/chosen": -1.74186372756958, + "logits/rejected": -1.7566051483154297, + "logps/chosen": -140.58749389648438, + "logps/rejected": -201.23391723632812, + "loss": 0.0873, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.41862282156944275, + "rewards/margins": 5.055604457855225, + "rewards/rejected": -4.636981010437012, + "step": 807 + }, + { + "epoch": 1.06, + "learning_rate": 3.819662470785082e-05, + "logits/chosen": -1.9721858501434326, + "logits/rejected": -1.9488880634307861, + "logps/chosen": -197.06591796875, + "logps/rejected": -247.65264892578125, + "loss": 0.102, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2280472218990326, + "rewards/margins": 6.295541763305664, + "rewards/rejected": -6.067493915557861, + "step": 808 + }, + { + "epoch": 1.06, + "learning_rate": 3.816617955511296e-05, + "logits/chosen": -1.9263876676559448, + "logits/rejected": -2.024251937866211, + "logps/chosen": -144.51605224609375, + "logps/rejected": -229.19000244140625, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0787353515625, + "rewards/margins": 6.103402137756348, + "rewards/rejected": -6.1821370124816895, + "step": 809 + }, + { + "epoch": 1.06, + "learning_rate": 3.8135707357913176e-05, + "logits/chosen": -1.9594213962554932, + "logits/rejected": -1.9519600868225098, + "logps/chosen": -163.80809020996094, + "logps/rejected": -218.344970703125, + "loss": 0.0908, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.4812394976615906, + "rewards/margins": 6.453742027282715, + "rewards/rejected": -5.9725022315979, + "step": 810 + }, + { + "epoch": 1.06, + "learning_rate": 3.8105208178843984e-05, + "logits/chosen": -1.9472453594207764, + "logits/rejected": -1.9247050285339355, + "logps/chosen": -176.8766326904297, + "logps/rejected": -215.28683471679688, + "loss": 0.0745, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.35657447576522827, + "rewards/margins": 5.61616849899292, + "rewards/rejected": -5.972743034362793, + "step": 811 + }, + { + "epoch": 1.06, + "learning_rate": 3.8074682080553335e-05, + "logits/chosen": -2.037539005279541, + "logits/rejected": -2.0407166481018066, + "logps/chosen": -158.74447631835938, + "logps/rejected": -200.7305145263672, + "loss": 0.1276, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5493102073669434, + "rewards/margins": 4.338137149810791, + "rewards/rejected": -4.887447357177734, + "step": 812 + }, + { + "epoch": 1.06, + "learning_rate": 3.804412912574442e-05, + "logits/chosen": -1.5923181772232056, + "logits/rejected": -1.6689324378967285, + "logps/chosen": -123.06846618652344, + "logps/rejected": -194.64022827148438, + "loss": 0.1177, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.14290863275527954, + "rewards/margins": 6.073092460632324, + "rewards/rejected": -5.930182456970215, + "step": 813 + }, + { + "epoch": 1.07, + "learning_rate": 3.801354937717565e-05, + "logits/chosen": -1.763596773147583, + "logits/rejected": -1.761218547821045, + "logps/chosen": -152.07403564453125, + "logps/rejected": -195.23391723632812, + "loss": 0.0384, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.022281453013420105, + "rewards/margins": 5.0504584312438965, + "rewards/rejected": -5.028176784515381, + "step": 814 + }, + { + "epoch": 1.07, + "learning_rate": 3.798294289766043e-05, + "logits/chosen": -1.8372154235839844, + "logits/rejected": -1.877131700515747, + "logps/chosen": -153.13719177246094, + "logps/rejected": -214.76300048828125, + "loss": 0.164, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.30196332931518555, + "rewards/margins": 5.951177597045898, + "rewards/rejected": -6.253140449523926, + "step": 815 + }, + { + "epoch": 1.07, + "learning_rate": 3.795230975006712e-05, + "logits/chosen": -2.02732253074646, + "logits/rejected": -2.0462913513183594, + "logps/chosen": -149.4512176513672, + "logps/rejected": -206.4884490966797, + "loss": 0.0554, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3465498387813568, + "rewards/margins": 5.401089668273926, + "rewards/rejected": -5.7476396560668945, + "step": 816 + }, + { + "epoch": 1.07, + "learning_rate": 3.792164999731881e-05, + "logits/chosen": -1.9269187450408936, + "logits/rejected": -1.9644137620925903, + "logps/chosen": -130.3570556640625, + "logps/rejected": -184.312255859375, + "loss": 0.1769, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5219983458518982, + "rewards/margins": 4.185467720031738, + "rewards/rejected": -4.707466125488281, + "step": 817 + }, + { + "epoch": 1.07, + "learning_rate": 3.789096370239328e-05, + "logits/chosen": -1.7042335271835327, + "logits/rejected": -1.730513334274292, + "logps/chosen": -195.3024139404297, + "logps/rejected": -241.0108184814453, + "loss": 0.0231, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1243710145354271, + "rewards/margins": 6.514937400817871, + "rewards/rejected": -6.390565872192383, + "step": 818 + }, + { + "epoch": 1.07, + "learning_rate": 3.786025092832279e-05, + "logits/chosen": -1.6639565229415894, + "logits/rejected": -1.5779391527175903, + "logps/chosen": -174.08763122558594, + "logps/rejected": -248.2294464111328, + "loss": 0.0325, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5761074423789978, + "rewards/margins": 7.08044958114624, + "rewards/rejected": -7.656557083129883, + "step": 819 + }, + { + "epoch": 1.07, + "learning_rate": 3.782951173819403e-05, + "logits/chosen": -1.858964204788208, + "logits/rejected": -1.8241450786590576, + "logps/chosen": -175.27001953125, + "logps/rejected": -236.11073303222656, + "loss": 0.0821, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.46175456047058105, + "rewards/margins": 6.477173805236816, + "rewards/rejected": -6.938928604125977, + "step": 820 + }, + { + "epoch": 1.07, + "learning_rate": 3.7798746195147914e-05, + "logits/chosen": -1.890723466873169, + "logits/rejected": -1.931648850440979, + "logps/chosen": -148.67684936523438, + "logps/rejected": -197.44638061523438, + "loss": 0.0738, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4770905375480652, + "rewards/margins": 4.815718173980713, + "rewards/rejected": -5.292808532714844, + "step": 821 + }, + { + "epoch": 1.08, + "learning_rate": 3.776795436237954e-05, + "logits/chosen": -1.5346938371658325, + "logits/rejected": -1.5795890092849731, + "logps/chosen": -134.60568237304688, + "logps/rejected": -227.31204223632812, + "loss": 0.0891, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7744713425636292, + "rewards/margins": 8.283700942993164, + "rewards/rejected": -7.50922966003418, + "step": 822 + }, + { + "epoch": 1.08, + "learning_rate": 3.773713630313793e-05, + "logits/chosen": -1.794039011001587, + "logits/rejected": -1.7841002941131592, + "logps/chosen": -177.0500946044922, + "logps/rejected": -220.89031982421875, + "loss": 0.1019, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4332965612411499, + "rewards/margins": 7.096560001373291, + "rewards/rejected": -7.5298566818237305, + "step": 823 + }, + { + "epoch": 1.08, + "learning_rate": 3.7706292080726055e-05, + "logits/chosen": -1.6856433153152466, + "logits/rejected": -1.6186890602111816, + "logps/chosen": -180.94700622558594, + "logps/rejected": -231.9288330078125, + "loss": 0.0917, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3657735586166382, + "rewards/margins": 7.685838222503662, + "rewards/rejected": -8.051612854003906, + "step": 824 + }, + { + "epoch": 1.08, + "learning_rate": 3.767542175850058e-05, + "logits/chosen": -1.6293786764144897, + "logits/rejected": -1.6187849044799805, + "logps/chosen": -164.32154846191406, + "logps/rejected": -270.12139892578125, + "loss": 0.0215, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7973673343658447, + "rewards/margins": 8.690082550048828, + "rewards/rejected": -9.487449645996094, + "step": 825 + }, + { + "epoch": 1.08, + "learning_rate": 3.764452539987179e-05, + "logits/chosen": -1.6801162958145142, + "logits/rejected": -1.7072298526763916, + "logps/chosen": -164.7991943359375, + "logps/rejected": -236.45445251464844, + "loss": 0.0628, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.21068036556243896, + "rewards/margins": 6.561530113220215, + "rewards/rejected": -6.772209644317627, + "step": 826 + }, + { + "epoch": 1.08, + "learning_rate": 3.761360306830345e-05, + "logits/chosen": -1.649834394454956, + "logits/rejected": -1.677789568901062, + "logps/chosen": -161.15872192382812, + "logps/rejected": -217.13719177246094, + "loss": 0.1428, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9511618614196777, + "rewards/margins": 5.468101978302002, + "rewards/rejected": -6.4192633628845215, + "step": 827 + }, + { + "epoch": 1.08, + "learning_rate": 3.75826548273127e-05, + "logits/chosen": -1.6571552753448486, + "logits/rejected": -1.7653439044952393, + "logps/chosen": -215.34512329101562, + "logps/rejected": -283.7718811035156, + "loss": 0.1087, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.772979199886322, + "rewards/margins": 5.931948184967041, + "rewards/rejected": -6.704927921295166, + "step": 828 + }, + { + "epoch": 1.08, + "learning_rate": 3.7551680740469874e-05, + "logits/chosen": -1.8746492862701416, + "logits/rejected": -1.846010684967041, + "logps/chosen": -194.9639129638672, + "logps/rejected": -278.802001953125, + "loss": 0.0818, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6157702207565308, + "rewards/margins": 6.646345615386963, + "rewards/rejected": -7.262115955352783, + "step": 829 + }, + { + "epoch": 1.09, + "learning_rate": 3.752068087139839e-05, + "logits/chosen": -1.7149810791015625, + "logits/rejected": -1.6446678638458252, + "logps/chosen": -186.3773956298828, + "logps/rejected": -245.27151489257812, + "loss": 0.1825, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0201466083526611, + "rewards/margins": 7.199706554412842, + "rewards/rejected": -8.219853401184082, + "step": 830 + }, + { + "epoch": 1.09, + "learning_rate": 3.7489655283774657e-05, + "logits/chosen": -1.9555699825286865, + "logits/rejected": -2.0322654247283936, + "logps/chosen": -166.51882934570312, + "logps/rejected": -217.6027069091797, + "loss": 0.0656, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.890149712562561, + "rewards/margins": 6.400627136230469, + "rewards/rejected": -7.29077672958374, + "step": 831 + }, + { + "epoch": 1.09, + "learning_rate": 3.7458604041327874e-05, + "logits/chosen": -1.8436245918273926, + "logits/rejected": -1.7562732696533203, + "logps/chosen": -172.03787231445312, + "logps/rejected": -267.79901123046875, + "loss": 0.1362, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0612387657165527, + "rewards/margins": 5.854422092437744, + "rewards/rejected": -6.915660858154297, + "step": 832 + }, + { + "epoch": 1.09, + "learning_rate": 3.742752720783997e-05, + "logits/chosen": -1.9384236335754395, + "logits/rejected": -1.94955575466156, + "logps/chosen": -142.2777099609375, + "logps/rejected": -220.53286743164062, + "loss": 0.0745, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6656227707862854, + "rewards/margins": 6.518516540527344, + "rewards/rejected": -7.184139728546143, + "step": 833 + }, + { + "epoch": 1.09, + "learning_rate": 3.7396424847145425e-05, + "logits/chosen": -1.6971423625946045, + "logits/rejected": -1.618373155593872, + "logps/chosen": -181.0162811279297, + "logps/rejected": -220.42465209960938, + "loss": 0.1364, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1203206777572632, + "rewards/margins": 5.729173183441162, + "rewards/rejected": -6.849493980407715, + "step": 834 + }, + { + "epoch": 1.09, + "learning_rate": 3.736529702313114e-05, + "logits/chosen": -2.0127007961273193, + "logits/rejected": -1.9761312007904053, + "logps/chosen": -186.38592529296875, + "logps/rejected": -266.2695617675781, + "loss": 0.0531, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.07478967308998108, + "rewards/margins": 8.762662887573242, + "rewards/rejected": -8.687873840332031, + "step": 835 + }, + { + "epoch": 1.09, + "learning_rate": 3.733414379973635e-05, + "logits/chosen": -1.7820303440093994, + "logits/rejected": -1.7872676849365234, + "logps/chosen": -181.1324005126953, + "logps/rejected": -247.3558349609375, + "loss": 0.1107, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1160593032836914, + "rewards/margins": 6.529733180999756, + "rewards/rejected": -7.6457929611206055, + "step": 836 + }, + { + "epoch": 1.1, + "learning_rate": 3.730296524095245e-05, + "logits/chosen": -1.7832846641540527, + "logits/rejected": -1.7420811653137207, + "logps/chosen": -191.2445831298828, + "logps/rejected": -225.04779052734375, + "loss": 0.1667, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5104981660842896, + "rewards/margins": 4.743764400482178, + "rewards/rejected": -6.2542619705200195, + "step": 837 + }, + { + "epoch": 1.1, + "learning_rate": 3.7271761410822856e-05, + "logits/chosen": -1.8028768301010132, + "logits/rejected": -1.8446567058563232, + "logps/chosen": -161.13967895507812, + "logps/rejected": -220.7889404296875, + "loss": 0.0945, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.173981711268425, + "rewards/margins": 6.750193119049072, + "rewards/rejected": -6.924174785614014, + "step": 838 + }, + { + "epoch": 1.1, + "learning_rate": 3.724053237344294e-05, + "logits/chosen": -1.8580052852630615, + "logits/rejected": -1.9401862621307373, + "logps/chosen": -164.49525451660156, + "logps/rejected": -236.0361328125, + "loss": 0.0573, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2278103828430176, + "rewards/margins": 6.170598030090332, + "rewards/rejected": -7.398408889770508, + "step": 839 + }, + { + "epoch": 1.1, + "learning_rate": 3.720927819295979e-05, + "logits/chosen": -1.7781224250793457, + "logits/rejected": -1.7530982494354248, + "logps/chosen": -175.5201873779297, + "logps/rejected": -205.05555725097656, + "loss": 0.0918, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.761866569519043, + "rewards/margins": 5.51897668838501, + "rewards/rejected": -6.2808427810668945, + "step": 840 + }, + { + "epoch": 1.1, + "learning_rate": 3.7177998933572186e-05, + "logits/chosen": -1.849440574645996, + "logits/rejected": -1.8066624402999878, + "logps/chosen": -162.9970245361328, + "logps/rejected": -206.83587646484375, + "loss": 0.182, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9978735446929932, + "rewards/margins": 4.817579746246338, + "rewards/rejected": -5.815453052520752, + "step": 841 + }, + { + "epoch": 1.1, + "learning_rate": 3.7146694659530425e-05, + "logits/chosen": -1.958741307258606, + "logits/rejected": -1.9996490478515625, + "logps/chosen": -176.97909545898438, + "logps/rejected": -224.2952880859375, + "loss": 0.1747, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3607791066169739, + "rewards/margins": 6.919254779815674, + "rewards/rejected": -7.28003454208374, + "step": 842 + }, + { + "epoch": 1.1, + "learning_rate": 3.711536543513614e-05, + "logits/chosen": -1.615922451019287, + "logits/rejected": -1.65949285030365, + "logps/chosen": -136.40432739257812, + "logps/rejected": -226.84005737304688, + "loss": 0.1267, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20042994618415833, + "rewards/margins": 6.525726318359375, + "rewards/rejected": -6.726156234741211, + "step": 843 + }, + { + "epoch": 1.1, + "learning_rate": 3.708401132474228e-05, + "logits/chosen": -1.7548750638961792, + "logits/rejected": -1.8182183504104614, + "logps/chosen": -164.22671508789062, + "logps/rejected": -234.67388916015625, + "loss": 0.1388, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.38912245631217957, + "rewards/margins": 6.252824783325195, + "rewards/rejected": -6.641947269439697, + "step": 844 + }, + { + "epoch": 1.11, + "learning_rate": 3.705263239275284e-05, + "logits/chosen": -1.981087565422058, + "logits/rejected": -1.9561117887496948, + "logps/chosen": -133.49888610839844, + "logps/rejected": -170.16517639160156, + "loss": 0.2347, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5047491788864136, + "rewards/margins": 4.245537757873535, + "rewards/rejected": -4.7502875328063965, + "step": 845 + }, + { + "epoch": 1.11, + "learning_rate": 3.702122870362286e-05, + "logits/chosen": -1.9115400314331055, + "logits/rejected": -1.9423316717147827, + "logps/chosen": -148.95416259765625, + "logps/rejected": -224.52301025390625, + "loss": 0.0484, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36701521277427673, + "rewards/margins": 6.9237189292907715, + "rewards/rejected": -7.29073429107666, + "step": 846 + }, + { + "epoch": 1.11, + "learning_rate": 3.698980032185821e-05, + "logits/chosen": -1.9395225048065186, + "logits/rejected": -1.9611619710922241, + "logps/chosen": -142.73492431640625, + "logps/rejected": -209.59840393066406, + "loss": 0.0262, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4082959294319153, + "rewards/margins": 5.452874183654785, + "rewards/rejected": -5.861170768737793, + "step": 847 + }, + { + "epoch": 1.11, + "learning_rate": 3.695834731201548e-05, + "logits/chosen": -1.6748156547546387, + "logits/rejected": -1.6887158155441284, + "logps/chosen": -128.32301330566406, + "logps/rejected": -196.24961853027344, + "loss": 0.12, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0047543048858643, + "rewards/margins": 5.711575508117676, + "rewards/rejected": -6.716329574584961, + "step": 848 + }, + { + "epoch": 1.11, + "learning_rate": 3.692686973870184e-05, + "logits/chosen": -1.4874969720840454, + "logits/rejected": -1.5210838317871094, + "logps/chosen": -245.0333709716797, + "logps/rejected": -269.8932800292969, + "loss": 0.1697, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7569795846939087, + "rewards/margins": 5.025569438934326, + "rewards/rejected": -6.782548904418945, + "step": 849 + }, + { + "epoch": 1.11, + "learning_rate": 3.689536766657494e-05, + "logits/chosen": -1.6894526481628418, + "logits/rejected": -1.7335946559906006, + "logps/chosen": -187.63340759277344, + "logps/rejected": -261.6910400390625, + "loss": 0.1432, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8881826400756836, + "rewards/margins": 4.793706893920898, + "rewards/rejected": -6.681889533996582, + "step": 850 + }, + { + "epoch": 1.11, + "learning_rate": 3.6863841160342723e-05, + "logits/chosen": -1.9806772470474243, + "logits/rejected": -1.9756790399551392, + "logps/chosen": -142.82334899902344, + "logps/rejected": -207.6903533935547, + "loss": 0.1081, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2324090003967285, + "rewards/margins": 5.244317054748535, + "rewards/rejected": -6.4767255783081055, + "step": 851 + }, + { + "epoch": 1.12, + "learning_rate": 3.683229028476334e-05, + "logits/chosen": -1.581850528717041, + "logits/rejected": -1.5908780097961426, + "logps/chosen": -155.81094360351562, + "logps/rejected": -206.24839782714844, + "loss": 0.0606, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8023391366004944, + "rewards/margins": 5.794926166534424, + "rewards/rejected": -6.597265720367432, + "step": 852 + }, + { + "epoch": 1.12, + "learning_rate": 3.6800715104645e-05, + "logits/chosen": -1.8793675899505615, + "logits/rejected": -1.871899127960205, + "logps/chosen": -151.31143188476562, + "logps/rejected": -213.91415405273438, + "loss": 0.0934, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.20195460319519043, + "rewards/margins": 5.953958511352539, + "rewards/rejected": -6.15591287612915, + "step": 853 + }, + { + "epoch": 1.12, + "learning_rate": 3.676911568484583e-05, + "logits/chosen": -1.8364856243133545, + "logits/rejected": -1.8397459983825684, + "logps/chosen": -155.30813598632812, + "logps/rejected": -239.54324340820312, + "loss": 0.0756, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3419159650802612, + "rewards/margins": 7.008118629455566, + "rewards/rejected": -8.350035667419434, + "step": 854 + }, + { + "epoch": 1.12, + "learning_rate": 3.673749209027375e-05, + "logits/chosen": -1.7932429313659668, + "logits/rejected": -1.7990435361862183, + "logps/chosen": -137.10389709472656, + "logps/rejected": -207.8248291015625, + "loss": 0.1251, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2713301181793213, + "rewards/margins": 6.852174282073975, + "rewards/rejected": -7.123503684997559, + "step": 855 + }, + { + "epoch": 1.12, + "learning_rate": 3.6705844385886334e-05, + "logits/chosen": -1.965791940689087, + "logits/rejected": -1.9781205654144287, + "logps/chosen": -177.0115203857422, + "logps/rejected": -230.706298828125, + "loss": 0.0733, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.48921334743499756, + "rewards/margins": 5.834098815917969, + "rewards/rejected": -6.323312282562256, + "step": 856 + }, + { + "epoch": 1.12, + "learning_rate": 3.667417263669068e-05, + "logits/chosen": -2.0746617317199707, + "logits/rejected": -2.110121011734009, + "logps/chosen": -160.97117614746094, + "logps/rejected": -228.11575317382812, + "loss": 0.0222, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32855403423309326, + "rewards/margins": 6.803640365600586, + "rewards/rejected": -6.475086212158203, + "step": 857 + }, + { + "epoch": 1.12, + "learning_rate": 3.6642476907743276e-05, + "logits/chosen": -2.0400307178497314, + "logits/rejected": -2.042586326599121, + "logps/chosen": -151.84706115722656, + "logps/rejected": -205.36061096191406, + "loss": 0.096, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.581629753112793, + "rewards/margins": 4.815516948699951, + "rewards/rejected": -5.397147178649902, + "step": 858 + }, + { + "epoch": 1.12, + "learning_rate": 3.661075726414986e-05, + "logits/chosen": -1.9448158740997314, + "logits/rejected": -1.9009454250335693, + "logps/chosen": -154.41232299804688, + "logps/rejected": -230.76966857910156, + "loss": 0.0185, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33493664860725403, + "rewards/margins": 6.677095890045166, + "rewards/rejected": -7.012032508850098, + "step": 859 + }, + { + "epoch": 1.13, + "learning_rate": 3.6579013771065305e-05, + "logits/chosen": -1.6962409019470215, + "logits/rejected": -1.7169561386108398, + "logps/chosen": -130.30642700195312, + "logps/rejected": -224.05807495117188, + "loss": 0.0538, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.25574082136154175, + "rewards/margins": 6.733345985412598, + "rewards/rejected": -6.989086627960205, + "step": 860 + }, + { + "epoch": 1.13, + "learning_rate": 3.654724649369348e-05, + "logits/chosen": -1.8101693391799927, + "logits/rejected": -1.84493088722229, + "logps/chosen": -208.43240356445312, + "logps/rejected": -260.6061096191406, + "loss": 0.0705, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2112698256969452, + "rewards/margins": 6.099943161010742, + "rewards/rejected": -5.888673305511475, + "step": 861 + }, + { + "epoch": 1.13, + "learning_rate": 3.651545549728709e-05, + "logits/chosen": -1.7534453868865967, + "logits/rejected": -1.7846795320510864, + "logps/chosen": -189.62843322753906, + "logps/rejected": -241.3184814453125, + "loss": 0.1383, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5467860102653503, + "rewards/margins": 5.219336986541748, + "rewards/rejected": -5.766123294830322, + "step": 862 + }, + { + "epoch": 1.13, + "learning_rate": 3.6483640847147554e-05, + "logits/chosen": -1.8643182516098022, + "logits/rejected": -1.9524219036102295, + "logps/chosen": -177.50558471679688, + "logps/rejected": -267.6409606933594, + "loss": 0.0927, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.042034849524497986, + "rewards/margins": 8.53548812866211, + "rewards/rejected": -8.493453025817871, + "step": 863 + }, + { + "epoch": 1.13, + "learning_rate": 3.645180260862492e-05, + "logits/chosen": -1.7953535318374634, + "logits/rejected": -1.9094650745391846, + "logps/chosen": -147.2042694091797, + "logps/rejected": -208.8782196044922, + "loss": 0.1173, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6070430278778076, + "rewards/margins": 5.45149040222168, + "rewards/rejected": -6.058534145355225, + "step": 864 + }, + { + "epoch": 1.13, + "learning_rate": 3.6419940847117626e-05, + "logits/chosen": -1.8601136207580566, + "logits/rejected": -1.9172056913375854, + "logps/chosen": -153.92825317382812, + "logps/rejected": -239.37582397460938, + "loss": 0.0155, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5362662672996521, + "rewards/margins": 7.263176441192627, + "rewards/rejected": -6.726909637451172, + "step": 865 + }, + { + "epoch": 1.13, + "learning_rate": 3.638805562807249e-05, + "logits/chosen": -2.1471753120422363, + "logits/rejected": -2.1143898963928223, + "logps/chosen": -150.82498168945312, + "logps/rejected": -217.10682678222656, + "loss": 0.1042, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.35013625025749207, + "rewards/margins": 6.030050754547119, + "rewards/rejected": -6.380187034606934, + "step": 866 + }, + { + "epoch": 1.13, + "learning_rate": 3.635614701698448e-05, + "logits/chosen": -1.8338104486465454, + "logits/rejected": -1.765840768814087, + "logps/chosen": -166.82640075683594, + "logps/rejected": -216.974365234375, + "loss": 0.1033, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1417291164398193, + "rewards/margins": 4.327848434448242, + "rewards/rejected": -5.469577789306641, + "step": 867 + }, + { + "epoch": 1.14, + "learning_rate": 3.632421507939661e-05, + "logits/chosen": -2.05930757522583, + "logits/rejected": -2.066603899002075, + "logps/chosen": -162.9051055908203, + "logps/rejected": -217.9956817626953, + "loss": 0.0979, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6731106638908386, + "rewards/margins": 5.83856725692749, + "rewards/rejected": -6.511678218841553, + "step": 868 + }, + { + "epoch": 1.14, + "learning_rate": 3.629225988089983e-05, + "logits/chosen": -1.6549468040466309, + "logits/rejected": -1.6761040687561035, + "logps/chosen": -126.88896179199219, + "logps/rejected": -162.6875457763672, + "loss": 0.1569, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3273538649082184, + "rewards/margins": 4.158275604248047, + "rewards/rejected": -4.485629081726074, + "step": 869 + }, + { + "epoch": 1.14, + "learning_rate": 3.6260281487132846e-05, + "logits/chosen": -2.0777931213378906, + "logits/rejected": -2.1053049564361572, + "logps/chosen": -141.18258666992188, + "logps/rejected": -213.8999786376953, + "loss": 0.0952, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5611621141433716, + "rewards/margins": 6.539155006408691, + "rewards/rejected": -7.100317001342773, + "step": 870 + }, + { + "epoch": 1.14, + "learning_rate": 3.622827996378203e-05, + "logits/chosen": -1.9073095321655273, + "logits/rejected": -2.015732765197754, + "logps/chosen": -133.18026733398438, + "logps/rejected": -161.3040771484375, + "loss": 0.1126, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45305973291397095, + "rewards/margins": 3.4641504287719727, + "rewards/rejected": -3.917210578918457, + "step": 871 + }, + { + "epoch": 1.14, + "learning_rate": 3.6196255376581254e-05, + "logits/chosen": -1.976507306098938, + "logits/rejected": -1.9917340278625488, + "logps/chosen": -149.463623046875, + "logps/rejected": -251.50559997558594, + "loss": 0.01, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2065681666135788, + "rewards/margins": 7.24117374420166, + "rewards/rejected": -7.034605503082275, + "step": 872 + }, + { + "epoch": 1.14, + "learning_rate": 3.616420779131177e-05, + "logits/chosen": -1.90818190574646, + "logits/rejected": -1.9081482887268066, + "logps/chosen": -208.44915771484375, + "logps/rejected": -265.2822265625, + "loss": 0.2055, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7645907998085022, + "rewards/margins": 5.729945659637451, + "rewards/rejected": -6.49453592300415, + "step": 873 + }, + { + "epoch": 1.14, + "learning_rate": 3.613213727380206e-05, + "logits/chosen": -1.9490686655044556, + "logits/rejected": -1.9532685279846191, + "logps/chosen": -151.6713104248047, + "logps/rejected": -201.02764892578125, + "loss": 0.0896, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1919515132904053, + "rewards/margins": 5.8545074462890625, + "rewards/rejected": -7.046459197998047, + "step": 874 + }, + { + "epoch": 1.15, + "learning_rate": 3.610004388992771e-05, + "logits/chosen": -1.531323790550232, + "logits/rejected": -1.5893418788909912, + "logps/chosen": -119.92091369628906, + "logps/rejected": -190.87521362304688, + "loss": 0.0822, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02865486592054367, + "rewards/margins": 7.342573165893555, + "rewards/rejected": -7.3139190673828125, + "step": 875 + }, + { + "epoch": 1.15, + "learning_rate": 3.6067927705611304e-05, + "logits/chosen": -1.969842553138733, + "logits/rejected": -1.9398161172866821, + "logps/chosen": -149.05857849121094, + "logps/rejected": -244.5399932861328, + "loss": 0.06, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.039116568863391876, + "rewards/margins": 6.896425724029541, + "rewards/rejected": -6.935542583465576, + "step": 876 + }, + { + "epoch": 1.15, + "learning_rate": 3.6035788786822225e-05, + "logits/chosen": -2.0903704166412354, + "logits/rejected": -2.1718435287475586, + "logps/chosen": -186.17471313476562, + "logps/rejected": -222.77317810058594, + "loss": 0.1733, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8885395526885986, + "rewards/margins": 4.886023044586182, + "rewards/rejected": -5.774562835693359, + "step": 877 + }, + { + "epoch": 1.15, + "learning_rate": 3.6003627199576564e-05, + "logits/chosen": -1.7578074932098389, + "logits/rejected": -1.8191397190093994, + "logps/chosen": -156.69163513183594, + "logps/rejected": -235.78305053710938, + "loss": 0.0509, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.1328904628753662, + "rewards/margins": 7.572022438049316, + "rewards/rejected": -7.704913139343262, + "step": 878 + }, + { + "epoch": 1.15, + "learning_rate": 3.597144300993699e-05, + "logits/chosen": -1.8367571830749512, + "logits/rejected": -1.9178485870361328, + "logps/chosen": -203.2810516357422, + "logps/rejected": -253.2967987060547, + "loss": 0.1422, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8315750360488892, + "rewards/margins": 5.403246879577637, + "rewards/rejected": -6.2348222732543945, + "step": 879 + }, + { + "epoch": 1.15, + "learning_rate": 3.593923628401259e-05, + "logits/chosen": -2.070420265197754, + "logits/rejected": -2.117854356765747, + "logps/chosen": -129.53744506835938, + "logps/rejected": -213.93307495117188, + "loss": 0.1386, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.20923367142677307, + "rewards/margins": 7.3108906745910645, + "rewards/rejected": -7.101656436920166, + "step": 880 + }, + { + "epoch": 1.15, + "learning_rate": 3.5907007087958726e-05, + "logits/chosen": -1.6655091047286987, + "logits/rejected": -1.693989634513855, + "logps/chosen": -168.85360717773438, + "logps/rejected": -233.3699951171875, + "loss": 0.2251, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.492025375366211, + "rewards/margins": 4.709729194641113, + "rewards/rejected": -6.201755046844482, + "step": 881 + }, + { + "epoch": 1.15, + "learning_rate": 3.587475548797694e-05, + "logits/chosen": -2.0595686435699463, + "logits/rejected": -1.963339924812317, + "logps/chosen": -149.55838012695312, + "logps/rejected": -191.93299865722656, + "loss": 0.1129, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6186510920524597, + "rewards/margins": 5.05853271484375, + "rewards/rejected": -5.677184104919434, + "step": 882 + }, + { + "epoch": 1.16, + "learning_rate": 3.5842481550314794e-05, + "logits/chosen": -1.9392576217651367, + "logits/rejected": -1.889467477798462, + "logps/chosen": -168.27606201171875, + "logps/rejected": -279.1664733886719, + "loss": 0.0122, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9032707214355469, + "rewards/margins": 8.34951400756836, + "rewards/rejected": -9.25278377532959, + "step": 883 + }, + { + "epoch": 1.16, + "learning_rate": 3.581018534126571e-05, + "logits/chosen": -2.025470495223999, + "logits/rejected": -2.0483429431915283, + "logps/chosen": -201.74916076660156, + "logps/rejected": -267.3995361328125, + "loss": 0.08, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8922535181045532, + "rewards/margins": 5.836456775665283, + "rewards/rejected": -6.728710174560547, + "step": 884 + }, + { + "epoch": 1.16, + "learning_rate": 3.577786692716886e-05, + "logits/chosen": -1.8123650550842285, + "logits/rejected": -1.8336703777313232, + "logps/chosen": -166.75900268554688, + "logps/rejected": -210.2139129638672, + "loss": 0.107, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.29484596848487854, + "rewards/margins": 5.857831001281738, + "rewards/rejected": -6.152677059173584, + "step": 885 + }, + { + "epoch": 1.16, + "learning_rate": 3.574552637440907e-05, + "logits/chosen": -1.826763391494751, + "logits/rejected": -1.8925033807754517, + "logps/chosen": -147.36468505859375, + "logps/rejected": -225.35337829589844, + "loss": 0.0689, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.057733140885829926, + "rewards/margins": 6.5763630867004395, + "rewards/rejected": -6.634096622467041, + "step": 886 + }, + { + "epoch": 1.16, + "learning_rate": 3.571316374941658e-05, + "logits/chosen": -1.8496880531311035, + "logits/rejected": -1.9214247465133667, + "logps/chosen": -143.42526245117188, + "logps/rejected": -276.57928466796875, + "loss": 0.0297, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7962692975997925, + "rewards/margins": 8.667549133300781, + "rewards/rejected": -9.46381950378418, + "step": 887 + }, + { + "epoch": 1.16, + "learning_rate": 3.568077911866703e-05, + "logits/chosen": -2.2294137477874756, + "logits/rejected": -2.256291627883911, + "logps/chosen": -165.8625030517578, + "logps/rejected": -226.53073120117188, + "loss": 0.1032, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0509380102157593, + "rewards/margins": 5.621366500854492, + "rewards/rejected": -6.672304153442383, + "step": 888 + }, + { + "epoch": 1.16, + "learning_rate": 3.564837254868118e-05, + "logits/chosen": -1.8474410772323608, + "logits/rejected": -1.8447623252868652, + "logps/chosen": -170.68482971191406, + "logps/rejected": -241.2418212890625, + "loss": 0.1629, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5344374179840088, + "rewards/margins": 6.09687614440918, + "rewards/rejected": -6.631314277648926, + "step": 889 + }, + { + "epoch": 1.16, + "learning_rate": 3.561594410602495e-05, + "logits/chosen": -1.969462513923645, + "logits/rejected": -1.974022388458252, + "logps/chosen": -155.7165985107422, + "logps/rejected": -204.6715087890625, + "loss": 0.1178, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.070492148399353, + "rewards/margins": 5.344703197479248, + "rewards/rejected": -6.415195465087891, + "step": 890 + }, + { + "epoch": 1.17, + "learning_rate": 3.558349385730913e-05, + "logits/chosen": -1.9857453107833862, + "logits/rejected": -1.9735053777694702, + "logps/chosen": -171.04324340820312, + "logps/rejected": -229.79617309570312, + "loss": 0.0638, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3228389322757721, + "rewards/margins": 6.23259162902832, + "rewards/rejected": -6.555430889129639, + "step": 891 + }, + { + "epoch": 1.17, + "learning_rate": 3.5551021869189286e-05, + "logits/chosen": -1.9572652578353882, + "logits/rejected": -2.0211684703826904, + "logps/chosen": -177.00323486328125, + "logps/rejected": -242.7465362548828, + "loss": 0.0911, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.09810394793748856, + "rewards/margins": 6.194761753082275, + "rewards/rejected": -6.292865753173828, + "step": 892 + }, + { + "epoch": 1.17, + "learning_rate": 3.55185282083657e-05, + "logits/chosen": -2.00260066986084, + "logits/rejected": -2.0481128692626953, + "logps/chosen": -179.40768432617188, + "logps/rejected": -237.79440307617188, + "loss": 0.1789, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9579204320907593, + "rewards/margins": 5.24198579788208, + "rewards/rejected": -6.1999053955078125, + "step": 893 + }, + { + "epoch": 1.17, + "learning_rate": 3.548601294158313e-05, + "logits/chosen": -1.9575377702713013, + "logits/rejected": -1.9701429605484009, + "logps/chosen": -170.8883819580078, + "logps/rejected": -204.66455078125, + "loss": 0.2138, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2080439329147339, + "rewards/margins": 3.7738595008850098, + "rewards/rejected": -4.981903076171875, + "step": 894 + }, + { + "epoch": 1.17, + "learning_rate": 3.5453476135630706e-05, + "logits/chosen": -2.005340814590454, + "logits/rejected": -2.003964900970459, + "logps/chosen": -153.75393676757812, + "logps/rejected": -222.0598907470703, + "loss": 0.0137, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16159169375896454, + "rewards/margins": 6.470172882080078, + "rewards/rejected": -6.308581352233887, + "step": 895 + }, + { + "epoch": 1.17, + "learning_rate": 3.542091785734184e-05, + "logits/chosen": -1.9905329942703247, + "logits/rejected": -1.9771034717559814, + "logps/chosen": -140.4086456298828, + "logps/rejected": -171.7359619140625, + "loss": 0.1531, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.499469518661499, + "rewards/margins": 4.087397575378418, + "rewards/rejected": -4.586867332458496, + "step": 896 + }, + { + "epoch": 1.17, + "learning_rate": 3.538833817359401e-05, + "logits/chosen": -2.136502265930176, + "logits/rejected": -2.1341865062713623, + "logps/chosen": -172.52622985839844, + "logps/rejected": -240.81138610839844, + "loss": 0.188, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5810390114784241, + "rewards/margins": 6.328227996826172, + "rewards/rejected": -6.909266948699951, + "step": 897 + }, + { + "epoch": 1.18, + "learning_rate": 3.5355737151308686e-05, + "logits/chosen": -1.9634038209915161, + "logits/rejected": -1.9026023149490356, + "logps/chosen": -133.00393676757812, + "logps/rejected": -188.03909301757812, + "loss": 0.0604, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5214068293571472, + "rewards/margins": 5.715162754058838, + "rewards/rejected": -6.236569881439209, + "step": 898 + }, + { + "epoch": 1.18, + "learning_rate": 3.5323114857451174e-05, + "logits/chosen": -2.059035301208496, + "logits/rejected": -2.093261241912842, + "logps/chosen": -178.64810180664062, + "logps/rejected": -254.74835205078125, + "loss": 0.0499, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20145034790039062, + "rewards/margins": 7.516545295715332, + "rewards/rejected": -7.7179951667785645, + "step": 899 + }, + { + "epoch": 1.18, + "learning_rate": 3.529047135903045e-05, + "logits/chosen": -1.9401965141296387, + "logits/rejected": -1.893580675125122, + "logps/chosen": -148.152099609375, + "logps/rejected": -217.60165405273438, + "loss": 0.0832, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.22576339542865753, + "rewards/margins": 6.487100124359131, + "rewards/rejected": -6.2613372802734375, + "step": 900 + }, + { + "epoch": 1.18, + "learning_rate": 3.525780672309907e-05, + "logits/chosen": -2.099334478378296, + "logits/rejected": -2.1224048137664795, + "logps/chosen": -176.6794891357422, + "logps/rejected": -189.8108367919922, + "loss": 0.1743, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3507277965545654, + "rewards/margins": 4.109447956085205, + "rewards/rejected": -5.460175514221191, + "step": 901 + }, + { + "epoch": 1.18, + "learning_rate": 3.522512101675299e-05, + "logits/chosen": -2.1799097061157227, + "logits/rejected": -2.176818609237671, + "logps/chosen": -181.85447692871094, + "logps/rejected": -213.69349670410156, + "loss": 0.2637, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0590920448303223, + "rewards/margins": 3.6977596282958984, + "rewards/rejected": -4.756852149963379, + "step": 902 + }, + { + "epoch": 1.18, + "learning_rate": 3.519241430713145e-05, + "logits/chosen": -1.9091711044311523, + "logits/rejected": -2.006434679031372, + "logps/chosen": -143.8796844482422, + "logps/rejected": -204.40049743652344, + "loss": 0.0631, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.471727192401886, + "rewards/margins": 6.225522994995117, + "rewards/rejected": -6.6972503662109375, + "step": 903 + }, + { + "epoch": 1.18, + "learning_rate": 3.5159686661416834e-05, + "logits/chosen": -1.8754589557647705, + "logits/rejected": -1.8962466716766357, + "logps/chosen": -160.32774353027344, + "logps/rejected": -256.0925598144531, + "loss": 0.0538, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6474647521972656, + "rewards/margins": 8.686860084533691, + "rewards/rejected": -8.039395332336426, + "step": 904 + }, + { + "epoch": 1.18, + "learning_rate": 3.512693814683456e-05, + "logits/chosen": -1.6069674491882324, + "logits/rejected": -1.6205403804779053, + "logps/chosen": -159.8393096923828, + "logps/rejected": -223.45440673828125, + "loss": 0.107, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5345543622970581, + "rewards/margins": 4.439685344696045, + "rewards/rejected": -4.974239826202393, + "step": 905 + }, + { + "epoch": 1.19, + "learning_rate": 3.5094168830652854e-05, + "logits/chosen": -1.88560950756073, + "logits/rejected": -1.9885283708572388, + "logps/chosen": -162.08815002441406, + "logps/rejected": -214.88351440429688, + "loss": 0.1758, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.36401665210723877, + "rewards/margins": 5.732705116271973, + "rewards/rejected": -6.09672212600708, + "step": 906 + }, + { + "epoch": 1.19, + "learning_rate": 3.506137878018272e-05, + "logits/chosen": -2.0618391036987305, + "logits/rejected": -2.058894634246826, + "logps/chosen": -158.00828552246094, + "logps/rejected": -212.74961853027344, + "loss": 0.0398, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6322505474090576, + "rewards/margins": 5.919046401977539, + "rewards/rejected": -6.551296710968018, + "step": 907 + }, + { + "epoch": 1.19, + "learning_rate": 3.502856806277773e-05, + "logits/chosen": -1.9531464576721191, + "logits/rejected": -2.014647960662842, + "logps/chosen": -155.9281005859375, + "logps/rejected": -252.1271209716797, + "loss": 0.0108, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32207781076431274, + "rewards/margins": 9.11711597442627, + "rewards/rejected": -8.795039176940918, + "step": 908 + }, + { + "epoch": 1.19, + "learning_rate": 3.4995736745833895e-05, + "logits/chosen": -2.0440237522125244, + "logits/rejected": -2.0321359634399414, + "logps/chosen": -193.749267578125, + "logps/rejected": -250.82113647460938, + "loss": 0.122, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3427705764770508, + "rewards/margins": 5.0545477867126465, + "rewards/rejected": -5.3973188400268555, + "step": 909 + }, + { + "epoch": 1.19, + "learning_rate": 3.496288489678958e-05, + "logits/chosen": -2.010633707046509, + "logits/rejected": -1.9929156303405762, + "logps/chosen": -166.10728454589844, + "logps/rejected": -248.0762939453125, + "loss": 0.0532, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27336451411247253, + "rewards/margins": 7.333755016326904, + "rewards/rejected": -7.060391426086426, + "step": 910 + }, + { + "epoch": 1.19, + "learning_rate": 3.493001258312529e-05, + "logits/chosen": -2.1049704551696777, + "logits/rejected": -2.184370279312134, + "logps/chosen": -152.46676635742188, + "logps/rejected": -193.95933532714844, + "loss": 0.1413, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22922754287719727, + "rewards/margins": 5.094156265258789, + "rewards/rejected": -5.323383331298828, + "step": 911 + }, + { + "epoch": 1.19, + "learning_rate": 3.489711987236357e-05, + "logits/chosen": -1.8540668487548828, + "logits/rejected": -1.8573285341262817, + "logps/chosen": -150.3858642578125, + "logps/rejected": -199.2803192138672, + "loss": 0.1095, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.062329575419425964, + "rewards/margins": 4.983062744140625, + "rewards/rejected": -4.920733451843262, + "step": 912 + }, + { + "epoch": 1.19, + "learning_rate": 3.4864206832068884e-05, + "logits/chosen": -1.9019203186035156, + "logits/rejected": -1.9186983108520508, + "logps/chosen": -166.6719207763672, + "logps/rejected": -236.49954223632812, + "loss": 0.059, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6351613402366638, + "rewards/margins": 5.781877040863037, + "rewards/rejected": -6.417038440704346, + "step": 913 + }, + { + "epoch": 1.2, + "learning_rate": 3.483127352984742e-05, + "logits/chosen": -1.8288668394088745, + "logits/rejected": -1.8692231178283691, + "logps/chosen": -203.69033813476562, + "logps/rejected": -294.76300048828125, + "loss": 0.1123, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3427416682243347, + "rewards/margins": 6.824990272521973, + "rewards/rejected": -6.482248306274414, + "step": 914 + }, + { + "epoch": 1.2, + "learning_rate": 3.479832003334702e-05, + "logits/chosen": -1.9807475805282593, + "logits/rejected": -1.9427908658981323, + "logps/chosen": -154.55177307128906, + "logps/rejected": -220.6343536376953, + "loss": 0.0636, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.33972498774528503, + "rewards/margins": 6.373534202575684, + "rewards/rejected": -6.033810615539551, + "step": 915 + }, + { + "epoch": 1.2, + "learning_rate": 3.476534641025698e-05, + "logits/chosen": -1.9640017747879028, + "logits/rejected": -2.0024826526641846, + "logps/chosen": -144.81207275390625, + "logps/rejected": -178.35458374023438, + "loss": 0.106, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5719605088233948, + "rewards/margins": 4.674602031707764, + "rewards/rejected": -5.2465620040893555, + "step": 916 + }, + { + "epoch": 1.2, + "learning_rate": 3.4732352728307966e-05, + "logits/chosen": -2.039835214614868, + "logits/rejected": -2.054598569869995, + "logps/chosen": -137.9036407470703, + "logps/rejected": -193.40792846679688, + "loss": 0.171, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.238839864730835, + "rewards/margins": 4.93145751953125, + "rewards/rejected": -6.170297622680664, + "step": 917 + }, + { + "epoch": 1.2, + "learning_rate": 3.469933905527182e-05, + "logits/chosen": -2.0529818534851074, + "logits/rejected": -2.0553789138793945, + "logps/chosen": -181.83724975585938, + "logps/rejected": -250.62246704101562, + "loss": 0.0482, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.027744874358177185, + "rewards/margins": 7.317486763000488, + "rewards/rejected": -7.345231056213379, + "step": 918 + }, + { + "epoch": 1.2, + "learning_rate": 3.466630545896146e-05, + "logits/chosen": -1.7903227806091309, + "logits/rejected": -1.8518483638763428, + "logps/chosen": -148.36544799804688, + "logps/rejected": -238.90078735351562, + "loss": 0.0195, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14351078867912292, + "rewards/margins": 6.769139289855957, + "rewards/rejected": -6.912649631500244, + "step": 919 + }, + { + "epoch": 1.2, + "learning_rate": 3.463325200723071e-05, + "logits/chosen": -1.8101056814193726, + "logits/rejected": -1.8288451433181763, + "logps/chosen": -129.47695922851562, + "logps/rejected": -183.96656799316406, + "loss": 0.061, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39186879992485046, + "rewards/margins": 5.672905921936035, + "rewards/rejected": -6.064774036407471, + "step": 920 + }, + { + "epoch": 1.21, + "learning_rate": 3.460017876797422e-05, + "logits/chosen": -1.7113351821899414, + "logits/rejected": -1.7809813022613525, + "logps/chosen": -167.154541015625, + "logps/rejected": -234.83380126953125, + "loss": 0.0302, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08966308832168579, + "rewards/margins": 7.432629108428955, + "rewards/rejected": -7.342965602874756, + "step": 921 + }, + { + "epoch": 1.21, + "learning_rate": 3.456708580912725e-05, + "logits/chosen": -1.7443047761917114, + "logits/rejected": -1.7694761753082275, + "logps/chosen": -139.82110595703125, + "logps/rejected": -190.7325439453125, + "loss": 0.1295, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8659021854400635, + "rewards/margins": 5.205358982086182, + "rewards/rejected": -6.071261882781982, + "step": 922 + }, + { + "epoch": 1.21, + "learning_rate": 3.453397319866557e-05, + "logits/chosen": -2.0398495197296143, + "logits/rejected": -2.0124411582946777, + "logps/chosen": -151.38482666015625, + "logps/rejected": -214.8488311767578, + "loss": 0.193, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8532832860946655, + "rewards/margins": 5.717514991760254, + "rewards/rejected": -6.570797920227051, + "step": 923 + }, + { + "epoch": 1.21, + "learning_rate": 3.4500841004605324e-05, + "logits/chosen": -2.002171039581299, + "logits/rejected": -2.0679383277893066, + "logps/chosen": -218.12924194335938, + "logps/rejected": -283.44219970703125, + "loss": 0.0952, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.25244951248168945, + "rewards/margins": 5.972373008728027, + "rewards/rejected": -6.224822044372559, + "step": 924 + }, + { + "epoch": 1.21, + "learning_rate": 3.446768929500288e-05, + "logits/chosen": -1.782718539237976, + "logits/rejected": -1.842342734336853, + "logps/chosen": -170.8223876953125, + "logps/rejected": -227.46841430664062, + "loss": 0.1333, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4326009750366211, + "rewards/margins": 6.7193145751953125, + "rewards/rejected": -7.151915550231934, + "step": 925 + }, + { + "epoch": 1.21, + "learning_rate": 3.443451813795469e-05, + "logits/chosen": -1.9137529134750366, + "logits/rejected": -1.9013252258300781, + "logps/chosen": -166.90280151367188, + "logps/rejected": -216.41151428222656, + "loss": 0.172, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.974904477596283, + "rewards/margins": 5.321615695953369, + "rewards/rejected": -6.296520709991455, + "step": 926 + }, + { + "epoch": 1.21, + "learning_rate": 3.4401327601597174e-05, + "logits/chosen": -2.1041436195373535, + "logits/rejected": -2.086627721786499, + "logps/chosen": -140.56207275390625, + "logps/rejected": -184.04986572265625, + "loss": 0.1364, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7975207567214966, + "rewards/margins": 5.4658427238464355, + "rewards/rejected": -6.263363361358643, + "step": 927 + }, + { + "epoch": 1.21, + "learning_rate": 3.436811775410651e-05, + "logits/chosen": -2.108426570892334, + "logits/rejected": -2.1942784786224365, + "logps/chosen": -189.42230224609375, + "logps/rejected": -268.1767578125, + "loss": 0.118, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.321402907371521, + "rewards/margins": 5.401063442230225, + "rewards/rejected": -6.722466468811035, + "step": 928 + }, + { + "epoch": 1.22, + "learning_rate": 3.43348886636986e-05, + "logits/chosen": -1.9828169345855713, + "logits/rejected": -1.994112491607666, + "logps/chosen": -151.9613494873047, + "logps/rejected": -221.04864501953125, + "loss": 0.0853, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6451107859611511, + "rewards/margins": 5.837140083312988, + "rewards/rejected": -6.482250690460205, + "step": 929 + }, + { + "epoch": 1.22, + "learning_rate": 3.430164039862882e-05, + "logits/chosen": -1.779541254043579, + "logits/rejected": -1.7532923221588135, + "logps/chosen": -202.18359375, + "logps/rejected": -259.9750061035156, + "loss": 0.1775, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7676839828491211, + "rewards/margins": 5.355978012084961, + "rewards/rejected": -6.12366247177124, + "step": 930 + }, + { + "epoch": 1.22, + "learning_rate": 3.426837302719197e-05, + "logits/chosen": -1.8977571725845337, + "logits/rejected": -1.91450035572052, + "logps/chosen": -161.89410400390625, + "logps/rejected": -217.23263549804688, + "loss": 0.1046, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7960255742073059, + "rewards/margins": 5.442742824554443, + "rewards/rejected": -6.238769054412842, + "step": 931 + }, + { + "epoch": 1.22, + "learning_rate": 3.42350866177221e-05, + "logits/chosen": -1.796115517616272, + "logits/rejected": -1.8026353120803833, + "logps/chosen": -167.72669982910156, + "logps/rejected": -235.31195068359375, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7101820111274719, + "rewards/margins": 7.052955150604248, + "rewards/rejected": -7.763136386871338, + "step": 932 + }, + { + "epoch": 1.22, + "learning_rate": 3.420178123859233e-05, + "logits/chosen": -1.884534239768982, + "logits/rejected": -1.9360039234161377, + "logps/chosen": -175.97605895996094, + "logps/rejected": -280.8975524902344, + "loss": 0.009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3528714179992676, + "rewards/margins": 7.842462539672852, + "rewards/rejected": -8.195335388183594, + "step": 933 + }, + { + "epoch": 1.22, + "learning_rate": 3.416845695821476e-05, + "logits/chosen": -1.6204770803451538, + "logits/rejected": -1.7952289581298828, + "logps/chosen": -141.75640869140625, + "logps/rejected": -205.68414306640625, + "loss": 0.0849, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3700450658798218, + "rewards/margins": 5.952082633972168, + "rewards/rejected": -7.322127342224121, + "step": 934 + }, + { + "epoch": 1.22, + "learning_rate": 3.413511384504034e-05, + "logits/chosen": -1.999087929725647, + "logits/rejected": -2.0146172046661377, + "logps/chosen": -181.1517791748047, + "logps/rejected": -228.80722045898438, + "loss": 0.0826, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20987439155578613, + "rewards/margins": 6.5635085105896, + "rewards/rejected": -6.353633403778076, + "step": 935 + }, + { + "epoch": 1.22, + "learning_rate": 3.410175196755866e-05, + "logits/chosen": -1.9721521139144897, + "logits/rejected": -2.0418949127197266, + "logps/chosen": -205.38955688476562, + "logps/rejected": -272.0664367675781, + "loss": 0.1197, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2950458526611328, + "rewards/margins": 5.9981231689453125, + "rewards/rejected": -7.2931694984436035, + "step": 936 + }, + { + "epoch": 1.23, + "learning_rate": 3.40683713942979e-05, + "logits/chosen": -2.0940282344818115, + "logits/rejected": -2.13757586479187, + "logps/chosen": -155.8927459716797, + "logps/rejected": -231.48611450195312, + "loss": 0.0444, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1400851011276245, + "rewards/margins": 6.619723320007324, + "rewards/rejected": -7.75980806350708, + "step": 937 + }, + { + "epoch": 1.23, + "learning_rate": 3.403497219382461e-05, + "logits/chosen": -1.7476333379745483, + "logits/rejected": -1.7370529174804688, + "logps/chosen": -148.12130737304688, + "logps/rejected": -232.86611938476562, + "loss": 0.1126, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08682404458522797, + "rewards/margins": 7.703474044799805, + "rewards/rejected": -7.790297985076904, + "step": 938 + }, + { + "epoch": 1.23, + "learning_rate": 3.400155443474361e-05, + "logits/chosen": -2.0138838291168213, + "logits/rejected": -2.05570912361145, + "logps/chosen": -162.871826171875, + "logps/rejected": -249.21856689453125, + "loss": 0.1125, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1651180535554886, + "rewards/margins": 7.845034122467041, + "rewards/rejected": -8.010151863098145, + "step": 939 + }, + { + "epoch": 1.23, + "learning_rate": 3.396811818569785e-05, + "logits/chosen": -1.6162145137786865, + "logits/rejected": -1.7617723941802979, + "logps/chosen": -154.22023010253906, + "logps/rejected": -233.10836791992188, + "loss": 0.1039, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.20926518738269806, + "rewards/margins": 6.4955153465271, + "rewards/rejected": -6.7047810554504395, + "step": 940 + }, + { + "epoch": 1.23, + "learning_rate": 3.3934663515368236e-05, + "logits/chosen": -2.0607364177703857, + "logits/rejected": -2.064283847808838, + "logps/chosen": -181.87879943847656, + "logps/rejected": -239.00088500976562, + "loss": 0.0548, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.947770357131958, + "rewards/margins": 5.138289451599121, + "rewards/rejected": -7.086060047149658, + "step": 941 + }, + { + "epoch": 1.23, + "learning_rate": 3.3901190492473554e-05, + "logits/chosen": -2.0121662616729736, + "logits/rejected": -1.9954758882522583, + "logps/chosen": -164.00282287597656, + "logps/rejected": -251.4072723388672, + "loss": 0.057, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6742583513259888, + "rewards/margins": 6.951840877532959, + "rewards/rejected": -7.626099109649658, + "step": 942 + }, + { + "epoch": 1.23, + "learning_rate": 3.3867699185770255e-05, + "logits/chosen": -1.8472353219985962, + "logits/rejected": -1.953643560409546, + "logps/chosen": -150.764404296875, + "logps/rejected": -238.75494384765625, + "loss": 0.0171, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6845617294311523, + "rewards/margins": 6.793731689453125, + "rewards/rejected": -7.478293418884277, + "step": 943 + }, + { + "epoch": 1.24, + "learning_rate": 3.383418966405234e-05, + "logits/chosen": -2.131664276123047, + "logits/rejected": -2.15356707572937, + "logps/chosen": -174.39974975585938, + "logps/rejected": -247.7554168701172, + "loss": 0.0517, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35220858454704285, + "rewards/margins": 7.23449182510376, + "rewards/rejected": -7.586700439453125, + "step": 944 + }, + { + "epoch": 1.24, + "learning_rate": 3.3800661996151264e-05, + "logits/chosen": -2.0658442974090576, + "logits/rejected": -2.0301132202148438, + "logps/chosen": -163.01617431640625, + "logps/rejected": -198.7908477783203, + "loss": 0.1661, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4769498109817505, + "rewards/margins": 4.503304481506348, + "rewards/rejected": -5.980254173278809, + "step": 945 + }, + { + "epoch": 1.24, + "learning_rate": 3.376711625093571e-05, + "logits/chosen": -1.9626531600952148, + "logits/rejected": -2.005638360977173, + "logps/chosen": -227.94239807128906, + "logps/rejected": -285.3360900878906, + "loss": 0.121, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0329959392547607, + "rewards/margins": 5.07936429977417, + "rewards/rejected": -6.11236047744751, + "step": 946 + }, + { + "epoch": 1.24, + "learning_rate": 3.373355249731153e-05, + "logits/chosen": -1.6072015762329102, + "logits/rejected": -1.6854274272918701, + "logps/chosen": -149.63619995117188, + "logps/rejected": -231.1209716796875, + "loss": 0.0537, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.43269726634025574, + "rewards/margins": 7.149322032928467, + "rewards/rejected": -7.582018852233887, + "step": 947 + }, + { + "epoch": 1.24, + "learning_rate": 3.369997080422155e-05, + "logits/chosen": -1.702265739440918, + "logits/rejected": -1.7799084186553955, + "logps/chosen": -170.57896423339844, + "logps/rejected": -253.85064697265625, + "loss": 0.146, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.236419916152954, + "rewards/margins": 6.673525810241699, + "rewards/rejected": -7.909945964813232, + "step": 948 + }, + { + "epoch": 1.24, + "learning_rate": 3.366637124064544e-05, + "logits/chosen": -1.895546793937683, + "logits/rejected": -1.9188759326934814, + "logps/chosen": -179.75929260253906, + "logps/rejected": -222.67404174804688, + "loss": 0.1123, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3694798946380615, + "rewards/margins": 5.722755432128906, + "rewards/rejected": -7.092235088348389, + "step": 949 + }, + { + "epoch": 1.24, + "learning_rate": 3.36327538755996e-05, + "logits/chosen": -1.7629483938217163, + "logits/rejected": -1.834211826324463, + "logps/chosen": -163.36801147460938, + "logps/rejected": -240.06100463867188, + "loss": 0.0835, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1130611896514893, + "rewards/margins": 6.393387794494629, + "rewards/rejected": -7.5064496994018555, + "step": 950 + }, + { + "epoch": 1.24, + "learning_rate": 3.3599118778136965e-05, + "logits/chosen": -1.956146478652954, + "logits/rejected": -1.949027180671692, + "logps/chosen": -154.01885986328125, + "logps/rejected": -214.5371856689453, + "loss": 0.0942, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0469669103622437, + "rewards/margins": 5.84091329574585, + "rewards/rejected": -6.887880325317383, + "step": 951 + }, + { + "epoch": 1.25, + "learning_rate": 3.356546601734692e-05, + "logits/chosen": -2.103282928466797, + "logits/rejected": -2.1140389442443848, + "logps/chosen": -160.41705322265625, + "logps/rejected": -222.67813110351562, + "loss": 0.0934, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0452954769134521, + "rewards/margins": 5.0740461349487305, + "rewards/rejected": -6.119342803955078, + "step": 952 + }, + { + "epoch": 1.25, + "learning_rate": 3.3531795662355115e-05, + "logits/chosen": -2.031980276107788, + "logits/rejected": -2.070544481277466, + "logps/chosen": -160.99761962890625, + "logps/rejected": -243.9342041015625, + "loss": 0.0933, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9394694566726685, + "rewards/margins": 7.686059951782227, + "rewards/rejected": -8.625529289245605, + "step": 953 + }, + { + "epoch": 1.25, + "learning_rate": 3.349810778232335e-05, + "logits/chosen": -1.8342362642288208, + "logits/rejected": -1.9654871225357056, + "logps/chosen": -167.48745727539062, + "logps/rejected": -208.47169494628906, + "loss": 0.1846, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0384410619735718, + "rewards/margins": 4.180461883544922, + "rewards/rejected": -5.218903064727783, + "step": 954 + }, + { + "epoch": 1.25, + "learning_rate": 3.346440244644942e-05, + "logits/chosen": -1.765977144241333, + "logits/rejected": -1.7814247608184814, + "logps/chosen": -162.0328826904297, + "logps/rejected": -251.04905700683594, + "loss": 0.0176, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1184916496276855, + "rewards/margins": 6.529496669769287, + "rewards/rejected": -7.647988319396973, + "step": 955 + }, + { + "epoch": 1.25, + "learning_rate": 3.3430679723966976e-05, + "logits/chosen": -1.7043746709823608, + "logits/rejected": -1.6794589757919312, + "logps/chosen": -177.63613891601562, + "logps/rejected": -251.107666015625, + "loss": 0.1218, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0984007120132446, + "rewards/margins": 7.543315410614014, + "rewards/rejected": -8.641716003417969, + "step": 956 + }, + { + "epoch": 1.25, + "learning_rate": 3.339693968414538e-05, + "logits/chosen": -1.9795281887054443, + "logits/rejected": -1.9849295616149902, + "logps/chosen": -137.0846710205078, + "logps/rejected": -184.3840789794922, + "loss": 0.2054, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7593737840652466, + "rewards/margins": 4.5130181312561035, + "rewards/rejected": -5.2723917961120605, + "step": 957 + }, + { + "epoch": 1.25, + "learning_rate": 3.336318239628956e-05, + "logits/chosen": -2.0036728382110596, + "logits/rejected": -1.933376669883728, + "logps/chosen": -211.4518585205078, + "logps/rejected": -290.0897216796875, + "loss": 0.1094, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8515461087226868, + "rewards/margins": 5.591423511505127, + "rewards/rejected": -6.442969799041748, + "step": 958 + }, + { + "epoch": 1.26, + "learning_rate": 3.3329407929739906e-05, + "logits/chosen": -1.83505117893219, + "logits/rejected": -1.8577007055282593, + "logps/chosen": -186.45472717285156, + "logps/rejected": -260.4599304199219, + "loss": 0.067, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8454413414001465, + "rewards/margins": 6.24069881439209, + "rewards/rejected": -7.0861406326293945, + "step": 959 + }, + { + "epoch": 1.26, + "learning_rate": 3.3295616353872026e-05, + "logits/chosen": -1.5665518045425415, + "logits/rejected": -1.5015395879745483, + "logps/chosen": -196.26315307617188, + "logps/rejected": -232.317138671875, + "loss": 0.0214, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3765389919281006, + "rewards/margins": 5.861631393432617, + "rewards/rejected": -7.238170623779297, + "step": 960 + }, + { + "epoch": 1.26, + "learning_rate": 3.326180773809676e-05, + "logits/chosen": -1.9945480823516846, + "logits/rejected": -2.0716819763183594, + "logps/chosen": -181.3908233642578, + "logps/rejected": -261.51983642578125, + "loss": 0.1299, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5620561838150024, + "rewards/margins": 5.328022480010986, + "rewards/rejected": -6.890079021453857, + "step": 961 + }, + { + "epoch": 1.26, + "learning_rate": 3.3227982151859873e-05, + "logits/chosen": -2.0687661170959473, + "logits/rejected": -2.008908271789551, + "logps/chosen": -188.9337615966797, + "logps/rejected": -245.06329345703125, + "loss": 0.058, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7507072687149048, + "rewards/margins": 5.501278400421143, + "rewards/rejected": -6.251986026763916, + "step": 962 + }, + { + "epoch": 1.26, + "learning_rate": 3.3194139664642035e-05, + "logits/chosen": -2.0266988277435303, + "logits/rejected": -2.055248260498047, + "logps/chosen": -170.5966339111328, + "logps/rejected": -261.96221923828125, + "loss": 0.1398, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17907534539699554, + "rewards/margins": 7.6258544921875, + "rewards/rejected": -7.804930686950684, + "step": 963 + }, + { + "epoch": 1.26, + "learning_rate": 3.3160280345958614e-05, + "logits/chosen": -2.056675434112549, + "logits/rejected": -2.0761566162109375, + "logps/chosen": -147.96145629882812, + "logps/rejected": -208.98709106445312, + "loss": 0.1157, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7914592623710632, + "rewards/margins": 5.899379730224609, + "rewards/rejected": -6.690839767456055, + "step": 964 + }, + { + "epoch": 1.26, + "learning_rate": 3.3126404265359545e-05, + "logits/chosen": -2.066800832748413, + "logits/rejected": -2.0727744102478027, + "logps/chosen": -142.2117156982422, + "logps/rejected": -224.7412872314453, + "loss": 0.0703, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2246077060699463, + "rewards/margins": 5.702795505523682, + "rewards/rejected": -6.927403450012207, + "step": 965 + }, + { + "epoch": 1.26, + "learning_rate": 3.3092511492429216e-05, + "logits/chosen": -1.8461310863494873, + "logits/rejected": -1.9243555068969727, + "logps/chosen": -137.1815643310547, + "logps/rejected": -208.0272216796875, + "loss": 0.0951, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0130833387374878, + "rewards/margins": 5.715978622436523, + "rewards/rejected": -6.729062080383301, + "step": 966 + }, + { + "epoch": 1.27, + "learning_rate": 3.305860209678628e-05, + "logits/chosen": -1.9411567449569702, + "logits/rejected": -1.9278594255447388, + "logps/chosen": -136.62242126464844, + "logps/rejected": -198.45223999023438, + "loss": 0.1076, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.49253129959106445, + "rewards/margins": 5.900206089019775, + "rewards/rejected": -6.39273738861084, + "step": 967 + }, + { + "epoch": 1.27, + "learning_rate": 3.3024676148083555e-05, + "logits/chosen": -1.78202223777771, + "logits/rejected": -1.7654985189437866, + "logps/chosen": -193.89064025878906, + "logps/rejected": -248.55442810058594, + "loss": 0.1166, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5818485021591187, + "rewards/margins": 6.548543930053711, + "rewards/rejected": -7.130392551422119, + "step": 968 + }, + { + "epoch": 1.27, + "learning_rate": 3.299073371600784e-05, + "logits/chosen": -1.9086130857467651, + "logits/rejected": -1.8986905813217163, + "logps/chosen": -153.02613830566406, + "logps/rejected": -183.6855926513672, + "loss": 0.0734, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0142678022384644, + "rewards/margins": 4.934293746948242, + "rewards/rejected": -5.948561668395996, + "step": 969 + }, + { + "epoch": 1.27, + "learning_rate": 3.29567748702798e-05, + "logits/chosen": -2.012598991394043, + "logits/rejected": -2.0287890434265137, + "logps/chosen": -213.5174102783203, + "logps/rejected": -255.91893005371094, + "loss": 0.1708, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9519654512405396, + "rewards/margins": 4.617500305175781, + "rewards/rejected": -5.569465637207031, + "step": 970 + }, + { + "epoch": 1.27, + "learning_rate": 3.2922799680653816e-05, + "logits/chosen": -1.865847110748291, + "logits/rejected": -1.846561312675476, + "logps/chosen": -210.79624938964844, + "logps/rejected": -282.8587341308594, + "loss": 0.1333, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6331790685653687, + "rewards/margins": 7.995599746704102, + "rewards/rejected": -7.362420082092285, + "step": 971 + }, + { + "epoch": 1.27, + "learning_rate": 3.288880821691785e-05, + "logits/chosen": -1.8660513162612915, + "logits/rejected": -1.8335342407226562, + "logps/chosen": -171.3505096435547, + "logps/rejected": -233.75755310058594, + "loss": 0.0517, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.488456130027771, + "rewards/margins": 6.7542009353637695, + "rewards/rejected": -7.242657661437988, + "step": 972 + }, + { + "epoch": 1.27, + "learning_rate": 3.285480054889327e-05, + "logits/chosen": -1.8647754192352295, + "logits/rejected": -1.8496925830841064, + "logps/chosen": -160.0685577392578, + "logps/rejected": -222.1055145263672, + "loss": 0.1012, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09002574533224106, + "rewards/margins": 6.101812362670898, + "rewards/rejected": -6.011786460876465, + "step": 973 + }, + { + "epoch": 1.27, + "learning_rate": 3.2820776746434764e-05, + "logits/chosen": -2.1003689765930176, + "logits/rejected": -2.096348524093628, + "logps/chosen": -141.83660888671875, + "logps/rejected": -200.94760131835938, + "loss": 0.1068, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0471521615982056, + "rewards/margins": 4.825916290283203, + "rewards/rejected": -5.873068332672119, + "step": 974 + }, + { + "epoch": 1.28, + "learning_rate": 3.278673687943011e-05, + "logits/chosen": -1.9682233333587646, + "logits/rejected": -1.9423187971115112, + "logps/chosen": -145.8862762451172, + "logps/rejected": -204.0359344482422, + "loss": 0.0704, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2208142280578613, + "rewards/margins": 5.135477542877197, + "rewards/rejected": -6.356292724609375, + "step": 975 + }, + { + "epoch": 1.28, + "learning_rate": 3.2752681017800144e-05, + "logits/chosen": -2.1087942123413086, + "logits/rejected": -2.081089973449707, + "logps/chosen": -198.58547973632812, + "logps/rejected": -238.7448272705078, + "loss": 0.1334, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2582627534866333, + "rewards/margins": 5.285311698913574, + "rewards/rejected": -6.543575286865234, + "step": 976 + }, + { + "epoch": 1.28, + "learning_rate": 3.27186092314985e-05, + "logits/chosen": -1.8373255729675293, + "logits/rejected": -1.8873920440673828, + "logps/chosen": -149.75485229492188, + "logps/rejected": -203.3557586669922, + "loss": 0.0728, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4826914966106415, + "rewards/margins": 6.0817389488220215, + "rewards/rejected": -6.564430236816406, + "step": 977 + }, + { + "epoch": 1.28, + "learning_rate": 3.2684521590511566e-05, + "logits/chosen": -1.7711788415908813, + "logits/rejected": -1.783356785774231, + "logps/chosen": -167.17437744140625, + "logps/rejected": -235.35980224609375, + "loss": 0.0221, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3992350399494171, + "rewards/margins": 7.782591819763184, + "rewards/rejected": -8.1818265914917, + "step": 978 + }, + { + "epoch": 1.28, + "learning_rate": 3.2650418164858284e-05, + "logits/chosen": -1.8944889307022095, + "logits/rejected": -1.8610248565673828, + "logps/chosen": -171.46087646484375, + "logps/rejected": -230.45013427734375, + "loss": 0.0555, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3834775984287262, + "rewards/margins": 5.588604927062988, + "rewards/rejected": -5.972082614898682, + "step": 979 + }, + { + "epoch": 1.28, + "learning_rate": 3.261629902459e-05, + "logits/chosen": -1.941145896911621, + "logits/rejected": -1.890683889389038, + "logps/chosen": -162.6816864013672, + "logps/rejected": -206.3589630126953, + "loss": 0.0687, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9792903065681458, + "rewards/margins": 5.750655174255371, + "rewards/rejected": -6.729945182800293, + "step": 980 + }, + { + "epoch": 1.28, + "learning_rate": 3.258216423979037e-05, + "logits/chosen": -1.9625548124313354, + "logits/rejected": -1.982649803161621, + "logps/chosen": -179.1558380126953, + "logps/rejected": -259.2472839355469, + "loss": 0.0974, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9090802073478699, + "rewards/margins": 6.0814690589904785, + "rewards/rejected": -6.990549564361572, + "step": 981 + }, + { + "epoch": 1.29, + "learning_rate": 3.254801388057514e-05, + "logits/chosen": -1.9789154529571533, + "logits/rejected": -2.015522003173828, + "logps/chosen": -162.9058837890625, + "logps/rejected": -222.94760131835938, + "loss": 0.1611, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0825514793395996, + "rewards/margins": 4.608074188232422, + "rewards/rejected": -5.690625190734863, + "step": 982 + }, + { + "epoch": 1.29, + "learning_rate": 3.2513848017092113e-05, + "logits/chosen": -1.927085280418396, + "logits/rejected": -2.0209970474243164, + "logps/chosen": -189.93618774414062, + "logps/rejected": -229.33177185058594, + "loss": 0.0933, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4452272653579712, + "rewards/margins": 5.701321601867676, + "rewards/rejected": -6.146548748016357, + "step": 983 + }, + { + "epoch": 1.29, + "learning_rate": 3.2479666719520886e-05, + "logits/chosen": -1.8675872087478638, + "logits/rejected": -1.8806711435317993, + "logps/chosen": -182.062255859375, + "logps/rejected": -235.03907775878906, + "loss": 0.0412, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7320657968521118, + "rewards/margins": 6.459377288818359, + "rewards/rejected": -7.191442966461182, + "step": 984 + }, + { + "epoch": 1.29, + "learning_rate": 3.2445470058072766e-05, + "logits/chosen": -1.8914560079574585, + "logits/rejected": -1.8393675088882446, + "logps/chosen": -167.7892608642578, + "logps/rejected": -226.37176513671875, + "loss": 0.0132, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3088177740573883, + "rewards/margins": 6.601188659667969, + "rewards/rejected": -6.910006046295166, + "step": 985 + }, + { + "epoch": 1.29, + "learning_rate": 3.2411258102990646e-05, + "logits/chosen": -1.9076112508773804, + "logits/rejected": -1.9284731149673462, + "logps/chosen": -172.68734741210938, + "logps/rejected": -241.80279541015625, + "loss": 0.0473, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14136190712451935, + "rewards/margins": 6.534528732299805, + "rewards/rejected": -6.675890922546387, + "step": 986 + }, + { + "epoch": 1.29, + "learning_rate": 3.23770309245488e-05, + "logits/chosen": -1.9282286167144775, + "logits/rejected": -1.8869755268096924, + "logps/chosen": -185.94140625, + "logps/rejected": -244.49862670898438, + "loss": 0.1127, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2529757618904114, + "rewards/margins": 5.684840679168701, + "rewards/rejected": -5.937817096710205, + "step": 987 + }, + { + "epoch": 1.29, + "learning_rate": 3.23427885930528e-05, + "logits/chosen": -1.8192644119262695, + "logits/rejected": -1.9099791049957275, + "logps/chosen": -138.94749450683594, + "logps/rejected": -210.98141479492188, + "loss": 0.1202, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6651886105537415, + "rewards/margins": 5.962369441986084, + "rewards/rejected": -6.62755823135376, + "step": 988 + }, + { + "epoch": 1.29, + "learning_rate": 3.230853117883933e-05, + "logits/chosen": -1.8780765533447266, + "logits/rejected": -1.9324958324432373, + "logps/chosen": -127.44175720214844, + "logps/rejected": -195.4342041015625, + "loss": 0.1616, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8466498851776123, + "rewards/margins": 5.705711364746094, + "rewards/rejected": -6.552361488342285, + "step": 989 + }, + { + "epoch": 1.3, + "learning_rate": 3.227425875227605e-05, + "logits/chosen": -1.9504503011703491, + "logits/rejected": -1.9801244735717773, + "logps/chosen": -151.307861328125, + "logps/rejected": -232.94631958007812, + "loss": 0.0913, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4374302327632904, + "rewards/margins": 5.352200508117676, + "rewards/rejected": -5.789630889892578, + "step": 990 + }, + { + "epoch": 1.3, + "learning_rate": 3.223997138376146e-05, + "logits/chosen": -1.768437385559082, + "logits/rejected": -1.8152680397033691, + "logps/chosen": -219.896240234375, + "logps/rejected": -264.9837646484375, + "loss": 0.1102, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0947672128677368, + "rewards/margins": 4.561570167541504, + "rewards/rejected": -5.656337738037109, + "step": 991 + }, + { + "epoch": 1.3, + "learning_rate": 3.220566914372477e-05, + "logits/chosen": -1.914330244064331, + "logits/rejected": -1.8707256317138672, + "logps/chosen": -161.68264770507812, + "logps/rejected": -206.5924835205078, + "loss": 0.0468, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0668751001358032, + "rewards/margins": 4.970590114593506, + "rewards/rejected": -6.0374650955200195, + "step": 992 + }, + { + "epoch": 1.3, + "learning_rate": 3.2171352102625716e-05, + "logits/chosen": -1.932759165763855, + "logits/rejected": -1.9709361791610718, + "logps/chosen": -215.7425079345703, + "logps/rejected": -309.05194091796875, + "loss": 0.0456, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2079094648361206, + "rewards/margins": 7.571928024291992, + "rewards/rejected": -7.779837608337402, + "step": 993 + }, + { + "epoch": 1.3, + "learning_rate": 3.213702033095444e-05, + "logits/chosen": -1.8618203401565552, + "logits/rejected": -1.9024603366851807, + "logps/chosen": -178.80416870117188, + "logps/rejected": -260.2957763671875, + "loss": 0.1162, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.20789635181427, + "rewards/margins": 6.124366283416748, + "rewards/rejected": -7.3322625160217285, + "step": 994 + }, + { + "epoch": 1.3, + "learning_rate": 3.210267389923135e-05, + "logits/chosen": -2.0747745037078857, + "logits/rejected": -2.092907667160034, + "logps/chosen": -163.86630249023438, + "logps/rejected": -208.08639526367188, + "loss": 0.2317, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.4616580009460449, + "rewards/margins": 4.820524215698242, + "rewards/rejected": -5.282181739807129, + "step": 995 + }, + { + "epoch": 1.3, + "learning_rate": 3.2068312878006955e-05, + "logits/chosen": -1.9825048446655273, + "logits/rejected": -2.042208433151245, + "logps/chosen": -149.8975372314453, + "logps/rejected": -213.1818389892578, + "loss": 0.0461, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6773964166641235, + "rewards/margins": 6.074430465698242, + "rewards/rejected": -6.751826286315918, + "step": 996 + }, + { + "epoch": 1.3, + "learning_rate": 3.2033937337861744e-05, + "logits/chosen": -1.456627368927002, + "logits/rejected": -1.5451298952102661, + "logps/chosen": -195.9957275390625, + "logps/rejected": -275.362060546875, + "loss": 0.0561, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.783312976360321, + "rewards/margins": 6.08614444732666, + "rewards/rejected": -6.869457721710205, + "step": 997 + }, + { + "epoch": 1.31, + "learning_rate": 3.199954734940603e-05, + "logits/chosen": -1.848314881324768, + "logits/rejected": -1.8288490772247314, + "logps/chosen": -146.74244689941406, + "logps/rejected": -214.40316772460938, + "loss": 0.1002, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5085213780403137, + "rewards/margins": 6.950279235839844, + "rewards/rejected": -7.45880126953125, + "step": 998 + }, + { + "epoch": 1.31, + "learning_rate": 3.196514298327979e-05, + "logits/chosen": -1.9763479232788086, + "logits/rejected": -1.9941948652267456, + "logps/chosen": -175.21180725097656, + "logps/rejected": -211.73851013183594, + "loss": 0.2338, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7451385259628296, + "rewards/margins": 4.789652347564697, + "rewards/rejected": -6.534790992736816, + "step": 999 + }, + { + "epoch": 1.31, + "learning_rate": 3.193072431015254e-05, + "logits/chosen": -1.9940496683120728, + "logits/rejected": -1.9827721118927002, + "logps/chosen": -141.00697326660156, + "logps/rejected": -209.87010192871094, + "loss": 0.0899, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5353857278823853, + "rewards/margins": 6.444552898406982, + "rewards/rejected": -6.979938507080078, + "step": 1000 + } + ], + "logging_steps": 1, + "max_steps": 2292, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}