{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.517799352750809, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5.000000000000001e-07, "logits/chosen": -2.0985755920410156, "logits/rejected": -1.9598942995071411, "logps/chosen": -282.9971618652344, "logps/rejected": -239.9343719482422, "loss": 0.6951, "rewards/accuracies": 0.5, "rewards/chosen": -0.008073234930634499, "rewards/margins": -0.0036141639575362206, "rewards/rejected": -0.004459070973098278, "step": 1 }, { "epoch": 0.0, "learning_rate": 1.0000000000000002e-06, "logits/chosen": -2.0734448432922363, "logits/rejected": -2.004692316055298, "logps/chosen": -277.91009521484375, "logps/rejected": -271.27777099609375, "loss": 0.694, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0035435440950095654, "rewards/margins": -0.0011311531998217106, "rewards/rejected": 0.004674696363508701, "step": 2 }, { "epoch": 0.0, "learning_rate": 1.5e-06, "logits/chosen": -2.2034449577331543, "logits/rejected": -2.15450382232666, "logps/chosen": -272.84222412109375, "logps/rejected": -296.7918701171875, "loss": 0.6936, "rewards/accuracies": 0.5625, "rewards/chosen": 0.011506916023790836, "rewards/margins": -0.0003993515856564045, "rewards/rejected": 0.011906265281140804, "step": 3 }, { "epoch": 0.0, "learning_rate": 2.0000000000000003e-06, "logits/chosen": -2.265322685241699, "logits/rejected": -2.2147812843322754, "logps/chosen": -371.73626708984375, "logps/rejected": -411.03802490234375, "loss": 0.6923, "rewards/accuracies": 0.5625, "rewards/chosen": 0.00851450115442276, "rewards/margins": 0.0020747678354382515, "rewards/rejected": 0.0064397333189845085, "step": 4 }, { "epoch": 0.01, "learning_rate": 2.5e-06, "logits/chosen": -2.126523971557617, "logits/rejected": -2.257094383239746, "logps/chosen": -232.20245361328125, "logps/rejected": -273.501953125, "loss": 0.6971, "rewards/accuracies": 0.375, "rewards/chosen": -0.012802458368241787, "rewards/margins": -0.007561136037111282, "rewards/rejected": -0.005241322796791792, "step": 5 }, { "epoch": 0.01, "learning_rate": 3e-06, "logits/chosen": -2.3206775188446045, "logits/rejected": -2.236947774887085, "logps/chosen": -282.0924072265625, "logps/rejected": -330.6745910644531, "loss": 0.6916, "rewards/accuracies": 0.5, "rewards/chosen": 0.012886857613921165, "rewards/margins": 0.0036454680375754833, "rewards/rejected": 0.00924139004200697, "step": 6 }, { "epoch": 0.01, "learning_rate": 3.5000000000000004e-06, "logits/chosen": -2.068502426147461, "logits/rejected": -2.1196892261505127, "logps/chosen": -270.6734313964844, "logps/rejected": -337.42877197265625, "loss": 0.6981, "rewards/accuracies": 0.4375, "rewards/chosen": -0.01318061351776123, "rewards/margins": -0.009621287696063519, "rewards/rejected": -0.003559327684342861, "step": 7 }, { "epoch": 0.01, "learning_rate": 4.000000000000001e-06, "logits/chosen": -2.100869655609131, "logits/rejected": -2.2541885375976562, "logps/chosen": -310.18951416015625, "logps/rejected": -404.02984619140625, "loss": 0.6922, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0011783126974478364, "rewards/margins": 0.0021168000530451536, "rewards/rejected": -0.0009384873555973172, "step": 8 }, { "epoch": 0.01, "learning_rate": 4.5e-06, "logits/chosen": -2.0913190841674805, "logits/rejected": -2.1440823078155518, "logps/chosen": -293.1015625, "logps/rejected": -351.9281005859375, "loss": 0.694, "rewards/accuracies": 0.375, "rewards/chosen": 0.0038339614402502775, "rewards/margins": -0.0011605499312281609, "rewards/rejected": 0.004994511604309082, "step": 9 }, { "epoch": 0.01, "learning_rate": 5e-06, "logits/chosen": -2.306243419647217, "logits/rejected": -2.3045802116394043, "logps/chosen": -386.077392578125, "logps/rejected": -367.2294921875, "loss": 0.6882, "rewards/accuracies": 0.625, "rewards/chosen": -0.0069270143285393715, "rewards/margins": 0.010493995621800423, "rewards/rejected": -0.01742100901901722, "step": 10 }, { "epoch": 0.01, "learning_rate": 5.500000000000001e-06, "logits/chosen": -2.136928081512451, "logits/rejected": -2.2102103233337402, "logps/chosen": -302.4460754394531, "logps/rejected": -337.15631103515625, "loss": 0.6912, "rewards/accuracies": 0.5, "rewards/chosen": -0.006688881199806929, "rewards/margins": 0.004262590315192938, "rewards/rejected": -0.010951472446322441, "step": 11 }, { "epoch": 0.01, "learning_rate": 6e-06, "logits/chosen": -2.064110279083252, "logits/rejected": -2.200680732727051, "logps/chosen": -320.602294921875, "logps/rejected": -369.6840515136719, "loss": 0.6988, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0003773693460971117, "rewards/margins": -0.0108009809628129, "rewards/rejected": 0.011178349144756794, "step": 12 }, { "epoch": 0.01, "learning_rate": 6.5000000000000004e-06, "logits/chosen": -2.348961353302002, "logits/rejected": -2.3161768913269043, "logps/chosen": -400.9246826171875, "logps/rejected": -472.69769287109375, "loss": 0.6845, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0075957290828228, "rewards/margins": 0.017815779894590378, "rewards/rejected": -0.010220050811767578, "step": 13 }, { "epoch": 0.01, "learning_rate": 7.000000000000001e-06, "logits/chosen": -2.412529468536377, "logits/rejected": -2.233689785003662, "logps/chosen": -422.3519287109375, "logps/rejected": -368.8478088378906, "loss": 0.6941, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0051600453443825245, "rewards/margins": -0.0017115597147494555, "rewards/rejected": 0.006871605291962624, "step": 14 }, { "epoch": 0.02, "learning_rate": 7.5e-06, "logits/chosen": -2.0220694541931152, "logits/rejected": -2.0454282760620117, "logps/chosen": -315.8769836425781, "logps/rejected": -314.20269775390625, "loss": 0.6934, "rewards/accuracies": 0.4375, "rewards/chosen": -4.372652620077133e-05, "rewards/margins": -0.00020327605307102203, "rewards/rejected": 0.000159549992531538, "step": 15 }, { "epoch": 0.02, "learning_rate": 8.000000000000001e-06, "logits/chosen": -2.0351786613464355, "logits/rejected": -1.877925992012024, "logps/chosen": -383.6861572265625, "logps/rejected": -276.95050048828125, "loss": 0.6963, "rewards/accuracies": 0.5, "rewards/chosen": -0.0005419491790235043, "rewards/margins": -0.005987692158669233, "rewards/rejected": 0.005445742513984442, "step": 16 }, { "epoch": 0.02, "learning_rate": 8.500000000000002e-06, "logits/chosen": -2.0652737617492676, "logits/rejected": -2.2235422134399414, "logps/chosen": -343.7192077636719, "logps/rejected": -360.5483703613281, "loss": 0.6858, "rewards/accuracies": 0.625, "rewards/chosen": 0.007448338903486729, "rewards/margins": 0.015540864318609238, "rewards/rejected": -0.00809252168983221, "step": 17 }, { "epoch": 0.02, "learning_rate": 9e-06, "logits/chosen": -2.2705795764923096, "logits/rejected": -2.159031867980957, "logps/chosen": -335.9582824707031, "logps/rejected": -337.2821044921875, "loss": 0.6976, "rewards/accuracies": 0.3125, "rewards/chosen": -0.010970616713166237, "rewards/margins": -0.008028840646147728, "rewards/rejected": -0.0029417751356959343, "step": 18 }, { "epoch": 0.02, "learning_rate": 9.5e-06, "logits/chosen": -1.9825160503387451, "logits/rejected": -2.1957268714904785, "logps/chosen": -275.072509765625, "logps/rejected": -364.77215576171875, "loss": 0.6922, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0009894607355818152, "rewards/margins": 0.002422523219138384, "rewards/rejected": -0.0034119843039661646, "step": 19 }, { "epoch": 0.02, "learning_rate": 1e-05, "logits/chosen": -2.0461435317993164, "logits/rejected": -2.114314556121826, "logps/chosen": -280.7231140136719, "logps/rejected": -340.37872314453125, "loss": 0.6928, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0057319882325828075, "rewards/margins": 0.0008945947047322989, "rewards/rejected": 0.004837393760681152, "step": 20 }, { "epoch": 0.02, "learning_rate": 1.05e-05, "logits/chosen": -2.2721879482269287, "logits/rejected": -2.31955885887146, "logps/chosen": -350.15765380859375, "logps/rejected": -346.72943115234375, "loss": 0.6961, "rewards/accuracies": 0.4375, "rewards/chosen": -0.01111826952546835, "rewards/margins": -0.005553150083869696, "rewards/rejected": -0.0055651189759373665, "step": 21 }, { "epoch": 0.02, "learning_rate": 1.1000000000000001e-05, "logits/chosen": -2.297549247741699, "logits/rejected": -2.258702516555786, "logps/chosen": -354.07342529296875, "logps/rejected": -381.96453857421875, "loss": 0.7021, "rewards/accuracies": 0.5, "rewards/chosen": -0.010824179276823997, "rewards/margins": -0.01675737090408802, "rewards/rejected": 0.005933189764618874, "step": 22 }, { "epoch": 0.02, "learning_rate": 1.1500000000000002e-05, "logits/chosen": -2.095402240753174, "logits/rejected": -2.2284393310546875, "logps/chosen": -367.1632385253906, "logps/rejected": -395.11016845703125, "loss": 0.6846, "rewards/accuracies": 0.625, "rewards/chosen": -0.00539558008313179, "rewards/margins": 0.018006421625614166, "rewards/rejected": -0.023401999846100807, "step": 23 }, { "epoch": 0.02, "learning_rate": 1.2e-05, "logits/chosen": -2.058290719985962, "logits/rejected": -2.142444133758545, "logps/chosen": -208.247314453125, "logps/rejected": -261.3697204589844, "loss": 0.6851, "rewards/accuracies": 0.5, "rewards/chosen": -0.0028132672887295485, "rewards/margins": 0.01700596883893013, "rewards/rejected": -0.0198192335665226, "step": 24 }, { "epoch": 0.03, "learning_rate": 1.25e-05, "logits/chosen": -1.9966950416564941, "logits/rejected": -2.0200271606445312, "logps/chosen": -390.6391906738281, "logps/rejected": -335.07867431640625, "loss": 0.6898, "rewards/accuracies": 0.5, "rewards/chosen": -0.008169196546077728, "rewards/margins": 0.007835723459720612, "rewards/rejected": -0.01600492000579834, "step": 25 }, { "epoch": 0.03, "learning_rate": 1.3000000000000001e-05, "logits/chosen": -2.1417901515960693, "logits/rejected": -2.102891683578491, "logps/chosen": -334.50872802734375, "logps/rejected": -364.5052795410156, "loss": 0.688, "rewards/accuracies": 0.5625, "rewards/chosen": -0.019382527098059654, "rewards/margins": 0.01085577066987753, "rewards/rejected": -0.03023829497396946, "step": 26 }, { "epoch": 0.03, "learning_rate": 1.3500000000000001e-05, "logits/chosen": -2.243198871612549, "logits/rejected": -2.3117640018463135, "logps/chosen": -343.4504699707031, "logps/rejected": -356.1696472167969, "loss": 0.7073, "rewards/accuracies": 0.375, "rewards/chosen": -0.03033299744129181, "rewards/margins": -0.027142930775880814, "rewards/rejected": -0.0031900645699352026, "step": 27 }, { "epoch": 0.03, "learning_rate": 1.4000000000000001e-05, "logits/chosen": -2.0717036724090576, "logits/rejected": -2.2167444229125977, "logps/chosen": -270.87353515625, "logps/rejected": -327.87640380859375, "loss": 0.6986, "rewards/accuracies": 0.375, "rewards/chosen": -0.032239750027656555, "rewards/margins": -0.010378909297287464, "rewards/rejected": -0.021860837936401367, "step": 28 }, { "epoch": 0.03, "learning_rate": 1.45e-05, "logits/chosen": -2.2460880279541016, "logits/rejected": -2.2309041023254395, "logps/chosen": -435.2510986328125, "logps/rejected": -461.74859619140625, "loss": 0.6958, "rewards/accuracies": 0.375, "rewards/chosen": -0.018374010920524597, "rewards/margins": -0.004351234529167414, "rewards/rejected": -0.01402278058230877, "step": 29 }, { "epoch": 0.03, "learning_rate": 1.5e-05, "logits/chosen": -2.228811740875244, "logits/rejected": -2.293030261993408, "logps/chosen": -382.5074768066406, "logps/rejected": -371.8452453613281, "loss": 0.6839, "rewards/accuracies": 0.625, "rewards/chosen": 0.004188417922705412, "rewards/margins": 0.01916835457086563, "rewards/rejected": -0.014979935251176357, "step": 30 }, { "epoch": 0.03, "learning_rate": 1.55e-05, "logits/chosen": -1.9169931411743164, "logits/rejected": -2.1672849655151367, "logps/chosen": -270.2207946777344, "logps/rejected": -279.6405944824219, "loss": 0.6917, "rewards/accuracies": 0.5625, "rewards/chosen": 0.002476263325661421, "rewards/margins": 0.003316545393317938, "rewards/rejected": -0.0008402818348258734, "step": 31 }, { "epoch": 0.03, "learning_rate": 1.6000000000000003e-05, "logits/chosen": -2.349118232727051, "logits/rejected": -2.260673999786377, "logps/chosen": -375.1436462402344, "logps/rejected": -423.8641357421875, "loss": 0.6963, "rewards/accuracies": 0.5625, "rewards/chosen": -0.018851473927497864, "rewards/margins": -0.004983377177268267, "rewards/rejected": -0.013868091627955437, "step": 32 }, { "epoch": 0.03, "learning_rate": 1.65e-05, "logits/chosen": -2.253589630126953, "logits/rejected": -2.3596742153167725, "logps/chosen": -454.2779846191406, "logps/rejected": -503.1221008300781, "loss": 0.6843, "rewards/accuracies": 0.4375, "rewards/chosen": -0.004412556067109108, "rewards/margins": 0.02018122747540474, "rewards/rejected": -0.024593783542513847, "step": 33 }, { "epoch": 0.04, "learning_rate": 1.7000000000000003e-05, "logits/chosen": -2.041515588760376, "logits/rejected": -2.1577486991882324, "logps/chosen": -338.1881103515625, "logps/rejected": -387.30023193359375, "loss": 0.6887, "rewards/accuracies": 0.5625, "rewards/chosen": -0.011815071105957031, "rewards/margins": 0.009705852717161179, "rewards/rejected": -0.02152092382311821, "step": 34 }, { "epoch": 0.04, "learning_rate": 1.75e-05, "logits/chosen": -2.2442219257354736, "logits/rejected": -2.3945209980010986, "logps/chosen": -293.7795715332031, "logps/rejected": -396.3448181152344, "loss": 0.686, "rewards/accuracies": 0.8125, "rewards/chosen": -0.004475045017898083, "rewards/margins": 0.015541339293122292, "rewards/rejected": -0.0200163833796978, "step": 35 }, { "epoch": 0.04, "learning_rate": 1.8e-05, "logits/chosen": -2.0793919563293457, "logits/rejected": -2.1432011127471924, "logps/chosen": -322.59869384765625, "logps/rejected": -437.3168029785156, "loss": 0.6776, "rewards/accuracies": 0.8125, "rewards/chosen": 0.006924772635102272, "rewards/margins": 0.03214583545923233, "rewards/rejected": -0.02522106282413006, "step": 36 }, { "epoch": 0.04, "learning_rate": 1.85e-05, "logits/chosen": -2.155754804611206, "logits/rejected": -2.2335851192474365, "logps/chosen": -295.4434814453125, "logps/rejected": -359.8757019042969, "loss": 0.7043, "rewards/accuracies": 0.5, "rewards/chosen": -0.023294713348150253, "rewards/margins": -0.02007477357983589, "rewards/rejected": -0.0032199383713304996, "step": 37 }, { "epoch": 0.04, "learning_rate": 1.9e-05, "logits/chosen": -2.2595698833465576, "logits/rejected": -2.064648151397705, "logps/chosen": -310.83416748046875, "logps/rejected": -272.6891784667969, "loss": 0.6706, "rewards/accuracies": 0.5625, "rewards/chosen": -0.003548526205122471, "rewards/margins": 0.058505721390247345, "rewards/rejected": -0.06205424666404724, "step": 38 }, { "epoch": 0.04, "learning_rate": 1.9500000000000003e-05, "logits/chosen": -2.247586250305176, "logits/rejected": -2.2411181926727295, "logps/chosen": -410.89801025390625, "logps/rejected": -397.5184631347656, "loss": 0.6933, "rewards/accuracies": 0.375, "rewards/chosen": -0.023591995239257812, "rewards/margins": 0.0011774520389735699, "rewards/rejected": -0.024769451469182968, "step": 39 }, { "epoch": 0.04, "learning_rate": 2e-05, "logits/chosen": -2.0857701301574707, "logits/rejected": -2.3974575996398926, "logps/chosen": -298.82373046875, "logps/rejected": -322.1784973144531, "loss": 0.6681, "rewards/accuracies": 0.8125, "rewards/chosen": 0.012818144634366035, "rewards/margins": 0.05216258019208908, "rewards/rejected": -0.0393444299697876, "step": 40 }, { "epoch": 0.04, "learning_rate": 2.05e-05, "logits/chosen": -2.0326108932495117, "logits/rejected": -2.021967887878418, "logps/chosen": -322.1849365234375, "logps/rejected": -286.97393798828125, "loss": 0.6877, "rewards/accuracies": 0.5, "rewards/chosen": -0.03254096582531929, "rewards/margins": 0.01258254237473011, "rewards/rejected": -0.04512350261211395, "step": 41 }, { "epoch": 0.04, "learning_rate": 2.1e-05, "logits/chosen": -2.1398110389709473, "logits/rejected": -2.1065280437469482, "logps/chosen": -315.64190673828125, "logps/rejected": -273.958740234375, "loss": 0.6843, "rewards/accuracies": 0.5625, "rewards/chosen": -0.06157093122601509, "rewards/margins": 0.055743757635354996, "rewards/rejected": -0.11731469631195068, "step": 42 }, { "epoch": 0.04, "learning_rate": 2.15e-05, "logits/chosen": -2.027090549468994, "logits/rejected": -2.1431775093078613, "logps/chosen": -328.4605407714844, "logps/rejected": -336.6142272949219, "loss": 0.6785, "rewards/accuracies": 0.5625, "rewards/chosen": 0.002649687696248293, "rewards/margins": 0.030248070135712624, "rewards/rejected": -0.02759838104248047, "step": 43 }, { "epoch": 0.05, "learning_rate": 2.2000000000000003e-05, "logits/chosen": -2.1685423851013184, "logits/rejected": -2.3537817001342773, "logps/chosen": -280.93017578125, "logps/rejected": -430.02398681640625, "loss": 0.6791, "rewards/accuracies": 0.625, "rewards/chosen": -0.009914875030517578, "rewards/margins": 0.031635358929634094, "rewards/rejected": -0.04155023396015167, "step": 44 }, { "epoch": 0.05, "learning_rate": 2.25e-05, "logits/chosen": -2.026183605194092, "logits/rejected": -2.2760047912597656, "logps/chosen": -327.8951416015625, "logps/rejected": -326.314697265625, "loss": 0.6854, "rewards/accuracies": 0.5625, "rewards/chosen": -0.003114223014563322, "rewards/margins": 0.017296195030212402, "rewards/rejected": -0.020410416647791862, "step": 45 }, { "epoch": 0.05, "learning_rate": 2.3000000000000003e-05, "logits/chosen": -2.2608642578125, "logits/rejected": -2.2612316608428955, "logps/chosen": -313.09063720703125, "logps/rejected": -299.6613464355469, "loss": 0.6969, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0639270767569542, "rewards/margins": -0.005310798529535532, "rewards/rejected": -0.0586162805557251, "step": 46 }, { "epoch": 0.05, "learning_rate": 2.35e-05, "logits/chosen": -2.0354762077331543, "logits/rejected": -2.2385144233703613, "logps/chosen": -361.6187744140625, "logps/rejected": -416.43804931640625, "loss": 0.6855, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0550750270485878, "rewards/margins": 0.021474791690707207, "rewards/rejected": -0.07654982060194016, "step": 47 }, { "epoch": 0.05, "learning_rate": 2.4e-05, "logits/chosen": -2.1052396297454834, "logits/rejected": -2.1244544982910156, "logps/chosen": -401.3201904296875, "logps/rejected": -394.404052734375, "loss": 0.6713, "rewards/accuracies": 0.5625, "rewards/chosen": -0.04391036182641983, "rewards/margins": 0.048601724207401276, "rewards/rejected": -0.0925120860338211, "step": 48 }, { "epoch": 0.05, "learning_rate": 2.45e-05, "logits/chosen": -2.325778007507324, "logits/rejected": -2.1452741622924805, "logps/chosen": -359.43255615234375, "logps/rejected": -393.9988708496094, "loss": 0.6997, "rewards/accuracies": 0.5, "rewards/chosen": -0.03692295402288437, "rewards/margins": -0.005499981343746185, "rewards/rejected": -0.031422972679138184, "step": 49 }, { "epoch": 0.05, "learning_rate": 2.5e-05, "logits/chosen": -2.0367650985717773, "logits/rejected": -2.193774700164795, "logps/chosen": -318.42266845703125, "logps/rejected": -370.5718994140625, "loss": 0.6619, "rewards/accuracies": 0.8125, "rewards/chosen": -0.03530995920300484, "rewards/margins": 0.06544995307922363, "rewards/rejected": -0.10075991600751877, "step": 50 }, { "epoch": 0.05, "learning_rate": 2.5500000000000003e-05, "logits/chosen": -2.1022160053253174, "logits/rejected": -2.0698752403259277, "logps/chosen": -338.8807678222656, "logps/rejected": -333.04351806640625, "loss": 0.6707, "rewards/accuracies": 0.625, "rewards/chosen": -0.04516604542732239, "rewards/margins": 0.047384001314640045, "rewards/rejected": -0.09255003929138184, "step": 51 }, { "epoch": 0.05, "learning_rate": 2.6000000000000002e-05, "logits/chosen": -2.1273610591888428, "logits/rejected": -2.2891712188720703, "logps/chosen": -307.1384582519531, "logps/rejected": -314.24676513671875, "loss": 0.6753, "rewards/accuracies": 0.625, "rewards/chosen": -0.024128681048750877, "rewards/margins": 0.040785644203424454, "rewards/rejected": -0.06491431593894958, "step": 52 }, { "epoch": 0.05, "learning_rate": 2.6500000000000004e-05, "logits/chosen": -1.951267957687378, "logits/rejected": -2.171773672103882, "logps/chosen": -313.5145568847656, "logps/rejected": -345.32452392578125, "loss": 0.6764, "rewards/accuracies": 0.6875, "rewards/chosen": -0.06882210075855255, "rewards/margins": 0.04536902531981468, "rewards/rejected": -0.11419112980365753, "step": 53 }, { "epoch": 0.06, "learning_rate": 2.7000000000000002e-05, "logits/chosen": -2.318854808807373, "logits/rejected": -2.4153428077697754, "logps/chosen": -407.51080322265625, "logps/rejected": -407.7454833984375, "loss": 0.6911, "rewards/accuracies": 0.5625, "rewards/chosen": -0.14593330025672913, "rewards/margins": 0.0101944450289011, "rewards/rejected": -0.15612773597240448, "step": 54 }, { "epoch": 0.06, "learning_rate": 2.7500000000000004e-05, "logits/chosen": -2.251836061477661, "logits/rejected": -2.0312557220458984, "logps/chosen": -257.6128234863281, "logps/rejected": -260.79705810546875, "loss": 0.7024, "rewards/accuracies": 0.5, "rewards/chosen": -0.12117181718349457, "rewards/margins": -0.015745995566248894, "rewards/rejected": -0.10542581230401993, "step": 55 }, { "epoch": 0.06, "learning_rate": 2.8000000000000003e-05, "logits/chosen": -2.3783812522888184, "logits/rejected": -2.2356433868408203, "logps/chosen": -322.89599609375, "logps/rejected": -305.649658203125, "loss": 0.7044, "rewards/accuracies": 0.5, "rewards/chosen": -0.0736481174826622, "rewards/margins": -0.007364703342318535, "rewards/rejected": -0.06628341972827911, "step": 56 }, { "epoch": 0.06, "learning_rate": 2.8499999999999998e-05, "logits/chosen": -2.0815675258636475, "logits/rejected": -1.8037395477294922, "logps/chosen": -341.66839599609375, "logps/rejected": -266.3909912109375, "loss": 0.6625, "rewards/accuracies": 0.5625, "rewards/chosen": -0.028729846701025963, "rewards/margins": 0.06697390228509903, "rewards/rejected": -0.09570374339818954, "step": 57 }, { "epoch": 0.06, "learning_rate": 2.9e-05, "logits/chosen": -2.123415946960449, "logits/rejected": -2.154893398284912, "logps/chosen": -304.90008544921875, "logps/rejected": -325.108154296875, "loss": 0.6931, "rewards/accuracies": 0.4375, "rewards/chosen": -0.11998450756072998, "rewards/margins": 0.007698964327573776, "rewards/rejected": -0.12768347561359406, "step": 58 }, { "epoch": 0.06, "learning_rate": 2.95e-05, "logits/chosen": -2.086604356765747, "logits/rejected": -2.0624561309814453, "logps/chosen": -294.25244140625, "logps/rejected": -324.137939453125, "loss": 0.676, "rewards/accuracies": 0.5625, "rewards/chosen": -0.06285426020622253, "rewards/margins": 0.06548047065734863, "rewards/rejected": -0.12833473086357117, "step": 59 }, { "epoch": 0.06, "learning_rate": 3e-05, "logits/chosen": -2.2143869400024414, "logits/rejected": -2.3379101753234863, "logps/chosen": -283.2222900390625, "logps/rejected": -320.791748046875, "loss": 0.657, "rewards/accuracies": 0.625, "rewards/chosen": -0.06308362632989883, "rewards/margins": 0.09713932871818542, "rewards/rejected": -0.16022296249866486, "step": 60 }, { "epoch": 0.06, "learning_rate": 3.05e-05, "logits/chosen": -2.117171287536621, "logits/rejected": -2.3281807899475098, "logps/chosen": -299.3207702636719, "logps/rejected": -364.2285461425781, "loss": 0.7127, "rewards/accuracies": 0.3125, "rewards/chosen": -0.15081080794334412, "rewards/margins": -0.02138001285493374, "rewards/rejected": -0.12943080067634583, "step": 61 }, { "epoch": 0.06, "learning_rate": 3.1e-05, "logits/chosen": -1.974008321762085, "logits/rejected": -2.116246223449707, "logps/chosen": -263.60394287109375, "logps/rejected": -367.9918212890625, "loss": 0.6806, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1498243808746338, "rewards/margins": 0.032494522631168365, "rewards/rejected": -0.18231889605522156, "step": 62 }, { "epoch": 0.07, "learning_rate": 3.15e-05, "logits/chosen": -2.0844931602478027, "logits/rejected": -2.224573850631714, "logps/chosen": -280.8338928222656, "logps/rejected": -339.91009521484375, "loss": 0.6893, "rewards/accuracies": 0.625, "rewards/chosen": -0.1308109611272812, "rewards/margins": 0.016523031517863274, "rewards/rejected": -0.14733397960662842, "step": 63 }, { "epoch": 0.07, "learning_rate": 3.2000000000000005e-05, "logits/chosen": -2.2176733016967773, "logits/rejected": -2.2587060928344727, "logps/chosen": -252.23390197753906, "logps/rejected": -273.116943359375, "loss": 0.681, "rewards/accuracies": 0.4375, "rewards/chosen": -0.08397925645112991, "rewards/margins": 0.031244704499840736, "rewards/rejected": -0.1152239516377449, "step": 64 }, { "epoch": 0.07, "learning_rate": 3.2500000000000004e-05, "logits/chosen": -2.1103031635284424, "logits/rejected": -2.1916050910949707, "logps/chosen": -253.43565368652344, "logps/rejected": -299.1423645019531, "loss": 0.6848, "rewards/accuracies": 0.625, "rewards/chosen": -0.11171821504831314, "rewards/margins": 0.028400154784321785, "rewards/rejected": -0.14011836051940918, "step": 65 }, { "epoch": 0.07, "learning_rate": 3.3e-05, "logits/chosen": -2.138770580291748, "logits/rejected": -2.277637481689453, "logps/chosen": -361.4617919921875, "logps/rejected": -364.7515869140625, "loss": 0.6562, "rewards/accuracies": 0.6875, "rewards/chosen": -0.13355040550231934, "rewards/margins": 0.10390853881835938, "rewards/rejected": -0.23745892941951752, "step": 66 }, { "epoch": 0.07, "learning_rate": 3.35e-05, "logits/chosen": -2.256972312927246, "logits/rejected": -2.155823230743408, "logps/chosen": -385.70989990234375, "logps/rejected": -342.7967529296875, "loss": 0.6888, "rewards/accuracies": 0.5625, "rewards/chosen": -0.20078128576278687, "rewards/margins": 0.030581658706068993, "rewards/rejected": -0.2313629388809204, "step": 67 }, { "epoch": 0.07, "learning_rate": 3.4000000000000007e-05, "logits/chosen": -2.1459546089172363, "logits/rejected": -2.092371940612793, "logps/chosen": -302.3841247558594, "logps/rejected": -248.30654907226562, "loss": 0.69, "rewards/accuracies": 0.625, "rewards/chosen": -0.13762035965919495, "rewards/margins": 0.019860554486513138, "rewards/rejected": -0.15748091042041779, "step": 68 }, { "epoch": 0.07, "learning_rate": 3.45e-05, "logits/chosen": -1.9295188188552856, "logits/rejected": -2.1819865703582764, "logps/chosen": -197.07498168945312, "logps/rejected": -277.8927917480469, "loss": 0.7116, "rewards/accuracies": 0.5625, "rewards/chosen": -0.21833539009094238, "rewards/margins": -0.016281111165881157, "rewards/rejected": -0.20205429196357727, "step": 69 }, { "epoch": 0.07, "learning_rate": 3.5e-05, "logits/chosen": -2.0420405864715576, "logits/rejected": -2.095397710800171, "logps/chosen": -440.4430236816406, "logps/rejected": -404.73297119140625, "loss": 0.6922, "rewards/accuracies": 0.5625, "rewards/chosen": -0.17053675651550293, "rewards/margins": 0.025177769362926483, "rewards/rejected": -0.19571453332901, "step": 70 }, { "epoch": 0.07, "learning_rate": 3.55e-05, "logits/chosen": -1.8378528356552124, "logits/rejected": -2.188674211502075, "logps/chosen": -271.0667724609375, "logps/rejected": -360.34454345703125, "loss": 0.6789, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08768844604492188, "rewards/margins": 0.06176728755235672, "rewards/rejected": -0.1494557410478592, "step": 71 }, { "epoch": 0.07, "learning_rate": 3.6e-05, "logits/chosen": -2.31483793258667, "logits/rejected": -2.28462290763855, "logps/chosen": -386.14312744140625, "logps/rejected": -423.07281494140625, "loss": 0.7184, "rewards/accuracies": 0.4375, "rewards/chosen": -0.18145808577537537, "rewards/margins": -0.015067771077156067, "rewards/rejected": -0.1663903295993805, "step": 72 }, { "epoch": 0.08, "learning_rate": 3.65e-05, "logits/chosen": -2.0742580890655518, "logits/rejected": -2.2153687477111816, "logps/chosen": -255.38543701171875, "logps/rejected": -290.1117248535156, "loss": 0.6921, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0764242559671402, "rewards/margins": 0.01691494509577751, "rewards/rejected": -0.0933392122387886, "step": 73 }, { "epoch": 0.08, "learning_rate": 3.7e-05, "logits/chosen": -2.0832855701446533, "logits/rejected": -2.060133934020996, "logps/chosen": -340.09197998046875, "logps/rejected": -342.4574890136719, "loss": 0.6582, "rewards/accuracies": 0.625, "rewards/chosen": -0.12881942093372345, "rewards/margins": 0.10232281684875488, "rewards/rejected": -0.23114225268363953, "step": 74 }, { "epoch": 0.08, "learning_rate": 3.7500000000000003e-05, "logits/chosen": -2.045681953430176, "logits/rejected": -2.041499137878418, "logps/chosen": -399.8717956542969, "logps/rejected": -384.0579528808594, "loss": 0.6529, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1383177787065506, "rewards/margins": 0.0989096462726593, "rewards/rejected": -0.2372274249792099, "step": 75 }, { "epoch": 0.08, "learning_rate": 3.8e-05, "logits/chosen": -2.0136969089508057, "logits/rejected": -2.131852149963379, "logps/chosen": -246.70101928710938, "logps/rejected": -256.7235412597656, "loss": 0.6548, "rewards/accuracies": 0.75, "rewards/chosen": -0.20958754420280457, "rewards/margins": 0.1072821319103241, "rewards/rejected": -0.31686967611312866, "step": 76 }, { "epoch": 0.08, "learning_rate": 3.85e-05, "logits/chosen": -1.8942384719848633, "logits/rejected": -1.7946527004241943, "logps/chosen": -363.3137512207031, "logps/rejected": -288.844482421875, "loss": 0.7086, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2392420619726181, "rewards/margins": -0.015016615390777588, "rewards/rejected": -0.2242254614830017, "step": 77 }, { "epoch": 0.08, "learning_rate": 3.9000000000000006e-05, "logits/chosen": -2.284573793411255, "logits/rejected": -2.2139668464660645, "logps/chosen": -291.585205078125, "logps/rejected": -311.04986572265625, "loss": 0.6962, "rewards/accuracies": 0.4375, "rewards/chosen": -0.24729153513908386, "rewards/margins": 0.011042074300348759, "rewards/rejected": -0.25833362340927124, "step": 78 }, { "epoch": 0.08, "learning_rate": 3.9500000000000005e-05, "logits/chosen": -2.126842737197876, "logits/rejected": -2.178943634033203, "logps/chosen": -289.07049560546875, "logps/rejected": -267.1867980957031, "loss": 0.6765, "rewards/accuracies": 0.5, "rewards/chosen": -0.09771183133125305, "rewards/margins": 0.04596526175737381, "rewards/rejected": -0.14367708563804626, "step": 79 }, { "epoch": 0.08, "learning_rate": 4e-05, "logits/chosen": -1.9899426698684692, "logits/rejected": -2.218869209289551, "logps/chosen": -348.4210205078125, "logps/rejected": -453.83306884765625, "loss": 0.6183, "rewards/accuracies": 0.8125, "rewards/chosen": -0.17673715949058533, "rewards/margins": 0.17254139482975006, "rewards/rejected": -0.3492785692214966, "step": 80 }, { "epoch": 0.08, "learning_rate": 4.05e-05, "logits/chosen": -2.3394789695739746, "logits/rejected": -2.3765251636505127, "logps/chosen": -356.1443786621094, "logps/rejected": -352.0921936035156, "loss": 0.6923, "rewards/accuracies": 0.5625, "rewards/chosen": -0.30028024315834045, "rewards/margins": 0.05001110956072807, "rewards/rejected": -0.35029137134552, "step": 81 }, { "epoch": 0.08, "learning_rate": 4.1e-05, "logits/chosen": -2.136016607284546, "logits/rejected": -2.1530215740203857, "logps/chosen": -241.19703674316406, "logps/rejected": -251.56166076660156, "loss": 0.7461, "rewards/accuracies": 0.4375, "rewards/chosen": -0.3470645248889923, "rewards/margins": -0.07802443206310272, "rewards/rejected": -0.2690401077270508, "step": 82 }, { "epoch": 0.09, "learning_rate": 4.15e-05, "logits/chosen": -2.1615593433380127, "logits/rejected": -2.184434413909912, "logps/chosen": -338.65057373046875, "logps/rejected": -285.83734130859375, "loss": 0.734, "rewards/accuracies": 0.375, "rewards/chosen": -0.3146396279335022, "rewards/margins": -0.06066913902759552, "rewards/rejected": -0.2539704740047455, "step": 83 }, { "epoch": 0.09, "learning_rate": 4.2e-05, "logits/chosen": -2.1108598709106445, "logits/rejected": -2.383390426635742, "logps/chosen": -285.7464599609375, "logps/rejected": -336.65997314453125, "loss": 0.6534, "rewards/accuracies": 0.625, "rewards/chosen": -0.30576157569885254, "rewards/margins": 0.13044245541095734, "rewards/rejected": -0.43620407581329346, "step": 84 }, { "epoch": 0.09, "learning_rate": 4.25e-05, "logits/chosen": -2.185880422592163, "logits/rejected": -2.2634472846984863, "logps/chosen": -382.79132080078125, "logps/rejected": -426.94805908203125, "loss": 0.6376, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2730083465576172, "rewards/margins": 0.15455365180969238, "rewards/rejected": -0.4275619387626648, "step": 85 }, { "epoch": 0.09, "learning_rate": 4.3e-05, "logits/chosen": -2.200338363647461, "logits/rejected": -2.3442678451538086, "logps/chosen": -295.3491516113281, "logps/rejected": -292.9728698730469, "loss": 0.6193, "rewards/accuracies": 0.75, "rewards/chosen": -0.1620425134897232, "rewards/margins": 0.1742367446422577, "rewards/rejected": -0.3362792432308197, "step": 86 }, { "epoch": 0.09, "learning_rate": 4.35e-05, "logits/chosen": -2.085541248321533, "logits/rejected": -2.1654982566833496, "logps/chosen": -313.8997497558594, "logps/rejected": -406.6915588378906, "loss": 0.6755, "rewards/accuracies": 0.5, "rewards/chosen": -0.18983058631420135, "rewards/margins": 0.09621434658765793, "rewards/rejected": -0.2860449254512787, "step": 87 }, { "epoch": 0.09, "learning_rate": 4.4000000000000006e-05, "logits/chosen": -2.0523123741149902, "logits/rejected": -2.1804168224334717, "logps/chosen": -278.9588928222656, "logps/rejected": -358.6983642578125, "loss": 0.7034, "rewards/accuracies": 0.5625, "rewards/chosen": -0.369529664516449, "rewards/margins": 0.06673409789800644, "rewards/rejected": -0.43626368045806885, "step": 88 }, { "epoch": 0.09, "learning_rate": 4.4500000000000004e-05, "logits/chosen": -1.9767179489135742, "logits/rejected": -2.073478937149048, "logps/chosen": -312.8780212402344, "logps/rejected": -334.3851623535156, "loss": 0.721, "rewards/accuracies": 0.5, "rewards/chosen": -0.41693034768104553, "rewards/margins": -0.03711947053670883, "rewards/rejected": -0.3798108398914337, "step": 89 }, { "epoch": 0.09, "learning_rate": 4.5e-05, "logits/chosen": -2.34897780418396, "logits/rejected": -2.1732659339904785, "logps/chosen": -365.05712890625, "logps/rejected": -338.4190673828125, "loss": 0.6986, "rewards/accuracies": 0.5, "rewards/chosen": -0.4183480143547058, "rewards/margins": 0.03731951862573624, "rewards/rejected": -0.45566752552986145, "step": 90 }, { "epoch": 0.09, "learning_rate": 4.55e-05, "logits/chosen": -2.2497520446777344, "logits/rejected": -2.2454562187194824, "logps/chosen": -326.4650573730469, "logps/rejected": -414.0813903808594, "loss": 0.6715, "rewards/accuracies": 0.5, "rewards/chosen": -0.4516112208366394, "rewards/margins": 0.08267778903245926, "rewards/rejected": -0.5342890024185181, "step": 91 }, { "epoch": 0.1, "learning_rate": 4.600000000000001e-05, "logits/chosen": -2.0850160121917725, "logits/rejected": -2.2057738304138184, "logps/chosen": -356.4827575683594, "logps/rejected": -372.5768127441406, "loss": 0.5613, "rewards/accuracies": 0.75, "rewards/chosen": -0.45258861780166626, "rewards/margins": 0.37770116329193115, "rewards/rejected": -0.8302898406982422, "step": 92 }, { "epoch": 0.1, "learning_rate": 4.6500000000000005e-05, "logits/chosen": -2.3158974647521973, "logits/rejected": -2.1951651573181152, "logps/chosen": -314.29595947265625, "logps/rejected": -327.9600524902344, "loss": 0.8837, "rewards/accuracies": 0.3125, "rewards/chosen": -0.591391384601593, "rewards/margins": -0.23991218209266663, "rewards/rejected": -0.3514792025089264, "step": 93 }, { "epoch": 0.1, "learning_rate": 4.7e-05, "logits/chosen": -2.166097640991211, "logits/rejected": -2.2614049911499023, "logps/chosen": -359.9385986328125, "logps/rejected": -416.6658020019531, "loss": 0.6978, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5130646824836731, "rewards/margins": 0.03734045475721359, "rewards/rejected": -0.5504050850868225, "step": 94 }, { "epoch": 0.1, "learning_rate": 4.75e-05, "logits/chosen": -2.368900775909424, "logits/rejected": -2.303640365600586, "logps/chosen": -379.1260986328125, "logps/rejected": -359.70281982421875, "loss": 0.7422, "rewards/accuracies": 0.5, "rewards/chosen": -0.5878180265426636, "rewards/margins": -0.014551635831594467, "rewards/rejected": -0.5732664465904236, "step": 95 }, { "epoch": 0.1, "learning_rate": 4.8e-05, "logits/chosen": -1.9750244617462158, "logits/rejected": -2.13342547416687, "logps/chosen": -309.6315002441406, "logps/rejected": -290.8506164550781, "loss": 0.706, "rewards/accuracies": 0.625, "rewards/chosen": -0.5889778733253479, "rewards/margins": 0.024408388882875443, "rewards/rejected": -0.6133862733840942, "step": 96 }, { "epoch": 0.1, "learning_rate": 4.85e-05, "logits/chosen": -2.128643035888672, "logits/rejected": -2.0525574684143066, "logps/chosen": -413.3954772949219, "logps/rejected": -345.6094055175781, "loss": 0.7972, "rewards/accuracies": 0.4375, "rewards/chosen": -0.7196497917175293, "rewards/margins": -0.1434842050075531, "rewards/rejected": -0.5761655569076538, "step": 97 }, { "epoch": 0.1, "learning_rate": 4.9e-05, "logits/chosen": -2.2135891914367676, "logits/rejected": -2.2574219703674316, "logps/chosen": -370.8169250488281, "logps/rejected": -344.8519592285156, "loss": 0.7296, "rewards/accuracies": 0.375, "rewards/chosen": -0.5168295502662659, "rewards/margins": -0.034182578325271606, "rewards/rejected": -0.48264697194099426, "step": 98 }, { "epoch": 0.1, "learning_rate": 4.9500000000000004e-05, "logits/chosen": -2.0821609497070312, "logits/rejected": -2.0011813640594482, "logps/chosen": -249.1553955078125, "logps/rejected": -292.9418029785156, "loss": 0.6621, "rewards/accuracies": 0.6875, "rewards/chosen": -0.31434231996536255, "rewards/margins": 0.10270004719495773, "rewards/rejected": -0.41704243421554565, "step": 99 }, { "epoch": 0.1, "learning_rate": 5e-05, "logits/chosen": -2.0994839668273926, "logits/rejected": -2.1740150451660156, "logps/chosen": -377.63885498046875, "logps/rejected": -479.538818359375, "loss": 0.6675, "rewards/accuracies": 0.5625, "rewards/chosen": -0.41925686597824097, "rewards/margins": 0.06821132451295853, "rewards/rejected": -0.4874681532382965, "step": 100 }, { "epoch": 0.1, "learning_rate": 4.999983511654996e-05, "logits/chosen": -2.200211524963379, "logits/rejected": -2.1883296966552734, "logps/chosen": -384.2947998046875, "logps/rejected": -448.62957763671875, "loss": 0.6957, "rewards/accuracies": 0.375, "rewards/chosen": -0.5274229049682617, "rewards/margins": 0.025556959211826324, "rewards/rejected": -0.5529798865318298, "step": 101 }, { "epoch": 0.11, "learning_rate": 4.9999340468374787e-05, "logits/chosen": -2.13332462310791, "logits/rejected": -2.218052387237549, "logps/chosen": -311.7938537597656, "logps/rejected": -272.7942199707031, "loss": 0.6103, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3114677965641022, "rewards/margins": 0.20093847811222076, "rewards/rejected": -0.5124062895774841, "step": 102 }, { "epoch": 0.11, "learning_rate": 4.99985160619992e-05, "logits/chosen": -2.151207447052002, "logits/rejected": -2.073565721511841, "logps/chosen": -324.60223388671875, "logps/rejected": -350.4017028808594, "loss": 0.6223, "rewards/accuracies": 0.5, "rewards/chosen": -0.4631275534629822, "rewards/margins": 0.19337055087089539, "rewards/rejected": -0.6564981341362, "step": 103 }, { "epoch": 0.11, "learning_rate": 4.99973619082977e-05, "logits/chosen": -2.077446699142456, "logits/rejected": -2.1615357398986816, "logps/chosen": -347.063232421875, "logps/rejected": -355.1671142578125, "loss": 0.6216, "rewards/accuracies": 0.5, "rewards/chosen": -0.387503445148468, "rewards/margins": 0.18972592055797577, "rewards/rejected": -0.577229380607605, "step": 104 }, { "epoch": 0.11, "learning_rate": 4.9995878022494335e-05, "logits/chosen": -2.173962354660034, "logits/rejected": -2.1823246479034424, "logps/chosen": -378.25152587890625, "logps/rejected": -369.76983642578125, "loss": 0.7478, "rewards/accuracies": 0.375, "rewards/chosen": -0.5083476901054382, "rewards/margins": -0.091352179646492, "rewards/rejected": -0.41699549555778503, "step": 105 }, { "epoch": 0.11, "learning_rate": 4.9994064424162575e-05, "logits/chosen": -2.2684097290039062, "logits/rejected": -2.341747760772705, "logps/chosen": -422.6853332519531, "logps/rejected": -422.57330322265625, "loss": 0.6335, "rewards/accuracies": 0.5, "rewards/chosen": -0.5471891760826111, "rewards/margins": 0.16060353815555573, "rewards/rejected": -0.7077926993370056, "step": 106 }, { "epoch": 0.11, "learning_rate": 4.9991921137225e-05, "logits/chosen": -2.0649406909942627, "logits/rejected": -2.0103793144226074, "logps/chosen": -398.6349182128906, "logps/rejected": -321.7645263671875, "loss": 0.7777, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5084972381591797, "rewards/margins": -0.11370338499546051, "rewards/rejected": -0.39479386806488037, "step": 107 }, { "epoch": 0.11, "learning_rate": 4.998944818995302e-05, "logits/chosen": -2.1160833835601807, "logits/rejected": -2.2650928497314453, "logps/chosen": -319.59326171875, "logps/rejected": -399.0347900390625, "loss": 0.6419, "rewards/accuracies": 0.75, "rewards/chosen": -0.46814414858818054, "rewards/margins": 0.12615980207920074, "rewards/rejected": -0.5943039059638977, "step": 108 }, { "epoch": 0.11, "learning_rate": 4.998664561496647e-05, "logits/chosen": -2.0261175632476807, "logits/rejected": -2.01719331741333, "logps/chosen": -352.3540344238281, "logps/rejected": -404.3323669433594, "loss": 0.5917, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5237185955047607, "rewards/margins": 0.2665461599826813, "rewards/rejected": -0.7902647256851196, "step": 109 }, { "epoch": 0.11, "learning_rate": 4.998351344923322e-05, "logits/chosen": -2.1265554428100586, "logits/rejected": -2.188615322113037, "logps/chosen": -362.6216125488281, "logps/rejected": -339.3355407714844, "loss": 0.7484, "rewards/accuracies": 0.25, "rewards/chosen": -0.6888318657875061, "rewards/margins": -0.090579092502594, "rewards/rejected": -0.5982527732849121, "step": 110 }, { "epoch": 0.11, "learning_rate": 4.998005173406865e-05, "logits/chosen": -2.379279136657715, "logits/rejected": -2.329848527908325, "logps/chosen": -325.26385498046875, "logps/rejected": -324.79656982421875, "loss": 0.7852, "rewards/accuracies": 0.4375, "rewards/chosen": -0.8365094065666199, "rewards/margins": -0.1392899453639984, "rewards/rejected": -0.6972194910049438, "step": 111 }, { "epoch": 0.12, "learning_rate": 4.997626051513512e-05, "logits/chosen": -2.126467227935791, "logits/rejected": -2.1980350017547607, "logps/chosen": -376.349609375, "logps/rejected": -436.8643798828125, "loss": 0.6107, "rewards/accuracies": 0.6875, "rewards/chosen": -0.49128276109695435, "rewards/margins": 0.21428784728050232, "rewards/rejected": -0.705570638179779, "step": 112 }, { "epoch": 0.12, "learning_rate": 4.997213984244138e-05, "logits/chosen": -2.1278860569000244, "logits/rejected": -2.218194007873535, "logps/chosen": -235.55670166015625, "logps/rejected": -324.1790466308594, "loss": 0.6201, "rewards/accuracies": 0.875, "rewards/chosen": -0.6700268387794495, "rewards/margins": 0.23135630786418915, "rewards/rejected": -0.9013831615447998, "step": 113 }, { "epoch": 0.12, "learning_rate": 4.996768977034188e-05, "logits/chosen": -2.236452579498291, "logits/rejected": -2.300806999206543, "logps/chosen": -308.35467529296875, "logps/rejected": -374.0247497558594, "loss": 0.6864, "rewards/accuracies": 0.625, "rewards/chosen": -0.6465363502502441, "rewards/margins": 0.05209742486476898, "rewards/rejected": -0.6986337900161743, "step": 114 }, { "epoch": 0.12, "learning_rate": 4.996291035753608e-05, "logits/chosen": -2.234069347381592, "logits/rejected": -2.244938611984253, "logps/chosen": -533.840576171875, "logps/rejected": -479.3876953125, "loss": 0.644, "rewards/accuracies": 0.75, "rewards/chosen": -0.6587188243865967, "rewards/margins": 0.14327090978622437, "rewards/rejected": -0.801989734172821, "step": 115 }, { "epoch": 0.12, "learning_rate": 4.995780166706767e-05, "logits/chosen": -2.2996180057525635, "logits/rejected": -2.1304776668548584, "logps/chosen": -336.5658264160156, "logps/rejected": -290.9165954589844, "loss": 0.6859, "rewards/accuracies": 0.5, "rewards/chosen": -0.6112417578697205, "rewards/margins": 0.08546795696020126, "rewards/rejected": -0.6967097520828247, "step": 116 }, { "epoch": 0.12, "learning_rate": 4.995236376632373e-05, "logits/chosen": -2.143672466278076, "logits/rejected": -2.0870895385742188, "logps/chosen": -285.63385009765625, "logps/rejected": -269.767822265625, "loss": 0.6417, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6124986410140991, "rewards/margins": 0.17768412828445435, "rewards/rejected": -0.7901827096939087, "step": 117 }, { "epoch": 0.12, "learning_rate": 4.994659672703383e-05, "logits/chosen": -2.0322999954223633, "logits/rejected": -2.1645522117614746, "logps/chosen": -294.4024963378906, "logps/rejected": -434.4449768066406, "loss": 0.5761, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6845369338989258, "rewards/margins": 0.3629693388938904, "rewards/rejected": -1.047506332397461, "step": 118 }, { "epoch": 0.12, "learning_rate": 4.994050062526915e-05, "logits/chosen": -2.268840789794922, "logits/rejected": -2.210455894470215, "logps/chosen": -454.2048034667969, "logps/rejected": -401.5218200683594, "loss": 0.6914, "rewards/accuracies": 0.5, "rewards/chosen": -0.7424222826957703, "rewards/margins": 0.03279120847582817, "rewards/rejected": -0.7752134799957275, "step": 119 }, { "epoch": 0.12, "learning_rate": 4.993407554144136e-05, "logits/chosen": -2.1254310607910156, "logits/rejected": -2.2457275390625, "logps/chosen": -263.01318359375, "logps/rejected": -279.0079040527344, "loss": 0.6246, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8383140563964844, "rewards/margins": 0.3438203036785126, "rewards/rejected": -1.1821343898773193, "step": 120 }, { "epoch": 0.13, "learning_rate": 4.9927321560301686e-05, "logits/chosen": -1.9596202373504639, "logits/rejected": -1.9732022285461426, "logps/chosen": -333.146240234375, "logps/rejected": -317.2575988769531, "loss": 0.6247, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6822391152381897, "rewards/margins": 0.18031413853168488, "rewards/rejected": -0.8625531196594238, "step": 121 }, { "epoch": 0.13, "learning_rate": 4.992023877093969e-05, "logits/chosen": -2.2412517070770264, "logits/rejected": -2.2476582527160645, "logps/chosen": -270.1220703125, "logps/rejected": -297.88116455078125, "loss": 0.6934, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1133708953857422, "rewards/margins": 0.07820569723844528, "rewards/rejected": -1.1915764808654785, "step": 122 }, { "epoch": 0.13, "learning_rate": 4.991282726678215e-05, "logits/chosen": -2.1082093715667725, "logits/rejected": -2.260173797607422, "logps/chosen": -342.9111633300781, "logps/rejected": -425.59710693359375, "loss": 0.5823, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1128913164138794, "rewards/margins": 0.2950645089149475, "rewards/rejected": -1.4079556465148926, "step": 123 }, { "epoch": 0.13, "learning_rate": 4.990508714559182e-05, "logits/chosen": -1.9671399593353271, "logits/rejected": -2.216414213180542, "logps/chosen": -371.3886413574219, "logps/rejected": -412.00323486328125, "loss": 0.513, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6249299049377441, "rewards/margins": 0.6542832255363464, "rewards/rejected": -2.2792131900787354, "step": 124 }, { "epoch": 0.13, "learning_rate": 4.989701850946613e-05, "logits/chosen": -1.9954829216003418, "logits/rejected": -2.0708837509155273, "logps/chosen": -309.5416259765625, "logps/rejected": -369.55902099609375, "loss": 0.6247, "rewards/accuracies": 0.625, "rewards/chosen": -1.3325786590576172, "rewards/margins": 0.2878706455230713, "rewards/rejected": -1.6204493045806885, "step": 125 }, { "epoch": 0.13, "learning_rate": 4.988862146483585e-05, "logits/chosen": -2.04398775100708, "logits/rejected": -2.3061468601226807, "logps/chosen": -311.40283203125, "logps/rejected": -330.2856750488281, "loss": 0.5071, "rewards/accuracies": 0.75, "rewards/chosen": -0.9415925741195679, "rewards/margins": 0.5770285725593567, "rewards/rejected": -1.5186210870742798, "step": 126 }, { "epoch": 0.13, "learning_rate": 4.987989612246368e-05, "logits/chosen": -2.1247596740722656, "logits/rejected": -2.293691396713257, "logps/chosen": -415.2500915527344, "logps/rejected": -361.0720520019531, "loss": 0.5442, "rewards/accuracies": 0.625, "rewards/chosen": -0.9840108752250671, "rewards/margins": 0.4814227521419525, "rewards/rejected": -1.4654337167739868, "step": 127 }, { "epoch": 0.13, "learning_rate": 4.9870842597442755e-05, "logits/chosen": -2.21590518951416, "logits/rejected": -2.1943631172180176, "logps/chosen": -387.32720947265625, "logps/rejected": -422.17059326171875, "loss": 0.493, "rewards/accuracies": 0.75, "rewards/chosen": -1.0035918951034546, "rewards/margins": 0.6130063533782959, "rewards/rejected": -1.61659836769104, "step": 128 }, { "epoch": 0.13, "learning_rate": 4.9861461009195224e-05, "logits/chosen": -2.2312891483306885, "logits/rejected": -2.3044235706329346, "logps/chosen": -297.28729248046875, "logps/rejected": -300.91070556640625, "loss": 0.8, "rewards/accuracies": 0.4375, "rewards/chosen": -1.2972065210342407, "rewards/margins": -0.13482597470283508, "rewards/rejected": -1.162380576133728, "step": 129 }, { "epoch": 0.13, "learning_rate": 4.9851751481470565e-05, "logits/chosen": -2.3871798515319824, "logits/rejected": -2.3541009426116943, "logps/chosen": -389.8529357910156, "logps/rejected": -395.540283203125, "loss": 0.7455, "rewards/accuracies": 0.3125, "rewards/chosen": -1.4040565490722656, "rewards/margins": -0.02014276757836342, "rewards/rejected": -1.3839137554168701, "step": 130 }, { "epoch": 0.14, "learning_rate": 4.984171414234401e-05, "logits/chosen": -2.3224058151245117, "logits/rejected": -2.5203804969787598, "logps/chosen": -278.0612487792969, "logps/rejected": -292.171142578125, "loss": 0.698, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2112613916397095, "rewards/margins": 0.22594159841537476, "rewards/rejected": -1.437203049659729, "step": 131 }, { "epoch": 0.14, "learning_rate": 4.983134912421485e-05, "logits/chosen": -2.1884591579437256, "logits/rejected": -2.0368237495422363, "logps/chosen": -277.64117431640625, "logps/rejected": -261.66094970703125, "loss": 0.5748, "rewards/accuracies": 0.75, "rewards/chosen": -1.172703742980957, "rewards/margins": 0.33002516627311707, "rewards/rejected": -1.502728819847107, "step": 132 }, { "epoch": 0.14, "learning_rate": 4.982065656380468e-05, "logits/chosen": -2.079421281814575, "logits/rejected": -2.2217986583709717, "logps/chosen": -295.58087158203125, "logps/rejected": -291.7632751464844, "loss": 0.5565, "rewards/accuracies": 0.75, "rewards/chosen": -0.9564344882965088, "rewards/margins": 0.4054575562477112, "rewards/rejected": -1.3618921041488647, "step": 133 }, { "epoch": 0.14, "learning_rate": 4.9809636602155604e-05, "logits/chosen": -2.1835222244262695, "logits/rejected": -2.2144436836242676, "logps/chosen": -248.64321899414062, "logps/rejected": -231.23239135742188, "loss": 0.6353, "rewards/accuracies": 0.5, "rewards/chosen": -1.2899762392044067, "rewards/margins": 0.29608777165412903, "rewards/rejected": -1.586064100265503, "step": 134 }, { "epoch": 0.14, "learning_rate": 4.9798289384628355e-05, "logits/chosen": -2.047929048538208, "logits/rejected": -2.020115852355957, "logps/chosen": -270.7432556152344, "logps/rejected": -295.75714111328125, "loss": 0.7271, "rewards/accuracies": 0.5625, "rewards/chosen": -1.225740671157837, "rewards/margins": 0.14127963781356812, "rewards/rejected": -1.3670202493667603, "step": 135 }, { "epoch": 0.14, "learning_rate": 4.978661506090042e-05, "logits/chosen": -2.268289089202881, "logits/rejected": -2.264258623123169, "logps/chosen": -335.73406982421875, "logps/rejected": -326.88641357421875, "loss": 0.8802, "rewards/accuracies": 0.625, "rewards/chosen": -2.066948890686035, "rewards/margins": -0.09082351624965668, "rewards/rejected": -1.9761252403259277, "step": 136 }, { "epoch": 0.14, "learning_rate": 4.9774613784964e-05, "logits/chosen": -2.366272449493408, "logits/rejected": -2.413400888442993, "logps/chosen": -275.4363708496094, "logps/rejected": -316.3116149902344, "loss": 0.705, "rewards/accuracies": 0.625, "rewards/chosen": -1.7615418434143066, "rewards/margins": 0.17158068716526031, "rewards/rejected": -1.9331226348876953, "step": 137 }, { "epoch": 0.14, "learning_rate": 4.9762285715124054e-05, "logits/chosen": -2.370572328567505, "logits/rejected": -2.273383617401123, "logps/chosen": -342.66046142578125, "logps/rejected": -399.414794921875, "loss": 1.0301, "rewards/accuracies": 0.4375, "rewards/chosen": -1.7256594896316528, "rewards/margins": -0.3764263093471527, "rewards/rejected": -1.3492331504821777, "step": 138 }, { "epoch": 0.14, "learning_rate": 4.974963101399614e-05, "logits/chosen": -2.196343421936035, "logits/rejected": -2.460721969604492, "logps/chosen": -255.3898162841797, "logps/rejected": -318.3489990234375, "loss": 0.5858, "rewards/accuracies": 0.625, "rewards/chosen": -0.9483575820922852, "rewards/margins": 0.5218918919563293, "rewards/rejected": -1.4702494144439697, "step": 139 }, { "epoch": 0.14, "learning_rate": 4.973664984850435e-05, "logits/chosen": -2.305603265762329, "logits/rejected": -2.2728540897369385, "logps/chosen": -351.310791015625, "logps/rejected": -322.8355712890625, "loss": 0.692, "rewards/accuracies": 0.5, "rewards/chosen": -1.272456169128418, "rewards/margins": 0.04384595528244972, "rewards/rejected": -1.3163020610809326, "step": 140 }, { "epoch": 0.15, "learning_rate": 4.9723342389879e-05, "logits/chosen": -2.463696241378784, "logits/rejected": -2.424192190170288, "logps/chosen": -487.9200439453125, "logps/rejected": -463.71844482421875, "loss": 0.5906, "rewards/accuracies": 0.5625, "rewards/chosen": -1.156785011291504, "rewards/margins": 0.42006048560142517, "rewards/rejected": -1.5768455266952515, "step": 141 }, { "epoch": 0.15, "learning_rate": 4.970970881365449e-05, "logits/chosen": -2.1991195678710938, "logits/rejected": -2.2735817432403564, "logps/chosen": -333.13214111328125, "logps/rejected": -371.25213623046875, "loss": 0.59, "rewards/accuracies": 0.625, "rewards/chosen": -1.0519524812698364, "rewards/margins": 0.3118639290332794, "rewards/rejected": -1.363816261291504, "step": 142 }, { "epoch": 0.15, "learning_rate": 4.9695749299666894e-05, "logits/chosen": -2.0732052326202393, "logits/rejected": -2.0948421955108643, "logps/chosen": -355.09381103515625, "logps/rejected": -370.39007568359375, "loss": 0.6914, "rewards/accuracies": 0.5, "rewards/chosen": -1.0995197296142578, "rewards/margins": 0.12801453471183777, "rewards/rejected": -1.227534294128418, "step": 143 }, { "epoch": 0.15, "learning_rate": 4.9681464032051635e-05, "logits/chosen": -2.281567335128784, "logits/rejected": -2.19057559967041, "logps/chosen": -407.2377624511719, "logps/rejected": -354.5672302246094, "loss": 0.8131, "rewards/accuracies": 0.4375, "rewards/chosen": -1.355703592300415, "rewards/margins": -0.11926855146884918, "rewards/rejected": -1.2364351749420166, "step": 144 }, { "epoch": 0.15, "learning_rate": 4.966685319924106e-05, "logits/chosen": -2.3482041358947754, "logits/rejected": -2.267275094985962, "logps/chosen": -445.82208251953125, "logps/rejected": -458.0115051269531, "loss": 0.7112, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0608644485473633, "rewards/margins": 0.03552216291427612, "rewards/rejected": -1.0963865518569946, "step": 145 }, { "epoch": 0.15, "learning_rate": 4.965191699396191e-05, "logits/chosen": -2.1460695266723633, "logits/rejected": -2.340147018432617, "logps/chosen": -305.9909362792969, "logps/rejected": -318.31256103515625, "loss": 0.7068, "rewards/accuracies": 0.5, "rewards/chosen": -0.9504708647727966, "rewards/margins": 0.06828048825263977, "rewards/rejected": -1.0187513828277588, "step": 146 }, { "epoch": 0.15, "learning_rate": 4.963665561323286e-05, "logits/chosen": -2.2726097106933594, "logits/rejected": -2.2365224361419678, "logps/chosen": -287.2446594238281, "logps/rejected": -313.90203857421875, "loss": 0.8693, "rewards/accuracies": 0.375, "rewards/chosen": -1.1069557666778564, "rewards/margins": -0.2487190216779709, "rewards/rejected": -0.8582366704940796, "step": 147 }, { "epoch": 0.15, "learning_rate": 4.962106925836183e-05, "logits/chosen": -2.1673455238342285, "logits/rejected": -2.1659748554229736, "logps/chosen": -363.5993957519531, "logps/rejected": -345.0345153808594, "loss": 0.6932, "rewards/accuracies": 0.4375, "rewards/chosen": -0.8967840075492859, "rewards/margins": 0.05857213959097862, "rewards/rejected": -0.9553561210632324, "step": 148 }, { "epoch": 0.15, "learning_rate": 4.9605158134943356e-05, "logits/chosen": -2.167635679244995, "logits/rejected": -2.10685396194458, "logps/chosen": -278.79949951171875, "logps/rejected": -243.8157501220703, "loss": 0.8424, "rewards/accuracies": 0.3125, "rewards/chosen": -0.8083875179290771, "rewards/margins": -0.2297360599040985, "rewards/rejected": -0.5786514282226562, "step": 149 }, { "epoch": 0.16, "learning_rate": 4.9588922452855935e-05, "logits/chosen": -1.9295530319213867, "logits/rejected": -2.084747314453125, "logps/chosen": -350.78887939453125, "logps/rejected": -416.31549072265625, "loss": 0.6965, "rewards/accuracies": 0.4375, "rewards/chosen": -0.61838698387146, "rewards/margins": 0.09959676116704941, "rewards/rejected": -0.7179837226867676, "step": 150 }, { "epoch": 0.16, "learning_rate": 4.9572362426259176e-05, "logits/chosen": -2.1817588806152344, "logits/rejected": -2.141252040863037, "logps/chosen": -347.57403564453125, "logps/rejected": -378.94281005859375, "loss": 0.6127, "rewards/accuracies": 0.625, "rewards/chosen": -0.6715837121009827, "rewards/margins": 0.36849769949913025, "rewards/rejected": -1.04008150100708, "step": 151 }, { "epoch": 0.16, "learning_rate": 4.955547827359103e-05, "logits/chosen": -2.249030590057373, "logits/rejected": -1.901309609413147, "logps/chosen": -358.05859375, "logps/rejected": -263.93365478515625, "loss": 0.7291, "rewards/accuracies": 0.375, "rewards/chosen": -0.6999510526657104, "rewards/margins": -0.013744346797466278, "rewards/rejected": -0.6862066984176636, "step": 152 }, { "epoch": 0.16, "learning_rate": 4.953827021756489e-05, "logits/chosen": -1.9777555465698242, "logits/rejected": -1.9771215915679932, "logps/chosen": -374.28192138671875, "logps/rejected": -446.4751892089844, "loss": 0.7191, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6933267116546631, "rewards/margins": 0.10663817822933197, "rewards/rejected": -0.7999648451805115, "step": 153 }, { "epoch": 0.16, "learning_rate": 4.952073848516663e-05, "logits/chosen": -2.353224515914917, "logits/rejected": -2.316944122314453, "logps/chosen": -409.6300048828125, "logps/rejected": -406.71124267578125, "loss": 0.7778, "rewards/accuracies": 0.5, "rewards/chosen": -0.6479594707489014, "rewards/margins": -0.08428651094436646, "rewards/rejected": -0.5636729598045349, "step": 154 }, { "epoch": 0.16, "learning_rate": 4.9502883307651674e-05, "logits/chosen": -1.9488294124603271, "logits/rejected": -1.9255703687667847, "logps/chosen": -289.2491760253906, "logps/rejected": -412.5265808105469, "loss": 0.5643, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4605676531791687, "rewards/margins": 0.313413143157959, "rewards/rejected": -0.7739807367324829, "step": 155 }, { "epoch": 0.16, "learning_rate": 4.9484704920541856e-05, "logits/chosen": -1.9965554475784302, "logits/rejected": -2.217256546020508, "logps/chosen": -285.490966796875, "logps/rejected": -356.04559326171875, "loss": 0.7259, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5536916851997375, "rewards/margins": 0.03260548785328865, "rewards/rejected": -0.5862972140312195, "step": 156 }, { "epoch": 0.16, "learning_rate": 4.9466203563622424e-05, "logits/chosen": -2.1669509410858154, "logits/rejected": -2.293706178665161, "logps/chosen": -394.7683410644531, "logps/rejected": -457.5030822753906, "loss": 0.7323, "rewards/accuracies": 0.5, "rewards/chosen": -0.6224880814552307, "rewards/margins": 0.012505665421485901, "rewards/rejected": -0.6349937915802002, "step": 157 }, { "epoch": 0.16, "learning_rate": 4.944737948093876e-05, "logits/chosen": -1.9717931747436523, "logits/rejected": -2.068146228790283, "logps/chosen": -258.3878173828125, "logps/rejected": -263.28363037109375, "loss": 0.6178, "rewards/accuracies": 0.625, "rewards/chosen": -0.25086289644241333, "rewards/margins": 0.19410811364650726, "rewards/rejected": -0.4449709951877594, "step": 158 }, { "epoch": 0.16, "learning_rate": 4.942823292079325e-05, "logits/chosen": -2.1289565563201904, "logits/rejected": -2.1196882724761963, "logps/chosen": -301.2019958496094, "logps/rejected": -265.33685302734375, "loss": 0.6924, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6575217843055725, "rewards/margins": 0.06726698577404022, "rewards/rejected": -0.7247887849807739, "step": 159 }, { "epoch": 0.17, "learning_rate": 4.940876413574195e-05, "logits/chosen": -2.016998291015625, "logits/rejected": -2.271897077560425, "logps/chosen": -302.09454345703125, "logps/rejected": -424.992919921875, "loss": 0.7552, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4835960865020752, "rewards/margins": -0.06482114642858505, "rewards/rejected": -0.41877493262290955, "step": 160 }, { "epoch": 0.17, "learning_rate": 4.938897338259132e-05, "logits/chosen": -2.087447166442871, "logits/rejected": -1.9530307054519653, "logps/chosen": -336.6803894042969, "logps/rejected": -300.6052551269531, "loss": 0.727, "rewards/accuracies": 0.3125, "rewards/chosen": -0.46374762058258057, "rewards/margins": -0.05146384611725807, "rewards/rejected": -0.4122838079929352, "step": 161 }, { "epoch": 0.17, "learning_rate": 4.936886092239475e-05, "logits/chosen": -2.277801990509033, "logits/rejected": -2.1965315341949463, "logps/chosen": -356.08038330078125, "logps/rejected": -355.2462158203125, "loss": 0.6807, "rewards/accuracies": 0.625, "rewards/chosen": -0.35021650791168213, "rewards/margins": 0.04458609223365784, "rewards/rejected": -0.39480262994766235, "step": 162 }, { "epoch": 0.17, "learning_rate": 4.93484270204492e-05, "logits/chosen": -2.132495403289795, "logits/rejected": -2.1767892837524414, "logps/chosen": -416.3873291015625, "logps/rejected": -446.61444091796875, "loss": 0.6713, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3273276686668396, "rewards/margins": 0.06858228892087936, "rewards/rejected": -0.39590996503829956, "step": 163 }, { "epoch": 0.17, "learning_rate": 4.932767194629164e-05, "logits/chosen": -1.9876537322998047, "logits/rejected": -2.0633606910705566, "logps/chosen": -398.9766845703125, "logps/rejected": -377.8792419433594, "loss": 0.7513, "rewards/accuracies": 0.25, "rewards/chosen": -0.6136964559555054, "rewards/margins": -0.046500250697135925, "rewards/rejected": -0.567196249961853, "step": 164 }, { "epoch": 0.17, "learning_rate": 4.930659597369554e-05, "logits/chosen": -2.009962320327759, "logits/rejected": -2.057422399520874, "logps/chosen": -303.35882568359375, "logps/rejected": -332.4872131347656, "loss": 0.6449, "rewards/accuracies": 0.6875, "rewards/chosen": -0.36901330947875977, "rewards/margins": 0.15158693492412567, "rewards/rejected": -0.5206002593040466, "step": 165 }, { "epoch": 0.17, "learning_rate": 4.928519938066722e-05, "logits/chosen": -1.9507710933685303, "logits/rejected": -1.9371974468231201, "logps/chosen": -350.26031494140625, "logps/rejected": -332.67572021484375, "loss": 0.6892, "rewards/accuracies": 0.5625, "rewards/chosen": -0.47843092679977417, "rewards/margins": 0.03055078350007534, "rewards/rejected": -0.5089817047119141, "step": 166 }, { "epoch": 0.17, "learning_rate": 4.926348244944221e-05, "logits/chosen": -1.8575907945632935, "logits/rejected": -1.8456588983535767, "logps/chosen": -298.292236328125, "logps/rejected": -305.32904052734375, "loss": 0.6269, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5008257031440735, "rewards/margins": 0.18142402172088623, "rewards/rejected": -0.6822497248649597, "step": 167 }, { "epoch": 0.17, "learning_rate": 4.9241445466481504e-05, "logits/chosen": -1.9923934936523438, "logits/rejected": -2.1334385871887207, "logps/chosen": -278.231689453125, "logps/rejected": -368.37152099609375, "loss": 0.7279, "rewards/accuracies": 0.5, "rewards/chosen": -0.43066713213920593, "rewards/margins": -0.028812985867261887, "rewards/rejected": -0.40185415744781494, "step": 168 }, { "epoch": 0.18, "learning_rate": 4.921908872246782e-05, "logits/chosen": -2.099191665649414, "logits/rejected": -2.299363136291504, "logps/chosen": -298.4676208496094, "logps/rejected": -369.09075927734375, "loss": 0.5683, "rewards/accuracies": 0.6875, "rewards/chosen": -0.42483800649642944, "rewards/margins": 0.3080124258995056, "rewards/rejected": -0.7328504323959351, "step": 169 }, { "epoch": 0.18, "learning_rate": 4.91964125123017e-05, "logits/chosen": -2.2204477787017822, "logits/rejected": -2.0866146087646484, "logps/chosen": -417.0639343261719, "logps/rejected": -406.372802734375, "loss": 0.7542, "rewards/accuracies": 0.5, "rewards/chosen": -0.6449086666107178, "rewards/margins": -0.07101374119520187, "rewards/rejected": -0.5738948583602905, "step": 170 }, { "epoch": 0.18, "learning_rate": 4.9173417135097715e-05, "logits/chosen": -2.208749294281006, "logits/rejected": -2.0630850791931152, "logps/chosen": -286.49749755859375, "logps/rejected": -274.74822998046875, "loss": 0.7148, "rewards/accuracies": 0.3125, "rewards/chosen": -0.36861342191696167, "rewards/margins": -0.0281071700155735, "rewards/rejected": -0.3405062258243561, "step": 171 }, { "epoch": 0.18, "learning_rate": 4.9150102894180415e-05, "logits/chosen": -1.9276704788208008, "logits/rejected": -1.7694021463394165, "logps/chosen": -305.76702880859375, "logps/rejected": -299.9190368652344, "loss": 0.7089, "rewards/accuracies": 0.5, "rewards/chosen": -0.5562857985496521, "rewards/margins": 0.01368020474910736, "rewards/rejected": -0.5699659585952759, "step": 172 }, { "epoch": 0.18, "learning_rate": 4.91264700970804e-05, "logits/chosen": -2.1296586990356445, "logits/rejected": -2.081202983856201, "logps/chosen": -253.95947265625, "logps/rejected": -282.4194641113281, "loss": 0.6342, "rewards/accuracies": 0.75, "rewards/chosen": -0.3127891719341278, "rewards/margins": 0.14373984932899475, "rewards/rejected": -0.45652902126312256, "step": 173 }, { "epoch": 0.18, "learning_rate": 4.910251905553025e-05, "logits/chosen": -2.120836019515991, "logits/rejected": -2.152477979660034, "logps/chosen": -461.200927734375, "logps/rejected": -470.2550048828125, "loss": 0.6578, "rewards/accuracies": 0.5, "rewards/chosen": -0.39715662598609924, "rewards/margins": 0.12175296247005463, "rewards/rejected": -0.5189095735549927, "step": 174 }, { "epoch": 0.18, "learning_rate": 4.9078250085460384e-05, "logits/chosen": -2.0472662448883057, "logits/rejected": -2.0290732383728027, "logps/chosen": -368.0150451660156, "logps/rejected": -276.0652770996094, "loss": 0.6663, "rewards/accuracies": 0.625, "rewards/chosen": -0.5132391452789307, "rewards/margins": 0.08550245314836502, "rewards/rejected": -0.5987416505813599, "step": 175 }, { "epoch": 0.18, "learning_rate": 4.905366350699493e-05, "logits/chosen": -2.0048584938049316, "logits/rejected": -2.020286798477173, "logps/chosen": -341.3944396972656, "logps/rejected": -444.3612060546875, "loss": 0.6891, "rewards/accuracies": 0.4375, "rewards/chosen": -0.48147350549697876, "rewards/margins": 0.06385768949985504, "rewards/rejected": -0.5453312397003174, "step": 176 }, { "epoch": 0.18, "learning_rate": 4.902875964444746e-05, "logits/chosen": -1.9281437397003174, "logits/rejected": -2.0846328735351562, "logps/chosen": -397.08453369140625, "logps/rejected": -412.8231201171875, "loss": 0.7184, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5112196207046509, "rewards/margins": 0.010270453989505768, "rewards/rejected": -0.5214900374412537, "step": 177 }, { "epoch": 0.18, "learning_rate": 4.9003538826316795e-05, "logits/chosen": -1.9262962341308594, "logits/rejected": -1.8548663854599, "logps/chosen": -326.5889587402344, "logps/rejected": -336.98443603515625, "loss": 0.6411, "rewards/accuracies": 0.625, "rewards/chosen": -0.37680521607398987, "rewards/margins": 0.14640317857265472, "rewards/rejected": -0.5232084393501282, "step": 178 }, { "epoch": 0.19, "learning_rate": 4.897800138528253e-05, "logits/chosen": -2.234349250793457, "logits/rejected": -2.1834311485290527, "logps/chosen": -302.0099182128906, "logps/rejected": -293.316162109375, "loss": 0.7266, "rewards/accuracies": 0.5, "rewards/chosen": -0.42696690559387207, "rewards/margins": -0.030406557023525238, "rewards/rejected": -0.39656031131744385, "step": 179 }, { "epoch": 0.19, "learning_rate": 4.8952147658200806e-05, "logits/chosen": -1.9105005264282227, "logits/rejected": -2.006873607635498, "logps/chosen": -307.2244873046875, "logps/rejected": -327.2491455078125, "loss": 0.6388, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4698525667190552, "rewards/margins": 0.18418261408805847, "rewards/rejected": -0.6540351510047913, "step": 180 }, { "epoch": 0.19, "learning_rate": 4.892597798609976e-05, "logits/chosen": -1.8944015502929688, "logits/rejected": -1.8353430032730103, "logps/chosen": -372.86474609375, "logps/rejected": -328.3533020019531, "loss": 0.7458, "rewards/accuracies": 0.5, "rewards/chosen": -0.5451046228408813, "rewards/margins": -0.07491657137870789, "rewards/rejected": -0.4701881408691406, "step": 181 }, { "epoch": 0.19, "learning_rate": 4.889949271417504e-05, "logits/chosen": -2.0069048404693604, "logits/rejected": -2.1132149696350098, "logps/chosen": -313.0219421386719, "logps/rejected": -377.8055114746094, "loss": 0.6898, "rewards/accuracies": 0.625, "rewards/chosen": -0.4439217150211334, "rewards/margins": 0.06591986864805222, "rewards/rejected": -0.5098415613174438, "step": 182 }, { "epoch": 0.19, "learning_rate": 4.88726921917853e-05, "logits/chosen": -1.847022294998169, "logits/rejected": -1.861661434173584, "logps/chosen": -222.52427673339844, "logps/rejected": -241.1929931640625, "loss": 0.6617, "rewards/accuracies": 0.5, "rewards/chosen": -0.39014291763305664, "rewards/margins": 0.09657852351665497, "rewards/rejected": -0.486721396446228, "step": 183 }, { "epoch": 0.19, "learning_rate": 4.884557677244754e-05, "logits/chosen": -1.9963531494140625, "logits/rejected": -2.0941479206085205, "logps/chosen": -274.606689453125, "logps/rejected": -268.11395263671875, "loss": 0.6133, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4282204508781433, "rewards/margins": 0.23392558097839355, "rewards/rejected": -0.6621460318565369, "step": 184 }, { "epoch": 0.19, "learning_rate": 4.881814681383248e-05, "logits/chosen": -1.8474693298339844, "logits/rejected": -2.122403144836426, "logps/chosen": -254.52745056152344, "logps/rejected": -355.0078430175781, "loss": 0.6778, "rewards/accuracies": 0.5, "rewards/chosen": -0.3664317727088928, "rewards/margins": 0.07878479361534119, "rewards/rejected": -0.4452165961265564, "step": 185 }, { "epoch": 0.19, "learning_rate": 4.879040267775981e-05, "logits/chosen": -1.891446828842163, "logits/rejected": -1.794939637184143, "logps/chosen": -383.87451171875, "logps/rejected": -405.0089416503906, "loss": 0.7267, "rewards/accuracies": 0.625, "rewards/chosen": -0.592932403087616, "rewards/margins": -0.023355990648269653, "rewards/rejected": -0.5695763826370239, "step": 186 }, { "epoch": 0.19, "learning_rate": 4.8762344730193445e-05, "logits/chosen": -1.923872709274292, "logits/rejected": -2.09379243850708, "logps/chosen": -251.91183471679688, "logps/rejected": -267.0522155761719, "loss": 0.618, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5020468831062317, "rewards/margins": 0.21398763358592987, "rewards/rejected": -0.7160345315933228, "step": 187 }, { "epoch": 0.19, "learning_rate": 4.873397334123667e-05, "logits/chosen": -1.7248388528823853, "logits/rejected": -2.1145920753479004, "logps/chosen": -270.976806640625, "logps/rejected": -385.2688293457031, "loss": 0.6094, "rewards/accuracies": 0.625, "rewards/chosen": -0.5947083234786987, "rewards/margins": 0.23461143672466278, "rewards/rejected": -0.8293198347091675, "step": 188 }, { "epoch": 0.2, "learning_rate": 4.8705288885127295e-05, "logits/chosen": -2.289656162261963, "logits/rejected": -2.2240102291107178, "logps/chosen": -408.82403564453125, "logps/rejected": -361.85711669921875, "loss": 0.7461, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6849729418754578, "rewards/margins": -0.0317959301173687, "rewards/rejected": -0.6531770825386047, "step": 189 }, { "epoch": 0.2, "learning_rate": 4.867629174023268e-05, "logits/chosen": -2.281062602996826, "logits/rejected": -2.0911028385162354, "logps/chosen": -390.974365234375, "logps/rejected": -376.9208984375, "loss": 0.7288, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5994336009025574, "rewards/margins": -0.05144501477479935, "rewards/rejected": -0.5479886531829834, "step": 190 }, { "epoch": 0.2, "learning_rate": 4.864698228904478e-05, "logits/chosen": -1.8639394044876099, "logits/rejected": -1.969814658164978, "logps/chosen": -390.7828674316406, "logps/rejected": -316.2579345703125, "loss": 0.7376, "rewards/accuracies": 0.5, "rewards/chosen": -0.6630164384841919, "rewards/margins": -0.02051009237766266, "rewards/rejected": -0.6425063610076904, "step": 191 }, { "epoch": 0.2, "learning_rate": 4.861736091817506e-05, "logits/chosen": -2.084822654724121, "logits/rejected": -1.8905832767486572, "logps/chosen": -373.1632080078125, "logps/rejected": -256.5694274902344, "loss": 0.6745, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6736202239990234, "rewards/margins": 0.06500263512134552, "rewards/rejected": -0.738622784614563, "step": 192 }, { "epoch": 0.2, "learning_rate": 4.858742801834942e-05, "logits/chosen": -2.0825746059417725, "logits/rejected": -1.8865240812301636, "logps/chosen": -371.4250793457031, "logps/rejected": -295.2694396972656, "loss": 0.7168, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5424689650535583, "rewards/margins": -0.019147779792547226, "rewards/rejected": -0.5233211517333984, "step": 193 }, { "epoch": 0.2, "learning_rate": 4.855718398440307e-05, "logits/chosen": -2.1740431785583496, "logits/rejected": -1.8387389183044434, "logps/chosen": -307.1974182128906, "logps/rejected": -293.7174987792969, "loss": 0.6362, "rewards/accuracies": 0.5, "rewards/chosen": -0.6124351620674133, "rewards/margins": 0.22457748651504517, "rewards/rejected": -0.8370125889778137, "step": 194 }, { "epoch": 0.2, "learning_rate": 4.852662921527522e-05, "logits/chosen": -2.0800061225891113, "logits/rejected": -2.2033514976501465, "logps/chosen": -314.9759216308594, "logps/rejected": -373.83087158203125, "loss": 0.6388, "rewards/accuracies": 0.75, "rewards/chosen": -0.7755237221717834, "rewards/margins": 0.17267946898937225, "rewards/rejected": -0.9482032060623169, "step": 195 }, { "epoch": 0.2, "learning_rate": 4.8495764114003966e-05, "logits/chosen": -2.0974619388580322, "logits/rejected": -2.1461567878723145, "logps/chosen": -360.7484436035156, "logps/rejected": -402.9318542480469, "loss": 0.6068, "rewards/accuracies": 0.75, "rewards/chosen": -0.7390251159667969, "rewards/margins": 0.220990851521492, "rewards/rejected": -0.9600158929824829, "step": 196 }, { "epoch": 0.2, "learning_rate": 4.8464589087720846e-05, "logits/chosen": -2.0587756633758545, "logits/rejected": -1.9800175428390503, "logps/chosen": -287.4089050292969, "logps/rejected": -284.52667236328125, "loss": 0.7495, "rewards/accuracies": 0.25, "rewards/chosen": -0.6148634552955627, "rewards/margins": -0.05161774903535843, "rewards/rejected": -0.5632455945014954, "step": 197 }, { "epoch": 0.21, "learning_rate": 4.8433104547645527e-05, "logits/chosen": -2.166761875152588, "logits/rejected": -2.1352829933166504, "logps/chosen": -270.46527099609375, "logps/rejected": -276.6797180175781, "loss": 0.6281, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7134184241294861, "rewards/margins": 0.1845559924840927, "rewards/rejected": -0.89797443151474, "step": 198 }, { "epoch": 0.21, "learning_rate": 4.840131090908038e-05, "logits/chosen": -2.013166904449463, "logits/rejected": -2.0285515785217285, "logps/chosen": -250.39393615722656, "logps/rejected": -247.9554443359375, "loss": 0.6774, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5990790128707886, "rewards/margins": 0.09994714707136154, "rewards/rejected": -0.6990260481834412, "step": 199 }, { "epoch": 0.21, "learning_rate": 4.8369208591404997e-05, "logits/chosen": -2.070176124572754, "logits/rejected": -2.184037446975708, "logps/chosen": -290.0277099609375, "logps/rejected": -408.5084228515625, "loss": 0.7326, "rewards/accuracies": 0.5, "rewards/chosen": -0.8427464962005615, "rewards/margins": -0.015421424061059952, "rewards/rejected": -0.8273251056671143, "step": 200 }, { "epoch": 0.21, "learning_rate": 4.833679801807064e-05, "logits/chosen": -1.9874109029769897, "logits/rejected": -2.082557201385498, "logps/chosen": -298.5231628417969, "logps/rejected": -384.05853271484375, "loss": 0.6293, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9263674020767212, "rewards/margins": 0.19633004069328308, "rewards/rejected": -1.1226974725723267, "step": 201 }, { "epoch": 0.21, "learning_rate": 4.8304079616594686e-05, "logits/chosen": -2.2528278827667236, "logits/rejected": -2.320962905883789, "logps/chosen": -476.85968017578125, "logps/rejected": -352.87298583984375, "loss": 0.7128, "rewards/accuracies": 0.5, "rewards/chosen": -0.9438900947570801, "rewards/margins": 0.07938252389431, "rewards/rejected": -1.0232725143432617, "step": 202 }, { "epoch": 0.21, "learning_rate": 4.8271053818554965e-05, "logits/chosen": -2.0623865127563477, "logits/rejected": -2.1591854095458984, "logps/chosen": -272.2887878417969, "logps/rejected": -340.0892639160156, "loss": 0.726, "rewards/accuracies": 0.5, "rewards/chosen": -0.6589000821113586, "rewards/margins": -0.008545447140932083, "rewards/rejected": -0.6503546833992004, "step": 203 }, { "epoch": 0.21, "learning_rate": 4.823772105958408e-05, "logits/chosen": -2.2021212577819824, "logits/rejected": -2.2701587677001953, "logps/chosen": -332.5159912109375, "logps/rejected": -383.0360412597656, "loss": 0.5916, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8989717960357666, "rewards/margins": 0.3648967444896698, "rewards/rejected": -1.2638685703277588, "step": 204 }, { "epoch": 0.21, "learning_rate": 4.820408177936365e-05, "logits/chosen": -2.264617443084717, "logits/rejected": -2.322549343109131, "logps/chosen": -434.6703796386719, "logps/rejected": -497.743408203125, "loss": 0.6572, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8982381820678711, "rewards/margins": 0.15654003620147705, "rewards/rejected": -1.0547782182693481, "step": 205 }, { "epoch": 0.21, "learning_rate": 4.817013642161853e-05, "logits/chosen": -2.034374237060547, "logits/rejected": -1.8990505933761597, "logps/chosen": -335.35137939453125, "logps/rejected": -308.3555603027344, "loss": 0.7744, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9858347773551941, "rewards/margins": -0.07680069655179977, "rewards/rejected": -0.9090341925621033, "step": 206 }, { "epoch": 0.21, "learning_rate": 4.813588543411093e-05, "logits/chosen": -2.0272958278656006, "logits/rejected": -2.040910243988037, "logps/chosen": -263.2490234375, "logps/rejected": -330.78326416015625, "loss": 0.611, "rewards/accuracies": 0.75, "rewards/chosen": -0.806855320930481, "rewards/margins": 0.22991889715194702, "rewards/rejected": -1.0367741584777832, "step": 207 }, { "epoch": 0.22, "learning_rate": 4.810132926863454e-05, "logits/chosen": -1.954245924949646, "logits/rejected": -2.222029685974121, "logps/chosen": -344.208984375, "logps/rejected": -378.04827880859375, "loss": 0.5867, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7623510360717773, "rewards/margins": 0.3259902894496918, "rewards/rejected": -1.088341236114502, "step": 208 }, { "epoch": 0.22, "learning_rate": 4.806646838100852e-05, "logits/chosen": -1.9601213932037354, "logits/rejected": -1.9726929664611816, "logps/chosen": -363.0174255371094, "logps/rejected": -362.3369140625, "loss": 0.6792, "rewards/accuracies": 0.4375, "rewards/chosen": -0.85859614610672, "rewards/margins": 0.1350986659526825, "rewards/rejected": -0.9936947822570801, "step": 209 }, { "epoch": 0.22, "learning_rate": 4.803130323107157e-05, "logits/chosen": -2.3228697776794434, "logits/rejected": -2.4339702129364014, "logps/chosen": -412.57208251953125, "logps/rejected": -507.545166015625, "loss": 0.6152, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8819563984870911, "rewards/margins": 0.24080882966518402, "rewards/rejected": -1.1227651834487915, "step": 210 }, { "epoch": 0.22, "learning_rate": 4.7995834282675764e-05, "logits/chosen": -1.9379347562789917, "logits/rejected": -1.9101741313934326, "logps/chosen": -336.1128234863281, "logps/rejected": -344.0860595703125, "loss": 0.6526, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7465636730194092, "rewards/margins": 0.16460736095905304, "rewards/rejected": -0.9111709594726562, "step": 211 }, { "epoch": 0.22, "learning_rate": 4.796006200368054e-05, "logits/chosen": -1.9772237539291382, "logits/rejected": -2.0779454708099365, "logps/chosen": -300.3962097167969, "logps/rejected": -379.80230712890625, "loss": 0.6292, "rewards/accuracies": 0.625, "rewards/chosen": -0.6897981762886047, "rewards/margins": 0.20077620446681976, "rewards/rejected": -0.8905743360519409, "step": 212 }, { "epoch": 0.22, "learning_rate": 4.79239868659464e-05, "logits/chosen": -2.019927978515625, "logits/rejected": -2.08197283744812, "logps/chosen": -249.34629821777344, "logps/rejected": -299.1027526855469, "loss": 0.6968, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8061387538909912, "rewards/margins": 0.06293849647045135, "rewards/rejected": -0.8690773248672485, "step": 213 }, { "epoch": 0.22, "learning_rate": 4.788760934532883e-05, "logits/chosen": -2.124903678894043, "logits/rejected": -2.041473865509033, "logps/chosen": -268.1578674316406, "logps/rejected": -329.6964111328125, "loss": 0.7397, "rewards/accuracies": 0.5, "rewards/chosen": -0.9142539501190186, "rewards/margins": 0.021660268306732178, "rewards/rejected": -0.935914158821106, "step": 214 }, { "epoch": 0.22, "learning_rate": 4.785092992167192e-05, "logits/chosen": -2.1390061378479004, "logits/rejected": -2.125261068344116, "logps/chosen": -268.4858703613281, "logps/rejected": -305.736083984375, "loss": 0.6402, "rewards/accuracies": 0.625, "rewards/chosen": -0.7436612248420715, "rewards/margins": 0.18916505575180054, "rewards/rejected": -0.9328262209892273, "step": 215 }, { "epoch": 0.22, "learning_rate": 4.781394907880204e-05, "logits/chosen": -2.0572445392608643, "logits/rejected": -2.2498245239257812, "logps/chosen": -288.4680480957031, "logps/rejected": -313.41729736328125, "loss": 0.5816, "rewards/accuracies": 0.625, "rewards/chosen": -0.6026442646980286, "rewards/margins": 0.3183574676513672, "rewards/rejected": -0.9210017919540405, "step": 216 }, { "epoch": 0.22, "learning_rate": 4.777666730452151e-05, "logits/chosen": -1.8694506883621216, "logits/rejected": -2.019477367401123, "logps/chosen": -265.6673889160156, "logps/rejected": -356.0314025878906, "loss": 0.5947, "rewards/accuracies": 0.625, "rewards/chosen": -0.7154449820518494, "rewards/margins": 0.4170495271682739, "rewards/rejected": -1.1324944496154785, "step": 217 }, { "epoch": 0.23, "learning_rate": 4.7739085090602145e-05, "logits/chosen": -2.254331111907959, "logits/rejected": -2.3572165966033936, "logps/chosen": -305.95074462890625, "logps/rejected": -340.34893798828125, "loss": 0.7385, "rewards/accuracies": 0.5, "rewards/chosen": -0.7424416542053223, "rewards/margins": 0.05813989043235779, "rewards/rejected": -0.8005815148353577, "step": 218 }, { "epoch": 0.23, "learning_rate": 4.770120293277875e-05, "logits/chosen": -2.0773184299468994, "logits/rejected": -2.0856757164001465, "logps/chosen": -351.6342468261719, "logps/rejected": -304.958251953125, "loss": 0.5738, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8276374936103821, "rewards/margins": 0.34980258345603943, "rewards/rejected": -1.1774399280548096, "step": 219 }, { "epoch": 0.23, "learning_rate": 4.76630213307426e-05, "logits/chosen": -1.996044635772705, "logits/rejected": -2.1620826721191406, "logps/chosen": -286.35565185546875, "logps/rejected": -381.3570861816406, "loss": 0.8382, "rewards/accuracies": 0.375, "rewards/chosen": -0.9452384114265442, "rewards/margins": -0.1867210865020752, "rewards/rejected": -0.7585172057151794, "step": 220 }, { "epoch": 0.23, "learning_rate": 4.762454078813483e-05, "logits/chosen": -1.959717035293579, "logits/rejected": -2.038839340209961, "logps/chosen": -327.7545471191406, "logps/rejected": -336.3837585449219, "loss": 0.7531, "rewards/accuracies": 0.4375, "rewards/chosen": -0.8737196326255798, "rewards/margins": -0.06513047218322754, "rewards/rejected": -0.8085891008377075, "step": 221 }, { "epoch": 0.23, "learning_rate": 4.758576181253981e-05, "logits/chosen": -2.221623659133911, "logits/rejected": -2.0508334636688232, "logps/chosen": -380.0113830566406, "logps/rejected": -305.5745544433594, "loss": 0.6948, "rewards/accuracies": 0.5625, "rewards/chosen": -0.907548189163208, "rewards/margins": 0.19952180981636047, "rewards/rejected": -1.107069969177246, "step": 222 }, { "epoch": 0.23, "learning_rate": 4.754668491547845e-05, "logits/chosen": -2.2138311862945557, "logits/rejected": -1.9480187892913818, "logps/chosen": -353.3214111328125, "logps/rejected": -298.512939453125, "loss": 0.6672, "rewards/accuracies": 0.625, "rewards/chosen": -0.7623633146286011, "rewards/margins": 0.12673720717430115, "rewards/rejected": -0.8891006112098694, "step": 223 }, { "epoch": 0.23, "learning_rate": 4.750731061240143e-05, "logits/chosen": -2.000711679458618, "logits/rejected": -2.172668933868408, "logps/chosen": -283.4869384765625, "logps/rejected": -291.0225830078125, "loss": 0.6861, "rewards/accuracies": 0.6875, "rewards/chosen": -0.46767231822013855, "rewards/margins": 0.11719869822263718, "rewards/rejected": -0.5848710536956787, "step": 224 }, { "epoch": 0.23, "learning_rate": 4.746763942268243e-05, "logits/chosen": -1.973673701286316, "logits/rejected": -2.03965163230896, "logps/chosen": -380.54974365234375, "logps/rejected": -390.66119384765625, "loss": 0.6839, "rewards/accuracies": 0.5, "rewards/chosen": -0.5797699093818665, "rewards/margins": 0.08640918880701065, "rewards/rejected": -0.6661791205406189, "step": 225 }, { "epoch": 0.23, "learning_rate": 4.742767186961125e-05, "logits/chosen": -2.1579782962799072, "logits/rejected": -2.12062668800354, "logps/chosen": -371.7868957519531, "logps/rejected": -279.1838684082031, "loss": 0.7117, "rewards/accuracies": 0.6875, "rewards/chosen": -0.43902456760406494, "rewards/margins": 0.0770123153924942, "rewards/rejected": -0.5160369277000427, "step": 226 }, { "epoch": 0.24, "learning_rate": 4.7387408480386945e-05, "logits/chosen": -2.1720871925354004, "logits/rejected": -2.2602319717407227, "logps/chosen": -340.2154235839844, "logps/rejected": -415.40625, "loss": 0.6475, "rewards/accuracies": 0.4375, "rewards/chosen": -0.4649474322795868, "rewards/margins": 0.14594462513923645, "rewards/rejected": -0.6108919978141785, "step": 227 }, { "epoch": 0.24, "learning_rate": 4.7346849786110834e-05, "logits/chosen": -1.956856608390808, "logits/rejected": -2.2350995540618896, "logps/chosen": -277.75689697265625, "logps/rejected": -373.5628967285156, "loss": 0.6256, "rewards/accuracies": 0.5625, "rewards/chosen": -0.30955129861831665, "rewards/margins": 0.2196839451789856, "rewards/rejected": -0.529235303401947, "step": 228 }, { "epoch": 0.24, "learning_rate": 4.7305996321779516e-05, "logits/chosen": -1.9420428276062012, "logits/rejected": -2.040480613708496, "logps/chosen": -330.3153991699219, "logps/rejected": -419.3126220703125, "loss": 0.6871, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7027783393859863, "rewards/margins": 0.27350372076034546, "rewards/rejected": -0.976282000541687, "step": 229 }, { "epoch": 0.24, "learning_rate": 4.726484862627779e-05, "logits/chosen": -2.0949978828430176, "logits/rejected": -2.049705743789673, "logps/chosen": -399.0694580078125, "logps/rejected": -362.0954284667969, "loss": 0.7168, "rewards/accuracies": 0.5625, "rewards/chosen": -0.26283103227615356, "rewards/margins": 0.01691259816288948, "rewards/rejected": -0.27974364161491394, "step": 230 }, { "epoch": 0.24, "learning_rate": 4.722340724237159e-05, "logits/chosen": -1.8245809078216553, "logits/rejected": -2.162997007369995, "logps/chosen": -250.3448486328125, "logps/rejected": -344.0015563964844, "loss": 0.6018, "rewards/accuracies": 0.75, "rewards/chosen": -0.40172523260116577, "rewards/margins": 0.23018088936805725, "rewards/rejected": -0.6319061517715454, "step": 231 }, { "epoch": 0.24, "learning_rate": 4.718167271670077e-05, "logits/chosen": -2.0060951709747314, "logits/rejected": -2.127163887023926, "logps/chosen": -301.98492431640625, "logps/rejected": -334.6331481933594, "loss": 0.5626, "rewards/accuracies": 0.75, "rewards/chosen": -0.17217136919498444, "rewards/margins": 0.36563175916671753, "rewards/rejected": -0.5378031730651855, "step": 232 }, { "epoch": 0.24, "learning_rate": 4.7139645599771956e-05, "logits/chosen": -2.229623794555664, "logits/rejected": -2.2746386528015137, "logps/chosen": -340.92559814453125, "logps/rejected": -386.106689453125, "loss": 0.6761, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6003557443618774, "rewards/margins": 0.10827502608299255, "rewards/rejected": -0.7086308002471924, "step": 233 }, { "epoch": 0.24, "learning_rate": 4.709732644595122e-05, "logits/chosen": -2.075230360031128, "logits/rejected": -1.8063032627105713, "logps/chosen": -322.08856201171875, "logps/rejected": -251.28103637695312, "loss": 0.7255, "rewards/accuracies": 0.25, "rewards/chosen": -0.5996315479278564, "rewards/margins": -0.027767587453126907, "rewards/rejected": -0.5718639492988586, "step": 234 }, { "epoch": 0.24, "learning_rate": 4.7054715813456795e-05, "logits/chosen": -2.074021816253662, "logits/rejected": -1.947763442993164, "logps/chosen": -357.5520324707031, "logps/rejected": -347.5523681640625, "loss": 0.648, "rewards/accuracies": 0.5, "rewards/chosen": -0.7392159700393677, "rewards/margins": 0.21554100513458252, "rewards/rejected": -0.954757034778595, "step": 235 }, { "epoch": 0.24, "learning_rate": 4.701181426435175e-05, "logits/chosen": -1.9467442035675049, "logits/rejected": -1.974783182144165, "logps/chosen": -358.291748046875, "logps/rejected": -400.4483947753906, "loss": 0.7035, "rewards/accuracies": 0.4375, "rewards/chosen": -0.329414039850235, "rewards/margins": 0.15945447981357574, "rewards/rejected": -0.4888685643672943, "step": 236 }, { "epoch": 0.25, "learning_rate": 4.69686223645365e-05, "logits/chosen": -2.1252236366271973, "logits/rejected": -2.0597469806671143, "logps/chosen": -314.242919921875, "logps/rejected": -304.12176513671875, "loss": 0.7022, "rewards/accuracies": 0.375, "rewards/chosen": -0.42311131954193115, "rewards/margins": 0.1071557104587555, "rewards/rejected": -0.5302670001983643, "step": 237 }, { "epoch": 0.25, "learning_rate": 4.692514068374142e-05, "logits/chosen": -2.0920791625976562, "logits/rejected": -2.227646827697754, "logps/chosen": -389.5137634277344, "logps/rejected": -376.8587341308594, "loss": 0.6463, "rewards/accuracies": 0.625, "rewards/chosen": -0.4674568772315979, "rewards/margins": 0.22089587152004242, "rewards/rejected": -0.6883527636528015, "step": 238 }, { "epoch": 0.25, "learning_rate": 4.6881369795519266e-05, "logits/chosen": -2.0833654403686523, "logits/rejected": -2.075024366378784, "logps/chosen": -489.9222412109375, "logps/rejected": -418.9756164550781, "loss": 0.609, "rewards/accuracies": 0.6875, "rewards/chosen": -0.603772759437561, "rewards/margins": 0.2230098843574524, "rewards/rejected": -0.8267825841903687, "step": 239 }, { "epoch": 0.25, "learning_rate": 4.683731027723764e-05, "logits/chosen": -2.1447250843048096, "logits/rejected": -2.3132801055908203, "logps/chosen": -311.90301513671875, "logps/rejected": -416.7982177734375, "loss": 0.72, "rewards/accuracies": 0.5, "rewards/chosen": -1.117811679840088, "rewards/margins": 0.1118747889995575, "rewards/rejected": -1.2296864986419678, "step": 240 }, { "epoch": 0.25, "learning_rate": 4.679296271007137e-05, "logits/chosen": -1.8714052438735962, "logits/rejected": -1.9733188152313232, "logps/chosen": -378.4604797363281, "logps/rejected": -340.4316101074219, "loss": 0.7227, "rewards/accuracies": 0.5625, "rewards/chosen": -0.46567368507385254, "rewards/margins": 0.10245680809020996, "rewards/rejected": -0.5681304931640625, "step": 241 }, { "epoch": 0.25, "learning_rate": 4.674832767899486e-05, "logits/chosen": -2.0161495208740234, "logits/rejected": -2.013415813446045, "logps/chosen": -334.1321105957031, "logps/rejected": -480.6463928222656, "loss": 0.7282, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8040814399719238, "rewards/margins": 0.06382356584072113, "rewards/rejected": -0.8679050207138062, "step": 242 }, { "epoch": 0.25, "learning_rate": 4.6703405772774325e-05, "logits/chosen": -2.1323914527893066, "logits/rejected": -2.001523971557617, "logps/chosen": -299.8099670410156, "logps/rejected": -278.1839294433594, "loss": 0.6323, "rewards/accuracies": 0.625, "rewards/chosen": -0.27376043796539307, "rewards/margins": 0.16408132016658783, "rewards/rejected": -0.4378418028354645, "step": 243 }, { "epoch": 0.25, "learning_rate": 4.66581975839601e-05, "logits/chosen": -2.1282427310943604, "logits/rejected": -2.040194272994995, "logps/chosen": -349.28826904296875, "logps/rejected": -320.7705078125, "loss": 0.6147, "rewards/accuracies": 0.625, "rewards/chosen": -0.6112862229347229, "rewards/margins": 0.27279651165008545, "rewards/rejected": -0.8840827345848083, "step": 244 }, { "epoch": 0.25, "learning_rate": 4.661270370887872e-05, "logits/chosen": -2.2946279048919678, "logits/rejected": -2.311314105987549, "logps/chosen": -298.4400634765625, "logps/rejected": -319.92828369140625, "loss": 0.6892, "rewards/accuracies": 0.625, "rewards/chosen": -0.5072231888771057, "rewards/margins": 0.09647183120250702, "rewards/rejected": -0.6036950349807739, "step": 245 }, { "epoch": 0.25, "learning_rate": 4.6566924747625176e-05, "logits/chosen": -2.1643552780151367, "logits/rejected": -2.13722562789917, "logps/chosen": -328.9122009277344, "logps/rejected": -411.6603088378906, "loss": 0.7268, "rewards/accuracies": 0.5, "rewards/chosen": -0.5001087188720703, "rewards/margins": 0.027627088129520416, "rewards/rejected": -0.527735710144043, "step": 246 }, { "epoch": 0.26, "learning_rate": 4.652086130405492e-05, "logits/chosen": -2.0007681846618652, "logits/rejected": -2.173635959625244, "logps/chosen": -344.0201416015625, "logps/rejected": -489.677978515625, "loss": 0.5542, "rewards/accuracies": 0.6875, "rewards/chosen": -0.647387683391571, "rewards/margins": 0.5166620016098022, "rewards/rejected": -1.164049744606018, "step": 247 }, { "epoch": 0.26, "learning_rate": 4.647451398577589e-05, "logits/chosen": -2.2768120765686035, "logits/rejected": -2.2112040519714355, "logps/chosen": -351.68658447265625, "logps/rejected": -304.02362060546875, "loss": 0.8001, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5356206297874451, "rewards/margins": -0.1319734901189804, "rewards/rejected": -0.40364715456962585, "step": 248 }, { "epoch": 0.26, "learning_rate": 4.6427883404140564e-05, "logits/chosen": -2.0852530002593994, "logits/rejected": -2.1016790866851807, "logps/chosen": -337.05499267578125, "logps/rejected": -388.4207763671875, "loss": 0.7004, "rewards/accuracies": 0.5, "rewards/chosen": -0.922147274017334, "rewards/margins": 0.11045366525650024, "rewards/rejected": -1.0326008796691895, "step": 249 }, { "epoch": 0.26, "learning_rate": 4.638097017423783e-05, "logits/chosen": -2.044597625732422, "logits/rejected": -2.14152193069458, "logps/chosen": -334.3042297363281, "logps/rejected": -319.0080871582031, "loss": 0.5122, "rewards/accuracies": 0.75, "rewards/chosen": -0.36447468400001526, "rewards/margins": 0.62729412317276, "rewards/rejected": -0.9917687773704529, "step": 250 }, { "epoch": 0.26, "learning_rate": 4.6333774914884897e-05, "logits/chosen": -2.073789119720459, "logits/rejected": -2.1668505668640137, "logps/chosen": -308.4037780761719, "logps/rejected": -298.2853698730469, "loss": 0.7838, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6153988242149353, "rewards/margins": -0.0456385537981987, "rewards/rejected": -0.569760262966156, "step": 251 }, { "epoch": 0.26, "learning_rate": 4.6286298248619144e-05, "logits/chosen": -2.121100902557373, "logits/rejected": -2.0319623947143555, "logps/chosen": -342.6925048828125, "logps/rejected": -355.6612243652344, "loss": 0.6799, "rewards/accuracies": 0.4375, "rewards/chosen": -0.8498857617378235, "rewards/margins": 0.18424755334854126, "rewards/rejected": -1.0341331958770752, "step": 252 }, { "epoch": 0.26, "learning_rate": 4.62385408016899e-05, "logits/chosen": -2.084768533706665, "logits/rejected": -2.15020489692688, "logps/chosen": -250.22640991210938, "logps/rejected": -272.5914001464844, "loss": 0.5436, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3922913670539856, "rewards/margins": 0.36925047636032104, "rewards/rejected": -0.7615418434143066, "step": 253 }, { "epoch": 0.26, "learning_rate": 4.619050320405017e-05, "logits/chosen": -2.3483760356903076, "logits/rejected": -2.152430772781372, "logps/chosen": -317.35296630859375, "logps/rejected": -290.811767578125, "loss": 0.6754, "rewards/accuracies": 0.625, "rewards/chosen": -0.718281626701355, "rewards/margins": 0.11232803016901016, "rewards/rejected": -0.8306095600128174, "step": 254 }, { "epoch": 0.26, "learning_rate": 4.614218608934834e-05, "logits/chosen": -2.1370747089385986, "logits/rejected": -2.1002144813537598, "logps/chosen": -395.0451965332031, "logps/rejected": -446.2868347167969, "loss": 0.6049, "rewards/accuracies": 0.625, "rewards/chosen": -0.938556969165802, "rewards/margins": 0.3290979564189911, "rewards/rejected": -1.2676548957824707, "step": 255 }, { "epoch": 0.27, "learning_rate": 4.60935900949198e-05, "logits/chosen": -1.9551351070404053, "logits/rejected": -1.9744064807891846, "logps/chosen": -372.6743469238281, "logps/rejected": -486.5342102050781, "loss": 0.7102, "rewards/accuracies": 0.4375, "rewards/chosen": -0.7985714077949524, "rewards/margins": 0.2688364088535309, "rewards/rejected": -1.0674078464508057, "step": 256 }, { "epoch": 0.27, "learning_rate": 4.6044715861778596e-05, "logits/chosen": -2.051593065261841, "logits/rejected": -2.114675521850586, "logps/chosen": -294.7435302734375, "logps/rejected": -326.8503112792969, "loss": 0.5986, "rewards/accuracies": 0.5625, "rewards/chosen": -0.621825098991394, "rewards/margins": 0.25512248277664185, "rewards/rejected": -0.8769477009773254, "step": 257 }, { "epoch": 0.27, "learning_rate": 4.5995564034608884e-05, "logits/chosen": -2.160278797149658, "logits/rejected": -2.1137940883636475, "logps/chosen": -393.886474609375, "logps/rejected": -350.8844299316406, "loss": 0.7498, "rewards/accuracies": 0.375, "rewards/chosen": -0.8584200143814087, "rewards/margins": -0.06297742575407028, "rewards/rejected": -0.795442521572113, "step": 258 }, { "epoch": 0.27, "learning_rate": 4.5946135261756504e-05, "logits/chosen": -2.062591791152954, "logits/rejected": -2.16622257232666, "logps/chosen": -323.41131591796875, "logps/rejected": -332.9070739746094, "loss": 0.6248, "rewards/accuracies": 0.625, "rewards/chosen": -0.5620593428611755, "rewards/margins": 0.3896656036376953, "rewards/rejected": -0.9517249464988708, "step": 259 }, { "epoch": 0.27, "learning_rate": 4.5896430195220364e-05, "logits/chosen": -1.9204814434051514, "logits/rejected": -1.8729685544967651, "logps/chosen": -288.1690979003906, "logps/rejected": -313.9109802246094, "loss": 0.6466, "rewards/accuracies": 0.625, "rewards/chosen": -0.849123477935791, "rewards/margins": 0.2756129801273346, "rewards/rejected": -1.1247365474700928, "step": 260 }, { "epoch": 0.27, "learning_rate": 4.584644949064391e-05, "logits/chosen": -2.176421642303467, "logits/rejected": -2.2713022232055664, "logps/chosen": -273.307373046875, "logps/rejected": -273.297607421875, "loss": 0.7478, "rewards/accuracies": 0.25, "rewards/chosen": -0.8873642086982727, "rewards/margins": -0.060915715992450714, "rewards/rejected": -0.8264484405517578, "step": 261 }, { "epoch": 0.27, "learning_rate": 4.579619380730642e-05, "logits/chosen": -2.1005711555480957, "logits/rejected": -2.1496832370758057, "logps/chosen": -251.0666961669922, "logps/rejected": -300.4075012207031, "loss": 0.6459, "rewards/accuracies": 0.625, "rewards/chosen": -0.5288766622543335, "rewards/margins": 0.20617865025997162, "rewards/rejected": -0.7350552678108215, "step": 262 }, { "epoch": 0.27, "learning_rate": 4.574566380811432e-05, "logits/chosen": -2.277989387512207, "logits/rejected": -2.2332725524902344, "logps/chosen": -357.2762145996094, "logps/rejected": -382.5482482910156, "loss": 0.7107, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6581230163574219, "rewards/margins": 0.030204597860574722, "rewards/rejected": -0.6883276700973511, "step": 263 }, { "epoch": 0.27, "learning_rate": 4.5694860159592465e-05, "logits/chosen": -1.9560626745224, "logits/rejected": -2.078892230987549, "logps/chosen": -314.22076416015625, "logps/rejected": -294.4151306152344, "loss": 0.6152, "rewards/accuracies": 0.625, "rewards/chosen": -0.6360582113265991, "rewards/margins": 0.2562226951122284, "rewards/rejected": -0.8922808766365051, "step": 264 }, { "epoch": 0.27, "learning_rate": 4.5643783531875323e-05, "logits/chosen": -2.0318408012390137, "logits/rejected": -2.1967966556549072, "logps/chosen": -251.95309448242188, "logps/rejected": -406.43310546875, "loss": 0.5652, "rewards/accuracies": 0.625, "rewards/chosen": -0.6249865293502808, "rewards/margins": 0.6104640960693359, "rewards/rejected": -1.2354506254196167, "step": 265 }, { "epoch": 0.28, "learning_rate": 4.559243459869814e-05, "logits/chosen": -1.9235318899154663, "logits/rejected": -2.222059726715088, "logps/chosen": -282.896728515625, "logps/rejected": -374.2673645019531, "loss": 0.6222, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7379283905029297, "rewards/margins": 0.2646501362323761, "rewards/rejected": -1.0025784969329834, "step": 266 }, { "epoch": 0.28, "learning_rate": 4.5540814037388056e-05, "logits/chosen": -1.9744443893432617, "logits/rejected": -1.974491834640503, "logps/chosen": -377.78466796875, "logps/rejected": -405.6194152832031, "loss": 0.9336, "rewards/accuracies": 0.3125, "rewards/chosen": -0.8520271182060242, "rewards/margins": -0.32225731015205383, "rewards/rejected": -0.529769778251648, "step": 267 }, { "epoch": 0.28, "learning_rate": 4.5488922528855176e-05, "logits/chosen": -2.0435807704925537, "logits/rejected": -1.990431308746338, "logps/chosen": -306.81048583984375, "logps/rejected": -369.6317138671875, "loss": 0.6084, "rewards/accuracies": 0.625, "rewards/chosen": -0.726284921169281, "rewards/margins": 0.3399674594402313, "rewards/rejected": -1.06625235080719, "step": 268 }, { "epoch": 0.28, "learning_rate": 4.543676075758356e-05, "logits/chosen": -2.0265307426452637, "logits/rejected": -1.9903606176376343, "logps/chosen": -359.7049255371094, "logps/rejected": -348.72723388671875, "loss": 0.7122, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7237378358840942, "rewards/margins": 0.028506018221378326, "rewards/rejected": -0.7522438764572144, "step": 269 }, { "epoch": 0.28, "learning_rate": 4.538432941162226e-05, "logits/chosen": -2.327871799468994, "logits/rejected": -2.218278408050537, "logps/chosen": -379.38812255859375, "logps/rejected": -380.68609619140625, "loss": 0.7641, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5538444519042969, "rewards/margins": -0.09269000589847565, "rewards/rejected": -0.46115440130233765, "step": 270 }, { "epoch": 0.28, "learning_rate": 4.5331629182576153e-05, "logits/chosen": -2.1100308895111084, "logits/rejected": -2.077500581741333, "logps/chosen": -265.77972412109375, "logps/rejected": -386.9795227050781, "loss": 0.6898, "rewards/accuracies": 0.5, "rewards/chosen": -0.6752620339393616, "rewards/margins": 0.13913662731647491, "rewards/rejected": -0.8143986463546753, "step": 271 }, { "epoch": 0.28, "learning_rate": 4.5278660765596884e-05, "logits/chosen": -2.0439562797546387, "logits/rejected": -2.036978244781494, "logps/chosen": -364.2665710449219, "logps/rejected": -373.9276123046875, "loss": 0.6934, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7833287119865417, "rewards/margins": 0.15880194306373596, "rewards/rejected": -0.9421306848526001, "step": 272 }, { "epoch": 0.28, "learning_rate": 4.522542485937369e-05, "logits/chosen": -2.2488677501678467, "logits/rejected": -2.270367383956909, "logps/chosen": -329.8542175292969, "logps/rejected": -341.32464599609375, "loss": 0.6091, "rewards/accuracies": 0.625, "rewards/chosen": -0.5054783225059509, "rewards/margins": 0.29985660314559937, "rewards/rejected": -0.8053349256515503, "step": 273 }, { "epoch": 0.28, "learning_rate": 4.5171922166124154e-05, "logits/chosen": -2.2042911052703857, "logits/rejected": -2.2818048000335693, "logps/chosen": -345.41571044921875, "logps/rejected": -371.5228271484375, "loss": 0.5853, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5683087110519409, "rewards/margins": 0.31007736921310425, "rewards/rejected": -0.8783860802650452, "step": 274 }, { "epoch": 0.28, "learning_rate": 4.5118153391584974e-05, "logits/chosen": -2.283979892730713, "logits/rejected": -2.3613948822021484, "logps/chosen": -326.4739074707031, "logps/rejected": -346.43701171875, "loss": 0.6944, "rewards/accuracies": 0.5, "rewards/chosen": -0.5925798416137695, "rewards/margins": 0.04503173753619194, "rewards/rejected": -0.6376115679740906, "step": 275 }, { "epoch": 0.29, "learning_rate": 4.5064119245002626e-05, "logits/chosen": -1.9776369333267212, "logits/rejected": -2.186469793319702, "logps/chosen": -316.29217529296875, "logps/rejected": -339.24908447265625, "loss": 0.5385, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4294860363006592, "rewards/margins": 0.5443057417869568, "rewards/rejected": -0.973791778087616, "step": 276 }, { "epoch": 0.29, "learning_rate": 4.500982043912404e-05, "logits/chosen": -1.8511924743652344, "logits/rejected": -1.7418498992919922, "logps/chosen": -221.07847595214844, "logps/rejected": -236.75009155273438, "loss": 0.8087, "rewards/accuracies": 0.3125, "rewards/chosen": -0.6640509963035583, "rewards/margins": -0.18910002708435059, "rewards/rejected": -0.47495099902153015, "step": 277 }, { "epoch": 0.29, "learning_rate": 4.495525769018717e-05, "logits/chosen": -2.1778125762939453, "logits/rejected": -2.3195223808288574, "logps/chosen": -315.2249755859375, "logps/rejected": -350.7799987792969, "loss": 0.538, "rewards/accuracies": 0.75, "rewards/chosen": -0.5389224290847778, "rewards/margins": 0.4431789517402649, "rewards/rejected": -0.9821013808250427, "step": 278 }, { "epoch": 0.29, "learning_rate": 4.490043171791155e-05, "logits/chosen": -2.139204502105713, "logits/rejected": -2.0121872425079346, "logps/chosen": -411.3319396972656, "logps/rejected": -478.99822998046875, "loss": 0.542, "rewards/accuracies": 0.75, "rewards/chosen": -0.5370864272117615, "rewards/margins": 0.40471482276916504, "rewards/rejected": -0.9418012499809265, "step": 279 }, { "epoch": 0.29, "learning_rate": 4.484534324548883e-05, "logits/chosen": -1.9133659601211548, "logits/rejected": -1.7547776699066162, "logps/chosen": -291.21502685546875, "logps/rejected": -296.52691650390625, "loss": 0.7422, "rewards/accuracies": 0.5, "rewards/chosen": -0.7586185932159424, "rewards/margins": 0.04123706370592117, "rewards/rejected": -0.7998557686805725, "step": 280 }, { "epoch": 0.29, "learning_rate": 4.4789992999573194e-05, "logits/chosen": -1.887819766998291, "logits/rejected": -2.074976682662964, "logps/chosen": -238.8917999267578, "logps/rejected": -290.9273681640625, "loss": 0.7374, "rewards/accuracies": 0.4375, "rewards/chosen": -0.8456001281738281, "rewards/margins": 0.0228101909160614, "rewards/rejected": -0.8684103488922119, "step": 281 }, { "epoch": 0.29, "learning_rate": 4.47343817102718e-05, "logits/chosen": -1.776401400566101, "logits/rejected": -1.8924405574798584, "logps/chosen": -291.739990234375, "logps/rejected": -354.3161926269531, "loss": 0.7368, "rewards/accuracies": 0.375, "rewards/chosen": -0.637408971786499, "rewards/margins": -0.0014106258749961853, "rewards/rejected": -0.6359982490539551, "step": 282 }, { "epoch": 0.29, "learning_rate": 4.467851011113515e-05, "logits/chosen": -2.391042470932007, "logits/rejected": -2.4325904846191406, "logps/chosen": -423.8915710449219, "logps/rejected": -501.38568115234375, "loss": 0.7102, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9706649780273438, "rewards/margins": 0.06408338248729706, "rewards/rejected": -1.0347484350204468, "step": 283 }, { "epoch": 0.29, "learning_rate": 4.4622378939147416e-05, "logits/chosen": -2.258568048477173, "logits/rejected": -2.2797646522521973, "logps/chosen": -324.49005126953125, "logps/rejected": -288.9884948730469, "loss": 0.6483, "rewards/accuracies": 0.5625, "rewards/chosen": -0.46772387623786926, "rewards/margins": 0.14248529076576233, "rewards/rejected": -0.6102092266082764, "step": 284 }, { "epoch": 0.3, "learning_rate": 4.456598893471668e-05, "logits/chosen": -2.1972222328186035, "logits/rejected": -2.0766568183898926, "logps/chosen": -391.5879211425781, "logps/rejected": -395.998779296875, "loss": 0.6513, "rewards/accuracies": 0.5, "rewards/chosen": -0.5539337396621704, "rewards/margins": 0.13942018151283264, "rewards/rejected": -0.6933539509773254, "step": 285 }, { "epoch": 0.3, "learning_rate": 4.450934084166524e-05, "logits/chosen": -2.177605152130127, "logits/rejected": -2.378321886062622, "logps/chosen": -396.26800537109375, "logps/rejected": -513.7933349609375, "loss": 0.5245, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5082396268844604, "rewards/margins": 0.6496228575706482, "rewards/rejected": -1.1578625440597534, "step": 286 }, { "epoch": 0.3, "learning_rate": 4.445243540721972e-05, "logits/chosen": -2.266407012939453, "logits/rejected": -2.261568069458008, "logps/chosen": -301.5278015136719, "logps/rejected": -347.5808410644531, "loss": 0.6835, "rewards/accuracies": 0.4375, "rewards/chosen": -0.9058988094329834, "rewards/margins": 0.060255490243434906, "rewards/rejected": -0.9661542773246765, "step": 287 }, { "epoch": 0.3, "learning_rate": 4.4395273382001286e-05, "logits/chosen": -1.9136242866516113, "logits/rejected": -2.1110732555389404, "logps/chosen": -214.5594024658203, "logps/rejected": -286.8577880859375, "loss": 0.6752, "rewards/accuracies": 0.5, "rewards/chosen": -0.7257100343704224, "rewards/margins": 0.190011665225029, "rewards/rejected": -0.9157217144966125, "step": 288 }, { "epoch": 0.3, "learning_rate": 4.433785552001568e-05, "logits/chosen": -2.236163854598999, "logits/rejected": -2.3334312438964844, "logps/chosen": -378.4542541503906, "logps/rejected": -426.2008361816406, "loss": 0.6507, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7116307020187378, "rewards/margins": 0.14856423437595367, "rewards/rejected": -0.860194981098175, "step": 289 }, { "epoch": 0.3, "learning_rate": 4.428018257864333e-05, "logits/chosen": -2.0460734367370605, "logits/rejected": -2.1440231800079346, "logps/chosen": -280.720703125, "logps/rejected": -361.7416687011719, "loss": 0.5533, "rewards/accuracies": 0.6875, "rewards/chosen": -0.24584394693374634, "rewards/margins": 0.45910245180130005, "rewards/rejected": -0.7049463987350464, "step": 290 }, { "epoch": 0.3, "learning_rate": 4.4222255318629294e-05, "logits/chosen": -1.7771077156066895, "logits/rejected": -2.246995210647583, "logps/chosen": -307.57232666015625, "logps/rejected": -455.2540283203125, "loss": 0.7022, "rewards/accuracies": 0.5, "rewards/chosen": -0.4229920506477356, "rewards/margins": 0.13145704567432404, "rewards/rejected": -0.5544491410255432, "step": 291 }, { "epoch": 0.3, "learning_rate": 4.4164074504073313e-05, "logits/chosen": -2.343817949295044, "logits/rejected": -2.2758233547210693, "logps/chosen": -376.11419677734375, "logps/rejected": -370.3990478515625, "loss": 0.6549, "rewards/accuracies": 0.6875, "rewards/chosen": -0.604290783405304, "rewards/margins": 0.13700008392333984, "rewards/rejected": -0.7412909269332886, "step": 292 }, { "epoch": 0.3, "learning_rate": 4.410564090241966e-05, "logits/chosen": -1.545508623123169, "logits/rejected": -1.8171346187591553, "logps/chosen": -221.09950256347656, "logps/rejected": -334.39544677734375, "loss": 0.5676, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5746042132377625, "rewards/margins": 0.4071567952632904, "rewards/rejected": -0.9817609786987305, "step": 293 }, { "epoch": 0.3, "learning_rate": 4.4046955284447044e-05, "logits/chosen": -2.077967643737793, "logits/rejected": -2.2846693992614746, "logps/chosen": -324.32476806640625, "logps/rejected": -440.4974365234375, "loss": 0.7404, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6503464579582214, "rewards/margins": 0.04034698009490967, "rewards/rejected": -0.6906934976577759, "step": 294 }, { "epoch": 0.31, "learning_rate": 4.398801842425842e-05, "logits/chosen": -2.1588635444641113, "logits/rejected": -2.1049904823303223, "logps/chosen": -406.24176025390625, "logps/rejected": -372.340087890625, "loss": 0.585, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7770822048187256, "rewards/margins": 0.31246596574783325, "rewards/rejected": -1.0895482301712036, "step": 295 }, { "epoch": 0.31, "learning_rate": 4.392883109927083e-05, "logits/chosen": -2.0670382976531982, "logits/rejected": -1.8806589841842651, "logps/chosen": -401.965087890625, "logps/rejected": -393.48516845703125, "loss": 0.6777, "rewards/accuracies": 0.625, "rewards/chosen": -0.8736264705657959, "rewards/margins": 0.1775364875793457, "rewards/rejected": -1.0511629581451416, "step": 296 }, { "epoch": 0.31, "learning_rate": 4.38693940902051e-05, "logits/chosen": -1.988671064376831, "logits/rejected": -1.9722682237625122, "logps/chosen": -387.1339111328125, "logps/rejected": -409.888916015625, "loss": 0.6198, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7169901132583618, "rewards/margins": 0.27162882685661316, "rewards/rejected": -0.9886189699172974, "step": 297 }, { "epoch": 0.31, "learning_rate": 4.3809708181075556e-05, "logits/chosen": -2.186702013015747, "logits/rejected": -2.0955049991607666, "logps/chosen": -395.58294677734375, "logps/rejected": -347.0909118652344, "loss": 0.8175, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9574447274208069, "rewards/margins": -0.11547727137804031, "rewards/rejected": -0.8419675230979919, "step": 298 }, { "epoch": 0.31, "learning_rate": 4.374977415917969e-05, "logits/chosen": -2.026996612548828, "logits/rejected": -1.772495985031128, "logps/chosen": -379.002685546875, "logps/rejected": -381.895751953125, "loss": 0.561, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6855794191360474, "rewards/margins": 0.3581579923629761, "rewards/rejected": -1.0437374114990234, "step": 299 }, { "epoch": 0.31, "learning_rate": 4.3689592815087764e-05, "logits/chosen": -1.9657152891159058, "logits/rejected": -1.8686068058013916, "logps/chosen": -367.0695495605469, "logps/rejected": -369.2501525878906, "loss": 0.7995, "rewards/accuracies": 0.4375, "rewards/chosen": -1.0888875722885132, "rewards/margins": -0.10578227788209915, "rewards/rejected": -0.9831052422523499, "step": 300 }, { "epoch": 0.31, "learning_rate": 4.3629164942632386e-05, "logits/chosen": -1.7416181564331055, "logits/rejected": -1.8769294023513794, "logps/chosen": -255.93417358398438, "logps/rejected": -316.71331787109375, "loss": 0.5563, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7211109399795532, "rewards/margins": 0.501852810382843, "rewards/rejected": -1.222963809967041, "step": 301 }, { "epoch": 0.31, "learning_rate": 4.3568491338898055e-05, "logits/chosen": -2.201251745223999, "logits/rejected": -2.0464892387390137, "logps/chosen": -273.45849609375, "logps/rejected": -306.01361083984375, "loss": 0.8629, "rewards/accuracies": 0.4375, "rewards/chosen": -0.645514965057373, "rewards/margins": -0.2321767956018448, "rewards/rejected": -0.41333818435668945, "step": 302 }, { "epoch": 0.31, "learning_rate": 4.350757280421061e-05, "logits/chosen": -1.9395544528961182, "logits/rejected": -1.923555612564087, "logps/chosen": -401.73516845703125, "logps/rejected": -390.874267578125, "loss": 0.5915, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9947636723518372, "rewards/margins": 0.2778167724609375, "rewards/rejected": -1.2725805044174194, "step": 303 }, { "epoch": 0.31, "learning_rate": 4.34464101421267e-05, "logits/chosen": -1.8383352756500244, "logits/rejected": -1.9566022157669067, "logps/chosen": -381.46258544921875, "logps/rejected": -383.20050048828125, "loss": 0.5982, "rewards/accuracies": 0.625, "rewards/chosen": -0.5637756586074829, "rewards/margins": 0.3097578287124634, "rewards/rejected": -0.8735334277153015, "step": 304 }, { "epoch": 0.32, "learning_rate": 4.338500415942319e-05, "logits/chosen": -2.0855202674865723, "logits/rejected": -1.8852460384368896, "logps/chosen": -313.52093505859375, "logps/rejected": -327.4878234863281, "loss": 0.601, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6615315079689026, "rewards/margins": 0.2937285006046295, "rewards/rejected": -0.9552599787712097, "step": 305 }, { "epoch": 0.32, "learning_rate": 4.3323355666086506e-05, "logits/chosen": -2.2466461658477783, "logits/rejected": -2.194441556930542, "logps/chosen": -385.48297119140625, "logps/rejected": -358.59783935546875, "loss": 0.7045, "rewards/accuracies": 0.375, "rewards/chosen": -0.8933424353599548, "rewards/margins": 0.060363732278347015, "rewards/rejected": -0.9537062048912048, "step": 306 }, { "epoch": 0.32, "learning_rate": 4.326146547530196e-05, "logits/chosen": -1.9754979610443115, "logits/rejected": -2.028249502182007, "logps/chosen": -392.3201599121094, "logps/rejected": -451.3328552246094, "loss": 0.5464, "rewards/accuracies": 0.75, "rewards/chosen": -0.7265093922615051, "rewards/margins": 0.5438079833984375, "rewards/rejected": -1.2703173160552979, "step": 307 }, { "epoch": 0.32, "learning_rate": 4.3199334403442976e-05, "logits/chosen": -1.9868967533111572, "logits/rejected": -1.997020959854126, "logps/chosen": -324.3955078125, "logps/rejected": -345.5137634277344, "loss": 0.8214, "rewards/accuracies": 0.4375, "rewards/chosen": -0.8653470873832703, "rewards/margins": -0.15076835453510284, "rewards/rejected": -0.7145787477493286, "step": 308 }, { "epoch": 0.32, "learning_rate": 4.313696327006042e-05, "logits/chosen": -2.213704824447632, "logits/rejected": -2.111079692840576, "logps/chosen": -404.150634765625, "logps/rejected": -373.7305603027344, "loss": 0.7839, "rewards/accuracies": 0.4375, "rewards/chosen": -0.9824564456939697, "rewards/margins": 0.003427162766456604, "rewards/rejected": -0.9858837127685547, "step": 309 }, { "epoch": 0.32, "learning_rate": 4.3074352897871686e-05, "logits/chosen": -1.8593621253967285, "logits/rejected": -2.1451752185821533, "logps/chosen": -312.2982482910156, "logps/rejected": -318.39642333984375, "loss": 0.6276, "rewards/accuracies": 0.625, "rewards/chosen": -0.7266647815704346, "rewards/margins": 0.3218391537666321, "rewards/rejected": -1.0485039949417114, "step": 310 }, { "epoch": 0.32, "learning_rate": 4.301150411274992e-05, "logits/chosen": -1.75832200050354, "logits/rejected": -1.8182921409606934, "logps/chosen": -294.1172790527344, "logps/rejected": -396.57208251953125, "loss": 0.6559, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8868988752365112, "rewards/margins": 0.2270849496126175, "rewards/rejected": -1.1139838695526123, "step": 311 }, { "epoch": 0.32, "learning_rate": 4.294841774371308e-05, "logits/chosen": -1.9476118087768555, "logits/rejected": -1.9689973592758179, "logps/chosen": -333.6837158203125, "logps/rejected": -371.0637512207031, "loss": 0.596, "rewards/accuracies": 0.625, "rewards/chosen": -0.839272677898407, "rewards/margins": 0.3770362436771393, "rewards/rejected": -1.2163089513778687, "step": 312 }, { "epoch": 0.32, "learning_rate": 4.288509462291302e-05, "logits/chosen": -2.073699474334717, "logits/rejected": -1.9370347261428833, "logps/chosen": -378.5625305175781, "logps/rejected": -395.1451416015625, "loss": 0.6107, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7811661958694458, "rewards/margins": 0.2788164019584656, "rewards/rejected": -1.0599825382232666, "step": 313 }, { "epoch": 0.33, "learning_rate": 4.2821535585624504e-05, "logits/chosen": -1.9244226217269897, "logits/rejected": -2.0157883167266846, "logps/chosen": -374.1487121582031, "logps/rejected": -391.6187438964844, "loss": 0.7111, "rewards/accuracies": 0.5, "rewards/chosen": -0.7071002721786499, "rewards/margins": 0.08099737763404846, "rewards/rejected": -0.7880975604057312, "step": 314 }, { "epoch": 0.33, "learning_rate": 4.2757741470234214e-05, "logits/chosen": -2.2348544597625732, "logits/rejected": -2.087557554244995, "logps/chosen": -326.5330810546875, "logps/rejected": -336.2727355957031, "loss": 0.6565, "rewards/accuracies": 0.625, "rewards/chosen": -0.4954121708869934, "rewards/margins": 0.1729755401611328, "rewards/rejected": -0.668387770652771, "step": 315 }, { "epoch": 0.33, "learning_rate": 4.269371311822965e-05, "logits/chosen": -2.137007236480713, "logits/rejected": -2.230994701385498, "logps/chosen": -402.22711181640625, "logps/rejected": -455.44390869140625, "loss": 0.5746, "rewards/accuracies": 0.625, "rewards/chosen": -1.0957834720611572, "rewards/margins": 0.4251984655857086, "rewards/rejected": -1.520982027053833, "step": 316 }, { "epoch": 0.33, "learning_rate": 4.2629451374188055e-05, "logits/chosen": -2.0285918712615967, "logits/rejected": -2.096553087234497, "logps/chosen": -352.289306640625, "logps/rejected": -334.0386962890625, "loss": 0.7908, "rewards/accuracies": 0.5, "rewards/chosen": -0.6512686610221863, "rewards/margins": -0.014007307589054108, "rewards/rejected": -0.6372612714767456, "step": 317 }, { "epoch": 0.33, "learning_rate": 4.256495708576527e-05, "logits/chosen": -2.0124435424804688, "logits/rejected": -2.203395128250122, "logps/chosen": -344.8824768066406, "logps/rejected": -407.80548095703125, "loss": 0.5603, "rewards/accuracies": 0.5625, "rewards/chosen": -0.666597843170166, "rewards/margins": 0.5224538445472717, "rewards/rejected": -1.1890517473220825, "step": 318 }, { "epoch": 0.33, "learning_rate": 4.250023110368457e-05, "logits/chosen": -2.0517170429229736, "logits/rejected": -2.260333299636841, "logps/chosen": -319.9532470703125, "logps/rejected": -426.59197998046875, "loss": 0.5659, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0301696062088013, "rewards/margins": 0.4518246054649353, "rewards/rejected": -1.4819941520690918, "step": 319 }, { "epoch": 0.33, "learning_rate": 4.243527428172541e-05, "logits/chosen": -1.7961515188217163, "logits/rejected": -2.0152459144592285, "logps/chosen": -338.9334411621094, "logps/rejected": -429.2481384277344, "loss": 0.6529, "rewards/accuracies": 0.625, "rewards/chosen": -0.7761648893356323, "rewards/margins": 0.3605046570301056, "rewards/rejected": -1.136669635772705, "step": 320 }, { "epoch": 0.33, "learning_rate": 4.237008747671217e-05, "logits/chosen": -2.139997720718384, "logits/rejected": -2.038111686706543, "logps/chosen": -311.0231018066406, "logps/rejected": -327.61065673828125, "loss": 0.6711, "rewards/accuracies": 0.75, "rewards/chosen": -1.0172241926193237, "rewards/margins": 0.20826146006584167, "rewards/rejected": -1.2254855632781982, "step": 321 }, { "epoch": 0.33, "learning_rate": 4.2304671548502896e-05, "logits/chosen": -1.859197735786438, "logits/rejected": -2.0383810997009277, "logps/chosen": -319.2428894042969, "logps/rejected": -297.6011657714844, "loss": 0.7805, "rewards/accuracies": 0.375, "rewards/chosen": -0.34115666151046753, "rewards/margins": -0.06757514923810959, "rewards/rejected": -0.27358150482177734, "step": 322 }, { "epoch": 0.33, "learning_rate": 4.223902735997788e-05, "logits/chosen": -2.063511371612549, "logits/rejected": -2.0269269943237305, "logps/chosen": -329.1101379394531, "logps/rejected": -359.3594665527344, "loss": 0.5742, "rewards/accuracies": 0.75, "rewards/chosen": -0.6491307616233826, "rewards/margins": 0.3211410343647003, "rewards/rejected": -0.9702718257904053, "step": 323 }, { "epoch": 0.34, "learning_rate": 4.217315577702836e-05, "logits/chosen": -2.3090927600860596, "logits/rejected": -2.2255845069885254, "logps/chosen": -438.4259338378906, "logps/rejected": -385.08880615234375, "loss": 0.7974, "rewards/accuracies": 0.5, "rewards/chosen": -0.6954346895217896, "rewards/margins": 0.034093111753463745, "rewards/rejected": -0.7295278310775757, "step": 324 }, { "epoch": 0.34, "learning_rate": 4.2107057668545044e-05, "logits/chosen": -2.138420581817627, "logits/rejected": -2.3774194717407227, "logps/chosen": -241.03421020507812, "logps/rejected": -291.4215087890625, "loss": 0.5889, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7353395223617554, "rewards/margins": 0.4763728976249695, "rewards/rejected": -1.2117124795913696, "step": 325 }, { "epoch": 0.34, "learning_rate": 4.204073390640666e-05, "logits/chosen": -2.095376968383789, "logits/rejected": -2.1427035331726074, "logps/chosen": -359.9441223144531, "logps/rejected": -460.4444580078125, "loss": 0.6791, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9195146560668945, "rewards/margins": 0.12283174693584442, "rewards/rejected": -1.042346477508545, "step": 326 }, { "epoch": 0.34, "learning_rate": 4.1974185365468467e-05, "logits/chosen": -2.082658052444458, "logits/rejected": -2.0861730575561523, "logps/chosen": -364.9388427734375, "logps/rejected": -403.16510009765625, "loss": 0.6084, "rewards/accuracies": 0.625, "rewards/chosen": -0.5946727991104126, "rewards/margins": 0.3608367443084717, "rewards/rejected": -0.955509603023529, "step": 327 }, { "epoch": 0.34, "learning_rate": 4.19074129235507e-05, "logits/chosen": -2.088836431503296, "logits/rejected": -2.0834221839904785, "logps/chosen": -319.285888671875, "logps/rejected": -320.6772766113281, "loss": 0.8216, "rewards/accuracies": 0.4375, "rewards/chosen": -0.882214367389679, "rewards/margins": -0.12802943587303162, "rewards/rejected": -0.7541849613189697, "step": 328 }, { "epoch": 0.34, "learning_rate": 4.184041746142702e-05, "logits/chosen": -2.211498498916626, "logits/rejected": -2.1443381309509277, "logps/chosen": -379.1222229003906, "logps/rejected": -416.7032470703125, "loss": 0.7431, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7400021553039551, "rewards/margins": -0.029454410076141357, "rewards/rejected": -0.7105477452278137, "step": 329 }, { "epoch": 0.34, "learning_rate": 4.177319986281285e-05, "logits/chosen": -1.9428646564483643, "logits/rejected": -2.0589828491210938, "logps/chosen": -338.52618408203125, "logps/rejected": -396.5542297363281, "loss": 0.6202, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3697546124458313, "rewards/margins": 0.2811446785926819, "rewards/rejected": -0.6508992314338684, "step": 330 }, { "epoch": 0.34, "learning_rate": 4.170576101435376e-05, "logits/chosen": -2.2952022552490234, "logits/rejected": -2.321949005126953, "logps/chosen": -275.3761901855469, "logps/rejected": -350.7015075683594, "loss": 0.6968, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3928203880786896, "rewards/margins": 0.07630396634340286, "rewards/rejected": -0.46912431716918945, "step": 331 }, { "epoch": 0.34, "learning_rate": 4.163810180561376e-05, "logits/chosen": -1.8680933713912964, "logits/rejected": -2.0927417278289795, "logps/chosen": -303.8800354003906, "logps/rejected": -341.4588928222656, "loss": 0.7411, "rewards/accuracies": 0.5, "rewards/chosen": -0.6674574613571167, "rewards/margins": -0.02869322896003723, "rewards/rejected": -0.6387642621994019, "step": 332 }, { "epoch": 0.34, "learning_rate": 4.157022312906352e-05, "logits/chosen": -1.946225881576538, "logits/rejected": -1.9390500783920288, "logps/chosen": -340.5247497558594, "logps/rejected": -309.8658447265625, "loss": 0.6641, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6571869850158691, "rewards/margins": 0.13291773200035095, "rewards/rejected": -0.7901047468185425, "step": 333 }, { "epoch": 0.35, "learning_rate": 4.150212588006871e-05, "logits/chosen": -2.1470143795013428, "logits/rejected": -2.4466309547424316, "logps/chosen": -355.8979797363281, "logps/rejected": -360.3694763183594, "loss": 0.7295, "rewards/accuracies": 0.4375, "rewards/chosen": -0.8183680772781372, "rewards/margins": -0.04079904779791832, "rewards/rejected": -0.7775689363479614, "step": 334 }, { "epoch": 0.35, "learning_rate": 4.143381095687805e-05, "logits/chosen": -1.8094087839126587, "logits/rejected": -2.024905204772949, "logps/chosen": -291.0030517578125, "logps/rejected": -401.3894348144531, "loss": 0.5393, "rewards/accuracies": 0.8125, "rewards/chosen": -0.30385422706604004, "rewards/margins": 0.38516706228256226, "rewards/rejected": -0.6890213489532471, "step": 335 }, { "epoch": 0.35, "learning_rate": 4.136527926061157e-05, "logits/chosen": -2.2869348526000977, "logits/rejected": -2.3848395347595215, "logps/chosen": -354.1495361328125, "logps/rejected": -422.5235900878906, "loss": 0.7545, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5594637393951416, "rewards/margins": -0.08133678883314133, "rewards/rejected": -0.4781269133090973, "step": 336 }, { "epoch": 0.35, "learning_rate": 4.1296531695248666e-05, "logits/chosen": -2.1156575679779053, "logits/rejected": -2.0625457763671875, "logps/chosen": -420.004150390625, "logps/rejected": -347.8268737792969, "loss": 0.7191, "rewards/accuracies": 0.5, "rewards/chosen": -0.7436313033103943, "rewards/margins": 0.03417450189590454, "rewards/rejected": -0.777805745601654, "step": 337 }, { "epoch": 0.35, "learning_rate": 4.1227569167616206e-05, "logits/chosen": -1.9652526378631592, "logits/rejected": -2.0387110710144043, "logps/chosen": -294.15997314453125, "logps/rejected": -333.1921081542969, "loss": 0.6429, "rewards/accuracies": 0.75, "rewards/chosen": -0.547760009765625, "rewards/margins": 0.18485116958618164, "rewards/rejected": -0.7326111793518066, "step": 338 }, { "epoch": 0.35, "learning_rate": 4.1158392587376536e-05, "logits/chosen": -1.9591008424758911, "logits/rejected": -1.9757970571517944, "logps/chosen": -308.64453125, "logps/rejected": -373.3291320800781, "loss": 0.7154, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3303202986717224, "rewards/margins": 0.0813257172703743, "rewards/rejected": -0.4116460382938385, "step": 339 }, { "epoch": 0.35, "learning_rate": 4.108900286701552e-05, "logits/chosen": -2.0622799396514893, "logits/rejected": -2.0668537616729736, "logps/chosen": -209.19119262695312, "logps/rejected": -266.43145751953125, "loss": 0.6393, "rewards/accuracies": 0.625, "rewards/chosen": -0.61204993724823, "rewards/margins": 0.28989237546920776, "rewards/rejected": -0.901942253112793, "step": 340 }, { "epoch": 0.35, "learning_rate": 4.101940092183048e-05, "logits/chosen": -2.208672523498535, "logits/rejected": -2.2501301765441895, "logps/chosen": -449.76763916015625, "logps/rejected": -333.26617431640625, "loss": 0.735, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4438176155090332, "rewards/margins": 0.0284462571144104, "rewards/rejected": -0.472263902425766, "step": 341 }, { "epoch": 0.35, "learning_rate": 4.0949587669918124e-05, "logits/chosen": -2.2545619010925293, "logits/rejected": -2.284487009048462, "logps/chosen": -368.6788635253906, "logps/rejected": -404.1095275878906, "loss": 0.6716, "rewards/accuracies": 0.5, "rewards/chosen": -0.6989068984985352, "rewards/margins": 0.13742834329605103, "rewards/rejected": -0.8363352417945862, "step": 342 }, { "epoch": 0.36, "learning_rate": 4.087956403216243e-05, "logits/chosen": -2.098728895187378, "logits/rejected": -1.9426969289779663, "logps/chosen": -364.287109375, "logps/rejected": -345.9195556640625, "loss": 0.6768, "rewards/accuracies": 0.5, "rewards/chosen": -0.5762618780136108, "rewards/margins": 0.06659980118274689, "rewards/rejected": -0.6428617238998413, "step": 343 }, { "epoch": 0.36, "learning_rate": 4.0809330932222525e-05, "logits/chosen": -2.0413217544555664, "logits/rejected": -1.770898699760437, "logps/chosen": -338.1258850097656, "logps/rejected": -310.4683837890625, "loss": 0.6924, "rewards/accuracies": 0.5625, "rewards/chosen": -0.612076461315155, "rewards/margins": 0.09379884600639343, "rewards/rejected": -0.7058753371238708, "step": 344 }, { "epoch": 0.36, "learning_rate": 4.073888929652048e-05, "logits/chosen": -1.8582487106323242, "logits/rejected": -1.9838684797286987, "logps/chosen": -279.77764892578125, "logps/rejected": -287.03314208984375, "loss": 0.6344, "rewards/accuracies": 0.5, "rewards/chosen": -0.22681152820587158, "rewards/margins": 0.17675894498825073, "rewards/rejected": -0.4035705029964447, "step": 345 }, { "epoch": 0.36, "learning_rate": 4.066824005422907e-05, "logits/chosen": -2.2785511016845703, "logits/rejected": -2.248875379562378, "logps/chosen": -282.7828369140625, "logps/rejected": -282.3833923339844, "loss": 0.6969, "rewards/accuracies": 0.625, "rewards/chosen": -0.5595721006393433, "rewards/margins": 0.007087539881467819, "rewards/rejected": -0.5666596293449402, "step": 346 }, { "epoch": 0.36, "learning_rate": 4.0597384137259576e-05, "logits/chosen": -1.8838849067687988, "logits/rejected": -1.992185115814209, "logps/chosen": -249.31353759765625, "logps/rejected": -291.52215576171875, "loss": 0.639, "rewards/accuracies": 0.75, "rewards/chosen": -0.25019168853759766, "rewards/margins": 0.13961070775985718, "rewards/rejected": -0.3898024260997772, "step": 347 }, { "epoch": 0.36, "learning_rate": 4.052632248024943e-05, "logits/chosen": -2.194199562072754, "logits/rejected": -2.2093465328216553, "logps/chosen": -359.925048828125, "logps/rejected": -341.787109375, "loss": 0.6809, "rewards/accuracies": 0.625, "rewards/chosen": -0.44080251455307007, "rewards/margins": 0.10257270187139511, "rewards/rejected": -0.5433753132820129, "step": 348 }, { "epoch": 0.36, "learning_rate": 4.045505602054994e-05, "logits/chosen": -1.9952166080474854, "logits/rejected": -1.9181216955184937, "logps/chosen": -272.50146484375, "logps/rejected": -296.2552795410156, "loss": 0.7549, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5810081958770752, "rewards/margins": -0.0541771724820137, "rewards/rejected": -0.5268309712409973, "step": 349 }, { "epoch": 0.36, "learning_rate": 4.0383585698213876e-05, "logits/chosen": -2.166259527206421, "logits/rejected": -1.9561712741851807, "logps/chosen": -392.21014404296875, "logps/rejected": -360.259033203125, "loss": 0.723, "rewards/accuracies": 0.625, "rewards/chosen": -0.7585784196853638, "rewards/margins": 0.024934954941272736, "rewards/rejected": -0.7835134267807007, "step": 350 }, { "epoch": 0.36, "learning_rate": 4.03119124559831e-05, "logits/chosen": -2.117828845977783, "logits/rejected": -2.2696075439453125, "logps/chosen": -370.1112060546875, "logps/rejected": -357.4794921875, "loss": 0.7401, "rewards/accuracies": 0.5, "rewards/chosen": -0.7355208396911621, "rewards/margins": 0.01835598051548004, "rewards/rejected": -0.7538768649101257, "step": 351 }, { "epoch": 0.36, "learning_rate": 4.024003723927614e-05, "logits/chosen": -2.15438175201416, "logits/rejected": -2.240237236022949, "logps/chosen": -291.9135437011719, "logps/rejected": -306.6573791503906, "loss": 0.6362, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6013116836547852, "rewards/margins": 0.29444190859794617, "rewards/rejected": -0.8957535028457642, "step": 352 }, { "epoch": 0.37, "learning_rate": 4.016796099617569e-05, "logits/chosen": -2.123490571975708, "logits/rejected": -1.9996310472488403, "logps/chosen": -320.6844787597656, "logps/rejected": -328.69732666015625, "loss": 0.7201, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6908583045005798, "rewards/margins": 0.018049392849206924, "rewards/rejected": -0.7089077234268188, "step": 353 }, { "epoch": 0.37, "learning_rate": 4.009568467741611e-05, "logits/chosen": -2.1538658142089844, "logits/rejected": -2.2801826000213623, "logps/chosen": -332.2469482421875, "logps/rejected": -398.9875793457031, "loss": 0.6108, "rewards/accuracies": 0.625, "rewards/chosen": -0.35775309801101685, "rewards/margins": 0.2206544280052185, "rewards/rejected": -0.5784075260162354, "step": 354 }, { "epoch": 0.37, "learning_rate": 4.0023209236370905e-05, "logits/chosen": -2.1057112216949463, "logits/rejected": -1.9648573398590088, "logps/chosen": -304.4049377441406, "logps/rejected": -331.1800842285156, "loss": 0.5937, "rewards/accuracies": 0.625, "rewards/chosen": -0.4884049594402313, "rewards/margins": 0.28548693656921387, "rewards/rejected": -0.7738919258117676, "step": 355 }, { "epoch": 0.37, "learning_rate": 3.9950535629040154e-05, "logits/chosen": -2.075382947921753, "logits/rejected": -2.061739206314087, "logps/chosen": -287.4841003417969, "logps/rejected": -279.5746765136719, "loss": 0.6667, "rewards/accuracies": 0.5625, "rewards/chosen": -0.42838478088378906, "rewards/margins": 0.09299005568027496, "rewards/rejected": -0.5213748216629028, "step": 356 }, { "epoch": 0.37, "learning_rate": 3.9877664814037844e-05, "logits/chosen": -2.041006088256836, "logits/rejected": -2.146519899368286, "logps/chosen": -234.5111083984375, "logps/rejected": -315.4188232421875, "loss": 0.5746, "rewards/accuracies": 0.75, "rewards/chosen": -0.44257616996765137, "rewards/margins": 0.30319535732269287, "rewards/rejected": -0.745771586894989, "step": 357 }, { "epoch": 0.37, "learning_rate": 3.98045977525793e-05, "logits/chosen": -1.9708685874938965, "logits/rejected": -2.119077444076538, "logps/chosen": -233.92739868164062, "logps/rejected": -251.74658203125, "loss": 0.7386, "rewards/accuracies": 0.5, "rewards/chosen": -0.43048954010009766, "rewards/margins": 0.0007461756467819214, "rewards/rejected": -0.43123573064804077, "step": 358 }, { "epoch": 0.37, "learning_rate": 3.973133540846844e-05, "logits/chosen": -2.2124104499816895, "logits/rejected": -2.3888068199157715, "logps/chosen": -378.053955078125, "logps/rejected": -432.74102783203125, "loss": 0.6094, "rewards/accuracies": 0.625, "rewards/chosen": -0.4846843481063843, "rewards/margins": 0.2898719012737274, "rewards/rejected": -0.7745562195777893, "step": 359 }, { "epoch": 0.37, "learning_rate": 3.965787874808513e-05, "logits/chosen": -2.182685375213623, "logits/rejected": -2.245464324951172, "logps/chosen": -303.55377197265625, "logps/rejected": -304.9583435058594, "loss": 0.7466, "rewards/accuracies": 0.5, "rewards/chosen": -0.72776859998703, "rewards/margins": -0.010826468467712402, "rewards/rejected": -0.7169421911239624, "step": 360 }, { "epoch": 0.37, "learning_rate": 3.958422874037236e-05, "logits/chosen": -2.2944741249084473, "logits/rejected": -2.1541600227355957, "logps/chosen": -325.5491638183594, "logps/rejected": -359.56488037109375, "loss": 0.6708, "rewards/accuracies": 0.625, "rewards/chosen": -0.6380666494369507, "rewards/margins": 0.19594302773475647, "rewards/rejected": -0.8340096473693848, "step": 361 }, { "epoch": 0.37, "learning_rate": 3.951038635682353e-05, "logits/chosen": -2.183659553527832, "logits/rejected": -2.309375762939453, "logps/chosen": -233.40463256835938, "logps/rejected": -258.43792724609375, "loss": 0.5859, "rewards/accuracies": 0.6875, "rewards/chosen": -0.34500959515571594, "rewards/margins": 0.3104623556137085, "rewards/rejected": -0.655471920967102, "step": 362 }, { "epoch": 0.38, "learning_rate": 3.943635257146958e-05, "logits/chosen": -2.189570903778076, "logits/rejected": -2.307717800140381, "logps/chosen": -310.58837890625, "logps/rejected": -396.47412109375, "loss": 0.5968, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5562077760696411, "rewards/margins": 0.26935189962387085, "rewards/rejected": -0.825559675693512, "step": 363 }, { "epoch": 0.38, "learning_rate": 3.936212836086621e-05, "logits/chosen": -2.087996244430542, "logits/rejected": -2.061500310897827, "logps/chosen": -320.0885009765625, "logps/rejected": -367.2314453125, "loss": 0.5962, "rewards/accuracies": 0.75, "rewards/chosen": -0.5524263381958008, "rewards/margins": 0.3210427165031433, "rewards/rejected": -0.8734689354896545, "step": 364 }, { "epoch": 0.38, "learning_rate": 3.9287714704080916e-05, "logits/chosen": -2.2036032676696777, "logits/rejected": -2.247941017150879, "logps/chosen": -310.3907470703125, "logps/rejected": -373.12005615234375, "loss": 0.6413, "rewards/accuracies": 0.625, "rewards/chosen": -0.7271711230278015, "rewards/margins": 0.1365385353565216, "rewards/rejected": -0.8637096881866455, "step": 365 }, { "epoch": 0.38, "learning_rate": 3.9213112582680136e-05, "logits/chosen": -2.071441888809204, "logits/rejected": -2.2175564765930176, "logps/chosen": -350.5240478515625, "logps/rejected": -344.10015869140625, "loss": 0.819, "rewards/accuracies": 0.4375, "rewards/chosen": -0.7361069321632385, "rewards/margins": -0.15310856699943542, "rewards/rejected": -0.5829984545707703, "step": 366 }, { "epoch": 0.38, "learning_rate": 3.913832298071629e-05, "logits/chosen": -2.137769937515259, "logits/rejected": -2.1863951683044434, "logps/chosen": -261.62445068359375, "logps/rejected": -287.54925537109375, "loss": 0.5843, "rewards/accuracies": 0.625, "rewards/chosen": -0.5611026287078857, "rewards/margins": 0.35456323623657227, "rewards/rejected": -0.915665864944458, "step": 367 }, { "epoch": 0.38, "learning_rate": 3.906334688471479e-05, "logits/chosen": -2.322150230407715, "logits/rejected": -2.2364730834960938, "logps/chosen": -372.37115478515625, "logps/rejected": -441.963623046875, "loss": 0.7185, "rewards/accuracies": 0.375, "rewards/chosen": -0.7501680850982666, "rewards/margins": 0.004912780597805977, "rewards/rejected": -0.7550809383392334, "step": 368 }, { "epoch": 0.38, "learning_rate": 3.8988185283661006e-05, "logits/chosen": -2.3357200622558594, "logits/rejected": -2.281803846359253, "logps/chosen": -349.5588073730469, "logps/rejected": -427.9027099609375, "loss": 0.6668, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6943049430847168, "rewards/margins": 0.08358533680438995, "rewards/rejected": -0.7778902649879456, "step": 369 }, { "epoch": 0.38, "learning_rate": 3.8912839168987286e-05, "logits/chosen": -2.0027129650115967, "logits/rejected": -2.201495885848999, "logps/chosen": -335.1200866699219, "logps/rejected": -356.2291259765625, "loss": 0.7201, "rewards/accuracies": 0.375, "rewards/chosen": -0.825676441192627, "rewards/margins": 0.053414199501276016, "rewards/rejected": -0.8790906667709351, "step": 370 }, { "epoch": 0.38, "learning_rate": 3.883730953455981e-05, "logits/chosen": -2.0758540630340576, "logits/rejected": -2.2228283882141113, "logps/chosen": -330.1047058105469, "logps/rejected": -359.3675842285156, "loss": 0.6519, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9475977420806885, "rewards/margins": 0.2040516585111618, "rewards/rejected": -1.1516493558883667, "step": 371 }, { "epoch": 0.39, "learning_rate": 3.876159737666551e-05, "logits/chosen": -2.0982770919799805, "logits/rejected": -2.1729726791381836, "logps/chosen": -386.24560546875, "logps/rejected": -394.7810974121094, "loss": 0.7604, "rewards/accuracies": 0.4375, "rewards/chosen": -0.8945655226707458, "rewards/margins": -0.06084037944674492, "rewards/rejected": -0.8337251543998718, "step": 372 }, { "epoch": 0.39, "learning_rate": 3.868570369399894e-05, "logits/chosen": -2.116682767868042, "logits/rejected": -2.1849117279052734, "logps/chosen": -251.51715087890625, "logps/rejected": -259.1833801269531, "loss": 0.6078, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6918532252311707, "rewards/margins": 0.33846980333328247, "rewards/rejected": -1.0303230285644531, "step": 373 }, { "epoch": 0.39, "learning_rate": 3.860962948764906e-05, "logits/chosen": -2.0886361598968506, "logits/rejected": -2.046086549758911, "logps/chosen": -292.8084411621094, "logps/rejected": -357.3808898925781, "loss": 0.6235, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5758143067359924, "rewards/margins": 0.24546638131141663, "rewards/rejected": -0.8212807178497314, "step": 374 }, { "epoch": 0.39, "learning_rate": 3.85333757610861e-05, "logits/chosen": -2.0787835121154785, "logits/rejected": -2.094371795654297, "logps/chosen": -318.9442443847656, "logps/rejected": -354.12188720703125, "loss": 0.6207, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6455579996109009, "rewards/margins": 0.18885663151741028, "rewards/rejected": -0.8344146013259888, "step": 375 }, { "epoch": 0.39, "learning_rate": 3.845694352014825e-05, "logits/chosen": -2.0175139904022217, "logits/rejected": -2.0191752910614014, "logps/chosen": -341.6969299316406, "logps/rejected": -383.448974609375, "loss": 0.7511, "rewards/accuracies": 0.625, "rewards/chosen": -0.768385648727417, "rewards/margins": -0.034262366592884064, "rewards/rejected": -0.7341232895851135, "step": 376 }, { "epoch": 0.39, "learning_rate": 3.838033377302844e-05, "logits/chosen": -2.143493413925171, "logits/rejected": -2.2795639038085938, "logps/chosen": -304.6917724609375, "logps/rejected": -347.99774169921875, "loss": 0.7008, "rewards/accuracies": 0.5, "rewards/chosen": -0.7183622717857361, "rewards/margins": 0.0656728744506836, "rewards/rejected": -0.7840351462364197, "step": 377 }, { "epoch": 0.39, "learning_rate": 3.830354753026102e-05, "logits/chosen": -2.0630643367767334, "logits/rejected": -2.1921281814575195, "logps/chosen": -290.36041259765625, "logps/rejected": -370.9847717285156, "loss": 0.6672, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6518306732177734, "rewards/margins": 0.20937411487102509, "rewards/rejected": -0.8612047433853149, "step": 378 }, { "epoch": 0.39, "learning_rate": 3.8226585804708435e-05, "logits/chosen": -2.1670899391174316, "logits/rejected": -2.202104330062866, "logps/chosen": -399.4073486328125, "logps/rejected": -372.1466979980469, "loss": 0.7435, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7544846534729004, "rewards/margins": -0.017259221524000168, "rewards/rejected": -0.7372254133224487, "step": 379 }, { "epoch": 0.39, "learning_rate": 3.8149449611547886e-05, "logits/chosen": -2.1285555362701416, "logits/rejected": -2.118056297302246, "logps/chosen": -325.7785949707031, "logps/rejected": -354.8650817871094, "loss": 0.6628, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7010979056358337, "rewards/margins": 0.0963049978017807, "rewards/rejected": -0.7974028587341309, "step": 380 }, { "epoch": 0.39, "learning_rate": 3.807213996825788e-05, "logits/chosen": -2.134826183319092, "logits/rejected": -2.105046272277832, "logps/chosen": -347.5355529785156, "logps/rejected": -370.0594482421875, "loss": 0.5515, "rewards/accuracies": 0.75, "rewards/chosen": -0.640611469745636, "rewards/margins": 0.43629205226898193, "rewards/rejected": -1.0769035816192627, "step": 381 }, { "epoch": 0.4, "learning_rate": 3.7994657894604906e-05, "logits/chosen": -1.999627709388733, "logits/rejected": -2.0539674758911133, "logps/chosen": -324.84100341796875, "logps/rejected": -294.4129333496094, "loss": 0.5771, "rewards/accuracies": 0.75, "rewards/chosen": -0.6821228265762329, "rewards/margins": 0.3325861096382141, "rewards/rejected": -1.0147089958190918, "step": 382 }, { "epoch": 0.4, "learning_rate": 3.791700441262987e-05, "logits/chosen": -2.2994258403778076, "logits/rejected": -2.4870550632476807, "logps/chosen": -308.85284423828125, "logps/rejected": -387.4725341796875, "loss": 0.5876, "rewards/accuracies": 0.625, "rewards/chosen": -0.8057706356048584, "rewards/margins": 0.3048417270183563, "rewards/rejected": -1.110612392425537, "step": 383 }, { "epoch": 0.4, "learning_rate": 3.78391805466347e-05, "logits/chosen": -2.0464038848876953, "logits/rejected": -1.9356979131698608, "logps/chosen": -348.22613525390625, "logps/rejected": -336.2864685058594, "loss": 0.7458, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8788054585456848, "rewards/margins": 0.08573350310325623, "rewards/rejected": -0.9645389914512634, "step": 384 }, { "epoch": 0.4, "learning_rate": 3.7761187323168804e-05, "logits/chosen": -2.0775394439697266, "logits/rejected": -2.0725295543670654, "logps/chosen": -378.3978576660156, "logps/rejected": -367.72894287109375, "loss": 0.7507, "rewards/accuracies": 0.5, "rewards/chosen": -0.8533260822296143, "rewards/margins": -0.06236880645155907, "rewards/rejected": -0.7909572124481201, "step": 385 }, { "epoch": 0.4, "learning_rate": 3.7683025771015515e-05, "logits/chosen": -2.129138946533203, "logits/rejected": -2.243818521499634, "logps/chosen": -343.4689636230469, "logps/rejected": -372.4132995605469, "loss": 0.6416, "rewards/accuracies": 0.6875, "rewards/chosen": -0.645578145980835, "rewards/margins": 0.2473578155040741, "rewards/rejected": -0.8929359912872314, "step": 386 }, { "epoch": 0.4, "learning_rate": 3.760469692117854e-05, "logits/chosen": -2.071223735809326, "logits/rejected": -1.9962562322616577, "logps/chosen": -256.22509765625, "logps/rejected": -263.2807312011719, "loss": 0.6128, "rewards/accuracies": 0.5625, "rewards/chosen": -0.746870219707489, "rewards/margins": 0.27368226647377014, "rewards/rejected": -1.0205525159835815, "step": 387 }, { "epoch": 0.4, "learning_rate": 3.752620180686837e-05, "logits/chosen": -2.195363998413086, "logits/rejected": -2.3401169776916504, "logps/chosen": -329.45050048828125, "logps/rejected": -367.7939758300781, "loss": 0.6203, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8209664821624756, "rewards/margins": 0.39947718381881714, "rewards/rejected": -1.2204437255859375, "step": 388 }, { "epoch": 0.4, "learning_rate": 3.744754146348862e-05, "logits/chosen": -2.3540046215057373, "logits/rejected": -2.102443218231201, "logps/chosen": -439.45501708984375, "logps/rejected": -325.82647705078125, "loss": 0.7747, "rewards/accuracies": 0.5, "rewards/chosen": -1.0380897521972656, "rewards/margins": -0.03507265821099281, "rewards/rejected": -1.0030171871185303, "step": 389 }, { "epoch": 0.4, "learning_rate": 3.736871692862239e-05, "logits/chosen": -2.0107619762420654, "logits/rejected": -2.133056879043579, "logps/chosen": -280.13372802734375, "logps/rejected": -372.2601623535156, "loss": 0.5378, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8871760368347168, "rewards/margins": 0.42683565616607666, "rewards/rejected": -1.3140116930007935, "step": 390 }, { "epoch": 0.4, "learning_rate": 3.7289729242018586e-05, "logits/chosen": -2.216970682144165, "logits/rejected": -2.1814701557159424, "logps/chosen": -248.73269653320312, "logps/rejected": -246.78436279296875, "loss": 0.4933, "rewards/accuracies": 0.75, "rewards/chosen": -0.7154771089553833, "rewards/margins": 0.5849171280860901, "rewards/rejected": -1.3003942966461182, "step": 391 }, { "epoch": 0.41, "learning_rate": 3.721057944557819e-05, "logits/chosen": -2.026475667953491, "logits/rejected": -2.0671896934509277, "logps/chosen": -298.8050842285156, "logps/rejected": -324.1427307128906, "loss": 0.5981, "rewards/accuracies": 0.6875, "rewards/chosen": -0.523551344871521, "rewards/margins": 0.30002254247665405, "rewards/rejected": -0.823573887348175, "step": 392 }, { "epoch": 0.41, "learning_rate": 3.713126858334052e-05, "logits/chosen": -1.7506847381591797, "logits/rejected": -1.871716022491455, "logps/chosen": -272.1562194824219, "logps/rejected": -365.52752685546875, "loss": 0.5324, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7902772426605225, "rewards/margins": 0.6367801427841187, "rewards/rejected": -1.4270575046539307, "step": 393 }, { "epoch": 0.41, "learning_rate": 3.705179770146946e-05, "logits/chosen": -2.215156316757202, "logits/rejected": -2.17736554145813, "logps/chosen": -338.9674987792969, "logps/rejected": -318.7024230957031, "loss": 0.7397, "rewards/accuracies": 0.25, "rewards/chosen": -0.7532888650894165, "rewards/margins": -0.0309628713876009, "rewards/rejected": -0.7223260402679443, "step": 394 }, { "epoch": 0.41, "learning_rate": 3.697216784823967e-05, "logits/chosen": -2.0116758346557617, "logits/rejected": -2.1581830978393555, "logps/chosen": -242.60418701171875, "logps/rejected": -268.9742431640625, "loss": 0.8924, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1447850465774536, "rewards/margins": -0.10007806122303009, "rewards/rejected": -1.0447068214416504, "step": 395 }, { "epoch": 0.41, "learning_rate": 3.689238007402275e-05, "logits/chosen": -2.0335168838500977, "logits/rejected": -1.962857723236084, "logps/chosen": -239.37826538085938, "logps/rejected": -250.44223022460938, "loss": 0.7472, "rewards/accuracies": 0.4375, "rewards/chosen": -0.880851149559021, "rewards/margins": 0.022362351417541504, "rewards/rejected": -0.9032134413719177, "step": 396 }, { "epoch": 0.41, "learning_rate": 3.6812435431273374e-05, "logits/chosen": -2.365676164627075, "logits/rejected": -2.3459362983703613, "logps/chosen": -479.81549072265625, "logps/rejected": -578.3353271484375, "loss": 0.586, "rewards/accuracies": 0.625, "rewards/chosen": -0.9297402501106262, "rewards/margins": 0.4031886160373688, "rewards/rejected": -1.3329288959503174, "step": 397 }, { "epoch": 0.41, "learning_rate": 3.673233497451541e-05, "logits/chosen": -2.2611470222473145, "logits/rejected": -2.1032679080963135, "logps/chosen": -301.1606140136719, "logps/rejected": -320.68115234375, "loss": 0.9109, "rewards/accuracies": 0.3125, "rewards/chosen": -1.3444305658340454, "rewards/margins": -0.23366227746009827, "rewards/rejected": -1.1107683181762695, "step": 398 }, { "epoch": 0.41, "learning_rate": 3.665207976032804e-05, "logits/chosen": -2.068887948989868, "logits/rejected": -2.339625835418701, "logps/chosen": -389.18975830078125, "logps/rejected": -526.3670654296875, "loss": 0.526, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0946829319000244, "rewards/margins": 0.6931131482124329, "rewards/rejected": -1.7877960205078125, "step": 399 }, { "epoch": 0.41, "learning_rate": 3.65716708473318e-05, "logits/chosen": -2.0643274784088135, "logits/rejected": -1.928961992263794, "logps/chosen": -350.87506103515625, "logps/rejected": -305.41510009765625, "loss": 0.8925, "rewards/accuracies": 0.375, "rewards/chosen": -1.5230122804641724, "rewards/margins": -0.1989414244890213, "rewards/rejected": -1.324070930480957, "step": 400 }, { "epoch": 0.42, "learning_rate": 3.64911092961746e-05, "logits/chosen": -1.9068621397018433, "logits/rejected": -2.146777629852295, "logps/chosen": -401.4383544921875, "logps/rejected": -405.74755859375, "loss": 0.6681, "rewards/accuracies": 0.625, "rewards/chosen": -1.1909916400909424, "rewards/margins": 0.21687617897987366, "rewards/rejected": -1.4078677892684937, "step": 401 }, { "epoch": 0.42, "learning_rate": 3.641039616951776e-05, "logits/chosen": -1.996084451675415, "logits/rejected": -1.9715969562530518, "logps/chosen": -285.1630859375, "logps/rejected": -260.9218444824219, "loss": 0.7071, "rewards/accuracies": 0.5, "rewards/chosen": -1.1817179918289185, "rewards/margins": 0.04514620825648308, "rewards/rejected": -1.226864218711853, "step": 402 }, { "epoch": 0.42, "learning_rate": 3.632953253202199e-05, "logits/chosen": -1.890580177307129, "logits/rejected": -2.0182905197143555, "logps/chosen": -329.42694091796875, "logps/rejected": -450.1892395019531, "loss": 0.6209, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0331366062164307, "rewards/margins": 0.35443419218063354, "rewards/rejected": -1.3875707387924194, "step": 403 }, { "epoch": 0.42, "learning_rate": 3.6248519450333315e-05, "logits/chosen": -2.3349204063415527, "logits/rejected": -2.1745338439941406, "logps/chosen": -383.373291015625, "logps/rejected": -421.9166259765625, "loss": 0.7308, "rewards/accuracies": 0.625, "rewards/chosen": -1.1230090856552124, "rewards/margins": 0.11969134211540222, "rewards/rejected": -1.2427003383636475, "step": 404 }, { "epoch": 0.42, "learning_rate": 3.6167357993069075e-05, "logits/chosen": -2.124786615371704, "logits/rejected": -2.2353270053863525, "logps/chosen": -398.7025146484375, "logps/rejected": -462.741455078125, "loss": 0.6585, "rewards/accuracies": 0.625, "rewards/chosen": -1.0880743265151978, "rewards/margins": 0.24901491403579712, "rewards/rejected": -1.3370893001556396, "step": 405 }, { "epoch": 0.42, "learning_rate": 3.608604923080373e-05, "logits/chosen": -2.124338150024414, "logits/rejected": -1.9945569038391113, "logps/chosen": -410.68701171875, "logps/rejected": -351.4372863769531, "loss": 0.6415, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3134312629699707, "rewards/margins": 0.23311342298984528, "rewards/rejected": -1.5465446710586548, "step": 406 }, { "epoch": 0.42, "learning_rate": 3.6004594236054836e-05, "logits/chosen": -1.9870651960372925, "logits/rejected": -1.9105538129806519, "logps/chosen": -301.83245849609375, "logps/rejected": -309.3512268066406, "loss": 0.5724, "rewards/accuracies": 0.625, "rewards/chosen": -0.9611720442771912, "rewards/margins": 0.3760131001472473, "rewards/rejected": -1.337185263633728, "step": 407 }, { "epoch": 0.42, "learning_rate": 3.592299408326883e-05, "logits/chosen": -2.201324462890625, "logits/rejected": -2.078688383102417, "logps/chosen": -331.58795166015625, "logps/rejected": -408.33770751953125, "loss": 0.5669, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2536283731460571, "rewards/margins": 0.36857056617736816, "rewards/rejected": -1.6221990585327148, "step": 408 }, { "epoch": 0.42, "learning_rate": 3.584124984880689e-05, "logits/chosen": -1.9430594444274902, "logits/rejected": -2.1800172328948975, "logps/chosen": -255.10409545898438, "logps/rejected": -318.5274658203125, "loss": 0.4678, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7731481194496155, "rewards/margins": 0.704475462436676, "rewards/rejected": -1.4776235818862915, "step": 409 }, { "epoch": 0.42, "learning_rate": 3.575936261093073e-05, "logits/chosen": -2.026442766189575, "logits/rejected": -2.3520658016204834, "logps/chosen": -224.86386108398438, "logps/rejected": -284.865234375, "loss": 0.6649, "rewards/accuracies": 0.625, "rewards/chosen": -0.988176703453064, "rewards/margins": 0.25633472204208374, "rewards/rejected": -1.2445114850997925, "step": 410 }, { "epoch": 0.43, "learning_rate": 3.5677333449788374e-05, "logits/chosen": -2.2224576473236084, "logits/rejected": -2.2077910900115967, "logps/chosen": -376.9631042480469, "logps/rejected": -320.0201721191406, "loss": 0.6791, "rewards/accuracies": 0.5, "rewards/chosen": -1.145774006843567, "rewards/margins": 0.23624449968338013, "rewards/rejected": -1.3820184469223022, "step": 411 }, { "epoch": 0.43, "learning_rate": 3.559516344739991e-05, "logits/chosen": -2.0415990352630615, "logits/rejected": -2.077162981033325, "logps/chosen": -276.7152404785156, "logps/rejected": -296.462646484375, "loss": 0.573, "rewards/accuracies": 0.75, "rewards/chosen": -0.8756453394889832, "rewards/margins": 0.3228147625923157, "rewards/rejected": -1.1984599828720093, "step": 412 }, { "epoch": 0.43, "learning_rate": 3.551285368764321e-05, "logits/chosen": -2.171372890472412, "logits/rejected": -2.1741371154785156, "logps/chosen": -273.3794860839844, "logps/rejected": -258.83477783203125, "loss": 0.7963, "rewards/accuracies": 0.625, "rewards/chosen": -1.3004168272018433, "rewards/margins": 0.03895503282546997, "rewards/rejected": -1.3393718004226685, "step": 413 }, { "epoch": 0.43, "learning_rate": 3.543040525623965e-05, "logits/chosen": -2.0618252754211426, "logits/rejected": -2.1750998497009277, "logps/chosen": -234.17120361328125, "logps/rejected": -300.029296875, "loss": 0.5077, "rewards/accuracies": 0.875, "rewards/chosen": -1.0354472398757935, "rewards/margins": 0.45164865255355835, "rewards/rejected": -1.487095832824707, "step": 414 }, { "epoch": 0.43, "learning_rate": 3.534781924073978e-05, "logits/chosen": -1.8313791751861572, "logits/rejected": -2.19712233543396, "logps/chosen": -263.08843994140625, "logps/rejected": -374.9747314453125, "loss": 0.6435, "rewards/accuracies": 0.5, "rewards/chosen": -1.3530242443084717, "rewards/margins": 0.28404536843299866, "rewards/rejected": -1.6370694637298584, "step": 415 }, { "epoch": 0.43, "learning_rate": 3.5265096730508974e-05, "logits/chosen": -1.992910385131836, "logits/rejected": -2.050726890563965, "logps/chosen": -332.4587707519531, "logps/rejected": -448.9661560058594, "loss": 0.5689, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1743048429489136, "rewards/margins": 0.4015694260597229, "rewards/rejected": -1.5758743286132812, "step": 416 }, { "epoch": 0.43, "learning_rate": 3.518223881671305e-05, "logits/chosen": -2.2300572395324707, "logits/rejected": -2.292898416519165, "logps/chosen": -361.424560546875, "logps/rejected": -408.99267578125, "loss": 0.7222, "rewards/accuracies": 0.625, "rewards/chosen": -1.523597240447998, "rewards/margins": 0.003444090485572815, "rewards/rejected": -1.5270413160324097, "step": 417 }, { "epoch": 0.43, "learning_rate": 3.509924659230392e-05, "logits/chosen": -1.9317662715911865, "logits/rejected": -2.1398890018463135, "logps/chosen": -201.24151611328125, "logps/rejected": -316.9627685546875, "loss": 0.7506, "rewards/accuracies": 0.4375, "rewards/chosen": -1.2115095853805542, "rewards/margins": 0.002029839903116226, "rewards/rejected": -1.2135393619537354, "step": 418 }, { "epoch": 0.43, "learning_rate": 3.501612115200512e-05, "logits/chosen": -1.8960869312286377, "logits/rejected": -1.8769932985305786, "logps/chosen": -230.08914184570312, "logps/rejected": -274.8809814453125, "loss": 0.7128, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2795968055725098, "rewards/margins": 0.10936379432678223, "rewards/rejected": -1.3889607191085815, "step": 419 }, { "epoch": 0.43, "learning_rate": 3.4932863592297395e-05, "logits/chosen": -2.1262502670288086, "logits/rejected": -2.0916502475738525, "logps/chosen": -288.3411865234375, "logps/rejected": -374.0597839355469, "loss": 0.587, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2651339769363403, "rewards/margins": 0.49100592732429504, "rewards/rejected": -1.7561399936676025, "step": 420 }, { "epoch": 0.44, "learning_rate": 3.4849475011404246e-05, "logits/chosen": -1.932436227798462, "logits/rejected": -2.0958409309387207, "logps/chosen": -385.390380859375, "logps/rejected": -420.717529296875, "loss": 0.6097, "rewards/accuracies": 0.75, "rewards/chosen": -1.2712079286575317, "rewards/margins": 0.29501456022262573, "rewards/rejected": -1.5662224292755127, "step": 421 }, { "epoch": 0.44, "learning_rate": 3.476595650927741e-05, "logits/chosen": -2.2775261402130127, "logits/rejected": -2.2956340312957764, "logps/chosen": -357.61767578125, "logps/rejected": -349.096923828125, "loss": 0.736, "rewards/accuracies": 0.625, "rewards/chosen": -1.1353936195373535, "rewards/margins": 0.031133286654949188, "rewards/rejected": -1.1665267944335938, "step": 422 }, { "epoch": 0.44, "learning_rate": 3.468230918758242e-05, "logits/chosen": -2.1961703300476074, "logits/rejected": -2.291398048400879, "logps/chosen": -308.3431701660156, "logps/rejected": -314.66851806640625, "loss": 0.6423, "rewards/accuracies": 0.625, "rewards/chosen": -1.1682064533233643, "rewards/margins": 0.12875376641750336, "rewards/rejected": -1.2969601154327393, "step": 423 }, { "epoch": 0.44, "learning_rate": 3.459853414968397e-05, "logits/chosen": -2.1401329040527344, "logits/rejected": -2.0424118041992188, "logps/chosen": -323.16693115234375, "logps/rejected": -300.6637268066406, "loss": 0.7578, "rewards/accuracies": 0.4375, "rewards/chosen": -1.309455156326294, "rewards/margins": 0.23452845215797424, "rewards/rejected": -1.5439834594726562, "step": 424 }, { "epoch": 0.44, "learning_rate": 3.451463250063146e-05, "logits/chosen": -2.1163456439971924, "logits/rejected": -2.154703140258789, "logps/chosen": -307.1343688964844, "logps/rejected": -349.54791259765625, "loss": 0.6657, "rewards/accuracies": 0.4375, "rewards/chosen": -1.2518941164016724, "rewards/margins": 0.306789755821228, "rewards/rejected": -1.55868399143219, "step": 425 }, { "epoch": 0.44, "learning_rate": 3.443060534714434e-05, "logits/chosen": -1.820733666419983, "logits/rejected": -1.8521438837051392, "logps/chosen": -297.1275939941406, "logps/rejected": -264.0213623046875, "loss": 0.761, "rewards/accuracies": 0.4375, "rewards/chosen": -1.155602216720581, "rewards/margins": 0.04654591530561447, "rewards/rejected": -1.2021480798721313, "step": 426 }, { "epoch": 0.44, "learning_rate": 3.4346453797597576e-05, "logits/chosen": -2.069772243499756, "logits/rejected": -1.9325459003448486, "logps/chosen": -291.765869140625, "logps/rejected": -291.6902160644531, "loss": 0.6282, "rewards/accuracies": 0.5625, "rewards/chosen": -1.4001034498214722, "rewards/margins": 0.3071330487728119, "rewards/rejected": -1.7072365283966064, "step": 427 }, { "epoch": 0.44, "learning_rate": 3.426217896200699e-05, "logits/chosen": -1.9241890907287598, "logits/rejected": -1.9453171491622925, "logps/chosen": -356.77081298828125, "logps/rejected": -332.4874572753906, "loss": 0.7444, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3764888048171997, "rewards/margins": 0.23870763182640076, "rewards/rejected": -1.6151964664459229, "step": 428 }, { "epoch": 0.44, "learning_rate": 3.417778195201464e-05, "logits/chosen": -2.045527935028076, "logits/rejected": -2.2840356826782227, "logps/chosen": -358.7695617675781, "logps/rejected": -411.1256408691406, "loss": 0.6085, "rewards/accuracies": 0.625, "rewards/chosen": -1.457747459411621, "rewards/margins": 0.2899523377418518, "rewards/rejected": -1.7476999759674072, "step": 429 }, { "epoch": 0.45, "learning_rate": 3.4093263880874136e-05, "logits/chosen": -2.2168030738830566, "logits/rejected": -2.026329755783081, "logps/chosen": -391.6089172363281, "logps/rejected": -444.5746154785156, "loss": 0.7252, "rewards/accuracies": 0.5, "rewards/chosen": -1.3314502239227295, "rewards/margins": 0.08630897104740143, "rewards/rejected": -1.4177591800689697, "step": 430 }, { "epoch": 0.45, "learning_rate": 3.400862586343597e-05, "logits/chosen": -2.105616569519043, "logits/rejected": -2.0884623527526855, "logps/chosen": -383.8026123046875, "logps/rejected": -367.6466064453125, "loss": 0.577, "rewards/accuracies": 0.5625, "rewards/chosen": -1.5232832431793213, "rewards/margins": 0.3972979784011841, "rewards/rejected": -1.920581340789795, "step": 431 }, { "epoch": 0.45, "learning_rate": 3.392386901613282e-05, "logits/chosen": -2.2307028770446777, "logits/rejected": -1.9826226234436035, "logps/chosen": -296.00274658203125, "logps/rejected": -274.8576354980469, "loss": 0.8504, "rewards/accuracies": 0.625, "rewards/chosen": -1.0603615045547485, "rewards/margins": -0.11922366172075272, "rewards/rejected": -0.9411377310752869, "step": 432 }, { "epoch": 0.45, "learning_rate": 3.383899445696477e-05, "logits/chosen": -1.8530570268630981, "logits/rejected": -1.8290507793426514, "logps/chosen": -352.0951843261719, "logps/rejected": -424.3826599121094, "loss": 0.6405, "rewards/accuracies": 0.5, "rewards/chosen": -1.2079800367355347, "rewards/margins": 0.37576451897621155, "rewards/rejected": -1.5837446451187134, "step": 433 }, { "epoch": 0.45, "learning_rate": 3.375400330548466e-05, "logits/chosen": -2.0532162189483643, "logits/rejected": -1.988155722618103, "logps/chosen": -418.684814453125, "logps/rejected": -460.6349792480469, "loss": 0.8049, "rewards/accuracies": 0.4375, "rewards/chosen": -1.395944595336914, "rewards/margins": -0.09554408490657806, "rewards/rejected": -1.3004004955291748, "step": 434 }, { "epoch": 0.45, "learning_rate": 3.366889668278321e-05, "logits/chosen": -1.9795726537704468, "logits/rejected": -2.1528823375701904, "logps/chosen": -267.9879455566406, "logps/rejected": -301.9024963378906, "loss": 0.7124, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9974009990692139, "rewards/margins": 0.12711931765079498, "rewards/rejected": -1.1245203018188477, "step": 435 }, { "epoch": 0.45, "learning_rate": 3.358367571147433e-05, "logits/chosen": -1.886863112449646, "logits/rejected": -2.1218717098236084, "logps/chosen": -370.65704345703125, "logps/rejected": -416.7871398925781, "loss": 0.6567, "rewards/accuracies": 0.5625, "rewards/chosen": -1.184512734413147, "rewards/margins": 0.2182311713695526, "rewards/rejected": -1.4027438163757324, "step": 436 }, { "epoch": 0.45, "learning_rate": 3.3498341515680214e-05, "logits/chosen": -2.1803653240203857, "logits/rejected": -2.1077873706817627, "logps/chosen": -321.48834228515625, "logps/rejected": -281.7978820800781, "loss": 0.7596, "rewards/accuracies": 0.5, "rewards/chosen": -0.9977477192878723, "rewards/margins": -0.019190065562725067, "rewards/rejected": -0.9785577058792114, "step": 437 }, { "epoch": 0.45, "learning_rate": 3.3412895221016605e-05, "logits/chosen": -1.888815999031067, "logits/rejected": -2.005762815475464, "logps/chosen": -184.04368591308594, "logps/rejected": -236.44418334960938, "loss": 0.9197, "rewards/accuracies": 0.4375, "rewards/chosen": -1.0804985761642456, "rewards/margins": -0.14094766974449158, "rewards/rejected": -0.9395509958267212, "step": 438 }, { "epoch": 0.45, "learning_rate": 3.332733795457789e-05, "logits/chosen": -1.9857516288757324, "logits/rejected": -1.7923184633255005, "logps/chosen": -267.1208801269531, "logps/rejected": -263.0241394042969, "loss": 0.5819, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9991573095321655, "rewards/margins": 0.37854859232902527, "rewards/rejected": -1.3777059316635132, "step": 439 }, { "epoch": 0.46, "learning_rate": 3.324167084492226e-05, "logits/chosen": -1.880125880241394, "logits/rejected": -2.0244104862213135, "logps/chosen": -300.0595703125, "logps/rejected": -503.9610290527344, "loss": 0.469, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1374258995056152, "rewards/margins": 0.9749306440353394, "rewards/rejected": -2.112356662750244, "step": 440 }, { "epoch": 0.46, "learning_rate": 3.3155895022056784e-05, "logits/chosen": -2.1099507808685303, "logits/rejected": -2.1651086807250977, "logps/chosen": -297.15704345703125, "logps/rejected": -323.30389404296875, "loss": 0.7954, "rewards/accuracies": 0.5, "rewards/chosen": -1.2452776432037354, "rewards/margins": -0.07082469016313553, "rewards/rejected": -1.1744530200958252, "step": 441 }, { "epoch": 0.46, "learning_rate": 3.3070011617422566e-05, "logits/chosen": -1.9197720289230347, "logits/rejected": -1.991129755973816, "logps/chosen": -326.0094909667969, "logps/rejected": -374.349609375, "loss": 0.5772, "rewards/accuracies": 0.75, "rewards/chosen": -0.8590810298919678, "rewards/margins": 0.45791155099868774, "rewards/rejected": -1.3169926404953003, "step": 442 }, { "epoch": 0.46, "learning_rate": 3.2984021763879755e-05, "logits/chosen": -2.2275571823120117, "logits/rejected": -2.2043910026550293, "logps/chosen": -371.75927734375, "logps/rejected": -395.1778564453125, "loss": 0.6456, "rewards/accuracies": 0.625, "rewards/chosen": -1.126926302909851, "rewards/margins": 0.2308472990989685, "rewards/rejected": -1.3577736616134644, "step": 443 }, { "epoch": 0.46, "learning_rate": 3.2897926595692664e-05, "logits/chosen": -2.181673526763916, "logits/rejected": -2.3108913898468018, "logps/chosen": -357.79888916015625, "logps/rejected": -431.6429443359375, "loss": 0.62, "rewards/accuracies": 0.625, "rewards/chosen": -0.9887603521347046, "rewards/margins": 0.32495391368865967, "rewards/rejected": -1.3137142658233643, "step": 444 }, { "epoch": 0.46, "learning_rate": 3.2811727248514754e-05, "logits/chosen": -2.1919546127319336, "logits/rejected": -2.1633992195129395, "logps/chosen": -421.3163757324219, "logps/rejected": -446.4229431152344, "loss": 0.8466, "rewards/accuracies": 0.375, "rewards/chosen": -1.3915138244628906, "rewards/margins": -0.18495866656303406, "rewards/rejected": -1.2065550088882446, "step": 445 }, { "epoch": 0.46, "learning_rate": 3.272542485937369e-05, "logits/chosen": -2.2480454444885254, "logits/rejected": -2.2334165573120117, "logps/chosen": -532.5480346679688, "logps/rejected": -461.50408935546875, "loss": 0.6875, "rewards/accuracies": 0.625, "rewards/chosen": -1.161598563194275, "rewards/margins": 0.1340804100036621, "rewards/rejected": -1.2956790924072266, "step": 446 }, { "epoch": 0.46, "learning_rate": 3.263902056665631e-05, "logits/chosen": -2.1135687828063965, "logits/rejected": -1.9993455410003662, "logps/chosen": -338.2395324707031, "logps/rejected": -354.9482727050781, "loss": 0.7578, "rewards/accuracies": 0.25, "rewards/chosen": -1.4973227977752686, "rewards/margins": -0.08258108794689178, "rewards/rejected": -1.4147417545318604, "step": 447 }, { "epoch": 0.46, "learning_rate": 3.2552515510093674e-05, "logits/chosen": -1.923673391342163, "logits/rejected": -2.005218982696533, "logps/chosen": -276.5377502441406, "logps/rejected": -285.30633544921875, "loss": 0.8311, "rewards/accuracies": 0.5625, "rewards/chosen": -1.4315433502197266, "rewards/margins": 0.01938357949256897, "rewards/rejected": -1.4509271383285522, "step": 448 }, { "epoch": 0.46, "learning_rate": 3.2465910830745924e-05, "logits/chosen": -2.0653865337371826, "logits/rejected": -2.0072455406188965, "logps/chosen": -217.55328369140625, "logps/rejected": -182.0865936279297, "loss": 0.8905, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2046016454696655, "rewards/margins": -0.10025043040513992, "rewards/rejected": -1.104351282119751, "step": 449 }, { "epoch": 0.47, "learning_rate": 3.237920767098735e-05, "logits/chosen": -2.1028592586517334, "logits/rejected": -2.201251745223999, "logps/chosen": -325.9986572265625, "logps/rejected": -432.1500244140625, "loss": 0.7399, "rewards/accuracies": 0.375, "rewards/chosen": -1.1958823204040527, "rewards/margins": 0.0005789399147033691, "rewards/rejected": -1.1964612007141113, "step": 450 }, { "epoch": 0.47, "learning_rate": 3.229240717449122e-05, "logits/chosen": -2.1992039680480957, "logits/rejected": -2.378601312637329, "logps/chosen": -370.0502014160156, "logps/rejected": -408.4455871582031, "loss": 0.7355, "rewards/accuracies": 0.375, "rewards/chosen": -1.168852686882019, "rewards/margins": -0.0028562992811203003, "rewards/rejected": -1.1659963130950928, "step": 451 }, { "epoch": 0.47, "learning_rate": 3.2205510486214777e-05, "logits/chosen": -2.1426610946655273, "logits/rejected": -2.160429000854492, "logps/chosen": -307.41278076171875, "logps/rejected": -312.3187561035156, "loss": 0.6221, "rewards/accuracies": 0.625, "rewards/chosen": -1.0617390871047974, "rewards/margins": 0.29938191175460815, "rewards/rejected": -1.3611209392547607, "step": 452 }, { "epoch": 0.47, "learning_rate": 3.211851875238408e-05, "logits/chosen": -2.1584270000457764, "logits/rejected": -1.9756699800491333, "logps/chosen": -247.97222900390625, "logps/rejected": -275.8572692871094, "loss": 0.7132, "rewards/accuracies": 0.5, "rewards/chosen": -1.2110902070999146, "rewards/margins": 0.21431638300418854, "rewards/rejected": -1.425406575202942, "step": 453 }, { "epoch": 0.47, "learning_rate": 3.203143312047889e-05, "logits/chosen": -2.284712314605713, "logits/rejected": -2.316433906555176, "logps/chosen": -397.7944030761719, "logps/rejected": -435.2258605957031, "loss": 0.5451, "rewards/accuracies": 0.75, "rewards/chosen": -1.0755599737167358, "rewards/margins": 0.4171152710914612, "rewards/rejected": -1.4926753044128418, "step": 454 }, { "epoch": 0.47, "learning_rate": 3.1944254739217585e-05, "logits/chosen": -2.3242409229278564, "logits/rejected": -2.2877395153045654, "logps/chosen": -317.1751403808594, "logps/rejected": -319.5546569824219, "loss": 0.6585, "rewards/accuracies": 0.625, "rewards/chosen": -1.0918810367584229, "rewards/margins": 0.174340158700943, "rewards/rejected": -1.266221284866333, "step": 455 }, { "epoch": 0.47, "learning_rate": 3.1856984758541924e-05, "logits/chosen": -2.1573104858398438, "logits/rejected": -2.1013479232788086, "logps/chosen": -402.46051025390625, "logps/rejected": -317.1357421875, "loss": 0.5846, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2507472038269043, "rewards/margins": 0.32065922021865845, "rewards/rejected": -1.5714064836502075, "step": 456 }, { "epoch": 0.47, "learning_rate": 3.176962432960197e-05, "logits/chosen": -2.3160417079925537, "logits/rejected": -2.1284031867980957, "logps/chosen": -401.79803466796875, "logps/rejected": -336.5020446777344, "loss": 0.8215, "rewards/accuracies": 0.5, "rewards/chosen": -1.1392488479614258, "rewards/margins": -0.045140765607357025, "rewards/rejected": -1.094107985496521, "step": 457 }, { "epoch": 0.47, "learning_rate": 3.168217460474081e-05, "logits/chosen": -2.1930458545684814, "logits/rejected": -2.081754684448242, "logps/chosen": -408.9210510253906, "logps/rejected": -364.5238037109375, "loss": 0.7234, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2171686887741089, "rewards/margins": 0.20472437143325806, "rewards/rejected": -1.4218928813934326, "step": 458 }, { "epoch": 0.48, "learning_rate": 3.159463673747945e-05, "logits/chosen": -1.8765006065368652, "logits/rejected": -1.8720567226409912, "logps/chosen": -301.31402587890625, "logps/rejected": -357.6469421386719, "loss": 0.6946, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7754146456718445, "rewards/margins": 0.04356713593006134, "rewards/rejected": -0.8189818263053894, "step": 459 }, { "epoch": 0.48, "learning_rate": 3.150701188250152e-05, "logits/chosen": -2.129390001296997, "logits/rejected": -1.9843943119049072, "logps/chosen": -334.6359558105469, "logps/rejected": -345.7218322753906, "loss": 0.6842, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9497905373573303, "rewards/margins": 0.12164813280105591, "rewards/rejected": -1.0714386701583862, "step": 460 }, { "epoch": 0.48, "learning_rate": 3.141930119563812e-05, "logits/chosen": -1.9846031665802002, "logits/rejected": -2.1700026988983154, "logps/chosen": -316.71832275390625, "logps/rejected": -362.517822265625, "loss": 0.6957, "rewards/accuracies": 0.4375, "rewards/chosen": -1.0000622272491455, "rewards/margins": 0.041672270745038986, "rewards/rejected": -1.0417344570159912, "step": 461 }, { "epoch": 0.48, "learning_rate": 3.133150583385247e-05, "logits/chosen": -2.246786594390869, "logits/rejected": -2.29536771774292, "logps/chosen": -401.23394775390625, "logps/rejected": -394.4549865722656, "loss": 0.6917, "rewards/accuracies": 0.625, "rewards/chosen": -1.0588868856430054, "rewards/margins": 0.1614934504032135, "rewards/rejected": -1.220380425453186, "step": 462 }, { "epoch": 0.48, "learning_rate": 3.124362695522476e-05, "logits/chosen": -1.9954516887664795, "logits/rejected": -2.391841411590576, "logps/chosen": -270.8871154785156, "logps/rejected": -376.65106201171875, "loss": 0.6857, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9002559781074524, "rewards/margins": 0.10894503444433212, "rewards/rejected": -1.0092010498046875, "step": 463 }, { "epoch": 0.48, "learning_rate": 3.115566571893681e-05, "logits/chosen": -2.2636733055114746, "logits/rejected": -2.159451484680176, "logps/chosen": -318.85888671875, "logps/rejected": -295.5828857421875, "loss": 0.6792, "rewards/accuracies": 0.625, "rewards/chosen": -1.0352829694747925, "rewards/margins": 0.12933281064033508, "rewards/rejected": -1.1646157503128052, "step": 464 }, { "epoch": 0.48, "learning_rate": 3.1067623285256766e-05, "logits/chosen": -2.083453893661499, "logits/rejected": -2.077559471130371, "logps/chosen": -277.09197998046875, "logps/rejected": -313.90728759765625, "loss": 0.7098, "rewards/accuracies": 0.375, "rewards/chosen": -0.8074424266815186, "rewards/margins": 0.011367838829755783, "rewards/rejected": -0.8188102841377258, "step": 465 }, { "epoch": 0.48, "learning_rate": 3.097950081552387e-05, "logits/chosen": -2.062037467956543, "logits/rejected": -1.969789981842041, "logps/chosen": -272.09674072265625, "logps/rejected": -282.55560302734375, "loss": 0.6871, "rewards/accuracies": 0.625, "rewards/chosen": -0.8802644610404968, "rewards/margins": 0.13163542747497559, "rewards/rejected": -1.0118999481201172, "step": 466 }, { "epoch": 0.48, "learning_rate": 3.089129947213305e-05, "logits/chosen": -2.0714964866638184, "logits/rejected": -2.0289595127105713, "logps/chosen": -338.4779968261719, "logps/rejected": -305.9598083496094, "loss": 0.6018, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8691202998161316, "rewards/margins": 0.25768163800239563, "rewards/rejected": -1.1268019676208496, "step": 467 }, { "epoch": 0.48, "learning_rate": 3.080302041851966e-05, "logits/chosen": -2.1845309734344482, "logits/rejected": -2.330949306488037, "logps/chosen": -318.61444091796875, "logps/rejected": -337.0727844238281, "loss": 0.7127, "rewards/accuracies": 0.5, "rewards/chosen": -1.0258458852767944, "rewards/margins": 0.06928322464227676, "rewards/rejected": -1.0951290130615234, "step": 468 }, { "epoch": 0.49, "learning_rate": 3.071466481914409e-05, "logits/chosen": -1.974360704421997, "logits/rejected": -1.8974449634552002, "logps/chosen": -328.3250732421875, "logps/rejected": -353.5456237792969, "loss": 0.811, "rewards/accuracies": 0.4375, "rewards/chosen": -1.0966472625732422, "rewards/margins": -0.16713061928749084, "rewards/rejected": -0.9295165538787842, "step": 469 }, { "epoch": 0.49, "learning_rate": 3.062623383947643e-05, "logits/chosen": -1.96671462059021, "logits/rejected": -2.175767421722412, "logps/chosen": -325.94085693359375, "logps/rejected": -443.0379943847656, "loss": 0.7812, "rewards/accuracies": 0.5, "rewards/chosen": -1.099274754524231, "rewards/margins": -0.10562913119792938, "rewards/rejected": -0.9936455488204956, "step": 470 }, { "epoch": 0.49, "learning_rate": 3.053772864598108e-05, "logits/chosen": -1.8880025148391724, "logits/rejected": -2.134125232696533, "logps/chosen": -347.37060546875, "logps/rejected": -415.47735595703125, "loss": 0.6779, "rewards/accuracies": 0.5625, "rewards/chosen": -1.082190990447998, "rewards/margins": 0.09107710421085358, "rewards/rejected": -1.1732680797576904, "step": 471 }, { "epoch": 0.49, "learning_rate": 3.0449150406101367e-05, "logits/chosen": -1.7812613248825073, "logits/rejected": -1.7713969945907593, "logps/chosen": -293.71978759765625, "logps/rejected": -305.0588073730469, "loss": 0.712, "rewards/accuracies": 0.5, "rewards/chosen": -1.0079898834228516, "rewards/margins": 0.062352173030376434, "rewards/rejected": -1.0703420639038086, "step": 472 }, { "epoch": 0.49, "learning_rate": 3.0360500288244155e-05, "logits/chosen": -2.2740883827209473, "logits/rejected": -2.167978286743164, "logps/chosen": -434.8294372558594, "logps/rejected": -422.0832214355469, "loss": 0.784, "rewards/accuracies": 0.4375, "rewards/chosen": -1.0591871738433838, "rewards/margins": -0.10172367095947266, "rewards/rejected": -0.9574634432792664, "step": 473 }, { "epoch": 0.49, "learning_rate": 3.0271779461764426e-05, "logits/chosen": -2.059706926345825, "logits/rejected": -2.0634119510650635, "logps/chosen": -401.7886047363281, "logps/rejected": -436.7006530761719, "loss": 0.6583, "rewards/accuracies": 0.75, "rewards/chosen": -0.9169749021530151, "rewards/margins": 0.19741462171077728, "rewards/rejected": -1.114389419555664, "step": 474 }, { "epoch": 0.49, "learning_rate": 3.018298909694986e-05, "logits/chosen": -2.1678502559661865, "logits/rejected": -2.2958884239196777, "logps/chosen": -328.1385498046875, "logps/rejected": -411.2935485839844, "loss": 0.6487, "rewards/accuracies": 0.625, "rewards/chosen": -0.87379390001297, "rewards/margins": 0.17571806907653809, "rewards/rejected": -1.0495120286941528, "step": 475 }, { "epoch": 0.49, "learning_rate": 3.0094130365005395e-05, "logits/chosen": -2.092456817626953, "logits/rejected": -2.266845226287842, "logps/chosen": -225.6428680419922, "logps/rejected": -335.9315185546875, "loss": 0.6278, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7999565601348877, "rewards/margins": 0.19611772894859314, "rewards/rejected": -0.996074378490448, "step": 476 }, { "epoch": 0.49, "learning_rate": 3.0005204438037765e-05, "logits/chosen": -2.205846071243286, "logits/rejected": -2.052035093307495, "logps/chosen": -361.5894775390625, "logps/rejected": -309.0181884765625, "loss": 0.6715, "rewards/accuracies": 0.625, "rewards/chosen": -0.9857699871063232, "rewards/margins": 0.09481573104858398, "rewards/rejected": -1.0805857181549072, "step": 477 }, { "epoch": 0.5, "learning_rate": 2.991621248904007e-05, "logits/chosen": -2.26173996925354, "logits/rejected": -2.031365394592285, "logps/chosen": -369.19915771484375, "logps/rejected": -294.8736877441406, "loss": 0.7289, "rewards/accuracies": 0.3125, "rewards/chosen": -1.128753423690796, "rewards/margins": -0.01859595626592636, "rewards/rejected": -1.1101574897766113, "step": 478 }, { "epoch": 0.5, "learning_rate": 2.9827155691876262e-05, "logits/chosen": -1.8554052114486694, "logits/rejected": -2.2059624195098877, "logps/chosen": -317.9852600097656, "logps/rejected": -354.1583251953125, "loss": 0.5558, "rewards/accuracies": 0.75, "rewards/chosen": -0.9314447045326233, "rewards/margins": 0.35100990533828735, "rewards/rejected": -1.2824546098709106, "step": 479 }, { "epoch": 0.5, "learning_rate": 2.973803522126571e-05, "logits/chosen": -1.9700522422790527, "logits/rejected": -1.8781499862670898, "logps/chosen": -307.0520935058594, "logps/rejected": -313.407470703125, "loss": 0.6873, "rewards/accuracies": 0.5, "rewards/chosen": -1.0591685771942139, "rewards/margins": 0.05153223127126694, "rewards/rejected": -1.1107008457183838, "step": 480 }, { "epoch": 0.5, "learning_rate": 2.9648852252767668e-05, "logits/chosen": -2.1033730506896973, "logits/rejected": -2.0786190032958984, "logps/chosen": -452.3450012207031, "logps/rejected": -511.24700927734375, "loss": 0.6695, "rewards/accuracies": 0.5, "rewards/chosen": -1.161694884300232, "rewards/margins": 0.10767564922571182, "rewards/rejected": -1.2693705558776855, "step": 481 }, { "epoch": 0.5, "learning_rate": 2.9559607962765773e-05, "logits/chosen": -2.103732109069824, "logits/rejected": -2.215973377227783, "logps/chosen": -326.31787109375, "logps/rejected": -398.490478515625, "loss": 0.6273, "rewards/accuracies": 0.5625, "rewards/chosen": -1.019250750541687, "rewards/margins": 0.20697328448295593, "rewards/rejected": -1.2262240648269653, "step": 482 }, { "epoch": 0.5, "learning_rate": 2.947030352845255e-05, "logits/chosen": -2.1636712551116943, "logits/rejected": -2.1790904998779297, "logps/chosen": -335.7892761230469, "logps/rejected": -397.0627746582031, "loss": 0.5934, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9784788489341736, "rewards/margins": 0.27349093556404114, "rewards/rejected": -1.251969814300537, "step": 483 }, { "epoch": 0.5, "learning_rate": 2.9380940127813834e-05, "logits/chosen": -2.165933609008789, "logits/rejected": -2.2048957347869873, "logps/chosen": -421.8327331542969, "logps/rejected": -405.9539794921875, "loss": 0.7287, "rewards/accuracies": 0.5625, "rewards/chosen": -1.179675579071045, "rewards/margins": 0.05797319859266281, "rewards/rejected": -1.2376487255096436, "step": 484 }, { "epoch": 0.5, "learning_rate": 2.9291518939613315e-05, "logits/chosen": -2.1944916248321533, "logits/rejected": -2.3757896423339844, "logps/chosen": -486.90582275390625, "logps/rejected": -422.9759521484375, "loss": 0.703, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3329401016235352, "rewards/margins": 0.04630117490887642, "rewards/rejected": -1.3792412281036377, "step": 485 }, { "epoch": 0.5, "learning_rate": 2.9202041143376896e-05, "logits/chosen": -1.9283084869384766, "logits/rejected": -2.046499490737915, "logps/chosen": -340.8263244628906, "logps/rejected": -354.3287658691406, "loss": 0.6069, "rewards/accuracies": 0.8125, "rewards/chosen": -1.017566204071045, "rewards/margins": 0.2470768392086029, "rewards/rejected": -1.2646431922912598, "step": 486 }, { "epoch": 0.5, "learning_rate": 2.9112507919377213e-05, "logits/chosen": -1.9073665142059326, "logits/rejected": -1.9913743734359741, "logps/chosen": -233.45367431640625, "logps/rejected": -262.6000061035156, "loss": 0.6771, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9823710918426514, "rewards/margins": 0.10720720142126083, "rewards/rejected": -1.0895782709121704, "step": 487 }, { "epoch": 0.51, "learning_rate": 2.9022920448618e-05, "logits/chosen": -2.0144119262695312, "logits/rejected": -2.010000228881836, "logps/chosen": -270.91790771484375, "logps/rejected": -324.4449462890625, "loss": 0.7047, "rewards/accuracies": 0.5, "rewards/chosen": -0.950652539730072, "rewards/margins": 0.02743140608072281, "rewards/rejected": -0.9780839681625366, "step": 488 }, { "epoch": 0.51, "learning_rate": 2.8933279912818566e-05, "logits/chosen": -2.200765609741211, "logits/rejected": -1.9420582056045532, "logps/chosen": -326.8210144042969, "logps/rejected": -328.5808410644531, "loss": 0.6967, "rewards/accuracies": 0.625, "rewards/chosen": -1.0532310009002686, "rewards/margins": 0.0682058334350586, "rewards/rejected": -1.1214368343353271, "step": 489 }, { "epoch": 0.51, "learning_rate": 2.8843587494398177e-05, "logits/chosen": -2.297065496444702, "logits/rejected": -2.367543935775757, "logps/chosen": -335.68994140625, "logps/rejected": -337.0887756347656, "loss": 0.6224, "rewards/accuracies": 0.75, "rewards/chosen": -1.0686800479888916, "rewards/margins": 0.21843653917312622, "rewards/rejected": -1.287116527557373, "step": 490 }, { "epoch": 0.51, "learning_rate": 2.875384437646046e-05, "logits/chosen": -2.2880589962005615, "logits/rejected": -2.273855209350586, "logps/chosen": -306.8229675292969, "logps/rejected": -322.33050537109375, "loss": 0.5281, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9717899560928345, "rewards/margins": 0.5251017808914185, "rewards/rejected": -1.4968918561935425, "step": 491 }, { "epoch": 0.51, "learning_rate": 2.8664051742777803e-05, "logits/chosen": -2.119661569595337, "logits/rejected": -1.9243615865707397, "logps/chosen": -326.88348388671875, "logps/rejected": -394.1810302734375, "loss": 0.6364, "rewards/accuracies": 0.5, "rewards/chosen": -0.9266807436943054, "rewards/margins": 0.20886686444282532, "rewards/rejected": -1.1355476379394531, "step": 492 }, { "epoch": 0.51, "learning_rate": 2.8574210777775755e-05, "logits/chosen": -2.2085232734680176, "logits/rejected": -2.188079357147217, "logps/chosen": -315.6412353515625, "logps/rejected": -290.0874938964844, "loss": 0.7217, "rewards/accuracies": 0.375, "rewards/chosen": -1.09775710105896, "rewards/margins": 0.024385623633861542, "rewards/rejected": -1.1221426725387573, "step": 493 }, { "epoch": 0.51, "learning_rate": 2.8484322666517373e-05, "logits/chosen": -2.236124277114868, "logits/rejected": -2.3427038192749023, "logps/chosen": -274.1993103027344, "logps/rejected": -298.519775390625, "loss": 0.6895, "rewards/accuracies": 0.5, "rewards/chosen": -0.9323225617408752, "rewards/margins": 0.053042903542518616, "rewards/rejected": -0.9853654503822327, "step": 494 }, { "epoch": 0.51, "learning_rate": 2.83943885946876e-05, "logits/chosen": -2.3090035915374756, "logits/rejected": -2.3383147716522217, "logps/chosen": -351.7496643066406, "logps/rejected": -356.39678955078125, "loss": 0.776, "rewards/accuracies": 0.5, "rewards/chosen": -1.235669493675232, "rewards/margins": -0.09604780375957489, "rewards/rejected": -1.139621615409851, "step": 495 }, { "epoch": 0.51, "learning_rate": 2.8304409748577653e-05, "logits/chosen": -2.266005516052246, "logits/rejected": -2.2317018508911133, "logps/chosen": -342.5093994140625, "logps/rejected": -360.4115295410156, "loss": 0.6021, "rewards/accuracies": 0.75, "rewards/chosen": -0.800186276435852, "rewards/margins": 0.3189813792705536, "rewards/rejected": -1.119167685508728, "step": 496 }, { "epoch": 0.51, "learning_rate": 2.821438731506933e-05, "logits/chosen": -2.1894872188568115, "logits/rejected": -2.2929162979125977, "logps/chosen": -366.48651123046875, "logps/rejected": -422.5482177734375, "loss": 0.6365, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1337993144989014, "rewards/margins": 0.18402203917503357, "rewards/rejected": -1.3178215026855469, "step": 497 }, { "epoch": 0.52, "learning_rate": 2.8124322481619388e-05, "logits/chosen": -2.1163060665130615, "logits/rejected": -2.0039188861846924, "logps/chosen": -361.08380126953125, "logps/rejected": -269.610595703125, "loss": 0.7452, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3107686042785645, "rewards/margins": -0.045146312564611435, "rewards/rejected": -1.2656222581863403, "step": 498 }, { "epoch": 0.52, "learning_rate": 2.803421643624386e-05, "logits/chosen": -2.105262041091919, "logits/rejected": -1.973724126815796, "logps/chosen": -311.098388671875, "logps/rejected": -314.92181396484375, "loss": 0.6544, "rewards/accuracies": 0.5, "rewards/chosen": -1.0980561971664429, "rewards/margins": 0.178094744682312, "rewards/rejected": -1.2761509418487549, "step": 499 }, { "epoch": 0.52, "learning_rate": 2.7944070367502402e-05, "logits/chosen": -2.1020731925964355, "logits/rejected": -2.2140867710113525, "logps/chosen": -277.33599853515625, "logps/rejected": -283.2631530761719, "loss": 0.6616, "rewards/accuracies": 0.6875, "rewards/chosen": -1.192931056022644, "rewards/margins": 0.10631629824638367, "rewards/rejected": -1.2992472648620605, "step": 500 } ], "logging_steps": 1, "max_steps": 965, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }