diff --git "a/checkpoint-2000/trainer_state.json" "b/checkpoint-2000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-2000/trainer_state.json" @@ -0,0 +1,28021 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.6178010471204187, + "eval_steps": 500, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.000000000000001e-07, + "logits/chosen": -2.729219675064087, + "logits/rejected": -2.713034152984619, + "logps/chosen": -183.00042724609375, + "logps/rejected": -183.33316040039062, + "loss": 0.6973, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.004850482568144798, + "rewards/margins": -0.007815884426236153, + "rewards/rejected": 0.0029654023237526417, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 1.0000000000000002e-06, + "logits/chosen": -2.480727195739746, + "logits/rejected": -2.563934564590454, + "logps/chosen": -159.55963134765625, + "logps/rejected": -157.36929321289062, + "loss": 0.6902, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.004495501983910799, + "rewards/margins": 0.006143546663224697, + "rewards/rejected": -0.0016480451449751854, + "step": 2 + }, + { + "epoch": 0.0, + "learning_rate": 1.5e-06, + "logits/chosen": -2.856149911880493, + "logits/rejected": -2.8624300956726074, + "logps/chosen": -241.56802368164062, + "logps/rejected": -251.95797729492188, + "loss": 0.6969, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.01090860366821289, + "rewards/margins": -0.007115649990737438, + "rewards/rejected": -0.0037929536774754524, + "step": 3 + }, + { + "epoch": 0.01, + "learning_rate": 2.0000000000000003e-06, + "logits/chosen": -2.473580837249756, + "logits/rejected": -2.6020100116729736, + "logps/chosen": -138.55348205566406, + "logps/rejected": -167.7603759765625, + "loss": 0.6844, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.01281428337097168, + "rewards/margins": 0.01771531254053116, + "rewards/rejected": -0.0049010273069143295, + "step": 4 + }, + { + "epoch": 0.01, + "learning_rate": 2.5e-06, + "logits/chosen": -2.446133613586426, + "logits/rejected": -2.5022342205047607, + "logps/chosen": -140.56512451171875, + "logps/rejected": -178.04331970214844, + "loss": 0.6995, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.013703584671020508, + "rewards/margins": -0.012241363525390625, + "rewards/rejected": -0.001462221029214561, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 3e-06, + "logits/chosen": -2.5403974056243896, + "logits/rejected": -2.650925874710083, + "logps/chosen": -162.82369995117188, + "logps/rejected": -214.57489013671875, + "loss": 0.6909, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.003328490536659956, + "rewards/margins": 0.004930590279400349, + "rewards/rejected": -0.0016021011397242546, + "step": 6 + }, + { + "epoch": 0.01, + "learning_rate": 3.5000000000000004e-06, + "logits/chosen": -2.6547656059265137, + "logits/rejected": -2.577648162841797, + "logps/chosen": -219.3771514892578, + "logps/rejected": -215.02362060546875, + "loss": 0.6931, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0027452707290649414, + "rewards/margins": 0.0002449510502628982, + "rewards/rejected": 0.0025003196205943823, + "step": 7 + }, + { + "epoch": 0.01, + "learning_rate": 4.000000000000001e-06, + "logits/chosen": -2.5475914478302, + "logits/rejected": -2.586148500442505, + "logps/chosen": -214.5223388671875, + "logps/rejected": -236.43626403808594, + "loss": 0.6928, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.007447218522429466, + "rewards/margins": 0.0010163071565330029, + "rewards/rejected": 0.006430912297219038, + "step": 8 + }, + { + "epoch": 0.01, + "learning_rate": 4.5e-06, + "logits/chosen": -2.657726526260376, + "logits/rejected": -2.7398831844329834, + "logps/chosen": -158.13832092285156, + "logps/rejected": -176.91400146484375, + "loss": 0.6877, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.009470987133681774, + "rewards/margins": 0.011165929958224297, + "rewards/rejected": -0.0016949418932199478, + "step": 9 + }, + { + "epoch": 0.01, + "learning_rate": 5e-06, + "logits/chosen": -2.3761770725250244, + "logits/rejected": -2.4064137935638428, + "logps/chosen": -176.59835815429688, + "logps/rejected": -163.67300415039062, + "loss": 0.6909, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.002156305592507124, + "rewards/margins": 0.004867983516305685, + "rewards/rejected": -0.002711677923798561, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 5.500000000000001e-06, + "logits/chosen": -2.6009013652801514, + "logits/rejected": -2.645084857940674, + "logps/chosen": -197.91195678710938, + "logps/rejected": -245.907470703125, + "loss": 0.695, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.001973224338144064, + "rewards/margins": -0.0031423806212842464, + "rewards/rejected": 0.0011691567488014698, + "step": 11 + }, + { + "epoch": 0.02, + "learning_rate": 6e-06, + "logits/chosen": -2.70407772064209, + "logits/rejected": -2.713822364807129, + "logps/chosen": -190.91189575195312, + "logps/rejected": -190.74317932128906, + "loss": 0.6921, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0004999642260372639, + "rewards/margins": 0.002835321705788374, + "rewards/rejected": -0.00233535747975111, + "step": 12 + }, + { + "epoch": 0.02, + "learning_rate": 6.5000000000000004e-06, + "logits/chosen": -2.585596799850464, + "logits/rejected": -2.6356894969940186, + "logps/chosen": -214.14459228515625, + "logps/rejected": -242.9970245361328, + "loss": 0.687, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.006649780552834272, + "rewards/margins": 0.01303944643586874, + "rewards/rejected": -0.019689226523041725, + "step": 13 + }, + { + "epoch": 0.02, + "learning_rate": 7.000000000000001e-06, + "logits/chosen": -2.7044496536254883, + "logits/rejected": -2.6631391048431396, + "logps/chosen": -183.95582580566406, + "logps/rejected": -169.8933868408203, + "loss": 0.6967, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.01296384446322918, + "rewards/margins": -0.006747078616172075, + "rewards/rejected": -0.0062167649157345295, + "step": 14 + }, + { + "epoch": 0.02, + "learning_rate": 7.5e-06, + "logits/chosen": -2.6691248416900635, + "logits/rejected": -2.6515817642211914, + "logps/chosen": -161.9134979248047, + "logps/rejected": -170.60101318359375, + "loss": 0.6866, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.009864186868071556, + "rewards/margins": 0.01341402530670166, + "rewards/rejected": -0.0035498379729688168, + "step": 15 + }, + { + "epoch": 0.02, + "learning_rate": 8.000000000000001e-06, + "logits/chosen": -2.6293349266052246, + "logits/rejected": -2.652617931365967, + "logps/chosen": -161.3071746826172, + "logps/rejected": -169.07638549804688, + "loss": 0.6808, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.005685711745172739, + "rewards/margins": 0.025187280029058456, + "rewards/rejected": -0.019501566886901855, + "step": 16 + }, + { + "epoch": 0.02, + "learning_rate": 8.500000000000002e-06, + "logits/chosen": -2.6463351249694824, + "logits/rejected": -2.696180582046509, + "logps/chosen": -154.61085510253906, + "logps/rejected": -148.3529510498047, + "loss": 0.6916, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0029291389510035515, + "rewards/margins": 0.0034487005323171616, + "rewards/rejected": -0.0005195615813136101, + "step": 17 + }, + { + "epoch": 0.02, + "learning_rate": 9e-06, + "logits/chosen": -2.5302553176879883, + "logits/rejected": -2.4991636276245117, + "logps/chosen": -152.20655822753906, + "logps/rejected": -131.0479278564453, + "loss": 0.6954, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.009920882061123848, + "rewards/margins": -0.004111624322831631, + "rewards/rejected": -0.005809259135276079, + "step": 18 + }, + { + "epoch": 0.02, + "learning_rate": 9.5e-06, + "logits/chosen": -2.5337584018707275, + "logits/rejected": -2.6624388694763184, + "logps/chosen": -160.88140869140625, + "logps/rejected": -195.38058471679688, + "loss": 0.6829, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.540081575512886e-05, + "rewards/margins": 0.021059704944491386, + "rewards/rejected": -0.021044302731752396, + "step": 19 + }, + { + "epoch": 0.03, + "learning_rate": 1e-05, + "logits/chosen": -2.7187581062316895, + "logits/rejected": -2.699402093887329, + "logps/chosen": -184.01405334472656, + "logps/rejected": -200.99124145507812, + "loss": 0.6898, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.007425189018249512, + "rewards/margins": 0.007250881753861904, + "rewards/rejected": -0.014676070772111416, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 1.05e-05, + "logits/chosen": -2.5241212844848633, + "logits/rejected": -2.5988845825195312, + "logps/chosen": -181.3677215576172, + "logps/rejected": -156.8072509765625, + "loss": 0.687, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.006232499144971371, + "rewards/margins": 0.01287851296365261, + "rewards/rejected": -0.006646013353019953, + "step": 21 + }, + { + "epoch": 0.03, + "learning_rate": 1.1000000000000001e-05, + "logits/chosen": -2.531973361968994, + "logits/rejected": -2.594179153442383, + "logps/chosen": -166.6884307861328, + "logps/rejected": -173.5614013671875, + "loss": 0.6942, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.007440138608217239, + "rewards/margins": -0.0019538167398422956, + "rewards/rejected": -0.0054863216355443, + "step": 22 + }, + { + "epoch": 0.03, + "learning_rate": 1.1500000000000002e-05, + "logits/chosen": -2.4815938472747803, + "logits/rejected": -2.4820916652679443, + "logps/chosen": -131.22227478027344, + "logps/rejected": -131.2086181640625, + "loss": 0.6915, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.0006843092851340771, + "rewards/margins": 0.003669046564027667, + "rewards/rejected": -0.004353356547653675, + "step": 23 + }, + { + "epoch": 0.03, + "learning_rate": 1.2e-05, + "logits/chosen": -2.4899590015411377, + "logits/rejected": -2.5294911861419678, + "logps/chosen": -149.33543395996094, + "logps/rejected": -141.8245391845703, + "loss": 0.6983, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.017502309754490852, + "rewards/margins": -0.010094404220581055, + "rewards/rejected": -0.0074079036712646484, + "step": 24 + }, + { + "epoch": 0.03, + "learning_rate": 1.25e-05, + "logits/chosen": -2.5714809894561768, + "logits/rejected": -2.5547332763671875, + "logps/chosen": -167.8873291015625, + "logps/rejected": -173.3172149658203, + "loss": 0.69, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00382807245478034, + "rewards/margins": 0.006511807441711426, + "rewards/rejected": -0.010339880362153053, + "step": 25 + }, + { + "epoch": 0.03, + "learning_rate": 1.3000000000000001e-05, + "logits/chosen": -2.470759630203247, + "logits/rejected": -2.4105560779571533, + "logps/chosen": -174.09585571289062, + "logps/rejected": -175.88677978515625, + "loss": 0.6914, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0010800834279507399, + "rewards/margins": 0.004079150035977364, + "rewards/rejected": -0.0029990668408572674, + "step": 26 + }, + { + "epoch": 0.04, + "learning_rate": 1.3500000000000001e-05, + "logits/chosen": -2.6590945720672607, + "logits/rejected": -2.649517059326172, + "logps/chosen": -174.6903839111328, + "logps/rejected": -165.15533447265625, + "loss": 0.69, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.012396741658449173, + "rewards/margins": 0.006480884738266468, + "rewards/rejected": -0.018877625465393066, + "step": 27 + }, + { + "epoch": 0.04, + "learning_rate": 1.4000000000000001e-05, + "logits/chosen": -2.517886161804199, + "logits/rejected": -2.6399176120758057, + "logps/chosen": -154.28765869140625, + "logps/rejected": -181.42474365234375, + "loss": 0.6867, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.01030280627310276, + "rewards/margins": 0.01349327526986599, + "rewards/rejected": -0.0237960796803236, + "step": 28 + }, + { + "epoch": 0.04, + "learning_rate": 1.45e-05, + "logits/chosen": -2.5826056003570557, + "logits/rejected": -2.5789594650268555, + "logps/chosen": -157.37286376953125, + "logps/rejected": -154.72024536132812, + "loss": 0.6894, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.016228055581450462, + "rewards/margins": 0.007857107557356358, + "rewards/rejected": -0.024085164070129395, + "step": 29 + }, + { + "epoch": 0.04, + "learning_rate": 1.5e-05, + "logits/chosen": -2.6575980186462402, + "logits/rejected": -2.704176664352417, + "logps/chosen": -159.80322265625, + "logps/rejected": -165.82786560058594, + "loss": 0.6829, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.00888748187571764, + "rewards/margins": 0.02125699445605278, + "rewards/rejected": -0.030144479125738144, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 1.55e-05, + "logits/chosen": -2.650428533554077, + "logits/rejected": -2.693262815475464, + "logps/chosen": -164.53692626953125, + "logps/rejected": -174.57235717773438, + "loss": 0.6852, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.001265859231352806, + "rewards/margins": 0.01664908044040203, + "rewards/rejected": -0.01538322027772665, + "step": 31 + }, + { + "epoch": 0.04, + "learning_rate": 1.6000000000000003e-05, + "logits/chosen": -2.6280901432037354, + "logits/rejected": -2.6152122020721436, + "logps/chosen": -174.9736328125, + "logps/rejected": -209.5363311767578, + "loss": 0.6859, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.013908598572015762, + "rewards/margins": 0.016062045469880104, + "rewards/rejected": -0.029970645904541016, + "step": 32 + }, + { + "epoch": 0.04, + "learning_rate": 1.65e-05, + "logits/chosen": -2.5862505435943604, + "logits/rejected": -2.5280563831329346, + "logps/chosen": -160.01731872558594, + "logps/rejected": -172.03590393066406, + "loss": 0.694, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03881430625915527, + "rewards/margins": -0.0009270897135138512, + "rewards/rejected": -0.03788721561431885, + "step": 33 + }, + { + "epoch": 0.04, + "learning_rate": 1.7000000000000003e-05, + "logits/chosen": -2.522751808166504, + "logits/rejected": -2.49782657623291, + "logps/chosen": -167.8050537109375, + "logps/rejected": -177.90399169921875, + "loss": 0.7061, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.031221888959407806, + "rewards/margins": -0.02547764778137207, + "rewards/rejected": -0.005744242575019598, + "step": 34 + }, + { + "epoch": 0.05, + "learning_rate": 1.75e-05, + "logits/chosen": -2.557039260864258, + "logits/rejected": -2.4520561695098877, + "logps/chosen": -143.8395538330078, + "logps/rejected": -140.37631225585938, + "loss": 0.6944, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.039483096450567245, + "rewards/margins": -0.0023098706733435392, + "rewards/rejected": -0.03717322647571564, + "step": 35 + }, + { + "epoch": 0.05, + "learning_rate": 1.8e-05, + "logits/chosen": -2.6500961780548096, + "logits/rejected": -2.5340402126312256, + "logps/chosen": -172.447998046875, + "logps/rejected": -198.5001678466797, + "loss": 0.7036, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.035362813621759415, + "rewards/margins": -0.020008588209748268, + "rewards/rejected": -0.015354226343333721, + "step": 36 + }, + { + "epoch": 0.05, + "learning_rate": 1.85e-05, + "logits/chosen": -2.6206865310668945, + "logits/rejected": -2.6976592540740967, + "logps/chosen": -167.8064422607422, + "logps/rejected": -185.81964111328125, + "loss": 0.6987, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.03894533962011337, + "rewards/margins": -0.009856510907411575, + "rewards/rejected": -0.029088832437992096, + "step": 37 + }, + { + "epoch": 0.05, + "learning_rate": 1.9e-05, + "logits/chosen": -2.434297800064087, + "logits/rejected": -2.651834487915039, + "logps/chosen": -169.50946044921875, + "logps/rejected": -257.7209167480469, + "loss": 0.6941, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.03687644004821777, + "rewards/margins": -0.0012842637952417135, + "rewards/rejected": -0.035592176020145416, + "step": 38 + }, + { + "epoch": 0.05, + "learning_rate": 1.9500000000000003e-05, + "logits/chosen": -2.4155445098876953, + "logits/rejected": -2.487833023071289, + "logps/chosen": -180.0078582763672, + "logps/rejected": -207.1255340576172, + "loss": 0.6978, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.037531279027462006, + "rewards/margins": -0.008796263486146927, + "rewards/rejected": -0.028735019266605377, + "step": 39 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "logits/chosen": -2.6476895809173584, + "logits/rejected": -2.707326650619507, + "logps/chosen": -182.0277862548828, + "logps/rejected": -180.2669677734375, + "loss": 0.6857, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.029536105692386627, + "rewards/margins": 0.015740489587187767, + "rewards/rejected": -0.04527659714221954, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 2.05e-05, + "logits/chosen": -2.691012144088745, + "logits/rejected": -2.688505172729492, + "logps/chosen": -173.92041015625, + "logps/rejected": -208.61599731445312, + "loss": 0.6781, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.01940765418112278, + "rewards/margins": 0.03146040812134743, + "rewards/rejected": -0.05086805671453476, + "step": 41 + }, + { + "epoch": 0.05, + "learning_rate": 2.1e-05, + "logits/chosen": -2.619537353515625, + "logits/rejected": -2.6051697731018066, + "logps/chosen": -193.06666564941406, + "logps/rejected": -178.87713623046875, + "loss": 0.699, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.035286471247673035, + "rewards/margins": -0.010646676644682884, + "rewards/rejected": -0.0246397964656353, + "step": 42 + }, + { + "epoch": 0.06, + "learning_rate": 2.15e-05, + "logits/chosen": -2.611173391342163, + "logits/rejected": -2.6004340648651123, + "logps/chosen": -188.3818817138672, + "logps/rejected": -214.30926513671875, + "loss": 0.6674, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.0015826929593458772, + "rewards/margins": 0.05373835563659668, + "rewards/rejected": -0.05215566232800484, + "step": 43 + }, + { + "epoch": 0.06, + "learning_rate": 2.2000000000000003e-05, + "logits/chosen": -2.667587995529175, + "logits/rejected": -2.688136339187622, + "logps/chosen": -188.89466857910156, + "logps/rejected": -207.63580322265625, + "loss": 0.6982, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.04630360007286072, + "rewards/margins": -0.008599425666034222, + "rewards/rejected": -0.03770418092608452, + "step": 44 + }, + { + "epoch": 0.06, + "learning_rate": 2.25e-05, + "logits/chosen": -2.5165350437164307, + "logits/rejected": -2.441213607788086, + "logps/chosen": -170.480712890625, + "logps/rejected": -159.08981323242188, + "loss": 0.6945, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03015182353556156, + "rewards/margins": -0.0018717292696237564, + "rewards/rejected": -0.028280090540647507, + "step": 45 + }, + { + "epoch": 0.06, + "learning_rate": 2.3000000000000003e-05, + "logits/chosen": -2.523725748062134, + "logits/rejected": -2.6105730533599854, + "logps/chosen": -170.0879364013672, + "logps/rejected": -197.1090087890625, + "loss": 0.6884, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.022913338616490364, + "rewards/margins": 0.010608267970383167, + "rewards/rejected": -0.033521607518196106, + "step": 46 + }, + { + "epoch": 0.06, + "learning_rate": 2.35e-05, + "logits/chosen": -2.5311026573181152, + "logits/rejected": -2.54695463180542, + "logps/chosen": -178.0673065185547, + "logps/rejected": -182.32875061035156, + "loss": 0.6816, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.008729481138288975, + "rewards/margins": 0.02449822425842285, + "rewards/rejected": -0.03322770446538925, + "step": 47 + }, + { + "epoch": 0.06, + "learning_rate": 2.4e-05, + "logits/chosen": -2.6589465141296387, + "logits/rejected": -2.6210758686065674, + "logps/chosen": -182.40536499023438, + "logps/rejected": -169.12103271484375, + "loss": 0.6799, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.003579068696126342, + "rewards/margins": 0.02751018851995468, + "rewards/rejected": -0.031089257448911667, + "step": 48 + }, + { + "epoch": 0.06, + "learning_rate": 2.45e-05, + "logits/chosen": -2.513227701187134, + "logits/rejected": -2.543900489807129, + "logps/chosen": -170.51919555664062, + "logps/rejected": -168.29690551757812, + "loss": 0.7047, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.021081138402223587, + "rewards/margins": -0.022327663376927376, + "rewards/rejected": 0.0012465240433812141, + "step": 49 + }, + { + "epoch": 0.07, + "learning_rate": 2.5e-05, + "logits/chosen": -2.434335470199585, + "logits/rejected": -2.4906935691833496, + "logps/chosen": -162.16989135742188, + "logps/rejected": -212.56082153320312, + "loss": 0.6807, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.027100684121251106, + "rewards/margins": 0.02660501003265381, + "rewards/rejected": -0.053705692291259766, + "step": 50 + }, + { + "epoch": 0.07, + "learning_rate": 2.5500000000000003e-05, + "logits/chosen": -2.536504030227661, + "logits/rejected": -2.6495611667633057, + "logps/chosen": -171.29580688476562, + "logps/rejected": -191.54605102539062, + "loss": 0.6815, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.025197075679898262, + "rewards/margins": 0.024894431233406067, + "rewards/rejected": -0.05009150505065918, + "step": 51 + }, + { + "epoch": 0.07, + "learning_rate": 2.6000000000000002e-05, + "logits/chosen": -2.6077020168304443, + "logits/rejected": -2.564440965652466, + "logps/chosen": -170.64273071289062, + "logps/rejected": -217.3800811767578, + "loss": 0.7054, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.029342200607061386, + "rewards/margins": -0.02205488458275795, + "rewards/rejected": -0.007287311367690563, + "step": 52 + }, + { + "epoch": 0.07, + "learning_rate": 2.6500000000000004e-05, + "logits/chosen": -2.4113476276397705, + "logits/rejected": -2.4167730808258057, + "logps/chosen": -158.03866577148438, + "logps/rejected": -203.66368103027344, + "loss": 0.7, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.03733339160680771, + "rewards/margins": -0.011522197164595127, + "rewards/rejected": -0.025811197236180305, + "step": 53 + }, + { + "epoch": 0.07, + "learning_rate": 2.7000000000000002e-05, + "logits/chosen": -2.5948705673217773, + "logits/rejected": -2.584592342376709, + "logps/chosen": -163.16751098632812, + "logps/rejected": -196.3957061767578, + "loss": 0.687, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.050568290054798126, + "rewards/margins": 0.014712072908878326, + "rewards/rejected": -0.06528037041425705, + "step": 54 + }, + { + "epoch": 0.07, + "learning_rate": 2.7500000000000004e-05, + "logits/chosen": -2.524641513824463, + "logits/rejected": -2.445185661315918, + "logps/chosen": -167.29495239257812, + "logps/rejected": -178.15013122558594, + "loss": 0.694, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.03202950954437256, + "rewards/margins": 0.00026745768263936043, + "rewards/rejected": -0.03229696676135063, + "step": 55 + }, + { + "epoch": 0.07, + "learning_rate": 2.8000000000000003e-05, + "logits/chosen": -2.7647242546081543, + "logits/rejected": -2.721259117126465, + "logps/chosen": -179.40574645996094, + "logps/rejected": -180.4486541748047, + "loss": 0.6771, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0023113014176487923, + "rewards/margins": 0.03354344516992569, + "rewards/rejected": -0.035854749381542206, + "step": 56 + }, + { + "epoch": 0.07, + "learning_rate": 2.8499999999999998e-05, + "logits/chosen": -2.676018714904785, + "logits/rejected": -2.6648664474487305, + "logps/chosen": -197.95919799804688, + "logps/rejected": -175.45230102539062, + "loss": 0.6764, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.00892419833689928, + "rewards/margins": 0.036644406616687775, + "rewards/rejected": -0.04556860774755478, + "step": 57 + }, + { + "epoch": 0.08, + "learning_rate": 2.9e-05, + "logits/chosen": -2.644045829772949, + "logits/rejected": -2.7341084480285645, + "logps/chosen": -180.7408905029297, + "logps/rejected": -189.23818969726562, + "loss": 0.6963, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0153807383030653, + "rewards/margins": -0.005131007172167301, + "rewards/rejected": -0.010249733924865723, + "step": 58 + }, + { + "epoch": 0.08, + "learning_rate": 2.95e-05, + "logits/chosen": -2.6532645225524902, + "logits/rejected": -2.552724599838257, + "logps/chosen": -178.77191162109375, + "logps/rejected": -175.42742919921875, + "loss": 0.6841, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03405280038714409, + "rewards/margins": 0.020096803084015846, + "rewards/rejected": -0.054149605333805084, + "step": 59 + }, + { + "epoch": 0.08, + "learning_rate": 3e-05, + "logits/chosen": -2.5994341373443604, + "logits/rejected": -2.6342806816101074, + "logps/chosen": -176.28402709960938, + "logps/rejected": -210.64498901367188, + "loss": 0.6975, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.025557922199368477, + "rewards/margins": -0.004868890158832073, + "rewards/rejected": -0.02068903297185898, + "step": 60 + }, + { + "epoch": 0.08, + "learning_rate": 3.05e-05, + "logits/chosen": -2.58650279045105, + "logits/rejected": -2.7233827114105225, + "logps/chosen": -184.57443237304688, + "logps/rejected": -175.98129272460938, + "loss": 0.7081, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.06466653198003769, + "rewards/margins": -0.02823822945356369, + "rewards/rejected": -0.0364283062517643, + "step": 61 + }, + { + "epoch": 0.08, + "learning_rate": 3.1e-05, + "logits/chosen": -2.6913092136383057, + "logits/rejected": -2.5653786659240723, + "logps/chosen": -203.69729614257812, + "logps/rejected": -190.5261688232422, + "loss": 0.7161, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.04825315251946449, + "rewards/margins": -0.04361088201403618, + "rewards/rejected": -0.004642271436750889, + "step": 62 + }, + { + "epoch": 0.08, + "learning_rate": 3.15e-05, + "logits/chosen": -2.666816473007202, + "logits/rejected": -2.604631185531616, + "logps/chosen": -225.96981811523438, + "logps/rejected": -193.25982666015625, + "loss": 0.7001, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.040044669061899185, + "rewards/margins": -0.012522673234343529, + "rewards/rejected": -0.027521992102265358, + "step": 63 + }, + { + "epoch": 0.08, + "learning_rate": 3.2000000000000005e-05, + "logits/chosen": -2.6570017337799072, + "logits/rejected": -2.633661985397339, + "logps/chosen": -188.25892639160156, + "logps/rejected": -171.55096435546875, + "loss": 0.7173, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.08987895399332047, + "rewards/margins": -0.04469916597008705, + "rewards/rejected": -0.04517979919910431, + "step": 64 + }, + { + "epoch": 0.09, + "learning_rate": 3.2500000000000004e-05, + "logits/chosen": -2.582909107208252, + "logits/rejected": -2.587308883666992, + "logps/chosen": -168.5953369140625, + "logps/rejected": -180.11024475097656, + "loss": 0.689, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.038271259516477585, + "rewards/margins": 0.01016156654804945, + "rewards/rejected": -0.04843283072113991, + "step": 65 + }, + { + "epoch": 0.09, + "learning_rate": 3.3e-05, + "logits/chosen": -2.7339940071105957, + "logits/rejected": -2.6128218173980713, + "logps/chosen": -187.80320739746094, + "logps/rejected": -176.04495239257812, + "loss": 0.6948, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.08385930210351944, + "rewards/margins": -0.0003412736114114523, + "rewards/rejected": -0.08351802825927734, + "step": 66 + }, + { + "epoch": 0.09, + "learning_rate": 3.35e-05, + "logits/chosen": -2.406219482421875, + "logits/rejected": -2.435150623321533, + "logps/chosen": -150.17405700683594, + "logps/rejected": -174.16119384765625, + "loss": 0.68, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.030688336119055748, + "rewards/margins": 0.027779744938015938, + "rewards/rejected": -0.058468081057071686, + "step": 67 + }, + { + "epoch": 0.09, + "learning_rate": 3.4000000000000007e-05, + "logits/chosen": -2.7922043800354004, + "logits/rejected": -2.8177366256713867, + "logps/chosen": -202.07823181152344, + "logps/rejected": -233.48065185546875, + "loss": 0.688, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.10107097774744034, + "rewards/margins": 0.014442582614719868, + "rewards/rejected": -0.11551356315612793, + "step": 68 + }, + { + "epoch": 0.09, + "learning_rate": 3.45e-05, + "logits/chosen": -2.621925115585327, + "logits/rejected": -2.548135280609131, + "logps/chosen": -163.22723388671875, + "logps/rejected": -183.32742309570312, + "loss": 0.6853, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.009019946679472923, + "rewards/margins": 0.020815372467041016, + "rewards/rejected": -0.029835321009159088, + "step": 69 + }, + { + "epoch": 0.09, + "learning_rate": 3.5e-05, + "logits/chosen": -2.7518372535705566, + "logits/rejected": -2.7260899543762207, + "logps/chosen": -223.34564208984375, + "logps/rejected": -245.5427703857422, + "loss": 0.7156, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.12107516080141068, + "rewards/margins": -0.03887636959552765, + "rewards/rejected": -0.08219879120588303, + "step": 70 + }, + { + "epoch": 0.09, + "learning_rate": 3.55e-05, + "logits/chosen": -2.5108494758605957, + "logits/rejected": -2.4832143783569336, + "logps/chosen": -166.49508666992188, + "logps/rejected": -157.7345428466797, + "loss": 0.6918, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.05453377217054367, + "rewards/margins": 0.0069806561805307865, + "rewards/rejected": -0.06151442974805832, + "step": 71 + }, + { + "epoch": 0.09, + "learning_rate": 3.6e-05, + "logits/chosen": -2.636711359024048, + "logits/rejected": -2.6597518920898438, + "logps/chosen": -174.1949005126953, + "logps/rejected": -189.0026397705078, + "loss": 0.6782, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.005148457363247871, + "rewards/margins": 0.03288703039288521, + "rewards/rejected": -0.03803548216819763, + "step": 72 + }, + { + "epoch": 0.1, + "learning_rate": 3.65e-05, + "logits/chosen": -2.6395423412323, + "logits/rejected": -2.6543948650360107, + "logps/chosen": -146.5878448486328, + "logps/rejected": -180.54176330566406, + "loss": 0.7075, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.0687551274895668, + "rewards/margins": -0.02330932579934597, + "rewards/rejected": -0.045445799827575684, + "step": 73 + }, + { + "epoch": 0.1, + "learning_rate": 3.7e-05, + "logits/chosen": -2.4257636070251465, + "logits/rejected": -2.502183675765991, + "logps/chosen": -179.7578125, + "logps/rejected": -227.62875366210938, + "loss": 0.7125, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.03154797852039337, + "rewards/margins": -0.03330230712890625, + "rewards/rejected": 0.0017543300054967403, + "step": 74 + }, + { + "epoch": 0.1, + "learning_rate": 3.7500000000000003e-05, + "logits/chosen": -2.3436474800109863, + "logits/rejected": -2.498957633972168, + "logps/chosen": -151.80540466308594, + "logps/rejected": -164.84146118164062, + "loss": 0.6777, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.017311906442046165, + "rewards/margins": 0.03238987550139427, + "rewards/rejected": -0.049701791256666183, + "step": 75 + }, + { + "epoch": 0.1, + "learning_rate": 3.8e-05, + "logits/chosen": -2.558389663696289, + "logits/rejected": -2.6029064655303955, + "logps/chosen": -176.9400177001953, + "logps/rejected": -177.18336486816406, + "loss": 0.7002, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04031562805175781, + "rewards/margins": -0.009864617139101028, + "rewards/rejected": -0.030451007187366486, + "step": 76 + }, + { + "epoch": 0.1, + "learning_rate": 3.85e-05, + "logits/chosen": -2.651350259780884, + "logits/rejected": -2.7427544593811035, + "logps/chosen": -210.8737335205078, + "logps/rejected": -199.6597137451172, + "loss": 0.6787, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.048238325864076614, + "rewards/margins": 0.031043197959661484, + "rewards/rejected": -0.0792815238237381, + "step": 77 + }, + { + "epoch": 0.1, + "learning_rate": 3.9000000000000006e-05, + "logits/chosen": -2.4323079586029053, + "logits/rejected": -2.576862335205078, + "logps/chosen": -127.96165466308594, + "logps/rejected": -161.3166961669922, + "loss": 0.724, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1004774421453476, + "rewards/margins": -0.053969353437423706, + "rewards/rejected": -0.046508073806762695, + "step": 78 + }, + { + "epoch": 0.1, + "learning_rate": 3.9500000000000005e-05, + "logits/chosen": -2.526383638381958, + "logits/rejected": -2.552208423614502, + "logps/chosen": -183.66246032714844, + "logps/rejected": -195.2118377685547, + "loss": 0.6934, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.10480672121047974, + "rewards/margins": 0.005925657227635384, + "rewards/rejected": -0.11073236912488937, + "step": 79 + }, + { + "epoch": 0.1, + "learning_rate": 4e-05, + "logits/chosen": -2.635021209716797, + "logits/rejected": -2.7062149047851562, + "logps/chosen": -203.2865753173828, + "logps/rejected": -200.7851104736328, + "loss": 0.6553, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.012298012152314186, + "rewards/margins": 0.08196020871400833, + "rewards/rejected": -0.09425821155309677, + "step": 80 + }, + { + "epoch": 0.11, + "learning_rate": 4.05e-05, + "logits/chosen": -2.48476505279541, + "logits/rejected": -2.613018274307251, + "logps/chosen": -159.0533905029297, + "logps/rejected": -222.8763427734375, + "loss": 0.6488, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.017749834805727005, + "rewards/margins": 0.10052147507667542, + "rewards/rejected": -0.11827130615711212, + "step": 81 + }, + { + "epoch": 0.11, + "learning_rate": 4.1e-05, + "logits/chosen": -2.5390264987945557, + "logits/rejected": -2.5940589904785156, + "logps/chosen": -220.54562377929688, + "logps/rejected": -193.7894744873047, + "loss": 0.7168, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.15124498307704926, + "rewards/margins": -0.04028485342860222, + "rewards/rejected": -0.11096014082431793, + "step": 82 + }, + { + "epoch": 0.11, + "learning_rate": 4.15e-05, + "logits/chosen": -2.6519391536712646, + "logits/rejected": -2.6670496463775635, + "logps/chosen": -174.1206512451172, + "logps/rejected": -182.67996215820312, + "loss": 0.689, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.049321744590997696, + "rewards/margins": 0.012697530910372734, + "rewards/rejected": -0.06201927736401558, + "step": 83 + }, + { + "epoch": 0.11, + "learning_rate": 4.2e-05, + "logits/chosen": -2.717592239379883, + "logits/rejected": -2.7927956581115723, + "logps/chosen": -169.05294799804688, + "logps/rejected": -190.73846435546875, + "loss": 0.664, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03787894546985626, + "rewards/margins": 0.06431596726179123, + "rewards/rejected": -0.1021949052810669, + "step": 84 + }, + { + "epoch": 0.11, + "learning_rate": 4.25e-05, + "logits/chosen": -2.5930697917938232, + "logits/rejected": -2.509345531463623, + "logps/chosen": -179.2568817138672, + "logps/rejected": -168.377685546875, + "loss": 0.6953, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05612773820757866, + "rewards/margins": 0.006718709133565426, + "rewards/rejected": -0.06284645199775696, + "step": 85 + }, + { + "epoch": 0.11, + "learning_rate": 4.3e-05, + "logits/chosen": -2.5787692070007324, + "logits/rejected": -2.64978289604187, + "logps/chosen": -172.04168701171875, + "logps/rejected": -160.20840454101562, + "loss": 0.6896, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.11709931492805481, + "rewards/margins": 0.018632344901561737, + "rewards/rejected": -0.13573165237903595, + "step": 86 + }, + { + "epoch": 0.11, + "learning_rate": 4.35e-05, + "logits/chosen": -2.480304002761841, + "logits/rejected": -2.4922451972961426, + "logps/chosen": -200.24014282226562, + "logps/rejected": -218.20352172851562, + "loss": 0.6417, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08723511546850204, + "rewards/margins": 0.12010292708873749, + "rewards/rejected": -0.20733806490898132, + "step": 87 + }, + { + "epoch": 0.12, + "learning_rate": 4.4000000000000006e-05, + "logits/chosen": -2.7480030059814453, + "logits/rejected": -2.703220844268799, + "logps/chosen": -164.8146514892578, + "logps/rejected": -173.18063354492188, + "loss": 0.7038, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1111496239900589, + "rewards/margins": -0.00250411219894886, + "rewards/rejected": -0.10864551365375519, + "step": 88 + }, + { + "epoch": 0.12, + "learning_rate": 4.4500000000000004e-05, + "logits/chosen": -2.4365036487579346, + "logits/rejected": -2.5382070541381836, + "logps/chosen": -173.4228515625, + "logps/rejected": -228.0253448486328, + "loss": 0.662, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.022041939198970795, + "rewards/margins": 0.07041654735803604, + "rewards/rejected": -0.09245848655700684, + "step": 89 + }, + { + "epoch": 0.12, + "learning_rate": 4.5e-05, + "logits/chosen": -2.5812180042266846, + "logits/rejected": -2.6432223320007324, + "logps/chosen": -155.36810302734375, + "logps/rejected": -164.1707000732422, + "loss": 0.7, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.2002555876970291, + "rewards/margins": -0.011146757751703262, + "rewards/rejected": -0.18910883367061615, + "step": 90 + }, + { + "epoch": 0.12, + "learning_rate": 4.55e-05, + "logits/chosen": -2.6511974334716797, + "logits/rejected": -2.7204787731170654, + "logps/chosen": -172.86270141601562, + "logps/rejected": -176.37405395507812, + "loss": 0.6919, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.11853408813476562, + "rewards/margins": 0.03132196143269539, + "rewards/rejected": -0.14985604584217072, + "step": 91 + }, + { + "epoch": 0.12, + "learning_rate": 4.600000000000001e-05, + "logits/chosen": -2.753451108932495, + "logits/rejected": -2.6007282733917236, + "logps/chosen": -192.64649963378906, + "logps/rejected": -164.47393798828125, + "loss": 0.7061, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.26501375436782837, + "rewards/margins": -0.007466696202754974, + "rewards/rejected": -0.2575470507144928, + "step": 92 + }, + { + "epoch": 0.12, + "learning_rate": 4.6500000000000005e-05, + "logits/chosen": -2.6831321716308594, + "logits/rejected": -2.657895088195801, + "logps/chosen": -191.9042510986328, + "logps/rejected": -165.17892456054688, + "loss": 0.7455, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.21064424514770508, + "rewards/margins": -0.08327949792146683, + "rewards/rejected": -0.12736473977565765, + "step": 93 + }, + { + "epoch": 0.12, + "learning_rate": 4.7e-05, + "logits/chosen": -2.5191149711608887, + "logits/rejected": -2.6359121799468994, + "logps/chosen": -171.92013549804688, + "logps/rejected": -177.6357421875, + "loss": 0.7085, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.274119108915329, + "rewards/margins": -0.0034166108816862106, + "rewards/rejected": -0.2707024812698364, + "step": 94 + }, + { + "epoch": 0.12, + "learning_rate": 4.75e-05, + "logits/chosen": -2.731271505355835, + "logits/rejected": -2.7019336223602295, + "logps/chosen": -180.59613037109375, + "logps/rejected": -184.8212890625, + "loss": 0.6521, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.14632394909858704, + "rewards/margins": 0.09543509036302567, + "rewards/rejected": -0.2417590469121933, + "step": 95 + }, + { + "epoch": 0.13, + "learning_rate": 4.8e-05, + "logits/chosen": -2.490906000137329, + "logits/rejected": -2.6088736057281494, + "logps/chosen": -162.78256225585938, + "logps/rejected": -176.74588012695312, + "loss": 0.6336, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.14169403910636902, + "rewards/margins": 0.14634834229946136, + "rewards/rejected": -0.2880423963069916, + "step": 96 + }, + { + "epoch": 0.13, + "learning_rate": 4.85e-05, + "logits/chosen": -2.6869685649871826, + "logits/rejected": -2.6618151664733887, + "logps/chosen": -158.98098754882812, + "logps/rejected": -165.8663330078125, + "loss": 0.6656, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.17300596833229065, + "rewards/margins": 0.07225295156240463, + "rewards/rejected": -0.24525892734527588, + "step": 97 + }, + { + "epoch": 0.13, + "learning_rate": 4.9e-05, + "logits/chosen": -2.655244827270508, + "logits/rejected": -2.7159557342529297, + "logps/chosen": -187.57041931152344, + "logps/rejected": -189.180908203125, + "loss": 0.6596, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16985346376895905, + "rewards/margins": 0.0829225480556488, + "rewards/rejected": -0.25277602672576904, + "step": 98 + }, + { + "epoch": 0.13, + "learning_rate": 4.9500000000000004e-05, + "logits/chosen": -2.5713696479797363, + "logits/rejected": -2.619272470474243, + "logps/chosen": -164.45220947265625, + "logps/rejected": -189.011962890625, + "loss": 0.6332, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.1459609568119049, + "rewards/margins": 0.14748626947402954, + "rewards/rejected": -0.29344722628593445, + "step": 99 + }, + { + "epoch": 0.13, + "learning_rate": 5e-05, + "logits/chosen": -2.7377047538757324, + "logits/rejected": -2.807655096054077, + "logps/chosen": -212.2166748046875, + "logps/rejected": -205.24362182617188, + "loss": 0.7779, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.3374475836753845, + "rewards/margins": -0.14080718159675598, + "rewards/rejected": -0.19664038717746735, + "step": 100 + }, + { + "epoch": 0.13, + "learning_rate": 4.999997432392803e-05, + "logits/chosen": -2.604559898376465, + "logits/rejected": -2.5846409797668457, + "logps/chosen": -182.06011962890625, + "logps/rejected": -185.51695251464844, + "loss": 0.723, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.2137359231710434, + "rewards/margins": -0.0511900931596756, + "rewards/rejected": -0.1625458300113678, + "step": 101 + }, + { + "epoch": 0.13, + "learning_rate": 4.9999897295764844e-05, + "logits/chosen": -2.7066619396209717, + "logits/rejected": -2.6727826595306396, + "logps/chosen": -194.32232666015625, + "logps/rejected": -219.6756134033203, + "loss": 0.6905, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2863636910915375, + "rewards/margins": 0.04078531265258789, + "rewards/rejected": -0.32714903354644775, + "step": 102 + }, + { + "epoch": 0.13, + "learning_rate": 4.9999768915668665e-05, + "logits/chosen": -2.5064425468444824, + "logits/rejected": -2.49991512298584, + "logps/chosen": -168.347900390625, + "logps/rejected": -165.95089721679688, + "loss": 0.6714, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.28736329078674316, + "rewards/margins": 0.0547042116522789, + "rewards/rejected": -0.3420674800872803, + "step": 103 + }, + { + "epoch": 0.14, + "learning_rate": 4.999958918390321e-05, + "logits/chosen": -2.568999767303467, + "logits/rejected": -2.629004716873169, + "logps/chosen": -186.33514404296875, + "logps/rejected": -215.6304168701172, + "loss": 0.6716, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.23962916433811188, + "rewards/margins": 0.06436805427074432, + "rewards/rejected": -0.3039971888065338, + "step": 104 + }, + { + "epoch": 0.14, + "learning_rate": 4.999935810083766e-05, + "logits/chosen": -2.667313575744629, + "logits/rejected": -2.6559674739837646, + "logps/chosen": -154.65960693359375, + "logps/rejected": -155.13522338867188, + "loss": 0.667, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2854609489440918, + "rewards/margins": 0.0853797197341919, + "rewards/rejected": -0.3708406686782837, + "step": 105 + }, + { + "epoch": 0.14, + "learning_rate": 4.999907566694667e-05, + "logits/chosen": -2.5488526821136475, + "logits/rejected": -2.485109567642212, + "logps/chosen": -222.78338623046875, + "logps/rejected": -222.6953887939453, + "loss": 0.6656, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.28172624111175537, + "rewards/margins": 0.07234585285186768, + "rewards/rejected": -0.35407203435897827, + "step": 106 + }, + { + "epoch": 0.14, + "learning_rate": 4.9998741882810384e-05, + "logits/chosen": -2.872877597808838, + "logits/rejected": -2.8446714878082275, + "logps/chosen": -182.39260864257812, + "logps/rejected": -213.32289123535156, + "loss": 0.6628, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2876412272453308, + "rewards/margins": 0.08153553307056427, + "rewards/rejected": -0.3691767454147339, + "step": 107 + }, + { + "epoch": 0.14, + "learning_rate": 4.999835674911443e-05, + "logits/chosen": -2.7004892826080322, + "logits/rejected": -2.6362013816833496, + "logps/chosen": -162.95108032226562, + "logps/rejected": -161.67657470703125, + "loss": 0.6744, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3595612049102783, + "rewards/margins": 0.053390078246593475, + "rewards/rejected": -0.4129512906074524, + "step": 108 + }, + { + "epoch": 0.14, + "learning_rate": 4.999792026664991e-05, + "logits/chosen": -2.8488755226135254, + "logits/rejected": -2.914703845977783, + "logps/chosen": -173.82363891601562, + "logps/rejected": -182.16908264160156, + "loss": 0.7065, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.48598194122314453, + "rewards/margins": 0.03310241550207138, + "rewards/rejected": -0.5190844535827637, + "step": 109 + }, + { + "epoch": 0.14, + "learning_rate": 4.9997432436313384e-05, + "logits/chosen": -2.5648086071014404, + "logits/rejected": -2.5932183265686035, + "logps/chosen": -185.70846557617188, + "logps/rejected": -189.32725524902344, + "loss": 0.6842, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.4091907739639282, + "rewards/margins": 0.04006391391158104, + "rewards/rejected": -0.4492546617984772, + "step": 110 + }, + { + "epoch": 0.15, + "learning_rate": 4.99968932591069e-05, + "logits/chosen": -2.770993232727051, + "logits/rejected": -2.7647171020507812, + "logps/chosen": -208.0184326171875, + "logps/rejected": -202.5152587890625, + "loss": 0.6867, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.49601614475250244, + "rewards/margins": 0.027304889634251595, + "rewards/rejected": -0.5233210921287537, + "step": 111 + }, + { + "epoch": 0.15, + "learning_rate": 4.999630273613799e-05, + "logits/chosen": -2.4182567596435547, + "logits/rejected": -2.611553430557251, + "logps/chosen": -138.6142578125, + "logps/rejected": -187.48898315429688, + "loss": 0.7333, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5143216252326965, + "rewards/margins": -0.011710070073604584, + "rewards/rejected": -0.5026116371154785, + "step": 112 + }, + { + "epoch": 0.15, + "learning_rate": 4.999566086861961e-05, + "logits/chosen": -2.7679452896118164, + "logits/rejected": -2.8067147731781006, + "logps/chosen": -186.18055725097656, + "logps/rejected": -198.92605590820312, + "loss": 0.8218, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.6542762517929077, + "rewards/margins": -0.2115614116191864, + "rewards/rejected": -0.4427148103713989, + "step": 113 + }, + { + "epoch": 0.15, + "learning_rate": 4.999496765787024e-05, + "logits/chosen": -2.842291831970215, + "logits/rejected": -2.8023149967193604, + "logps/chosen": -162.9304962158203, + "logps/rejected": -182.9091796875, + "loss": 0.6623, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5319749116897583, + "rewards/margins": 0.08390979468822479, + "rewards/rejected": -0.6158846616744995, + "step": 114 + }, + { + "epoch": 0.15, + "learning_rate": 4.9994223105313774e-05, + "logits/chosen": -2.980092763900757, + "logits/rejected": -2.9631030559539795, + "logps/chosen": -217.55894470214844, + "logps/rejected": -226.1593780517578, + "loss": 0.6824, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4032873809337616, + "rewards/margins": 0.0588911734521389, + "rewards/rejected": -0.4621785283088684, + "step": 115 + }, + { + "epoch": 0.15, + "learning_rate": 4.9993427212479606e-05, + "logits/chosen": -2.5391757488250732, + "logits/rejected": -2.725529909133911, + "logps/chosen": -179.54200744628906, + "logps/rejected": -204.42666625976562, + "loss": 0.6975, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.39779067039489746, + "rewards/margins": 0.0027880650013685226, + "rewards/rejected": -0.40057870745658875, + "step": 116 + }, + { + "epoch": 0.15, + "learning_rate": 4.999257998100254e-05, + "logits/chosen": -2.733851671218872, + "logits/rejected": -2.87308931350708, + "logps/chosen": -176.08485412597656, + "logps/rejected": -193.49484252929688, + "loss": 0.6449, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.40423309803009033, + "rewards/margins": 0.11808924376964569, + "rewards/rejected": -0.5223223567008972, + "step": 117 + }, + { + "epoch": 0.15, + "learning_rate": 4.999168141262289e-05, + "logits/chosen": -2.7447009086608887, + "logits/rejected": -2.808476448059082, + "logps/chosen": -224.22067260742188, + "logps/rejected": -265.44964599609375, + "loss": 0.543, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5002200603485107, + "rewards/margins": 0.40809527039527893, + "rewards/rejected": -0.9083153605461121, + "step": 118 + }, + { + "epoch": 0.16, + "learning_rate": 4.9990731509186376e-05, + "logits/chosen": -2.639643430709839, + "logits/rejected": -2.7004880905151367, + "logps/chosen": -126.65451049804688, + "logps/rejected": -152.20460510253906, + "loss": 0.6935, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5373567342758179, + "rewards/margins": 0.048874713480472565, + "rewards/rejected": -0.5862314105033875, + "step": 119 + }, + { + "epoch": 0.16, + "learning_rate": 4.998973027264419e-05, + "logits/chosen": -2.6515791416168213, + "logits/rejected": -2.7365236282348633, + "logps/chosen": -174.46641540527344, + "logps/rejected": -225.31158447265625, + "loss": 0.6869, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5558931827545166, + "rewards/margins": 0.04842944070696831, + "rewards/rejected": -0.6043226718902588, + "step": 120 + }, + { + "epoch": 0.16, + "learning_rate": 4.998867770505295e-05, + "logits/chosen": -2.724031686782837, + "logits/rejected": -2.709028482437134, + "logps/chosen": -166.08251953125, + "logps/rejected": -183.7383575439453, + "loss": 0.6879, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5829455256462097, + "rewards/margins": 0.03844447806477547, + "rewards/rejected": -0.6213899850845337, + "step": 121 + }, + { + "epoch": 0.16, + "learning_rate": 4.9987573808574726e-05, + "logits/chosen": -2.696485996246338, + "logits/rejected": -2.779482364654541, + "logps/chosen": -161.10256958007812, + "logps/rejected": -185.7605743408203, + "loss": 0.5904, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.41624942421913147, + "rewards/margins": 0.23752669990062714, + "rewards/rejected": -0.6537761092185974, + "step": 122 + }, + { + "epoch": 0.16, + "learning_rate": 4.9986418585477016e-05, + "logits/chosen": -2.7433488368988037, + "logits/rejected": -2.8069894313812256, + "logps/chosen": -160.246826171875, + "logps/rejected": -171.8643798828125, + "loss": 0.728, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4797200560569763, + "rewards/margins": -0.032596245408058167, + "rewards/rejected": -0.44712376594543457, + "step": 123 + }, + { + "epoch": 0.16, + "learning_rate": 4.998521203813274e-05, + "logits/chosen": -2.738048791885376, + "logits/rejected": -2.747162342071533, + "logps/chosen": -190.9744873046875, + "logps/rejected": -181.24745178222656, + "loss": 0.755, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5226253271102905, + "rewards/margins": -0.04068867489695549, + "rewards/rejected": -0.48193663358688354, + "step": 124 + }, + { + "epoch": 0.16, + "learning_rate": 4.9983954169020256e-05, + "logits/chosen": -2.6100339889526367, + "logits/rejected": -2.634096145629883, + "logps/chosen": -176.01905822753906, + "logps/rejected": -161.22412109375, + "loss": 0.748, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5368779301643372, + "rewards/margins": -0.09010656177997589, + "rewards/rejected": -0.4467713534832001, + "step": 125 + }, + { + "epoch": 0.16, + "learning_rate": 4.9982644980723334e-05, + "logits/chosen": -2.797285556793213, + "logits/rejected": -2.825375556945801, + "logps/chosen": -141.42039489746094, + "logps/rejected": -148.95436096191406, + "loss": 0.7302, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5232774615287781, + "rewards/margins": -0.04633237421512604, + "rewards/rejected": -0.47694510221481323, + "step": 126 + }, + { + "epoch": 0.17, + "learning_rate": 4.998128447593117e-05, + "logits/chosen": -2.6646504402160645, + "logits/rejected": -2.7834081649780273, + "logps/chosen": -230.68499755859375, + "logps/rejected": -245.01414489746094, + "loss": 0.661, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.687300980091095, + "rewards/margins": 0.14017513394355774, + "rewards/rejected": -0.8274761438369751, + "step": 127 + }, + { + "epoch": 0.17, + "learning_rate": 4.997987265743834e-05, + "logits/chosen": -2.775637149810791, + "logits/rejected": -2.738879919052124, + "logps/chosen": -166.31509399414062, + "logps/rejected": -169.01321411132812, + "loss": 0.6557, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4408762454986572, + "rewards/margins": 0.11477227509021759, + "rewards/rejected": -0.5556485652923584, + "step": 128 + }, + { + "epoch": 0.17, + "learning_rate": 4.997840952814484e-05, + "logits/chosen": -2.637038469314575, + "logits/rejected": -2.666442394256592, + "logps/chosen": -151.3613739013672, + "logps/rejected": -160.8176727294922, + "loss": 0.6752, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.4970799684524536, + "rewards/margins": 0.08162279427051544, + "rewards/rejected": -0.5787028074264526, + "step": 129 + }, + { + "epoch": 0.17, + "learning_rate": 4.9976895091056075e-05, + "logits/chosen": -2.7471365928649902, + "logits/rejected": -2.6176395416259766, + "logps/chosen": -204.17808532714844, + "logps/rejected": -224.61956787109375, + "loss": 0.6923, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6104612946510315, + "rewards/margins": 0.11507527530193329, + "rewards/rejected": -0.725536584854126, + "step": 130 + }, + { + "epoch": 0.17, + "learning_rate": 4.9975329349282826e-05, + "logits/chosen": -2.7353034019470215, + "logits/rejected": -2.743035078048706, + "logps/chosen": -184.143798828125, + "logps/rejected": -197.323974609375, + "loss": 0.6711, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6166396141052246, + "rewards/margins": 0.08574585616588593, + "rewards/rejected": -0.7023855447769165, + "step": 131 + }, + { + "epoch": 0.17, + "learning_rate": 4.9973712306041256e-05, + "logits/chosen": -2.6548473834991455, + "logits/rejected": -2.6762022972106934, + "logps/chosen": -193.84849548339844, + "logps/rejected": -175.150634765625, + "loss": 0.7597, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6773683428764343, + "rewards/margins": -0.10929510742425919, + "rewards/rejected": -0.5680732131004333, + "step": 132 + }, + { + "epoch": 0.17, + "learning_rate": 4.997204396465292e-05, + "logits/chosen": -2.787050724029541, + "logits/rejected": -2.777578353881836, + "logps/chosen": -195.51785278320312, + "logps/rejected": -190.51622009277344, + "loss": 0.7073, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5759019255638123, + "rewards/margins": 0.019099120050668716, + "rewards/rejected": -0.5950011014938354, + "step": 133 + }, + { + "epoch": 0.18, + "learning_rate": 4.997032432854472e-05, + "logits/chosen": -2.6198296546936035, + "logits/rejected": -2.661146640777588, + "logps/chosen": -149.22142028808594, + "logps/rejected": -173.4526824951172, + "loss": 0.6885, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5247339010238647, + "rewards/margins": 0.05187266319990158, + "rewards/rejected": -0.5766065120697021, + "step": 134 + }, + { + "epoch": 0.18, + "learning_rate": 4.996855340124894e-05, + "logits/chosen": -2.637789487838745, + "logits/rejected": -2.6017353534698486, + "logps/chosen": -158.05914306640625, + "logps/rejected": -175.1423797607422, + "loss": 0.6985, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5851290822029114, + "rewards/margins": 0.03735842555761337, + "rewards/rejected": -0.6224875450134277, + "step": 135 + }, + { + "epoch": 0.18, + "learning_rate": 4.996673118640323e-05, + "logits/chosen": -2.37221360206604, + "logits/rejected": -2.5038788318634033, + "logps/chosen": -143.90518188476562, + "logps/rejected": -222.84115600585938, + "loss": 0.6539, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.36046820878982544, + "rewards/margins": 0.14497990906238556, + "rewards/rejected": -0.5054481029510498, + "step": 136 + }, + { + "epoch": 0.18, + "learning_rate": 4.996485768775055e-05, + "logits/chosen": -2.807823419570923, + "logits/rejected": -2.800899028778076, + "logps/chosen": -169.72129821777344, + "logps/rejected": -181.5267333984375, + "loss": 0.6754, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.543328046798706, + "rewards/margins": 0.11098619550466537, + "rewards/rejected": -0.6543142199516296, + "step": 137 + }, + { + "epoch": 0.18, + "learning_rate": 4.996293290913926e-05, + "logits/chosen": -2.795060634613037, + "logits/rejected": -2.9101216793060303, + "logps/chosen": -136.24945068359375, + "logps/rejected": -162.91119384765625, + "loss": 0.61, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3900853395462036, + "rewards/margins": 0.19977441430091858, + "rewards/rejected": -0.5898597836494446, + "step": 138 + }, + { + "epoch": 0.18, + "learning_rate": 4.9960956854522986e-05, + "logits/chosen": -2.7642905712127686, + "logits/rejected": -2.7773826122283936, + "logps/chosen": -158.06378173828125, + "logps/rejected": -221.30577087402344, + "loss": 0.625, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4831012189388275, + "rewards/margins": 0.18627440929412842, + "rewards/rejected": -0.6693755984306335, + "step": 139 + }, + { + "epoch": 0.18, + "learning_rate": 4.995892952796074e-05, + "logits/chosen": -2.7154903411865234, + "logits/rejected": -2.7624685764312744, + "logps/chosen": -187.28146362304688, + "logps/rejected": -189.83071899414062, + "loss": 0.6819, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6895617842674255, + "rewards/margins": 0.06184637174010277, + "rewards/rejected": -0.7514082193374634, + "step": 140 + }, + { + "epoch": 0.18, + "learning_rate": 4.995685093361682e-05, + "logits/chosen": -2.7003986835479736, + "logits/rejected": -2.754859209060669, + "logps/chosen": -160.55992126464844, + "logps/rejected": -172.67434692382812, + "loss": 0.725, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6082602739334106, + "rewards/margins": -0.01646682247519493, + "rewards/rejected": -0.5917934775352478, + "step": 141 + }, + { + "epoch": 0.19, + "learning_rate": 4.9954721075760824e-05, + "logits/chosen": -2.7585508823394775, + "logits/rejected": -2.7562849521636963, + "logps/chosen": -186.20509338378906, + "logps/rejected": -190.92132568359375, + "loss": 0.6594, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5081749558448792, + "rewards/margins": 0.10397283732891083, + "rewards/rejected": -0.6121478080749512, + "step": 142 + }, + { + "epoch": 0.19, + "learning_rate": 4.995253995876767e-05, + "logits/chosen": -2.808187246322632, + "logits/rejected": -2.869479179382324, + "logps/chosen": -172.86203002929688, + "logps/rejected": -175.89739990234375, + "loss": 0.5562, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.31780511140823364, + "rewards/margins": 0.3512324392795563, + "rewards/rejected": -0.6690375804901123, + "step": 143 + }, + { + "epoch": 0.19, + "learning_rate": 4.995030758711756e-05, + "logits/chosen": -2.9907169342041016, + "logits/rejected": -2.968203544616699, + "logps/chosen": -191.64285278320312, + "logps/rejected": -177.73028564453125, + "loss": 0.7513, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7068686485290527, + "rewards/margins": -0.05182289704680443, + "rewards/rejected": -0.6550456881523132, + "step": 144 + }, + { + "epoch": 0.19, + "learning_rate": 4.994802396539598e-05, + "logits/chosen": -2.8123016357421875, + "logits/rejected": -2.8668291568756104, + "logps/chosen": -172.08924865722656, + "logps/rejected": -195.8844451904297, + "loss": 0.696, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5609222054481506, + "rewards/margins": 0.05493137985467911, + "rewards/rejected": -0.6158535480499268, + "step": 145 + }, + { + "epoch": 0.19, + "learning_rate": 4.994568909829368e-05, + "logits/chosen": -2.892430305480957, + "logits/rejected": -2.762629985809326, + "logps/chosen": -216.95150756835938, + "logps/rejected": -187.0184783935547, + "loss": 0.7119, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7327225208282471, + "rewards/margins": -0.021204425022006035, + "rewards/rejected": -0.7115181684494019, + "step": 146 + }, + { + "epoch": 0.19, + "learning_rate": 4.9943302990606684e-05, + "logits/chosen": -2.7017452716827393, + "logits/rejected": -2.7307963371276855, + "logps/chosen": -198.5173797607422, + "logps/rejected": -185.40316772460938, + "loss": 0.6751, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6512372493743896, + "rewards/margins": 0.10363547503948212, + "rewards/rejected": -0.754872739315033, + "step": 147 + }, + { + "epoch": 0.19, + "learning_rate": 4.994086564723626e-05, + "logits/chosen": -2.835409641265869, + "logits/rejected": -2.8388915061950684, + "logps/chosen": -173.46127319335938, + "logps/rejected": -185.3079376220703, + "loss": 0.6937, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6533925533294678, + "rewards/margins": 0.033594585955142975, + "rewards/rejected": -0.6869871616363525, + "step": 148 + }, + { + "epoch": 0.2, + "learning_rate": 4.9938377073188905e-05, + "logits/chosen": -2.9569547176361084, + "logits/rejected": -2.9091203212738037, + "logps/chosen": -201.7881317138672, + "logps/rejected": -180.14285278320312, + "loss": 0.7056, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7249323725700378, + "rewards/margins": 0.014494583010673523, + "rewards/rejected": -0.7394269704818726, + "step": 149 + }, + { + "epoch": 0.2, + "learning_rate": 4.993583727357638e-05, + "logits/chosen": -2.6668853759765625, + "logits/rejected": -2.694221258163452, + "logps/chosen": -198.40594482421875, + "logps/rejected": -201.02980041503906, + "loss": 0.7579, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.652782678604126, + "rewards/margins": -0.08612270653247833, + "rewards/rejected": -0.5666600465774536, + "step": 150 + }, + { + "epoch": 0.2, + "learning_rate": 4.993324625361565e-05, + "logits/chosen": -2.757725954055786, + "logits/rejected": -2.7627735137939453, + "logps/chosen": -200.16226196289062, + "logps/rejected": -188.62083435058594, + "loss": 0.6402, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.44451814889907837, + "rewards/margins": 0.1431855410337448, + "rewards/rejected": -0.5877037048339844, + "step": 151 + }, + { + "epoch": 0.2, + "learning_rate": 4.993060401862888e-05, + "logits/chosen": -2.7355546951293945, + "logits/rejected": -2.7701869010925293, + "logps/chosen": -170.37046813964844, + "logps/rejected": -182.28440856933594, + "loss": 0.6787, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5462682247161865, + "rewards/margins": 0.08645598590373993, + "rewards/rejected": -0.6327242255210876, + "step": 152 + }, + { + "epoch": 0.2, + "learning_rate": 4.9927910574043465e-05, + "logits/chosen": -2.893017530441284, + "logits/rejected": -2.9069924354553223, + "logps/chosen": -215.6539764404297, + "logps/rejected": -256.93194580078125, + "loss": 0.642, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.635912299156189, + "rewards/margins": 0.1641692817211151, + "rewards/rejected": -0.8000816702842712, + "step": 153 + }, + { + "epoch": 0.2, + "learning_rate": 4.992516592539196e-05, + "logits/chosen": -2.544395685195923, + "logits/rejected": -2.563568353652954, + "logps/chosen": -144.60275268554688, + "logps/rejected": -150.6583709716797, + "loss": 0.6613, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.40625688433647156, + "rewards/margins": 0.11070521920919418, + "rewards/rejected": -0.5169621706008911, + "step": 154 + }, + { + "epoch": 0.2, + "learning_rate": 4.9922370078312105e-05, + "logits/chosen": -2.6519908905029297, + "logits/rejected": -2.6390066146850586, + "logps/chosen": -187.1649932861328, + "logps/rejected": -163.97708129882812, + "loss": 0.6462, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4761255383491516, + "rewards/margins": 0.15067748725414276, + "rewards/rejected": -0.6268030405044556, + "step": 155 + }, + { + "epoch": 0.2, + "learning_rate": 4.991952303854682e-05, + "logits/chosen": -2.823216676712036, + "logits/rejected": -2.8248276710510254, + "logps/chosen": -171.64088439941406, + "logps/rejected": -205.40994262695312, + "loss": 0.5695, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.49252718687057495, + "rewards/margins": 0.3139771521091461, + "rewards/rejected": -0.8065043687820435, + "step": 156 + }, + { + "epoch": 0.21, + "learning_rate": 4.9916624811944175e-05, + "logits/chosen": -2.6669604778289795, + "logits/rejected": -2.720827102661133, + "logps/chosen": -145.63650512695312, + "logps/rejected": -143.77822875976562, + "loss": 0.6691, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4779016673564911, + "rewards/margins": 0.07923712581396103, + "rewards/rejected": -0.5571387410163879, + "step": 157 + }, + { + "epoch": 0.21, + "learning_rate": 4.991367540445735e-05, + "logits/chosen": -2.780358076095581, + "logits/rejected": -2.7683119773864746, + "logps/chosen": -163.51113891601562, + "logps/rejected": -147.0047149658203, + "loss": 0.7295, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6608467698097229, + "rewards/margins": -0.023487316444516182, + "rewards/rejected": -0.6373594403266907, + "step": 158 + }, + { + "epoch": 0.21, + "learning_rate": 4.991067482214471e-05, + "logits/chosen": -2.660963296890259, + "logits/rejected": -2.6401174068450928, + "logps/chosen": -173.09176635742188, + "logps/rejected": -171.8109130859375, + "loss": 0.7108, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6182070970535278, + "rewards/margins": -0.01903488114476204, + "rewards/rejected": -0.5991722345352173, + "step": 159 + }, + { + "epoch": 0.21, + "learning_rate": 4.9907623071169686e-05, + "logits/chosen": -2.7048563957214355, + "logits/rejected": -2.5354063510894775, + "logps/chosen": -228.5687255859375, + "logps/rejected": -184.3094482421875, + "loss": 0.7866, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.7742022275924683, + "rewards/margins": -0.1047876849770546, + "rewards/rejected": -0.6694144606590271, + "step": 160 + }, + { + "epoch": 0.21, + "learning_rate": 4.990452015780085e-05, + "logits/chosen": -2.678699016571045, + "logits/rejected": -2.6732139587402344, + "logps/chosen": -217.17123413085938, + "logps/rejected": -211.92489624023438, + "loss": 0.8004, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7689281702041626, + "rewards/margins": -0.17873500287532806, + "rewards/rejected": -0.5901932120323181, + "step": 161 + }, + { + "epoch": 0.21, + "learning_rate": 4.9901366088411846e-05, + "logits/chosen": -2.650327444076538, + "logits/rejected": -2.6380615234375, + "logps/chosen": -160.45660400390625, + "logps/rejected": -149.87843322753906, + "loss": 0.7334, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4278574585914612, + "rewards/margins": -0.050989780575037, + "rewards/rejected": -0.3768676817417145, + "step": 162 + }, + { + "epoch": 0.21, + "learning_rate": 4.98981608694814e-05, + "logits/chosen": -2.636261463165283, + "logits/rejected": -2.6120524406433105, + "logps/chosen": -189.56497192382812, + "logps/rejected": -184.8382568359375, + "loss": 0.7106, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6862293481826782, + "rewards/margins": 0.02107788249850273, + "rewards/rejected": -0.7073072791099548, + "step": 163 + }, + { + "epoch": 0.21, + "learning_rate": 4.9894904507593316e-05, + "logits/chosen": -2.681283712387085, + "logits/rejected": -2.6194002628326416, + "logps/chosen": -158.73968505859375, + "logps/rejected": -180.73330688476562, + "loss": 0.6498, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4341495633125305, + "rewards/margins": 0.13567671179771423, + "rewards/rejected": -0.5698262453079224, + "step": 164 + }, + { + "epoch": 0.22, + "learning_rate": 4.989159700943643e-05, + "logits/chosen": -2.822274923324585, + "logits/rejected": -2.8219101428985596, + "logps/chosen": -182.0315704345703, + "logps/rejected": -193.2274169921875, + "loss": 0.7126, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6531980037689209, + "rewards/margins": 0.01201358437538147, + "rewards/rejected": -0.6652116179466248, + "step": 165 + }, + { + "epoch": 0.22, + "learning_rate": 4.988823838180464e-05, + "logits/chosen": -2.804276943206787, + "logits/rejected": -2.8529317378997803, + "logps/chosen": -183.54083251953125, + "logps/rejected": -199.4310302734375, + "loss": 0.6961, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5883633494377136, + "rewards/margins": 0.03549729287624359, + "rewards/rejected": -0.6238605976104736, + "step": 166 + }, + { + "epoch": 0.22, + "learning_rate": 4.988482863159684e-05, + "logits/chosen": -2.7629952430725098, + "logits/rejected": -2.852982759475708, + "logps/chosen": -223.6051483154297, + "logps/rejected": -215.63427734375, + "loss": 0.605, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5412927865982056, + "rewards/margins": 0.22360366582870483, + "rewards/rejected": -0.7648964524269104, + "step": 167 + }, + { + "epoch": 0.22, + "learning_rate": 4.988136776581696e-05, + "logits/chosen": -2.696824789047241, + "logits/rejected": -2.7131996154785156, + "logps/chosen": -161.2986297607422, + "logps/rejected": -180.24172973632812, + "loss": 0.6756, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.47621679306030273, + "rewards/margins": 0.07477270066738129, + "rewards/rejected": -0.5509894490242004, + "step": 168 + }, + { + "epoch": 0.22, + "learning_rate": 4.9877855791573915e-05, + "logits/chosen": -2.5992307662963867, + "logits/rejected": -2.5558767318725586, + "logps/chosen": -177.31790161132812, + "logps/rejected": -173.00013732910156, + "loss": 0.7614, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6591533422470093, + "rewards/margins": -0.10336636006832123, + "rewards/rejected": -0.5557870268821716, + "step": 169 + }, + { + "epoch": 0.22, + "learning_rate": 4.9874292716081595e-05, + "logits/chosen": -2.480238914489746, + "logits/rejected": -2.531926155090332, + "logps/chosen": -173.81201171875, + "logps/rejected": -177.76727294921875, + "loss": 0.6506, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4206813871860504, + "rewards/margins": 0.10889497399330139, + "rewards/rejected": -0.5295763611793518, + "step": 170 + }, + { + "epoch": 0.22, + "learning_rate": 4.9870678546658865e-05, + "logits/chosen": -2.68393611907959, + "logits/rejected": -2.8312528133392334, + "logps/chosen": -238.86911010742188, + "logps/rejected": -268.19732666015625, + "loss": 0.6572, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5521610975265503, + "rewards/margins": 0.11217445880174637, + "rewards/rejected": -0.6643356084823608, + "step": 171 + }, + { + "epoch": 0.23, + "learning_rate": 4.9867013290729535e-05, + "logits/chosen": -2.580007314682007, + "logits/rejected": -2.5705273151397705, + "logps/chosen": -165.80308532714844, + "logps/rejected": -203.9613800048828, + "loss": 0.7206, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6492197513580322, + "rewards/margins": 0.02221706137061119, + "rewards/rejected": -0.6714367866516113, + "step": 172 + }, + { + "epoch": 0.23, + "learning_rate": 4.986329695582237e-05, + "logits/chosen": -2.7853593826293945, + "logits/rejected": -2.7307794094085693, + "logps/chosen": -211.93991088867188, + "logps/rejected": -200.86334228515625, + "loss": 0.7051, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5223960876464844, + "rewards/margins": 0.0009156223386526108, + "rewards/rejected": -0.5233116745948792, + "step": 173 + }, + { + "epoch": 0.23, + "learning_rate": 4.985952954957103e-05, + "logits/chosen": -2.6804401874542236, + "logits/rejected": -2.6449456214904785, + "logps/chosen": -187.6370391845703, + "logps/rejected": -193.33245849609375, + "loss": 0.6809, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.513522207736969, + "rewards/margins": 0.04058818519115448, + "rewards/rejected": -0.5541103482246399, + "step": 174 + }, + { + "epoch": 0.23, + "learning_rate": 4.985571107971408e-05, + "logits/chosen": -2.622426986694336, + "logits/rejected": -2.618734836578369, + "logps/chosen": -152.2515869140625, + "logps/rejected": -168.34747314453125, + "loss": 0.7282, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5195332765579224, + "rewards/margins": -0.02337510883808136, + "rewards/rejected": -0.4961581528186798, + "step": 175 + }, + { + "epoch": 0.23, + "learning_rate": 4.9851841554095e-05, + "logits/chosen": -2.6712746620178223, + "logits/rejected": -2.6609811782836914, + "logps/chosen": -198.9906005859375, + "logps/rejected": -164.44154357910156, + "loss": 0.7105, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4613041877746582, + "rewards/margins": -0.0029089637100696564, + "rewards/rejected": -0.45839521288871765, + "step": 176 + }, + { + "epoch": 0.23, + "learning_rate": 4.9847920980662134e-05, + "logits/chosen": -2.6356289386749268, + "logits/rejected": -2.6573214530944824, + "logps/chosen": -175.52487182617188, + "logps/rejected": -187.29832458496094, + "loss": 0.6474, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.40463200211524963, + "rewards/margins": 0.12262441217899323, + "rewards/rejected": -0.527256429195404, + "step": 177 + }, + { + "epoch": 0.23, + "learning_rate": 4.984394936746865e-05, + "logits/chosen": -2.357494354248047, + "logits/rejected": -2.411952018737793, + "logps/chosen": -139.59608459472656, + "logps/rejected": -156.5337371826172, + "loss": 0.679, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4118153154850006, + "rewards/margins": 0.04770222678780556, + "rewards/rejected": -0.45951756834983826, + "step": 178 + }, + { + "epoch": 0.23, + "learning_rate": 4.98399267226726e-05, + "logits/chosen": -2.5636932849884033, + "logits/rejected": -2.6556129455566406, + "logps/chosen": -175.670166015625, + "logps/rejected": -179.19627380371094, + "loss": 0.6572, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6697508692741394, + "rewards/margins": 0.1040661484003067, + "rewards/rejected": -0.7738169431686401, + "step": 179 + }, + { + "epoch": 0.24, + "learning_rate": 4.9835853054536846e-05, + "logits/chosen": -2.5892560482025146, + "logits/rejected": -2.579235315322876, + "logps/chosen": -168.14564514160156, + "logps/rejected": -163.5547332763672, + "loss": 0.624, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5034769177436829, + "rewards/margins": 0.1633673459291458, + "rewards/rejected": -0.6668442487716675, + "step": 180 + }, + { + "epoch": 0.24, + "learning_rate": 4.9831728371429046e-05, + "logits/chosen": -2.5526325702667236, + "logits/rejected": -2.602790355682373, + "logps/chosen": -167.693115234375, + "logps/rejected": -191.10301208496094, + "loss": 0.6836, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5507184267044067, + "rewards/margins": 0.07049673795700073, + "rewards/rejected": -0.6212151646614075, + "step": 181 + }, + { + "epoch": 0.24, + "learning_rate": 4.982755268182164e-05, + "logits/chosen": -2.581120729446411, + "logits/rejected": -2.61881947517395, + "logps/chosen": -176.85264587402344, + "logps/rejected": -202.72483825683594, + "loss": 0.6354, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6271121501922607, + "rewards/margins": 0.17561408877372742, + "rewards/rejected": -0.8027262687683105, + "step": 182 + }, + { + "epoch": 0.24, + "learning_rate": 4.982332599429187e-05, + "logits/chosen": -2.5083706378936768, + "logits/rejected": -2.5868587493896484, + "logps/chosen": -145.9921112060547, + "logps/rejected": -150.42713928222656, + "loss": 0.6945, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.45637643337249756, + "rewards/margins": 0.06362758576869965, + "rewards/rejected": -0.5200040340423584, + "step": 183 + }, + { + "epoch": 0.24, + "learning_rate": 4.981904831752171e-05, + "logits/chosen": -2.563215970993042, + "logits/rejected": -2.6187920570373535, + "logps/chosen": -147.48265075683594, + "logps/rejected": -155.67405700683594, + "loss": 0.7287, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5919477939605713, + "rewards/margins": -0.029735613614320755, + "rewards/rejected": -0.5622121691703796, + "step": 184 + }, + { + "epoch": 0.24, + "learning_rate": 4.981471966029787e-05, + "logits/chosen": -2.447539806365967, + "logits/rejected": -2.4295990467071533, + "logps/chosen": -153.93881225585938, + "logps/rejected": -169.62408447265625, + "loss": 0.6502, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6545721292495728, + "rewards/margins": 0.11100000143051147, + "rewards/rejected": -0.7655720710754395, + "step": 185 + }, + { + "epoch": 0.24, + "learning_rate": 4.981034003151178e-05, + "logits/chosen": -2.4045794010162354, + "logits/rejected": -2.446890354156494, + "logps/chosen": -134.2223358154297, + "logps/rejected": -149.72105407714844, + "loss": 0.6335, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.46206632256507874, + "rewards/margins": 0.19417575001716614, + "rewards/rejected": -0.6562421321868896, + "step": 186 + }, + { + "epoch": 0.24, + "learning_rate": 4.980590944015958e-05, + "logits/chosen": -2.68265700340271, + "logits/rejected": -2.667114496231079, + "logps/chosen": -167.9902801513672, + "logps/rejected": -171.3512420654297, + "loss": 0.6734, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5563911199569702, + "rewards/margins": 0.08469439297914505, + "rewards/rejected": -0.6410855054855347, + "step": 187 + }, + { + "epoch": 0.25, + "learning_rate": 4.98014278953421e-05, + "logits/chosen": -2.5728375911712646, + "logits/rejected": -2.681403875350952, + "logps/chosen": -159.7633056640625, + "logps/rejected": -211.886962890625, + "loss": 0.6439, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.531548023223877, + "rewards/margins": 0.1921529322862625, + "rewards/rejected": -0.7237009406089783, + "step": 188 + }, + { + "epoch": 0.25, + "learning_rate": 4.979689540626479e-05, + "logits/chosen": -2.324286937713623, + "logits/rejected": -2.453277349472046, + "logps/chosen": -168.53738403320312, + "logps/rejected": -180.93719482421875, + "loss": 0.6538, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.4037819802761078, + "rewards/margins": 0.17486746609210968, + "rewards/rejected": -0.5786494016647339, + "step": 189 + }, + { + "epoch": 0.25, + "learning_rate": 4.9792311982237774e-05, + "logits/chosen": -2.773432493209839, + "logits/rejected": -2.74585223197937, + "logps/chosen": -157.39044189453125, + "logps/rejected": -167.32725524902344, + "loss": 0.6255, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5002456307411194, + "rewards/margins": 0.22034718096256256, + "rewards/rejected": -0.7205928564071655, + "step": 190 + }, + { + "epoch": 0.25, + "learning_rate": 4.9787677632675825e-05, + "logits/chosen": -2.6729888916015625, + "logits/rejected": -2.7148032188415527, + "logps/chosen": -162.77774047851562, + "logps/rejected": -221.85386657714844, + "loss": 0.6881, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6836182475090027, + "rewards/margins": 0.14588770270347595, + "rewards/rejected": -0.829505980014801, + "step": 191 + }, + { + "epoch": 0.25, + "learning_rate": 4.978299236709826e-05, + "logits/chosen": -2.556713581085205, + "logits/rejected": -2.5743775367736816, + "logps/chosen": -197.9322052001953, + "logps/rejected": -203.74917602539062, + "loss": 0.6874, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7575306296348572, + "rewards/margins": 0.05150505527853966, + "rewards/rejected": -0.80903559923172, + "step": 192 + }, + { + "epoch": 0.25, + "learning_rate": 4.977825619512904e-05, + "logits/chosen": -2.390803813934326, + "logits/rejected": -2.551340341567993, + "logps/chosen": -152.84097290039062, + "logps/rejected": -192.7227325439453, + "loss": 0.7108, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5909304618835449, + "rewards/margins": -0.009836459532380104, + "rewards/rejected": -0.581093966960907, + "step": 193 + }, + { + "epoch": 0.25, + "learning_rate": 4.977346912649666e-05, + "logits/chosen": -2.451486587524414, + "logits/rejected": -2.423938751220703, + "logps/chosen": -208.7131805419922, + "logps/rejected": -195.0940399169922, + "loss": 0.7075, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5515446066856384, + "rewards/margins": 0.014907769858837128, + "rewards/rejected": -0.5664523839950562, + "step": 194 + }, + { + "epoch": 0.26, + "learning_rate": 4.9768631171034175e-05, + "logits/chosen": -2.4102437496185303, + "logits/rejected": -2.524508237838745, + "logps/chosen": -171.2530975341797, + "logps/rejected": -197.77008056640625, + "loss": 0.6437, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6411187648773193, + "rewards/margins": 0.20387138426303864, + "rewards/rejected": -0.8449901342391968, + "step": 195 + }, + { + "epoch": 0.26, + "learning_rate": 4.9763742338679145e-05, + "logits/chosen": -2.6650915145874023, + "logits/rejected": -2.5682055950164795, + "logps/chosen": -280.52142333984375, + "logps/rejected": -250.82424926757812, + "loss": 0.6956, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7708845138549805, + "rewards/margins": 0.03139163926243782, + "rewards/rejected": -0.8022761344909668, + "step": 196 + }, + { + "epoch": 0.26, + "learning_rate": 4.975880263947367e-05, + "logits/chosen": -2.66872239112854, + "logits/rejected": -2.6272640228271484, + "logps/chosen": -206.67318725585938, + "logps/rejected": -169.04757690429688, + "loss": 0.725, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7810104489326477, + "rewards/margins": -0.02610369399189949, + "rewards/rejected": -0.7549068331718445, + "step": 197 + }, + { + "epoch": 0.26, + "learning_rate": 4.9753812083564304e-05, + "logits/chosen": -2.4464945793151855, + "logits/rejected": -2.4812588691711426, + "logps/chosen": -153.33660888671875, + "logps/rejected": -176.61399841308594, + "loss": 0.6034, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.44514474272727966, + "rewards/margins": 0.24550259113311768, + "rewards/rejected": -0.6906473636627197, + "step": 198 + }, + { + "epoch": 0.26, + "learning_rate": 4.974877068120208e-05, + "logits/chosen": -2.635669231414795, + "logits/rejected": -2.648510456085205, + "logps/chosen": -182.48388671875, + "logps/rejected": -196.19873046875, + "loss": 0.6649, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6389026641845703, + "rewards/margins": 0.10669447481632233, + "rewards/rejected": -0.7455971240997314, + "step": 199 + }, + { + "epoch": 0.26, + "learning_rate": 4.974367844274248e-05, + "logits/chosen": -2.5759198665618896, + "logits/rejected": -2.723337173461914, + "logps/chosen": -179.0137939453125, + "logps/rejected": -255.97052001953125, + "loss": 0.6338, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4965980052947998, + "rewards/margins": 0.18533286452293396, + "rewards/rejected": -0.6819308996200562, + "step": 200 + }, + { + "epoch": 0.26, + "learning_rate": 4.973853537864538e-05, + "logits/chosen": -2.7438008785247803, + "logits/rejected": -2.8121700286865234, + "logps/chosen": -160.9208221435547, + "logps/rejected": -169.40225219726562, + "loss": 0.6689, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6913070678710938, + "rewards/margins": 0.10465458035469055, + "rewards/rejected": -0.7959617376327515, + "step": 201 + }, + { + "epoch": 0.26, + "learning_rate": 4.973334149947508e-05, + "logits/chosen": -2.70800518989563, + "logits/rejected": -2.6374926567077637, + "logps/chosen": -183.4274444580078, + "logps/rejected": -180.7521209716797, + "loss": 0.6144, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7033702731132507, + "rewards/margins": 0.2224593162536621, + "rewards/rejected": -0.9258295893669128, + "step": 202 + }, + { + "epoch": 0.27, + "learning_rate": 4.972809681590026e-05, + "logits/chosen": -2.66047739982605, + "logits/rejected": -2.710866928100586, + "logps/chosen": -194.5672607421875, + "logps/rejected": -210.85208129882812, + "loss": 0.655, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8557997345924377, + "rewards/margins": 0.16004985570907593, + "rewards/rejected": -1.0158495903015137, + "step": 203 + }, + { + "epoch": 0.27, + "learning_rate": 4.972280133869396e-05, + "logits/chosen": -2.433838129043579, + "logits/rejected": -2.564758539199829, + "logps/chosen": -171.2584686279297, + "logps/rejected": -213.25494384765625, + "loss": 0.5993, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.43924885988235474, + "rewards/margins": 0.2442169040441513, + "rewards/rejected": -0.6834657788276672, + "step": 204 + }, + { + "epoch": 0.27, + "learning_rate": 4.971745507873352e-05, + "logits/chosen": -2.681500196456909, + "logits/rejected": -2.6873316764831543, + "logps/chosen": -150.2528839111328, + "logps/rejected": -154.3059844970703, + "loss": 0.627, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7566659450531006, + "rewards/margins": 0.23124736547470093, + "rewards/rejected": -0.9879133701324463, + "step": 205 + }, + { + "epoch": 0.27, + "learning_rate": 4.971205804700063e-05, + "logits/chosen": -2.464470624923706, + "logits/rejected": -2.3958442211151123, + "logps/chosen": -293.574462890625, + "logps/rejected": -252.32489013671875, + "loss": 0.6749, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5428895354270935, + "rewards/margins": 0.14122185111045837, + "rewards/rejected": -0.6841113567352295, + "step": 206 + }, + { + "epoch": 0.27, + "learning_rate": 4.970661025458125e-05, + "logits/chosen": -2.5775954723358154, + "logits/rejected": -2.6041054725646973, + "logps/chosen": -170.66627502441406, + "logps/rejected": -163.29644775390625, + "loss": 0.721, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9231572151184082, + "rewards/margins": 0.11258751899003983, + "rewards/rejected": -1.0357446670532227, + "step": 207 + }, + { + "epoch": 0.27, + "learning_rate": 4.9701111712665625e-05, + "logits/chosen": -2.6646294593811035, + "logits/rejected": -2.7594456672668457, + "logps/chosen": -200.36981201171875, + "logps/rejected": -186.59059143066406, + "loss": 0.7514, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.3253411054611206, + "rewards/margins": -0.0334821492433548, + "rewards/rejected": -1.2918590307235718, + "step": 208 + }, + { + "epoch": 0.27, + "learning_rate": 4.969556243254822e-05, + "logits/chosen": -2.5144646167755127, + "logits/rejected": -2.594902992248535, + "logps/chosen": -147.14231872558594, + "logps/rejected": -176.36328125, + "loss": 0.6548, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6609407067298889, + "rewards/margins": 0.12413067370653152, + "rewards/rejected": -0.7850713729858398, + "step": 209 + }, + { + "epoch": 0.27, + "learning_rate": 4.968996242562774e-05, + "logits/chosen": -2.6077287197113037, + "logits/rejected": -2.6607818603515625, + "logps/chosen": -199.4670867919922, + "logps/rejected": -201.5868377685547, + "loss": 0.6797, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8754231333732605, + "rewards/margins": 0.13014619052410126, + "rewards/rejected": -1.005569338798523, + "step": 210 + }, + { + "epoch": 0.28, + "learning_rate": 4.968431170340706e-05, + "logits/chosen": -2.740494966506958, + "logits/rejected": -2.620009660720825, + "logps/chosen": -210.96929931640625, + "logps/rejected": -204.82090759277344, + "loss": 0.7721, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2561805248260498, + "rewards/margins": -0.05324437841773033, + "rewards/rejected": -1.2029361724853516, + "step": 211 + }, + { + "epoch": 0.28, + "learning_rate": 4.9678610277493275e-05, + "logits/chosen": -2.6105682849884033, + "logits/rejected": -2.5892579555511475, + "logps/chosen": -198.35635375976562, + "logps/rejected": -207.89016723632812, + "loss": 0.6961, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.989006757736206, + "rewards/margins": 0.060812097042798996, + "rewards/rejected": -1.0498188734054565, + "step": 212 + }, + { + "epoch": 0.28, + "learning_rate": 4.967285815959759e-05, + "logits/chosen": -2.714409589767456, + "logits/rejected": -2.7895750999450684, + "logps/chosen": -208.87754821777344, + "logps/rejected": -222.0183563232422, + "loss": 0.57, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8426048159599304, + "rewards/margins": 0.3045133650302887, + "rewards/rejected": -1.1471182107925415, + "step": 213 + }, + { + "epoch": 0.28, + "learning_rate": 4.9667055361535354e-05, + "logits/chosen": -2.748204231262207, + "logits/rejected": -2.832871675491333, + "logps/chosen": -201.07078552246094, + "logps/rejected": -212.4540252685547, + "loss": 0.6954, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2435117959976196, + "rewards/margins": 0.14082738757133484, + "rewards/rejected": -1.384339451789856, + "step": 214 + }, + { + "epoch": 0.28, + "learning_rate": 4.9661201895226e-05, + "logits/chosen": -2.7127251625061035, + "logits/rejected": -2.751798629760742, + "logps/chosen": -220.7108154296875, + "logps/rejected": -195.08290100097656, + "loss": 0.6729, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7872539758682251, + "rewards/margins": 0.12220478057861328, + "rewards/rejected": -0.9094586968421936, + "step": 215 + }, + { + "epoch": 0.28, + "learning_rate": 4.965529777269306e-05, + "logits/chosen": -2.6204776763916016, + "logits/rejected": -2.664301872253418, + "logps/chosen": -166.66172790527344, + "logps/rejected": -205.79847717285156, + "loss": 0.7922, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9887948036193848, + "rewards/margins": -0.06140782684087753, + "rewards/rejected": -0.927386999130249, + "step": 216 + }, + { + "epoch": 0.28, + "learning_rate": 4.964934300606411e-05, + "logits/chosen": -2.48382830619812, + "logits/rejected": -2.493025541305542, + "logps/chosen": -169.9669189453125, + "logps/rejected": -181.6856689453125, + "loss": 0.7197, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6650858521461487, + "rewards/margins": 0.017848990857601166, + "rewards/rejected": -0.6829348802566528, + "step": 217 + }, + { + "epoch": 0.29, + "learning_rate": 4.964333760757074e-05, + "logits/chosen": -2.648463726043701, + "logits/rejected": -2.6576759815216064, + "logps/chosen": -178.34747314453125, + "logps/rejected": -188.2498779296875, + "loss": 0.6239, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.95872962474823, + "rewards/margins": 0.26567110419273376, + "rewards/rejected": -1.2244007587432861, + "step": 218 + }, + { + "epoch": 0.29, + "learning_rate": 4.963728158954856e-05, + "logits/chosen": -2.9130921363830566, + "logits/rejected": -2.894216299057007, + "logps/chosen": -221.04931640625, + "logps/rejected": -237.8624725341797, + "loss": 0.6594, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.084302306175232, + "rewards/margins": 0.13000546395778656, + "rewards/rejected": -1.2143077850341797, + "step": 219 + }, + { + "epoch": 0.29, + "learning_rate": 4.963117496443715e-05, + "logits/chosen": -2.68157958984375, + "logits/rejected": -2.8279314041137695, + "logps/chosen": -165.5657196044922, + "logps/rejected": -202.75865173339844, + "loss": 0.6575, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7596678733825684, + "rewards/margins": 0.2009599506855011, + "rewards/rejected": -0.9606277942657471, + "step": 220 + }, + { + "epoch": 0.29, + "learning_rate": 4.9625017744780045e-05, + "logits/chosen": -2.614312171936035, + "logits/rejected": -2.6585357189178467, + "logps/chosen": -216.45230102539062, + "logps/rejected": -195.06643676757812, + "loss": 0.7323, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9276168346405029, + "rewards/margins": -0.02379007264971733, + "rewards/rejected": -0.9038268327713013, + "step": 221 + }, + { + "epoch": 0.29, + "learning_rate": 4.96188099432247e-05, + "logits/chosen": -2.7345945835113525, + "logits/rejected": -2.702479362487793, + "logps/chosen": -227.8175048828125, + "logps/rejected": -238.62513732910156, + "loss": 0.6852, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9885910749435425, + "rewards/margins": 0.07757923752069473, + "rewards/rejected": -1.0661702156066895, + "step": 222 + }, + { + "epoch": 0.29, + "learning_rate": 4.9612551572522464e-05, + "logits/chosen": -2.6463451385498047, + "logits/rejected": -2.6843183040618896, + "logps/chosen": -150.67254638671875, + "logps/rejected": -155.01316833496094, + "loss": 0.6833, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0718083381652832, + "rewards/margins": 0.0895613357424736, + "rewards/rejected": -1.1613696813583374, + "step": 223 + }, + { + "epoch": 0.29, + "learning_rate": 4.960624264552858e-05, + "logits/chosen": -2.6091978549957275, + "logits/rejected": -2.6224098205566406, + "logps/chosen": -134.08544921875, + "logps/rejected": -145.00137329101562, + "loss": 0.617, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6298830509185791, + "rewards/margins": 0.1894720196723938, + "rewards/rejected": -0.8193551301956177, + "step": 224 + }, + { + "epoch": 0.29, + "learning_rate": 4.9599883175202124e-05, + "logits/chosen": -2.689610004425049, + "logits/rejected": -2.641515016555786, + "logps/chosen": -175.27386474609375, + "logps/rejected": -176.47059631347656, + "loss": 0.6933, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8815383911132812, + "rewards/margins": 0.05579657852649689, + "rewards/rejected": -0.9373350143432617, + "step": 225 + }, + { + "epoch": 0.3, + "learning_rate": 4.9593473174605974e-05, + "logits/chosen": -2.673809051513672, + "logits/rejected": -2.6921756267547607, + "logps/chosen": -210.48333740234375, + "logps/rejected": -222.2653045654297, + "loss": 0.7605, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.9074676036834717, + "rewards/margins": -0.08400504291057587, + "rewards/rejected": -0.8234625458717346, + "step": 226 + }, + { + "epoch": 0.3, + "learning_rate": 4.958701265690685e-05, + "logits/chosen": -2.608705759048462, + "logits/rejected": -2.622281074523926, + "logps/chosen": -194.5504150390625, + "logps/rejected": -197.58035278320312, + "loss": 0.7456, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.1094239950180054, + "rewards/margins": -0.014511600136756897, + "rewards/rejected": -1.0949124097824097, + "step": 227 + }, + { + "epoch": 0.3, + "learning_rate": 4.958050163537519e-05, + "logits/chosen": -2.598935127258301, + "logits/rejected": -2.648134231567383, + "logps/chosen": -135.84756469726562, + "logps/rejected": -160.54318237304688, + "loss": 0.7163, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7799862623214722, + "rewards/margins": 0.07278753817081451, + "rewards/rejected": -0.8527737855911255, + "step": 228 + }, + { + "epoch": 0.3, + "learning_rate": 4.957394012338519e-05, + "logits/chosen": -2.5633938312530518, + "logits/rejected": -2.528005838394165, + "logps/chosen": -179.5242919921875, + "logps/rejected": -175.9528350830078, + "loss": 0.6776, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6330910325050354, + "rewards/margins": 0.09188088774681091, + "rewards/rejected": -0.7249718904495239, + "step": 229 + }, + { + "epoch": 0.3, + "learning_rate": 4.956732813441477e-05, + "logits/chosen": -2.681288957595825, + "logits/rejected": -2.742069959640503, + "logps/chosen": -157.13189697265625, + "logps/rejected": -170.25006103515625, + "loss": 0.6152, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7350885272026062, + "rewards/margins": 0.18710875511169434, + "rewards/rejected": -0.9221972227096558, + "step": 230 + }, + { + "epoch": 0.3, + "learning_rate": 4.956066568204552e-05, + "logits/chosen": -2.6132476329803467, + "logits/rejected": -2.6484899520874023, + "logps/chosen": -175.98236083984375, + "logps/rejected": -185.96463012695312, + "loss": 0.5907, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6415359973907471, + "rewards/margins": 0.3225414752960205, + "rewards/rejected": -0.9640775322914124, + "step": 231 + }, + { + "epoch": 0.3, + "learning_rate": 4.955395277996268e-05, + "logits/chosen": -2.654163122177124, + "logits/rejected": -2.6380820274353027, + "logps/chosen": -191.48721313476562, + "logps/rejected": -160.77561950683594, + "loss": 0.724, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.868155837059021, + "rewards/margins": 0.054594431072473526, + "rewards/rejected": -0.9227503538131714, + "step": 232 + }, + { + "epoch": 0.3, + "learning_rate": 4.954718944195512e-05, + "logits/chosen": -2.6072440147399902, + "logits/rejected": -2.6623966693878174, + "logps/chosen": -184.31289672851562, + "logps/rejected": -173.9798126220703, + "loss": 0.7546, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.8332484364509583, + "rewards/margins": -0.08707739412784576, + "rewards/rejected": -0.7461711168289185, + "step": 233 + }, + { + "epoch": 0.31, + "learning_rate": 4.954037568191534e-05, + "logits/chosen": -2.6110591888427734, + "logits/rejected": -2.568448781967163, + "logps/chosen": -222.3007049560547, + "logps/rejected": -193.83935546875, + "loss": 0.769, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.9347423315048218, + "rewards/margins": -0.09616127610206604, + "rewards/rejected": -0.8385810256004333, + "step": 234 + }, + { + "epoch": 0.31, + "learning_rate": 4.9533511513839384e-05, + "logits/chosen": -2.7532308101654053, + "logits/rejected": -2.7335729598999023, + "logps/chosen": -218.836669921875, + "logps/rejected": -247.48204040527344, + "loss": 0.7134, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9885715246200562, + "rewards/margins": 0.15268389880657196, + "rewards/rejected": -1.1412553787231445, + "step": 235 + }, + { + "epoch": 0.31, + "learning_rate": 4.9526596951826824e-05, + "logits/chosen": -2.6721489429473877, + "logits/rejected": -2.6597042083740234, + "logps/chosen": -187.82127380371094, + "logps/rejected": -174.8271026611328, + "loss": 0.6244, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5805238485336304, + "rewards/margins": 0.22618348896503448, + "rewards/rejected": -0.8067073225975037, + "step": 236 + }, + { + "epoch": 0.31, + "learning_rate": 4.951963201008076e-05, + "logits/chosen": -2.855266571044922, + "logits/rejected": -2.8631551265716553, + "logps/chosen": -243.0494384765625, + "logps/rejected": -228.80966186523438, + "loss": 0.7247, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.934760332107544, + "rewards/margins": 0.034967467188835144, + "rewards/rejected": -0.9697277545928955, + "step": 237 + }, + { + "epoch": 0.31, + "learning_rate": 4.951261670290781e-05, + "logits/chosen": -2.664848566055298, + "logits/rejected": -2.730018138885498, + "logps/chosen": -192.22723388671875, + "logps/rejected": -171.22459411621094, + "loss": 0.6765, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6986839771270752, + "rewards/margins": 0.09998993575572968, + "rewards/rejected": -0.7986739277839661, + "step": 238 + }, + { + "epoch": 0.31, + "learning_rate": 4.950555104471799e-05, + "logits/chosen": -2.6024093627929688, + "logits/rejected": -2.6131458282470703, + "logps/chosen": -164.9868927001953, + "logps/rejected": -153.97813415527344, + "loss": 0.7123, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7849129438400269, + "rewards/margins": 0.029222920536994934, + "rewards/rejected": -0.8141359090805054, + "step": 239 + }, + { + "epoch": 0.31, + "learning_rate": 4.949843505002477e-05, + "logits/chosen": -2.509110689163208, + "logits/rejected": -2.5836997032165527, + "logps/chosen": -156.37550354003906, + "logps/rejected": -177.84518432617188, + "loss": 0.5906, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.40633201599121094, + "rewards/margins": 0.2858356237411499, + "rewards/rejected": -0.6921676397323608, + "step": 240 + }, + { + "epoch": 0.32, + "learning_rate": 4.9491268733445034e-05, + "logits/chosen": -2.5929789543151855, + "logits/rejected": -2.593715190887451, + "logps/chosen": -159.74212646484375, + "logps/rejected": -177.87425231933594, + "loss": 0.6413, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4330030679702759, + "rewards/margins": 0.15460006892681122, + "rewards/rejected": -0.5876031517982483, + "step": 241 + }, + { + "epoch": 0.32, + "learning_rate": 4.9484052109698984e-05, + "logits/chosen": -2.581435441970825, + "logits/rejected": -2.598817825317383, + "logps/chosen": -169.76776123046875, + "logps/rejected": -170.72422790527344, + "loss": 0.6379, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5581039786338806, + "rewards/margins": 0.1779794692993164, + "rewards/rejected": -0.736083447933197, + "step": 242 + }, + { + "epoch": 0.32, + "learning_rate": 4.947678519361021e-05, + "logits/chosen": -2.5178732872009277, + "logits/rejected": -2.5693912506103516, + "logps/chosen": -173.69699096679688, + "logps/rejected": -174.0066680908203, + "loss": 0.6503, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5450050830841064, + "rewards/margins": 0.11375146359205246, + "rewards/rejected": -0.6587565541267395, + "step": 243 + }, + { + "epoch": 0.32, + "learning_rate": 4.946946800010556e-05, + "logits/chosen": -2.5523407459259033, + "logits/rejected": -2.537888288497925, + "logps/chosen": -175.92843627929688, + "logps/rejected": -160.6066436767578, + "loss": 0.7642, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7941773533821106, + "rewards/margins": -0.09637541323900223, + "rewards/rejected": -0.697801947593689, + "step": 244 + }, + { + "epoch": 0.32, + "learning_rate": 4.946210054421518e-05, + "logits/chosen": -2.688391923904419, + "logits/rejected": -2.702990770339966, + "logps/chosen": -222.63352966308594, + "logps/rejected": -215.47323608398438, + "loss": 0.6372, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6142177581787109, + "rewards/margins": 0.2073356807231903, + "rewards/rejected": -0.8215534687042236, + "step": 245 + }, + { + "epoch": 0.32, + "learning_rate": 4.945468284107246e-05, + "logits/chosen": -2.5451714992523193, + "logits/rejected": -2.5484871864318848, + "logps/chosen": -196.5067138671875, + "logps/rejected": -190.59320068359375, + "loss": 0.6548, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5240797996520996, + "rewards/margins": 0.11533726006746292, + "rewards/rejected": -0.6394170522689819, + "step": 246 + }, + { + "epoch": 0.32, + "learning_rate": 4.944721490591401e-05, + "logits/chosen": -2.472393274307251, + "logits/rejected": -2.587679386138916, + "logps/chosen": -165.8987274169922, + "logps/rejected": -188.1234130859375, + "loss": 0.5868, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4879373013973236, + "rewards/margins": 0.2605348229408264, + "rewards/rejected": -0.7484720349311829, + "step": 247 + }, + { + "epoch": 0.32, + "learning_rate": 4.9439696754079595e-05, + "logits/chosen": -2.800347089767456, + "logits/rejected": -2.7822184562683105, + "logps/chosen": -248.7989044189453, + "logps/rejected": -214.1162567138672, + "loss": 0.8436, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8426647782325745, + "rewards/margins": -0.20171405375003815, + "rewards/rejected": -0.6409507393836975, + "step": 248 + }, + { + "epoch": 0.33, + "learning_rate": 4.9432128401012144e-05, + "logits/chosen": -2.5632007122039795, + "logits/rejected": -2.5047194957733154, + "logps/chosen": -185.19070434570312, + "logps/rejected": -204.64364624023438, + "loss": 0.7785, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7219403386116028, + "rewards/margins": -0.07751601189374924, + "rewards/rejected": -0.6444243788719177, + "step": 249 + }, + { + "epoch": 0.33, + "learning_rate": 4.9424509862257706e-05, + "logits/chosen": -2.529867649078369, + "logits/rejected": -2.5607314109802246, + "logps/chosen": -219.9513397216797, + "logps/rejected": -240.11671447753906, + "loss": 0.6592, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4247688055038452, + "rewards/margins": 0.10957615077495575, + "rewards/rejected": -0.5343449115753174, + "step": 250 + }, + { + "epoch": 0.33, + "learning_rate": 4.941684115346541e-05, + "logits/chosen": -2.7805376052856445, + "logits/rejected": -2.837836742401123, + "logps/chosen": -177.21543884277344, + "logps/rejected": -213.7078094482422, + "loss": 0.6014, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5495901107788086, + "rewards/margins": 0.268564373254776, + "rewards/rejected": -0.8181545734405518, + "step": 251 + }, + { + "epoch": 0.33, + "learning_rate": 4.940912229038745e-05, + "logits/chosen": -2.56017804145813, + "logits/rejected": -2.5503697395324707, + "logps/chosen": -170.02735900878906, + "logps/rejected": -161.25509643554688, + "loss": 0.7194, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5123621225357056, + "rewards/margins": -0.0268712155520916, + "rewards/rejected": -0.4854908883571625, + "step": 252 + }, + { + "epoch": 0.33, + "learning_rate": 4.9401353288879024e-05, + "logits/chosen": -2.5448572635650635, + "logits/rejected": -2.56659197807312, + "logps/chosen": -169.89077758789062, + "logps/rejected": -190.93545532226562, + "loss": 0.6295, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4579932391643524, + "rewards/margins": 0.1804841011762619, + "rewards/rejected": -0.6384773254394531, + "step": 253 + }, + { + "epoch": 0.33, + "learning_rate": 4.9393534164898335e-05, + "logits/chosen": -2.55391526222229, + "logits/rejected": -2.5868449211120605, + "logps/chosen": -158.18197631835938, + "logps/rejected": -197.00271606445312, + "loss": 0.6476, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6932105422019958, + "rewards/margins": 0.13315898180007935, + "rewards/rejected": -0.8263695240020752, + "step": 254 + }, + { + "epoch": 0.33, + "learning_rate": 4.9385664934506526e-05, + "logits/chosen": -2.554259777069092, + "logits/rejected": -2.684239625930786, + "logps/chosen": -159.8101348876953, + "logps/rejected": -183.30532836914062, + "loss": 0.6516, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5770647525787354, + "rewards/margins": 0.14322030544281006, + "rewards/rejected": -0.7202850580215454, + "step": 255 + }, + { + "epoch": 0.34, + "learning_rate": 4.937774561386768e-05, + "logits/chosen": -2.5128979682922363, + "logits/rejected": -2.6388025283813477, + "logps/chosen": -174.58401489257812, + "logps/rejected": -184.96910095214844, + "loss": 0.577, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5345829129219055, + "rewards/margins": 0.3504784107208252, + "rewards/rejected": -0.8850612640380859, + "step": 256 + }, + { + "epoch": 0.34, + "learning_rate": 4.936977621924875e-05, + "logits/chosen": -2.6937482357025146, + "logits/rejected": -2.650275707244873, + "logps/chosen": -165.79302978515625, + "logps/rejected": -177.26170349121094, + "loss": 0.6382, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.614943265914917, + "rewards/margins": 0.14949731528759003, + "rewards/rejected": -0.7644405961036682, + "step": 257 + }, + { + "epoch": 0.34, + "learning_rate": 4.9361756767019564e-05, + "logits/chosen": -2.5641212463378906, + "logits/rejected": -2.6175105571746826, + "logps/chosen": -182.44837951660156, + "logps/rejected": -225.40805053710938, + "loss": 0.692, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6991927027702332, + "rewards/margins": 0.0908452570438385, + "rewards/rejected": -0.790037989616394, + "step": 258 + }, + { + "epoch": 0.34, + "learning_rate": 4.935368727365276e-05, + "logits/chosen": -2.6357033252716064, + "logits/rejected": -2.57110857963562, + "logps/chosen": -182.70291137695312, + "logps/rejected": -223.89857482910156, + "loss": 0.6278, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7485102415084839, + "rewards/margins": 0.1697012186050415, + "rewards/rejected": -0.9182114601135254, + "step": 259 + }, + { + "epoch": 0.34, + "learning_rate": 4.934556775572377e-05, + "logits/chosen": -2.6168265342712402, + "logits/rejected": -2.6644468307495117, + "logps/chosen": -165.17758178710938, + "logps/rejected": -188.50119018554688, + "loss": 0.7554, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.5862306356430054, + "rewards/margins": -0.08211595565080643, + "rewards/rejected": -0.5041146874427795, + "step": 260 + }, + { + "epoch": 0.34, + "learning_rate": 4.9337398229910784e-05, + "logits/chosen": -2.595906972885132, + "logits/rejected": -2.594231128692627, + "logps/chosen": -188.159912109375, + "logps/rejected": -176.15074157714844, + "loss": 0.6936, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6705946922302246, + "rewards/margins": 0.03840280696749687, + "rewards/rejected": -0.7089974284172058, + "step": 261 + }, + { + "epoch": 0.34, + "learning_rate": 4.932917871299471e-05, + "logits/chosen": -2.672065496444702, + "logits/rejected": -2.6840691566467285, + "logps/chosen": -173.274169921875, + "logps/rejected": -183.5944061279297, + "loss": 0.6154, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.48323020339012146, + "rewards/margins": 0.2292642742395401, + "rewards/rejected": -0.7124944925308228, + "step": 262 + }, + { + "epoch": 0.34, + "learning_rate": 4.9320909221859134e-05, + "logits/chosen": -2.660583972930908, + "logits/rejected": -2.7142574787139893, + "logps/chosen": -192.77145385742188, + "logps/rejected": -185.00450134277344, + "loss": 0.6823, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7117425799369812, + "rewards/margins": 0.10454593598842621, + "rewards/rejected": -0.8162885308265686, + "step": 263 + }, + { + "epoch": 0.35, + "learning_rate": 4.9312589773490304e-05, + "logits/chosen": -2.5676193237304688, + "logits/rejected": -2.510430335998535, + "logps/chosen": -176.65432739257812, + "logps/rejected": -154.3079833984375, + "loss": 0.7678, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.8498928546905518, + "rewards/margins": -0.05464668944478035, + "rewards/rejected": -0.7952461242675781, + "step": 264 + }, + { + "epoch": 0.35, + "learning_rate": 4.930422038497708e-05, + "logits/chosen": -2.5558042526245117, + "logits/rejected": -2.672940492630005, + "logps/chosen": -173.69500732421875, + "logps/rejected": -180.78118896484375, + "loss": 0.5704, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5608630776405334, + "rewards/margins": 0.38767609000205994, + "rewards/rejected": -0.948539137840271, + "step": 265 + }, + { + "epoch": 0.35, + "learning_rate": 4.92958010735109e-05, + "logits/chosen": -2.508685350418091, + "logits/rejected": -2.4861252307891846, + "logps/chosen": -196.0287628173828, + "logps/rejected": -212.8455810546875, + "loss": 0.6453, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6239847540855408, + "rewards/margins": 0.18039949238300323, + "rewards/rejected": -0.8043842315673828, + "step": 266 + }, + { + "epoch": 0.35, + "learning_rate": 4.928733185638575e-05, + "logits/chosen": -2.641526222229004, + "logits/rejected": -2.630765199661255, + "logps/chosen": -205.0409698486328, + "logps/rejected": -201.60269165039062, + "loss": 0.651, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7301749587059021, + "rewards/margins": 0.1289985626935959, + "rewards/rejected": -0.859173595905304, + "step": 267 + }, + { + "epoch": 0.35, + "learning_rate": 4.927881275099815e-05, + "logits/chosen": -2.5749480724334717, + "logits/rejected": -2.5817878246307373, + "logps/chosen": -187.1511688232422, + "logps/rejected": -236.642822265625, + "loss": 0.6596, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6212616562843323, + "rewards/margins": 0.13300594687461853, + "rewards/rejected": -0.7542675733566284, + "step": 268 + }, + { + "epoch": 0.35, + "learning_rate": 4.927024377484705e-05, + "logits/chosen": -2.6451222896575928, + "logits/rejected": -2.700425863265991, + "logps/chosen": -176.2154083251953, + "logps/rejected": -211.41744995117188, + "loss": 0.7046, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8719490766525269, + "rewards/margins": 0.05133984610438347, + "rewards/rejected": -0.9232889413833618, + "step": 269 + }, + { + "epoch": 0.35, + "learning_rate": 4.9261624945533855e-05, + "logits/chosen": -2.5780887603759766, + "logits/rejected": -2.669583320617676, + "logps/chosen": -181.3977813720703, + "logps/rejected": -242.43626403808594, + "loss": 0.6873, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6802850365638733, + "rewards/margins": 0.029887204989790916, + "rewards/rejected": -0.7101722955703735, + "step": 270 + }, + { + "epoch": 0.35, + "learning_rate": 4.925295628076241e-05, + "logits/chosen": -2.6090972423553467, + "logits/rejected": -2.683279275894165, + "logps/chosen": -192.7779541015625, + "logps/rejected": -241.75607299804688, + "loss": 0.6072, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7047219276428223, + "rewards/margins": 0.2544190287590027, + "rewards/rejected": -0.9591410160064697, + "step": 271 + }, + { + "epoch": 0.36, + "learning_rate": 4.9244237798338866e-05, + "logits/chosen": -2.7260890007019043, + "logits/rejected": -2.7258718013763428, + "logps/chosen": -206.13421630859375, + "logps/rejected": -211.11663818359375, + "loss": 0.7085, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.855944037437439, + "rewards/margins": 0.07277999073266983, + "rewards/rejected": -0.9287241101264954, + "step": 272 + }, + { + "epoch": 0.36, + "learning_rate": 4.923546951617175e-05, + "logits/chosen": -2.648552417755127, + "logits/rejected": -2.5934343338012695, + "logps/chosen": -170.73721313476562, + "logps/rejected": -186.487548828125, + "loss": 0.6374, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7491826415061951, + "rewards/margins": 0.1971735656261444, + "rewards/rejected": -0.9463562369346619, + "step": 273 + }, + { + "epoch": 0.36, + "learning_rate": 4.922665145227187e-05, + "logits/chosen": -2.5815610885620117, + "logits/rejected": -2.5234570503234863, + "logps/chosen": -153.86798095703125, + "logps/rejected": -140.2138671875, + "loss": 0.8232, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.9449439644813538, + "rewards/margins": -0.13928692042827606, + "rewards/rejected": -0.8056570291519165, + "step": 274 + }, + { + "epoch": 0.36, + "learning_rate": 4.9217783624752266e-05, + "logits/chosen": -2.4257960319519043, + "logits/rejected": -2.481285333633423, + "logps/chosen": -129.22506713867188, + "logps/rejected": -133.507080078125, + "loss": 0.6863, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7729750275611877, + "rewards/margins": 0.07025709748268127, + "rewards/rejected": -0.8432320356369019, + "step": 275 + }, + { + "epoch": 0.36, + "learning_rate": 4.920886605182823e-05, + "logits/chosen": -2.8374218940734863, + "logits/rejected": -2.840282440185547, + "logps/chosen": -183.80465698242188, + "logps/rejected": -185.8097381591797, + "loss": 0.6878, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8535721898078918, + "rewards/margins": 0.04765475541353226, + "rewards/rejected": -0.9012269377708435, + "step": 276 + }, + { + "epoch": 0.36, + "learning_rate": 4.919989875181722e-05, + "logits/chosen": -2.7011358737945557, + "logits/rejected": -2.758044719696045, + "logps/chosen": -176.79345703125, + "logps/rejected": -174.94239807128906, + "loss": 0.6994, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.159913182258606, + "rewards/margins": 0.09292294830083847, + "rewards/rejected": -1.2528361082077026, + "step": 277 + }, + { + "epoch": 0.36, + "learning_rate": 4.919088174313884e-05, + "logits/chosen": -2.614689826965332, + "logits/rejected": -2.645404577255249, + "logps/chosen": -134.0303955078125, + "logps/rejected": -162.16595458984375, + "loss": 0.6113, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6662919521331787, + "rewards/margins": 0.26484376192092896, + "rewards/rejected": -0.9311355948448181, + "step": 278 + }, + { + "epoch": 0.37, + "learning_rate": 4.91818150443148e-05, + "logits/chosen": -2.6832685470581055, + "logits/rejected": -2.670450448989868, + "logps/chosen": -196.30052185058594, + "logps/rejected": -179.43482971191406, + "loss": 0.6394, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9087074995040894, + "rewards/margins": 0.1679982841014862, + "rewards/rejected": -1.076705813407898, + "step": 279 + }, + { + "epoch": 0.37, + "learning_rate": 4.917269867396886e-05, + "logits/chosen": -2.8207449913024902, + "logits/rejected": -2.773184061050415, + "logps/chosen": -198.20509338378906, + "logps/rejected": -184.33151245117188, + "loss": 0.7475, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.9387542009353638, + "rewards/margins": 0.003928817808628082, + "rewards/rejected": -0.9426830410957336, + "step": 280 + }, + { + "epoch": 0.37, + "learning_rate": 4.916353265082686e-05, + "logits/chosen": -2.75034236907959, + "logits/rejected": -2.7130117416381836, + "logps/chosen": -187.1002197265625, + "logps/rejected": -193.2103271484375, + "loss": 0.8817, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.0758837461471558, + "rewards/margins": -0.2836476266384125, + "rewards/rejected": -0.7922362089157104, + "step": 281 + }, + { + "epoch": 0.37, + "learning_rate": 4.9154316993716565e-05, + "logits/chosen": -2.7884602546691895, + "logits/rejected": -2.889566421508789, + "logps/chosen": -173.12225341796875, + "logps/rejected": -181.60842895507812, + "loss": 0.6636, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.0141631364822388, + "rewards/margins": 0.16198423504829407, + "rewards/rejected": -1.1761474609375, + "step": 282 + }, + { + "epoch": 0.37, + "learning_rate": 4.9145051721567734e-05, + "logits/chosen": -2.711033344268799, + "logits/rejected": -2.731414318084717, + "logps/chosen": -190.81759643554688, + "logps/rejected": -214.6211700439453, + "loss": 0.5886, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9142743945121765, + "rewards/margins": 0.34260398149490356, + "rewards/rejected": -1.25687837600708, + "step": 283 + }, + { + "epoch": 0.37, + "learning_rate": 4.913573685341205e-05, + "logits/chosen": -2.534449338912964, + "logits/rejected": -2.6240017414093018, + "logps/chosen": -158.4725341796875, + "logps/rejected": -144.93075561523438, + "loss": 0.6633, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8281882405281067, + "rewards/margins": 0.08525969088077545, + "rewards/rejected": -0.9134478569030762, + "step": 284 + }, + { + "epoch": 0.37, + "learning_rate": 4.9126372408383025e-05, + "logits/chosen": -2.8126165866851807, + "logits/rejected": -2.9268059730529785, + "logps/chosen": -165.80160522460938, + "logps/rejected": -195.13014221191406, + "loss": 0.6719, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0856614112854004, + "rewards/margins": 0.1156027615070343, + "rewards/rejected": -1.2012642621994019, + "step": 285 + }, + { + "epoch": 0.37, + "learning_rate": 4.911695840571605e-05, + "logits/chosen": -2.8474719524383545, + "logits/rejected": -2.870950698852539, + "logps/chosen": -185.34619140625, + "logps/rejected": -204.24951171875, + "loss": 0.6981, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9124755859375, + "rewards/margins": 0.032730571925640106, + "rewards/rejected": -0.9452061057090759, + "step": 286 + }, + { + "epoch": 0.38, + "learning_rate": 4.910749486474828e-05, + "logits/chosen": -2.7202816009521484, + "logits/rejected": -2.7738454341888428, + "logps/chosen": -183.0021514892578, + "logps/rejected": -176.14559936523438, + "loss": 0.7423, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.8500471711158752, + "rewards/margins": -0.009690776467323303, + "rewards/rejected": -0.8403564691543579, + "step": 287 + }, + { + "epoch": 0.38, + "learning_rate": 4.909798180491865e-05, + "logits/chosen": -2.8064842224121094, + "logits/rejected": -2.8379313945770264, + "logps/chosen": -181.36424255371094, + "logps/rejected": -186.806884765625, + "loss": 0.7156, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.9925443530082703, + "rewards/margins": 0.02362808585166931, + "rewards/rejected": -1.0161724090576172, + "step": 288 + }, + { + "epoch": 0.38, + "learning_rate": 4.9088419245767803e-05, + "logits/chosen": -2.591299057006836, + "logits/rejected": -2.6505491733551025, + "logps/chosen": -165.84970092773438, + "logps/rejected": -189.2133331298828, + "loss": 0.6588, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6631417870521545, + "rewards/margins": 0.1258070468902588, + "rewards/rejected": -0.7889488339424133, + "step": 289 + }, + { + "epoch": 0.38, + "learning_rate": 4.907880720693804e-05, + "logits/chosen": -2.9960107803344727, + "logits/rejected": -2.9374802112579346, + "logps/chosen": -213.4750518798828, + "logps/rejected": -237.82672119140625, + "loss": 0.656, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8233738541603088, + "rewards/margins": 0.1019565686583519, + "rewards/rejected": -0.9253304600715637, + "step": 290 + }, + { + "epoch": 0.38, + "learning_rate": 4.9069145708173324e-05, + "logits/chosen": -2.6092634201049805, + "logits/rejected": -2.6358261108398438, + "logps/chosen": -197.22938537597656, + "logps/rejected": -190.7332000732422, + "loss": 0.613, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7702199220657349, + "rewards/margins": 0.23700743913650513, + "rewards/rejected": -1.0072274208068848, + "step": 291 + }, + { + "epoch": 0.38, + "learning_rate": 4.9059434769319205e-05, + "logits/chosen": -2.731299638748169, + "logits/rejected": -2.8226795196533203, + "logps/chosen": -202.76451110839844, + "logps/rejected": -244.38844299316406, + "loss": 0.589, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6375839114189148, + "rewards/margins": 0.2866172790527344, + "rewards/rejected": -0.9242011904716492, + "step": 292 + }, + { + "epoch": 0.38, + "learning_rate": 4.904967441032278e-05, + "logits/chosen": -2.478205680847168, + "logits/rejected": -2.541795253753662, + "logps/chosen": -195.00408935546875, + "logps/rejected": -225.96731567382812, + "loss": 0.6568, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6428064107894897, + "rewards/margins": 0.17480693757534027, + "rewards/rejected": -0.8176133632659912, + "step": 293 + }, + { + "epoch": 0.38, + "learning_rate": 4.903986465123266e-05, + "logits/chosen": -2.692394733428955, + "logits/rejected": -2.7363274097442627, + "logps/chosen": -167.2650146484375, + "logps/rejected": -220.62228393554688, + "loss": 0.7124, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.8301478624343872, + "rewards/margins": 0.07167816907167435, + "rewards/rejected": -0.901826024055481, + "step": 294 + }, + { + "epoch": 0.39, + "learning_rate": 4.903000551219894e-05, + "logits/chosen": -2.782505989074707, + "logits/rejected": -2.81825590133667, + "logps/chosen": -153.23898315429688, + "logps/rejected": -157.7719268798828, + "loss": 0.7488, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8630144596099854, + "rewards/margins": -0.03182988613843918, + "rewards/rejected": -0.8311845660209656, + "step": 295 + }, + { + "epoch": 0.39, + "learning_rate": 4.902009701347313e-05, + "logits/chosen": -2.7017691135406494, + "logits/rejected": -2.751739978790283, + "logps/chosen": -199.3386688232422, + "logps/rejected": -189.38465881347656, + "loss": 0.7063, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7444071769714355, + "rewards/margins": 0.05856693163514137, + "rewards/rejected": -0.8029740452766418, + "step": 296 + }, + { + "epoch": 0.39, + "learning_rate": 4.901013917540814e-05, + "logits/chosen": -2.62956166267395, + "logits/rejected": -2.65138578414917, + "logps/chosen": -209.6748046875, + "logps/rejected": -200.0463409423828, + "loss": 0.6986, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.778890073299408, + "rewards/margins": 0.03998234495520592, + "rewards/rejected": -0.8188724517822266, + "step": 297 + }, + { + "epoch": 0.39, + "learning_rate": 4.900013201845821e-05, + "logits/chosen": -2.604013204574585, + "logits/rejected": -2.6575911045074463, + "logps/chosen": -192.33749389648438, + "logps/rejected": -196.74574279785156, + "loss": 0.7202, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7086251378059387, + "rewards/margins": 0.026372164487838745, + "rewards/rejected": -0.7349973917007446, + "step": 298 + }, + { + "epoch": 0.39, + "learning_rate": 4.899007556317893e-05, + "logits/chosen": -2.672982931137085, + "logits/rejected": -2.7704596519470215, + "logps/chosen": -232.0260009765625, + "logps/rejected": -230.7442169189453, + "loss": 0.666, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.703597366809845, + "rewards/margins": 0.1020139679312706, + "rewards/rejected": -0.8056113123893738, + "step": 299 + }, + { + "epoch": 0.39, + "learning_rate": 4.8979969830227086e-05, + "logits/chosen": -2.7685508728027344, + "logits/rejected": -2.798677921295166, + "logps/chosen": -170.82254028320312, + "logps/rejected": -211.26219177246094, + "loss": 0.6242, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6501718759536743, + "rewards/margins": 0.23585857450962067, + "rewards/rejected": -0.8860303163528442, + "step": 300 + }, + { + "epoch": 0.39, + "learning_rate": 4.896981484036074e-05, + "logits/chosen": -2.742246150970459, + "logits/rejected": -2.7133634090423584, + "logps/chosen": -188.86024475097656, + "logps/rejected": -199.79791259765625, + "loss": 0.5836, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5299299955368042, + "rewards/margins": 0.2819333076477051, + "rewards/rejected": -0.8118634223937988, + "step": 301 + }, + { + "epoch": 0.4, + "learning_rate": 4.895961061443911e-05, + "logits/chosen": -2.726637840270996, + "logits/rejected": -2.728475332260132, + "logps/chosen": -204.32278442382812, + "logps/rejected": -233.36709594726562, + "loss": 0.8344, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.8258933424949646, + "rewards/margins": -0.08616884052753448, + "rewards/rejected": -0.7397244572639465, + "step": 302 + }, + { + "epoch": 0.4, + "learning_rate": 4.894935717342255e-05, + "logits/chosen": -2.757063865661621, + "logits/rejected": -2.7701942920684814, + "logps/chosen": -192.92251586914062, + "logps/rejected": -192.76185607910156, + "loss": 0.7007, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.71357262134552, + "rewards/margins": 0.04112683981657028, + "rewards/rejected": -0.7546994686126709, + "step": 303 + }, + { + "epoch": 0.4, + "learning_rate": 4.8939054538372496e-05, + "logits/chosen": -2.6160852909088135, + "logits/rejected": -2.6638543605804443, + "logps/chosen": -153.6888427734375, + "logps/rejected": -204.19139099121094, + "loss": 0.6179, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5954157710075378, + "rewards/margins": 0.2527124583721161, + "rewards/rejected": -0.8481282591819763, + "step": 304 + }, + { + "epoch": 0.4, + "learning_rate": 4.8928702730451456e-05, + "logits/chosen": -2.635458469390869, + "logits/rejected": -2.740834951400757, + "logps/chosen": -229.58181762695312, + "logps/rejected": -209.87832641601562, + "loss": 0.7217, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7451784610748291, + "rewards/margins": 0.019871072843670845, + "rewards/rejected": -0.7650495171546936, + "step": 305 + }, + { + "epoch": 0.4, + "learning_rate": 4.891830177092294e-05, + "logits/chosen": -2.515532970428467, + "logits/rejected": -2.5850772857666016, + "logps/chosen": -173.58749389648438, + "logps/rejected": -197.440185546875, + "loss": 0.6521, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6022768616676331, + "rewards/margins": 0.11426748335361481, + "rewards/rejected": -0.7165443301200867, + "step": 306 + }, + { + "epoch": 0.4, + "learning_rate": 4.8907851681151396e-05, + "logits/chosen": -2.708148956298828, + "logits/rejected": -2.735069751739502, + "logps/chosen": -146.8754425048828, + "logps/rejected": -182.95477294921875, + "loss": 0.6006, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.35234156250953674, + "rewards/margins": 0.24402308464050293, + "rewards/rejected": -0.5963646173477173, + "step": 307 + }, + { + "epoch": 0.4, + "learning_rate": 4.889735248260221e-05, + "logits/chosen": -2.657132625579834, + "logits/rejected": -2.7396936416625977, + "logps/chosen": -172.76268005371094, + "logps/rejected": -185.73153686523438, + "loss": 0.6844, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7841547727584839, + "rewards/margins": 0.08935101330280304, + "rewards/rejected": -0.8735058307647705, + "step": 308 + }, + { + "epoch": 0.4, + "learning_rate": 4.8886804196841626e-05, + "logits/chosen": -2.6601943969726562, + "logits/rejected": -2.6764605045318604, + "logps/chosen": -182.19937133789062, + "logps/rejected": -196.57550048828125, + "loss": 0.6327, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7912774085998535, + "rewards/margins": 0.19412587583065033, + "rewards/rejected": -0.9854032397270203, + "step": 309 + }, + { + "epoch": 0.41, + "learning_rate": 4.887620684553674e-05, + "logits/chosen": -2.5685248374938965, + "logits/rejected": -2.5472164154052734, + "logps/chosen": -161.2342529296875, + "logps/rejected": -201.6751708984375, + "loss": 0.683, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6504772901535034, + "rewards/margins": 0.089483842253685, + "rewards/rejected": -0.7399611473083496, + "step": 310 + }, + { + "epoch": 0.41, + "learning_rate": 4.886556045045542e-05, + "logits/chosen": -2.7824556827545166, + "logits/rejected": -2.772510528564453, + "logps/chosen": -183.0452880859375, + "logps/rejected": -197.40408325195312, + "loss": 0.8643, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.0302786827087402, + "rewards/margins": -0.2259717881679535, + "rewards/rejected": -0.8043068647384644, + "step": 311 + }, + { + "epoch": 0.41, + "learning_rate": 4.8854865033466275e-05, + "logits/chosen": -2.457235097885132, + "logits/rejected": -2.5225629806518555, + "logps/chosen": -137.17259216308594, + "logps/rejected": -151.13467407226562, + "loss": 0.5958, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6421889662742615, + "rewards/margins": 0.3034355044364929, + "rewards/rejected": -0.9456245303153992, + "step": 312 + }, + { + "epoch": 0.41, + "learning_rate": 4.88441206165386e-05, + "logits/chosen": -2.7572522163391113, + "logits/rejected": -2.7597944736480713, + "logps/chosen": -194.38626098632812, + "logps/rejected": -208.2905731201172, + "loss": 0.848, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.8563791513442993, + "rewards/margins": -0.21535280346870422, + "rewards/rejected": -0.6410263180732727, + "step": 313 + }, + { + "epoch": 0.41, + "learning_rate": 4.8833327221742356e-05, + "logits/chosen": -2.589763641357422, + "logits/rejected": -2.5469565391540527, + "logps/chosen": -159.60533142089844, + "logps/rejected": -153.5530548095703, + "loss": 0.6174, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6452800035476685, + "rewards/margins": 0.2377607673406601, + "rewards/rejected": -0.8830407857894897, + "step": 314 + }, + { + "epoch": 0.41, + "learning_rate": 4.88224848712481e-05, + "logits/chosen": -2.6565144062042236, + "logits/rejected": -2.694272994995117, + "logps/chosen": -175.2017059326172, + "logps/rejected": -171.8043975830078, + "loss": 0.6785, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.514467179775238, + "rewards/margins": 0.11190642416477203, + "rewards/rejected": -0.6263736486434937, + "step": 315 + }, + { + "epoch": 0.41, + "learning_rate": 4.881159358732694e-05, + "logits/chosen": -2.5067477226257324, + "logits/rejected": -2.551682710647583, + "logps/chosen": -187.95533752441406, + "logps/rejected": -237.65907287597656, + "loss": 0.6805, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6460933685302734, + "rewards/margins": 0.15910674631595612, + "rewards/rejected": -0.8052000403404236, + "step": 316 + }, + { + "epoch": 0.41, + "learning_rate": 4.8800653392350526e-05, + "logits/chosen": -2.4550201892852783, + "logits/rejected": -2.519341468811035, + "logps/chosen": -156.3563690185547, + "logps/rejected": -171.7753143310547, + "loss": 0.7012, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7904012799263, + "rewards/margins": 0.05794687569141388, + "rewards/rejected": -0.8483481407165527, + "step": 317 + }, + { + "epoch": 0.42, + "learning_rate": 4.8789664308790936e-05, + "logits/chosen": -2.7174458503723145, + "logits/rejected": -2.6885735988616943, + "logps/chosen": -172.3370361328125, + "logps/rejected": -164.46519470214844, + "loss": 0.6861, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6009785532951355, + "rewards/margins": 0.08145736157894135, + "rewards/rejected": -0.682435929775238, + "step": 318 + }, + { + "epoch": 0.42, + "learning_rate": 4.8778626359220715e-05, + "logits/chosen": -2.731224536895752, + "logits/rejected": -2.6857399940490723, + "logps/chosen": -185.449951171875, + "logps/rejected": -206.4256591796875, + "loss": 0.7672, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7531261444091797, + "rewards/margins": -0.07965384423732758, + "rewards/rejected": -0.6734722852706909, + "step": 319 + }, + { + "epoch": 0.42, + "learning_rate": 4.8767539566312734e-05, + "logits/chosen": -2.5604355335235596, + "logits/rejected": -2.704071044921875, + "logps/chosen": -174.9366912841797, + "logps/rejected": -202.98768615722656, + "loss": 0.6137, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6721182465553284, + "rewards/margins": 0.21909648180007935, + "rewards/rejected": -0.8912147283554077, + "step": 320 + }, + { + "epoch": 0.42, + "learning_rate": 4.875640395284023e-05, + "logits/chosen": -2.758918046951294, + "logits/rejected": -2.7820916175842285, + "logps/chosen": -195.2515869140625, + "logps/rejected": -231.6067352294922, + "loss": 0.5285, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.38608187437057495, + "rewards/margins": 0.4270917773246765, + "rewards/rejected": -0.8131736516952515, + "step": 321 + }, + { + "epoch": 0.42, + "learning_rate": 4.874521954167671e-05, + "logits/chosen": -2.7890138626098633, + "logits/rejected": -2.8068411350250244, + "logps/chosen": -209.79307556152344, + "logps/rejected": -207.8020477294922, + "loss": 0.6367, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6039459109306335, + "rewards/margins": 0.1874314397573471, + "rewards/rejected": -0.7913773059844971, + "step": 322 + }, + { + "epoch": 0.42, + "learning_rate": 4.8733986355795905e-05, + "logits/chosen": -2.675286054611206, + "logits/rejected": -2.699337959289551, + "logps/chosen": -233.40780639648438, + "logps/rejected": -208.18724060058594, + "loss": 0.6598, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6822580695152283, + "rewards/margins": 0.1497945785522461, + "rewards/rejected": -0.8320526480674744, + "step": 323 + }, + { + "epoch": 0.42, + "learning_rate": 4.8722704418271745e-05, + "logits/chosen": -2.5140862464904785, + "logits/rejected": -2.632192850112915, + "logps/chosen": -176.1000213623047, + "logps/rejected": -196.43846130371094, + "loss": 0.7119, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7464176416397095, + "rewards/margins": 0.01839565485715866, + "rewards/rejected": -0.7648133039474487, + "step": 324 + }, + { + "epoch": 0.43, + "learning_rate": 4.871137375227829e-05, + "logits/chosen": -2.6084189414978027, + "logits/rejected": -2.6140480041503906, + "logps/chosen": -188.7124481201172, + "logps/rejected": -177.5293426513672, + "loss": 0.748, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.618392288684845, + "rewards/margins": 0.016466360539197922, + "rewards/rejected": -0.6348586678504944, + "step": 325 + }, + { + "epoch": 0.43, + "learning_rate": 4.869999438108971e-05, + "logits/chosen": -2.6994099617004395, + "logits/rejected": -2.743457555770874, + "logps/chosen": -182.1778564453125, + "logps/rejected": -194.54884338378906, + "loss": 0.6603, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5676161050796509, + "rewards/margins": 0.1599595993757248, + "rewards/rejected": -0.7275756597518921, + "step": 326 + }, + { + "epoch": 0.43, + "learning_rate": 4.8688566328080215e-05, + "logits/chosen": -2.5730295181274414, + "logits/rejected": -2.573648452758789, + "logps/chosen": -199.49908447265625, + "logps/rejected": -235.17568969726562, + "loss": 0.5873, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.379108190536499, + "rewards/margins": 0.30178508162498474, + "rewards/rejected": -0.6808933019638062, + "step": 327 + }, + { + "epoch": 0.43, + "learning_rate": 4.867708961672399e-05, + "logits/chosen": -2.5466060638427734, + "logits/rejected": -2.661322593688965, + "logps/chosen": -183.09585571289062, + "logps/rejected": -185.6378936767578, + "loss": 0.666, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.49351951479911804, + "rewards/margins": 0.11638712882995605, + "rewards/rejected": -0.6099066734313965, + "step": 328 + }, + { + "epoch": 0.43, + "learning_rate": 4.866556427059519e-05, + "logits/chosen": -2.7101516723632812, + "logits/rejected": -2.644563913345337, + "logps/chosen": -187.73348999023438, + "logps/rejected": -169.40293884277344, + "loss": 0.7781, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.8404239416122437, + "rewards/margins": -0.026825089007616043, + "rewards/rejected": -0.8135988116264343, + "step": 329 + }, + { + "epoch": 0.43, + "learning_rate": 4.865399031336787e-05, + "logits/chosen": -2.4861361980438232, + "logits/rejected": -2.565809726715088, + "logps/chosen": -146.98423767089844, + "logps/rejected": -163.1490478515625, + "loss": 0.7333, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5940142869949341, + "rewards/margins": 0.03410058468580246, + "rewards/rejected": -0.6281149387359619, + "step": 330 + }, + { + "epoch": 0.43, + "learning_rate": 4.8642367768815936e-05, + "logits/chosen": -2.6412410736083984, + "logits/rejected": -2.747821807861328, + "logps/chosen": -174.8326873779297, + "logps/rejected": -219.83612060546875, + "loss": 0.5947, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4866630434989929, + "rewards/margins": 0.2575409412384033, + "rewards/rejected": -0.744204044342041, + "step": 331 + }, + { + "epoch": 0.43, + "learning_rate": 4.863069666081307e-05, + "logits/chosen": -2.6463019847869873, + "logits/rejected": -2.796405553817749, + "logps/chosen": -164.44869995117188, + "logps/rejected": -217.47230529785156, + "loss": 0.6092, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5829634070396423, + "rewards/margins": 0.26363080739974976, + "rewards/rejected": -0.8465942144393921, + "step": 332 + }, + { + "epoch": 0.44, + "learning_rate": 4.861897701333274e-05, + "logits/chosen": -2.6675515174865723, + "logits/rejected": -2.6761202812194824, + "logps/chosen": -187.8043212890625, + "logps/rejected": -175.5827178955078, + "loss": 0.8043, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.8770172595977783, + "rewards/margins": -0.06917458772659302, + "rewards/rejected": -0.8078427314758301, + "step": 333 + }, + { + "epoch": 0.44, + "learning_rate": 4.86072088504481e-05, + "logits/chosen": -2.6193923950195312, + "logits/rejected": -2.652172088623047, + "logps/chosen": -180.9799346923828, + "logps/rejected": -197.08245849609375, + "loss": 0.756, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.8450060486793518, + "rewards/margins": -0.008023982867598534, + "rewards/rejected": -0.8369821310043335, + "step": 334 + }, + { + "epoch": 0.44, + "learning_rate": 4.859539219633199e-05, + "logits/chosen": -2.4201056957244873, + "logits/rejected": -2.547577381134033, + "logps/chosen": -142.2472381591797, + "logps/rejected": -171.77197265625, + "loss": 0.5808, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.25414225459098816, + "rewards/margins": 0.2882387042045593, + "rewards/rejected": -0.5423809289932251, + "step": 335 + }, + { + "epoch": 0.44, + "learning_rate": 4.8583527075256804e-05, + "logits/chosen": -2.6075916290283203, + "logits/rejected": -2.6171302795410156, + "logps/chosen": -185.60699462890625, + "logps/rejected": -199.539306640625, + "loss": 0.6039, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6799944043159485, + "rewards/margins": 0.21594738960266113, + "rewards/rejected": -0.8959417939186096, + "step": 336 + }, + { + "epoch": 0.44, + "learning_rate": 4.857161351159454e-05, + "logits/chosen": -2.7377195358276367, + "logits/rejected": -2.722515821456909, + "logps/chosen": -225.74755859375, + "logps/rejected": -224.8936004638672, + "loss": 0.6793, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8174278140068054, + "rewards/margins": 0.10962995886802673, + "rewards/rejected": -0.9270578026771545, + "step": 337 + }, + { + "epoch": 0.44, + "learning_rate": 4.8559651529816664e-05, + "logits/chosen": -2.597797393798828, + "logits/rejected": -2.6779723167419434, + "logps/chosen": -159.58380126953125, + "logps/rejected": -185.880615234375, + "loss": 0.679, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7312408089637756, + "rewards/margins": 0.1112150177359581, + "rewards/rejected": -0.8424558639526367, + "step": 338 + }, + { + "epoch": 0.44, + "learning_rate": 4.854764115449411e-05, + "logits/chosen": -2.7188682556152344, + "logits/rejected": -2.691462993621826, + "logps/chosen": -143.06845092773438, + "logps/rejected": -142.25157165527344, + "loss": 0.6675, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.675825297832489, + "rewards/margins": 0.130048006772995, + "rewards/rejected": -0.8058732748031616, + "step": 339 + }, + { + "epoch": 0.45, + "learning_rate": 4.853558241029723e-05, + "logits/chosen": -2.629163980484009, + "logits/rejected": -2.5702826976776123, + "logps/chosen": -224.11196899414062, + "logps/rejected": -171.19097900390625, + "loss": 0.718, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6522189378738403, + "rewards/margins": 0.046554647386074066, + "rewards/rejected": -0.6987735629081726, + "step": 340 + }, + { + "epoch": 0.45, + "learning_rate": 4.8523475321995715e-05, + "logits/chosen": -2.7033562660217285, + "logits/rejected": -2.5383901596069336, + "logps/chosen": -181.98219299316406, + "logps/rejected": -172.13865661621094, + "loss": 0.719, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5484719276428223, + "rewards/margins": 0.0074146464467048645, + "rewards/rejected": -0.5558865666389465, + "step": 341 + }, + { + "epoch": 0.45, + "learning_rate": 4.8511319914458555e-05, + "logits/chosen": -2.5836706161499023, + "logits/rejected": -2.5528974533081055, + "logps/chosen": -223.9678497314453, + "logps/rejected": -211.67184448242188, + "loss": 0.7904, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7669371366500854, + "rewards/margins": -0.11547183990478516, + "rewards/rejected": -0.6514652967453003, + "step": 342 + }, + { + "epoch": 0.45, + "learning_rate": 4.849911621265401e-05, + "logits/chosen": -2.652245044708252, + "logits/rejected": -2.6907477378845215, + "logps/chosen": -172.45669555664062, + "logps/rejected": -173.8350067138672, + "loss": 0.7916, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.713798999786377, + "rewards/margins": -0.09893038123846054, + "rewards/rejected": -0.6148686408996582, + "step": 343 + }, + { + "epoch": 0.45, + "learning_rate": 4.848686424164953e-05, + "logits/chosen": -2.703127384185791, + "logits/rejected": -2.704859733581543, + "logps/chosen": -214.36392211914062, + "logps/rejected": -185.55931091308594, + "loss": 0.7281, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7356032133102417, + "rewards/margins": -0.018105890601873398, + "rewards/rejected": -0.7174972891807556, + "step": 344 + }, + { + "epoch": 0.45, + "learning_rate": 4.84745640266117e-05, + "logits/chosen": -2.690885066986084, + "logits/rejected": -2.740234851837158, + "logps/chosen": -176.29266357421875, + "logps/rejected": -206.47596740722656, + "loss": 0.5979, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6022865176200867, + "rewards/margins": 0.31845831871032715, + "rewards/rejected": -0.9207448363304138, + "step": 345 + }, + { + "epoch": 0.45, + "learning_rate": 4.846221559280624e-05, + "logits/chosen": -2.6713180541992188, + "logits/rejected": -2.7118892669677734, + "logps/chosen": -144.1902313232422, + "logps/rejected": -170.08709716796875, + "loss": 0.7913, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.737549364566803, + "rewards/margins": -0.0928279459476471, + "rewards/rejected": -0.6447213888168335, + "step": 346 + }, + { + "epoch": 0.45, + "learning_rate": 4.844981896559787e-05, + "logits/chosen": -2.7589731216430664, + "logits/rejected": -2.6973843574523926, + "logps/chosen": -185.05255126953125, + "logps/rejected": -177.13946533203125, + "loss": 0.7678, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8261479735374451, + "rewards/margins": -0.09880837798118591, + "rewards/rejected": -0.7273396253585815, + "step": 347 + }, + { + "epoch": 0.46, + "learning_rate": 4.8437374170450344e-05, + "logits/chosen": -2.659578323364258, + "logits/rejected": -2.617631196975708, + "logps/chosen": -175.1558074951172, + "logps/rejected": -198.41343688964844, + "loss": 0.6794, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8928545117378235, + "rewards/margins": 0.10537480562925339, + "rewards/rejected": -0.9982293844223022, + "step": 348 + }, + { + "epoch": 0.46, + "learning_rate": 4.842488123292632e-05, + "logits/chosen": -2.6015379428863525, + "logits/rejected": -2.5835628509521484, + "logps/chosen": -188.88092041015625, + "logps/rejected": -183.85733032226562, + "loss": 0.6908, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.7475460171699524, + "rewards/margins": 0.20796708762645721, + "rewards/rejected": -0.9555131196975708, + "step": 349 + }, + { + "epoch": 0.46, + "learning_rate": 4.8412340178687374e-05, + "logits/chosen": -2.652589797973633, + "logits/rejected": -2.7427022457122803, + "logps/chosen": -198.62112426757812, + "logps/rejected": -206.33192443847656, + "loss": 0.5545, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6447871923446655, + "rewards/margins": 0.38781607151031494, + "rewards/rejected": -1.0326032638549805, + "step": 350 + }, + { + "epoch": 0.46, + "learning_rate": 4.839975103349391e-05, + "logits/chosen": -2.79940128326416, + "logits/rejected": -2.8857078552246094, + "logps/chosen": -206.7621307373047, + "logps/rejected": -231.99472045898438, + "loss": 0.5811, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5604078769683838, + "rewards/margins": 0.5052842497825623, + "rewards/rejected": -1.0656920671463013, + "step": 351 + }, + { + "epoch": 0.46, + "learning_rate": 4.8387113823205096e-05, + "logits/chosen": -2.562626361846924, + "logits/rejected": -2.5439820289611816, + "logps/chosen": -198.9171142578125, + "logps/rejected": -184.92816162109375, + "loss": 0.7251, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6547099351882935, + "rewards/margins": 0.07420587539672852, + "rewards/rejected": -0.7289157509803772, + "step": 352 + }, + { + "epoch": 0.46, + "learning_rate": 4.8374428573778864e-05, + "logits/chosen": -2.6110103130340576, + "logits/rejected": -2.612607479095459, + "logps/chosen": -176.07400512695312, + "logps/rejected": -181.38418579101562, + "loss": 0.8065, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.758232593536377, + "rewards/margins": -0.0620691180229187, + "rewards/rejected": -0.6961634755134583, + "step": 353 + }, + { + "epoch": 0.46, + "learning_rate": 4.8361695311271795e-05, + "logits/chosen": -2.8273181915283203, + "logits/rejected": -2.830404758453369, + "logps/chosen": -205.66763305664062, + "logps/rejected": -222.3781280517578, + "loss": 0.671, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8250411152839661, + "rewards/margins": 0.13250316679477692, + "rewards/rejected": -0.9575443267822266, + "step": 354 + }, + { + "epoch": 0.46, + "learning_rate": 4.83489140618391e-05, + "logits/chosen": -2.666761875152588, + "logits/rejected": -2.757248878479004, + "logps/chosen": -215.3235626220703, + "logps/rejected": -216.06568908691406, + "loss": 0.6942, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7060136795043945, + "rewards/margins": 0.16570770740509033, + "rewards/rejected": -0.8717214465141296, + "step": 355 + }, + { + "epoch": 0.47, + "learning_rate": 4.833608485173457e-05, + "logits/chosen": -2.3725719451904297, + "logits/rejected": -2.4071693420410156, + "logps/chosen": -146.78704833984375, + "logps/rejected": -186.87950134277344, + "loss": 0.6066, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4698937237262726, + "rewards/margins": 0.2167559713125229, + "rewards/rejected": -0.6866496801376343, + "step": 356 + }, + { + "epoch": 0.47, + "learning_rate": 4.8323207707310496e-05, + "logits/chosen": -2.6677677631378174, + "logits/rejected": -2.5784990787506104, + "logps/chosen": -204.05075073242188, + "logps/rejected": -224.5699920654297, + "loss": 0.5351, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4006405174732208, + "rewards/margins": 0.5226905941963196, + "rewards/rejected": -0.923331081867218, + "step": 357 + }, + { + "epoch": 0.47, + "learning_rate": 4.831028265501764e-05, + "logits/chosen": -2.6300296783447266, + "logits/rejected": -2.810920238494873, + "logps/chosen": -177.45635986328125, + "logps/rejected": -234.4469451904297, + "loss": 0.5034, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.4587719440460205, + "rewards/margins": 0.5265656113624573, + "rewards/rejected": -0.9853376150131226, + "step": 358 + }, + { + "epoch": 0.47, + "learning_rate": 4.829730972140517e-05, + "logits/chosen": -2.5978078842163086, + "logits/rejected": -2.570225954055786, + "logps/chosen": -133.3565673828125, + "logps/rejected": -136.15902709960938, + "loss": 0.632, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3776158094406128, + "rewards/margins": 0.1921352744102478, + "rewards/rejected": -0.5697510242462158, + "step": 359 + }, + { + "epoch": 0.47, + "learning_rate": 4.8284288933120594e-05, + "logits/chosen": -2.5982682704925537, + "logits/rejected": -2.6651642322540283, + "logps/chosen": -181.2775115966797, + "logps/rejected": -203.33876037597656, + "loss": 0.5832, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4960383474826813, + "rewards/margins": 0.31738704442977905, + "rewards/rejected": -0.8134254217147827, + "step": 360 + }, + { + "epoch": 0.47, + "learning_rate": 4.8271220316909735e-05, + "logits/chosen": -2.761739730834961, + "logits/rejected": -2.715710401535034, + "logps/chosen": -191.56158447265625, + "logps/rejected": -175.9281463623047, + "loss": 0.7011, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.62202388048172, + "rewards/margins": 0.03054755926132202, + "rewards/rejected": -0.652571439743042, + "step": 361 + }, + { + "epoch": 0.47, + "learning_rate": 4.825810389961666e-05, + "logits/chosen": -2.614866018295288, + "logits/rejected": -2.6899471282958984, + "logps/chosen": -135.42494201660156, + "logps/rejected": -168.2887725830078, + "loss": 0.6542, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5677900910377502, + "rewards/margins": 0.13706976175308228, + "rewards/rejected": -0.7048598527908325, + "step": 362 + }, + { + "epoch": 0.48, + "learning_rate": 4.8244939708183596e-05, + "logits/chosen": -2.643815040588379, + "logits/rejected": -2.6820266246795654, + "logps/chosen": -178.99908447265625, + "logps/rejected": -212.68692016601562, + "loss": 0.5719, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5018581748008728, + "rewards/margins": 0.3394116163253784, + "rewards/rejected": -0.841269850730896, + "step": 363 + }, + { + "epoch": 0.48, + "learning_rate": 4.823172776965094e-05, + "logits/chosen": -2.702993869781494, + "logits/rejected": -2.7458367347717285, + "logps/chosen": -239.1004638671875, + "logps/rejected": -227.72450256347656, + "loss": 0.6493, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4328669309616089, + "rewards/margins": 0.1465311497449875, + "rewards/rejected": -0.5793980360031128, + "step": 364 + }, + { + "epoch": 0.48, + "learning_rate": 4.821846811115713e-05, + "logits/chosen": -2.658914804458618, + "logits/rejected": -2.728074073791504, + "logps/chosen": -207.8568115234375, + "logps/rejected": -194.11788940429688, + "loss": 0.643, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5129541754722595, + "rewards/margins": 0.19354072213172913, + "rewards/rejected": -0.706494927406311, + "step": 365 + }, + { + "epoch": 0.48, + "learning_rate": 4.820516075993865e-05, + "logits/chosen": -2.741415023803711, + "logits/rejected": -2.6675500869750977, + "logps/chosen": -156.90858459472656, + "logps/rejected": -166.85830688476562, + "loss": 0.8793, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.9281965494155884, + "rewards/margins": -0.2124374508857727, + "rewards/rejected": -0.7157591581344604, + "step": 366 + }, + { + "epoch": 0.48, + "learning_rate": 4.819180574332994e-05, + "logits/chosen": -2.7196927070617676, + "logits/rejected": -2.748945713043213, + "logps/chosen": -202.15493774414062, + "logps/rejected": -194.124755859375, + "loss": 0.7879, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8033719062805176, + "rewards/margins": -0.02236950770020485, + "rewards/rejected": -0.7810022830963135, + "step": 367 + }, + { + "epoch": 0.48, + "learning_rate": 4.8178403088763355e-05, + "logits/chosen": -2.678246259689331, + "logits/rejected": -2.729729652404785, + "logps/chosen": -160.1791534423828, + "logps/rejected": -185.8133544921875, + "loss": 0.7682, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5224413871765137, + "rewards/margins": -0.03584878519177437, + "rewards/rejected": -0.4865925908088684, + "step": 368 + }, + { + "epoch": 0.48, + "learning_rate": 4.8164952823769085e-05, + "logits/chosen": -2.649317979812622, + "logits/rejected": -2.655850410461426, + "logps/chosen": -241.83953857421875, + "logps/rejected": -226.36309814453125, + "loss": 0.6643, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6058496236801147, + "rewards/margins": 0.16187947988510132, + "rewards/rejected": -0.7677291035652161, + "step": 369 + }, + { + "epoch": 0.48, + "learning_rate": 4.815145497597514e-05, + "logits/chosen": -2.711923837661743, + "logits/rejected": -2.7140746116638184, + "logps/chosen": -157.05551147460938, + "logps/rejected": -153.10430908203125, + "loss": 0.7461, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6182957291603088, + "rewards/margins": -0.003988802433013916, + "rewards/rejected": -0.6143069267272949, + "step": 370 + }, + { + "epoch": 0.49, + "learning_rate": 4.8137909573107246e-05, + "logits/chosen": -2.632993459701538, + "logits/rejected": -2.7051479816436768, + "logps/chosen": -193.94879150390625, + "logps/rejected": -189.07920837402344, + "loss": 0.6392, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6661685705184937, + "rewards/margins": 0.24933494627475739, + "rewards/rejected": -0.9155035614967346, + "step": 371 + }, + { + "epoch": 0.49, + "learning_rate": 4.812431664298883e-05, + "logits/chosen": -2.8046534061431885, + "logits/rejected": -2.7321465015411377, + "logps/chosen": -205.9781951904297, + "logps/rejected": -212.83636474609375, + "loss": 0.6998, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.49066948890686035, + "rewards/margins": 0.02486901544034481, + "rewards/rejected": -0.5155385136604309, + "step": 372 + }, + { + "epoch": 0.49, + "learning_rate": 4.811067621354094e-05, + "logits/chosen": -2.471921920776367, + "logits/rejected": -2.464104175567627, + "logps/chosen": -146.0660400390625, + "logps/rejected": -147.8424835205078, + "loss": 0.6922, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5911726951599121, + "rewards/margins": 0.050756677985191345, + "rewards/rejected": -0.6419293284416199, + "step": 373 + }, + { + "epoch": 0.49, + "learning_rate": 4.8096988312782174e-05, + "logits/chosen": -2.733851671218872, + "logits/rejected": -2.734400510787964, + "logps/chosen": -201.3523712158203, + "logps/rejected": -213.01361083984375, + "loss": 0.6494, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4939979016780853, + "rewards/margins": 0.25225183367729187, + "rewards/rejected": -0.7462497353553772, + "step": 374 + }, + { + "epoch": 0.49, + "learning_rate": 4.8083252968828665e-05, + "logits/chosen": -2.5853540897369385, + "logits/rejected": -2.5439412593841553, + "logps/chosen": -240.09124755859375, + "logps/rejected": -200.9342803955078, + "loss": 0.7954, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7007974982261658, + "rewards/margins": -0.10030128061771393, + "rewards/rejected": -0.6004961133003235, + "step": 375 + }, + { + "epoch": 0.49, + "learning_rate": 4.8069470209893974e-05, + "logits/chosen": -2.4914681911468506, + "logits/rejected": -2.5363824367523193, + "logps/chosen": -176.0529327392578, + "logps/rejected": -212.58383178710938, + "loss": 0.59, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4798307418823242, + "rewards/margins": 0.2897476851940155, + "rewards/rejected": -0.7695784568786621, + "step": 376 + }, + { + "epoch": 0.49, + "learning_rate": 4.8055640064289086e-05, + "logits/chosen": -2.627121686935425, + "logits/rejected": -2.6512184143066406, + "logps/chosen": -174.460205078125, + "logps/rejected": -205.20346069335938, + "loss": 0.6553, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5525593757629395, + "rewards/margins": 0.1847618669271469, + "rewards/rejected": -0.7373212575912476, + "step": 377 + }, + { + "epoch": 0.49, + "learning_rate": 4.80417625604223e-05, + "logits/chosen": -2.405069589614868, + "logits/rejected": -2.5229604244232178, + "logps/chosen": -179.3883819580078, + "logps/rejected": -195.5463409423828, + "loss": 0.5946, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7376325726509094, + "rewards/margins": 0.28595036268234253, + "rewards/rejected": -1.023582935333252, + "step": 378 + }, + { + "epoch": 0.5, + "learning_rate": 4.8027837726799205e-05, + "logits/chosen": -2.6785571575164795, + "logits/rejected": -2.725235939025879, + "logps/chosen": -177.39048767089844, + "logps/rejected": -180.63433837890625, + "loss": 0.7008, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4321979880332947, + "rewards/margins": 0.021436618641018867, + "rewards/rejected": -0.4536346197128296, + "step": 379 + }, + { + "epoch": 0.5, + "learning_rate": 4.801386559202259e-05, + "logits/chosen": -2.338463544845581, + "logits/rejected": -2.3258635997772217, + "logps/chosen": -136.97865295410156, + "logps/rejected": -175.3179168701172, + "loss": 0.6352, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3434058427810669, + "rewards/margins": 0.15200775861740112, + "rewards/rejected": -0.495413601398468, + "step": 380 + }, + { + "epoch": 0.5, + "learning_rate": 4.799984618479242e-05, + "logits/chosen": -2.6278395652770996, + "logits/rejected": -2.508350372314453, + "logps/chosen": -154.35214233398438, + "logps/rejected": -188.7537841796875, + "loss": 0.6372, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4511483311653137, + "rewards/margins": 0.18060770630836487, + "rewards/rejected": -0.6317560076713562, + "step": 381 + }, + { + "epoch": 0.5, + "learning_rate": 4.798577953390577e-05, + "logits/chosen": -2.561044931411743, + "logits/rejected": -2.6394059658050537, + "logps/chosen": -197.1724395751953, + "logps/rejected": -189.59738159179688, + "loss": 0.6657, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5811476111412048, + "rewards/margins": 0.09012848883867264, + "rewards/rejected": -0.6712760329246521, + "step": 382 + }, + { + "epoch": 0.5, + "learning_rate": 4.797166566825675e-05, + "logits/chosen": -2.4525938034057617, + "logits/rejected": -2.509936809539795, + "logps/chosen": -134.09764099121094, + "logps/rejected": -151.4327850341797, + "loss": 0.6024, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.24274428188800812, + "rewards/margins": 0.2151957005262375, + "rewards/rejected": -0.4579399824142456, + "step": 383 + }, + { + "epoch": 0.5, + "learning_rate": 4.795750461683644e-05, + "logits/chosen": -2.5733039379119873, + "logits/rejected": -2.624908924102783, + "logps/chosen": -152.46946716308594, + "logps/rejected": -176.73748779296875, + "loss": 0.7849, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5938777923583984, + "rewards/margins": -0.11171096563339233, + "rewards/rejected": -0.4821667969226837, + "step": 384 + }, + { + "epoch": 0.5, + "learning_rate": 4.794329640873285e-05, + "logits/chosen": -2.552907705307007, + "logits/rejected": -2.564852237701416, + "logps/chosen": -215.24732971191406, + "logps/rejected": -214.79995727539062, + "loss": 0.725, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.522678017616272, + "rewards/margins": 0.005169212818145752, + "rewards/rejected": -0.527847170829773, + "step": 385 + }, + { + "epoch": 0.51, + "learning_rate": 4.7929041073130867e-05, + "logits/chosen": -2.681658983230591, + "logits/rejected": -2.733393669128418, + "logps/chosen": -154.22958374023438, + "logps/rejected": -182.50245666503906, + "loss": 0.6893, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.47931969165802, + "rewards/margins": 0.07224328815937042, + "rewards/rejected": -0.5515629649162292, + "step": 386 + }, + { + "epoch": 0.51, + "learning_rate": 4.7914738639312165e-05, + "logits/chosen": -2.462939739227295, + "logits/rejected": -2.5003418922424316, + "logps/chosen": -152.27236938476562, + "logps/rejected": -162.45211791992188, + "loss": 0.6494, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.47114118933677673, + "rewards/margins": 0.16562382876873016, + "rewards/rejected": -0.6367650628089905, + "step": 387 + }, + { + "epoch": 0.51, + "learning_rate": 4.790038913665519e-05, + "logits/chosen": -2.4999754428863525, + "logits/rejected": -2.5065388679504395, + "logps/chosen": -213.994873046875, + "logps/rejected": -214.79046630859375, + "loss": 0.6382, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5417348146438599, + "rewards/margins": 0.19294968247413635, + "rewards/rejected": -0.7346844673156738, + "step": 388 + }, + { + "epoch": 0.51, + "learning_rate": 4.788599259463502e-05, + "logits/chosen": -2.5474274158477783, + "logits/rejected": -2.5991437435150146, + "logps/chosen": -180.50039672851562, + "logps/rejected": -225.59510803222656, + "loss": 0.6276, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7096745371818542, + "rewards/margins": 0.2028300166130066, + "rewards/rejected": -0.9125044941902161, + "step": 389 + }, + { + "epoch": 0.51, + "learning_rate": 4.787154904282341e-05, + "logits/chosen": -2.5795090198516846, + "logits/rejected": -2.577873468399048, + "logps/chosen": -163.51168823242188, + "logps/rejected": -140.55410766601562, + "loss": 0.794, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.5473388433456421, + "rewards/margins": -0.11595198512077332, + "rewards/rejected": -0.4313868582248688, + "step": 390 + }, + { + "epoch": 0.51, + "learning_rate": 4.7857058510888645e-05, + "logits/chosen": -2.516535758972168, + "logits/rejected": -2.5428926944732666, + "logps/chosen": -208.22772216796875, + "logps/rejected": -231.32977294921875, + "loss": 0.6828, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5902161598205566, + "rewards/margins": 0.16028547286987305, + "rewards/rejected": -0.7505015730857849, + "step": 391 + }, + { + "epoch": 0.51, + "learning_rate": 4.7842521028595526e-05, + "logits/chosen": -2.379845380783081, + "logits/rejected": -2.3832809925079346, + "logps/chosen": -200.7274169921875, + "logps/rejected": -164.21742248535156, + "loss": 0.6737, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5952718257904053, + "rewards/margins": 0.09693758189678192, + "rewards/rejected": -0.6922094225883484, + "step": 392 + }, + { + "epoch": 0.51, + "learning_rate": 4.7827936625805284e-05, + "logits/chosen": -2.6402270793914795, + "logits/rejected": -2.7206149101257324, + "logps/chosen": -166.23167419433594, + "logps/rejected": -214.43209838867188, + "loss": 0.6564, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5481399893760681, + "rewards/margins": 0.19679725170135498, + "rewards/rejected": -0.7449373006820679, + "step": 393 + }, + { + "epoch": 0.52, + "learning_rate": 4.7813305332475535e-05, + "logits/chosen": -2.5474841594696045, + "logits/rejected": -2.398405075073242, + "logps/chosen": -184.8474884033203, + "logps/rejected": -174.7438507080078, + "loss": 0.814, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6769263744354248, + "rewards/margins": -0.10798013210296631, + "rewards/rejected": -0.5689462423324585, + "step": 394 + }, + { + "epoch": 0.52, + "learning_rate": 4.77986271786602e-05, + "logits/chosen": -2.4804301261901855, + "logits/rejected": -2.41290545463562, + "logps/chosen": -196.86683654785156, + "logps/rejected": -196.78880310058594, + "loss": 0.7094, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5920239686965942, + "rewards/margins": 0.030932970345020294, + "rewards/rejected": -0.6229569315910339, + "step": 395 + }, + { + "epoch": 0.52, + "learning_rate": 4.778390219450949e-05, + "logits/chosen": -2.37176775932312, + "logits/rejected": -2.5298891067504883, + "logps/chosen": -161.76962280273438, + "logps/rejected": -179.2218475341797, + "loss": 0.6622, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6016180515289307, + "rewards/margins": 0.17984583973884583, + "rewards/rejected": -0.7814638614654541, + "step": 396 + }, + { + "epoch": 0.52, + "learning_rate": 4.776913041026976e-05, + "logits/chosen": -2.4908242225646973, + "logits/rejected": -2.6029117107391357, + "logps/chosen": -147.92144775390625, + "logps/rejected": -160.30494689941406, + "loss": 0.7192, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.4882547855377197, + "rewards/margins": 0.03175659850239754, + "rewards/rejected": -0.5200113654136658, + "step": 397 + }, + { + "epoch": 0.52, + "learning_rate": 4.775431185628353e-05, + "logits/chosen": -2.4055118560791016, + "logits/rejected": -2.442111015319824, + "logps/chosen": -168.19540405273438, + "logps/rejected": -166.55137634277344, + "loss": 0.7503, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6233083605766296, + "rewards/margins": -0.03455538675189018, + "rewards/rejected": -0.5887529850006104, + "step": 398 + }, + { + "epoch": 0.52, + "learning_rate": 4.7739446562989384e-05, + "logits/chosen": -2.4722681045532227, + "logits/rejected": -2.572155237197876, + "logps/chosen": -207.08755493164062, + "logps/rejected": -210.96417236328125, + "loss": 0.6213, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5305930376052856, + "rewards/margins": 0.21656833589076996, + "rewards/rejected": -0.7471613883972168, + "step": 399 + }, + { + "epoch": 0.52, + "learning_rate": 4.772453456092191e-05, + "logits/chosen": -2.158127546310425, + "logits/rejected": -2.197726249694824, + "logps/chosen": -188.421142578125, + "logps/rejected": -190.73631286621094, + "loss": 0.6757, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4641679525375366, + "rewards/margins": 0.13141167163848877, + "rewards/rejected": -0.5955795645713806, + "step": 400 + }, + { + "epoch": 0.52, + "learning_rate": 4.7709575880711634e-05, + "logits/chosen": -2.5840139389038086, + "logits/rejected": -2.645707130432129, + "logps/chosen": -188.22262573242188, + "logps/rejected": -196.6393585205078, + "loss": 0.6874, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7071235775947571, + "rewards/margins": 0.0883268266916275, + "rewards/rejected": -0.7954504489898682, + "step": 401 + }, + { + "epoch": 0.53, + "learning_rate": 4.769457055308497e-05, + "logits/chosen": -2.4420454502105713, + "logits/rejected": -2.4484505653381348, + "logps/chosen": -169.99232482910156, + "logps/rejected": -169.7140350341797, + "loss": 0.7436, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5594102144241333, + "rewards/margins": -0.01718483492732048, + "rewards/rejected": -0.5422253608703613, + "step": 402 + }, + { + "epoch": 0.53, + "learning_rate": 4.767951860886415e-05, + "logits/chosen": -2.530424118041992, + "logits/rejected": -2.601032257080078, + "logps/chosen": -208.61627197265625, + "logps/rejected": -235.3765411376953, + "loss": 0.671, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7119523882865906, + "rewards/margins": 0.13131096959114075, + "rewards/rejected": -0.8432632684707642, + "step": 403 + }, + { + "epoch": 0.53, + "learning_rate": 4.766442007896715e-05, + "logits/chosen": -2.662515640258789, + "logits/rejected": -2.574281930923462, + "logps/chosen": -206.23391723632812, + "logps/rejected": -185.04434204101562, + "loss": 0.6698, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6086325645446777, + "rewards/margins": 0.1881732940673828, + "rewards/rejected": -0.7968058586120605, + "step": 404 + }, + { + "epoch": 0.53, + "learning_rate": 4.764927499440767e-05, + "logits/chosen": -2.4306862354278564, + "logits/rejected": -2.48148512840271, + "logps/chosen": -149.6880340576172, + "logps/rejected": -164.9370574951172, + "loss": 0.6242, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5055115222930908, + "rewards/margins": 0.18054383993148804, + "rewards/rejected": -0.6860553026199341, + "step": 405 + }, + { + "epoch": 0.53, + "learning_rate": 4.763408338629498e-05, + "logits/chosen": -2.6945927143096924, + "logits/rejected": -2.712038278579712, + "logps/chosen": -195.50482177734375, + "logps/rejected": -207.51431274414062, + "loss": 0.7339, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7351135015487671, + "rewards/margins": 0.00292108952999115, + "rewards/rejected": -0.7380346059799194, + "step": 406 + }, + { + "epoch": 0.53, + "learning_rate": 4.761884528583396e-05, + "logits/chosen": -2.4739484786987305, + "logits/rejected": -2.5098178386688232, + "logps/chosen": -199.3061065673828, + "logps/rejected": -180.8711395263672, + "loss": 0.7055, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6267393827438354, + "rewards/margins": 0.034921929240226746, + "rewards/rejected": -0.6616613268852234, + "step": 407 + }, + { + "epoch": 0.53, + "learning_rate": 4.760356072432498e-05, + "logits/chosen": -2.5606906414031982, + "logits/rejected": -2.6224427223205566, + "logps/chosen": -147.893798828125, + "logps/rejected": -156.9450225830078, + "loss": 0.7825, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.8380388021469116, + "rewards/margins": -0.05697247013449669, + "rewards/rejected": -0.7810662388801575, + "step": 408 + }, + { + "epoch": 0.54, + "learning_rate": 4.7588229733163834e-05, + "logits/chosen": -2.479783058166504, + "logits/rejected": -2.5184426307678223, + "logps/chosen": -214.60830688476562, + "logps/rejected": -204.91534423828125, + "loss": 0.6624, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.745360255241394, + "rewards/margins": 0.17358280718326569, + "rewards/rejected": -0.9189431071281433, + "step": 409 + }, + { + "epoch": 0.54, + "learning_rate": 4.757285234384169e-05, + "logits/chosen": -2.5682218074798584, + "logits/rejected": -2.5506396293640137, + "logps/chosen": -158.36090087890625, + "logps/rejected": -163.76829528808594, + "loss": 0.7196, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7472302913665771, + "rewards/margins": 0.03290612995624542, + "rewards/rejected": -0.7801364064216614, + "step": 410 + }, + { + "epoch": 0.54, + "learning_rate": 4.755742858794503e-05, + "logits/chosen": -2.476301908493042, + "logits/rejected": -2.4283623695373535, + "logps/chosen": -180.48712158203125, + "logps/rejected": -190.4640655517578, + "loss": 0.7285, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5941387414932251, + "rewards/margins": -0.008222660049796104, + "rewards/rejected": -0.5859161019325256, + "step": 411 + }, + { + "epoch": 0.54, + "learning_rate": 4.754195849715557e-05, + "logits/chosen": -2.4996767044067383, + "logits/rejected": -2.5315232276916504, + "logps/chosen": -211.7996826171875, + "logps/rejected": -221.70632934570312, + "loss": 0.6269, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6848435997962952, + "rewards/margins": 0.2251822054386139, + "rewards/rejected": -0.9100258350372314, + "step": 412 + }, + { + "epoch": 0.54, + "learning_rate": 4.75264421032502e-05, + "logits/chosen": -2.3915770053863525, + "logits/rejected": -2.443027973175049, + "logps/chosen": -189.36318969726562, + "logps/rejected": -234.4507293701172, + "loss": 0.59, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6983102560043335, + "rewards/margins": 0.33968544006347656, + "rewards/rejected": -1.03799569606781, + "step": 413 + }, + { + "epoch": 0.54, + "learning_rate": 4.751087943810093e-05, + "logits/chosen": -2.4122979640960693, + "logits/rejected": -2.4623374938964844, + "logps/chosen": -150.90496826171875, + "logps/rejected": -155.33792114257812, + "loss": 0.5973, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6139358282089233, + "rewards/margins": 0.2593153715133667, + "rewards/rejected": -0.87325119972229, + "step": 414 + }, + { + "epoch": 0.54, + "learning_rate": 4.749527053367481e-05, + "logits/chosen": -2.4751663208007812, + "logits/rejected": -2.5102648735046387, + "logps/chosen": -199.7110595703125, + "logps/rejected": -211.5991973876953, + "loss": 0.6728, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5334303379058838, + "rewards/margins": 0.10008269548416138, + "rewards/rejected": -0.6335129737854004, + "step": 415 + }, + { + "epoch": 0.54, + "learning_rate": 4.747961542203386e-05, + "logits/chosen": -2.458789110183716, + "logits/rejected": -2.4540915489196777, + "logps/chosen": -202.07606506347656, + "logps/rejected": -233.15286254882812, + "loss": 0.7338, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7183946371078491, + "rewards/margins": -0.020335379987955093, + "rewards/rejected": -0.6980592608451843, + "step": 416 + }, + { + "epoch": 0.55, + "learning_rate": 4.746391413533503e-05, + "logits/chosen": -2.577547788619995, + "logits/rejected": -2.6487934589385986, + "logps/chosen": -200.61962890625, + "logps/rejected": -197.4369354248047, + "loss": 0.6989, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6610112190246582, + "rewards/margins": 0.05467906594276428, + "rewards/rejected": -0.7156902551651001, + "step": 417 + }, + { + "epoch": 0.55, + "learning_rate": 4.74481667058301e-05, + "logits/chosen": -2.460482120513916, + "logits/rejected": -2.428621768951416, + "logps/chosen": -173.61727905273438, + "logps/rejected": -174.00869750976562, + "loss": 0.6909, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6227931380271912, + "rewards/margins": 0.061159878969192505, + "rewards/rejected": -0.6839529275894165, + "step": 418 + }, + { + "epoch": 0.55, + "learning_rate": 4.743237316586564e-05, + "logits/chosen": -2.7055728435516357, + "logits/rejected": -2.694801092147827, + "logps/chosen": -233.16864013671875, + "logps/rejected": -219.25518798828125, + "loss": 0.5653, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7030515670776367, + "rewards/margins": 0.3454846143722534, + "rewards/rejected": -1.0485361814498901, + "step": 419 + }, + { + "epoch": 0.55, + "learning_rate": 4.741653354788295e-05, + "logits/chosen": -2.5690486431121826, + "logits/rejected": -2.5955686569213867, + "logps/chosen": -219.93161010742188, + "logps/rejected": -219.0428466796875, + "loss": 0.6281, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6727169156074524, + "rewards/margins": 0.190028116106987, + "rewards/rejected": -0.8627450466156006, + "step": 420 + }, + { + "epoch": 0.55, + "learning_rate": 4.7400647884417956e-05, + "logits/chosen": -2.4904708862304688, + "logits/rejected": -2.62727689743042, + "logps/chosen": -178.09906005859375, + "logps/rejected": -193.81495666503906, + "loss": 0.9252, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8763137459754944, + "rewards/margins": -0.29989194869995117, + "rewards/rejected": -0.5764217376708984, + "step": 421 + }, + { + "epoch": 0.55, + "learning_rate": 4.7384716208101166e-05, + "logits/chosen": -2.443995952606201, + "logits/rejected": -2.4667203426361084, + "logps/chosen": -187.14244079589844, + "logps/rejected": -178.54718017578125, + "loss": 0.6852, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5054149031639099, + "rewards/margins": 0.07299378514289856, + "rewards/rejected": -0.5784087777137756, + "step": 422 + }, + { + "epoch": 0.55, + "learning_rate": 4.736873855165762e-05, + "logits/chosen": -2.535909652709961, + "logits/rejected": -2.5671615600585938, + "logps/chosen": -188.70001220703125, + "logps/rejected": -204.35406494140625, + "loss": 0.5707, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.46517521142959595, + "rewards/margins": 0.3614599108695984, + "rewards/rejected": -0.8266351222991943, + "step": 423 + }, + { + "epoch": 0.55, + "learning_rate": 4.735271494790678e-05, + "logits/chosen": -2.528953790664673, + "logits/rejected": -2.6590449810028076, + "logps/chosen": -187.15150451660156, + "logps/rejected": -224.23590087890625, + "loss": 0.5801, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5339723229408264, + "rewards/margins": 0.28138864040374756, + "rewards/rejected": -0.8153610229492188, + "step": 424 + }, + { + "epoch": 0.56, + "learning_rate": 4.733664542976253e-05, + "logits/chosen": -2.4930217266082764, + "logits/rejected": -2.5632638931274414, + "logps/chosen": -190.24932861328125, + "logps/rejected": -229.89834594726562, + "loss": 0.6699, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.7542927265167236, + "rewards/margins": 0.1013142317533493, + "rewards/rejected": -0.8556069731712341, + "step": 425 + }, + { + "epoch": 0.56, + "learning_rate": 4.732053003023301e-05, + "logits/chosen": -2.4712095260620117, + "logits/rejected": -2.5324177742004395, + "logps/chosen": -174.3723907470703, + "logps/rejected": -206.7808837890625, + "loss": 0.6939, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6538185477256775, + "rewards/margins": 0.04219439998269081, + "rewards/rejected": -0.6960129141807556, + "step": 426 + }, + { + "epoch": 0.56, + "learning_rate": 4.730436878242064e-05, + "logits/chosen": -2.5648436546325684, + "logits/rejected": -2.6675539016723633, + "logps/chosen": -201.86000061035156, + "logps/rejected": -222.20242309570312, + "loss": 0.5946, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.752447783946991, + "rewards/margins": 0.23879489302635193, + "rewards/rejected": -0.9912427067756653, + "step": 427 + }, + { + "epoch": 0.56, + "learning_rate": 4.7288161719522016e-05, + "logits/chosen": -2.55771541595459, + "logits/rejected": -2.597944974899292, + "logps/chosen": -168.26791381835938, + "logps/rejected": -158.33251953125, + "loss": 0.647, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.43088653683662415, + "rewards/margins": 0.134020134806633, + "rewards/rejected": -0.5649065971374512, + "step": 428 + }, + { + "epoch": 0.56, + "learning_rate": 4.727190887482783e-05, + "logits/chosen": -2.801307201385498, + "logits/rejected": -2.7668023109436035, + "logps/chosen": -221.6161346435547, + "logps/rejected": -212.2379913330078, + "loss": 0.6348, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5888996124267578, + "rewards/margins": 0.16605983674526215, + "rewards/rejected": -0.7549594044685364, + "step": 429 + }, + { + "epoch": 0.56, + "learning_rate": 4.725561028172282e-05, + "logits/chosen": -2.459953784942627, + "logits/rejected": -2.4980850219726562, + "logps/chosen": -201.91897583007812, + "logps/rejected": -208.00759887695312, + "loss": 0.6881, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6136542558670044, + "rewards/margins": 0.06774169206619263, + "rewards/rejected": -0.6813960075378418, + "step": 430 + }, + { + "epoch": 0.56, + "learning_rate": 4.7239265973685696e-05, + "logits/chosen": -2.551776647567749, + "logits/rejected": -2.5754733085632324, + "logps/chosen": -185.41299438476562, + "logps/rejected": -187.89947509765625, + "loss": 0.6601, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5381264090538025, + "rewards/margins": 0.11568471044301987, + "rewards/rejected": -0.6538110971450806, + "step": 431 + }, + { + "epoch": 0.57, + "learning_rate": 4.722287598428907e-05, + "logits/chosen": -2.588721990585327, + "logits/rejected": -2.629152774810791, + "logps/chosen": -197.33180236816406, + "logps/rejected": -211.42718505859375, + "loss": 0.5501, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6033443808555603, + "rewards/margins": 0.405744731426239, + "rewards/rejected": -1.0090891122817993, + "step": 432 + }, + { + "epoch": 0.57, + "learning_rate": 4.720644034719938e-05, + "logits/chosen": -2.5057499408721924, + "logits/rejected": -2.6138525009155273, + "logps/chosen": -175.42535400390625, + "logps/rejected": -199.4925079345703, + "loss": 0.723, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.8974592685699463, + "rewards/margins": 0.013055291026830673, + "rewards/rejected": -0.9105146527290344, + "step": 433 + }, + { + "epoch": 0.57, + "learning_rate": 4.7189959096176825e-05, + "logits/chosen": -2.4195330142974854, + "logits/rejected": -2.467151165008545, + "logps/chosen": -168.0084686279297, + "logps/rejected": -181.73574829101562, + "loss": 0.8057, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7599891424179077, + "rewards/margins": -0.08929312229156494, + "rewards/rejected": -0.6706960201263428, + "step": 434 + }, + { + "epoch": 0.57, + "learning_rate": 4.7173432265075334e-05, + "logits/chosen": -2.511300802230835, + "logits/rejected": -2.510061502456665, + "logps/chosen": -195.68515014648438, + "logps/rejected": -173.744140625, + "loss": 0.6779, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8897146582603455, + "rewards/margins": 0.1359187215566635, + "rewards/rejected": -1.0256333351135254, + "step": 435 + }, + { + "epoch": 0.57, + "learning_rate": 4.7156859887842416e-05, + "logits/chosen": -2.5270705223083496, + "logits/rejected": -2.5860514640808105, + "logps/chosen": -211.92176818847656, + "logps/rejected": -210.06216430664062, + "loss": 0.7143, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7650929689407349, + "rewards/margins": 0.06967158615589142, + "rewards/rejected": -0.8347646594047546, + "step": 436 + }, + { + "epoch": 0.57, + "learning_rate": 4.714024199851915e-05, + "logits/chosen": -2.6095833778381348, + "logits/rejected": -2.683137893676758, + "logps/chosen": -166.6543426513672, + "logps/rejected": -197.75425720214844, + "loss": 0.7672, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9517545104026794, + "rewards/margins": -0.0435512438416481, + "rewards/rejected": -0.9082032442092896, + "step": 437 + }, + { + "epoch": 0.57, + "learning_rate": 4.712357863124013e-05, + "logits/chosen": -2.583829641342163, + "logits/rejected": -2.4270806312561035, + "logps/chosen": -184.97288513183594, + "logps/rejected": -151.17689514160156, + "loss": 0.8427, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.9021130800247192, + "rewards/margins": -0.22366446256637573, + "rewards/rejected": -0.6784486770629883, + "step": 438 + }, + { + "epoch": 0.57, + "learning_rate": 4.710686982023332e-05, + "logits/chosen": -2.6779322624206543, + "logits/rejected": -2.6819846630096436, + "logps/chosen": -202.7177734375, + "logps/rejected": -180.97412109375, + "loss": 0.6769, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7709564566612244, + "rewards/margins": 0.1275014728307724, + "rewards/rejected": -0.8984578251838684, + "step": 439 + }, + { + "epoch": 0.58, + "learning_rate": 4.709011559982006e-05, + "logits/chosen": -2.5081942081451416, + "logits/rejected": -2.457127094268799, + "logps/chosen": -199.16038513183594, + "logps/rejected": -227.66439819335938, + "loss": 0.715, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8098132014274597, + "rewards/margins": 0.005300614982843399, + "rewards/rejected": -0.8151137232780457, + "step": 440 + }, + { + "epoch": 0.58, + "learning_rate": 4.707331600441495e-05, + "logits/chosen": -2.5350513458251953, + "logits/rejected": -2.498892307281494, + "logps/chosen": -169.93370056152344, + "logps/rejected": -166.59365844726562, + "loss": 0.6809, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6930581331253052, + "rewards/margins": 0.06999292969703674, + "rewards/rejected": -0.7630510330200195, + "step": 441 + }, + { + "epoch": 0.58, + "learning_rate": 4.705647106852581e-05, + "logits/chosen": -2.5681025981903076, + "logits/rejected": -2.5515594482421875, + "logps/chosen": -232.0432586669922, + "logps/rejected": -224.9477081298828, + "loss": 0.7262, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9011371731758118, + "rewards/margins": 0.08735474199056625, + "rewards/rejected": -0.988491952419281, + "step": 442 + }, + { + "epoch": 0.58, + "learning_rate": 4.7039580826753564e-05, + "logits/chosen": -2.538968563079834, + "logits/rejected": -2.5133233070373535, + "logps/chosen": -158.14520263671875, + "logps/rejected": -201.77728271484375, + "loss": 0.6074, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6365475654602051, + "rewards/margins": 0.2535760998725891, + "rewards/rejected": -0.8901236057281494, + "step": 443 + }, + { + "epoch": 0.58, + "learning_rate": 4.7022645313792235e-05, + "logits/chosen": -2.5245423316955566, + "logits/rejected": -2.588785171508789, + "logps/chosen": -211.5054473876953, + "logps/rejected": -183.34361267089844, + "loss": 0.6541, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7383630275726318, + "rewards/margins": 0.1513664424419403, + "rewards/rejected": -0.8897294998168945, + "step": 444 + }, + { + "epoch": 0.58, + "learning_rate": 4.700566456442882e-05, + "logits/chosen": -2.426429033279419, + "logits/rejected": -2.4127559661865234, + "logps/chosen": -170.16091918945312, + "logps/rejected": -181.26731872558594, + "loss": 0.685, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.555845320224762, + "rewards/margins": 0.09964090585708618, + "rewards/rejected": -0.6554862260818481, + "step": 445 + }, + { + "epoch": 0.58, + "learning_rate": 4.6988638613543216e-05, + "logits/chosen": -2.5047028064727783, + "logits/rejected": -2.6594462394714355, + "logps/chosen": -192.5770263671875, + "logps/rejected": -196.17718505859375, + "loss": 0.6536, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6389065384864807, + "rewards/margins": 0.1103520542383194, + "rewards/rejected": -0.7492585778236389, + "step": 446 + }, + { + "epoch": 0.59, + "learning_rate": 4.6971567496108206e-05, + "logits/chosen": -2.4204087257385254, + "logits/rejected": -2.4866108894348145, + "logps/chosen": -186.6705780029297, + "logps/rejected": -183.05654907226562, + "loss": 0.6868, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7241983413696289, + "rewards/margins": 0.062242452055215836, + "rewards/rejected": -0.7864408493041992, + "step": 447 + }, + { + "epoch": 0.59, + "learning_rate": 4.695445124718931e-05, + "logits/chosen": -2.599712610244751, + "logits/rejected": -2.678144693374634, + "logps/chosen": -171.8934326171875, + "logps/rejected": -187.55401611328125, + "loss": 0.7119, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8643355369567871, + "rewards/margins": 0.07741285860538483, + "rewards/rejected": -0.9417483806610107, + "step": 448 + }, + { + "epoch": 0.59, + "learning_rate": 4.693728990194479e-05, + "logits/chosen": -2.5691065788269043, + "logits/rejected": -2.515324592590332, + "logps/chosen": -175.92041015625, + "logps/rejected": -169.9009552001953, + "loss": 0.7915, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7048065662384033, + "rewards/margins": -0.13590610027313232, + "rewards/rejected": -0.5689005851745605, + "step": 449 + }, + { + "epoch": 0.59, + "learning_rate": 4.692008349562551e-05, + "logits/chosen": -2.5656256675720215, + "logits/rejected": -2.5296895503997803, + "logps/chosen": -181.8729248046875, + "logps/rejected": -166.05845642089844, + "loss": 0.6528, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6643526554107666, + "rewards/margins": 0.1753290742635727, + "rewards/rejected": -0.8396817445755005, + "step": 450 + }, + { + "epoch": 0.59, + "learning_rate": 4.690283206357491e-05, + "logits/chosen": -2.6595234870910645, + "logits/rejected": -2.6016488075256348, + "logps/chosen": -227.77488708496094, + "logps/rejected": -197.7751007080078, + "loss": 0.7501, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9334564208984375, + "rewards/margins": 0.008943833410739899, + "rewards/rejected": -0.942400336265564, + "step": 451 + }, + { + "epoch": 0.59, + "learning_rate": 4.6885535641228904e-05, + "logits/chosen": -2.6872832775115967, + "logits/rejected": -2.6639106273651123, + "logps/chosen": -198.11798095703125, + "logps/rejected": -215.65846252441406, + "loss": 0.6973, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.661568284034729, + "rewards/margins": 0.10554268211126328, + "rewards/rejected": -0.7671110033988953, + "step": 452 + }, + { + "epoch": 0.59, + "learning_rate": 4.6868194264115833e-05, + "logits/chosen": -2.6474506855010986, + "logits/rejected": -2.6153347492218018, + "logps/chosen": -179.79026794433594, + "logps/rejected": -181.140869140625, + "loss": 0.668, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.7167541980743408, + "rewards/margins": 0.10385677218437195, + "rewards/rejected": -0.8206108808517456, + "step": 453 + }, + { + "epoch": 0.59, + "learning_rate": 4.685080796785637e-05, + "logits/chosen": -2.6611721515655518, + "logits/rejected": -2.5946366786956787, + "logps/chosen": -175.3307647705078, + "logps/rejected": -182.19512939453125, + "loss": 0.6937, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5467984080314636, + "rewards/margins": 0.12983733415603638, + "rewards/rejected": -0.6766356825828552, + "step": 454 + }, + { + "epoch": 0.6, + "learning_rate": 4.683337678816345e-05, + "logits/chosen": -2.564751148223877, + "logits/rejected": -2.610701322555542, + "logps/chosen": -160.96405029296875, + "logps/rejected": -211.45936584472656, + "loss": 0.5668, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6448365449905396, + "rewards/margins": 0.34849783778190613, + "rewards/rejected": -0.9933344125747681, + "step": 455 + }, + { + "epoch": 0.6, + "learning_rate": 4.6815900760842236e-05, + "logits/chosen": -2.5778658390045166, + "logits/rejected": -2.650463104248047, + "logps/chosen": -178.8852081298828, + "logps/rejected": -184.25819396972656, + "loss": 0.6166, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6312435865402222, + "rewards/margins": 0.21195663511753082, + "rewards/rejected": -0.8432002067565918, + "step": 456 + }, + { + "epoch": 0.6, + "learning_rate": 4.679837992178996e-05, + "logits/chosen": -2.6255109310150146, + "logits/rejected": -2.568089246749878, + "logps/chosen": -186.581787109375, + "logps/rejected": -179.66636657714844, + "loss": 0.7966, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.7909551858901978, + "rewards/margins": -0.12891662120819092, + "rewards/rejected": -0.6620385646820068, + "step": 457 + }, + { + "epoch": 0.6, + "learning_rate": 4.678081430699594e-05, + "logits/chosen": -2.548609972000122, + "logits/rejected": -2.5789599418640137, + "logps/chosen": -149.14419555664062, + "logps/rejected": -175.63099670410156, + "loss": 0.672, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6500214338302612, + "rewards/margins": 0.1032710000872612, + "rewards/rejected": -0.7532925009727478, + "step": 458 + }, + { + "epoch": 0.6, + "learning_rate": 4.676320395254146e-05, + "logits/chosen": -2.6516225337982178, + "logits/rejected": -2.5997962951660156, + "logps/chosen": -179.6166229248047, + "logps/rejected": -181.91600036621094, + "loss": 0.6797, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5761557221412659, + "rewards/margins": 0.09436798840761185, + "rewards/rejected": -0.6705237627029419, + "step": 459 + }, + { + "epoch": 0.6, + "learning_rate": 4.674554889459968e-05, + "logits/chosen": -2.547447681427002, + "logits/rejected": -2.531512498855591, + "logps/chosen": -158.489013671875, + "logps/rejected": -158.9821319580078, + "loss": 0.6921, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.551589846611023, + "rewards/margins": 0.06521686166524887, + "rewards/rejected": -0.61680668592453, + "step": 460 + }, + { + "epoch": 0.6, + "learning_rate": 4.672784916943562e-05, + "logits/chosen": -2.6113603115081787, + "logits/rejected": -2.7179596424102783, + "logps/chosen": -161.98703002929688, + "logps/rejected": -183.0166778564453, + "loss": 0.6035, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.4798649549484253, + "rewards/margins": 0.25054579973220825, + "rewards/rejected": -0.7304107546806335, + "step": 461 + }, + { + "epoch": 0.6, + "learning_rate": 4.6710104813406034e-05, + "logits/chosen": -2.542372703552246, + "logits/rejected": -2.522775650024414, + "logps/chosen": -179.12545776367188, + "logps/rejected": -175.42910766601562, + "loss": 0.6264, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5295111536979675, + "rewards/margins": 0.1804201453924179, + "rewards/rejected": -0.7099313139915466, + "step": 462 + }, + { + "epoch": 0.61, + "learning_rate": 4.669231586295934e-05, + "logits/chosen": -2.604105234146118, + "logits/rejected": -2.6281673908233643, + "logps/chosen": -201.89901733398438, + "logps/rejected": -226.806396484375, + "loss": 0.7092, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4338040053844452, + "rewards/margins": 0.028335200622677803, + "rewards/rejected": -0.46213918924331665, + "step": 463 + }, + { + "epoch": 0.61, + "learning_rate": 4.667448235463557e-05, + "logits/chosen": -2.639315605163574, + "logits/rejected": -2.572453022003174, + "logps/chosen": -211.58755493164062, + "logps/rejected": -191.2517547607422, + "loss": 0.5992, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4935193359851837, + "rewards/margins": 0.28121161460876465, + "rewards/rejected": -0.7747309803962708, + "step": 464 + }, + { + "epoch": 0.61, + "learning_rate": 4.665660432506629e-05, + "logits/chosen": -2.577928304672241, + "logits/rejected": -2.5669662952423096, + "logps/chosen": -200.63458251953125, + "logps/rejected": -192.37644958496094, + "loss": 0.684, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6348070502281189, + "rewards/margins": 0.1238020658493042, + "rewards/rejected": -0.7586091756820679, + "step": 465 + }, + { + "epoch": 0.61, + "learning_rate": 4.6638681810974496e-05, + "logits/chosen": -2.5102195739746094, + "logits/rejected": -2.4570584297180176, + "logps/chosen": -184.1721954345703, + "logps/rejected": -186.46142578125, + "loss": 0.6753, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5391435027122498, + "rewards/margins": 0.12175430357456207, + "rewards/rejected": -0.6608977913856506, + "step": 466 + }, + { + "epoch": 0.61, + "learning_rate": 4.6620714849174576e-05, + "logits/chosen": -2.5037519931793213, + "logits/rejected": -2.514277219772339, + "logps/chosen": -221.79428100585938, + "logps/rejected": -226.41201782226562, + "loss": 0.6953, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4555547833442688, + "rewards/margins": 0.06287840008735657, + "rewards/rejected": -0.5184332132339478, + "step": 467 + }, + { + "epoch": 0.61, + "learning_rate": 4.660270347657219e-05, + "logits/chosen": -2.4214844703674316, + "logits/rejected": -2.451406717300415, + "logps/chosen": -164.8655242919922, + "logps/rejected": -202.42665100097656, + "loss": 0.6557, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4477751851081848, + "rewards/margins": 0.17700761556625366, + "rewards/rejected": -0.6247828006744385, + "step": 468 + }, + { + "epoch": 0.61, + "learning_rate": 4.658464773016428e-05, + "logits/chosen": -2.6548826694488525, + "logits/rejected": -2.7413270473480225, + "logps/chosen": -203.27706909179688, + "logps/rejected": -195.96438598632812, + "loss": 0.7196, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6162900924682617, + "rewards/margins": 0.04476276412606239, + "rewards/rejected": -0.6610528230667114, + "step": 469 + }, + { + "epoch": 0.62, + "learning_rate": 4.6566547647038864e-05, + "logits/chosen": -2.6256935596466064, + "logits/rejected": -2.544764757156372, + "logps/chosen": -197.11715698242188, + "logps/rejected": -174.37977600097656, + "loss": 0.5997, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7993966937065125, + "rewards/margins": 0.2561304569244385, + "rewards/rejected": -1.0555272102355957, + "step": 470 + }, + { + "epoch": 0.62, + "learning_rate": 4.6548403264375074e-05, + "logits/chosen": -2.4777655601501465, + "logits/rejected": -2.466064691543579, + "logps/chosen": -206.86097717285156, + "logps/rejected": -222.15383911132812, + "loss": 0.8362, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.8698564767837524, + "rewards/margins": -0.1482725888490677, + "rewards/rejected": -0.721583902835846, + "step": 471 + }, + { + "epoch": 0.62, + "learning_rate": 4.6530214619443037e-05, + "logits/chosen": -2.5265445709228516, + "logits/rejected": -2.5728039741516113, + "logps/chosen": -205.71484375, + "logps/rejected": -193.68008422851562, + "loss": 0.7739, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6624203324317932, + "rewards/margins": -0.09644677489995956, + "rewards/rejected": -0.5659735202789307, + "step": 472 + }, + { + "epoch": 0.62, + "learning_rate": 4.6511981749603775e-05, + "logits/chosen": -2.4683680534362793, + "logits/rejected": -2.553187847137451, + "logps/chosen": -188.4405975341797, + "logps/rejected": -206.36973571777344, + "loss": 0.6291, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.44957125186920166, + "rewards/margins": 0.21964800357818604, + "rewards/rejected": -0.6692192554473877, + "step": 473 + }, + { + "epoch": 0.62, + "learning_rate": 4.6493704692309175e-05, + "logits/chosen": -2.4824113845825195, + "logits/rejected": -2.4895737171173096, + "logps/chosen": -173.37237548828125, + "logps/rejected": -182.10430908203125, + "loss": 0.6773, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5202856063842773, + "rewards/margins": 0.12215665727853775, + "rewards/rejected": -0.6424421668052673, + "step": 474 + }, + { + "epoch": 0.62, + "learning_rate": 4.647538348510189e-05, + "logits/chosen": -2.6831271648406982, + "logits/rejected": -2.7396373748779297, + "logps/chosen": -175.16824340820312, + "logps/rejected": -194.82957458496094, + "loss": 0.653, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6846888065338135, + "rewards/margins": 0.17653468251228333, + "rewards/rejected": -0.861223578453064, + "step": 475 + }, + { + "epoch": 0.62, + "learning_rate": 4.645701816561523e-05, + "logits/chosen": -2.55379581451416, + "logits/rejected": -2.6678338050842285, + "logps/chosen": -214.75527954101562, + "logps/rejected": -180.83958435058594, + "loss": 0.6331, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6419520974159241, + "rewards/margins": 0.22850137948989868, + "rewards/rejected": -0.8704534769058228, + "step": 476 + }, + { + "epoch": 0.62, + "learning_rate": 4.643860877157314e-05, + "logits/chosen": -2.560502529144287, + "logits/rejected": -2.6059296131134033, + "logps/chosen": -225.27346801757812, + "logps/rejected": -238.6981658935547, + "loss": 0.7312, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6071743369102478, + "rewards/margins": 0.0013379361480474472, + "rewards/rejected": -0.6085121631622314, + "step": 477 + }, + { + "epoch": 0.63, + "learning_rate": 4.642015534079012e-05, + "logits/chosen": -2.6502907276153564, + "logits/rejected": -2.4651901721954346, + "logps/chosen": -194.60215759277344, + "logps/rejected": -208.71707153320312, + "loss": 0.6499, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.41506141424179077, + "rewards/margins": 0.14238694310188293, + "rewards/rejected": -0.5574483871459961, + "step": 478 + }, + { + "epoch": 0.63, + "learning_rate": 4.640165791117106e-05, + "logits/chosen": -2.597409248352051, + "logits/rejected": -2.671415090560913, + "logps/chosen": -169.68240356445312, + "logps/rejected": -189.24998474121094, + "loss": 0.6624, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6197757124900818, + "rewards/margins": 0.10906066745519638, + "rewards/rejected": -0.7288363575935364, + "step": 479 + }, + { + "epoch": 0.63, + "learning_rate": 4.63831165207113e-05, + "logits/chosen": -2.2800724506378174, + "logits/rejected": -2.3521509170532227, + "logps/chosen": -171.45944213867188, + "logps/rejected": -175.93577575683594, + "loss": 0.6243, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.498868465423584, + "rewards/margins": 0.19707906246185303, + "rewards/rejected": -0.6959475874900818, + "step": 480 + }, + { + "epoch": 0.63, + "learning_rate": 4.6364531207496426e-05, + "logits/chosen": -2.549999713897705, + "logits/rejected": -2.5626368522644043, + "logps/chosen": -162.32720947265625, + "logps/rejected": -192.87612915039062, + "loss": 0.7312, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5747624635696411, + "rewards/margins": 0.024262502789497375, + "rewards/rejected": -0.5990250110626221, + "step": 481 + }, + { + "epoch": 0.63, + "learning_rate": 4.634590200970227e-05, + "logits/chosen": -2.482072591781616, + "logits/rejected": -2.6162638664245605, + "logps/chosen": -176.58897399902344, + "logps/rejected": -207.56654357910156, + "loss": 0.6501, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7275586128234863, + "rewards/margins": 0.14506082236766815, + "rewards/rejected": -0.8726193904876709, + "step": 482 + }, + { + "epoch": 0.63, + "learning_rate": 4.632722896559481e-05, + "logits/chosen": -2.668024778366089, + "logits/rejected": -2.6363847255706787, + "logps/chosen": -263.9557800292969, + "logps/rejected": -249.15301513671875, + "loss": 0.7493, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0229195356369019, + "rewards/margins": -0.055171869695186615, + "rewards/rejected": -0.967747688293457, + "step": 483 + }, + { + "epoch": 0.63, + "learning_rate": 4.630851211353007e-05, + "logits/chosen": -2.6272599697113037, + "logits/rejected": -2.644105911254883, + "logps/chosen": -169.70347595214844, + "logps/rejected": -194.94564819335938, + "loss": 0.6384, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7246192693710327, + "rewards/margins": 0.25421109795570374, + "rewards/rejected": -0.9788303971290588, + "step": 484 + }, + { + "epoch": 0.63, + "learning_rate": 4.628975149195407e-05, + "logits/chosen": -2.530022621154785, + "logits/rejected": -2.5307507514953613, + "logps/chosen": -165.1283721923828, + "logps/rejected": -191.00559997558594, + "loss": 0.6223, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6375387907028198, + "rewards/margins": 0.26712220907211304, + "rewards/rejected": -0.9046609997749329, + "step": 485 + }, + { + "epoch": 0.64, + "learning_rate": 4.6270947139402744e-05, + "logits/chosen": -2.54582142829895, + "logits/rejected": -2.5507116317749023, + "logps/chosen": -197.43624877929688, + "logps/rejected": -200.06396484375, + "loss": 0.695, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6543585062026978, + "rewards/margins": 0.08520226180553436, + "rewards/rejected": -0.7395609021186829, + "step": 486 + }, + { + "epoch": 0.64, + "learning_rate": 4.6252099094501834e-05, + "logits/chosen": -2.502086639404297, + "logits/rejected": -2.507930040359497, + "logps/chosen": -192.9934539794922, + "logps/rejected": -207.4755859375, + "loss": 0.5737, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5259783864021301, + "rewards/margins": 0.3163459002971649, + "rewards/rejected": -0.8423242568969727, + "step": 487 + }, + { + "epoch": 0.64, + "learning_rate": 4.623320739596685e-05, + "logits/chosen": -2.5075860023498535, + "logits/rejected": -2.537548065185547, + "logps/chosen": -180.3726348876953, + "logps/rejected": -195.20867919921875, + "loss": 0.6342, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6498013138771057, + "rewards/margins": 0.19693784415721893, + "rewards/rejected": -0.8467391729354858, + "step": 488 + }, + { + "epoch": 0.64, + "learning_rate": 4.621427208260296e-05, + "logits/chosen": -2.5124759674072266, + "logits/rejected": -2.5460541248321533, + "logps/chosen": -170.8051300048828, + "logps/rejected": -183.51568603515625, + "loss": 0.6292, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7085085511207581, + "rewards/margins": 0.24482092261314392, + "rewards/rejected": -0.9533295035362244, + "step": 489 + }, + { + "epoch": 0.64, + "learning_rate": 4.6195293193304915e-05, + "logits/chosen": -2.568577289581299, + "logits/rejected": -2.730349063873291, + "logps/chosen": -157.42147827148438, + "logps/rejected": -204.0237274169922, + "loss": 0.6416, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7542508244514465, + "rewards/margins": 0.18506285548210144, + "rewards/rejected": -0.9393137693405151, + "step": 490 + }, + { + "epoch": 0.64, + "learning_rate": 4.6176270767056976e-05, + "logits/chosen": -2.658346652984619, + "logits/rejected": -2.697397232055664, + "logps/chosen": -160.69436645507812, + "logps/rejected": -182.4514923095703, + "loss": 0.6066, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5536147952079773, + "rewards/margins": 0.21064530313014984, + "rewards/rejected": -0.7642600536346436, + "step": 491 + }, + { + "epoch": 0.64, + "learning_rate": 4.615720484293286e-05, + "logits/chosen": -2.621380090713501, + "logits/rejected": -2.5975751876831055, + "logps/chosen": -172.44583129882812, + "logps/rejected": -168.9969482421875, + "loss": 0.7824, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.7546152472496033, + "rewards/margins": -0.09415875375270844, + "rewards/rejected": -0.6604564785957336, + "step": 492 + }, + { + "epoch": 0.65, + "learning_rate": 4.613809546009558e-05, + "logits/chosen": -2.5632882118225098, + "logits/rejected": -2.528463840484619, + "logps/chosen": -174.4139404296875, + "logps/rejected": -178.07061767578125, + "loss": 0.7851, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.783379077911377, + "rewards/margins": -0.13342618942260742, + "rewards/rejected": -0.64995276927948, + "step": 493 + }, + { + "epoch": 0.65, + "learning_rate": 4.611894265779748e-05, + "logits/chosen": -2.6725411415100098, + "logits/rejected": -2.6337125301361084, + "logps/chosen": -158.43734741210938, + "logps/rejected": -149.51394653320312, + "loss": 0.6479, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7037851810455322, + "rewards/margins": 0.15217718482017517, + "rewards/rejected": -0.8559622764587402, + "step": 494 + }, + { + "epoch": 0.65, + "learning_rate": 4.609974647538003e-05, + "logits/chosen": -2.5839614868164062, + "logits/rejected": -2.5436882972717285, + "logps/chosen": -186.00880432128906, + "logps/rejected": -176.95623779296875, + "loss": 0.6352, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7761020660400391, + "rewards/margins": 0.2549971640110016, + "rewards/rejected": -1.0310992002487183, + "step": 495 + }, + { + "epoch": 0.65, + "learning_rate": 4.608050695227385e-05, + "logits/chosen": -2.663245439529419, + "logits/rejected": -2.629906415939331, + "logps/chosen": -187.883056640625, + "logps/rejected": -181.8142852783203, + "loss": 0.727, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8239506483078003, + "rewards/margins": 0.07172393053770065, + "rewards/rejected": -0.8956745862960815, + "step": 496 + }, + { + "epoch": 0.65, + "learning_rate": 4.606122412799857e-05, + "logits/chosen": -2.864988327026367, + "logits/rejected": -2.8613595962524414, + "logps/chosen": -179.66754150390625, + "logps/rejected": -183.17547607421875, + "loss": 0.7754, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.7847499847412109, + "rewards/margins": -0.03732617199420929, + "rewards/rejected": -0.7474238872528076, + "step": 497 + }, + { + "epoch": 0.65, + "learning_rate": 4.6041898042162764e-05, + "logits/chosen": -2.67081618309021, + "logits/rejected": -2.630650281906128, + "logps/chosen": -184.00350952148438, + "logps/rejected": -213.1297607421875, + "loss": 0.7759, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.9428163170814514, + "rewards/margins": -0.10311193764209747, + "rewards/rejected": -0.8397043347358704, + "step": 498 + }, + { + "epoch": 0.65, + "learning_rate": 4.602252873446386e-05, + "logits/chosen": -2.5779166221618652, + "logits/rejected": -2.657582998275757, + "logps/chosen": -255.50413513183594, + "logps/rejected": -295.5150146484375, + "loss": 0.6389, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6901088356971741, + "rewards/margins": 0.2417818158864975, + "rewards/rejected": -0.9318906664848328, + "step": 499 + }, + { + "epoch": 0.65, + "learning_rate": 4.60031162446881e-05, + "logits/chosen": -2.6082730293273926, + "logits/rejected": -2.7072064876556396, + "logps/chosen": -186.39288330078125, + "logps/rejected": -197.77581787109375, + "loss": 0.7185, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.8791942596435547, + "rewards/margins": -0.021152300760149956, + "rewards/rejected": -0.8580418825149536, + "step": 500 + }, + { + "epoch": 0.66, + "learning_rate": 4.5983660612710365e-05, + "logits/chosen": -2.44614315032959, + "logits/rejected": -2.4296889305114746, + "logps/chosen": -276.84466552734375, + "logps/rejected": -263.2348327636719, + "loss": 0.7359, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.878140389919281, + "rewards/margins": -0.04955806955695152, + "rewards/rejected": -0.8285822868347168, + "step": 501 + }, + { + "epoch": 0.66, + "learning_rate": 4.596416187849423e-05, + "logits/chosen": -2.619124412536621, + "logits/rejected": -2.6439716815948486, + "logps/chosen": -178.675537109375, + "logps/rejected": -180.601806640625, + "loss": 0.646, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.639778733253479, + "rewards/margins": 0.13400733470916748, + "rewards/rejected": -0.7737860679626465, + "step": 502 + }, + { + "epoch": 0.66, + "learning_rate": 4.5944620082091745e-05, + "logits/chosen": -2.5724973678588867, + "logits/rejected": -2.5802013874053955, + "logps/chosen": -155.44479370117188, + "logps/rejected": -137.69415283203125, + "loss": 0.6405, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7297097444534302, + "rewards/margins": 0.16804258525371552, + "rewards/rejected": -0.8977524042129517, + "step": 503 + }, + { + "epoch": 0.66, + "learning_rate": 4.5925035263643444e-05, + "logits/chosen": -2.74474835395813, + "logits/rejected": -2.7397449016571045, + "logps/chosen": -203.23423767089844, + "logps/rejected": -243.91566467285156, + "loss": 0.5604, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6856958866119385, + "rewards/margins": 0.44254228472709656, + "rewards/rejected": -1.1282380819320679, + "step": 504 + }, + { + "epoch": 0.66, + "learning_rate": 4.5905407463378225e-05, + "logits/chosen": -2.6757442951202393, + "logits/rejected": -2.591583728790283, + "logps/chosen": -182.1700897216797, + "logps/rejected": -183.5037078857422, + "loss": 0.6673, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6767271161079407, + "rewards/margins": 0.10155776888132095, + "rewards/rejected": -0.7782848477363586, + "step": 505 + }, + { + "epoch": 0.66, + "learning_rate": 4.588573672161326e-05, + "logits/chosen": -2.50715708732605, + "logits/rejected": -2.579833507537842, + "logps/chosen": -151.3063201904297, + "logps/rejected": -178.596923828125, + "loss": 0.667, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5291253328323364, + "rewards/margins": 0.0967598482966423, + "rewards/rejected": -0.6258851289749146, + "step": 506 + }, + { + "epoch": 0.66, + "learning_rate": 4.586602307875396e-05, + "logits/chosen": -2.591066598892212, + "logits/rejected": -2.5947020053863525, + "logps/chosen": -197.9405517578125, + "logps/rejected": -199.92247009277344, + "loss": 0.7325, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8253288269042969, + "rewards/margins": 0.021833447739481926, + "rewards/rejected": -0.8471622467041016, + "step": 507 + }, + { + "epoch": 0.66, + "learning_rate": 4.5846266575293816e-05, + "logits/chosen": -2.6202571392059326, + "logits/rejected": -2.734771728515625, + "logps/chosen": -162.9832000732422, + "logps/rejected": -186.2266845703125, + "loss": 0.6053, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8867194652557373, + "rewards/margins": 0.30634805560112, + "rewards/rejected": -1.1930675506591797, + "step": 508 + }, + { + "epoch": 0.67, + "learning_rate": 4.582646725181441e-05, + "logits/chosen": -2.785458564758301, + "logits/rejected": -2.7122888565063477, + "logps/chosen": -183.4889678955078, + "logps/rejected": -173.61659240722656, + "loss": 0.738, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.8772555589675903, + "rewards/margins": -0.023770909756422043, + "rewards/rejected": -0.853484570980072, + "step": 509 + }, + { + "epoch": 0.67, + "learning_rate": 4.580662514898522e-05, + "logits/chosen": -2.644120216369629, + "logits/rejected": -2.631178855895996, + "logps/chosen": -183.57171630859375, + "logps/rejected": -197.05075073242188, + "loss": 0.6977, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8446710705757141, + "rewards/margins": 0.08859595656394958, + "rewards/rejected": -0.9332669973373413, + "step": 510 + }, + { + "epoch": 0.67, + "learning_rate": 4.5786740307563636e-05, + "logits/chosen": -2.693103790283203, + "logits/rejected": -2.724278688430786, + "logps/chosen": -200.4249725341797, + "logps/rejected": -188.05783081054688, + "loss": 0.746, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9058306813240051, + "rewards/margins": -0.03762813284993172, + "rewards/rejected": -0.8682026863098145, + "step": 511 + }, + { + "epoch": 0.67, + "learning_rate": 4.576681276839483e-05, + "logits/chosen": -2.6434645652770996, + "logits/rejected": -2.641044855117798, + "logps/chosen": -160.1006317138672, + "logps/rejected": -175.09759521484375, + "loss": 0.6674, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8065643310546875, + "rewards/margins": 0.14114204049110413, + "rewards/rejected": -0.9477063417434692, + "step": 512 + }, + { + "epoch": 0.67, + "learning_rate": 4.574684257241168e-05, + "logits/chosen": -2.797175884246826, + "logits/rejected": -2.7650790214538574, + "logps/chosen": -195.47828674316406, + "logps/rejected": -176.28883361816406, + "loss": 0.6555, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6963549852371216, + "rewards/margins": 0.11164915561676025, + "rewards/rejected": -0.8080041408538818, + "step": 513 + }, + { + "epoch": 0.67, + "learning_rate": 4.572682976063468e-05, + "logits/chosen": -2.6579763889312744, + "logits/rejected": -2.6137382984161377, + "logps/chosen": -196.55987548828125, + "logps/rejected": -191.19906616210938, + "loss": 0.697, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6601398587226868, + "rewards/margins": 0.11734984815120697, + "rewards/rejected": -0.7774897217750549, + "step": 514 + }, + { + "epoch": 0.67, + "learning_rate": 4.5706774374171854e-05, + "logits/chosen": -2.549034595489502, + "logits/rejected": -2.645655632019043, + "logps/chosen": -161.01344299316406, + "logps/rejected": -181.75225830078125, + "loss": 0.6994, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6926446557044983, + "rewards/margins": 0.08336181193590164, + "rewards/rejected": -0.7760064005851746, + "step": 515 + }, + { + "epoch": 0.68, + "learning_rate": 4.56866764542187e-05, + "logits/chosen": -2.6017000675201416, + "logits/rejected": -2.6268606185913086, + "logps/chosen": -196.15757751464844, + "logps/rejected": -206.0800323486328, + "loss": 0.7811, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.7164332866668701, + "rewards/margins": -0.03772084414958954, + "rewards/rejected": -0.6787124872207642, + "step": 516 + }, + { + "epoch": 0.68, + "learning_rate": 4.566653604205805e-05, + "logits/chosen": -2.723987579345703, + "logits/rejected": -2.8009376525878906, + "logps/chosen": -170.0558624267578, + "logps/rejected": -188.37606811523438, + "loss": 0.5947, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8201983571052551, + "rewards/margins": 0.3081669807434082, + "rewards/rejected": -1.128365397453308, + "step": 517 + }, + { + "epoch": 0.68, + "learning_rate": 4.5646353179060057e-05, + "logits/chosen": -2.6936893463134766, + "logits/rejected": -2.5472261905670166, + "logps/chosen": -199.6653289794922, + "logps/rejected": -171.17935180664062, + "loss": 0.7299, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5833121538162231, + "rewards/margins": 0.0050363726913928986, + "rewards/rejected": -0.5883485674858093, + "step": 518 + }, + { + "epoch": 0.68, + "learning_rate": 4.562612790668204e-05, + "logits/chosen": -2.8099305629730225, + "logits/rejected": -2.858193874359131, + "logps/chosen": -191.54202270507812, + "logps/rejected": -249.16433715820312, + "loss": 0.7677, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.9312639236450195, + "rewards/margins": -0.06794437021017075, + "rewards/rejected": -0.8633195161819458, + "step": 519 + }, + { + "epoch": 0.68, + "learning_rate": 4.560586026646845e-05, + "logits/chosen": -2.5662684440612793, + "logits/rejected": -2.685896396636963, + "logps/chosen": -165.303955078125, + "logps/rejected": -192.7646484375, + "loss": 0.8062, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7227790355682373, + "rewards/margins": -0.14969179034233093, + "rewards/rejected": -0.5730872750282288, + "step": 520 + }, + { + "epoch": 0.68, + "learning_rate": 4.558555030005075e-05, + "logits/chosen": -2.5218491554260254, + "logits/rejected": -2.5660927295684814, + "logps/chosen": -157.65261840820312, + "logps/rejected": -177.45657348632812, + "loss": 0.6963, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7852264642715454, + "rewards/margins": 0.06401225179433823, + "rewards/rejected": -0.8492387533187866, + "step": 521 + }, + { + "epoch": 0.68, + "learning_rate": 4.556519804914736e-05, + "logits/chosen": -2.7509756088256836, + "logits/rejected": -2.6521804332733154, + "logps/chosen": -178.81834411621094, + "logps/rejected": -171.83724975585938, + "loss": 0.6878, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5949856042861938, + "rewards/margins": 0.05688241496682167, + "rewards/rejected": -0.6518680453300476, + "step": 522 + }, + { + "epoch": 0.68, + "learning_rate": 4.554480355556354e-05, + "logits/chosen": -2.7910945415496826, + "logits/rejected": -2.8190691471099854, + "logps/chosen": -241.06617736816406, + "logps/rejected": -238.115478515625, + "loss": 0.7471, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7306909561157227, + "rewards/margins": -0.022471264004707336, + "rewards/rejected": -0.7082197070121765, + "step": 523 + }, + { + "epoch": 0.69, + "learning_rate": 4.552436686119134e-05, + "logits/chosen": -2.630984306335449, + "logits/rejected": -2.585669994354248, + "logps/chosen": -185.63345336914062, + "logps/rejected": -161.37725830078125, + "loss": 0.7325, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7190293073654175, + "rewards/margins": -0.00044285133481025696, + "rewards/rejected": -0.7185863852500916, + "step": 524 + }, + { + "epoch": 0.69, + "learning_rate": 4.550388800800948e-05, + "logits/chosen": -2.581951856613159, + "logits/rejected": -2.5290775299072266, + "logps/chosen": -272.72467041015625, + "logps/rejected": -239.87452697753906, + "loss": 0.7218, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7720150351524353, + "rewards/margins": -0.0015503056347370148, + "rewards/rejected": -0.7704647779464722, + "step": 525 + }, + { + "epoch": 0.69, + "learning_rate": 4.548336703808328e-05, + "logits/chosen": -2.6833231449127197, + "logits/rejected": -2.715363025665283, + "logps/chosen": -194.5189971923828, + "logps/rejected": -201.06585693359375, + "loss": 0.662, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7054163217544556, + "rewards/margins": 0.18057569861412048, + "rewards/rejected": -0.8859919309616089, + "step": 526 + }, + { + "epoch": 0.69, + "learning_rate": 4.546280399356457e-05, + "logits/chosen": -2.634359836578369, + "logits/rejected": -2.760921001434326, + "logps/chosen": -183.7833251953125, + "logps/rejected": -212.13803100585938, + "loss": 0.638, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7029492259025574, + "rewards/margins": 0.2215665578842163, + "rewards/rejected": -0.9245157837867737, + "step": 527 + }, + { + "epoch": 0.69, + "learning_rate": 4.54421989166916e-05, + "logits/chosen": -2.6926794052124023, + "logits/rejected": -2.64449405670166, + "logps/chosen": -185.593994140625, + "logps/rejected": -182.61541748046875, + "loss": 0.6729, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.913753092288971, + "rewards/margins": 0.08913831412792206, + "rewards/rejected": -1.0028914213180542, + "step": 528 + }, + { + "epoch": 0.69, + "learning_rate": 4.542155184978898e-05, + "logits/chosen": -2.6856396198272705, + "logits/rejected": -2.7312099933624268, + "logps/chosen": -134.99691772460938, + "logps/rejected": -154.96502685546875, + "loss": 0.6361, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5297491550445557, + "rewards/margins": 0.20353960990905762, + "rewards/rejected": -0.7332887649536133, + "step": 529 + }, + { + "epoch": 0.69, + "learning_rate": 4.540086283526754e-05, + "logits/chosen": -2.493201971054077, + "logits/rejected": -2.410156488418579, + "logps/chosen": -226.10279846191406, + "logps/rejected": -228.56134033203125, + "loss": 0.7409, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.9568287134170532, + "rewards/margins": -0.040587056428194046, + "rewards/rejected": -0.9162416458129883, + "step": 530 + }, + { + "epoch": 0.7, + "learning_rate": 4.538013191562431e-05, + "logits/chosen": -2.5931506156921387, + "logits/rejected": -2.642500162124634, + "logps/chosen": -173.00411987304688, + "logps/rejected": -200.85208129882812, + "loss": 0.7962, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7807260751724243, + "rewards/margins": -0.091963991522789, + "rewards/rejected": -0.6887621283531189, + "step": 531 + }, + { + "epoch": 0.7, + "learning_rate": 4.5359359133442356e-05, + "logits/chosen": -2.560065984725952, + "logits/rejected": -2.5762548446655273, + "logps/chosen": -172.8507843017578, + "logps/rejected": -193.83258056640625, + "loss": 0.7569, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7677234411239624, + "rewards/margins": 0.06710009276866913, + "rewards/rejected": -0.8348235487937927, + "step": 532 + }, + { + "epoch": 0.7, + "learning_rate": 4.533854453139077e-05, + "logits/chosen": -2.7403817176818848, + "logits/rejected": -2.8105673789978027, + "logps/chosen": -215.16644287109375, + "logps/rejected": -243.01504516601562, + "loss": 0.68, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6151594519615173, + "rewards/margins": 0.12640537321567535, + "rewards/rejected": -0.7415647506713867, + "step": 533 + }, + { + "epoch": 0.7, + "learning_rate": 4.5317688152224515e-05, + "logits/chosen": -2.3807733058929443, + "logits/rejected": -2.496799945831299, + "logps/chosen": -217.8972625732422, + "logps/rejected": -202.62594604492188, + "loss": 0.7236, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8797967433929443, + "rewards/margins": 0.07766968756914139, + "rewards/rejected": -0.9574664235115051, + "step": 534 + }, + { + "epoch": 0.7, + "learning_rate": 4.52967900387844e-05, + "logits/chosen": -2.475679636001587, + "logits/rejected": -2.5895590782165527, + "logps/chosen": -168.10928344726562, + "logps/rejected": -231.15797424316406, + "loss": 0.6107, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6619678139686584, + "rewards/margins": 0.2175792008638382, + "rewards/rejected": -0.8795469999313354, + "step": 535 + }, + { + "epoch": 0.7, + "learning_rate": 4.5275850233996925e-05, + "logits/chosen": -2.6471853256225586, + "logits/rejected": -2.6549582481384277, + "logps/chosen": -165.9920654296875, + "logps/rejected": -146.60623168945312, + "loss": 0.6382, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7165468335151672, + "rewards/margins": 0.16847476363182068, + "rewards/rejected": -0.8850216865539551, + "step": 536 + }, + { + "epoch": 0.7, + "learning_rate": 4.525486878087426e-05, + "logits/chosen": -2.6429080963134766, + "logits/rejected": -2.658334732055664, + "logps/chosen": -196.52894592285156, + "logps/rejected": -198.9338836669922, + "loss": 0.6805, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.759401798248291, + "rewards/margins": 0.09011213481426239, + "rewards/rejected": -0.8495139479637146, + "step": 537 + }, + { + "epoch": 0.7, + "learning_rate": 4.523384572251409e-05, + "logits/chosen": -2.7667698860168457, + "logits/rejected": -2.823873996734619, + "logps/chosen": -162.4055938720703, + "logps/rejected": -157.1728057861328, + "loss": 0.6762, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8268322348594666, + "rewards/margins": 0.13257114589214325, + "rewards/rejected": -0.9594033360481262, + "step": 538 + }, + { + "epoch": 0.71, + "learning_rate": 4.52127811020996e-05, + "logits/chosen": -2.6934759616851807, + "logits/rejected": -2.6713662147521973, + "logps/chosen": -232.28492736816406, + "logps/rejected": -225.9707489013672, + "loss": 0.6317, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7875734567642212, + "rewards/margins": 0.22967377305030823, + "rewards/rejected": -1.017247200012207, + "step": 539 + }, + { + "epoch": 0.71, + "learning_rate": 4.5191674962899314e-05, + "logits/chosen": -2.551192045211792, + "logits/rejected": -2.4736685752868652, + "logps/chosen": -186.871337890625, + "logps/rejected": -182.81036376953125, + "loss": 0.7075, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7137445211410522, + "rewards/margins": -0.007666848599910736, + "rewards/rejected": -0.7060777544975281, + "step": 540 + }, + { + "epoch": 0.71, + "learning_rate": 4.5170527348267054e-05, + "logits/chosen": -2.664978265762329, + "logits/rejected": -2.723323345184326, + "logps/chosen": -181.50009155273438, + "logps/rejected": -218.21827697753906, + "loss": 0.7018, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7352172136306763, + "rewards/margins": 0.05204403027892113, + "rewards/rejected": -0.7872611880302429, + "step": 541 + }, + { + "epoch": 0.71, + "learning_rate": 4.5149338301641845e-05, + "logits/chosen": -2.622648000717163, + "logits/rejected": -2.658144474029541, + "logps/chosen": -161.30264282226562, + "logps/rejected": -163.90829467773438, + "loss": 0.6798, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7303928136825562, + "rewards/margins": 0.10052241384983063, + "rewards/rejected": -0.8309152126312256, + "step": 542 + }, + { + "epoch": 0.71, + "learning_rate": 4.512810786654779e-05, + "logits/chosen": -2.7670962810516357, + "logits/rejected": -2.738875389099121, + "logps/chosen": -198.58111572265625, + "logps/rejected": -200.0714569091797, + "loss": 0.655, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6926169395446777, + "rewards/margins": 0.17897403240203857, + "rewards/rejected": -0.8715909719467163, + "step": 543 + }, + { + "epoch": 0.71, + "learning_rate": 4.510683608659403e-05, + "logits/chosen": -2.6602683067321777, + "logits/rejected": -2.5785250663757324, + "logps/chosen": -156.00543212890625, + "logps/rejected": -148.37716674804688, + "loss": 0.6658, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.49638113379478455, + "rewards/margins": 0.09628915786743164, + "rewards/rejected": -0.5926702618598938, + "step": 544 + }, + { + "epoch": 0.71, + "learning_rate": 4.508552300547463e-05, + "logits/chosen": -2.6110527515411377, + "logits/rejected": -2.5978469848632812, + "logps/chosen": -212.41812133789062, + "logps/rejected": -218.17910766601562, + "loss": 0.6521, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6882189512252808, + "rewards/margins": 0.12167090177536011, + "rewards/rejected": -0.8098899722099304, + "step": 545 + }, + { + "epoch": 0.71, + "learning_rate": 4.506416866696848e-05, + "logits/chosen": -2.706355571746826, + "logits/rejected": -2.8541836738586426, + "logps/chosen": -169.2959747314453, + "logps/rejected": -200.96258544921875, + "loss": 0.5834, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8061919808387756, + "rewards/margins": 0.32039594650268555, + "rewards/rejected": -1.126587986946106, + "step": 546 + }, + { + "epoch": 0.72, + "learning_rate": 4.504277311493922e-05, + "logits/chosen": -2.6320695877075195, + "logits/rejected": -2.7103400230407715, + "logps/chosen": -153.4121551513672, + "logps/rejected": -177.87684631347656, + "loss": 0.6849, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5557705163955688, + "rewards/margins": 0.1074688509106636, + "rewards/rejected": -0.6632393598556519, + "step": 547 + }, + { + "epoch": 0.72, + "learning_rate": 4.502133639333516e-05, + "logits/chosen": -2.779649019241333, + "logits/rejected": -2.880277633666992, + "logps/chosen": -169.7550506591797, + "logps/rejected": -205.27264404296875, + "loss": 0.735, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8947506546974182, + "rewards/margins": -0.022921644151210785, + "rewards/rejected": -0.8718290328979492, + "step": 548 + }, + { + "epoch": 0.72, + "learning_rate": 4.499985854618915e-05, + "logits/chosen": -2.786224126815796, + "logits/rejected": -2.7625045776367188, + "logps/chosen": -194.6786346435547, + "logps/rejected": -171.3389434814453, + "loss": 0.717, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7077013850212097, + "rewards/margins": 0.03108108416199684, + "rewards/rejected": -0.7387824058532715, + "step": 549 + }, + { + "epoch": 0.72, + "learning_rate": 4.497833961761855e-05, + "logits/chosen": -2.649867057800293, + "logits/rejected": -2.616454839706421, + "logps/chosen": -181.42636108398438, + "logps/rejected": -187.4470672607422, + "loss": 0.6947, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5831429958343506, + "rewards/margins": 0.03419647365808487, + "rewards/rejected": -0.617339551448822, + "step": 550 + }, + { + "epoch": 0.72, + "learning_rate": 4.495677965182506e-05, + "logits/chosen": -2.752405881881714, + "logits/rejected": -2.801346778869629, + "logps/chosen": -199.30345153808594, + "logps/rejected": -204.13937377929688, + "loss": 0.6993, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8525046706199646, + "rewards/margins": 0.04431546479463577, + "rewards/rejected": -0.8968201875686646, + "step": 551 + }, + { + "epoch": 0.72, + "learning_rate": 4.4935178693094714e-05, + "logits/chosen": -2.6438167095184326, + "logits/rejected": -2.677015781402588, + "logps/chosen": -176.24278259277344, + "logps/rejected": -183.13050842285156, + "loss": 0.5542, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5732276439666748, + "rewards/margins": 0.34234461188316345, + "rewards/rejected": -0.9155722856521606, + "step": 552 + }, + { + "epoch": 0.72, + "learning_rate": 4.491353678579774e-05, + "logits/chosen": -2.7374353408813477, + "logits/rejected": -2.724957227706909, + "logps/chosen": -168.60983276367188, + "logps/rejected": -178.94168090820312, + "loss": 0.6891, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.7404319643974304, + "rewards/margins": 0.09779490530490875, + "rewards/rejected": -0.8382267951965332, + "step": 553 + }, + { + "epoch": 0.73, + "learning_rate": 4.489185397438845e-05, + "logits/chosen": -2.569044351577759, + "logits/rejected": -2.4559781551361084, + "logps/chosen": -122.60645294189453, + "logps/rejected": -121.57733154296875, + "loss": 0.7095, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5707786679267883, + "rewards/margins": 0.018663479015231133, + "rewards/rejected": -0.589442253112793, + "step": 554 + }, + { + "epoch": 0.73, + "learning_rate": 4.4870130303405214e-05, + "logits/chosen": -2.6766295433044434, + "logits/rejected": -2.6980645656585693, + "logps/chosen": -168.9379425048828, + "logps/rejected": -186.576904296875, + "loss": 0.6257, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5627405643463135, + "rewards/margins": 0.19951972365379333, + "rewards/rejected": -0.7622602581977844, + "step": 555 + }, + { + "epoch": 0.73, + "learning_rate": 4.484836581747032e-05, + "logits/chosen": -2.6670761108398438, + "logits/rejected": -2.690931558609009, + "logps/chosen": -201.3208770751953, + "logps/rejected": -182.73782348632812, + "loss": 0.7019, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6702454686164856, + "rewards/margins": 0.044957034289836884, + "rewards/rejected": -0.7152025103569031, + "step": 556 + }, + { + "epoch": 0.73, + "learning_rate": 4.4826560561289865e-05, + "logits/chosen": -2.5527472496032715, + "logits/rejected": -2.5464069843292236, + "logps/chosen": -188.2076416015625, + "logps/rejected": -173.2044677734375, + "loss": 0.6589, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6993139982223511, + "rewards/margins": 0.14693805575370789, + "rewards/rejected": -0.8462520837783813, + "step": 557 + }, + { + "epoch": 0.73, + "learning_rate": 4.4804714579653736e-05, + "logits/chosen": -2.694082736968994, + "logits/rejected": -2.6660871505737305, + "logps/chosen": -191.5103759765625, + "logps/rejected": -182.28424072265625, + "loss": 0.828, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8855656981468201, + "rewards/margins": -0.18190453946590424, + "rewards/rejected": -0.7036612033843994, + "step": 558 + }, + { + "epoch": 0.73, + "learning_rate": 4.4782827917435454e-05, + "logits/chosen": -2.8244099617004395, + "logits/rejected": -2.811955690383911, + "logps/chosen": -202.20606994628906, + "logps/rejected": -215.3787841796875, + "loss": 0.78, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9945522546768188, + "rewards/margins": -0.01987658441066742, + "rewards/rejected": -0.9746755957603455, + "step": 559 + }, + { + "epoch": 0.73, + "learning_rate": 4.4760900619592085e-05, + "logits/chosen": -2.6444430351257324, + "logits/rejected": -2.6651206016540527, + "logps/chosen": -176.60488891601562, + "logps/rejected": -185.3902130126953, + "loss": 0.6646, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.8423606157302856, + "rewards/margins": 0.12972018122673035, + "rewards/rejected": -0.9720807075500488, + "step": 560 + }, + { + "epoch": 0.73, + "learning_rate": 4.4738932731164194e-05, + "logits/chosen": -2.6871867179870605, + "logits/rejected": -2.693873405456543, + "logps/chosen": -199.2325439453125, + "logps/rejected": -196.27951049804688, + "loss": 0.7607, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.791627049446106, + "rewards/margins": -0.03532877564430237, + "rewards/rejected": -0.7562982439994812, + "step": 561 + }, + { + "epoch": 0.74, + "learning_rate": 4.47169242972757e-05, + "logits/chosen": -2.7293338775634766, + "logits/rejected": -2.7268292903900146, + "logps/chosen": -210.5205078125, + "logps/rejected": -198.358642578125, + "loss": 0.8319, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9199119806289673, + "rewards/margins": -0.1811569780111313, + "rewards/rejected": -0.73875492811203, + "step": 562 + }, + { + "epoch": 0.74, + "learning_rate": 4.469487536313381e-05, + "logits/chosen": -2.591933012008667, + "logits/rejected": -2.6240625381469727, + "logps/chosen": -159.45765686035156, + "logps/rejected": -182.98690795898438, + "loss": 0.6585, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7039793729782104, + "rewards/margins": 0.1428254246711731, + "rewards/rejected": -0.8468047380447388, + "step": 563 + }, + { + "epoch": 0.74, + "learning_rate": 4.467278597402894e-05, + "logits/chosen": -2.5494794845581055, + "logits/rejected": -2.541128635406494, + "logps/chosen": -163.43701171875, + "logps/rejected": -166.59567260742188, + "loss": 0.6618, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5548999905586243, + "rewards/margins": 0.15531884133815765, + "rewards/rejected": -0.7102188467979431, + "step": 564 + }, + { + "epoch": 0.74, + "learning_rate": 4.465065617533457e-05, + "logits/chosen": -2.6077260971069336, + "logits/rejected": -2.5411581993103027, + "logps/chosen": -205.11749267578125, + "logps/rejected": -210.03114318847656, + "loss": 0.7476, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7790584564208984, + "rewards/margins": 0.018771033734083176, + "rewards/rejected": -0.7978294491767883, + "step": 565 + }, + { + "epoch": 0.74, + "learning_rate": 4.462848601250722e-05, + "logits/chosen": -2.645805835723877, + "logits/rejected": -2.5978736877441406, + "logps/chosen": -155.65281677246094, + "logps/rejected": -156.344970703125, + "loss": 0.6091, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5091183185577393, + "rewards/margins": 0.21079742908477783, + "rewards/rejected": -0.7199157476425171, + "step": 566 + }, + { + "epoch": 0.74, + "learning_rate": 4.4606275531086295e-05, + "logits/chosen": -2.783046007156372, + "logits/rejected": -2.771733522415161, + "logps/chosen": -192.69659423828125, + "logps/rejected": -176.84600830078125, + "loss": 0.8056, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.7212276458740234, + "rewards/margins": -0.1634465754032135, + "rewards/rejected": -0.5577811002731323, + "step": 567 + }, + { + "epoch": 0.74, + "learning_rate": 4.4584024776694035e-05, + "logits/chosen": -2.6829848289489746, + "logits/rejected": -2.69008731842041, + "logps/chosen": -160.82737731933594, + "logps/rejected": -197.73060607910156, + "loss": 0.6111, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5301526188850403, + "rewards/margins": 0.34297025203704834, + "rewards/rejected": -0.8731229305267334, + "step": 568 + }, + { + "epoch": 0.74, + "learning_rate": 4.45617337950354e-05, + "logits/chosen": -2.56026029586792, + "logits/rejected": -2.687854290008545, + "logps/chosen": -168.34933471679688, + "logps/rejected": -196.67616271972656, + "loss": 0.6421, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6444015502929688, + "rewards/margins": 0.2516985237598419, + "rewards/rejected": -0.8961001038551331, + "step": 569 + }, + { + "epoch": 0.75, + "learning_rate": 4.453940263189797e-05, + "logits/chosen": -2.7095446586608887, + "logits/rejected": -2.6890676021575928, + "logps/chosen": -207.63217163085938, + "logps/rejected": -215.61146545410156, + "loss": 0.6531, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.670657753944397, + "rewards/margins": 0.11775386333465576, + "rewards/rejected": -0.7884116172790527, + "step": 570 + }, + { + "epoch": 0.75, + "learning_rate": 4.4517031333151874e-05, + "logits/chosen": -2.6166491508483887, + "logits/rejected": -2.618901014328003, + "logps/chosen": -162.6239776611328, + "logps/rejected": -197.70130920410156, + "loss": 0.6682, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7643678784370422, + "rewards/margins": 0.09301282465457916, + "rewards/rejected": -0.857380747795105, + "step": 571 + }, + { + "epoch": 0.75, + "learning_rate": 4.449461994474968e-05, + "logits/chosen": -2.703096866607666, + "logits/rejected": -2.726106882095337, + "logps/chosen": -142.5395965576172, + "logps/rejected": -187.09109497070312, + "loss": 0.6036, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5837680697441101, + "rewards/margins": 0.24383926391601562, + "rewards/rejected": -0.8276073336601257, + "step": 572 + }, + { + "epoch": 0.75, + "learning_rate": 4.44721685127263e-05, + "logits/chosen": -2.6250483989715576, + "logits/rejected": -2.651967763900757, + "logps/chosen": -170.19578552246094, + "logps/rejected": -179.1065673828125, + "loss": 0.6918, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6374863386154175, + "rewards/margins": 0.08425635099411011, + "rewards/rejected": -0.7217426300048828, + "step": 573 + }, + { + "epoch": 0.75, + "learning_rate": 4.4449677083198896e-05, + "logits/chosen": -2.602550745010376, + "logits/rejected": -2.712120294570923, + "logps/chosen": -157.83306884765625, + "logps/rejected": -180.02627563476562, + "loss": 0.6177, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6926807761192322, + "rewards/margins": 0.24708034098148346, + "rewards/rejected": -0.9397611618041992, + "step": 574 + }, + { + "epoch": 0.75, + "learning_rate": 4.4427145702366804e-05, + "logits/chosen": -2.6738860607147217, + "logits/rejected": -2.696126937866211, + "logps/chosen": -144.033203125, + "logps/rejected": -177.09173583984375, + "loss": 0.6195, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.41337087750434875, + "rewards/margins": 0.2742076814174652, + "rewards/rejected": -0.6875784993171692, + "step": 575 + }, + { + "epoch": 0.75, + "learning_rate": 4.440457441651139e-05, + "logits/chosen": -2.5672965049743652, + "logits/rejected": -2.638249635696411, + "logps/chosen": -120.28506469726562, + "logps/rejected": -148.57440185546875, + "loss": 0.6443, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5808289051055908, + "rewards/margins": 0.20668093860149384, + "rewards/rejected": -0.7875099182128906, + "step": 576 + }, + { + "epoch": 0.76, + "learning_rate": 4.4381963271996044e-05, + "logits/chosen": -2.580237865447998, + "logits/rejected": -2.6456117630004883, + "logps/chosen": -178.21829223632812, + "logps/rejected": -236.28274536132812, + "loss": 0.6341, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7112021446228027, + "rewards/margins": 0.18514475226402283, + "rewards/rejected": -0.896346926689148, + "step": 577 + }, + { + "epoch": 0.76, + "learning_rate": 4.435931231526597e-05, + "logits/chosen": -2.65242075920105, + "logits/rejected": -2.6544103622436523, + "logps/chosen": -211.60411071777344, + "logps/rejected": -235.25247192382812, + "loss": 0.7102, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.874238908290863, + "rewards/margins": 0.050793394446372986, + "rewards/rejected": -0.9250323176383972, + "step": 578 + }, + { + "epoch": 0.76, + "learning_rate": 4.433662159284818e-05, + "logits/chosen": -2.6615688800811768, + "logits/rejected": -2.6471145153045654, + "logps/chosen": -191.8739471435547, + "logps/rejected": -193.9615020751953, + "loss": 0.7658, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7925047874450684, + "rewards/margins": -0.02974768541753292, + "rewards/rejected": -0.7627571225166321, + "step": 579 + }, + { + "epoch": 0.76, + "learning_rate": 4.4313891151351375e-05, + "logits/chosen": -2.7032251358032227, + "logits/rejected": -2.746817111968994, + "logps/chosen": -137.22633361816406, + "logps/rejected": -143.8455352783203, + "loss": 0.6467, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.762434184551239, + "rewards/margins": 0.17979009449481964, + "rewards/rejected": -0.9422242641448975, + "step": 580 + }, + { + "epoch": 0.76, + "learning_rate": 4.429112103746582e-05, + "logits/chosen": -2.7502665519714355, + "logits/rejected": -2.6706719398498535, + "logps/chosen": -173.37606811523438, + "logps/rejected": -231.8168182373047, + "loss": 0.6431, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7993868589401245, + "rewards/margins": 0.17561480402946472, + "rewards/rejected": -0.9750015735626221, + "step": 581 + }, + { + "epoch": 0.76, + "learning_rate": 4.4268311297963295e-05, + "logits/chosen": -2.752943754196167, + "logits/rejected": -2.7461259365081787, + "logps/chosen": -194.4534912109375, + "logps/rejected": -190.17312622070312, + "loss": 0.7105, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8308650851249695, + "rewards/margins": 0.08960982412099838, + "rewards/rejected": -0.9204750061035156, + "step": 582 + }, + { + "epoch": 0.76, + "learning_rate": 4.4245461979696937e-05, + "logits/chosen": -2.6702442169189453, + "logits/rejected": -2.6548166275024414, + "logps/chosen": -146.55252075195312, + "logps/rejected": -167.94862365722656, + "loss": 0.6055, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6801164150238037, + "rewards/margins": 0.27670958638191223, + "rewards/rejected": -0.9568260312080383, + "step": 583 + }, + { + "epoch": 0.76, + "learning_rate": 4.422257312960123e-05, + "logits/chosen": -2.6083147525787354, + "logits/rejected": -2.590127944946289, + "logps/chosen": -203.12689208984375, + "logps/rejected": -296.73431396484375, + "loss": 0.6476, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9686077833175659, + "rewards/margins": 0.2269180417060852, + "rewards/rejected": -1.1955257654190063, + "step": 584 + }, + { + "epoch": 0.77, + "learning_rate": 4.419964479469182e-05, + "logits/chosen": -2.757424831390381, + "logits/rejected": -2.861577272415161, + "logps/chosen": -225.32345581054688, + "logps/rejected": -259.8649597167969, + "loss": 0.6847, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.280800700187683, + "rewards/margins": 0.11630728095769882, + "rewards/rejected": -1.3971078395843506, + "step": 585 + }, + { + "epoch": 0.77, + "learning_rate": 4.417667702206548e-05, + "logits/chosen": -2.7444422245025635, + "logits/rejected": -2.7280027866363525, + "logps/chosen": -192.2440185546875, + "logps/rejected": -232.78993225097656, + "loss": 0.6904, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8600846529006958, + "rewards/margins": 0.0870635136961937, + "rewards/rejected": -0.9471482038497925, + "step": 586 + }, + { + "epoch": 0.77, + "learning_rate": 4.415366985889998e-05, + "logits/chosen": -2.7146828174591064, + "logits/rejected": -2.74285888671875, + "logps/chosen": -218.16299438476562, + "logps/rejected": -260.4196472167969, + "loss": 0.586, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9427146911621094, + "rewards/margins": 0.3159841001033783, + "rewards/rejected": -1.2586987018585205, + "step": 587 + }, + { + "epoch": 0.77, + "learning_rate": 4.413062335245402e-05, + "logits/chosen": -2.773749589920044, + "logits/rejected": -2.797802686691284, + "logps/chosen": -172.09722900390625, + "logps/rejected": -212.4481658935547, + "loss": 0.6053, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6340043544769287, + "rewards/margins": 0.2507874667644501, + "rewards/rejected": -0.884791910648346, + "step": 588 + }, + { + "epoch": 0.77, + "learning_rate": 4.410753755006708e-05, + "logits/chosen": -2.5004916191101074, + "logits/rejected": -2.52752685546875, + "logps/chosen": -148.71890258789062, + "logps/rejected": -165.1573944091797, + "loss": 0.5859, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6494738459587097, + "rewards/margins": 0.30606481432914734, + "rewards/rejected": -0.9555386304855347, + "step": 589 + }, + { + "epoch": 0.77, + "learning_rate": 4.408441249915938e-05, + "logits/chosen": -2.7454075813293457, + "logits/rejected": -2.7361526489257812, + "logps/chosen": -173.478271484375, + "logps/rejected": -190.50579833984375, + "loss": 0.7229, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.8305282592773438, + "rewards/margins": 0.005613129585981369, + "rewards/rejected": -0.8361413478851318, + "step": 590 + }, + { + "epoch": 0.77, + "learning_rate": 4.4061248247231776e-05, + "logits/chosen": -2.5979197025299072, + "logits/rejected": -2.723015308380127, + "logps/chosen": -188.89573669433594, + "logps/rejected": -205.31224060058594, + "loss": 0.5976, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0508185625076294, + "rewards/margins": 0.3824726641178131, + "rewards/rejected": -1.4332913160324097, + "step": 591 + }, + { + "epoch": 0.77, + "learning_rate": 4.4038044841865614e-05, + "logits/chosen": -2.586750030517578, + "logits/rejected": -2.719773054122925, + "logps/chosen": -157.79367065429688, + "logps/rejected": -174.4828643798828, + "loss": 0.7857, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0488148927688599, + "rewards/margins": 0.035450052469968796, + "rewards/rejected": -1.084264874458313, + "step": 592 + }, + { + "epoch": 0.78, + "learning_rate": 4.401480233072268e-05, + "logits/chosen": -2.6652653217315674, + "logits/rejected": -2.6724514961242676, + "logps/chosen": -178.22274780273438, + "logps/rejected": -182.68753051757812, + "loss": 0.8022, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.05448317527771, + "rewards/margins": -0.15002277493476868, + "rewards/rejected": -0.9044604897499084, + "step": 593 + }, + { + "epoch": 0.78, + "learning_rate": 4.399152076154509e-05, + "logits/chosen": -2.73130202293396, + "logits/rejected": -2.709644079208374, + "logps/chosen": -179.2704620361328, + "logps/rejected": -201.22584533691406, + "loss": 0.6426, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1346244812011719, + "rewards/margins": 0.15731468796730042, + "rewards/rejected": -1.2919390201568604, + "step": 594 + }, + { + "epoch": 0.78, + "learning_rate": 4.396820018215518e-05, + "logits/chosen": -2.703145742416382, + "logits/rejected": -2.724557876586914, + "logps/chosen": -185.3358154296875, + "logps/rejected": -200.76834106445312, + "loss": 0.6528, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8781923055648804, + "rewards/margins": 0.12637923657894135, + "rewards/rejected": -1.004571557044983, + "step": 595 + }, + { + "epoch": 0.78, + "learning_rate": 4.394484064045542e-05, + "logits/chosen": -2.7452504634857178, + "logits/rejected": -2.7653212547302246, + "logps/chosen": -172.15745544433594, + "logps/rejected": -198.72596740722656, + "loss": 0.6422, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0150758028030396, + "rewards/margins": 0.16777384281158447, + "rewards/rejected": -1.182849645614624, + "step": 596 + }, + { + "epoch": 0.78, + "learning_rate": 4.392144218442831e-05, + "logits/chosen": -2.7004809379577637, + "logits/rejected": -2.697211980819702, + "logps/chosen": -219.4707794189453, + "logps/rejected": -230.44607543945312, + "loss": 0.76, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.201088309288025, + "rewards/margins": -0.015895769000053406, + "rewards/rejected": -1.185192584991455, + "step": 597 + }, + { + "epoch": 0.78, + "learning_rate": 4.3898004862136286e-05, + "logits/chosen": -2.7089436054229736, + "logits/rejected": -2.7471840381622314, + "logps/chosen": -178.98870849609375, + "logps/rejected": -186.9143829345703, + "loss": 0.7313, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2119524478912354, + "rewards/margins": 0.017535462975502014, + "rewards/rejected": -1.2294878959655762, + "step": 598 + }, + { + "epoch": 0.78, + "learning_rate": 4.3874528721721624e-05, + "logits/chosen": -2.684823751449585, + "logits/rejected": -2.6925339698791504, + "logps/chosen": -177.93600463867188, + "logps/rejected": -196.6461181640625, + "loss": 0.6405, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6519026160240173, + "rewards/margins": 0.23315802216529846, + "rewards/rejected": -0.8850606679916382, + "step": 599 + }, + { + "epoch": 0.79, + "learning_rate": 4.385101381140633e-05, + "logits/chosen": -2.646517038345337, + "logits/rejected": -2.6656806468963623, + "logps/chosen": -133.48558044433594, + "logps/rejected": -172.92977905273438, + "loss": 0.5829, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7560878992080688, + "rewards/margins": 0.29395031929016113, + "rewards/rejected": -1.0500380992889404, + "step": 600 + }, + { + "epoch": 0.79, + "learning_rate": 4.382746017949203e-05, + "logits/chosen": -2.566793441772461, + "logits/rejected": -2.5946078300476074, + "logps/chosen": -183.47927856445312, + "logps/rejected": -224.47052001953125, + "loss": 0.5809, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7541132569313049, + "rewards/margins": 0.33301299810409546, + "rewards/rejected": -1.08712637424469, + "step": 601 + }, + { + "epoch": 0.79, + "learning_rate": 4.380386787435992e-05, + "logits/chosen": -2.5148720741271973, + "logits/rejected": -2.431692600250244, + "logps/chosen": -172.89552307128906, + "logps/rejected": -179.71328735351562, + "loss": 0.6558, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9377825260162354, + "rewards/margins": 0.14634236693382263, + "rewards/rejected": -1.0841249227523804, + "step": 602 + }, + { + "epoch": 0.79, + "learning_rate": 4.378023694447061e-05, + "logits/chosen": -2.536116600036621, + "logits/rejected": -2.6387391090393066, + "logps/chosen": -143.99391174316406, + "logps/rejected": -181.4296875, + "loss": 0.5612, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8889681100845337, + "rewards/margins": 0.41685062646865845, + "rewards/rejected": -1.3058186769485474, + "step": 603 + }, + { + "epoch": 0.79, + "learning_rate": 4.375656743836407e-05, + "logits/chosen": -2.74141526222229, + "logits/rejected": -2.6809630393981934, + "logps/chosen": -208.21409606933594, + "logps/rejected": -211.6444549560547, + "loss": 0.8776, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.2066153287887573, + "rewards/margins": -0.26936572790145874, + "rewards/rejected": -0.9372495412826538, + "step": 604 + }, + { + "epoch": 0.79, + "learning_rate": 4.373285940465948e-05, + "logits/chosen": -2.6372122764587402, + "logits/rejected": -2.668998956680298, + "logps/chosen": -201.66514587402344, + "logps/rejected": -199.07876586914062, + "loss": 0.7134, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.0678907632827759, + "rewards/margins": 0.08854639530181885, + "rewards/rejected": -1.1564371585845947, + "step": 605 + }, + { + "epoch": 0.79, + "learning_rate": 4.370911289205518e-05, + "logits/chosen": -2.638387680053711, + "logits/rejected": -2.5716373920440674, + "logps/chosen": -205.70352172851562, + "logps/rejected": -193.94918823242188, + "loss": 0.8545, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4150516986846924, + "rewards/margins": -0.1472320556640625, + "rewards/rejected": -1.2678195238113403, + "step": 606 + }, + { + "epoch": 0.79, + "learning_rate": 4.368532794932854e-05, + "logits/chosen": -2.53859543800354, + "logits/rejected": -2.731680154800415, + "logps/chosen": -164.7539520263672, + "logps/rejected": -206.26397705078125, + "loss": 0.7565, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2516262531280518, + "rewards/margins": 0.0020126476883888245, + "rewards/rejected": -1.2536388635635376, + "step": 607 + }, + { + "epoch": 0.8, + "learning_rate": 4.366150462533588e-05, + "logits/chosen": -2.729710340499878, + "logits/rejected": -2.6896419525146484, + "logps/chosen": -191.62673950195312, + "logps/rejected": -183.93675231933594, + "loss": 0.7964, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.2322921752929688, + "rewards/margins": -0.07104435563087463, + "rewards/rejected": -1.1612478494644165, + "step": 608 + }, + { + "epoch": 0.8, + "learning_rate": 4.363764296901234e-05, + "logits/chosen": -2.5539419651031494, + "logits/rejected": -2.580127239227295, + "logps/chosen": -188.94125366210938, + "logps/rejected": -215.50633239746094, + "loss": 0.665, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.0811361074447632, + "rewards/margins": 0.14258897304534912, + "rewards/rejected": -1.2237250804901123, + "step": 609 + }, + { + "epoch": 0.8, + "learning_rate": 4.361374302937182e-05, + "logits/chosen": -2.505577802658081, + "logits/rejected": -2.5432798862457275, + "logps/chosen": -220.49464416503906, + "logps/rejected": -237.48265075683594, + "loss": 0.7922, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.2271060943603516, + "rewards/margins": -0.0697195902466774, + "rewards/rejected": -1.1573865413665771, + "step": 610 + }, + { + "epoch": 0.8, + "learning_rate": 4.358980485550683e-05, + "logits/chosen": -2.7005603313446045, + "logits/rejected": -2.764317274093628, + "logps/chosen": -191.95233154296875, + "logps/rejected": -204.19862365722656, + "loss": 0.5799, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1303952932357788, + "rewards/margins": 0.4044473469257355, + "rewards/rejected": -1.5348427295684814, + "step": 611 + }, + { + "epoch": 0.8, + "learning_rate": 4.356582849658845e-05, + "logits/chosen": -2.6627776622772217, + "logits/rejected": -2.712620258331299, + "logps/chosen": -246.655517578125, + "logps/rejected": -261.1457214355469, + "loss": 0.737, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.0685715675354004, + "rewards/margins": 0.14301545917987823, + "rewards/rejected": -1.2115869522094727, + "step": 612 + }, + { + "epoch": 0.8, + "learning_rate": 4.354181400186617e-05, + "logits/chosen": -2.617241382598877, + "logits/rejected": -2.6537880897521973, + "logps/chosen": -179.02081298828125, + "logps/rejected": -185.8905792236328, + "loss": 0.6874, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0464906692504883, + "rewards/margins": 0.13806383311748505, + "rewards/rejected": -1.1845545768737793, + "step": 613 + }, + { + "epoch": 0.8, + "learning_rate": 4.351776142066782e-05, + "logits/chosen": -2.577658176422119, + "logits/rejected": -2.62485933303833, + "logps/chosen": -187.30638122558594, + "logps/rejected": -213.94476318359375, + "loss": 0.6266, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8167380094528198, + "rewards/margins": 0.23687534034252167, + "rewards/rejected": -1.053613305091858, + "step": 614 + }, + { + "epoch": 0.8, + "learning_rate": 4.349367080239946e-05, + "logits/chosen": -2.627676248550415, + "logits/rejected": -2.682474136352539, + "logps/chosen": -146.130615234375, + "logps/rejected": -170.17799377441406, + "loss": 0.5688, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8195935487747192, + "rewards/margins": 0.3726154565811157, + "rewards/rejected": -1.1922091245651245, + "step": 615 + }, + { + "epoch": 0.81, + "learning_rate": 4.34695421965453e-05, + "logits/chosen": -2.5791420936584473, + "logits/rejected": -2.5302531719207764, + "logps/chosen": -175.85426330566406, + "logps/rejected": -170.61648559570312, + "loss": 0.7291, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.085453748703003, + "rewards/margins": -0.03215315565466881, + "rewards/rejected": -1.0533006191253662, + "step": 616 + }, + { + "epoch": 0.81, + "learning_rate": 4.344537565266755e-05, + "logits/chosen": -2.757708787918091, + "logits/rejected": -2.792357921600342, + "logps/chosen": -217.09375, + "logps/rejected": -204.7152099609375, + "loss": 0.6347, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9034044742584229, + "rewards/margins": 0.4320634603500366, + "rewards/rejected": -1.335468053817749, + "step": 617 + }, + { + "epoch": 0.81, + "learning_rate": 4.342117122040637e-05, + "logits/chosen": -2.6127302646636963, + "logits/rejected": -2.6188175678253174, + "logps/chosen": -166.92210388183594, + "logps/rejected": -188.92709350585938, + "loss": 0.7474, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.006317138671875, + "rewards/margins": -0.026082061231136322, + "rewards/rejected": -0.9802349805831909, + "step": 618 + }, + { + "epoch": 0.81, + "learning_rate": 4.339692894947974e-05, + "logits/chosen": -2.401575803756714, + "logits/rejected": -2.467298984527588, + "logps/chosen": -161.87429809570312, + "logps/rejected": -187.6583251953125, + "loss": 0.6542, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7750260829925537, + "rewards/margins": 0.2767452895641327, + "rewards/rejected": -1.0517714023590088, + "step": 619 + }, + { + "epoch": 0.81, + "learning_rate": 4.3372648889683364e-05, + "logits/chosen": -2.485372543334961, + "logits/rejected": -2.494091510772705, + "logps/chosen": -135.72772216796875, + "logps/rejected": -164.1346435546875, + "loss": 0.5291, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5809693336486816, + "rewards/margins": 0.3957809805870056, + "rewards/rejected": -0.9767501354217529, + "step": 620 + }, + { + "epoch": 0.81, + "learning_rate": 4.334833109089057e-05, + "logits/chosen": -2.7087979316711426, + "logits/rejected": -2.6643896102905273, + "logps/chosen": -227.6691131591797, + "logps/rejected": -246.98077392578125, + "loss": 0.6771, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9124298095703125, + "rewards/margins": 0.15321853756904602, + "rewards/rejected": -1.0656484365463257, + "step": 621 + }, + { + "epoch": 0.81, + "learning_rate": 4.33239756030522e-05, + "logits/chosen": -2.490464687347412, + "logits/rejected": -2.57242488861084, + "logps/chosen": -236.5983428955078, + "logps/rejected": -217.5277862548828, + "loss": 0.6671, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7107295393943787, + "rewards/margins": 0.13387471437454224, + "rewards/rejected": -0.8446043133735657, + "step": 622 + }, + { + "epoch": 0.82, + "learning_rate": 4.329958247619651e-05, + "logits/chosen": -2.6979587078094482, + "logits/rejected": -2.720431089401245, + "logps/chosen": -152.257080078125, + "logps/rejected": -158.9082489013672, + "loss": 0.5986, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.735335648059845, + "rewards/margins": 0.4063275456428528, + "rewards/rejected": -1.1416633129119873, + "step": 623 + }, + { + "epoch": 0.82, + "learning_rate": 4.3275151760429075e-05, + "logits/chosen": -2.494333267211914, + "logits/rejected": -2.522716522216797, + "logps/chosen": -171.63621520996094, + "logps/rejected": -163.959716796875, + "loss": 0.6896, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.7268607020378113, + "rewards/margins": 0.06785643100738525, + "rewards/rejected": -0.7947170734405518, + "step": 624 + }, + { + "epoch": 0.82, + "learning_rate": 4.325068350593268e-05, + "logits/chosen": -2.4268102645874023, + "logits/rejected": -2.4749088287353516, + "logps/chosen": -178.3427734375, + "logps/rejected": -186.25009155273438, + "loss": 0.6225, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5268755555152893, + "rewards/margins": 0.207400843501091, + "rewards/rejected": -0.7342764139175415, + "step": 625 + }, + { + "epoch": 0.82, + "learning_rate": 4.322617776296723e-05, + "logits/chosen": -2.6472067832946777, + "logits/rejected": -2.662853717803955, + "logps/chosen": -194.68817138671875, + "logps/rejected": -203.64810180664062, + "loss": 0.7129, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7313871383666992, + "rewards/margins": 0.030208323150873184, + "rewards/rejected": -0.7615953683853149, + "step": 626 + }, + { + "epoch": 0.82, + "learning_rate": 4.320163458186961e-05, + "logits/chosen": -2.5693845748901367, + "logits/rejected": -2.5331075191497803, + "logps/chosen": -169.76651000976562, + "logps/rejected": -178.60678100585938, + "loss": 0.658, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8332303762435913, + "rewards/margins": 0.1909269392490387, + "rewards/rejected": -1.0241572856903076, + "step": 627 + }, + { + "epoch": 0.82, + "learning_rate": 4.317705401305362e-05, + "logits/chosen": -2.318687915802002, + "logits/rejected": -2.345287322998047, + "logps/chosen": -218.33798217773438, + "logps/rejected": -217.30389404296875, + "loss": 0.6021, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8331419825553894, + "rewards/margins": 0.3256155550479889, + "rewards/rejected": -1.1587576866149902, + "step": 628 + }, + { + "epoch": 0.82, + "learning_rate": 4.315243610700986e-05, + "logits/chosen": -2.685421943664551, + "logits/rejected": -2.777416944503784, + "logps/chosen": -217.2689971923828, + "logps/rejected": -223.85684204101562, + "loss": 0.6324, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0812008380889893, + "rewards/margins": 0.19831162691116333, + "rewards/rejected": -1.2795124053955078, + "step": 629 + }, + { + "epoch": 0.82, + "learning_rate": 4.312778091430563e-05, + "logits/chosen": -2.4046859741210938, + "logits/rejected": -2.7304351329803467, + "logps/chosen": -157.87550354003906, + "logps/rejected": -225.56103515625, + "loss": 0.6807, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9263030886650085, + "rewards/margins": 0.20371997356414795, + "rewards/rejected": -1.1300231218338013, + "step": 630 + }, + { + "epoch": 0.83, + "learning_rate": 4.310308848558479e-05, + "logits/chosen": -2.4605460166931152, + "logits/rejected": -2.577059030532837, + "logps/chosen": -147.3010711669922, + "logps/rejected": -183.1886444091797, + "loss": 0.5799, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6091120839118958, + "rewards/margins": 0.33578452467918396, + "rewards/rejected": -0.9448965787887573, + "step": 631 + }, + { + "epoch": 0.83, + "learning_rate": 4.3078358871567706e-05, + "logits/chosen": -2.5022783279418945, + "logits/rejected": -2.6256165504455566, + "logps/chosen": -154.47483825683594, + "logps/rejected": -203.1712188720703, + "loss": 0.696, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8408339023590088, + "rewards/margins": 0.0674683153629303, + "rewards/rejected": -0.9083021879196167, + "step": 632 + }, + { + "epoch": 0.83, + "learning_rate": 4.305359212305115e-05, + "logits/chosen": -2.570892095565796, + "logits/rejected": -2.6330907344818115, + "logps/chosen": -152.4366455078125, + "logps/rejected": -172.27908325195312, + "loss": 0.6883, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6601619124412537, + "rewards/margins": 0.09768363833427429, + "rewards/rejected": -0.7578455209732056, + "step": 633 + }, + { + "epoch": 0.83, + "learning_rate": 4.302878829090813e-05, + "logits/chosen": -2.5576834678649902, + "logits/rejected": -2.6245827674865723, + "logps/chosen": -217.8241424560547, + "logps/rejected": -200.61273193359375, + "loss": 0.7646, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.9752216339111328, + "rewards/margins": -0.06534964591264725, + "rewards/rejected": -0.9098719358444214, + "step": 634 + }, + { + "epoch": 0.83, + "learning_rate": 4.300394742608784e-05, + "logits/chosen": -2.4099199771881104, + "logits/rejected": -2.4199118614196777, + "logps/chosen": -137.7400665283203, + "logps/rejected": -167.20249938964844, + "loss": 0.6885, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.8331056833267212, + "rewards/margins": 0.23110516369342804, + "rewards/rejected": -1.0642107725143433, + "step": 635 + }, + { + "epoch": 0.83, + "learning_rate": 4.2979069579615564e-05, + "logits/chosen": -2.5262954235076904, + "logits/rejected": -2.6143667697906494, + "logps/chosen": -173.13040161132812, + "logps/rejected": -237.11602783203125, + "loss": 0.7272, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6415895819664001, + "rewards/margins": 0.039109162986278534, + "rewards/rejected": -0.6806987524032593, + "step": 636 + }, + { + "epoch": 0.83, + "learning_rate": 4.2954154802592514e-05, + "logits/chosen": -2.4960150718688965, + "logits/rejected": -2.578279733657837, + "logps/chosen": -139.1173095703125, + "logps/rejected": -168.22586059570312, + "loss": 0.6952, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.9741197824478149, + "rewards/margins": 0.2649148106575012, + "rewards/rejected": -1.2390345335006714, + "step": 637 + }, + { + "epoch": 0.84, + "learning_rate": 4.292920314619578e-05, + "logits/chosen": -2.573779821395874, + "logits/rejected": -2.5826001167297363, + "logps/chosen": -200.8004913330078, + "logps/rejected": -207.52462768554688, + "loss": 0.7511, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8658602833747864, + "rewards/margins": -0.03227938339114189, + "rewards/rejected": -0.8335809707641602, + "step": 638 + }, + { + "epoch": 0.84, + "learning_rate": 4.290421466167822e-05, + "logits/chosen": -2.225033760070801, + "logits/rejected": -2.293822765350342, + "logps/chosen": -223.53115844726562, + "logps/rejected": -244.39378356933594, + "loss": 0.6364, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5871797204017639, + "rewards/margins": 0.15462249517440796, + "rewards/rejected": -0.7418022155761719, + "step": 639 + }, + { + "epoch": 0.84, + "learning_rate": 4.2879189400368314e-05, + "logits/chosen": -2.5370843410491943, + "logits/rejected": -2.6005024909973145, + "logps/chosen": -185.48890686035156, + "logps/rejected": -200.16712951660156, + "loss": 0.6332, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8029256463050842, + "rewards/margins": 0.19223058223724365, + "rewards/rejected": -0.9951562881469727, + "step": 640 + }, + { + "epoch": 0.84, + "learning_rate": 4.2854127413670096e-05, + "logits/chosen": -2.5340185165405273, + "logits/rejected": -2.466094732284546, + "logps/chosen": -166.66929626464844, + "logps/rejected": -157.72235107421875, + "loss": 0.9073, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.1363290548324585, + "rewards/margins": -0.30132997035980225, + "rewards/rejected": -0.8349990248680115, + "step": 641 + }, + { + "epoch": 0.84, + "learning_rate": 4.282902875306304e-05, + "logits/chosen": -2.4717962741851807, + "logits/rejected": -2.5328433513641357, + "logps/chosen": -135.24365234375, + "logps/rejected": -178.5400390625, + "loss": 0.6674, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.42678678035736084, + "rewards/margins": 0.11809446662664413, + "rewards/rejected": -0.5448811650276184, + "step": 642 + }, + { + "epoch": 0.84, + "learning_rate": 4.280389347010194e-05, + "logits/chosen": -2.51080584526062, + "logits/rejected": -2.6007397174835205, + "logps/chosen": -160.39112854003906, + "logps/rejected": -168.5418243408203, + "loss": 0.7279, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6518469452857971, + "rewards/margins": 0.04480691999197006, + "rewards/rejected": -0.6966539025306702, + "step": 643 + }, + { + "epoch": 0.84, + "learning_rate": 4.277872161641682e-05, + "logits/chosen": -2.3960509300231934, + "logits/rejected": -2.51719331741333, + "logps/chosen": -172.503662109375, + "logps/rejected": -211.3986053466797, + "loss": 0.6125, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6145063638687134, + "rewards/margins": 0.2776188254356384, + "rewards/rejected": -0.892125129699707, + "step": 644 + }, + { + "epoch": 0.84, + "learning_rate": 4.275351324371283e-05, + "logits/chosen": -2.6039271354675293, + "logits/rejected": -2.669445037841797, + "logps/chosen": -167.97862243652344, + "logps/rejected": -215.40682983398438, + "loss": 0.5704, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5948019027709961, + "rewards/margins": 0.3271142840385437, + "rewards/rejected": -0.921916127204895, + "step": 645 + }, + { + "epoch": 0.85, + "learning_rate": 4.2728268403770145e-05, + "logits/chosen": -2.222475528717041, + "logits/rejected": -2.220109462738037, + "logps/chosen": -179.1140594482422, + "logps/rejected": -203.96224975585938, + "loss": 0.6496, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5808567404747009, + "rewards/margins": 0.15382859110832214, + "rewards/rejected": -0.7346853017807007, + "step": 646 + }, + { + "epoch": 0.85, + "learning_rate": 4.270298714844381e-05, + "logits/chosen": -2.5251059532165527, + "logits/rejected": -2.5589253902435303, + "logps/chosen": -173.88092041015625, + "logps/rejected": -178.63037109375, + "loss": 0.6081, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7299823760986328, + "rewards/margins": 0.28834477066993713, + "rewards/rejected": -1.018327236175537, + "step": 647 + }, + { + "epoch": 0.85, + "learning_rate": 4.267766952966369e-05, + "logits/chosen": -2.281771421432495, + "logits/rejected": -2.4591827392578125, + "logps/chosen": -148.9844207763672, + "logps/rejected": -189.53805541992188, + "loss": 0.5192, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5873172879219055, + "rewards/margins": 0.48181021213531494, + "rewards/rejected": -1.0691275596618652, + "step": 648 + }, + { + "epoch": 0.85, + "learning_rate": 4.2652315599434354e-05, + "logits/chosen": -2.4655325412750244, + "logits/rejected": -2.5449090003967285, + "logps/chosen": -154.69180297851562, + "logps/rejected": -193.77456665039062, + "loss": 0.6647, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7018711566925049, + "rewards/margins": 0.11128158867359161, + "rewards/rejected": -0.8131527900695801, + "step": 649 + }, + { + "epoch": 0.85, + "learning_rate": 4.262692540983496e-05, + "logits/chosen": -2.4712820053100586, + "logits/rejected": -2.3742597103118896, + "logps/chosen": -190.89785766601562, + "logps/rejected": -224.13925170898438, + "loss": 0.7112, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.897792398929596, + "rewards/margins": 0.05793970078229904, + "rewards/rejected": -0.9557321667671204, + "step": 650 + }, + { + "epoch": 0.85, + "learning_rate": 4.2601499013019126e-05, + "logits/chosen": -2.483391284942627, + "logits/rejected": -2.4832870960235596, + "logps/chosen": -173.18243408203125, + "logps/rejected": -187.56520080566406, + "loss": 0.7033, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6757019758224487, + "rewards/margins": 0.04309311881661415, + "rewards/rejected": -0.7187950611114502, + "step": 651 + }, + { + "epoch": 0.85, + "learning_rate": 4.257603646121484e-05, + "logits/chosen": -2.5267128944396973, + "logits/rejected": -2.5165576934814453, + "logps/chosen": -165.43246459960938, + "logps/rejected": -212.12628173828125, + "loss": 0.6429, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6913148164749146, + "rewards/margins": 0.17455829679965973, + "rewards/rejected": -0.8658731579780579, + "step": 652 + }, + { + "epoch": 0.85, + "learning_rate": 4.2550537806724384e-05, + "logits/chosen": -2.3866825103759766, + "logits/rejected": -2.4320545196533203, + "logps/chosen": -175.04782104492188, + "logps/rejected": -203.4820556640625, + "loss": 0.5674, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6151914000511169, + "rewards/margins": 0.4012424945831299, + "rewards/rejected": -1.0164339542388916, + "step": 653 + }, + { + "epoch": 0.86, + "learning_rate": 4.2525003101924164e-05, + "logits/chosen": -2.5912859439849854, + "logits/rejected": -2.576547861099243, + "logps/chosen": -219.98446655273438, + "logps/rejected": -202.2375946044922, + "loss": 0.7016, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9261603355407715, + "rewards/margins": 0.16095884144306183, + "rewards/rejected": -1.0871191024780273, + "step": 654 + }, + { + "epoch": 0.86, + "learning_rate": 4.249943239926467e-05, + "logits/chosen": -2.648683786392212, + "logits/rejected": -2.5860180854797363, + "logps/chosen": -175.8250732421875, + "logps/rejected": -156.6357879638672, + "loss": 0.7841, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6031679511070251, + "rewards/margins": -0.12085067480802536, + "rewards/rejected": -0.4823172092437744, + "step": 655 + }, + { + "epoch": 0.86, + "learning_rate": 4.247382575127031e-05, + "logits/chosen": -2.474801778793335, + "logits/rejected": -2.3887550830841064, + "logps/chosen": -198.35394287109375, + "logps/rejected": -172.00584411621094, + "loss": 0.7847, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.0092177391052246, + "rewards/margins": -0.09214206784963608, + "rewards/rejected": -0.9170756340026855, + "step": 656 + }, + { + "epoch": 0.86, + "learning_rate": 4.2448183210539334e-05, + "logits/chosen": -2.392206907272339, + "logits/rejected": -2.489854335784912, + "logps/chosen": -174.3745574951172, + "logps/rejected": -220.22213745117188, + "loss": 0.5326, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5829483270645142, + "rewards/margins": 0.5717104077339172, + "rewards/rejected": -1.1546587944030762, + "step": 657 + }, + { + "epoch": 0.86, + "learning_rate": 4.2422504829743724e-05, + "logits/chosen": -2.61334228515625, + "logits/rejected": -2.6003055572509766, + "logps/chosen": -213.92164611816406, + "logps/rejected": -232.10540771484375, + "loss": 0.6787, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8696513175964355, + "rewards/margins": 0.26525798439979553, + "rewards/rejected": -1.1349092721939087, + "step": 658 + }, + { + "epoch": 0.86, + "learning_rate": 4.239679066162907e-05, + "logits/chosen": -2.529151678085327, + "logits/rejected": -2.584458351135254, + "logps/chosen": -160.20848083496094, + "logps/rejected": -203.44912719726562, + "loss": 0.6072, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5726284980773926, + "rewards/margins": 0.27831003069877625, + "rewards/rejected": -0.8509385585784912, + "step": 659 + }, + { + "epoch": 0.86, + "learning_rate": 4.237104075901449e-05, + "logits/chosen": -2.543184995651245, + "logits/rejected": -2.5258021354675293, + "logps/chosen": -180.32992553710938, + "logps/rejected": -199.39439392089844, + "loss": 0.6992, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.811248779296875, + "rewards/margins": 0.16956302523612976, + "rewards/rejected": -0.9808117747306824, + "step": 660 + }, + { + "epoch": 0.87, + "learning_rate": 4.234525517479248e-05, + "logits/chosen": -2.471468925476074, + "logits/rejected": -2.4575130939483643, + "logps/chosen": -172.39584350585938, + "logps/rejected": -185.70445251464844, + "loss": 0.7311, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.81290203332901, + "rewards/margins": 0.009819276630878448, + "rewards/rejected": -0.8227213025093079, + "step": 661 + }, + { + "epoch": 0.87, + "learning_rate": 4.2319433961928844e-05, + "logits/chosen": -2.608738899230957, + "logits/rejected": -2.4687836170196533, + "logps/chosen": -185.58847045898438, + "logps/rejected": -162.63841247558594, + "loss": 0.7846, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.7829579710960388, + "rewards/margins": -0.0719057098031044, + "rewards/rejected": -0.7110522985458374, + "step": 662 + }, + { + "epoch": 0.87, + "learning_rate": 4.229357717346257e-05, + "logits/chosen": -2.5522217750549316, + "logits/rejected": -2.489596366882324, + "logps/chosen": -203.3766326904297, + "logps/rejected": -254.9163055419922, + "loss": 0.722, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7322261333465576, + "rewards/margins": 0.030108261853456497, + "rewards/rejected": -0.7623343467712402, + "step": 663 + }, + { + "epoch": 0.87, + "learning_rate": 4.226768486250572e-05, + "logits/chosen": -2.624052047729492, + "logits/rejected": -2.6371800899505615, + "logps/chosen": -184.49551391601562, + "logps/rejected": -204.43121337890625, + "loss": 0.7533, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8451451659202576, + "rewards/margins": -0.06043161079287529, + "rewards/rejected": -0.7847135663032532, + "step": 664 + }, + { + "epoch": 0.87, + "learning_rate": 4.224175708224332e-05, + "logits/chosen": -2.5530622005462646, + "logits/rejected": -2.638960123062134, + "logps/chosen": -176.18704223632812, + "logps/rejected": -174.23574829101562, + "loss": 0.7083, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5662491321563721, + "rewards/margins": 0.15944433212280273, + "rewards/rejected": -0.72569340467453, + "step": 665 + }, + { + "epoch": 0.87, + "learning_rate": 4.221579388593326e-05, + "logits/chosen": -2.688939094543457, + "logits/rejected": -2.66377854347229, + "logps/chosen": -160.33746337890625, + "logps/rejected": -172.418212890625, + "loss": 0.6331, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5651705265045166, + "rewards/margins": 0.20773081481456757, + "rewards/rejected": -0.7729012966156006, + "step": 666 + }, + { + "epoch": 0.87, + "learning_rate": 4.218979532690616e-05, + "logits/chosen": -2.484663248062134, + "logits/rejected": -2.424830675125122, + "logps/chosen": -191.41098022460938, + "logps/rejected": -164.51815795898438, + "loss": 0.8708, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.9628737568855286, + "rewards/margins": -0.2227509766817093, + "rewards/rejected": -0.7401228547096252, + "step": 667 + }, + { + "epoch": 0.87, + "learning_rate": 4.216376145856529e-05, + "logits/chosen": -2.4857797622680664, + "logits/rejected": -2.4634532928466797, + "logps/chosen": -185.9803009033203, + "logps/rejected": -229.830078125, + "loss": 0.8148, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8997754454612732, + "rewards/margins": -0.06498207151889801, + "rewards/rejected": -0.8347933888435364, + "step": 668 + }, + { + "epoch": 0.88, + "learning_rate": 4.213769233438646e-05, + "logits/chosen": -2.5399997234344482, + "logits/rejected": -2.670645236968994, + "logps/chosen": -160.7857666015625, + "logps/rejected": -172.67477416992188, + "loss": 0.7154, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.5905480980873108, + "rewards/margins": -0.010487288236618042, + "rewards/rejected": -0.5800608396530151, + "step": 669 + }, + { + "epoch": 0.88, + "learning_rate": 4.211158800791788e-05, + "logits/chosen": -2.584531784057617, + "logits/rejected": -2.6492509841918945, + "logps/chosen": -179.07191467285156, + "logps/rejected": -236.59219360351562, + "loss": 0.632, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8085418343544006, + "rewards/margins": 0.2305898368358612, + "rewards/rejected": -1.0391316413879395, + "step": 670 + }, + { + "epoch": 0.88, + "learning_rate": 4.208544853278008e-05, + "logits/chosen": -2.7273218631744385, + "logits/rejected": -2.674079656600952, + "logps/chosen": -194.35113525390625, + "logps/rejected": -203.67144775390625, + "loss": 0.7744, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7947997450828552, + "rewards/margins": -0.0895846039056778, + "rewards/rejected": -0.7052150964736938, + "step": 671 + }, + { + "epoch": 0.88, + "learning_rate": 4.205927396266577e-05, + "logits/chosen": -2.5353407859802246, + "logits/rejected": -2.473275899887085, + "logps/chosen": -189.8173828125, + "logps/rejected": -172.57937622070312, + "loss": 0.7574, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.7492716312408447, + "rewards/margins": -0.07650469243526459, + "rewards/rejected": -0.672766923904419, + "step": 672 + }, + { + "epoch": 0.88, + "learning_rate": 4.203306435133978e-05, + "logits/chosen": -2.4454925060272217, + "logits/rejected": -2.436471462249756, + "logps/chosen": -142.18865966796875, + "logps/rejected": -181.4835968017578, + "loss": 0.5801, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.555526077747345, + "rewards/margins": 0.41016635298728943, + "rewards/rejected": -0.9656924605369568, + "step": 673 + }, + { + "epoch": 0.88, + "learning_rate": 4.200681975263888e-05, + "logits/chosen": -2.5019893646240234, + "logits/rejected": -2.5508639812469482, + "logps/chosen": -164.3115997314453, + "logps/rejected": -172.49722290039062, + "loss": 0.7833, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.6514595746994019, + "rewards/margins": -0.11908778548240662, + "rewards/rejected": -0.5323717594146729, + "step": 674 + }, + { + "epoch": 0.88, + "learning_rate": 4.1980540220471744e-05, + "logits/chosen": -2.5552783012390137, + "logits/rejected": -2.547940731048584, + "logps/chosen": -187.34030151367188, + "logps/rejected": -205.068115234375, + "loss": 0.7329, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6662761569023132, + "rewards/margins": 0.10254745930433273, + "rewards/rejected": -0.7688236236572266, + "step": 675 + }, + { + "epoch": 0.88, + "learning_rate": 4.195422580881878e-05, + "logits/chosen": -2.5980396270751953, + "logits/rejected": -2.633117437362671, + "logps/chosen": -169.28765869140625, + "logps/rejected": -175.1426544189453, + "loss": 0.783, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.738059937953949, + "rewards/margins": -0.070980504155159, + "rewards/rejected": -0.6670793890953064, + "step": 676 + }, + { + "epoch": 0.89, + "learning_rate": 4.192787657173204e-05, + "logits/chosen": -2.5799622535705566, + "logits/rejected": -2.64198637008667, + "logps/chosen": -169.6929931640625, + "logps/rejected": -194.3636932373047, + "loss": 0.5487, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3179047703742981, + "rewards/margins": 0.39317452907562256, + "rewards/rejected": -0.7110792398452759, + "step": 677 + }, + { + "epoch": 0.89, + "learning_rate": 4.1901492563335115e-05, + "logits/chosen": -2.402824878692627, + "logits/rejected": -2.479947328567505, + "logps/chosen": -168.98193359375, + "logps/rejected": -200.73716735839844, + "loss": 0.7085, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7477744221687317, + "rewards/margins": 0.06644614040851593, + "rewards/rejected": -0.8142206072807312, + "step": 678 + }, + { + "epoch": 0.89, + "learning_rate": 4.187507383782303e-05, + "logits/chosen": -2.6026642322540283, + "logits/rejected": -2.6655664443969727, + "logps/chosen": -158.02281188964844, + "logps/rejected": -161.2207489013672, + "loss": 0.6549, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7262417078018188, + "rewards/margins": 0.15959057211875916, + "rewards/rejected": -0.8858322501182556, + "step": 679 + }, + { + "epoch": 0.89, + "learning_rate": 4.1848620449462115e-05, + "logits/chosen": -2.683466911315918, + "logits/rejected": -2.7357425689697266, + "logps/chosen": -183.96287536621094, + "logps/rejected": -185.6376190185547, + "loss": 0.6963, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7100127935409546, + "rewards/margins": 0.05447208136320114, + "rewards/rejected": -0.7644848823547363, + "step": 680 + }, + { + "epoch": 0.89, + "learning_rate": 4.1822132452589885e-05, + "logits/chosen": -2.594881057739258, + "logits/rejected": -2.6565728187561035, + "logps/chosen": -182.09751892089844, + "logps/rejected": -229.0118408203125, + "loss": 0.6154, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.530844509601593, + "rewards/margins": 0.22644518315792084, + "rewards/rejected": -0.757289707660675, + "step": 681 + }, + { + "epoch": 0.89, + "learning_rate": 4.1795609901614966e-05, + "logits/chosen": -2.451526641845703, + "logits/rejected": -2.444248914718628, + "logps/chosen": -159.2547607421875, + "logps/rejected": -141.80270385742188, + "loss": 0.7846, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.6390894651412964, + "rewards/margins": -0.018851161003112793, + "rewards/rejected": -0.6202382445335388, + "step": 682 + }, + { + "epoch": 0.89, + "learning_rate": 4.176905285101695e-05, + "logits/chosen": -2.7691075801849365, + "logits/rejected": -2.7721362113952637, + "logps/chosen": -177.64317321777344, + "logps/rejected": -199.67703247070312, + "loss": 0.6667, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5808727145195007, + "rewards/margins": 0.10210245847702026, + "rewards/rejected": -0.682975172996521, + "step": 683 + }, + { + "epoch": 0.9, + "learning_rate": 4.17424613553463e-05, + "logits/chosen": -2.6382815837860107, + "logits/rejected": -2.6818408966064453, + "logps/chosen": -199.1725616455078, + "logps/rejected": -222.76007080078125, + "loss": 0.7032, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7650430202484131, + "rewards/margins": 0.04612483084201813, + "rewards/rejected": -0.8111678957939148, + "step": 684 + }, + { + "epoch": 0.9, + "learning_rate": 4.171583546922423e-05, + "logits/chosen": -2.702162504196167, + "logits/rejected": -2.6568708419799805, + "logps/chosen": -196.53262329101562, + "logps/rejected": -185.51022338867188, + "loss": 0.7123, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7624577283859253, + "rewards/margins": 0.01695454865694046, + "rewards/rejected": -0.7794123291969299, + "step": 685 + }, + { + "epoch": 0.9, + "learning_rate": 4.1689175247342584e-05, + "logits/chosen": -2.6585752964019775, + "logits/rejected": -2.6265594959259033, + "logps/chosen": -184.20936584472656, + "logps/rejected": -177.85145568847656, + "loss": 0.8148, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6442875266075134, + "rewards/margins": -0.16329364478588104, + "rewards/rejected": -0.48099392652511597, + "step": 686 + }, + { + "epoch": 0.9, + "learning_rate": 4.1662480744463744e-05, + "logits/chosen": -2.4449877738952637, + "logits/rejected": -2.6396286487579346, + "logps/chosen": -160.87420654296875, + "logps/rejected": -172.5674285888672, + "loss": 0.6514, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6140396595001221, + "rewards/margins": 0.16129107773303986, + "rewards/rejected": -0.7753307223320007, + "step": 687 + }, + { + "epoch": 0.9, + "learning_rate": 4.163575201542052e-05, + "logits/chosen": -2.6318554878234863, + "logits/rejected": -2.7358765602111816, + "logps/chosen": -158.69976806640625, + "logps/rejected": -195.73008728027344, + "loss": 0.5346, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.351512610912323, + "rewards/margins": 0.4037948548793793, + "rewards/rejected": -0.7553074955940247, + "step": 688 + }, + { + "epoch": 0.9, + "learning_rate": 4.1608989115116e-05, + "logits/chosen": -2.5840697288513184, + "logits/rejected": -2.585153579711914, + "logps/chosen": -170.15512084960938, + "logps/rejected": -178.3724365234375, + "loss": 0.7501, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7801914811134338, + "rewards/margins": -0.008921336382627487, + "rewards/rejected": -0.7712701559066772, + "step": 689 + }, + { + "epoch": 0.9, + "learning_rate": 4.158219209852349e-05, + "logits/chosen": -2.6560463905334473, + "logits/rejected": -2.6988308429718018, + "logps/chosen": -175.9833526611328, + "logps/rejected": -184.16749572753906, + "loss": 0.6545, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.49359026551246643, + "rewards/margins": 0.11187607795000076, + "rewards/rejected": -0.605466365814209, + "step": 690 + }, + { + "epoch": 0.9, + "learning_rate": 4.155536102068636e-05, + "logits/chosen": -2.5828936100006104, + "logits/rejected": -2.6212708950042725, + "logps/chosen": -181.11277770996094, + "logps/rejected": -201.1419219970703, + "loss": 0.8704, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.7604485750198364, + "rewards/margins": -0.27131497859954834, + "rewards/rejected": -0.4891335368156433, + "step": 691 + }, + { + "epoch": 0.91, + "learning_rate": 4.152849593671793e-05, + "logits/chosen": -2.716947078704834, + "logits/rejected": -2.65468168258667, + "logps/chosen": -193.97682189941406, + "logps/rejected": -207.4134521484375, + "loss": 0.7631, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6618382930755615, + "rewards/margins": -0.0877380520105362, + "rewards/rejected": -0.5741002559661865, + "step": 692 + }, + { + "epoch": 0.91, + "learning_rate": 4.1501596901801384e-05, + "logits/chosen": -2.557304620742798, + "logits/rejected": -2.6196579933166504, + "logps/chosen": -181.52548217773438, + "logps/rejected": -195.8028564453125, + "loss": 0.7537, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6008011698722839, + "rewards/margins": -0.043041955679655075, + "rewards/rejected": -0.5577592253684998, + "step": 693 + }, + { + "epoch": 0.91, + "learning_rate": 4.147466397118968e-05, + "logits/chosen": -2.567335605621338, + "logits/rejected": -2.617478609085083, + "logps/chosen": -179.599853515625, + "logps/rejected": -198.3054656982422, + "loss": 0.6778, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8331121802330017, + "rewards/margins": 0.06144791468977928, + "rewards/rejected": -0.8945600986480713, + "step": 694 + }, + { + "epoch": 0.91, + "learning_rate": 4.144769720020533e-05, + "logits/chosen": -2.576467990875244, + "logits/rejected": -2.8038601875305176, + "logps/chosen": -174.2159423828125, + "logps/rejected": -233.60653686523438, + "loss": 0.6067, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6737346649169922, + "rewards/margins": 0.23855170607566833, + "rewards/rejected": -0.9122863411903381, + "step": 695 + }, + { + "epoch": 0.91, + "learning_rate": 4.142069664424041e-05, + "logits/chosen": -2.678194284439087, + "logits/rejected": -2.6755244731903076, + "logps/chosen": -198.93954467773438, + "logps/rejected": -184.6238555908203, + "loss": 0.723, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5593891143798828, + "rewards/margins": 0.06559957563877106, + "rewards/rejected": -0.6249886751174927, + "step": 696 + }, + { + "epoch": 0.91, + "learning_rate": 4.139366235875637e-05, + "logits/chosen": -2.5291852951049805, + "logits/rejected": -2.5268821716308594, + "logps/chosen": -193.07666015625, + "logps/rejected": -176.4041290283203, + "loss": 0.8631, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.8501958250999451, + "rewards/margins": -0.2449750304222107, + "rewards/rejected": -0.6052207946777344, + "step": 697 + }, + { + "epoch": 0.91, + "learning_rate": 4.136659439928397e-05, + "logits/chosen": -2.7552402019500732, + "logits/rejected": -2.730792760848999, + "logps/chosen": -154.31442260742188, + "logps/rejected": -201.8627166748047, + "loss": 0.635, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6791579723358154, + "rewards/margins": 0.18784941732883453, + "rewards/rejected": -0.867007315158844, + "step": 698 + }, + { + "epoch": 0.91, + "learning_rate": 4.13394928214231e-05, + "logits/chosen": -2.8812167644500732, + "logits/rejected": -2.929877996444702, + "logps/chosen": -226.31317138671875, + "logps/rejected": -223.9871826171875, + "loss": 0.7274, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.759143590927124, + "rewards/margins": 0.025182515382766724, + "rewards/rejected": -0.7843260765075684, + "step": 699 + }, + { + "epoch": 0.92, + "learning_rate": 4.1312357680842735e-05, + "logits/chosen": -2.7592363357543945, + "logits/rejected": -2.787437915802002, + "logps/chosen": -176.06568908691406, + "logps/rejected": -198.5147705078125, + "loss": 0.7985, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.6221618056297302, + "rewards/margins": -0.08630318939685822, + "rewards/rejected": -0.5358585715293884, + "step": 700 + }, + { + "epoch": 0.92, + "learning_rate": 4.128518903328078e-05, + "logits/chosen": -2.5527689456939697, + "logits/rejected": -2.569162368774414, + "logps/chosen": -168.37327575683594, + "logps/rejected": -153.46510314941406, + "loss": 0.7245, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.7768934369087219, + "rewards/margins": -0.018674585968255997, + "rewards/rejected": -0.7582188844680786, + "step": 701 + }, + { + "epoch": 0.92, + "learning_rate": 4.125798693454396e-05, + "logits/chosen": -2.6978816986083984, + "logits/rejected": -2.775778293609619, + "logps/chosen": -181.85899353027344, + "logps/rejected": -186.74415588378906, + "loss": 0.6015, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6942383050918579, + "rewards/margins": 0.28554674983024597, + "rewards/rejected": -0.9797850847244263, + "step": 702 + }, + { + "epoch": 0.92, + "learning_rate": 4.123075144050772e-05, + "logits/chosen": -2.420675754547119, + "logits/rejected": -2.4404735565185547, + "logps/chosen": -187.6428985595703, + "logps/rejected": -196.76223754882812, + "loss": 0.6309, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.48836079239845276, + "rewards/margins": 0.17586764693260193, + "rewards/rejected": -0.6642284393310547, + "step": 703 + }, + { + "epoch": 0.92, + "learning_rate": 4.120348260711611e-05, + "logits/chosen": -2.696930170059204, + "logits/rejected": -2.7356936931610107, + "logps/chosen": -224.2950897216797, + "logps/rejected": -201.0372314453125, + "loss": 0.6719, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5922297239303589, + "rewards/margins": 0.0864262580871582, + "rewards/rejected": -0.6786559224128723, + "step": 704 + }, + { + "epoch": 0.92, + "learning_rate": 4.117618049038165e-05, + "logits/chosen": -2.433835029602051, + "logits/rejected": -2.5175817012786865, + "logps/chosen": -205.09742736816406, + "logps/rejected": -253.45188903808594, + "loss": 0.6352, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5550520420074463, + "rewards/margins": 0.19067110121250153, + "rewards/rejected": -0.7457231879234314, + "step": 705 + }, + { + "epoch": 0.92, + "learning_rate": 4.1148845146385214e-05, + "logits/chosen": -2.7056033611297607, + "logits/rejected": -2.6854336261749268, + "logps/chosen": -176.9462127685547, + "logps/rejected": -163.0907745361328, + "loss": 0.7122, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.585013747215271, + "rewards/margins": 0.12466025352478027, + "rewards/rejected": -0.7096740007400513, + "step": 706 + }, + { + "epoch": 0.93, + "learning_rate": 4.112147663127596e-05, + "logits/chosen": -2.5832924842834473, + "logits/rejected": -2.6641876697540283, + "logps/chosen": -291.26531982421875, + "logps/rejected": -285.40997314453125, + "loss": 0.8873, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.9817371368408203, + "rewards/margins": -0.22740018367767334, + "rewards/rejected": -0.754336953163147, + "step": 707 + }, + { + "epoch": 0.93, + "learning_rate": 4.109407500127116e-05, + "logits/chosen": -2.7471392154693604, + "logits/rejected": -2.6390812397003174, + "logps/chosen": -175.82046508789062, + "logps/rejected": -154.3852081298828, + "loss": 0.7906, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8870856165885925, + "rewards/margins": -0.14796000719070435, + "rewards/rejected": -0.7391257286071777, + "step": 708 + }, + { + "epoch": 0.93, + "learning_rate": 4.106664031265611e-05, + "logits/chosen": -2.671312093734741, + "logits/rejected": -2.690074920654297, + "logps/chosen": -165.29336547851562, + "logps/rejected": -158.92807006835938, + "loss": 0.6831, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6163545846939087, + "rewards/margins": 0.10010550171136856, + "rewards/rejected": -0.716460108757019, + "step": 709 + }, + { + "epoch": 0.93, + "learning_rate": 4.103917262178402e-05, + "logits/chosen": -2.6318283081054688, + "logits/rejected": -2.65425968170166, + "logps/chosen": -161.3974609375, + "logps/rejected": -159.72401428222656, + "loss": 0.674, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6365061402320862, + "rewards/margins": 0.0781688243150711, + "rewards/rejected": -0.7146750688552856, + "step": 710 + }, + { + "epoch": 0.93, + "learning_rate": 4.1011671985075865e-05, + "logits/chosen": -2.6295552253723145, + "logits/rejected": -2.5962119102478027, + "logps/chosen": -183.000732421875, + "logps/rejected": -172.703125, + "loss": 0.7224, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.6528486013412476, + "rewards/margins": -0.025449033826589584, + "rewards/rejected": -0.6273995041847229, + "step": 711 + }, + { + "epoch": 0.93, + "learning_rate": 4.098413845902033e-05, + "logits/chosen": -2.6849284172058105, + "logits/rejected": -2.7957422733306885, + "logps/chosen": -182.01235961914062, + "logps/rejected": -198.17079162597656, + "loss": 0.6462, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.577471911907196, + "rewards/margins": 0.14899027347564697, + "rewards/rejected": -0.7264621257781982, + "step": 712 + }, + { + "epoch": 0.93, + "learning_rate": 4.095657210017364e-05, + "logits/chosen": -2.5923876762390137, + "logits/rejected": -2.6000397205352783, + "logps/chosen": -200.19229125976562, + "logps/rejected": -190.99234008789062, + "loss": 0.7024, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5822942852973938, + "rewards/margins": 0.05264808610081673, + "rewards/rejected": -0.6349424123764038, + "step": 713 + }, + { + "epoch": 0.93, + "learning_rate": 4.092897296515944e-05, + "logits/chosen": -2.501671552658081, + "logits/rejected": -2.496816635131836, + "logps/chosen": -186.364990234375, + "logps/rejected": -185.56118774414062, + "loss": 0.7653, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7025803327560425, + "rewards/margins": -0.02450786530971527, + "rewards/rejected": -0.6780725121498108, + "step": 714 + }, + { + "epoch": 0.94, + "learning_rate": 4.090134111066874e-05, + "logits/chosen": -2.688492774963379, + "logits/rejected": -2.6672098636627197, + "logps/chosen": -168.45094299316406, + "logps/rejected": -199.4703826904297, + "loss": 0.6693, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6713076233863831, + "rewards/margins": 0.13783812522888184, + "rewards/rejected": -0.8091457486152649, + "step": 715 + }, + { + "epoch": 0.94, + "learning_rate": 4.0873676593459725e-05, + "logits/chosen": -2.4400930404663086, + "logits/rejected": -2.5282256603240967, + "logps/chosen": -153.5916748046875, + "logps/rejected": -178.5830841064453, + "loss": 0.6766, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4613954722881317, + "rewards/margins": 0.07974334061145782, + "rewards/rejected": -0.541138768196106, + "step": 716 + }, + { + "epoch": 0.94, + "learning_rate": 4.08459794703577e-05, + "logits/chosen": -2.6582794189453125, + "logits/rejected": -2.626995086669922, + "logps/chosen": -208.72027587890625, + "logps/rejected": -197.92471313476562, + "loss": 0.7412, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6259132027626038, + "rewards/margins": -0.06286411732435226, + "rewards/rejected": -0.5630490779876709, + "step": 717 + }, + { + "epoch": 0.94, + "learning_rate": 4.081824979825492e-05, + "logits/chosen": -2.6991827487945557, + "logits/rejected": -2.7381110191345215, + "logps/chosen": -182.42140197753906, + "logps/rejected": -178.61044311523438, + "loss": 0.6404, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6173000931739807, + "rewards/margins": 0.1515488624572754, + "rewards/rejected": -0.7688489556312561, + "step": 718 + }, + { + "epoch": 0.94, + "learning_rate": 4.07904876341105e-05, + "logits/chosen": -2.5119433403015137, + "logits/rejected": -2.567487955093384, + "logps/chosen": -191.7221221923828, + "logps/rejected": -211.65084838867188, + "loss": 0.6315, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7405194640159607, + "rewards/margins": 0.18211159110069275, + "rewards/rejected": -0.922631025314331, + "step": 719 + }, + { + "epoch": 0.94, + "learning_rate": 4.076269303495033e-05, + "logits/chosen": -2.662774085998535, + "logits/rejected": -2.568493366241455, + "logps/chosen": -183.8392791748047, + "logps/rejected": -171.7985076904297, + "loss": 0.8173, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.6661974787712097, + "rewards/margins": -0.2225557267665863, + "rewards/rejected": -0.4436417520046234, + "step": 720 + }, + { + "epoch": 0.94, + "learning_rate": 4.073486605786689e-05, + "logits/chosen": -2.7839298248291016, + "logits/rejected": -2.884078025817871, + "logps/chosen": -219.7886962890625, + "logps/rejected": -244.15625, + "loss": 0.6198, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7247489094734192, + "rewards/margins": 0.2933644950389862, + "rewards/rejected": -1.018113374710083, + "step": 721 + }, + { + "epoch": 0.95, + "learning_rate": 4.0707006760019175e-05, + "logits/chosen": -2.6340246200561523, + "logits/rejected": -2.6862027645111084, + "logps/chosen": -178.46224975585938, + "logps/rejected": -213.61317443847656, + "loss": 0.6549, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6767200231552124, + "rewards/margins": 0.19127850234508514, + "rewards/rejected": -0.8679986000061035, + "step": 722 + }, + { + "epoch": 0.95, + "learning_rate": 4.067911519863257e-05, + "logits/chosen": -2.531005620956421, + "logits/rejected": -2.562150001525879, + "logps/chosen": -194.95269775390625, + "logps/rejected": -199.6376953125, + "loss": 0.6151, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5694127082824707, + "rewards/margins": 0.24292032420635223, + "rewards/rejected": -0.8123329877853394, + "step": 723 + }, + { + "epoch": 0.95, + "learning_rate": 4.065119143099874e-05, + "logits/chosen": -2.714223861694336, + "logits/rejected": -2.7573318481445312, + "logps/chosen": -191.0901641845703, + "logps/rejected": -251.58689880371094, + "loss": 0.5495, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.43053001165390015, + "rewards/margins": 0.36911848187446594, + "rewards/rejected": -0.7996485233306885, + "step": 724 + }, + { + "epoch": 0.95, + "learning_rate": 4.062323551447549e-05, + "logits/chosen": -2.6741175651550293, + "logits/rejected": -2.6618902683258057, + "logps/chosen": -181.743896484375, + "logps/rejected": -191.43267822265625, + "loss": 0.6072, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5965117812156677, + "rewards/margins": 0.20273733139038086, + "rewards/rejected": -0.7992490530014038, + "step": 725 + }, + { + "epoch": 0.95, + "learning_rate": 4.059524750648668e-05, + "logits/chosen": -2.6173298358917236, + "logits/rejected": -2.719005584716797, + "logps/chosen": -128.7156982421875, + "logps/rejected": -163.35720825195312, + "loss": 0.6558, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4427485764026642, + "rewards/margins": 0.11390358209609985, + "rewards/rejected": -0.5566521883010864, + "step": 726 + }, + { + "epoch": 0.95, + "learning_rate": 4.056722746452207e-05, + "logits/chosen": -2.7046196460723877, + "logits/rejected": -2.626006603240967, + "logps/chosen": -192.15426635742188, + "logps/rejected": -203.4480743408203, + "loss": 0.8411, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.9564641714096069, + "rewards/margins": -0.2163528949022293, + "rewards/rejected": -0.740111231803894, + "step": 727 + }, + { + "epoch": 0.95, + "learning_rate": 4.053917544613723e-05, + "logits/chosen": -2.6192758083343506, + "logits/rejected": -2.649017333984375, + "logps/chosen": -175.56922912597656, + "logps/rejected": -234.2914276123047, + "loss": 0.7219, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8219558000564575, + "rewards/margins": 0.023496918380260468, + "rewards/rejected": -0.8454526662826538, + "step": 728 + }, + { + "epoch": 0.95, + "learning_rate": 4.051109150895343e-05, + "logits/chosen": -2.610642671585083, + "logits/rejected": -2.615621566772461, + "logps/chosen": -195.70941162109375, + "logps/rejected": -194.20928955078125, + "loss": 0.7166, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6751452088356018, + "rewards/margins": 0.013915710151195526, + "rewards/rejected": -0.6890608668327332, + "step": 729 + }, + { + "epoch": 0.96, + "learning_rate": 4.0482975710657455e-05, + "logits/chosen": -2.456711530685425, + "logits/rejected": -2.5238044261932373, + "logps/chosen": -199.232666015625, + "logps/rejected": -211.0916748046875, + "loss": 0.6943, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7001582980155945, + "rewards/margins": 0.0675555020570755, + "rewards/rejected": -0.7677137851715088, + "step": 730 + }, + { + "epoch": 0.96, + "learning_rate": 4.045482810900159e-05, + "logits/chosen": -2.4926917552948, + "logits/rejected": -2.4629762172698975, + "logps/chosen": -201.10394287109375, + "logps/rejected": -212.8970947265625, + "loss": 0.6293, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5266129970550537, + "rewards/margins": 0.1588543802499771, + "rewards/rejected": -0.6854673624038696, + "step": 731 + }, + { + "epoch": 0.96, + "learning_rate": 4.042664876180341e-05, + "logits/chosen": -2.6272189617156982, + "logits/rejected": -2.6395418643951416, + "logps/chosen": -173.2360076904297, + "logps/rejected": -186.01959228515625, + "loss": 0.6232, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5731245875358582, + "rewards/margins": 0.21896684169769287, + "rewards/rejected": -0.7920913696289062, + "step": 732 + }, + { + "epoch": 0.96, + "learning_rate": 4.0398437726945716e-05, + "logits/chosen": -2.711691379547119, + "logits/rejected": -2.7025833129882812, + "logps/chosen": -189.71885681152344, + "logps/rejected": -210.70761108398438, + "loss": 0.7556, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.6554818153381348, + "rewards/margins": -0.10323198139667511, + "rewards/rejected": -0.5522497892379761, + "step": 733 + }, + { + "epoch": 0.96, + "learning_rate": 4.037019506237638e-05, + "logits/chosen": -2.603912830352783, + "logits/rejected": -2.686739683151245, + "logps/chosen": -186.6495819091797, + "logps/rejected": -210.13623046875, + "loss": 0.623, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7819252610206604, + "rewards/margins": 0.26127052307128906, + "rewards/rejected": -1.0431957244873047, + "step": 734 + }, + { + "epoch": 0.96, + "learning_rate": 4.034192082610828e-05, + "logits/chosen": -2.6118476390838623, + "logits/rejected": -2.5994834899902344, + "logps/chosen": -164.40380859375, + "logps/rejected": -141.6078643798828, + "loss": 0.6502, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5243350863456726, + "rewards/margins": 0.12014760076999664, + "rewards/rejected": -0.6444827318191528, + "step": 735 + }, + { + "epoch": 0.96, + "learning_rate": 4.031361507621911e-05, + "logits/chosen": -2.501145362854004, + "logits/rejected": -2.5378122329711914, + "logps/chosen": -204.41793823242188, + "logps/rejected": -225.99737548828125, + "loss": 0.6733, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8046965003013611, + "rewards/margins": 0.13057535886764526, + "rewards/rejected": -0.9352718591690063, + "step": 736 + }, + { + "epoch": 0.96, + "learning_rate": 4.02852778708513e-05, + "logits/chosen": -2.7680859565734863, + "logits/rejected": -2.828195333480835, + "logps/chosen": -221.7406768798828, + "logps/rejected": -239.29904174804688, + "loss": 0.6113, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9672201871871948, + "rewards/margins": 0.20203211903572083, + "rewards/rejected": -1.1692522764205933, + "step": 737 + }, + { + "epoch": 0.97, + "learning_rate": 4.0256909268211914e-05, + "logits/chosen": -2.7258799076080322, + "logits/rejected": -2.7185235023498535, + "logps/chosen": -152.56524658203125, + "logps/rejected": -151.9055633544922, + "loss": 0.7743, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7225962281227112, + "rewards/margins": -0.00214192271232605, + "rewards/rejected": -0.7204542756080627, + "step": 738 + }, + { + "epoch": 0.97, + "learning_rate": 4.0228509326572496e-05, + "logits/chosen": -2.5518906116485596, + "logits/rejected": -2.5055699348449707, + "logps/chosen": -212.88555908203125, + "logps/rejected": -177.05625915527344, + "loss": 0.6684, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5254353284835815, + "rewards/margins": 0.1856461465358734, + "rewards/rejected": -0.7110814452171326, + "step": 739 + }, + { + "epoch": 0.97, + "learning_rate": 4.0200078104268944e-05, + "logits/chosen": -2.642047166824341, + "logits/rejected": -2.56471848487854, + "logps/chosen": -159.68580627441406, + "logps/rejected": -175.0918731689453, + "loss": 0.7278, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7405662536621094, + "rewards/margins": -0.027810221537947655, + "rewards/rejected": -0.7127560377120972, + "step": 740 + }, + { + "epoch": 0.97, + "learning_rate": 4.017161565970144e-05, + "logits/chosen": -2.485898971557617, + "logits/rejected": -2.4769692420959473, + "logps/chosen": -139.57974243164062, + "logps/rejected": -172.51040649414062, + "loss": 0.7386, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7599843144416809, + "rewards/margins": -0.04223020374774933, + "rewards/rejected": -0.717754065990448, + "step": 741 + }, + { + "epoch": 0.97, + "learning_rate": 4.014312205133428e-05, + "logits/chosen": -2.585129737854004, + "logits/rejected": -2.5427184104919434, + "logps/chosen": -200.5863494873047, + "logps/rejected": -221.85003662109375, + "loss": 0.7925, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8390079736709595, + "rewards/margins": -0.11233790963888168, + "rewards/rejected": -0.7266700863838196, + "step": 742 + }, + { + "epoch": 0.97, + "learning_rate": 4.011459733769579e-05, + "logits/chosen": -2.524702548980713, + "logits/rejected": -2.5253896713256836, + "logps/chosen": -183.24224853515625, + "logps/rejected": -185.02462768554688, + "loss": 0.8256, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.8384305238723755, + "rewards/margins": -0.15028908848762512, + "rewards/rejected": -0.6881413459777832, + "step": 743 + }, + { + "epoch": 0.97, + "learning_rate": 4.0086041577378166e-05, + "logits/chosen": -2.4046497344970703, + "logits/rejected": -2.4749138355255127, + "logps/chosen": -178.8353729248047, + "logps/rejected": -187.75807189941406, + "loss": 0.7012, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6531072854995728, + "rewards/margins": 0.04634825140237808, + "rewards/rejected": -0.6994554996490479, + "step": 744 + }, + { + "epoch": 0.98, + "learning_rate": 4.005745482903739e-05, + "logits/chosen": -2.347320795059204, + "logits/rejected": -2.4178340435028076, + "logps/chosen": -176.78187561035156, + "logps/rejected": -216.67189025878906, + "loss": 0.6512, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8543468713760376, + "rewards/margins": 0.18746432662010193, + "rewards/rejected": -1.0418111085891724, + "step": 745 + }, + { + "epoch": 0.98, + "learning_rate": 4.002883715139309e-05, + "logits/chosen": -2.698127269744873, + "logits/rejected": -2.731611967086792, + "logps/chosen": -199.87789916992188, + "logps/rejected": -235.65176391601562, + "loss": 0.6355, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.751441240310669, + "rewards/margins": 0.19296592473983765, + "rewards/rejected": -0.944407045841217, + "step": 746 + }, + { + "epoch": 0.98, + "learning_rate": 4.000018860322845e-05, + "logits/chosen": -2.8345818519592285, + "logits/rejected": -2.795546531677246, + "logps/chosen": -204.96926879882812, + "logps/rejected": -216.8634490966797, + "loss": 0.6564, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6679666638374329, + "rewards/margins": 0.1266886293888092, + "rewards/rejected": -0.7946553230285645, + "step": 747 + }, + { + "epoch": 0.98, + "learning_rate": 3.9971509243390025e-05, + "logits/chosen": -2.5881407260894775, + "logits/rejected": -2.6218934059143066, + "logps/chosen": -189.1737060546875, + "logps/rejected": -197.3935546875, + "loss": 0.7232, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.7519857883453369, + "rewards/margins": -0.0216445904225111, + "rewards/rejected": -0.7303412556648254, + "step": 748 + }, + { + "epoch": 0.98, + "learning_rate": 3.99427991307877e-05, + "logits/chosen": -2.5421721935272217, + "logits/rejected": -2.579972743988037, + "logps/chosen": -193.97085571289062, + "logps/rejected": -230.16653442382812, + "loss": 0.705, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.8407448530197144, + "rewards/margins": 0.10392985492944717, + "rewards/rejected": -0.9446746706962585, + "step": 749 + }, + { + "epoch": 0.98, + "learning_rate": 3.9914058324394486e-05, + "logits/chosen": -2.632858991622925, + "logits/rejected": -2.6479578018188477, + "logps/chosen": -201.80633544921875, + "logps/rejected": -199.26773071289062, + "loss": 0.5815, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5169165134429932, + "rewards/margins": 0.283170223236084, + "rewards/rejected": -0.8000867366790771, + "step": 750 + }, + { + "epoch": 0.98, + "learning_rate": 3.9885286883246476e-05, + "logits/chosen": -2.655200481414795, + "logits/rejected": -2.6914384365081787, + "logps/chosen": -178.44381713867188, + "logps/rejected": -149.90350341796875, + "loss": 0.7038, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6623535752296448, + "rewards/margins": 0.008262221701443195, + "rewards/rejected": -0.6706157326698303, + "step": 751 + }, + { + "epoch": 0.98, + "learning_rate": 3.985648486644267e-05, + "logits/chosen": -2.3822569847106934, + "logits/rejected": -2.3234899044036865, + "logps/chosen": -181.5887451171875, + "logps/rejected": -169.651123046875, + "loss": 0.7508, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.7002227902412415, + "rewards/margins": -0.04468606412410736, + "rewards/rejected": -0.6555367708206177, + "step": 752 + }, + { + "epoch": 0.99, + "learning_rate": 3.982765233314489e-05, + "logits/chosen": -2.6348283290863037, + "logits/rejected": -2.7363362312316895, + "logps/chosen": -175.08506774902344, + "logps/rejected": -196.49099731445312, + "loss": 0.6209, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5249355435371399, + "rewards/margins": 0.2879376709461212, + "rewards/rejected": -0.812873125076294, + "step": 753 + }, + { + "epoch": 0.99, + "learning_rate": 3.979878934257762e-05, + "logits/chosen": -2.608914375305176, + "logits/rejected": -2.7260642051696777, + "logps/chosen": -182.08633422851562, + "logps/rejected": -173.453857421875, + "loss": 0.6227, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4596517086029053, + "rewards/margins": 0.3150676190853119, + "rewards/rejected": -0.7747193574905396, + "step": 754 + }, + { + "epoch": 0.99, + "learning_rate": 3.976989595402793e-05, + "logits/chosen": -2.684398651123047, + "logits/rejected": -2.7180964946746826, + "logps/chosen": -186.74403381347656, + "logps/rejected": -203.94711303710938, + "loss": 0.7122, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6536604762077332, + "rewards/margins": 0.010378487408161163, + "rewards/rejected": -0.6640389561653137, + "step": 755 + }, + { + "epoch": 0.99, + "learning_rate": 3.974097222684532e-05, + "logits/chosen": -2.380387544631958, + "logits/rejected": -2.638767719268799, + "logps/chosen": -143.05783081054688, + "logps/rejected": -187.193359375, + "loss": 0.7379, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7300621271133423, + "rewards/margins": 0.04467529058456421, + "rewards/rejected": -0.7747373580932617, + "step": 756 + }, + { + "epoch": 0.99, + "learning_rate": 3.9712018220441596e-05, + "logits/chosen": -2.4832100868225098, + "logits/rejected": -2.588329792022705, + "logps/chosen": -214.15151977539062, + "logps/rejected": -225.47116088867188, + "loss": 0.6362, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.670771598815918, + "rewards/margins": 0.2148047238588333, + "rewards/rejected": -0.8855763673782349, + "step": 757 + }, + { + "epoch": 0.99, + "learning_rate": 3.9683033994290767e-05, + "logits/chosen": -2.648101806640625, + "logits/rejected": -2.6975598335266113, + "logps/chosen": -163.26866149902344, + "logps/rejected": -165.8538055419922, + "loss": 0.6858, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.622085690498352, + "rewards/margins": 0.08007562160491943, + "rewards/rejected": -0.7021613717079163, + "step": 758 + }, + { + "epoch": 0.99, + "learning_rate": 3.965401960792894e-05, + "logits/chosen": -2.549471139907837, + "logits/rejected": -2.6215145587921143, + "logps/chosen": -155.6943359375, + "logps/rejected": -152.1805877685547, + "loss": 0.7184, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.651746392250061, + "rewards/margins": -0.030698398128151894, + "rewards/rejected": -0.6210479736328125, + "step": 759 + }, + { + "epoch": 0.99, + "learning_rate": 3.962497512095412e-05, + "logits/chosen": -2.599458932876587, + "logits/rejected": -2.577223062515259, + "logps/chosen": -168.08985900878906, + "logps/rejected": -152.3730926513672, + "loss": 0.7308, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6834767460823059, + "rewards/margins": -0.05063985288143158, + "rewards/rejected": -0.6328368186950684, + "step": 760 + }, + { + "epoch": 1.0, + "learning_rate": 3.95959005930262e-05, + "logits/chosen": -2.6772971153259277, + "logits/rejected": -2.6833689212799072, + "logps/chosen": -263.3359375, + "logps/rejected": -237.99044799804688, + "loss": 0.6881, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8629517555236816, + "rewards/margins": 0.06857781112194061, + "rewards/rejected": -0.9315295219421387, + "step": 761 + }, + { + "epoch": 1.0, + "learning_rate": 3.9566796083866756e-05, + "logits/chosen": -2.4484739303588867, + "logits/rejected": -2.3874497413635254, + "logps/chosen": -143.9307403564453, + "logps/rejected": -152.17724609375, + "loss": 0.6827, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5658427476882935, + "rewards/margins": 0.05139755457639694, + "rewards/rejected": -0.617240309715271, + "step": 762 + }, + { + "epoch": 1.0, + "learning_rate": 3.953766165325892e-05, + "logits/chosen": -2.5935730934143066, + "logits/rejected": -2.6049649715423584, + "logps/chosen": -252.82337951660156, + "logps/rejected": -228.11959838867188, + "loss": 0.8136, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8735777735710144, + "rewards/margins": -0.16885007917881012, + "rewards/rejected": -0.7047276496887207, + "step": 763 + }, + { + "epoch": 1.0, + "learning_rate": 3.9508497361047334e-05, + "logits/chosen": -2.6745100021362305, + "logits/rejected": -2.8012068271636963, + "logps/chosen": -131.25433349609375, + "logps/rejected": -153.92941284179688, + "loss": 0.6063, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5639945268630981, + "rewards/margins": 0.26005828380584717, + "rewards/rejected": -0.8240528106689453, + "step": 764 + }, + { + "epoch": 1.0, + "learning_rate": 3.9479303267137944e-05, + "logits/chosen": -2.6887001991271973, + "logits/rejected": -2.631093978881836, + "logps/chosen": -167.7516632080078, + "logps/rejected": -160.34854125976562, + "loss": 0.2575, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12638841569423676, + "rewards/margins": 1.77146315574646, + "rewards/rejected": -1.6450748443603516, + "step": 765 + }, + { + "epoch": 1.0, + "learning_rate": 3.9450079431497936e-05, + "logits/chosen": -2.4736642837524414, + "logits/rejected": -2.563966751098633, + "logps/chosen": -179.4278564453125, + "logps/rejected": -186.982421875, + "loss": 0.2379, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.33454737067222595, + "rewards/margins": 2.0607388019561768, + "rewards/rejected": -1.7261914014816284, + "step": 766 + }, + { + "epoch": 1.0, + "learning_rate": 3.9420825914155554e-05, + "logits/chosen": -2.6249935626983643, + "logits/rejected": -2.7404773235321045, + "logps/chosen": -154.4555206298828, + "logps/rejected": -208.5882568359375, + "loss": 0.1634, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.31436601281166077, + "rewards/margins": 2.54439115524292, + "rewards/rejected": -2.230025053024292, + "step": 767 + }, + { + "epoch": 1.01, + "learning_rate": 3.939154277520006e-05, + "logits/chosen": -2.4601221084594727, + "logits/rejected": -2.501352071762085, + "logps/chosen": -144.6915740966797, + "logps/rejected": -181.44070434570312, + "loss": 0.2538, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06428545713424683, + "rewards/margins": 1.5839626789093018, + "rewards/rejected": -1.5196770429611206, + "step": 768 + }, + { + "epoch": 1.01, + "learning_rate": 3.9362230074781506e-05, + "logits/chosen": -2.6759233474731445, + "logits/rejected": -2.677365779876709, + "logps/chosen": -187.21469116210938, + "logps/rejected": -176.88739013671875, + "loss": 0.2348, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3633385896682739, + "rewards/margins": 2.0741219520568848, + "rewards/rejected": -1.7107833623886108, + "step": 769 + }, + { + "epoch": 1.01, + "learning_rate": 3.9332887873110695e-05, + "logits/chosen": -2.677248954772949, + "logits/rejected": -2.6646389961242676, + "logps/chosen": -165.93006896972656, + "logps/rejected": -155.79644775390625, + "loss": 0.2781, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.005598783493041992, + "rewards/margins": 1.6051024198532104, + "rewards/rejected": -1.6107012033462524, + "step": 770 + }, + { + "epoch": 1.01, + "learning_rate": 3.9303516230459035e-05, + "logits/chosen": -2.54484224319458, + "logits/rejected": -2.4810094833374023, + "logps/chosen": -156.8101043701172, + "logps/rejected": -180.32305908203125, + "loss": 0.15, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12862610816955566, + "rewards/margins": 2.2837862968444824, + "rewards/rejected": -2.155160427093506, + "step": 771 + }, + { + "epoch": 1.01, + "learning_rate": 3.92741152071584e-05, + "logits/chosen": -2.670797824859619, + "logits/rejected": -2.716228723526001, + "logps/chosen": -160.2844696044922, + "logps/rejected": -186.7455291748047, + "loss": 0.1597, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3678353428840637, + "rewards/margins": 2.3705198764801025, + "rewards/rejected": -2.0026843547821045, + "step": 772 + }, + { + "epoch": 1.01, + "learning_rate": 3.924468486360101e-05, + "logits/chosen": -2.617213249206543, + "logits/rejected": -2.666292905807495, + "logps/chosen": -165.1317901611328, + "logps/rejected": -208.27572631835938, + "loss": 0.1997, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14538979530334473, + "rewards/margins": 1.8602721691131592, + "rewards/rejected": -1.7148823738098145, + "step": 773 + }, + { + "epoch": 1.01, + "learning_rate": 3.921522526023931e-05, + "logits/chosen": -2.7032179832458496, + "logits/rejected": -2.6985368728637695, + "logps/chosen": -187.04617309570312, + "logps/rejected": -231.27273559570312, + "loss": 0.1408, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4273063540458679, + "rewards/margins": 2.765761375427246, + "rewards/rejected": -2.3384549617767334, + "step": 774 + }, + { + "epoch": 1.01, + "learning_rate": 3.918573645758586e-05, + "logits/chosen": -2.677338123321533, + "logits/rejected": -2.735537528991699, + "logps/chosen": -192.94326782226562, + "logps/rejected": -228.2654571533203, + "loss": 0.1494, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17543727159500122, + "rewards/margins": 2.4333107471466064, + "rewards/rejected": -2.25787353515625, + "step": 775 + }, + { + "epoch": 1.02, + "learning_rate": 3.915621851621318e-05, + "logits/chosen": -2.5823519229888916, + "logits/rejected": -2.5766096115112305, + "logps/chosen": -141.41311645507812, + "logps/rejected": -165.74465942382812, + "loss": 0.2209, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04889017716050148, + "rewards/margins": 1.707082986831665, + "rewards/rejected": -1.6581928730010986, + "step": 776 + }, + { + "epoch": 1.02, + "learning_rate": 3.9126671496753666e-05, + "logits/chosen": -2.4788622856140137, + "logits/rejected": -2.6510164737701416, + "logps/chosen": -140.3416748046875, + "logps/rejected": -186.62484741210938, + "loss": 0.2122, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08496019244194031, + "rewards/margins": 1.8061598539352417, + "rewards/rejected": -1.7211997509002686, + "step": 777 + }, + { + "epoch": 1.02, + "learning_rate": 3.909709545989942e-05, + "logits/chosen": -2.6408588886260986, + "logits/rejected": -2.648200511932373, + "logps/chosen": -174.47097778320312, + "logps/rejected": -224.0052947998047, + "loss": 0.1737, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.22252985835075378, + "rewards/margins": 2.2816243171691895, + "rewards/rejected": -2.0590946674346924, + "step": 778 + }, + { + "epoch": 1.02, + "learning_rate": 3.9067490466402156e-05, + "logits/chosen": -2.6448774337768555, + "logits/rejected": -2.742879629135132, + "logps/chosen": -174.5858612060547, + "logps/rejected": -204.8096160888672, + "loss": 0.1673, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.08154253661632538, + "rewards/margins": 2.480858325958252, + "rewards/rejected": -2.562401056289673, + "step": 779 + }, + { + "epoch": 1.02, + "learning_rate": 3.903785657707307e-05, + "logits/chosen": -2.6244313716888428, + "logits/rejected": -2.5507609844207764, + "logps/chosen": -161.88987731933594, + "logps/rejected": -171.99977111816406, + "loss": 0.2874, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.13563959300518036, + "rewards/margins": 2.074355363845825, + "rewards/rejected": -1.9387155771255493, + "step": 780 + }, + { + "epoch": 1.02, + "learning_rate": 3.9008193852782733e-05, + "logits/chosen": -2.402883291244507, + "logits/rejected": -2.421910285949707, + "logps/chosen": -190.97164916992188, + "logps/rejected": -180.68893432617188, + "loss": 0.1755, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.36178380250930786, + "rewards/margins": 2.5330705642700195, + "rewards/rejected": -2.1712868213653564, + "step": 781 + }, + { + "epoch": 1.02, + "learning_rate": 3.897850235446089e-05, + "logits/chosen": -2.6583588123321533, + "logits/rejected": -2.6155612468719482, + "logps/chosen": -177.75830078125, + "logps/rejected": -201.97372436523438, + "loss": 0.1814, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.07586677372455597, + "rewards/margins": 2.4885523319244385, + "rewards/rejected": -2.4126858711242676, + "step": 782 + }, + { + "epoch": 1.02, + "learning_rate": 3.894878214309645e-05, + "logits/chosen": -2.4456002712249756, + "logits/rejected": -2.5392351150512695, + "logps/chosen": -147.53250122070312, + "logps/rejected": -182.77304077148438, + "loss": 0.1543, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09826147556304932, + "rewards/margins": 2.4370062351226807, + "rewards/rejected": -2.3387451171875, + "step": 783 + }, + { + "epoch": 1.03, + "learning_rate": 3.8919033279737274e-05, + "logits/chosen": -2.775362968444824, + "logits/rejected": -2.770418405532837, + "logps/chosen": -171.49510192871094, + "logps/rejected": -208.9770965576172, + "loss": 0.1267, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1164775863289833, + "rewards/margins": 2.621551513671875, + "rewards/rejected": -2.5050742626190186, + "step": 784 + }, + { + "epoch": 1.03, + "learning_rate": 3.888925582549006e-05, + "logits/chosen": -2.382932662963867, + "logits/rejected": -2.3689355850219727, + "logps/chosen": -214.31198120117188, + "logps/rejected": -245.0536651611328, + "loss": 0.2215, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08265908807516098, + "rewards/margins": 2.2869906425476074, + "rewards/rejected": -2.369649648666382, + "step": 785 + }, + { + "epoch": 1.03, + "learning_rate": 3.885944984152027e-05, + "logits/chosen": -2.601214647293091, + "logits/rejected": -2.5713980197906494, + "logps/chosen": -172.2421112060547, + "logps/rejected": -205.98236083984375, + "loss": 0.2156, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0705888569355011, + "rewards/margins": 2.1648547649383545, + "rewards/rejected": -2.235443353652954, + "step": 786 + }, + { + "epoch": 1.03, + "learning_rate": 3.882961538905194e-05, + "logits/chosen": -2.7650246620178223, + "logits/rejected": -2.756899118423462, + "logps/chosen": -159.0460205078125, + "logps/rejected": -218.6039581298828, + "loss": 0.1533, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06594078242778778, + "rewards/margins": 2.9581427574157715, + "rewards/rejected": -2.892202138900757, + "step": 787 + }, + { + "epoch": 1.03, + "learning_rate": 3.879975252936761e-05, + "logits/chosen": -2.8535146713256836, + "logits/rejected": -2.751579999923706, + "logps/chosen": -201.79710388183594, + "logps/rejected": -212.3226776123047, + "loss": 0.0961, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07383383810520172, + "rewards/margins": 2.675102472305298, + "rewards/rejected": -2.601268768310547, + "step": 788 + }, + { + "epoch": 1.03, + "learning_rate": 3.876986132380814e-05, + "logits/chosen": -2.7492191791534424, + "logits/rejected": -2.7570226192474365, + "logps/chosen": -163.69142150878906, + "logps/rejected": -198.6139373779297, + "loss": 0.1008, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25924035906791687, + "rewards/margins": 2.795973539352417, + "rewards/rejected": -3.055213451385498, + "step": 789 + }, + { + "epoch": 1.03, + "learning_rate": 3.8739941833772643e-05, + "logits/chosen": -2.767152786254883, + "logits/rejected": -2.7644238471984863, + "logps/chosen": -194.49136352539062, + "logps/rejected": -214.67564392089844, + "loss": 0.203, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2309228777885437, + "rewards/margins": 2.918428421020508, + "rewards/rejected": -3.149351119995117, + "step": 790 + }, + { + "epoch": 1.04, + "learning_rate": 3.870999412071829e-05, + "logits/chosen": -2.614018440246582, + "logits/rejected": -2.600497245788574, + "logps/chosen": -169.53805541992188, + "logps/rejected": -169.95814514160156, + "loss": 0.2289, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.38845112919807434, + "rewards/margins": 1.9743072986602783, + "rewards/rejected": -2.3627583980560303, + "step": 791 + }, + { + "epoch": 1.04, + "learning_rate": 3.8680018246160295e-05, + "logits/chosen": -2.7072720527648926, + "logits/rejected": -2.7555181980133057, + "logps/chosen": -170.2549591064453, + "logps/rejected": -212.03839111328125, + "loss": 0.135, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.46249496936798096, + "rewards/margins": 2.740030288696289, + "rewards/rejected": -3.2025249004364014, + "step": 792 + }, + { + "epoch": 1.04, + "learning_rate": 3.865001427167164e-05, + "logits/chosen": -2.697120189666748, + "logits/rejected": -2.571460247039795, + "logps/chosen": -177.19332885742188, + "logps/rejected": -204.76942443847656, + "loss": 0.131, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6775528788566589, + "rewards/margins": 2.8083858489990234, + "rewards/rejected": -3.485938787460327, + "step": 793 + }, + { + "epoch": 1.04, + "learning_rate": 3.861998225888307e-05, + "logits/chosen": -2.5678954124450684, + "logits/rejected": -2.6190593242645264, + "logps/chosen": -153.89779663085938, + "logps/rejected": -167.26556396484375, + "loss": 0.1821, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5305047035217285, + "rewards/margins": 2.5454978942871094, + "rewards/rejected": -3.076002597808838, + "step": 794 + }, + { + "epoch": 1.04, + "learning_rate": 3.8589922269482924e-05, + "logits/chosen": -2.6134603023529053, + "logits/rejected": -2.6522388458251953, + "logps/chosen": -229.1393585205078, + "logps/rejected": -288.697998046875, + "loss": 0.107, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4708881080150604, + "rewards/margins": 3.699263095855713, + "rewards/rejected": -4.1701507568359375, + "step": 795 + }, + { + "epoch": 1.04, + "learning_rate": 3.855983436521699e-05, + "logits/chosen": -2.560218334197998, + "logits/rejected": -2.645477533340454, + "logps/chosen": -147.77294921875, + "logps/rejected": -212.90585327148438, + "loss": 0.1269, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7539626955986023, + "rewards/margins": 3.3151063919067383, + "rewards/rejected": -4.069068908691406, + "step": 796 + }, + { + "epoch": 1.04, + "learning_rate": 3.8529718607888394e-05, + "logits/chosen": -2.4999420642852783, + "logits/rejected": -2.5179193019866943, + "logps/chosen": -152.94679260253906, + "logps/rejected": -204.71592712402344, + "loss": 0.2147, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2636586427688599, + "rewards/margins": 2.3519301414489746, + "rewards/rejected": -3.615588903427124, + "step": 797 + }, + { + "epoch": 1.04, + "learning_rate": 3.8499575059357506e-05, + "logits/chosen": -2.671869993209839, + "logits/rejected": -2.640197515487671, + "logps/chosen": -181.50674438476562, + "logps/rejected": -215.46429443359375, + "loss": 0.1806, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2009189128875732, + "rewards/margins": 2.5449750423431396, + "rewards/rejected": -3.745894193649292, + "step": 798 + }, + { + "epoch": 1.05, + "learning_rate": 3.8469403781541745e-05, + "logits/chosen": -2.6338608264923096, + "logits/rejected": -2.760087728500366, + "logps/chosen": -197.01071166992188, + "logps/rejected": -244.93154907226562, + "loss": 0.1091, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5004479885101318, + "rewards/margins": 3.055344820022583, + "rewards/rejected": -4.555792808532715, + "step": 799 + }, + { + "epoch": 1.05, + "learning_rate": 3.843920483641551e-05, + "logits/chosen": -2.66325044631958, + "logits/rejected": -2.715461015701294, + "logps/chosen": -186.20314025878906, + "logps/rejected": -194.35206604003906, + "loss": 0.1372, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1806678771972656, + "rewards/margins": 2.5824830532073975, + "rewards/rejected": -3.763150691986084, + "step": 800 + }, + { + "epoch": 1.05, + "learning_rate": 3.840897828601002e-05, + "logits/chosen": -2.532099485397339, + "logits/rejected": -2.458197593688965, + "logps/chosen": -220.65689086914062, + "logps/rejected": -277.5340576171875, + "loss": 0.1136, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.040308952331543, + "rewards/margins": 3.4944987297058105, + "rewards/rejected": -4.5348076820373535, + "step": 801 + }, + { + "epoch": 1.05, + "learning_rate": 3.83787241924132e-05, + "logits/chosen": -2.4101672172546387, + "logits/rejected": -2.51715350151062, + "logps/chosen": -151.6785888671875, + "logps/rejected": -230.41976928710938, + "loss": 0.1068, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8751517534255981, + "rewards/margins": 4.202584743499756, + "rewards/rejected": -5.077736854553223, + "step": 802 + }, + { + "epoch": 1.05, + "learning_rate": 3.8348442617769564e-05, + "logits/chosen": -2.50626277923584, + "logits/rejected": -2.552149772644043, + "logps/chosen": -178.29891967773438, + "logps/rejected": -220.8915252685547, + "loss": 0.076, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.427808165550232, + "rewards/margins": 3.7678184509277344, + "rewards/rejected": -5.195626258850098, + "step": 803 + }, + { + "epoch": 1.05, + "learning_rate": 3.831813362428005e-05, + "logits/chosen": -2.5403552055358887, + "logits/rejected": -2.5728914737701416, + "logps/chosen": -184.95680236816406, + "logps/rejected": -228.92166137695312, + "loss": 0.2942, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.236476182937622, + "rewards/margins": 2.8043324947357178, + "rewards/rejected": -4.04080867767334, + "step": 804 + }, + { + "epoch": 1.05, + "learning_rate": 3.8287797274201934e-05, + "logits/chosen": -2.654428005218506, + "logits/rejected": -2.609748363494873, + "logps/chosen": -171.26904296875, + "logps/rejected": -200.14511108398438, + "loss": 0.1712, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2014542818069458, + "rewards/margins": 2.651529312133789, + "rewards/rejected": -3.8529834747314453, + "step": 805 + }, + { + "epoch": 1.05, + "learning_rate": 3.825743362984868e-05, + "logits/chosen": -2.585634231567383, + "logits/rejected": -2.6210579872131348, + "logps/chosen": -174.72225952148438, + "logps/rejected": -256.7543640136719, + "loss": 0.2016, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0417742729187012, + "rewards/margins": 4.115874290466309, + "rewards/rejected": -5.157649040222168, + "step": 806 + }, + { + "epoch": 1.06, + "learning_rate": 3.8227042753589824e-05, + "logits/chosen": -2.5954151153564453, + "logits/rejected": -2.5807945728302, + "logps/chosen": -200.4560546875, + "logps/rejected": -255.4353485107422, + "loss": 0.1359, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5734584331512451, + "rewards/margins": 4.025257110595703, + "rewards/rejected": -5.598715305328369, + "step": 807 + }, + { + "epoch": 1.06, + "learning_rate": 3.819662470785082e-05, + "logits/chosen": -2.7379062175750732, + "logits/rejected": -2.7494699954986572, + "logps/chosen": -176.42608642578125, + "logps/rejected": -223.4091796875, + "loss": 0.1528, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2805501222610474, + "rewards/margins": 3.246807813644409, + "rewards/rejected": -4.527358055114746, + "step": 808 + }, + { + "epoch": 1.06, + "learning_rate": 3.816617955511296e-05, + "logits/chosen": -2.7525413036346436, + "logits/rejected": -2.777355432510376, + "logps/chosen": -212.89166259765625, + "logps/rejected": -217.78802490234375, + "loss": 0.2782, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.408876657485962, + "rewards/margins": 2.0644912719726562, + "rewards/rejected": -3.4733681678771973, + "step": 809 + }, + { + "epoch": 1.06, + "learning_rate": 3.8135707357913176e-05, + "logits/chosen": -2.537975311279297, + "logits/rejected": -2.58882474899292, + "logps/chosen": -161.26962280273438, + "logps/rejected": -208.24331665039062, + "loss": 0.0719, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7949110269546509, + "rewards/margins": 3.6137166023254395, + "rewards/rejected": -4.408627510070801, + "step": 810 + }, + { + "epoch": 1.06, + "learning_rate": 3.8105208178843984e-05, + "logits/chosen": -2.3414132595062256, + "logits/rejected": -2.3927347660064697, + "logps/chosen": -203.45179748535156, + "logps/rejected": -210.25027465820312, + "loss": 0.1075, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0561504364013672, + "rewards/margins": 2.966663122177124, + "rewards/rejected": -4.022813320159912, + "step": 811 + }, + { + "epoch": 1.06, + "learning_rate": 3.8074682080553335e-05, + "logits/chosen": -2.690915584564209, + "logits/rejected": -2.6338562965393066, + "logps/chosen": -207.17599487304688, + "logps/rejected": -254.29653930664062, + "loss": 0.1109, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.002183198928833, + "rewards/margins": 3.457009792327881, + "rewards/rejected": -4.459193229675293, + "step": 812 + }, + { + "epoch": 1.06, + "learning_rate": 3.804412912574442e-05, + "logits/chosen": -2.4453845024108887, + "logits/rejected": -2.4959869384765625, + "logps/chosen": -172.0583038330078, + "logps/rejected": -218.82098388671875, + "loss": 0.2408, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0620150566101074, + "rewards/margins": 3.0320522785186768, + "rewards/rejected": -4.094067096710205, + "step": 813 + }, + { + "epoch": 1.07, + "learning_rate": 3.801354937717565e-05, + "logits/chosen": -2.683800220489502, + "logits/rejected": -2.5853464603424072, + "logps/chosen": -190.3063507080078, + "logps/rejected": -201.25628662109375, + "loss": 0.208, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1379764080047607, + "rewards/margins": 2.755608081817627, + "rewards/rejected": -3.893584728240967, + "step": 814 + }, + { + "epoch": 1.07, + "learning_rate": 3.798294289766043e-05, + "logits/chosen": -2.4754679203033447, + "logits/rejected": -2.4798026084899902, + "logps/chosen": -183.4767303466797, + "logps/rejected": -191.26222229003906, + "loss": 0.2135, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0173275470733643, + "rewards/margins": 2.415278673171997, + "rewards/rejected": -3.4326062202453613, + "step": 815 + }, + { + "epoch": 1.07, + "learning_rate": 3.795230975006712e-05, + "logits/chosen": -2.612645387649536, + "logits/rejected": -2.5963857173919678, + "logps/chosen": -160.24755859375, + "logps/rejected": -201.96185302734375, + "loss": 0.185, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.287351131439209, + "rewards/margins": 2.947019338607788, + "rewards/rejected": -4.234370231628418, + "step": 816 + }, + { + "epoch": 1.07, + "learning_rate": 3.792164999731881e-05, + "logits/chosen": -2.6370742321014404, + "logits/rejected": -2.70023512840271, + "logps/chosen": -190.13369750976562, + "logps/rejected": -205.581298828125, + "loss": 0.2281, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7540441751480103, + "rewards/margins": 3.2129039764404297, + "rewards/rejected": -4.966948509216309, + "step": 817 + }, + { + "epoch": 1.07, + "learning_rate": 3.789096370239328e-05, + "logits/chosen": -2.6114511489868164, + "logits/rejected": -2.739351272583008, + "logps/chosen": -196.17510986328125, + "logps/rejected": -247.4773712158203, + "loss": 0.0827, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0242459774017334, + "rewards/margins": 4.068726539611816, + "rewards/rejected": -5.092972755432129, + "step": 818 + }, + { + "epoch": 1.07, + "learning_rate": 3.786025092832279e-05, + "logits/chosen": -2.3249611854553223, + "logits/rejected": -2.407060384750366, + "logps/chosen": -207.06622314453125, + "logps/rejected": -227.32284545898438, + "loss": 0.1706, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2159541845321655, + "rewards/margins": 2.7725777626037598, + "rewards/rejected": -3.988532066345215, + "step": 819 + }, + { + "epoch": 1.07, + "learning_rate": 3.782951173819403e-05, + "logits/chosen": -2.586280345916748, + "logits/rejected": -2.5269079208374023, + "logps/chosen": -226.07937622070312, + "logps/rejected": -254.896240234375, + "loss": 0.1904, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9202920198440552, + "rewards/margins": 3.539307117462158, + "rewards/rejected": -5.459599018096924, + "step": 820 + }, + { + "epoch": 1.07, + "learning_rate": 3.7798746195147914e-05, + "logits/chosen": -2.4165151119232178, + "logits/rejected": -2.4409053325653076, + "logps/chosen": -226.0311279296875, + "logps/rejected": -254.019775390625, + "loss": 0.2368, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3306143283843994, + "rewards/margins": 2.6145148277282715, + "rewards/rejected": -3.945129156112671, + "step": 821 + }, + { + "epoch": 1.08, + "learning_rate": 3.776795436237954e-05, + "logits/chosen": -2.4291183948516846, + "logits/rejected": -2.502776861190796, + "logps/chosen": -158.3379669189453, + "logps/rejected": -233.86962890625, + "loss": 0.2155, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4095854759216309, + "rewards/margins": 3.3275306224823, + "rewards/rejected": -4.737115859985352, + "step": 822 + }, + { + "epoch": 1.08, + "learning_rate": 3.773713630313793e-05, + "logits/chosen": -2.695758819580078, + "logits/rejected": -2.656249523162842, + "logps/chosen": -202.1251220703125, + "logps/rejected": -217.68896484375, + "loss": 0.1232, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.282354712486267, + "rewards/margins": 3.205690860748291, + "rewards/rejected": -4.488044738769531, + "step": 823 + }, + { + "epoch": 1.08, + "learning_rate": 3.7706292080726055e-05, + "logits/chosen": -2.5363056659698486, + "logits/rejected": -2.5430288314819336, + "logps/chosen": -205.69509887695312, + "logps/rejected": -253.77920532226562, + "loss": 0.2525, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4872784614562988, + "rewards/margins": 3.4748756885528564, + "rewards/rejected": -4.962153911590576, + "step": 824 + }, + { + "epoch": 1.08, + "learning_rate": 3.767542175850058e-05, + "logits/chosen": -2.420227527618408, + "logits/rejected": -2.441901445388794, + "logps/chosen": -181.02056884765625, + "logps/rejected": -211.343505859375, + "loss": 0.1606, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2797247171401978, + "rewards/margins": 2.7236833572387695, + "rewards/rejected": -4.003407955169678, + "step": 825 + }, + { + "epoch": 1.08, + "learning_rate": 3.764452539987179e-05, + "logits/chosen": -2.47520112991333, + "logits/rejected": -2.458693027496338, + "logps/chosen": -167.35470581054688, + "logps/rejected": -193.6707305908203, + "loss": 0.1663, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3829278945922852, + "rewards/margins": 2.366436004638672, + "rewards/rejected": -3.749363899230957, + "step": 826 + }, + { + "epoch": 1.08, + "learning_rate": 3.761360306830345e-05, + "logits/chosen": -2.5185067653656006, + "logits/rejected": -2.50441575050354, + "logps/chosen": -178.70388793945312, + "logps/rejected": -218.41030883789062, + "loss": 0.1615, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.161661982536316, + "rewards/margins": 2.8637630939483643, + "rewards/rejected": -4.025424480438232, + "step": 827 + }, + { + "epoch": 1.08, + "learning_rate": 3.75826548273127e-05, + "logits/chosen": -2.633216142654419, + "logits/rejected": -2.6944453716278076, + "logps/chosen": -178.22970581054688, + "logps/rejected": -216.95306396484375, + "loss": 0.2062, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.33101487159729, + "rewards/margins": 2.8503544330596924, + "rewards/rejected": -4.181369304656982, + "step": 828 + }, + { + "epoch": 1.09, + "learning_rate": 3.7551680740469874e-05, + "logits/chosen": -2.604398727416992, + "logits/rejected": -2.655909299850464, + "logps/chosen": -233.8108367919922, + "logps/rejected": -283.73272705078125, + "loss": 0.133, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.088188648223877, + "rewards/margins": 3.245051622390747, + "rewards/rejected": -4.333240032196045, + "step": 829 + }, + { + "epoch": 1.09, + "learning_rate": 3.752068087139839e-05, + "logits/chosen": -2.6542930603027344, + "logits/rejected": -2.690643072128296, + "logps/chosen": -187.74871826171875, + "logps/rejected": -274.4953308105469, + "loss": 0.0891, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0974299907684326, + "rewards/margins": 4.077320575714111, + "rewards/rejected": -5.174749851226807, + "step": 830 + }, + { + "epoch": 1.09, + "learning_rate": 3.7489655283774657e-05, + "logits/chosen": -2.5698750019073486, + "logits/rejected": -2.680229902267456, + "logps/chosen": -154.89736938476562, + "logps/rejected": -205.21786499023438, + "loss": 0.1433, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6827576756477356, + "rewards/margins": 2.9387474060058594, + "rewards/rejected": -3.6215052604675293, + "step": 831 + }, + { + "epoch": 1.09, + "learning_rate": 3.7458604041327874e-05, + "logits/chosen": -2.5519180297851562, + "logits/rejected": -2.5656418800354004, + "logps/chosen": -190.4122314453125, + "logps/rejected": -213.00625610351562, + "loss": 0.1569, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.285630226135254, + "rewards/margins": 2.924367666244507, + "rewards/rejected": -4.209997653961182, + "step": 832 + }, + { + "epoch": 1.09, + "learning_rate": 3.742752720783997e-05, + "logits/chosen": -2.514334201812744, + "logits/rejected": -2.596501588821411, + "logps/chosen": -192.1654052734375, + "logps/rejected": -192.47653198242188, + "loss": 0.1317, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9184039235115051, + "rewards/margins": 2.5253825187683105, + "rewards/rejected": -3.443786382675171, + "step": 833 + }, + { + "epoch": 1.09, + "learning_rate": 3.7396424847145425e-05, + "logits/chosen": -2.484549045562744, + "logits/rejected": -2.6469879150390625, + "logps/chosen": -148.88473510742188, + "logps/rejected": -203.79083251953125, + "loss": 0.146, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3805378675460815, + "rewards/margins": 3.031029462814331, + "rewards/rejected": -4.411567211151123, + "step": 834 + }, + { + "epoch": 1.09, + "learning_rate": 3.736529702313114e-05, + "logits/chosen": -2.613891363143921, + "logits/rejected": -2.6539323329925537, + "logps/chosen": -157.11306762695312, + "logps/rejected": -214.69085693359375, + "loss": 0.1557, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9892842173576355, + "rewards/margins": 2.9719948768615723, + "rewards/rejected": -3.9612791538238525, + "step": 835 + }, + { + "epoch": 1.09, + "learning_rate": 3.733414379973635e-05, + "logits/chosen": -2.5769026279449463, + "logits/rejected": -2.5845699310302734, + "logps/chosen": -206.44366455078125, + "logps/rejected": -273.4783630371094, + "loss": 0.0569, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1630650758743286, + "rewards/margins": 3.958601474761963, + "rewards/rejected": -5.12166690826416, + "step": 836 + }, + { + "epoch": 1.1, + "learning_rate": 3.730296524095245e-05, + "logits/chosen": -2.5311949253082275, + "logits/rejected": -2.602909564971924, + "logps/chosen": -160.33348083496094, + "logps/rejected": -215.58802795410156, + "loss": 0.0861, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3376214504241943, + "rewards/margins": 3.4596052169799805, + "rewards/rejected": -4.797226428985596, + "step": 837 + }, + { + "epoch": 1.1, + "learning_rate": 3.7271761410822856e-05, + "logits/chosen": -2.503412961959839, + "logits/rejected": -2.4912285804748535, + "logps/chosen": -210.4758758544922, + "logps/rejected": -248.14144897460938, + "loss": 0.0999, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2760006189346313, + "rewards/margins": 3.6639609336853027, + "rewards/rejected": -4.939960956573486, + "step": 838 + }, + { + "epoch": 1.1, + "learning_rate": 3.724053237344294e-05, + "logits/chosen": -2.5100913047790527, + "logits/rejected": -2.6398231983184814, + "logps/chosen": -195.51553344726562, + "logps/rejected": -267.8132629394531, + "loss": 0.1612, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.472798466682434, + "rewards/margins": 3.3320348262786865, + "rewards/rejected": -4.80483341217041, + "step": 839 + }, + { + "epoch": 1.1, + "learning_rate": 3.720927819295979e-05, + "logits/chosen": -2.625965118408203, + "logits/rejected": -2.5181195735931396, + "logps/chosen": -195.74607849121094, + "logps/rejected": -225.23878479003906, + "loss": 0.2915, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5482072830200195, + "rewards/margins": 2.775569438934326, + "rewards/rejected": -4.3237762451171875, + "step": 840 + }, + { + "epoch": 1.1, + "learning_rate": 3.7177998933572186e-05, + "logits/chosen": -2.4342215061187744, + "logits/rejected": -2.4397995471954346, + "logps/chosen": -174.26548767089844, + "logps/rejected": -205.57427978515625, + "loss": 0.1855, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7328426837921143, + "rewards/margins": 2.65775990486145, + "rewards/rejected": -4.3906025886535645, + "step": 841 + }, + { + "epoch": 1.1, + "learning_rate": 3.7146694659530425e-05, + "logits/chosen": -2.3403022289276123, + "logits/rejected": -2.435096263885498, + "logps/chosen": -164.419921875, + "logps/rejected": -218.05117797851562, + "loss": 0.2598, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.064548969268799, + "rewards/margins": 2.839548110961914, + "rewards/rejected": -4.904096603393555, + "step": 842 + }, + { + "epoch": 1.1, + "learning_rate": 3.711536543513614e-05, + "logits/chosen": -2.5847721099853516, + "logits/rejected": -2.605027198791504, + "logps/chosen": -209.7950439453125, + "logps/rejected": -246.93609619140625, + "loss": 0.1416, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1541309356689453, + "rewards/margins": 3.411691665649414, + "rewards/rejected": -4.565822601318359, + "step": 843 + }, + { + "epoch": 1.1, + "learning_rate": 3.708401132474228e-05, + "logits/chosen": -2.5608434677124023, + "logits/rejected": -2.5270862579345703, + "logps/chosen": -178.58570861816406, + "logps/rejected": -224.67947387695312, + "loss": 0.0892, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.514496922492981, + "rewards/margins": 4.215465068817139, + "rewards/rejected": -4.72996187210083, + "step": 844 + }, + { + "epoch": 1.11, + "learning_rate": 3.705263239275284e-05, + "logits/chosen": -2.5800983905792236, + "logits/rejected": -2.674860715866089, + "logps/chosen": -186.04078674316406, + "logps/rejected": -223.0049591064453, + "loss": 0.1349, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.803220510482788, + "rewards/margins": 3.094984531402588, + "rewards/rejected": -4.898205280303955, + "step": 845 + }, + { + "epoch": 1.11, + "learning_rate": 3.702122870362286e-05, + "logits/chosen": -2.5478274822235107, + "logits/rejected": -2.5513339042663574, + "logps/chosen": -183.14073181152344, + "logps/rejected": -196.20660400390625, + "loss": 0.2509, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5068767070770264, + "rewards/margins": 2.48431134223938, + "rewards/rejected": -3.9911885261535645, + "step": 846 + }, + { + "epoch": 1.11, + "learning_rate": 3.698980032185821e-05, + "logits/chosen": -2.6585183143615723, + "logits/rejected": -2.657508373260498, + "logps/chosen": -188.41065979003906, + "logps/rejected": -238.37930297851562, + "loss": 0.2177, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8523781299591064, + "rewards/margins": 3.5584514141082764, + "rewards/rejected": -5.410830020904541, + "step": 847 + }, + { + "epoch": 1.11, + "learning_rate": 3.695834731201548e-05, + "logits/chosen": -2.541285991668701, + "logits/rejected": -2.4983725547790527, + "logps/chosen": -168.76828002929688, + "logps/rejected": -194.1188507080078, + "loss": 0.1259, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5406126976013184, + "rewards/margins": 3.1631386280059814, + "rewards/rejected": -4.703751564025879, + "step": 848 + }, + { + "epoch": 1.11, + "learning_rate": 3.692686973870184e-05, + "logits/chosen": -2.571265935897827, + "logits/rejected": -2.594231367111206, + "logps/chosen": -198.38992309570312, + "logps/rejected": -218.17489624023438, + "loss": 0.1462, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2703204154968262, + "rewards/margins": 2.8772685527801514, + "rewards/rejected": -4.147588729858398, + "step": 849 + }, + { + "epoch": 1.11, + "learning_rate": 3.689536766657494e-05, + "logits/chosen": -2.5608391761779785, + "logits/rejected": -2.6710116863250732, + "logps/chosen": -181.3638153076172, + "logps/rejected": -238.46327209472656, + "loss": 0.1191, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4569287300109863, + "rewards/margins": 3.5289573669433594, + "rewards/rejected": -4.9858856201171875, + "step": 850 + }, + { + "epoch": 1.11, + "learning_rate": 3.6863841160342723e-05, + "logits/chosen": -2.4190502166748047, + "logits/rejected": -2.4080772399902344, + "logps/chosen": -168.54371643066406, + "logps/rejected": -239.17726135253906, + "loss": 0.1511, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5876612663269043, + "rewards/margins": 3.6042914390563965, + "rewards/rejected": -5.191952228546143, + "step": 851 + }, + { + "epoch": 1.12, + "learning_rate": 3.683229028476334e-05, + "logits/chosen": -2.7769880294799805, + "logits/rejected": -2.769570827484131, + "logps/chosen": -244.13287353515625, + "logps/rejected": -266.6891174316406, + "loss": 0.0463, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8334072828292847, + "rewards/margins": 4.523943901062012, + "rewards/rejected": -6.357351303100586, + "step": 852 + }, + { + "epoch": 1.12, + "learning_rate": 3.6800715104645e-05, + "logits/chosen": -2.6356747150421143, + "logits/rejected": -2.676835298538208, + "logps/chosen": -202.72198486328125, + "logps/rejected": -218.10472106933594, + "loss": 0.1855, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5705070495605469, + "rewards/margins": 2.939119338989258, + "rewards/rejected": -4.509626388549805, + "step": 853 + }, + { + "epoch": 1.12, + "learning_rate": 3.676911568484583e-05, + "logits/chosen": -2.4276986122131348, + "logits/rejected": -2.474168062210083, + "logps/chosen": -172.96237182617188, + "logps/rejected": -211.53924560546875, + "loss": 0.0993, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.662520170211792, + "rewards/margins": 2.878678798675537, + "rewards/rejected": -4.541199207305908, + "step": 854 + }, + { + "epoch": 1.12, + "learning_rate": 3.673749209027375e-05, + "logits/chosen": -2.4743990898132324, + "logits/rejected": -2.4546518325805664, + "logps/chosen": -180.78338623046875, + "logps/rejected": -203.42556762695312, + "loss": 0.1602, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8969523906707764, + "rewards/margins": 2.973909378051758, + "rewards/rejected": -3.8708620071411133, + "step": 855 + }, + { + "epoch": 1.12, + "learning_rate": 3.6705844385886334e-05, + "logits/chosen": -2.610804796218872, + "logits/rejected": -2.624135732650757, + "logps/chosen": -185.5303497314453, + "logps/rejected": -201.92044067382812, + "loss": 0.3314, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7442423105239868, + "rewards/margins": 2.786667823791504, + "rewards/rejected": -4.530910491943359, + "step": 856 + }, + { + "epoch": 1.12, + "learning_rate": 3.667417263669068e-05, + "logits/chosen": -2.4083616733551025, + "logits/rejected": -2.6500542163848877, + "logps/chosen": -159.14784240722656, + "logps/rejected": -201.2216033935547, + "loss": 0.0765, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6013761758804321, + "rewards/margins": 3.6397151947021484, + "rewards/rejected": -5.241092205047607, + "step": 857 + }, + { + "epoch": 1.12, + "learning_rate": 3.6642476907743276e-05, + "logits/chosen": -2.4894661903381348, + "logits/rejected": -2.662278652191162, + "logps/chosen": -231.25991821289062, + "logps/rejected": -341.57867431640625, + "loss": 0.0664, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5516610145568848, + "rewards/margins": 4.967604160308838, + "rewards/rejected": -6.519265174865723, + "step": 858 + }, + { + "epoch": 1.12, + "learning_rate": 3.661075726414986e-05, + "logits/chosen": -2.5153987407684326, + "logits/rejected": -2.6378214359283447, + "logps/chosen": -197.08767700195312, + "logps/rejected": -257.2450256347656, + "loss": 0.0809, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.84122633934021, + "rewards/margins": 3.511990785598755, + "rewards/rejected": -5.353217124938965, + "step": 859 + }, + { + "epoch": 1.13, + "learning_rate": 3.6579013771065305e-05, + "logits/chosen": -2.3079957962036133, + "logits/rejected": -2.3357715606689453, + "logps/chosen": -141.00547790527344, + "logps/rejected": -158.54039001464844, + "loss": 0.2199, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.963112711906433, + "rewards/margins": 2.2221224308013916, + "rewards/rejected": -4.185235500335693, + "step": 860 + }, + { + "epoch": 1.13, + "learning_rate": 3.654724649369348e-05, + "logits/chosen": -2.587371587753296, + "logits/rejected": -2.6012768745422363, + "logps/chosen": -312.8792419433594, + "logps/rejected": -355.51837158203125, + "loss": 0.2491, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8825641870498657, + "rewards/margins": 4.342455863952637, + "rewards/rejected": -6.225020408630371, + "step": 861 + }, + { + "epoch": 1.13, + "learning_rate": 3.651545549728709e-05, + "logits/chosen": -2.416578769683838, + "logits/rejected": -2.4245691299438477, + "logps/chosen": -169.2411346435547, + "logps/rejected": -190.55685424804688, + "loss": 0.2026, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7248740196228027, + "rewards/margins": 2.488800048828125, + "rewards/rejected": -4.2136735916137695, + "step": 862 + }, + { + "epoch": 1.13, + "learning_rate": 3.6483640847147554e-05, + "logits/chosen": -2.4201467037200928, + "logits/rejected": -2.4285213947296143, + "logps/chosen": -189.98886108398438, + "logps/rejected": -231.26080322265625, + "loss": 0.1565, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8520042896270752, + "rewards/margins": 2.9693615436553955, + "rewards/rejected": -4.8213653564453125, + "step": 863 + }, + { + "epoch": 1.13, + "learning_rate": 3.645180260862492e-05, + "logits/chosen": -2.3902981281280518, + "logits/rejected": -2.4115846157073975, + "logps/chosen": -172.95211791992188, + "logps/rejected": -297.0254211425781, + "loss": 0.1506, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8114925622940063, + "rewards/margins": 3.8624112606048584, + "rewards/rejected": -5.673903465270996, + "step": 864 + }, + { + "epoch": 1.13, + "learning_rate": 3.6419940847117626e-05, + "logits/chosen": -2.375730514526367, + "logits/rejected": -2.5130038261413574, + "logps/chosen": -143.1685028076172, + "logps/rejected": -220.02430725097656, + "loss": 0.1561, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8501218557357788, + "rewards/margins": 3.1089580059051514, + "rewards/rejected": -4.959079742431641, + "step": 865 + }, + { + "epoch": 1.13, + "learning_rate": 3.638805562807249e-05, + "logits/chosen": -2.474574565887451, + "logits/rejected": -2.568755865097046, + "logps/chosen": -202.5961151123047, + "logps/rejected": -257.1708984375, + "loss": 0.0626, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8871090412139893, + "rewards/margins": 4.0226874351501465, + "rewards/rejected": -5.909796714782715, + "step": 866 + }, + { + "epoch": 1.13, + "learning_rate": 3.635614701698448e-05, + "logits/chosen": -2.459073543548584, + "logits/rejected": -2.4574649333953857, + "logps/chosen": -206.9467315673828, + "logps/rejected": -242.87551879882812, + "loss": 0.1437, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.290987491607666, + "rewards/margins": 3.7120237350463867, + "rewards/rejected": -6.003011226654053, + "step": 867 + }, + { + "epoch": 1.14, + "learning_rate": 3.632421507939661e-05, + "logits/chosen": -2.6140787601470947, + "logits/rejected": -2.612224578857422, + "logps/chosen": -223.91006469726562, + "logps/rejected": -291.6077880859375, + "loss": 0.0898, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.003418445587158, + "rewards/margins": 4.071234703063965, + "rewards/rejected": -6.074653625488281, + "step": 868 + }, + { + "epoch": 1.14, + "learning_rate": 3.629225988089983e-05, + "logits/chosen": -2.4317378997802734, + "logits/rejected": -2.6184799671173096, + "logps/chosen": -180.1226348876953, + "logps/rejected": -190.80392456054688, + "loss": 0.1817, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4009151458740234, + "rewards/margins": 2.961726427078247, + "rewards/rejected": -4.362641334533691, + "step": 869 + }, + { + "epoch": 1.14, + "learning_rate": 3.6260281487132846e-05, + "logits/chosen": -2.4276628494262695, + "logits/rejected": -2.3554205894470215, + "logps/chosen": -184.47543334960938, + "logps/rejected": -217.70945739746094, + "loss": 0.2233, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7305198907852173, + "rewards/margins": 3.172408103942871, + "rewards/rejected": -4.902928352355957, + "step": 870 + }, + { + "epoch": 1.14, + "learning_rate": 3.622827996378203e-05, + "logits/chosen": -2.5787577629089355, + "logits/rejected": -2.5663416385650635, + "logps/chosen": -188.5000762939453, + "logps/rejected": -188.4753875732422, + "loss": 0.2161, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2453864812850952, + "rewards/margins": 3.2564258575439453, + "rewards/rejected": -4.50181245803833, + "step": 871 + }, + { + "epoch": 1.14, + "learning_rate": 3.6196255376581254e-05, + "logits/chosen": -2.5789971351623535, + "logits/rejected": -2.5212442874908447, + "logps/chosen": -217.9554901123047, + "logps/rejected": -214.15769958496094, + "loss": 0.3423, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4449862241744995, + "rewards/margins": 2.885317325592041, + "rewards/rejected": -4.330303192138672, + "step": 872 + }, + { + "epoch": 1.14, + "learning_rate": 3.616420779131177e-05, + "logits/chosen": -2.3495595455169678, + "logits/rejected": -2.4180235862731934, + "logps/chosen": -194.8360595703125, + "logps/rejected": -278.6788024902344, + "loss": 0.1269, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.452872633934021, + "rewards/margins": 4.4202799797058105, + "rewards/rejected": -5.873153209686279, + "step": 873 + }, + { + "epoch": 1.14, + "learning_rate": 3.613213727380206e-05, + "logits/chosen": -2.5643696784973145, + "logits/rejected": -2.60063099861145, + "logps/chosen": -172.3582000732422, + "logps/rejected": -227.60833740234375, + "loss": 0.2933, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6060482263565063, + "rewards/margins": 3.702348232269287, + "rewards/rejected": -5.308396339416504, + "step": 874 + }, + { + "epoch": 1.15, + "learning_rate": 3.610004388992771e-05, + "logits/chosen": -2.4454939365386963, + "logits/rejected": -2.536020517349243, + "logps/chosen": -174.5612335205078, + "logps/rejected": -223.3062744140625, + "loss": 0.1318, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4121896028518677, + "rewards/margins": 3.2348647117614746, + "rewards/rejected": -4.647054195404053, + "step": 875 + }, + { + "epoch": 1.15, + "learning_rate": 3.6067927705611304e-05, + "logits/chosen": -2.4754014015197754, + "logits/rejected": -2.569913387298584, + "logps/chosen": -169.22718811035156, + "logps/rejected": -212.828857421875, + "loss": 0.176, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.641953945159912, + "rewards/margins": 3.349804401397705, + "rewards/rejected": -4.991758346557617, + "step": 876 + }, + { + "epoch": 1.15, + "learning_rate": 3.6035788786822225e-05, + "logits/chosen": -2.3765363693237305, + "logits/rejected": -2.436837911605835, + "logps/chosen": -183.66009521484375, + "logps/rejected": -224.55966186523438, + "loss": 0.151, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2727960348129272, + "rewards/margins": 2.735487461090088, + "rewards/rejected": -4.008283615112305, + "step": 877 + }, + { + "epoch": 1.15, + "learning_rate": 3.6003627199576564e-05, + "logits/chosen": -2.4541356563568115, + "logits/rejected": -2.5396711826324463, + "logps/chosen": -181.16824340820312, + "logps/rejected": -210.95333862304688, + "loss": 0.145, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5306426286697388, + "rewards/margins": 3.0862133502960205, + "rewards/rejected": -4.616855621337891, + "step": 878 + }, + { + "epoch": 1.15, + "learning_rate": 3.597144300993699e-05, + "logits/chosen": -2.450474977493286, + "logits/rejected": -2.512716293334961, + "logps/chosen": -183.2237548828125, + "logps/rejected": -270.755126953125, + "loss": 0.1186, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4356952905654907, + "rewards/margins": 3.521200656890869, + "rewards/rejected": -4.95689582824707, + "step": 879 + }, + { + "epoch": 1.15, + "learning_rate": 3.593923628401259e-05, + "logits/chosen": -2.49372935295105, + "logits/rejected": -2.5204477310180664, + "logps/chosen": -191.3932342529297, + "logps/rejected": -233.7782440185547, + "loss": 0.11, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4455353021621704, + "rewards/margins": 2.7963688373565674, + "rewards/rejected": -4.241904258728027, + "step": 880 + }, + { + "epoch": 1.15, + "learning_rate": 3.5907007087958726e-05, + "logits/chosen": -2.5309414863586426, + "logits/rejected": -2.7542734146118164, + "logps/chosen": -181.82196044921875, + "logps/rejected": -285.6480407714844, + "loss": 0.0853, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5503937005996704, + "rewards/margins": 3.967815399169922, + "rewards/rejected": -5.518209457397461, + "step": 881 + }, + { + "epoch": 1.15, + "learning_rate": 3.587475548797694e-05, + "logits/chosen": -2.563689708709717, + "logits/rejected": -2.4793877601623535, + "logps/chosen": -157.08343505859375, + "logps/rejected": -204.4723358154297, + "loss": 0.23, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6042224168777466, + "rewards/margins": 2.580112934112549, + "rewards/rejected": -4.184335231781006, + "step": 882 + }, + { + "epoch": 1.16, + "learning_rate": 3.5842481550314794e-05, + "logits/chosen": -2.490567445755005, + "logits/rejected": -2.5565853118896484, + "logps/chosen": -176.2081298828125, + "logps/rejected": -216.16978454589844, + "loss": 0.0932, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.209083080291748, + "rewards/margins": 3.1953208446502686, + "rewards/rejected": -4.4044036865234375, + "step": 883 + }, + { + "epoch": 1.16, + "learning_rate": 3.581018534126571e-05, + "logits/chosen": -2.52408504486084, + "logits/rejected": -2.550704002380371, + "logps/chosen": -211.55084228515625, + "logps/rejected": -234.77528381347656, + "loss": 0.1392, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2925100326538086, + "rewards/margins": 3.1794278621673584, + "rewards/rejected": -5.471938133239746, + "step": 884 + }, + { + "epoch": 1.16, + "learning_rate": 3.577786692716886e-05, + "logits/chosen": -2.5428433418273926, + "logits/rejected": -2.4777708053588867, + "logps/chosen": -165.32713317871094, + "logps/rejected": -190.3759765625, + "loss": 0.3405, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.022003650665283, + "rewards/margins": 2.210864543914795, + "rewards/rejected": -4.232868194580078, + "step": 885 + }, + { + "epoch": 1.16, + "learning_rate": 3.574552637440907e-05, + "logits/chosen": -2.411600351333618, + "logits/rejected": -2.4321343898773193, + "logps/chosen": -173.0433807373047, + "logps/rejected": -220.9730224609375, + "loss": 0.1121, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7523713111877441, + "rewards/margins": 3.258190631866455, + "rewards/rejected": -5.010562419891357, + "step": 886 + }, + { + "epoch": 1.16, + "learning_rate": 3.571316374941658e-05, + "logits/chosen": -2.3936336040496826, + "logits/rejected": -2.480240821838379, + "logps/chosen": -166.53944396972656, + "logps/rejected": -230.98146057128906, + "loss": 0.1113, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.883888304233551, + "rewards/margins": 3.743154287338257, + "rewards/rejected": -4.627042770385742, + "step": 887 + }, + { + "epoch": 1.16, + "learning_rate": 3.568077911866703e-05, + "logits/chosen": -2.4695091247558594, + "logits/rejected": -2.465665340423584, + "logps/chosen": -182.034912109375, + "logps/rejected": -231.8602294921875, + "loss": 0.1621, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.706071138381958, + "rewards/margins": 4.134305000305176, + "rewards/rejected": -5.840375900268555, + "step": 888 + }, + { + "epoch": 1.16, + "learning_rate": 3.564837254868118e-05, + "logits/chosen": -2.471785068511963, + "logits/rejected": -2.4700446128845215, + "logps/chosen": -168.14251708984375, + "logps/rejected": -205.8127899169922, + "loss": 0.1355, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1204754114151, + "rewards/margins": 2.8358914852142334, + "rewards/rejected": -3.956366539001465, + "step": 889 + }, + { + "epoch": 1.16, + "learning_rate": 3.561594410602495e-05, + "logits/chosen": -2.5301220417022705, + "logits/rejected": -2.5192551612854004, + "logps/chosen": -181.7959442138672, + "logps/rejected": -193.1697235107422, + "loss": 0.3494, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5961499214172363, + "rewards/margins": 2.624248504638672, + "rewards/rejected": -4.220398426055908, + "step": 890 + }, + { + "epoch": 1.17, + "learning_rate": 3.558349385730913e-05, + "logits/chosen": -2.67132306098938, + "logits/rejected": -2.6529135704040527, + "logps/chosen": -168.33639526367188, + "logps/rejected": -207.62216186523438, + "loss": 0.1778, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1539896726608276, + "rewards/margins": 3.851142644882202, + "rewards/rejected": -5.00513219833374, + "step": 891 + }, + { + "epoch": 1.17, + "learning_rate": 3.5551021869189286e-05, + "logits/chosen": -2.357180595397949, + "logits/rejected": -2.373013734817505, + "logps/chosen": -189.8119659423828, + "logps/rejected": -233.48062133789062, + "loss": 0.255, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8167555332183838, + "rewards/margins": 2.735703945159912, + "rewards/rejected": -4.552459716796875, + "step": 892 + }, + { + "epoch": 1.17, + "learning_rate": 3.55185282083657e-05, + "logits/chosen": -2.5426039695739746, + "logits/rejected": -2.6288843154907227, + "logps/chosen": -191.44874572753906, + "logps/rejected": -240.08837890625, + "loss": 0.1437, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5649197101593018, + "rewards/margins": 3.80615234375, + "rewards/rejected": -5.371071815490723, + "step": 893 + }, + { + "epoch": 1.17, + "learning_rate": 3.548601294158313e-05, + "logits/chosen": -2.582663059234619, + "logits/rejected": -2.585432291030884, + "logps/chosen": -197.2909393310547, + "logps/rejected": -229.80625915527344, + "loss": 0.154, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6019847393035889, + "rewards/margins": 3.1420061588287354, + "rewards/rejected": -4.743990421295166, + "step": 894 + }, + { + "epoch": 1.17, + "learning_rate": 3.5453476135630706e-05, + "logits/chosen": -2.5137104988098145, + "logits/rejected": -2.6177213191986084, + "logps/chosen": -198.3410186767578, + "logps/rejected": -256.9710693359375, + "loss": 0.1517, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4836442470550537, + "rewards/margins": 4.688479423522949, + "rewards/rejected": -6.172123908996582, + "step": 895 + }, + { + "epoch": 1.17, + "learning_rate": 3.542091785734184e-05, + "logits/chosen": -2.538341760635376, + "logits/rejected": -2.463745355606079, + "logps/chosen": -190.23663330078125, + "logps/rejected": -183.18646240234375, + "loss": 0.1924, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9041906595230103, + "rewards/margins": 2.5563762187957764, + "rewards/rejected": -4.460566520690918, + "step": 896 + }, + { + "epoch": 1.17, + "learning_rate": 3.538833817359401e-05, + "logits/chosen": -2.5077438354492188, + "logits/rejected": -2.4932098388671875, + "logps/chosen": -174.5640106201172, + "logps/rejected": -215.76971435546875, + "loss": 0.1574, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.460890531539917, + "rewards/margins": 3.877847194671631, + "rewards/rejected": -5.338737487792969, + "step": 897 + }, + { + "epoch": 1.18, + "learning_rate": 3.5355737151308686e-05, + "logits/chosen": -2.5635290145874023, + "logits/rejected": -2.613480567932129, + "logps/chosen": -174.78395080566406, + "logps/rejected": -211.57774353027344, + "loss": 0.1425, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5586146116256714, + "rewards/margins": 3.321430206298828, + "rewards/rejected": -4.880044937133789, + "step": 898 + }, + { + "epoch": 1.18, + "learning_rate": 3.5323114857451174e-05, + "logits/chosen": -2.665316581726074, + "logits/rejected": -2.619015693664551, + "logps/chosen": -179.59425354003906, + "logps/rejected": -238.99795532226562, + "loss": 0.18, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.855625867843628, + "rewards/margins": 3.34808349609375, + "rewards/rejected": -5.203709602355957, + "step": 899 + }, + { + "epoch": 1.18, + "learning_rate": 3.529047135903045e-05, + "logits/chosen": -2.5373458862304688, + "logits/rejected": -2.524705410003662, + "logps/chosen": -235.49473571777344, + "logps/rejected": -215.7819061279297, + "loss": 0.1887, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3308095932006836, + "rewards/margins": 3.505575180053711, + "rewards/rejected": -4.8363847732543945, + "step": 900 + }, + { + "epoch": 1.18, + "learning_rate": 3.525780672309907e-05, + "logits/chosen": -2.6096298694610596, + "logits/rejected": -2.6740314960479736, + "logps/chosen": -153.22409057617188, + "logps/rejected": -240.62652587890625, + "loss": 0.2415, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.565657138824463, + "rewards/margins": 3.297717332839966, + "rewards/rejected": -4.86337423324585, + "step": 901 + }, + { + "epoch": 1.18, + "learning_rate": 3.522512101675299e-05, + "logits/chosen": -2.3234548568725586, + "logits/rejected": -2.458498954772949, + "logps/chosen": -162.8458251953125, + "logps/rejected": -195.20492553710938, + "loss": 0.1239, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0964446067810059, + "rewards/margins": 3.5404043197631836, + "rewards/rejected": -4.6368489265441895, + "step": 902 + }, + { + "epoch": 1.18, + "learning_rate": 3.519241430713145e-05, + "logits/chosen": -2.495265007019043, + "logits/rejected": -2.479097604751587, + "logps/chosen": -236.592529296875, + "logps/rejected": -252.68394470214844, + "loss": 0.0643, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9273030161857605, + "rewards/margins": 3.584512948989868, + "rewards/rejected": -4.511816501617432, + "step": 903 + }, + { + "epoch": 1.18, + "learning_rate": 3.5159686661416834e-05, + "logits/chosen": -2.364778518676758, + "logits/rejected": -2.4186415672302246, + "logps/chosen": -172.12220764160156, + "logps/rejected": -220.8270263671875, + "loss": 0.1384, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0367463827133179, + "rewards/margins": 2.731386423110962, + "rewards/rejected": -3.7681326866149902, + "step": 904 + }, + { + "epoch": 1.18, + "learning_rate": 3.512693814683456e-05, + "logits/chosen": -2.630051612854004, + "logits/rejected": -2.6269326210021973, + "logps/chosen": -176.4068145751953, + "logps/rejected": -267.4722900390625, + "loss": 0.1077, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4196218252182007, + "rewards/margins": 3.308180809020996, + "rewards/rejected": -4.727802753448486, + "step": 905 + }, + { + "epoch": 1.19, + "learning_rate": 3.5094168830652854e-05, + "logits/chosen": -2.50472354888916, + "logits/rejected": -2.5423359870910645, + "logps/chosen": -184.10879516601562, + "logps/rejected": -237.14556884765625, + "loss": 0.0905, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1987833976745605, + "rewards/margins": 4.0037665367126465, + "rewards/rejected": -5.202549934387207, + "step": 906 + }, + { + "epoch": 1.19, + "learning_rate": 3.506137878018272e-05, + "logits/chosen": -2.5517704486846924, + "logits/rejected": -2.548060655593872, + "logps/chosen": -199.19915771484375, + "logps/rejected": -234.20936584472656, + "loss": 0.1199, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.356502890586853, + "rewards/margins": 3.5127556324005127, + "rewards/rejected": -4.869258403778076, + "step": 907 + }, + { + "epoch": 1.19, + "learning_rate": 3.502856806277773e-05, + "logits/chosen": -2.4246985912323, + "logits/rejected": -2.3955211639404297, + "logps/chosen": -199.87081909179688, + "logps/rejected": -204.13876342773438, + "loss": 0.1924, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4611130952835083, + "rewards/margins": 3.7410144805908203, + "rewards/rejected": -5.202127933502197, + "step": 908 + }, + { + "epoch": 1.19, + "learning_rate": 3.4995736745833895e-05, + "logits/chosen": -2.5710816383361816, + "logits/rejected": -2.6107711791992188, + "logps/chosen": -157.6302490234375, + "logps/rejected": -233.49349975585938, + "loss": 0.0864, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9010351300239563, + "rewards/margins": 3.532578706741333, + "rewards/rejected": -4.4336137771606445, + "step": 909 + }, + { + "epoch": 1.19, + "learning_rate": 3.496288489678958e-05, + "logits/chosen": -2.52079701423645, + "logits/rejected": -2.495419502258301, + "logps/chosen": -176.95236206054688, + "logps/rejected": -222.7263641357422, + "loss": 0.1798, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7388372421264648, + "rewards/margins": 3.1190779209136963, + "rewards/rejected": -4.857914924621582, + "step": 910 + }, + { + "epoch": 1.19, + "learning_rate": 3.493001258312529e-05, + "logits/chosen": -2.7223851680755615, + "logits/rejected": -2.791260242462158, + "logps/chosen": -206.46063232421875, + "logps/rejected": -213.7384796142578, + "loss": 0.089, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8603122234344482, + "rewards/margins": 3.696009635925293, + "rewards/rejected": -4.556321144104004, + "step": 911 + }, + { + "epoch": 1.19, + "learning_rate": 3.489711987236357e-05, + "logits/chosen": -2.5706539154052734, + "logits/rejected": -2.581258773803711, + "logps/chosen": -203.28652954101562, + "logps/rejected": -235.77676391601562, + "loss": 0.2027, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5995769500732422, + "rewards/margins": 3.2174808979034424, + "rewards/rejected": -4.817058086395264, + "step": 912 + }, + { + "epoch": 1.2, + "learning_rate": 3.4864206832068884e-05, + "logits/chosen": -2.6766703128814697, + "logits/rejected": -2.6705029010772705, + "logps/chosen": -180.23080444335938, + "logps/rejected": -235.3216094970703, + "loss": 0.1355, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.026005506515503, + "rewards/margins": 3.379913806915283, + "rewards/rejected": -4.405919075012207, + "step": 913 + }, + { + "epoch": 1.2, + "learning_rate": 3.483127352984742e-05, + "logits/chosen": -2.3571276664733887, + "logits/rejected": -2.5192885398864746, + "logps/chosen": -162.15489196777344, + "logps/rejected": -215.78590393066406, + "loss": 0.1989, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5191320180892944, + "rewards/margins": 3.3156440258026123, + "rewards/rejected": -4.834775924682617, + "step": 914 + }, + { + "epoch": 1.2, + "learning_rate": 3.479832003334702e-05, + "logits/chosen": -2.554062604904175, + "logits/rejected": -2.588319778442383, + "logps/chosen": -220.20921325683594, + "logps/rejected": -267.699951171875, + "loss": 0.0675, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2863855361938477, + "rewards/margins": 3.6802079677581787, + "rewards/rejected": -4.9665937423706055, + "step": 915 + }, + { + "epoch": 1.2, + "learning_rate": 3.476534641025698e-05, + "logits/chosen": -2.545753002166748, + "logits/rejected": -2.56862735748291, + "logps/chosen": -192.8128662109375, + "logps/rejected": -249.750732421875, + "loss": 0.1533, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2434145212173462, + "rewards/margins": 3.0867209434509277, + "rewards/rejected": -4.330134868621826, + "step": 916 + }, + { + "epoch": 1.2, + "learning_rate": 3.4732352728307966e-05, + "logits/chosen": -2.8638410568237305, + "logits/rejected": -2.8774917125701904, + "logps/chosen": -217.84780883789062, + "logps/rejected": -254.37258911132812, + "loss": 0.101, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.095003366470337, + "rewards/margins": 4.037415981292725, + "rewards/rejected": -5.132419586181641, + "step": 917 + }, + { + "epoch": 1.2, + "learning_rate": 3.469933905527182e-05, + "logits/chosen": -2.551844835281372, + "logits/rejected": -2.530813217163086, + "logps/chosen": -185.98265075683594, + "logps/rejected": -205.4719696044922, + "loss": 0.283, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1485133171081543, + "rewards/margins": 2.621171474456787, + "rewards/rejected": -3.7696852684020996, + "step": 918 + }, + { + "epoch": 1.2, + "learning_rate": 3.466630545896146e-05, + "logits/chosen": -2.413818359375, + "logits/rejected": -2.587848663330078, + "logps/chosen": -165.9118194580078, + "logps/rejected": -241.44552612304688, + "loss": 0.0605, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4189995527267456, + "rewards/margins": 3.7784295082092285, + "rewards/rejected": -5.197429180145264, + "step": 919 + }, + { + "epoch": 1.2, + "learning_rate": 3.463325200723071e-05, + "logits/chosen": -2.4706473350524902, + "logits/rejected": -2.5091466903686523, + "logps/chosen": -201.78915405273438, + "logps/rejected": -254.05223083496094, + "loss": 0.102, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2410138845443726, + "rewards/margins": 3.5578086376190186, + "rewards/rejected": -4.798822402954102, + "step": 920 + }, + { + "epoch": 1.21, + "learning_rate": 3.460017876797422e-05, + "logits/chosen": -2.6846604347229004, + "logits/rejected": -2.771611213684082, + "logps/chosen": -182.0282440185547, + "logps/rejected": -241.62359619140625, + "loss": 0.1928, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3348745107650757, + "rewards/margins": 3.6126434803009033, + "rewards/rejected": -4.947518348693848, + "step": 921 + }, + { + "epoch": 1.21, + "learning_rate": 3.456708580912725e-05, + "logits/chosen": -2.4890265464782715, + "logits/rejected": -2.4979093074798584, + "logps/chosen": -202.8487091064453, + "logps/rejected": -264.9278564453125, + "loss": 0.1011, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8148360848426819, + "rewards/margins": 3.85543155670166, + "rewards/rejected": -4.670267581939697, + "step": 922 + }, + { + "epoch": 1.21, + "learning_rate": 3.453397319866557e-05, + "logits/chosen": -2.672006607055664, + "logits/rejected": -2.696829319000244, + "logps/chosen": -230.58523559570312, + "logps/rejected": -281.7519836425781, + "loss": 0.2093, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6621661186218262, + "rewards/margins": 3.0963659286499023, + "rewards/rejected": -4.7585320472717285, + "step": 923 + }, + { + "epoch": 1.21, + "learning_rate": 3.4500841004605324e-05, + "logits/chosen": -2.795738697052002, + "logits/rejected": -2.686603546142578, + "logps/chosen": -207.413330078125, + "logps/rejected": -196.3363494873047, + "loss": 0.1551, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3747926950454712, + "rewards/margins": 2.683774948120117, + "rewards/rejected": -4.058568000793457, + "step": 924 + }, + { + "epoch": 1.21, + "learning_rate": 3.446768929500288e-05, + "logits/chosen": -2.398050546646118, + "logits/rejected": -2.525655746459961, + "logps/chosen": -146.66969299316406, + "logps/rejected": -216.01690673828125, + "loss": 0.1013, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3923161029815674, + "rewards/margins": 3.224682092666626, + "rewards/rejected": -4.616998195648193, + "step": 925 + }, + { + "epoch": 1.21, + "learning_rate": 3.443451813795469e-05, + "logits/chosen": -2.4953365325927734, + "logits/rejected": -2.539217948913574, + "logps/chosen": -178.64605712890625, + "logps/rejected": -244.44332885742188, + "loss": 0.0864, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8870552778244019, + "rewards/margins": 4.003358840942383, + "rewards/rejected": -4.890414237976074, + "step": 926 + }, + { + "epoch": 1.21, + "learning_rate": 3.4401327601597174e-05, + "logits/chosen": -2.5432627201080322, + "logits/rejected": -2.676295757293701, + "logps/chosen": -162.99037170410156, + "logps/rejected": -223.5609893798828, + "loss": 0.1389, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2814667224884033, + "rewards/margins": 3.612804889678955, + "rewards/rejected": -4.894271373748779, + "step": 927 + }, + { + "epoch": 1.21, + "learning_rate": 3.436811775410651e-05, + "logits/chosen": -2.607267141342163, + "logits/rejected": -2.661623477935791, + "logps/chosen": -182.5743408203125, + "logps/rejected": -186.42269897460938, + "loss": 0.3344, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5140290260314941, + "rewards/margins": 2.287888526916504, + "rewards/rejected": -3.801917314529419, + "step": 928 + }, + { + "epoch": 1.22, + "learning_rate": 3.43348886636986e-05, + "logits/chosen": -2.422396421432495, + "logits/rejected": -2.606943130493164, + "logps/chosen": -172.03494262695312, + "logps/rejected": -237.93191528320312, + "loss": 0.1241, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8440936803817749, + "rewards/margins": 4.104944705963135, + "rewards/rejected": -4.949038505554199, + "step": 929 + }, + { + "epoch": 1.22, + "learning_rate": 3.430164039862882e-05, + "logits/chosen": -2.6767523288726807, + "logits/rejected": -2.6473617553710938, + "logps/chosen": -182.46820068359375, + "logps/rejected": -241.1190185546875, + "loss": 0.1087, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0853631496429443, + "rewards/margins": 4.447392463684082, + "rewards/rejected": -5.5327558517456055, + "step": 930 + }, + { + "epoch": 1.22, + "learning_rate": 3.426837302719197e-05, + "logits/chosen": -2.4199132919311523, + "logits/rejected": -2.390627384185791, + "logps/chosen": -169.13812255859375, + "logps/rejected": -212.91665649414062, + "loss": 0.1015, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5601575374603271, + "rewards/margins": 4.0769805908203125, + "rewards/rejected": -5.637138366699219, + "step": 931 + }, + { + "epoch": 1.22, + "learning_rate": 3.42350866177221e-05, + "logits/chosen": -2.603242874145508, + "logits/rejected": -2.6754183769226074, + "logps/chosen": -180.8822479248047, + "logps/rejected": -224.43988037109375, + "loss": 0.2815, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2987481355667114, + "rewards/margins": 2.230471611022949, + "rewards/rejected": -3.52921986579895, + "step": 932 + }, + { + "epoch": 1.22, + "learning_rate": 3.420178123859233e-05, + "logits/chosen": -2.5300185680389404, + "logits/rejected": -2.5311319828033447, + "logps/chosen": -187.79489135742188, + "logps/rejected": -245.074951171875, + "loss": 0.1293, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.442630410194397, + "rewards/margins": 4.003236293792725, + "rewards/rejected": -5.445866107940674, + "step": 933 + }, + { + "epoch": 1.22, + "learning_rate": 3.416845695821476e-05, + "logits/chosen": -2.478121757507324, + "logits/rejected": -2.551612377166748, + "logps/chosen": -170.02886962890625, + "logps/rejected": -266.62445068359375, + "loss": 0.2426, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5788254737854004, + "rewards/margins": 3.298884630203247, + "rewards/rejected": -4.877709865570068, + "step": 934 + }, + { + "epoch": 1.22, + "learning_rate": 3.413511384504034e-05, + "logits/chosen": -2.541309356689453, + "logits/rejected": -2.6793863773345947, + "logps/chosen": -164.0281524658203, + "logps/rejected": -277.7725830078125, + "loss": 0.0432, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.379370927810669, + "rewards/margins": 4.741353988647461, + "rewards/rejected": -6.120724678039551, + "step": 935 + }, + { + "epoch": 1.23, + "learning_rate": 3.410175196755866e-05, + "logits/chosen": -2.5042457580566406, + "logits/rejected": -2.5594520568847656, + "logps/chosen": -231.91162109375, + "logps/rejected": -235.10675048828125, + "loss": 0.2673, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9761120080947876, + "rewards/margins": 2.6676290035247803, + "rewards/rejected": -4.643741130828857, + "step": 936 + }, + { + "epoch": 1.23, + "learning_rate": 3.40683713942979e-05, + "logits/chosen": -2.563133478164673, + "logits/rejected": -2.469977378845215, + "logps/chosen": -238.8995361328125, + "logps/rejected": -281.9659118652344, + "loss": 0.0544, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.357677936553955, + "rewards/margins": 4.857703685760498, + "rewards/rejected": -6.215381622314453, + "step": 937 + }, + { + "epoch": 1.23, + "learning_rate": 3.403497219382461e-05, + "logits/chosen": -2.6389622688293457, + "logits/rejected": -2.6753883361816406, + "logps/chosen": -155.08253479003906, + "logps/rejected": -225.48619079589844, + "loss": 0.0971, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.296984314918518, + "rewards/margins": 3.8950109481811523, + "rewards/rejected": -5.191995143890381, + "step": 938 + }, + { + "epoch": 1.23, + "learning_rate": 3.400155443474361e-05, + "logits/chosen": -2.5711069107055664, + "logits/rejected": -2.539632558822632, + "logps/chosen": -297.8182067871094, + "logps/rejected": -333.8896484375, + "loss": 0.1312, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.230386257171631, + "rewards/margins": 4.170665740966797, + "rewards/rejected": -6.401051998138428, + "step": 939 + }, + { + "epoch": 1.23, + "learning_rate": 3.396811818569785e-05, + "logits/chosen": -2.684633731842041, + "logits/rejected": -2.71695876121521, + "logps/chosen": -240.3818359375, + "logps/rejected": -274.24285888671875, + "loss": 0.1441, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5472981929779053, + "rewards/margins": 3.971374750137329, + "rewards/rejected": -5.518672466278076, + "step": 940 + }, + { + "epoch": 1.23, + "learning_rate": 3.3934663515368236e-05, + "logits/chosen": -2.6104092597961426, + "logits/rejected": -2.594395637512207, + "logps/chosen": -194.75103759765625, + "logps/rejected": -240.4493408203125, + "loss": 0.1974, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7447320222854614, + "rewards/margins": 3.628329277038574, + "rewards/rejected": -5.373061656951904, + "step": 941 + }, + { + "epoch": 1.23, + "learning_rate": 3.3901190492473554e-05, + "logits/chosen": -2.6302521228790283, + "logits/rejected": -2.7227888107299805, + "logps/chosen": -170.1224365234375, + "logps/rejected": -237.1372528076172, + "loss": 0.1896, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.631825566291809, + "rewards/margins": 2.9237256050109863, + "rewards/rejected": -4.555551052093506, + "step": 942 + }, + { + "epoch": 1.23, + "learning_rate": 3.3867699185770255e-05, + "logits/chosen": -2.7121634483337402, + "logits/rejected": -2.681286573410034, + "logps/chosen": -186.2149200439453, + "logps/rejected": -199.51651000976562, + "loss": 0.152, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9240370988845825, + "rewards/margins": 2.6966371536254883, + "rewards/rejected": -4.620674133300781, + "step": 943 + }, + { + "epoch": 1.24, + "learning_rate": 3.383418966405234e-05, + "logits/chosen": -2.650979995727539, + "logits/rejected": -2.7104907035827637, + "logps/chosen": -201.94180297851562, + "logps/rejected": -240.47752380371094, + "loss": 0.2521, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5346249341964722, + "rewards/margins": 3.591853618621826, + "rewards/rejected": -5.126478672027588, + "step": 944 + }, + { + "epoch": 1.24, + "learning_rate": 3.3800661996151264e-05, + "logits/chosen": -2.4865236282348633, + "logits/rejected": -2.513885736465454, + "logps/chosen": -193.3810577392578, + "logps/rejected": -244.85964965820312, + "loss": 0.1085, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4318909645080566, + "rewards/margins": 4.656887054443359, + "rewards/rejected": -6.088778018951416, + "step": 945 + }, + { + "epoch": 1.24, + "learning_rate": 3.376711625093571e-05, + "logits/chosen": -2.6373162269592285, + "logits/rejected": -2.581425905227661, + "logps/chosen": -169.510009765625, + "logps/rejected": -241.111572265625, + "loss": 0.13, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7232588529586792, + "rewards/margins": 4.364504337310791, + "rewards/rejected": -6.08776330947876, + "step": 946 + }, + { + "epoch": 1.24, + "learning_rate": 3.373355249731153e-05, + "logits/chosen": -2.6107871532440186, + "logits/rejected": -2.5420989990234375, + "logps/chosen": -181.519775390625, + "logps/rejected": -240.91998291015625, + "loss": 0.2422, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9072514772415161, + "rewards/margins": 3.571643352508545, + "rewards/rejected": -5.47889518737793, + "step": 947 + }, + { + "epoch": 1.24, + "learning_rate": 3.369997080422155e-05, + "logits/chosen": -2.550039768218994, + "logits/rejected": -2.630925178527832, + "logps/chosen": -230.58935546875, + "logps/rejected": -251.55810546875, + "loss": 0.1636, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.60187029838562, + "rewards/margins": 3.3375368118286133, + "rewards/rejected": -5.939406871795654, + "step": 948 + }, + { + "epoch": 1.24, + "learning_rate": 3.366637124064544e-05, + "logits/chosen": -2.6128439903259277, + "logits/rejected": -2.619659185409546, + "logps/chosen": -160.36740112304688, + "logps/rejected": -200.9753875732422, + "loss": 0.1855, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7813210487365723, + "rewards/margins": 2.907754898071289, + "rewards/rejected": -4.689075946807861, + "step": 949 + }, + { + "epoch": 1.24, + "learning_rate": 3.36327538755996e-05, + "logits/chosen": -2.562918186187744, + "logits/rejected": -2.6631853580474854, + "logps/chosen": -201.80294799804688, + "logps/rejected": -263.9149475097656, + "loss": 0.0469, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5491386651992798, + "rewards/margins": 4.151284217834473, + "rewards/rejected": -5.700422286987305, + "step": 950 + }, + { + "epoch": 1.24, + "learning_rate": 3.3599118778136965e-05, + "logits/chosen": -2.6970577239990234, + "logits/rejected": -2.7352042198181152, + "logps/chosen": -186.6042938232422, + "logps/rejected": -264.8851318359375, + "loss": 0.0584, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.238983631134033, + "rewards/margins": 3.6914262771606445, + "rewards/rejected": -5.930410385131836, + "step": 951 + }, + { + "epoch": 1.25, + "learning_rate": 3.356546601734692e-05, + "logits/chosen": -2.624141216278076, + "logits/rejected": -2.6024389266967773, + "logps/chosen": -169.18344116210938, + "logps/rejected": -194.87644958496094, + "loss": 0.2061, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8763582706451416, + "rewards/margins": 3.1667044162750244, + "rewards/rejected": -5.043062686920166, + "step": 952 + }, + { + "epoch": 1.25, + "learning_rate": 3.3531795662355115e-05, + "logits/chosen": -2.6674022674560547, + "logits/rejected": -2.7814764976501465, + "logps/chosen": -169.5562286376953, + "logps/rejected": -216.2843475341797, + "loss": 0.1286, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6532480716705322, + "rewards/margins": 3.105289936065674, + "rewards/rejected": -4.758538246154785, + "step": 953 + }, + { + "epoch": 1.25, + "learning_rate": 3.349810778232335e-05, + "logits/chosen": -2.4326975345611572, + "logits/rejected": -2.509242534637451, + "logps/chosen": -186.85870361328125, + "logps/rejected": -209.04522705078125, + "loss": 0.1173, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6442396640777588, + "rewards/margins": 3.101438522338867, + "rewards/rejected": -4.745677947998047, + "step": 954 + }, + { + "epoch": 1.25, + "learning_rate": 3.346440244644942e-05, + "logits/chosen": -2.597878932952881, + "logits/rejected": -2.6052956581115723, + "logps/chosen": -194.63372802734375, + "logps/rejected": -213.82635498046875, + "loss": 0.2907, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.813546895980835, + "rewards/margins": 2.6921236515045166, + "rewards/rejected": -4.505670547485352, + "step": 955 + }, + { + "epoch": 1.25, + "learning_rate": 3.3430679723966976e-05, + "logits/chosen": -2.5130550861358643, + "logits/rejected": -2.5555241107940674, + "logps/chosen": -144.39450073242188, + "logps/rejected": -201.18093872070312, + "loss": 0.26, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6298218965530396, + "rewards/margins": 2.9413676261901855, + "rewards/rejected": -4.5711894035339355, + "step": 956 + }, + { + "epoch": 1.25, + "learning_rate": 3.339693968414538e-05, + "logits/chosen": -2.536893844604492, + "logits/rejected": -2.672922372817993, + "logps/chosen": -144.6156768798828, + "logps/rejected": -195.33729553222656, + "loss": 0.283, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9383922815322876, + "rewards/margins": 2.168663501739502, + "rewards/rejected": -4.1070556640625, + "step": 957 + }, + { + "epoch": 1.25, + "learning_rate": 3.336318239628956e-05, + "logits/chosen": -2.6276655197143555, + "logits/rejected": -2.6729187965393066, + "logps/chosen": -169.65625, + "logps/rejected": -203.56922912597656, + "loss": 0.1577, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.270064115524292, + "rewards/margins": 3.2413148880004883, + "rewards/rejected": -4.511379241943359, + "step": 958 + }, + { + "epoch": 1.26, + "learning_rate": 3.3329407929739906e-05, + "logits/chosen": -2.708746910095215, + "logits/rejected": -2.751272201538086, + "logps/chosen": -200.1501922607422, + "logps/rejected": -251.32875061035156, + "loss": 0.197, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.698577880859375, + "rewards/margins": 3.6546480655670166, + "rewards/rejected": -5.3532257080078125, + "step": 959 + }, + { + "epoch": 1.26, + "learning_rate": 3.3295616353872026e-05, + "logits/chosen": -2.5632784366607666, + "logits/rejected": -2.617943048477173, + "logps/chosen": -173.5696258544922, + "logps/rejected": -198.16314697265625, + "loss": 0.2387, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.636803388595581, + "rewards/margins": 2.631190299987793, + "rewards/rejected": -4.267993450164795, + "step": 960 + }, + { + "epoch": 1.26, + "learning_rate": 3.326180773809676e-05, + "logits/chosen": -2.4898295402526855, + "logits/rejected": -2.5250651836395264, + "logps/chosen": -193.2848663330078, + "logps/rejected": -234.1524658203125, + "loss": 0.0843, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6962850093841553, + "rewards/margins": 3.261317253112793, + "rewards/rejected": -4.957602500915527, + "step": 961 + }, + { + "epoch": 1.26, + "learning_rate": 3.3227982151859873e-05, + "logits/chosen": -2.6020333766937256, + "logits/rejected": -2.590635299682617, + "logps/chosen": -200.148681640625, + "logps/rejected": -242.25833129882812, + "loss": 0.1046, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.01560640335083, + "rewards/margins": 4.58357572555542, + "rewards/rejected": -6.59918212890625, + "step": 962 + }, + { + "epoch": 1.26, + "learning_rate": 3.3194139664642035e-05, + "logits/chosen": -2.5765419006347656, + "logits/rejected": -2.623060464859009, + "logps/chosen": -206.80001831054688, + "logps/rejected": -253.6566162109375, + "loss": 0.123, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8534865379333496, + "rewards/margins": 3.36710786819458, + "rewards/rejected": -5.22059440612793, + "step": 963 + }, + { + "epoch": 1.26, + "learning_rate": 3.3160280345958614e-05, + "logits/chosen": -2.706054449081421, + "logits/rejected": -2.7712466716766357, + "logps/chosen": -218.05075073242188, + "logps/rejected": -267.1859436035156, + "loss": 0.0725, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8123269081115723, + "rewards/margins": 3.9265780448913574, + "rewards/rejected": -5.738905429840088, + "step": 964 + }, + { + "epoch": 1.26, + "learning_rate": 3.3126404265359545e-05, + "logits/chosen": -2.6757712364196777, + "logits/rejected": -2.7458887100219727, + "logps/chosen": -201.00808715820312, + "logps/rejected": -240.7850341796875, + "loss": 0.0353, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6226627826690674, + "rewards/margins": 4.559228897094727, + "rewards/rejected": -6.181891918182373, + "step": 965 + }, + { + "epoch": 1.26, + "learning_rate": 3.3092511492429216e-05, + "logits/chosen": -2.644242763519287, + "logits/rejected": -2.695011615753174, + "logps/chosen": -162.74783325195312, + "logps/rejected": -192.76516723632812, + "loss": 0.1974, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1141974925994873, + "rewards/margins": 2.7216439247131348, + "rewards/rejected": -4.835841178894043, + "step": 966 + }, + { + "epoch": 1.27, + "learning_rate": 3.305860209678628e-05, + "logits/chosen": -2.4524073600769043, + "logits/rejected": -2.549844264984131, + "logps/chosen": -194.18768310546875, + "logps/rejected": -271.9709167480469, + "loss": 0.2353, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8129292726516724, + "rewards/margins": 3.5435562133789062, + "rewards/rejected": -5.356485843658447, + "step": 967 + }, + { + "epoch": 1.27, + "learning_rate": 3.3024676148083555e-05, + "logits/chosen": -2.518044948577881, + "logits/rejected": -2.5665040016174316, + "logps/chosen": -208.9219970703125, + "logps/rejected": -242.92112731933594, + "loss": 0.0321, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6226799488067627, + "rewards/margins": 4.56795597076416, + "rewards/rejected": -6.190636157989502, + "step": 968 + }, + { + "epoch": 1.27, + "learning_rate": 3.299073371600784e-05, + "logits/chosen": -2.580070972442627, + "logits/rejected": -2.726609945297241, + "logps/chosen": -179.52996826171875, + "logps/rejected": -268.6859130859375, + "loss": 0.0683, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1363651752471924, + "rewards/margins": 3.852691173553467, + "rewards/rejected": -5.989056587219238, + "step": 969 + }, + { + "epoch": 1.27, + "learning_rate": 3.29567748702798e-05, + "logits/chosen": -2.806037425994873, + "logits/rejected": -2.7987043857574463, + "logps/chosen": -192.37161254882812, + "logps/rejected": -251.89358520507812, + "loss": 0.0796, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6633442640304565, + "rewards/margins": 3.375584602355957, + "rewards/rejected": -5.038928985595703, + "step": 970 + }, + { + "epoch": 1.27, + "learning_rate": 3.2922799680653816e-05, + "logits/chosen": -2.572334051132202, + "logits/rejected": -2.6026675701141357, + "logps/chosen": -241.74142456054688, + "logps/rejected": -274.17462158203125, + "loss": 0.2632, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.3693084716796875, + "rewards/margins": 2.436305522918701, + "rewards/rejected": -4.805613994598389, + "step": 971 + }, + { + "epoch": 1.27, + "learning_rate": 3.288880821691785e-05, + "logits/chosen": -2.696925640106201, + "logits/rejected": -2.7146036624908447, + "logps/chosen": -237.42230224609375, + "logps/rejected": -229.28921508789062, + "loss": 0.1131, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2252135276794434, + "rewards/margins": 3.5753629207611084, + "rewards/rejected": -5.800576686859131, + "step": 972 + }, + { + "epoch": 1.27, + "learning_rate": 3.285480054889327e-05, + "logits/chosen": -2.474309206008911, + "logits/rejected": -2.537996292114258, + "logps/chosen": -221.17630004882812, + "logps/rejected": -248.78753662109375, + "loss": 0.1531, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.4152324199676514, + "rewards/margins": 3.2296578884124756, + "rewards/rejected": -5.644889831542969, + "step": 973 + }, + { + "epoch": 1.27, + "learning_rate": 3.2820776746434764e-05, + "logits/chosen": -2.652519941329956, + "logits/rejected": -2.750889778137207, + "logps/chosen": -247.38218688964844, + "logps/rejected": -280.9444274902344, + "loss": 0.2395, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.102778911590576, + "rewards/margins": 2.95568585395813, + "rewards/rejected": -5.058465003967285, + "step": 974 + }, + { + "epoch": 1.28, + "learning_rate": 3.278673687943011e-05, + "logits/chosen": -2.430567979812622, + "logits/rejected": -2.61224365234375, + "logps/chosen": -188.28207397460938, + "logps/rejected": -259.6468811035156, + "loss": 0.057, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8571250438690186, + "rewards/margins": 4.498969554901123, + "rewards/rejected": -6.3560943603515625, + "step": 975 + }, + { + "epoch": 1.28, + "learning_rate": 3.2752681017800144e-05, + "logits/chosen": -2.642226457595825, + "logits/rejected": -2.6351287364959717, + "logps/chosen": -179.3868865966797, + "logps/rejected": -236.36964416503906, + "loss": 0.1584, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0731639862060547, + "rewards/margins": 3.906996965408325, + "rewards/rejected": -5.980160713195801, + "step": 976 + }, + { + "epoch": 1.28, + "learning_rate": 3.27186092314985e-05, + "logits/chosen": -2.643181562423706, + "logits/rejected": -2.6815311908721924, + "logps/chosen": -188.3582000732422, + "logps/rejected": -238.82688903808594, + "loss": 0.1769, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3766210079193115, + "rewards/margins": 3.4544308185577393, + "rewards/rejected": -5.831052303314209, + "step": 977 + }, + { + "epoch": 1.28, + "learning_rate": 3.2684521590511566e-05, + "logits/chosen": -2.5185253620147705, + "logits/rejected": -2.6350820064544678, + "logps/chosen": -212.4395294189453, + "logps/rejected": -258.64971923828125, + "loss": 0.0798, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6726995706558228, + "rewards/margins": 4.015776634216309, + "rewards/rejected": -5.688476085662842, + "step": 978 + }, + { + "epoch": 1.28, + "learning_rate": 3.2650418164858284e-05, + "logits/chosen": -2.552361011505127, + "logits/rejected": -2.6091713905334473, + "logps/chosen": -183.578857421875, + "logps/rejected": -226.1600341796875, + "loss": 0.1787, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2989511489868164, + "rewards/margins": 4.482161521911621, + "rewards/rejected": -6.781111717224121, + "step": 979 + }, + { + "epoch": 1.28, + "learning_rate": 3.261629902459e-05, + "logits/chosen": -2.7338201999664307, + "logits/rejected": -2.632646322250366, + "logps/chosen": -247.8429718017578, + "logps/rejected": -241.37171936035156, + "loss": 0.1638, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2429699897766113, + "rewards/margins": 3.6616616249084473, + "rewards/rejected": -5.904631614685059, + "step": 980 + }, + { + "epoch": 1.28, + "learning_rate": 3.258216423979037e-05, + "logits/chosen": -2.3317790031433105, + "logits/rejected": -2.4023687839508057, + "logps/chosen": -152.5063934326172, + "logps/rejected": -252.93801879882812, + "loss": 0.0677, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.337461471557617, + "rewards/margins": 3.838200569152832, + "rewards/rejected": -6.175662040710449, + "step": 981 + }, + { + "epoch": 1.29, + "learning_rate": 3.254801388057514e-05, + "logits/chosen": -2.5534322261810303, + "logits/rejected": -2.6023335456848145, + "logps/chosen": -219.31874084472656, + "logps/rejected": -287.8863220214844, + "loss": 0.0447, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.125908613204956, + "rewards/margins": 4.897000312805176, + "rewards/rejected": -8.022909164428711, + "step": 982 + }, + { + "epoch": 1.29, + "learning_rate": 3.2513848017092113e-05, + "logits/chosen": -2.5682573318481445, + "logits/rejected": -2.605602741241455, + "logps/chosen": -201.1256103515625, + "logps/rejected": -251.9600067138672, + "loss": 0.0874, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9630330801010132, + "rewards/margins": 4.16124963760376, + "rewards/rejected": -6.1242828369140625, + "step": 983 + }, + { + "epoch": 1.29, + "learning_rate": 3.2479666719520886e-05, + "logits/chosen": -2.4335856437683105, + "logits/rejected": -2.495433807373047, + "logps/chosen": -217.77142333984375, + "logps/rejected": -269.6983947753906, + "loss": 0.1568, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.553312063217163, + "rewards/margins": 4.387064456939697, + "rewards/rejected": -6.940376281738281, + "step": 984 + }, + { + "epoch": 1.29, + "learning_rate": 3.2445470058072766e-05, + "logits/chosen": -2.477215051651001, + "logits/rejected": -2.4980826377868652, + "logps/chosen": -175.28768920898438, + "logps/rejected": -211.9522247314453, + "loss": 0.2317, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.883258104324341, + "rewards/margins": 3.288342237472534, + "rewards/rejected": -6.171600341796875, + "step": 985 + }, + { + "epoch": 1.29, + "learning_rate": 3.2411258102990646e-05, + "logits/chosen": -2.698887348175049, + "logits/rejected": -2.8316826820373535, + "logps/chosen": -198.59188842773438, + "logps/rejected": -265.5428161621094, + "loss": 0.0484, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.087355613708496, + "rewards/margins": 4.889158248901367, + "rewards/rejected": -6.976513385772705, + "step": 986 + }, + { + "epoch": 1.29, + "learning_rate": 3.23770309245488e-05, + "logits/chosen": -2.6792445182800293, + "logits/rejected": -2.610353469848633, + "logps/chosen": -186.60247802734375, + "logps/rejected": -213.04379272460938, + "loss": 0.1424, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9458187818527222, + "rewards/margins": 3.5154922008514404, + "rewards/rejected": -5.461311340332031, + "step": 987 + }, + { + "epoch": 1.29, + "learning_rate": 3.23427885930528e-05, + "logits/chosen": -2.5607926845550537, + "logits/rejected": -2.6190388202667236, + "logps/chosen": -203.35629272460938, + "logps/rejected": -276.7921142578125, + "loss": 0.0539, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.604539632797241, + "rewards/margins": 4.186058521270752, + "rewards/rejected": -6.790597915649414, + "step": 988 + }, + { + "epoch": 1.29, + "learning_rate": 3.230853117883933e-05, + "logits/chosen": -2.3824515342712402, + "logits/rejected": -2.374831438064575, + "logps/chosen": -180.39971923828125, + "logps/rejected": -229.66053771972656, + "loss": 0.0811, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.316575527191162, + "rewards/margins": 4.488853454589844, + "rewards/rejected": -6.805428981781006, + "step": 989 + }, + { + "epoch": 1.3, + "learning_rate": 3.227425875227605e-05, + "logits/chosen": -2.517157793045044, + "logits/rejected": -2.5897037982940674, + "logps/chosen": -228.96835327148438, + "logps/rejected": -249.2227325439453, + "loss": 0.0852, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.941978693008423, + "rewards/margins": 3.9116504192352295, + "rewards/rejected": -6.8536295890808105, + "step": 990 + }, + { + "epoch": 1.3, + "learning_rate": 3.223997138376146e-05, + "logits/chosen": -2.5914738178253174, + "logits/rejected": -2.5838983058929443, + "logps/chosen": -221.4722900390625, + "logps/rejected": -228.54957580566406, + "loss": 0.1337, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.404142379760742, + "rewards/margins": 3.357062816619873, + "rewards/rejected": -6.761204719543457, + "step": 991 + }, + { + "epoch": 1.3, + "learning_rate": 3.220566914372477e-05, + "logits/chosen": -2.6407084465026855, + "logits/rejected": -2.6386706829071045, + "logps/chosen": -253.16580200195312, + "logps/rejected": -330.81640625, + "loss": 0.08, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.206897735595703, + "rewards/margins": 4.39909553527832, + "rewards/rejected": -6.605993270874023, + "step": 992 + }, + { + "epoch": 1.3, + "learning_rate": 3.2171352102625716e-05, + "logits/chosen": -2.722573757171631, + "logits/rejected": -2.6421186923980713, + "logps/chosen": -250.5634002685547, + "logps/rejected": -265.2950439453125, + "loss": 0.1562, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.815258741378784, + "rewards/margins": 3.4340462684631348, + "rewards/rejected": -7.249305248260498, + "step": 993 + }, + { + "epoch": 1.3, + "learning_rate": 3.213702033095444e-05, + "logits/chosen": -2.5136289596557617, + "logits/rejected": -2.5164742469787598, + "logps/chosen": -194.22906494140625, + "logps/rejected": -242.98043823242188, + "loss": 0.1075, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.7019946575164795, + "rewards/margins": 4.0128560066223145, + "rewards/rejected": -6.714850902557373, + "step": 994 + }, + { + "epoch": 1.3, + "learning_rate": 3.210267389923135e-05, + "logits/chosen": -2.2657485008239746, + "logits/rejected": -2.3381145000457764, + "logps/chosen": -211.24606323242188, + "logps/rejected": -275.7134094238281, + "loss": 0.0392, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5696375370025635, + "rewards/margins": 4.8196611404418945, + "rewards/rejected": -7.389298439025879, + "step": 995 + }, + { + "epoch": 1.3, + "learning_rate": 3.2068312878006955e-05, + "logits/chosen": -2.5042364597320557, + "logits/rejected": -2.577590227127075, + "logps/chosen": -224.34481811523438, + "logps/rejected": -294.0433044433594, + "loss": 0.1099, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.5805823802948, + "rewards/margins": 4.027166843414307, + "rewards/rejected": -6.607748985290527, + "step": 996 + }, + { + "epoch": 1.3, + "learning_rate": 3.2033937337861744e-05, + "logits/chosen": -2.566772222518921, + "logits/rejected": -2.635631561279297, + "logps/chosen": -182.17617797851562, + "logps/rejected": -221.6566925048828, + "loss": 0.1608, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.865788221359253, + "rewards/margins": 4.04500675201416, + "rewards/rejected": -6.910794734954834, + "step": 997 + }, + { + "epoch": 1.31, + "learning_rate": 3.199954734940603e-05, + "logits/chosen": -2.416757106781006, + "logits/rejected": -2.4752490520477295, + "logps/chosen": -227.98826599121094, + "logps/rejected": -235.98541259765625, + "loss": 0.0961, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.63022780418396, + "rewards/margins": 3.34237003326416, + "rewards/rejected": -6.972598075866699, + "step": 998 + }, + { + "epoch": 1.31, + "learning_rate": 3.196514298327979e-05, + "logits/chosen": -2.511079788208008, + "logits/rejected": -2.4900567531585693, + "logps/chosen": -203.03321838378906, + "logps/rejected": -313.2828369140625, + "loss": 0.1077, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.127234935760498, + "rewards/margins": 4.497407913208008, + "rewards/rejected": -7.624642848968506, + "step": 999 + }, + { + "epoch": 1.31, + "learning_rate": 3.193072431015254e-05, + "logits/chosen": -2.6488115787506104, + "logits/rejected": -2.532362937927246, + "logps/chosen": -202.8871612548828, + "logps/rejected": -230.8666534423828, + "loss": 0.2179, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.9239931106567383, + "rewards/margins": 3.898569345474243, + "rewards/rejected": -6.822562217712402, + "step": 1000 + }, + { + "epoch": 1.31, + "learning_rate": 3.18962914007232e-05, + "logits/chosen": -2.4548916816711426, + "logits/rejected": -2.597581624984741, + "logps/chosen": -192.9923095703125, + "logps/rejected": -240.06515502929688, + "loss": 0.1361, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.854691743850708, + "rewards/margins": 4.2366790771484375, + "rewards/rejected": -7.091370582580566, + "step": 1001 + }, + { + "epoch": 1.31, + "learning_rate": 3.18618443257199e-05, + "logits/chosen": -2.5535387992858887, + "logits/rejected": -2.6721291542053223, + "logps/chosen": -204.35491943359375, + "logps/rejected": -282.872802734375, + "loss": 0.1299, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.5433037281036377, + "rewards/margins": 4.382692337036133, + "rewards/rejected": -6.925995826721191, + "step": 1002 + }, + { + "epoch": 1.31, + "learning_rate": 3.182738315589991e-05, + "logits/chosen": -2.398437976837158, + "logits/rejected": -2.460413694381714, + "logps/chosen": -165.79678344726562, + "logps/rejected": -264.53472900390625, + "loss": 0.0473, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2765581607818604, + "rewards/margins": 4.5829925537109375, + "rewards/rejected": -6.859550476074219, + "step": 1003 + }, + { + "epoch": 1.31, + "learning_rate": 3.17929079620494e-05, + "logits/chosen": -2.467028856277466, + "logits/rejected": -2.4687981605529785, + "logps/chosen": -248.12551879882812, + "logps/rejected": -274.8841552734375, + "loss": 0.1201, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.922182559967041, + "rewards/margins": 4.145279884338379, + "rewards/rejected": -7.06746244430542, + "step": 1004 + }, + { + "epoch": 1.32, + "learning_rate": 3.17584188149834e-05, + "logits/chosen": -2.4509243965148926, + "logits/rejected": -2.467287302017212, + "logps/chosen": -188.75015258789062, + "logps/rejected": -223.9183349609375, + "loss": 0.1989, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.4681475162506104, + "rewards/margins": 4.527399063110352, + "rewards/rejected": -6.995546340942383, + "step": 1005 + }, + { + "epoch": 1.32, + "learning_rate": 3.172391578554557e-05, + "logits/chosen": -2.685560941696167, + "logits/rejected": -2.6007328033447266, + "logps/chosen": -216.2244873046875, + "logps/rejected": -265.35394287109375, + "loss": 0.1203, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.7310876846313477, + "rewards/margins": 3.903916835784912, + "rewards/rejected": -6.635004997253418, + "step": 1006 + }, + { + "epoch": 1.32, + "learning_rate": 3.1689398944608076e-05, + "logits/chosen": -2.526007652282715, + "logits/rejected": -2.581463575363159, + "logps/chosen": -196.76080322265625, + "logps/rejected": -256.2978515625, + "loss": 0.1789, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3379364013671875, + "rewards/margins": 4.69968318939209, + "rewards/rejected": -7.0376200675964355, + "step": 1007 + }, + { + "epoch": 1.32, + "learning_rate": 3.1654868363071484e-05, + "logits/chosen": -2.502946138381958, + "logits/rejected": -2.646716356277466, + "logps/chosen": -202.78024291992188, + "logps/rejected": -270.49053955078125, + "loss": 0.2017, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1844162940979004, + "rewards/margins": 3.92962384223938, + "rewards/rejected": -7.114039897918701, + "step": 1008 + }, + { + "epoch": 1.32, + "learning_rate": 3.162032411186456e-05, + "logits/chosen": -2.758046865463257, + "logits/rejected": -2.6140828132629395, + "logps/chosen": -259.43408203125, + "logps/rejected": -261.34332275390625, + "loss": 0.0381, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.656895160675049, + "rewards/margins": 4.708434104919434, + "rewards/rejected": -7.365328788757324, + "step": 1009 + }, + { + "epoch": 1.32, + "learning_rate": 3.158576626194417e-05, + "logits/chosen": -2.4280474185943604, + "logits/rejected": -2.4739177227020264, + "logps/chosen": -185.41958618164062, + "logps/rejected": -235.60659790039062, + "loss": 0.0968, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.496574640274048, + "rewards/margins": 4.274554252624512, + "rewards/rejected": -6.7711286544799805, + "step": 1010 + }, + { + "epoch": 1.32, + "learning_rate": 3.15511948842951e-05, + "logits/chosen": -2.6534996032714844, + "logits/rejected": -2.631805419921875, + "logps/chosen": -202.60643005371094, + "logps/rejected": -258.4774169921875, + "loss": 0.1789, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.4497032165527344, + "rewards/margins": 4.621069431304932, + "rewards/rejected": -7.070772647857666, + "step": 1011 + }, + { + "epoch": 1.32, + "learning_rate": 3.151661004992992e-05, + "logits/chosen": -2.5260958671569824, + "logits/rejected": -2.5531349182128906, + "logps/chosen": -186.16738891601562, + "logps/rejected": -221.78717041015625, + "loss": 0.2935, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.791968584060669, + "rewards/margins": 3.0429458618164062, + "rewards/rejected": -5.834914684295654, + "step": 1012 + }, + { + "epoch": 1.33, + "learning_rate": 3.1482011829888836e-05, + "logits/chosen": -2.5383787155151367, + "logits/rejected": -2.659869909286499, + "logps/chosen": -170.639404296875, + "logps/rejected": -251.17396545410156, + "loss": 0.1266, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.320667266845703, + "rewards/margins": 4.520022869110107, + "rewards/rejected": -6.840689659118652, + "step": 1013 + }, + { + "epoch": 1.33, + "learning_rate": 3.1447400295239575e-05, + "logits/chosen": -2.710861921310425, + "logits/rejected": -2.675701141357422, + "logps/chosen": -217.94239807128906, + "logps/rejected": -268.5772705078125, + "loss": 0.0595, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7031490802764893, + "rewards/margins": 4.353590965270996, + "rewards/rejected": -7.056739807128906, + "step": 1014 + }, + { + "epoch": 1.33, + "learning_rate": 3.1412775517077195e-05, + "logits/chosen": -2.7215874195098877, + "logits/rejected": -2.7611916065216064, + "logps/chosen": -235.87506103515625, + "logps/rejected": -301.192138671875, + "loss": 0.0792, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.68082332611084, + "rewards/margins": 3.624584674835205, + "rewards/rejected": -6.305407524108887, + "step": 1015 + }, + { + "epoch": 1.33, + "learning_rate": 3.137813756652395e-05, + "logits/chosen": -2.631046772003174, + "logits/rejected": -2.614382266998291, + "logps/chosen": -247.91311645507812, + "logps/rejected": -307.5743408203125, + "loss": 0.2274, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.0471534729003906, + "rewards/margins": 4.0268731117248535, + "rewards/rejected": -7.074026107788086, + "step": 1016 + }, + { + "epoch": 1.33, + "learning_rate": 3.134348651472917e-05, + "logits/chosen": -2.6354832649230957, + "logits/rejected": -2.556567907333374, + "logps/chosen": -185.06939697265625, + "logps/rejected": -198.97274780273438, + "loss": 0.3581, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.668581485748291, + "rewards/margins": 2.5096614360809326, + "rewards/rejected": -5.1782426834106445, + "step": 1017 + }, + { + "epoch": 1.33, + "learning_rate": 3.130882243286908e-05, + "logits/chosen": -2.6948163509368896, + "logits/rejected": -2.6446075439453125, + "logps/chosen": -229.8145751953125, + "logps/rejected": -259.83087158203125, + "loss": 0.1136, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1707406044006348, + "rewards/margins": 5.03431510925293, + "rewards/rejected": -7.205056190490723, + "step": 1018 + }, + { + "epoch": 1.33, + "learning_rate": 3.127414539214668e-05, + "logits/chosen": -2.5144197940826416, + "logits/rejected": -2.4032530784606934, + "logps/chosen": -177.1787872314453, + "logps/rejected": -192.51220703125, + "loss": 0.2843, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.3650827407836914, + "rewards/margins": 2.8323113918304443, + "rewards/rejected": -5.197394371032715, + "step": 1019 + }, + { + "epoch": 1.34, + "learning_rate": 3.12394554637916e-05, + "logits/chosen": -2.4897267818450928, + "logits/rejected": -2.6113083362579346, + "logps/chosen": -193.52491760253906, + "logps/rejected": -287.58575439453125, + "loss": 0.0837, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4814534187316895, + "rewards/margins": 4.536114692687988, + "rewards/rejected": -7.017568588256836, + "step": 1020 + }, + { + "epoch": 1.34, + "learning_rate": 3.12047527190599e-05, + "logits/chosen": -2.726377487182617, + "logits/rejected": -2.6432230472564697, + "logps/chosen": -217.6285400390625, + "logps/rejected": -208.5411376953125, + "loss": 0.2537, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.526822805404663, + "rewards/margins": 2.986895799636841, + "rewards/rejected": -5.513718128204346, + "step": 1021 + }, + { + "epoch": 1.34, + "learning_rate": 3.1170037229234006e-05, + "logits/chosen": -2.357539176940918, + "logits/rejected": -2.461317300796509, + "logps/chosen": -185.66160583496094, + "logps/rejected": -275.49560546875, + "loss": 0.0502, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6269030570983887, + "rewards/margins": 4.0290093421936035, + "rewards/rejected": -6.655912399291992, + "step": 1022 + }, + { + "epoch": 1.34, + "learning_rate": 3.113530906562252e-05, + "logits/chosen": -2.610670328140259, + "logits/rejected": -2.6105544567108154, + "logps/chosen": -210.0528564453125, + "logps/rejected": -266.50494384765625, + "loss": 0.171, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.162548065185547, + "rewards/margins": 3.845745325088501, + "rewards/rejected": -6.008293151855469, + "step": 1023 + }, + { + "epoch": 1.34, + "learning_rate": 3.110056829956006e-05, + "logits/chosen": -2.6044304370880127, + "logits/rejected": -2.7036120891571045, + "logps/chosen": -215.3396759033203, + "logps/rejected": -276.84185791015625, + "loss": 0.1287, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1710519790649414, + "rewards/margins": 4.359486103057861, + "rewards/rejected": -6.530538082122803, + "step": 1024 + }, + { + "epoch": 1.34, + "learning_rate": 3.1065815002407136e-05, + "logits/chosen": -2.459763765335083, + "logits/rejected": -2.477097988128662, + "logps/chosen": -193.78274536132812, + "logps/rejected": -248.53659057617188, + "loss": 0.1329, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4779365062713623, + "rewards/margins": 4.562952995300293, + "rewards/rejected": -6.040889739990234, + "step": 1025 + }, + { + "epoch": 1.34, + "learning_rate": 3.103104924555e-05, + "logits/chosen": -2.554025650024414, + "logits/rejected": -2.598419189453125, + "logps/chosen": -186.47230529785156, + "logps/rejected": -253.75100708007812, + "loss": 0.1445, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.8909106254577637, + "rewards/margins": 3.7882497310638428, + "rewards/rejected": -6.679160118103027, + "step": 1026 + }, + { + "epoch": 1.34, + "learning_rate": 3.099627110040052e-05, + "logits/chosen": -2.613118886947632, + "logits/rejected": -2.5829241275787354, + "logps/chosen": -165.73876953125, + "logps/rejected": -177.23910522460938, + "loss": 0.1843, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0123422145843506, + "rewards/margins": 2.8911550045013428, + "rewards/rejected": -4.903496742248535, + "step": 1027 + }, + { + "epoch": 1.35, + "learning_rate": 3.096148063839596e-05, + "logits/chosen": -2.572155237197876, + "logits/rejected": -2.5597805976867676, + "logps/chosen": -188.39637756347656, + "logps/rejected": -243.3074951171875, + "loss": 0.0828, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9949930906295776, + "rewards/margins": 4.393287658691406, + "rewards/rejected": -6.388280868530273, + "step": 1028 + }, + { + "epoch": 1.35, + "learning_rate": 3.0926677930998924e-05, + "logits/chosen": -2.5095479488372803, + "logits/rejected": -2.572796583175659, + "logps/chosen": -188.3842010498047, + "logps/rejected": -248.2862548828125, + "loss": 0.0909, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.235919713973999, + "rewards/margins": 3.50256609916687, + "rewards/rejected": -5.738486289978027, + "step": 1029 + }, + { + "epoch": 1.35, + "learning_rate": 3.0891863049697165e-05, + "logits/chosen": -2.7175307273864746, + "logits/rejected": -2.722472906112671, + "logps/chosen": -206.114990234375, + "logps/rejected": -230.72328186035156, + "loss": 0.1394, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7134565114974976, + "rewards/margins": 3.560715436935425, + "rewards/rejected": -5.274172782897949, + "step": 1030 + }, + { + "epoch": 1.35, + "learning_rate": 3.0857036066003414e-05, + "logits/chosen": -2.664355754852295, + "logits/rejected": -2.6181037425994873, + "logps/chosen": -209.98092651367188, + "logps/rejected": -228.79571533203125, + "loss": 0.0896, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.282691478729248, + "rewards/margins": 3.3601505756378174, + "rewards/rejected": -5.6428422927856445, + "step": 1031 + }, + { + "epoch": 1.35, + "learning_rate": 3.08221970514553e-05, + "logits/chosen": -2.2186408042907715, + "logits/rejected": -2.3232436180114746, + "logps/chosen": -210.20736694335938, + "logps/rejected": -275.1420593261719, + "loss": 0.2066, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2390856742858887, + "rewards/margins": 2.8809971809387207, + "rewards/rejected": -5.120083332061768, + "step": 1032 + }, + { + "epoch": 1.35, + "learning_rate": 3.0787346077615155e-05, + "logits/chosen": -2.5796632766723633, + "logits/rejected": -2.600548267364502, + "logps/chosen": -185.8663787841797, + "logps/rejected": -239.82757568359375, + "loss": 0.095, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8747427463531494, + "rewards/margins": 4.359163761138916, + "rewards/rejected": -6.233906269073486, + "step": 1033 + }, + { + "epoch": 1.35, + "learning_rate": 3.0752483216069846e-05, + "logits/chosen": -2.5223937034606934, + "logits/rejected": -2.641361713409424, + "logps/chosen": -208.06484985351562, + "logps/rejected": -309.947021484375, + "loss": 0.0683, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1871821880340576, + "rewards/margins": 5.526939392089844, + "rewards/rejected": -7.714120864868164, + "step": 1034 + }, + { + "epoch": 1.35, + "learning_rate": 3.071760853843069e-05, + "logits/chosen": -2.3823060989379883, + "logits/rejected": -2.4045443534851074, + "logps/chosen": -146.8202667236328, + "logps/rejected": -197.63870239257812, + "loss": 0.1845, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8644603490829468, + "rewards/margins": 3.577155590057373, + "rewards/rejected": -5.441615581512451, + "step": 1035 + }, + { + "epoch": 1.36, + "learning_rate": 3.068272211633326e-05, + "logits/chosen": -2.5341076850891113, + "logits/rejected": -2.542874336242676, + "logps/chosen": -176.34588623046875, + "logps/rejected": -224.5471954345703, + "loss": 0.1105, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1690847873687744, + "rewards/margins": 3.686138153076172, + "rewards/rejected": -5.855223178863525, + "step": 1036 + }, + { + "epoch": 1.36, + "learning_rate": 3.0647824021437266e-05, + "logits/chosen": -2.6451001167297363, + "logits/rejected": -2.5524368286132812, + "logps/chosen": -249.0947265625, + "logps/rejected": -272.3426513671875, + "loss": 0.1483, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.4908173084259033, + "rewards/margins": 3.3694992065429688, + "rewards/rejected": -5.860316753387451, + "step": 1037 + }, + { + "epoch": 1.36, + "learning_rate": 3.061291432542639e-05, + "logits/chosen": -2.5714945793151855, + "logits/rejected": -2.6072607040405273, + "logps/chosen": -259.5276794433594, + "logps/rejected": -280.0762023925781, + "loss": 0.2905, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2500410079956055, + "rewards/margins": 2.3362009525299072, + "rewards/rejected": -4.586242198944092, + "step": 1038 + }, + { + "epoch": 1.36, + "learning_rate": 3.0577993100008135e-05, + "logits/chosen": -2.332847833633423, + "logits/rejected": -2.349351644515991, + "logps/chosen": -165.8587188720703, + "logps/rejected": -215.34115600585938, + "loss": 0.124, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8299568891525269, + "rewards/margins": 3.2485098838806152, + "rewards/rejected": -5.078466415405273, + "step": 1039 + }, + { + "epoch": 1.36, + "learning_rate": 3.0543060416913696e-05, + "logits/chosen": -2.6099398136138916, + "logits/rejected": -2.639852285385132, + "logps/chosen": -202.44798278808594, + "logps/rejected": -243.91697692871094, + "loss": 0.1581, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8386543989181519, + "rewards/margins": 2.7864112854003906, + "rewards/rejected": -4.625065803527832, + "step": 1040 + }, + { + "epoch": 1.36, + "learning_rate": 3.050811634789779e-05, + "logits/chosen": -2.3418991565704346, + "logits/rejected": -2.3136579990386963, + "logps/chosen": -198.81283569335938, + "logps/rejected": -239.3514404296875, + "loss": 0.2539, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.57821786403656, + "rewards/margins": 4.224590301513672, + "rewards/rejected": -5.8028082847595215, + "step": 1041 + }, + { + "epoch": 1.36, + "learning_rate": 3.0473160964738555e-05, + "logits/chosen": -2.389075994491577, + "logits/rejected": -2.4795174598693848, + "logps/chosen": -208.120361328125, + "logps/rejected": -263.8682556152344, + "loss": 0.1816, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5095570087432861, + "rewards/margins": 5.084875583648682, + "rewards/rejected": -6.594432830810547, + "step": 1042 + }, + { + "epoch": 1.37, + "learning_rate": 3.0438194339237325e-05, + "logits/chosen": -2.454801321029663, + "logits/rejected": -2.5571341514587402, + "logps/chosen": -162.8795928955078, + "logps/rejected": -212.25123596191406, + "loss": 0.1518, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0092012882232666, + "rewards/margins": 3.6485345363616943, + "rewards/rejected": -5.657735824584961, + "step": 1043 + }, + { + "epoch": 1.37, + "learning_rate": 3.0403216543218547e-05, + "logits/chosen": -2.417764663696289, + "logits/rejected": -2.367800712585449, + "logps/chosen": -258.430419921875, + "logps/rejected": -316.158447265625, + "loss": 0.153, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8694419860839844, + "rewards/margins": 3.8216443061828613, + "rewards/rejected": -5.691086292266846, + "step": 1044 + }, + { + "epoch": 1.37, + "learning_rate": 3.036822764852963e-05, + "logits/chosen": -2.5702414512634277, + "logits/rejected": -2.641486167907715, + "logps/chosen": -212.98876953125, + "logps/rejected": -257.76995849609375, + "loss": 0.1253, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.472564458847046, + "rewards/margins": 3.66194748878479, + "rewards/rejected": -5.134511947631836, + "step": 1045 + }, + { + "epoch": 1.37, + "learning_rate": 3.0333227727040742e-05, + "logits/chosen": -2.515430212020874, + "logits/rejected": -2.581775188446045, + "logps/chosen": -219.15296936035156, + "logps/rejected": -231.19659423828125, + "loss": 0.2006, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.740632176399231, + "rewards/margins": 3.2406795024871826, + "rewards/rejected": -4.981311798095703, + "step": 1046 + }, + { + "epoch": 1.37, + "learning_rate": 3.029821685064475e-05, + "logits/chosen": -2.4799652099609375, + "logits/rejected": -2.447035074234009, + "logps/chosen": -169.70779418945312, + "logps/rejected": -240.38546752929688, + "loss": 0.1335, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6411738395690918, + "rewards/margins": 3.4921600818634033, + "rewards/rejected": -5.133334159851074, + "step": 1047 + }, + { + "epoch": 1.37, + "learning_rate": 3.026319509125697e-05, + "logits/chosen": -2.508396625518799, + "logits/rejected": -2.5602493286132812, + "logps/chosen": -158.8992462158203, + "logps/rejected": -210.88685607910156, + "loss": 0.1773, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6758579015731812, + "rewards/margins": 3.2969624996185303, + "rewards/rejected": -4.972820281982422, + "step": 1048 + }, + { + "epoch": 1.37, + "learning_rate": 3.0228162520815117e-05, + "logits/chosen": -2.6537864208221436, + "logits/rejected": -2.778221368789673, + "logps/chosen": -240.8055877685547, + "logps/rejected": -292.8360290527344, + "loss": 0.0504, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1146202087402344, + "rewards/margins": 4.420568943023682, + "rewards/rejected": -5.535189628601074, + "step": 1049 + }, + { + "epoch": 1.37, + "learning_rate": 3.0193119211279097e-05, + "logits/chosen": -2.5143067836761475, + "logits/rejected": -2.462987184524536, + "logps/chosen": -143.20730590820312, + "logps/rejected": -179.5817413330078, + "loss": 0.2328, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6686921119689941, + "rewards/margins": 2.2758102416992188, + "rewards/rejected": -3.944502353668213, + "step": 1050 + }, + { + "epoch": 1.38, + "learning_rate": 3.015806523463085e-05, + "logits/chosen": -2.5343329906463623, + "logits/rejected": -2.601706027984619, + "logps/chosen": -205.8494873046875, + "logps/rejected": -250.8743133544922, + "loss": 0.2118, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7855427265167236, + "rewards/margins": 3.828920841217041, + "rewards/rejected": -5.614463806152344, + "step": 1051 + }, + { + "epoch": 1.38, + "learning_rate": 3.0123000662874272e-05, + "logits/chosen": -2.622143268585205, + "logits/rejected": -2.5708515644073486, + "logps/chosen": -189.2848663330078, + "logps/rejected": -189.11688232421875, + "loss": 0.1082, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7558749914169312, + "rewards/margins": 3.175283908843994, + "rewards/rejected": -4.931159019470215, + "step": 1052 + }, + { + "epoch": 1.38, + "learning_rate": 3.0087925568034998e-05, + "logits/chosen": -2.619046926498413, + "logits/rejected": -2.6472840309143066, + "logps/chosen": -173.87542724609375, + "logps/rejected": -196.43414306640625, + "loss": 0.0777, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4764899015426636, + "rewards/margins": 3.9893689155578613, + "rewards/rejected": -5.4658589363098145, + "step": 1053 + }, + { + "epoch": 1.38, + "learning_rate": 3.0052840022160273e-05, + "logits/chosen": -2.3901023864746094, + "logits/rejected": -2.383265972137451, + "logps/chosen": -170.65953063964844, + "logps/rejected": -226.2257080078125, + "loss": 0.1705, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9331625699996948, + "rewards/margins": 3.1136226654052734, + "rewards/rejected": -5.046785831451416, + "step": 1054 + }, + { + "epoch": 1.38, + "learning_rate": 3.0017744097318823e-05, + "logits/chosen": -2.4922289848327637, + "logits/rejected": -2.609678030014038, + "logps/chosen": -184.7035675048828, + "logps/rejected": -250.73880004882812, + "loss": 0.3324, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1903133392333984, + "rewards/margins": 3.84584641456604, + "rewards/rejected": -6.036159515380859, + "step": 1055 + }, + { + "epoch": 1.38, + "learning_rate": 2.9982637865600683e-05, + "logits/chosen": -2.578058958053589, + "logits/rejected": -2.746821403503418, + "logps/chosen": -197.53289794921875, + "logps/rejected": -242.20957946777344, + "loss": 0.1223, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.671994924545288, + "rewards/margins": 3.1215267181396484, + "rewards/rejected": -4.793521881103516, + "step": 1056 + }, + { + "epoch": 1.38, + "learning_rate": 2.994752139911706e-05, + "logits/chosen": -2.5955004692077637, + "logits/rejected": -2.5199217796325684, + "logps/chosen": -183.6725311279297, + "logps/rejected": -232.3446044921875, + "loss": 0.2787, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7911111116409302, + "rewards/margins": 2.9477579593658447, + "rewards/rejected": -4.738868713378906, + "step": 1057 + }, + { + "epoch": 1.38, + "learning_rate": 2.991239477000021e-05, + "logits/chosen": -2.5984296798706055, + "logits/rejected": -2.642394781112671, + "logps/chosen": -190.7852783203125, + "logps/rejected": -262.9029235839844, + "loss": 0.1771, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8628268241882324, + "rewards/margins": 3.912919044494629, + "rewards/rejected": -5.775745868682861, + "step": 1058 + }, + { + "epoch": 1.39, + "learning_rate": 2.9877258050403212e-05, + "logits/chosen": -2.396843433380127, + "logits/rejected": -2.354389190673828, + "logps/chosen": -206.04061889648438, + "logps/rejected": -225.9326171875, + "loss": 0.2668, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.888195514678955, + "rewards/margins": 2.6550073623657227, + "rewards/rejected": -4.543202877044678, + "step": 1059 + }, + { + "epoch": 1.39, + "learning_rate": 2.9842111312499914e-05, + "logits/chosen": -2.551333427429199, + "logits/rejected": -2.7733373641967773, + "logps/chosen": -205.36074829101562, + "logps/rejected": -311.148193359375, + "loss": 0.1176, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2061779499053955, + "rewards/margins": 3.903118371963501, + "rewards/rejected": -6.1092963218688965, + "step": 1060 + }, + { + "epoch": 1.39, + "learning_rate": 2.9806954628484734e-05, + "logits/chosen": -2.4500200748443604, + "logits/rejected": -2.5029385089874268, + "logps/chosen": -195.06527709960938, + "logps/rejected": -251.92361450195312, + "loss": 0.1579, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.33332097530365, + "rewards/margins": 3.7611982822418213, + "rewards/rejected": -5.094519138336182, + "step": 1061 + }, + { + "epoch": 1.39, + "learning_rate": 2.9771788070572514e-05, + "logits/chosen": -2.462061643600464, + "logits/rejected": -2.4915874004364014, + "logps/chosen": -190.62669372558594, + "logps/rejected": -284.8686218261719, + "loss": 0.1005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1493988037109375, + "rewards/margins": 3.9200797080993652, + "rewards/rejected": -6.0694780349731445, + "step": 1062 + }, + { + "epoch": 1.39, + "learning_rate": 2.9736611710998368e-05, + "logits/chosen": -2.415849208831787, + "logits/rejected": -2.5483460426330566, + "logps/chosen": -190.28884887695312, + "logps/rejected": -264.4375305175781, + "loss": 0.0893, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6152167320251465, + "rewards/margins": 4.213555812835693, + "rewards/rejected": -5.82877254486084, + "step": 1063 + }, + { + "epoch": 1.39, + "learning_rate": 2.9701425622017583e-05, + "logits/chosen": -2.7090094089508057, + "logits/rejected": -2.6982572078704834, + "logps/chosen": -259.99603271484375, + "logps/rejected": -283.6636047363281, + "loss": 0.2364, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.3496899604797363, + "rewards/margins": 3.079162120819092, + "rewards/rejected": -5.42885160446167, + "step": 1064 + }, + { + "epoch": 1.39, + "learning_rate": 2.9666229875905373e-05, + "logits/chosen": -2.6412601470947266, + "logits/rejected": -2.725914478302002, + "logps/chosen": -214.32044982910156, + "logps/rejected": -253.17076110839844, + "loss": 0.0626, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0727146863937378, + "rewards/margins": 3.851958751678467, + "rewards/rejected": -4.924673557281494, + "step": 1065 + }, + { + "epoch": 1.4, + "learning_rate": 2.963102454495683e-05, + "logits/chosen": -2.287813901901245, + "logits/rejected": -2.355250358581543, + "logps/chosen": -189.98773193359375, + "logps/rejected": -227.09173583984375, + "loss": 0.1375, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.555716872215271, + "rewards/margins": 3.5984201431274414, + "rewards/rejected": -5.15413761138916, + "step": 1066 + }, + { + "epoch": 1.4, + "learning_rate": 2.959580970148673e-05, + "logits/chosen": -2.5558524131774902, + "logits/rejected": -2.625699520111084, + "logps/chosen": -203.2003173828125, + "logps/rejected": -244.1453399658203, + "loss": 0.1273, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2883964776992798, + "rewards/margins": 3.6233742237091064, + "rewards/rejected": -4.911770343780518, + "step": 1067 + }, + { + "epoch": 1.4, + "learning_rate": 2.9560585417829368e-05, + "logits/chosen": -2.4778542518615723, + "logits/rejected": -2.482480764389038, + "logps/chosen": -191.04823303222656, + "logps/rejected": -240.44174194335938, + "loss": 0.1597, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.318474292755127, + "rewards/margins": 3.7633140087127686, + "rewards/rejected": -5.081788539886475, + "step": 1068 + }, + { + "epoch": 1.4, + "learning_rate": 2.952535176633846e-05, + "logits/chosen": -2.5196757316589355, + "logits/rejected": -2.553658962249756, + "logps/chosen": -191.95896911621094, + "logps/rejected": -253.894775390625, + "loss": 0.0567, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4344265460968018, + "rewards/margins": 4.60671329498291, + "rewards/rejected": -6.041139125823975, + "step": 1069 + }, + { + "epoch": 1.4, + "learning_rate": 2.9490108819386936e-05, + "logits/chosen": -2.6415488719940186, + "logits/rejected": -2.633355140686035, + "logps/chosen": -201.31932067871094, + "logps/rejected": -231.6967010498047, + "loss": 0.1365, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.711191177368164, + "rewards/margins": 3.2852399349212646, + "rewards/rejected": -4.996431350708008, + "step": 1070 + }, + { + "epoch": 1.4, + "learning_rate": 2.945485664936683e-05, + "logits/chosen": -2.6155481338500977, + "logits/rejected": -2.662614107131958, + "logps/chosen": -198.21116638183594, + "logps/rejected": -235.82608032226562, + "loss": 0.0687, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4048967361450195, + "rewards/margins": 4.216251850128174, + "rewards/rejected": -5.621149063110352, + "step": 1071 + }, + { + "epoch": 1.4, + "learning_rate": 2.9419595328689138e-05, + "logits/chosen": -2.542820453643799, + "logits/rejected": -2.683985471725464, + "logps/chosen": -178.39039611816406, + "logps/rejected": -239.9589080810547, + "loss": 0.0992, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7426559925079346, + "rewards/margins": 3.40950345993042, + "rewards/rejected": -5.152159690856934, + "step": 1072 + }, + { + "epoch": 1.4, + "learning_rate": 2.938432492978361e-05, + "logits/chosen": -2.539808750152588, + "logits/rejected": -2.5900888442993164, + "logps/chosen": -181.90489196777344, + "logps/rejected": -227.29217529296875, + "loss": 0.091, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.262054920196533, + "rewards/margins": 4.174936294555664, + "rewards/rejected": -6.436991214752197, + "step": 1073 + }, + { + "epoch": 1.41, + "learning_rate": 2.9349045525098688e-05, + "logits/chosen": -2.6640827655792236, + "logits/rejected": -2.758841037750244, + "logps/chosen": -195.34341430664062, + "logps/rejected": -239.19717407226562, + "loss": 0.0557, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5375292301177979, + "rewards/margins": 4.5306077003479, + "rewards/rejected": -6.068136215209961, + "step": 1074 + }, + { + "epoch": 1.41, + "learning_rate": 2.9313757187101297e-05, + "logits/chosen": -2.553307294845581, + "logits/rejected": -2.5362741947174072, + "logps/chosen": -235.52633666992188, + "logps/rejected": -271.10888671875, + "loss": 0.1098, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.423585295677185, + "rewards/margins": 4.111424446105957, + "rewards/rejected": -5.535008907318115, + "step": 1075 + }, + { + "epoch": 1.41, + "learning_rate": 2.9278459988276703e-05, + "logits/chosen": -2.579463481903076, + "logits/rejected": -2.5745797157287598, + "logps/chosen": -200.10809326171875, + "logps/rejected": -226.90008544921875, + "loss": 0.1644, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.766392469406128, + "rewards/margins": 3.521643877029419, + "rewards/rejected": -5.288036823272705, + "step": 1076 + }, + { + "epoch": 1.41, + "learning_rate": 2.9243154001128386e-05, + "logits/chosen": -2.518954038619995, + "logits/rejected": -2.5589892864227295, + "logps/chosen": -177.9794464111328, + "logps/rejected": -247.88150024414062, + "loss": 0.0643, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.475413203239441, + "rewards/margins": 4.184449195861816, + "rewards/rejected": -5.659862041473389, + "step": 1077 + }, + { + "epoch": 1.41, + "learning_rate": 2.920783929817786e-05, + "logits/chosen": -2.601881504058838, + "logits/rejected": -2.599308490753174, + "logps/chosen": -234.6541748046875, + "logps/rejected": -277.7001953125, + "loss": 0.0267, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7284051179885864, + "rewards/margins": 4.9045305252075195, + "rewards/rejected": -6.632936000823975, + "step": 1078 + }, + { + "epoch": 1.41, + "learning_rate": 2.9172515951964558e-05, + "logits/chosen": -2.500915765762329, + "logits/rejected": -2.451711416244507, + "logps/chosen": -186.4530487060547, + "logps/rejected": -222.8610382080078, + "loss": 0.1396, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9778958559036255, + "rewards/margins": 3.4191246032714844, + "rewards/rejected": -5.39702033996582, + "step": 1079 + }, + { + "epoch": 1.41, + "learning_rate": 2.913718403504567e-05, + "logits/chosen": -2.477285861968994, + "logits/rejected": -2.5247223377227783, + "logps/chosen": -199.11573791503906, + "logps/rejected": -236.58143615722656, + "loss": 0.1449, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.050600528717041, + "rewards/margins": 4.136992454528809, + "rewards/rejected": -6.187592506408691, + "step": 1080 + }, + { + "epoch": 1.41, + "learning_rate": 2.9101843619995968e-05, + "logits/chosen": -2.3786354064941406, + "logits/rejected": -2.4065310955047607, + "logps/chosen": -156.8441925048828, + "logps/rejected": -228.4494171142578, + "loss": 0.1835, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0903360843658447, + "rewards/margins": 3.2863199710845947, + "rewards/rejected": -5.3766560554504395, + "step": 1081 + }, + { + "epoch": 1.42, + "learning_rate": 2.906649477940771e-05, + "logits/chosen": -2.254408359527588, + "logits/rejected": -2.312025547027588, + "logps/chosen": -191.6400909423828, + "logps/rejected": -242.19525146484375, + "loss": 0.2633, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.195997714996338, + "rewards/margins": 2.856938362121582, + "rewards/rejected": -5.05293607711792, + "step": 1082 + }, + { + "epoch": 1.42, + "learning_rate": 2.9031137585890445e-05, + "logits/chosen": -2.6021387577056885, + "logits/rejected": -2.577592372894287, + "logps/chosen": -206.00213623046875, + "logps/rejected": -231.03451538085938, + "loss": 0.1142, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0324621200561523, + "rewards/margins": 4.193613052368164, + "rewards/rejected": -6.226075172424316, + "step": 1083 + }, + { + "epoch": 1.42, + "learning_rate": 2.899577211207087e-05, + "logits/chosen": -2.2712721824645996, + "logits/rejected": -2.422102212905884, + "logps/chosen": -166.55764770507812, + "logps/rejected": -241.16822814941406, + "loss": 0.1552, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5752736330032349, + "rewards/margins": 3.9138262271881104, + "rewards/rejected": -5.489099979400635, + "step": 1084 + }, + { + "epoch": 1.42, + "learning_rate": 2.89603984305927e-05, + "logits/chosen": -2.5275464057922363, + "logits/rejected": -2.5604097843170166, + "logps/chosen": -201.57711791992188, + "logps/rejected": -240.2384490966797, + "loss": 0.1261, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.553159475326538, + "rewards/margins": 4.10635232925415, + "rewards/rejected": -5.659511566162109, + "step": 1085 + }, + { + "epoch": 1.42, + "learning_rate": 2.8925016614116534e-05, + "logits/chosen": -2.5933401584625244, + "logits/rejected": -2.5860490798950195, + "logps/chosen": -182.6704559326172, + "logps/rejected": -225.07212829589844, + "loss": 0.1808, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.235060214996338, + "rewards/margins": 3.8837714195251465, + "rewards/rejected": -6.118832111358643, + "step": 1086 + }, + { + "epoch": 1.42, + "learning_rate": 2.8889626735319635e-05, + "logits/chosen": -2.331376314163208, + "logits/rejected": -2.444274663925171, + "logps/chosen": -161.9290771484375, + "logps/rejected": -203.12196350097656, + "loss": 0.2062, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4727413654327393, + "rewards/margins": 2.9315404891967773, + "rewards/rejected": -5.404282093048096, + "step": 1087 + }, + { + "epoch": 1.42, + "learning_rate": 2.8854228866895855e-05, + "logits/chosen": -2.587686777114868, + "logits/rejected": -2.5008420944213867, + "logps/chosen": -176.20748901367188, + "logps/rejected": -219.10447692871094, + "loss": 0.1128, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.96158766746521, + "rewards/margins": 3.573758840560913, + "rewards/rejected": -5.535346508026123, + "step": 1088 + }, + { + "epoch": 1.43, + "learning_rate": 2.8818823081555445e-05, + "logits/chosen": -2.3704261779785156, + "logits/rejected": -2.3205065727233887, + "logps/chosen": -199.6273651123047, + "logps/rejected": -231.36050415039062, + "loss": 0.0941, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8766591548919678, + "rewards/margins": 3.104245185852051, + "rewards/rejected": -4.980905055999756, + "step": 1089 + }, + { + "epoch": 1.43, + "learning_rate": 2.8783409452024934e-05, + "logits/chosen": -2.391850233078003, + "logits/rejected": -2.3093979358673096, + "logps/chosen": -176.1939697265625, + "logps/rejected": -285.7266845703125, + "loss": 0.2012, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7692720890045166, + "rewards/margins": 3.776660442352295, + "rewards/rejected": -5.545932292938232, + "step": 1090 + }, + { + "epoch": 1.43, + "learning_rate": 2.874798805104696e-05, + "logits/chosen": -2.5358238220214844, + "logits/rejected": -2.5575406551361084, + "logps/chosen": -183.97039794921875, + "logps/rejected": -245.60678100585938, + "loss": 0.1238, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.222740411758423, + "rewards/margins": 3.4841675758361816, + "rewards/rejected": -5.706907749176025, + "step": 1091 + }, + { + "epoch": 1.43, + "learning_rate": 2.8712558951380097e-05, + "logits/chosen": -2.4555132389068604, + "logits/rejected": -2.400146245956421, + "logps/chosen": -184.021484375, + "logps/rejected": -201.93553161621094, + "loss": 0.175, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1929945945739746, + "rewards/margins": 2.8565635681152344, + "rewards/rejected": -5.049558639526367, + "step": 1092 + }, + { + "epoch": 1.43, + "learning_rate": 2.867712222579877e-05, + "logits/chosen": -2.6107163429260254, + "logits/rejected": -2.6007707118988037, + "logps/chosen": -201.79556274414062, + "logps/rejected": -230.68873596191406, + "loss": 0.1006, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.095113515853882, + "rewards/margins": 3.817943811416626, + "rewards/rejected": -5.913057327270508, + "step": 1093 + }, + { + "epoch": 1.43, + "learning_rate": 2.864167794709305e-05, + "logits/chosen": -2.4124159812927246, + "logits/rejected": -2.4699692726135254, + "logps/chosen": -168.76837158203125, + "logps/rejected": -225.75619506835938, + "loss": 0.1245, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8972877264022827, + "rewards/margins": 3.7775349617004395, + "rewards/rejected": -5.674822807312012, + "step": 1094 + }, + { + "epoch": 1.43, + "learning_rate": 2.860622618806852e-05, + "logits/chosen": -2.442751407623291, + "logits/rejected": -2.449471950531006, + "logps/chosen": -170.2368927001953, + "logps/rejected": -236.02626037597656, + "loss": 0.1948, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7432142496109009, + "rewards/margins": 3.9987921714782715, + "rewards/rejected": -5.742006301879883, + "step": 1095 + }, + { + "epoch": 1.43, + "learning_rate": 2.857076702154614e-05, + "logits/chosen": -2.6795055866241455, + "logits/rejected": -2.7390999794006348, + "logps/chosen": -234.45559692382812, + "logps/rejected": -291.91595458984375, + "loss": 0.0443, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0023720264434814, + "rewards/margins": 4.367403507232666, + "rewards/rejected": -6.369776248931885, + "step": 1096 + }, + { + "epoch": 1.44, + "learning_rate": 2.8535300520362075e-05, + "logits/chosen": -2.5263047218322754, + "logits/rejected": -2.5012240409851074, + "logps/chosen": -184.9441375732422, + "logps/rejected": -222.58851623535156, + "loss": 0.1757, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.249758720397949, + "rewards/margins": 3.7888245582580566, + "rewards/rejected": -6.038583755493164, + "step": 1097 + }, + { + "epoch": 1.44, + "learning_rate": 2.849982675736756e-05, + "logits/chosen": -2.4845621585845947, + "logits/rejected": -2.5776851177215576, + "logps/chosen": -181.5438232421875, + "logps/rejected": -216.2488555908203, + "loss": 0.2022, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0840976238250732, + "rewards/margins": 2.51057505607605, + "rewards/rejected": -4.594672203063965, + "step": 1098 + }, + { + "epoch": 1.44, + "learning_rate": 2.8464345805428753e-05, + "logits/chosen": -2.574110507965088, + "logits/rejected": -2.6576766967773438, + "logps/chosen": -215.7093505859375, + "logps/rejected": -269.7330017089844, + "loss": 0.0803, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.754449486732483, + "rewards/margins": 4.345558166503906, + "rewards/rejected": -6.100008010864258, + "step": 1099 + }, + { + "epoch": 1.44, + "learning_rate": 2.8428857737426556e-05, + "logits/chosen": -2.4669013023376465, + "logits/rejected": -2.5930404663085938, + "logps/chosen": -166.70077514648438, + "logps/rejected": -226.11407470703125, + "loss": 0.092, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0145058631896973, + "rewards/margins": 3.453352451324463, + "rewards/rejected": -5.467857837677002, + "step": 1100 + }, + { + "epoch": 1.44, + "learning_rate": 2.839336262625652e-05, + "logits/chosen": -2.428941249847412, + "logits/rejected": -2.4871275424957275, + "logps/chosen": -140.08157348632812, + "logps/rejected": -197.7716064453125, + "loss": 0.3108, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8832570314407349, + "rewards/margins": 2.8069450855255127, + "rewards/rejected": -4.690202236175537, + "step": 1101 + }, + { + "epoch": 1.44, + "learning_rate": 2.835786054482864e-05, + "logits/chosen": -2.521362781524658, + "logits/rejected": -2.5276265144348145, + "logps/chosen": -205.90907287597656, + "logps/rejected": -227.15150451660156, + "loss": 0.11, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6780078411102295, + "rewards/margins": 3.8448662757873535, + "rewards/rejected": -5.522873878479004, + "step": 1102 + }, + { + "epoch": 1.44, + "learning_rate": 2.832235156606724e-05, + "logits/chosen": -2.319436550140381, + "logits/rejected": -2.3180150985717773, + "logps/chosen": -184.51939392089844, + "logps/rejected": -211.324951171875, + "loss": 0.2366, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2260873317718506, + "rewards/margins": 2.3832032680511475, + "rewards/rejected": -4.609290599822998, + "step": 1103 + }, + { + "epoch": 1.45, + "learning_rate": 2.8286835762910803e-05, + "logits/chosen": -2.3269524574279785, + "logits/rejected": -2.430046796798706, + "logps/chosen": -172.78018188476562, + "logps/rejected": -257.3539733886719, + "loss": 0.1062, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6094239950180054, + "rewards/margins": 3.733208656311035, + "rewards/rejected": -5.342632293701172, + "step": 1104 + }, + { + "epoch": 1.45, + "learning_rate": 2.8251313208311837e-05, + "logits/chosen": -2.583339214324951, + "logits/rejected": -2.585059881210327, + "logps/chosen": -213.5316162109375, + "logps/rejected": -246.169921875, + "loss": 0.0823, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5528373718261719, + "rewards/margins": 4.198974132537842, + "rewards/rejected": -5.7518110275268555, + "step": 1105 + }, + { + "epoch": 1.45, + "learning_rate": 2.8215783975236715e-05, + "logits/chosen": -2.515561580657959, + "logits/rejected": -2.526459217071533, + "logps/chosen": -202.77517700195312, + "logps/rejected": -279.3590393066406, + "loss": 0.0808, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8745505809783936, + "rewards/margins": 4.848105430603027, + "rewards/rejected": -6.722655296325684, + "step": 1106 + }, + { + "epoch": 1.45, + "learning_rate": 2.8180248136665527e-05, + "logits/chosen": -2.510796546936035, + "logits/rejected": -2.520092487335205, + "logps/chosen": -191.71261596679688, + "logps/rejected": -236.56936645507812, + "loss": 0.1274, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3850890398025513, + "rewards/margins": 3.766491413116455, + "rewards/rejected": -5.151580333709717, + "step": 1107 + }, + { + "epoch": 1.45, + "learning_rate": 2.8144705765591938e-05, + "logits/chosen": -2.4325108528137207, + "logits/rejected": -2.542689800262451, + "logps/chosen": -171.6055145263672, + "logps/rejected": -228.088134765625, + "loss": 0.1781, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.90548837184906, + "rewards/margins": 3.6836540699005127, + "rewards/rejected": -5.589141845703125, + "step": 1108 + }, + { + "epoch": 1.45, + "learning_rate": 2.810915693502302e-05, + "logits/chosen": -2.387796401977539, + "logits/rejected": -2.47060489654541, + "logps/chosen": -190.63653564453125, + "logps/rejected": -245.0660400390625, + "loss": 0.0757, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7789469957351685, + "rewards/margins": 4.223084926605225, + "rewards/rejected": -6.0020318031311035, + "step": 1109 + }, + { + "epoch": 1.45, + "learning_rate": 2.807360171797912e-05, + "logits/chosen": -2.5332131385803223, + "logits/rejected": -2.645998239517212, + "logps/chosen": -195.35906982421875, + "logps/rejected": -257.46600341796875, + "loss": 0.0929, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.201244592666626, + "rewards/margins": 3.8987066745758057, + "rewards/rejected": -6.099951267242432, + "step": 1110 + }, + { + "epoch": 1.45, + "learning_rate": 2.803804018749371e-05, + "logits/chosen": -2.5551862716674805, + "logits/rejected": -2.4983575344085693, + "logps/chosen": -209.89344787597656, + "logps/rejected": -217.10499572753906, + "loss": 0.1739, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0726064443588257, + "rewards/margins": 4.014316558837891, + "rewards/rejected": -5.086923599243164, + "step": 1111 + }, + { + "epoch": 1.46, + "learning_rate": 2.800247241661321e-05, + "logits/chosen": -2.4822776317596436, + "logits/rejected": -2.6447525024414062, + "logps/chosen": -168.32168579101562, + "logps/rejected": -220.74522399902344, + "loss": 0.1471, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7664083242416382, + "rewards/margins": 2.9133448600769043, + "rewards/rejected": -4.679753303527832, + "step": 1112 + }, + { + "epoch": 1.46, + "learning_rate": 2.796689847839689e-05, + "logits/chosen": -2.5819203853607178, + "logits/rejected": -2.608567953109741, + "logps/chosen": -208.57598876953125, + "logps/rejected": -263.33038330078125, + "loss": 0.0906, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.667578101158142, + "rewards/margins": 3.736051082611084, + "rewards/rejected": -5.403628826141357, + "step": 1113 + }, + { + "epoch": 1.46, + "learning_rate": 2.793131844591666e-05, + "logits/chosen": -2.3598008155822754, + "logits/rejected": -2.369389057159424, + "logps/chosen": -180.89463806152344, + "logps/rejected": -215.64210510253906, + "loss": 0.1687, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8092491626739502, + "rewards/margins": 3.4173953533172607, + "rewards/rejected": -5.226644039154053, + "step": 1114 + }, + { + "epoch": 1.46, + "learning_rate": 2.7895732392256952e-05, + "logits/chosen": -2.470151662826538, + "logits/rejected": -2.427502393722534, + "logps/chosen": -257.4433288574219, + "logps/rejected": -305.4461364746094, + "loss": 0.1185, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1104799509048462, + "rewards/margins": 4.737048625946045, + "rewards/rejected": -5.847528457641602, + "step": 1115 + }, + { + "epoch": 1.46, + "learning_rate": 2.7860140390514583e-05, + "logits/chosen": -2.548470973968506, + "logits/rejected": -2.537538528442383, + "logps/chosen": -192.6724090576172, + "logps/rejected": -228.3590545654297, + "loss": 0.1689, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9351139068603516, + "rewards/margins": 3.451711654663086, + "rewards/rejected": -5.3868255615234375, + "step": 1116 + }, + { + "epoch": 1.46, + "learning_rate": 2.7824542513798567e-05, + "logits/chosen": -2.5892467498779297, + "logits/rejected": -2.6052474975585938, + "logps/chosen": -175.36236572265625, + "logps/rejected": -241.00650024414062, + "loss": 0.1305, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.789189338684082, + "rewards/margins": 3.8392183780670166, + "rewards/rejected": -5.628407955169678, + "step": 1117 + }, + { + "epoch": 1.46, + "learning_rate": 2.7788938835230005e-05, + "logits/chosen": -2.5941977500915527, + "logits/rejected": -2.637068271636963, + "logps/chosen": -211.3462677001953, + "logps/rejected": -270.5381164550781, + "loss": 0.0966, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7708839178085327, + "rewards/margins": 4.489347457885742, + "rewards/rejected": -6.2602314949035645, + "step": 1118 + }, + { + "epoch": 1.46, + "learning_rate": 2.77533294279419e-05, + "logits/chosen": -2.478257179260254, + "logits/rejected": -2.526747465133667, + "logps/chosen": -231.8919677734375, + "logps/rejected": -280.0306701660156, + "loss": 0.1169, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1189942359924316, + "rewards/margins": 3.775949478149414, + "rewards/rejected": -5.894943714141846, + "step": 1119 + }, + { + "epoch": 1.47, + "learning_rate": 2.771771436507903e-05, + "logits/chosen": -2.3841311931610107, + "logits/rejected": -2.4876410961151123, + "logps/chosen": -198.41287231445312, + "logps/rejected": -221.9237518310547, + "loss": 0.2222, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1843695640563965, + "rewards/margins": 2.918355941772461, + "rewards/rejected": -5.102725028991699, + "step": 1120 + }, + { + "epoch": 1.47, + "learning_rate": 2.7682093719797792e-05, + "logits/chosen": -2.219733238220215, + "logits/rejected": -2.293342113494873, + "logps/chosen": -189.82054138183594, + "logps/rejected": -285.5018615722656, + "loss": 0.1259, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8944518566131592, + "rewards/margins": 3.9687235355377197, + "rewards/rejected": -5.863175392150879, + "step": 1121 + }, + { + "epoch": 1.47, + "learning_rate": 2.764646756526603e-05, + "logits/chosen": -2.3371381759643555, + "logits/rejected": -2.45349383354187, + "logps/chosen": -167.41537475585938, + "logps/rejected": -219.73658752441406, + "loss": 0.2046, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6245447397232056, + "rewards/margins": 2.90950345993042, + "rewards/rejected": -4.534048557281494, + "step": 1122 + }, + { + "epoch": 1.47, + "learning_rate": 2.7610835974662942e-05, + "logits/chosen": -2.383814811706543, + "logits/rejected": -2.395169258117676, + "logps/chosen": -189.44192504882812, + "logps/rejected": -205.80313110351562, + "loss": 0.2283, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1368603706359863, + "rewards/margins": 3.572661876678467, + "rewards/rejected": -5.709522724151611, + "step": 1123 + }, + { + "epoch": 1.47, + "learning_rate": 2.757519902117886e-05, + "logits/chosen": -2.593309164047241, + "logits/rejected": -2.7519969940185547, + "logps/chosen": -204.20932006835938, + "logps/rejected": -286.3390197753906, + "loss": 0.053, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7084423303604126, + "rewards/margins": 4.267474174499512, + "rewards/rejected": -5.975915908813477, + "step": 1124 + }, + { + "epoch": 1.47, + "learning_rate": 2.7539556778015147e-05, + "logits/chosen": -2.3881609439849854, + "logits/rejected": -2.429624319076538, + "logps/chosen": -199.5579833984375, + "logps/rejected": -252.97906494140625, + "loss": 0.0869, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6453319787979126, + "rewards/margins": 4.320263862609863, + "rewards/rejected": -5.965595722198486, + "step": 1125 + }, + { + "epoch": 1.47, + "learning_rate": 2.7503909318384026e-05, + "logits/chosen": -2.4145028591156006, + "logits/rejected": -2.528611898422241, + "logps/chosen": -204.98658752441406, + "logps/rejected": -245.54029846191406, + "loss": 0.1359, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9250624179840088, + "rewards/margins": 3.8524794578552246, + "rewards/rejected": -5.777541637420654, + "step": 1126 + }, + { + "epoch": 1.48, + "learning_rate": 2.7468256715508428e-05, + "logits/chosen": -2.511584758758545, + "logits/rejected": -2.597658395767212, + "logps/chosen": -195.15286254882812, + "logps/rejected": -242.09451293945312, + "loss": 0.047, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0944178104400635, + "rewards/margins": 4.2750444412231445, + "rewards/rejected": -5.369461536407471, + "step": 1127 + }, + { + "epoch": 1.48, + "learning_rate": 2.743259904262187e-05, + "logits/chosen": -2.3979921340942383, + "logits/rejected": -2.3268072605133057, + "logps/chosen": -234.20535278320312, + "logps/rejected": -244.03814697265625, + "loss": 0.184, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6658861637115479, + "rewards/margins": 4.534457206726074, + "rewards/rejected": -6.200343608856201, + "step": 1128 + }, + { + "epoch": 1.48, + "learning_rate": 2.739693637296826e-05, + "logits/chosen": -2.5864174365997314, + "logits/rejected": -2.576723575592041, + "logps/chosen": -211.89031982421875, + "logps/rejected": -256.0931396484375, + "loss": 0.0308, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4756391048431396, + "rewards/margins": 4.727585315704346, + "rewards/rejected": -7.203224182128906, + "step": 1129 + }, + { + "epoch": 1.48, + "learning_rate": 2.7361268779801785e-05, + "logits/chosen": -2.5616533756256104, + "logits/rejected": -2.4383814334869385, + "logps/chosen": -204.30007934570312, + "logps/rejected": -222.07203674316406, + "loss": 0.1519, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9900273084640503, + "rewards/margins": 2.9745967388153076, + "rewards/rejected": -4.964623928070068, + "step": 1130 + }, + { + "epoch": 1.48, + "learning_rate": 2.7325596336386738e-05, + "logits/chosen": -2.412752151489258, + "logits/rejected": -2.5156502723693848, + "logps/chosen": -173.6313018798828, + "logps/rejected": -214.47862243652344, + "loss": 0.1265, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.889671802520752, + "rewards/margins": 3.6633248329162598, + "rewards/rejected": -5.5529961585998535, + "step": 1131 + }, + { + "epoch": 1.48, + "learning_rate": 2.7289919115997374e-05, + "logits/chosen": -2.408198833465576, + "logits/rejected": -2.432340621948242, + "logps/chosen": -187.71417236328125, + "logps/rejected": -268.82928466796875, + "loss": 0.1006, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0281875133514404, + "rewards/margins": 4.615522384643555, + "rewards/rejected": -6.643709659576416, + "step": 1132 + }, + { + "epoch": 1.48, + "learning_rate": 2.7254237191917776e-05, + "logits/chosen": -2.5138401985168457, + "logits/rejected": -2.485018730163574, + "logps/chosen": -195.2764892578125, + "logps/rejected": -224.8616180419922, + "loss": 0.3707, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.606025218963623, + "rewards/margins": 3.221513032913208, + "rewards/rejected": -5.82753849029541, + "step": 1133 + }, + { + "epoch": 1.48, + "learning_rate": 2.721855063744165e-05, + "logits/chosen": -2.5827510356903076, + "logits/rejected": -2.5125937461853027, + "logps/chosen": -193.630859375, + "logps/rejected": -233.06712341308594, + "loss": 0.1362, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9671474695205688, + "rewards/margins": 3.666337251663208, + "rewards/rejected": -5.633484840393066, + "step": 1134 + }, + { + "epoch": 1.49, + "learning_rate": 2.718285952587228e-05, + "logits/chosen": -2.41474986076355, + "logits/rejected": -2.3287770748138428, + "logps/chosen": -230.7402801513672, + "logps/rejected": -209.27938842773438, + "loss": 0.2741, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2288248538970947, + "rewards/margins": 3.0448858737945557, + "rewards/rejected": -5.27371072769165, + "step": 1135 + }, + { + "epoch": 1.49, + "learning_rate": 2.714716393052223e-05, + "logits/chosen": -2.2132129669189453, + "logits/rejected": -2.169909954071045, + "logps/chosen": -153.85777282714844, + "logps/rejected": -188.3119659423828, + "loss": 0.2845, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0806260108947754, + "rewards/margins": 2.9861631393432617, + "rewards/rejected": -5.066788673400879, + "step": 1136 + }, + { + "epoch": 1.49, + "learning_rate": 2.711146392471333e-05, + "logits/chosen": -2.4302916526794434, + "logits/rejected": -2.598952531814575, + "logps/chosen": -173.0673828125, + "logps/rejected": -249.73768615722656, + "loss": 0.1098, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1081714630126953, + "rewards/margins": 4.3053297996521, + "rewards/rejected": -6.413500785827637, + "step": 1137 + }, + { + "epoch": 1.49, + "learning_rate": 2.7075759581776462e-05, + "logits/chosen": -2.643765926361084, + "logits/rejected": -2.5694832801818848, + "logps/chosen": -215.6745147705078, + "logps/rejected": -290.8182678222656, + "loss": 0.0663, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.123965263366699, + "rewards/margins": 4.011368274688721, + "rewards/rejected": -6.135334014892578, + "step": 1138 + }, + { + "epoch": 1.49, + "learning_rate": 2.704005097505139e-05, + "logits/chosen": -2.3963794708251953, + "logits/rejected": -2.3991942405700684, + "logps/chosen": -182.93653869628906, + "logps/rejected": -240.177734375, + "loss": 0.21, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.388463020324707, + "rewards/margins": 3.3698763847351074, + "rewards/rejected": -5.758339881896973, + "step": 1139 + }, + { + "epoch": 1.49, + "learning_rate": 2.7004338177886672e-05, + "logits/chosen": -2.5511152744293213, + "logits/rejected": -2.49131441116333, + "logps/chosen": -220.4542236328125, + "logps/rejected": -254.0352783203125, + "loss": 0.2536, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.167400360107422, + "rewards/margins": 3.443826675415039, + "rewards/rejected": -6.611227512359619, + "step": 1140 + }, + { + "epoch": 1.49, + "learning_rate": 2.6968621263639444e-05, + "logits/chosen": -2.3561928272247314, + "logits/rejected": -2.4970171451568604, + "logps/chosen": -180.90406799316406, + "logps/rejected": -235.68060302734375, + "loss": 0.0669, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9202992916107178, + "rewards/margins": 4.159742832183838, + "rewards/rejected": -6.080041885375977, + "step": 1141 + }, + { + "epoch": 1.49, + "learning_rate": 2.693290030567532e-05, + "logits/chosen": -2.3658244609832764, + "logits/rejected": -2.5130107402801514, + "logps/chosen": -190.42832946777344, + "logps/rejected": -267.71026611328125, + "loss": 0.1073, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8670670986175537, + "rewards/margins": 3.933051586151123, + "rewards/rejected": -5.800118923187256, + "step": 1142 + }, + { + "epoch": 1.5, + "learning_rate": 2.6897175377368207e-05, + "logits/chosen": -2.482060194015503, + "logits/rejected": -2.4583144187927246, + "logps/chosen": -189.29702758789062, + "logps/rejected": -220.23678588867188, + "loss": 0.1193, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3088293075561523, + "rewards/margins": 3.29948353767395, + "rewards/rejected": -5.608312129974365, + "step": 1143 + }, + { + "epoch": 1.5, + "learning_rate": 2.686144655210016e-05, + "logits/chosen": -2.5877132415771484, + "logits/rejected": -2.5863866806030273, + "logps/chosen": -235.26358032226562, + "logps/rejected": -276.48602294921875, + "loss": 0.1604, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.122990369796753, + "rewards/margins": 4.248631000518799, + "rewards/rejected": -6.371621131896973, + "step": 1144 + }, + { + "epoch": 1.5, + "learning_rate": 2.6825713903261273e-05, + "logits/chosen": -2.542851448059082, + "logits/rejected": -2.595625162124634, + "logps/chosen": -275.7511901855469, + "logps/rejected": -342.91314697265625, + "loss": 0.1302, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5076751708984375, + "rewards/margins": 3.859196424484253, + "rewards/rejected": -5.3668718338012695, + "step": 1145 + }, + { + "epoch": 1.5, + "learning_rate": 2.6789977504249454e-05, + "logits/chosen": -2.401787281036377, + "logits/rejected": -2.335707664489746, + "logps/chosen": -214.10304260253906, + "logps/rejected": -229.9938507080078, + "loss": 0.1813, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9307721853256226, + "rewards/margins": 3.4869091510772705, + "rewards/rejected": -5.4176812171936035, + "step": 1146 + }, + { + "epoch": 1.5, + "learning_rate": 2.6754237428470336e-05, + "logits/chosen": -2.4084243774414062, + "logits/rejected": -2.3774447441101074, + "logps/chosen": -179.9666748046875, + "logps/rejected": -229.15269470214844, + "loss": 0.1165, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6553536653518677, + "rewards/margins": 4.051252841949463, + "rewards/rejected": -5.706606388092041, + "step": 1147 + }, + { + "epoch": 1.5, + "learning_rate": 2.6718493749337105e-05, + "logits/chosen": -2.5925581455230713, + "logits/rejected": -2.6709365844726562, + "logps/chosen": -170.56405639648438, + "logps/rejected": -229.6776123046875, + "loss": 0.1266, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9334518909454346, + "rewards/margins": 3.194617986679077, + "rewards/rejected": -5.128069877624512, + "step": 1148 + }, + { + "epoch": 1.5, + "learning_rate": 2.668274654027033e-05, + "logits/chosen": -2.6076910495758057, + "logits/rejected": -2.53147554397583, + "logps/chosen": -187.9995880126953, + "logps/rejected": -258.0810241699219, + "loss": 0.1186, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1982994079589844, + "rewards/margins": 4.555995941162109, + "rewards/rejected": -6.754295349121094, + "step": 1149 + }, + { + "epoch": 1.51, + "learning_rate": 2.664699587469786e-05, + "logits/chosen": -2.105681896209717, + "logits/rejected": -2.16200852394104, + "logps/chosen": -156.17245483398438, + "logps/rejected": -210.03741455078125, + "loss": 0.1257, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.890591025352478, + "rewards/margins": 3.807307243347168, + "rewards/rejected": -5.697897911071777, + "step": 1150 + }, + { + "epoch": 1.51, + "learning_rate": 2.6611241826054617e-05, + "logits/chosen": -2.5347132682800293, + "logits/rejected": -2.667999744415283, + "logps/chosen": -195.80181884765625, + "logps/rejected": -247.29022216796875, + "loss": 0.1161, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5774171352386475, + "rewards/margins": 3.9218239784240723, + "rewards/rejected": -5.499240875244141, + "step": 1151 + }, + { + "epoch": 1.51, + "learning_rate": 2.6575484467782486e-05, + "logits/chosen": -2.404939651489258, + "logits/rejected": -2.4487950801849365, + "logps/chosen": -175.7524871826172, + "logps/rejected": -238.2417449951172, + "loss": 0.1582, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.968187689781189, + "rewards/margins": 3.7637362480163574, + "rewards/rejected": -5.731924057006836, + "step": 1152 + }, + { + "epoch": 1.51, + "learning_rate": 2.6539723873330148e-05, + "logits/chosen": -2.6103971004486084, + "logits/rejected": -2.5818848609924316, + "logps/chosen": -232.71112060546875, + "logps/rejected": -282.5625305175781, + "loss": 0.1674, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5069048404693604, + "rewards/margins": 4.3278326988220215, + "rewards/rejected": -5.834737777709961, + "step": 1153 + }, + { + "epoch": 1.51, + "learning_rate": 2.6503960116152933e-05, + "logits/chosen": -2.594650983810425, + "logits/rejected": -2.6336145401000977, + "logps/chosen": -201.71214294433594, + "logps/rejected": -274.7174072265625, + "loss": 0.0664, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1206674575805664, + "rewards/margins": 4.755134105682373, + "rewards/rejected": -6.8758015632629395, + "step": 1154 + }, + { + "epoch": 1.51, + "learning_rate": 2.646819326971266e-05, + "logits/chosen": -2.4223642349243164, + "logits/rejected": -2.4635026454925537, + "logps/chosen": -180.48873901367188, + "logps/rejected": -240.8888397216797, + "loss": 0.1034, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7180030345916748, + "rewards/margins": 3.8219757080078125, + "rewards/rejected": -5.539978981018066, + "step": 1155 + }, + { + "epoch": 1.51, + "learning_rate": 2.6432423407477496e-05, + "logits/chosen": -2.3538551330566406, + "logits/rejected": -2.403841018676758, + "logps/chosen": -191.4554901123047, + "logps/rejected": -209.89437866210938, + "loss": 0.1501, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.773792028427124, + "rewards/margins": 3.7147250175476074, + "rewards/rejected": -5.488516807556152, + "step": 1156 + }, + { + "epoch": 1.51, + "learning_rate": 2.6396650602921824e-05, + "logits/chosen": -2.512646436691284, + "logits/rejected": -2.472940444946289, + "logps/chosen": -192.06329345703125, + "logps/rejected": -226.666015625, + "loss": 0.2118, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1443490982055664, + "rewards/margins": 2.7057623863220215, + "rewards/rejected": -4.850111484527588, + "step": 1157 + }, + { + "epoch": 1.52, + "learning_rate": 2.636087492952603e-05, + "logits/chosen": -2.435938596725464, + "logits/rejected": -2.487466812133789, + "logps/chosen": -223.7642364501953, + "logps/rejected": -290.96099853515625, + "loss": 0.0736, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6000373363494873, + "rewards/margins": 4.173830032348633, + "rewards/rejected": -5.773867607116699, + "step": 1158 + }, + { + "epoch": 1.52, + "learning_rate": 2.6325096460776422e-05, + "logits/chosen": -2.2877049446105957, + "logits/rejected": -2.374433994293213, + "logps/chosen": -226.02969360351562, + "logps/rejected": -330.5922546386719, + "loss": 0.1132, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.808432698249817, + "rewards/margins": 5.300567150115967, + "rewards/rejected": -7.109000205993652, + "step": 1159 + }, + { + "epoch": 1.52, + "learning_rate": 2.6289315270165062e-05, + "logits/chosen": -2.394732713699341, + "logits/rejected": -2.3233091831207275, + "logps/chosen": -202.18605041503906, + "logps/rejected": -218.09930419921875, + "loss": 0.28, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.454371929168701, + "rewards/margins": 2.983107328414917, + "rewards/rejected": -5.437479496002197, + "step": 1160 + }, + { + "epoch": 1.52, + "learning_rate": 2.625353143118955e-05, + "logits/chosen": -2.543612003326416, + "logits/rejected": -2.3408596515655518, + "logps/chosen": -219.27423095703125, + "logps/rejected": -246.36692810058594, + "loss": 0.1986, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.151402473449707, + "rewards/margins": 3.9970099925994873, + "rewards/rejected": -6.148412227630615, + "step": 1161 + }, + { + "epoch": 1.52, + "learning_rate": 2.621774501735299e-05, + "logits/chosen": -2.561230421066284, + "logits/rejected": -2.5563735961914062, + "logps/chosen": -212.06178283691406, + "logps/rejected": -280.7877197265625, + "loss": 0.0694, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.349179983139038, + "rewards/margins": 4.015913486480713, + "rewards/rejected": -6.36509370803833, + "step": 1162 + }, + { + "epoch": 1.52, + "learning_rate": 2.6181956102163724e-05, + "logits/chosen": -2.438438653945923, + "logits/rejected": -2.411284923553467, + "logps/chosen": -204.26626586914062, + "logps/rejected": -224.2980194091797, + "loss": 0.1596, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6477456092834473, + "rewards/margins": 3.6775503158569336, + "rewards/rejected": -5.325295448303223, + "step": 1163 + }, + { + "epoch": 1.52, + "learning_rate": 2.6146164759135266e-05, + "logits/chosen": -2.4624485969543457, + "logits/rejected": -2.608297109603882, + "logps/chosen": -198.11630249023438, + "logps/rejected": -279.4952087402344, + "loss": 0.0752, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9120194911956787, + "rewards/margins": 4.4521660804748535, + "rewards/rejected": -6.3641862869262695, + "step": 1164 + }, + { + "epoch": 1.52, + "learning_rate": 2.6110371061786104e-05, + "logits/chosen": -2.579885244369507, + "logits/rejected": -2.588857412338257, + "logps/chosen": -218.35621643066406, + "logps/rejected": -338.2559509277344, + "loss": 0.1651, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9608259201049805, + "rewards/margins": 4.070187568664551, + "rewards/rejected": -6.0310139656066895, + "step": 1165 + }, + { + "epoch": 1.53, + "learning_rate": 2.607457508363955e-05, + "logits/chosen": -2.5413784980773926, + "logits/rejected": -2.462733507156372, + "logps/chosen": -185.4339599609375, + "logps/rejected": -215.44322204589844, + "loss": 0.0722, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2230255603790283, + "rewards/margins": 3.1862130165100098, + "rewards/rejected": -5.409238815307617, + "step": 1166 + }, + { + "epoch": 1.53, + "learning_rate": 2.6038776898223627e-05, + "logits/chosen": -2.5814647674560547, + "logits/rejected": -2.5881781578063965, + "logps/chosen": -212.20013427734375, + "logps/rejected": -267.906494140625, + "loss": 0.2913, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.045924186706543, + "rewards/margins": 3.724151611328125, + "rewards/rejected": -5.770075798034668, + "step": 1167 + }, + { + "epoch": 1.53, + "learning_rate": 2.6002976579070872e-05, + "logits/chosen": -2.512990951538086, + "logits/rejected": -2.4913716316223145, + "logps/chosen": -245.27142333984375, + "logps/rejected": -293.2515869140625, + "loss": 0.0678, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.694406270980835, + "rewards/margins": 5.159485816955566, + "rewards/rejected": -6.8538923263549805, + "step": 1168 + }, + { + "epoch": 1.53, + "learning_rate": 2.5967174199718202e-05, + "logits/chosen": -2.5783042907714844, + "logits/rejected": -2.533820629119873, + "logps/chosen": -174.74606323242188, + "logps/rejected": -205.39010620117188, + "loss": 0.0997, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0044188499450684, + "rewards/margins": 3.723245143890381, + "rewards/rejected": -5.727663993835449, + "step": 1169 + }, + { + "epoch": 1.53, + "learning_rate": 2.5931369833706797e-05, + "logits/chosen": -2.505870819091797, + "logits/rejected": -2.652935028076172, + "logps/chosen": -165.0854034423828, + "logps/rejected": -210.19493103027344, + "loss": 0.1241, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9117227792739868, + "rewards/margins": 3.3760457038879395, + "rewards/rejected": -5.287768363952637, + "step": 1170 + }, + { + "epoch": 1.53, + "learning_rate": 2.5895563554581865e-05, + "logits/chosen": -2.5375044345855713, + "logits/rejected": -2.6303701400756836, + "logps/chosen": -214.3314208984375, + "logps/rejected": -259.046875, + "loss": 0.1468, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.904581069946289, + "rewards/margins": 3.818908452987671, + "rewards/rejected": -5.723488807678223, + "step": 1171 + }, + { + "epoch": 1.53, + "learning_rate": 2.5859755435892597e-05, + "logits/chosen": -2.544905185699463, + "logits/rejected": -2.5176284313201904, + "logps/chosen": -202.62747192382812, + "logps/rejected": -243.34954833984375, + "loss": 0.1014, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1793625354766846, + "rewards/margins": 3.771909236907959, + "rewards/rejected": -5.9512715339660645, + "step": 1172 + }, + { + "epoch": 1.54, + "learning_rate": 2.5823945551191937e-05, + "logits/chosen": -2.6635918617248535, + "logits/rejected": -2.5688891410827637, + "logps/chosen": -256.446044921875, + "logps/rejected": -284.71209716796875, + "loss": 0.028, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6801475286483765, + "rewards/margins": 4.665665626525879, + "rewards/rejected": -6.345813274383545, + "step": 1173 + }, + { + "epoch": 1.54, + "learning_rate": 2.578813397403645e-05, + "logits/chosen": -2.4600229263305664, + "logits/rejected": -2.5726194381713867, + "logps/chosen": -198.30300903320312, + "logps/rejected": -260.0888671875, + "loss": 0.1484, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1120827198028564, + "rewards/margins": 3.8566527366638184, + "rewards/rejected": -5.968735218048096, + "step": 1174 + }, + { + "epoch": 1.54, + "learning_rate": 2.5752320777986195e-05, + "logits/chosen": -2.505962371826172, + "logits/rejected": -2.575345039367676, + "logps/chosen": -172.0994873046875, + "logps/rejected": -230.88931274414062, + "loss": 0.1991, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8466949462890625, + "rewards/margins": 3.9375131130218506, + "rewards/rejected": -5.784208297729492, + "step": 1175 + }, + { + "epoch": 1.54, + "learning_rate": 2.5716506036604542e-05, + "logits/chosen": -2.6372785568237305, + "logits/rejected": -2.6130564212799072, + "logps/chosen": -193.61634826660156, + "logps/rejected": -225.62921142578125, + "loss": 0.2258, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.31103515625, + "rewards/margins": 2.853492021560669, + "rewards/rejected": -5.164527416229248, + "step": 1176 + }, + { + "epoch": 1.54, + "learning_rate": 2.568068982345804e-05, + "logits/chosen": -2.5906612873077393, + "logits/rejected": -2.5320322513580322, + "logps/chosen": -192.87355041503906, + "logps/rejected": -217.69412231445312, + "loss": 0.1555, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6314491033554077, + "rewards/margins": 3.411200761795044, + "rewards/rejected": -5.042649745941162, + "step": 1177 + }, + { + "epoch": 1.54, + "learning_rate": 2.5644872212116267e-05, + "logits/chosen": -2.3863308429718018, + "logits/rejected": -2.561542272567749, + "logps/chosen": -139.75320434570312, + "logps/rejected": -212.48846435546875, + "loss": 0.2087, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.422823905944824, + "rewards/margins": 3.4507193565368652, + "rewards/rejected": -5.873542785644531, + "step": 1178 + }, + { + "epoch": 1.54, + "learning_rate": 2.560905327615168e-05, + "logits/chosen": -2.362597942352295, + "logits/rejected": -2.4775197505950928, + "logps/chosen": -202.98377990722656, + "logps/rejected": -229.90570068359375, + "loss": 0.1899, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9989982843399048, + "rewards/margins": 3.311842441558838, + "rewards/rejected": -5.310841083526611, + "step": 1179 + }, + { + "epoch": 1.54, + "learning_rate": 2.557323308913942e-05, + "logits/chosen": -2.266062021255493, + "logits/rejected": -2.3673911094665527, + "logps/chosen": -228.26878356933594, + "logps/rejected": -313.76556396484375, + "loss": 0.1365, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4849106073379517, + "rewards/margins": 5.162468433380127, + "rewards/rejected": -6.647378921508789, + "step": 1180 + }, + { + "epoch": 1.55, + "learning_rate": 2.553741172465724e-05, + "logits/chosen": -2.4847447872161865, + "logits/rejected": -2.4782495498657227, + "logps/chosen": -189.3985595703125, + "logps/rejected": -236.57984924316406, + "loss": 0.1747, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.518495798110962, + "rewards/margins": 4.221526145935059, + "rewards/rejected": -5.740021705627441, + "step": 1181 + }, + { + "epoch": 1.55, + "learning_rate": 2.5501589256285285e-05, + "logits/chosen": -2.3993282318115234, + "logits/rejected": -2.4668867588043213, + "logps/chosen": -237.7062530517578, + "logps/rejected": -278.96942138671875, + "loss": 0.0514, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9264401197433472, + "rewards/margins": 4.7197160720825195, + "rewards/rejected": -6.646156311035156, + "step": 1182 + }, + { + "epoch": 1.55, + "learning_rate": 2.546576575760598e-05, + "logits/chosen": -2.532160758972168, + "logits/rejected": -2.6826624870300293, + "logps/chosen": -221.54660034179688, + "logps/rejected": -259.85107421875, + "loss": 0.0424, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0069048404693604, + "rewards/margins": 4.225020408630371, + "rewards/rejected": -6.231925010681152, + "step": 1183 + }, + { + "epoch": 1.55, + "learning_rate": 2.542994130220388e-05, + "logits/chosen": -2.351355791091919, + "logits/rejected": -2.322763681411743, + "logps/chosen": -237.50283813476562, + "logps/rejected": -239.27328491210938, + "loss": 0.264, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.335080623626709, + "rewards/margins": 3.1680402755737305, + "rewards/rejected": -5.503120422363281, + "step": 1184 + }, + { + "epoch": 1.55, + "learning_rate": 2.539411596366546e-05, + "logits/chosen": -2.569291830062866, + "logits/rejected": -2.662236213684082, + "logps/chosen": -198.28985595703125, + "logps/rejected": -284.72100830078125, + "loss": 0.0381, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.120055675506592, + "rewards/margins": 4.714890480041504, + "rewards/rejected": -6.834946632385254, + "step": 1185 + }, + { + "epoch": 1.55, + "learning_rate": 2.535828981557906e-05, + "logits/chosen": -2.396310329437256, + "logits/rejected": -2.3363780975341797, + "logps/chosen": -190.41680908203125, + "logps/rejected": -225.03347778320312, + "loss": 0.1013, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5107342004776, + "rewards/margins": 3.9485929012298584, + "rewards/rejected": -5.45932674407959, + "step": 1186 + }, + { + "epoch": 1.55, + "learning_rate": 2.5322462931534658e-05, + "logits/chosen": -2.4046525955200195, + "logits/rejected": -2.4155662059783936, + "logps/chosen": -143.59957885742188, + "logps/rejected": -197.567138671875, + "loss": 0.1609, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.510847806930542, + "rewards/margins": 4.117862701416016, + "rewards/rejected": -5.628711223602295, + "step": 1187 + }, + { + "epoch": 1.55, + "learning_rate": 2.5286635385123725e-05, + "logits/chosen": -2.4825637340545654, + "logits/rejected": -2.488572359085083, + "logps/chosen": -185.13006591796875, + "logps/rejected": -208.24627685546875, + "loss": 0.1996, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.4900832176208496, + "rewards/margins": 2.789546489715576, + "rewards/rejected": -5.279629707336426, + "step": 1188 + }, + { + "epoch": 1.56, + "learning_rate": 2.525080724993914e-05, + "logits/chosen": -2.3754312992095947, + "logits/rejected": -2.317816734313965, + "logps/chosen": -210.82420349121094, + "logps/rejected": -247.4285125732422, + "loss": 0.2063, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.501269817352295, + "rewards/margins": 3.8780133724212646, + "rewards/rejected": -6.379282474517822, + "step": 1189 + }, + { + "epoch": 1.56, + "learning_rate": 2.521497859957495e-05, + "logits/chosen": -2.61700439453125, + "logits/rejected": -2.6015262603759766, + "logps/chosen": -230.24192810058594, + "logps/rejected": -280.5144348144531, + "loss": 0.2983, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.067976713180542, + "rewards/margins": 3.6285672187805176, + "rewards/rejected": -6.696544647216797, + "step": 1190 + }, + { + "epoch": 1.56, + "learning_rate": 2.5179149507626288e-05, + "logits/chosen": -2.602006435394287, + "logits/rejected": -2.6151204109191895, + "logps/chosen": -249.7869873046875, + "logps/rejected": -265.90185546875, + "loss": 0.1049, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9429850578308105, + "rewards/margins": 3.4793856143951416, + "rewards/rejected": -5.422370910644531, + "step": 1191 + }, + { + "epoch": 1.56, + "learning_rate": 2.5143320047689173e-05, + "logits/chosen": -2.554598093032837, + "logits/rejected": -2.557563066482544, + "logps/chosen": -191.95619201660156, + "logps/rejected": -236.6368408203125, + "loss": 0.0909, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.783348798751831, + "rewards/margins": 3.685852527618408, + "rewards/rejected": -5.46920108795166, + "step": 1192 + }, + { + "epoch": 1.56, + "learning_rate": 2.510749029336038e-05, + "logits/chosen": -2.360487699508667, + "logits/rejected": -2.4794256687164307, + "logps/chosen": -185.53933715820312, + "logps/rejected": -252.9842529296875, + "loss": 0.0871, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6528232097625732, + "rewards/margins": 4.595056533813477, + "rewards/rejected": -6.247879505157471, + "step": 1193 + }, + { + "epoch": 1.56, + "learning_rate": 2.5071660318237312e-05, + "logits/chosen": -2.4765918254852295, + "logits/rejected": -2.4997401237487793, + "logps/chosen": -267.73760986328125, + "logps/rejected": -309.74078369140625, + "loss": 0.2239, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.543215274810791, + "rewards/margins": 2.8270809650421143, + "rewards/rejected": -5.370296001434326, + "step": 1194 + }, + { + "epoch": 1.56, + "learning_rate": 2.5035830195917803e-05, + "logits/chosen": -2.3442373275756836, + "logits/rejected": -2.4678149223327637, + "logps/chosen": -178.58602905273438, + "logps/rejected": -250.99200439453125, + "loss": 0.0798, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.024785041809082, + "rewards/margins": 4.233987808227539, + "rewards/rejected": -6.258773326873779, + "step": 1195 + }, + { + "epoch": 1.57, + "learning_rate": 2.5e-05, + "logits/chosen": -2.5049970149993896, + "logits/rejected": -2.524064779281616, + "logps/chosen": -178.7131805419922, + "logps/rejected": -241.450439453125, + "loss": 0.0956, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.132033348083496, + "rewards/margins": 4.712368488311768, + "rewards/rejected": -6.844402313232422, + "step": 1196 + }, + { + "epoch": 1.57, + "learning_rate": 2.49641698040822e-05, + "logits/chosen": -2.5503926277160645, + "logits/rejected": -2.6383919715881348, + "logps/chosen": -220.08641052246094, + "logps/rejected": -252.4928436279297, + "loss": 0.1522, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.065068244934082, + "rewards/margins": 3.896519184112549, + "rewards/rejected": -5.961587429046631, + "step": 1197 + }, + { + "epoch": 1.57, + "learning_rate": 2.4928339681762687e-05, + "logits/chosen": -2.492288827896118, + "logits/rejected": -2.5964560508728027, + "logps/chosen": -194.37997436523438, + "logps/rejected": -258.8657531738281, + "loss": 0.0818, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0502939224243164, + "rewards/margins": 4.57408332824707, + "rewards/rejected": -6.624377250671387, + "step": 1198 + }, + { + "epoch": 1.57, + "learning_rate": 2.489250970663963e-05, + "logits/chosen": -2.3082306385040283, + "logits/rejected": -2.5480411052703857, + "logps/chosen": -190.9907684326172, + "logps/rejected": -308.1433410644531, + "loss": 0.0553, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.308465003967285, + "rewards/margins": 4.765655040740967, + "rewards/rejected": -7.074119567871094, + "step": 1199 + }, + { + "epoch": 1.57, + "learning_rate": 2.485667995231084e-05, + "logits/chosen": -2.4776437282562256, + "logits/rejected": -2.474857807159424, + "logps/chosen": -186.9247589111328, + "logps/rejected": -220.29751586914062, + "loss": 0.1593, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.372529983520508, + "rewards/margins": 3.439894914627075, + "rewards/rejected": -5.812424659729004, + "step": 1200 + }, + { + "epoch": 1.57, + "learning_rate": 2.4820850492373718e-05, + "logits/chosen": -2.36360502243042, + "logits/rejected": -2.397672176361084, + "logps/chosen": -233.58541870117188, + "logps/rejected": -228.21426391601562, + "loss": 0.2224, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0872836112976074, + "rewards/margins": 3.649059534072876, + "rewards/rejected": -5.7363433837890625, + "step": 1201 + }, + { + "epoch": 1.57, + "learning_rate": 2.4785021400425053e-05, + "logits/chosen": -2.4734957218170166, + "logits/rejected": -2.481133460998535, + "logps/chosen": -187.26792907714844, + "logps/rejected": -231.77871704101562, + "loss": 0.2656, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1196248531341553, + "rewards/margins": 3.671448230743408, + "rewards/rejected": -5.791072368621826, + "step": 1202 + }, + { + "epoch": 1.57, + "learning_rate": 2.474919275006086e-05, + "logits/chosen": -2.1534435749053955, + "logits/rejected": -2.3020589351654053, + "logps/chosen": -155.80584716796875, + "logps/rejected": -215.63375854492188, + "loss": 0.1296, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.150587797164917, + "rewards/margins": 3.296694040298462, + "rewards/rejected": -5.447281837463379, + "step": 1203 + }, + { + "epoch": 1.58, + "learning_rate": 2.4713364614876274e-05, + "logits/chosen": -2.3445792198181152, + "logits/rejected": -2.4130592346191406, + "logps/chosen": -238.4275360107422, + "logps/rejected": -264.49609375, + "loss": 0.0657, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1871018409729004, + "rewards/margins": 3.938580274581909, + "rewards/rejected": -6.125682830810547, + "step": 1204 + }, + { + "epoch": 1.58, + "learning_rate": 2.4677537068465355e-05, + "logits/chosen": -2.504162311553955, + "logits/rejected": -2.586043357849121, + "logps/chosen": -182.72637939453125, + "logps/rejected": -219.2725830078125, + "loss": 0.1691, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5585496425628662, + "rewards/margins": 4.131922245025635, + "rewards/rejected": -5.690471649169922, + "step": 1205 + }, + { + "epoch": 1.58, + "learning_rate": 2.4641710184420945e-05, + "logits/chosen": -2.375551462173462, + "logits/rejected": -2.4032394886016846, + "logps/chosen": -237.87876892089844, + "logps/rejected": -292.5626525878906, + "loss": 0.0899, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7856876850128174, + "rewards/margins": 4.658097267150879, + "rewards/rejected": -6.443785667419434, + "step": 1206 + }, + { + "epoch": 1.58, + "learning_rate": 2.4605884036334546e-05, + "logits/chosen": -2.454979658126831, + "logits/rejected": -2.485933542251587, + "logps/chosen": -212.19326782226562, + "logps/rejected": -259.90643310546875, + "loss": 0.0975, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8596946001052856, + "rewards/margins": 3.944255828857422, + "rewards/rejected": -5.803950309753418, + "step": 1207 + }, + { + "epoch": 1.58, + "learning_rate": 2.4570058697796125e-05, + "logits/chosen": -2.3916454315185547, + "logits/rejected": -2.321455478668213, + "logps/chosen": -169.2388916015625, + "logps/rejected": -203.18197631835938, + "loss": 0.2062, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6446034908294678, + "rewards/margins": 3.9655921459198, + "rewards/rejected": -5.610195636749268, + "step": 1208 + }, + { + "epoch": 1.58, + "learning_rate": 2.4534234242394015e-05, + "logits/chosen": -2.1381185054779053, + "logits/rejected": -2.1411941051483154, + "logps/chosen": -225.93563842773438, + "logps/rejected": -275.65191650390625, + "loss": 0.1476, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.519797682762146, + "rewards/margins": 5.228862762451172, + "rewards/rejected": -6.748660564422607, + "step": 1209 + }, + { + "epoch": 1.58, + "learning_rate": 2.449841074371472e-05, + "logits/chosen": -2.505796432495117, + "logits/rejected": -2.374293088912964, + "logps/chosen": -160.7147216796875, + "logps/rejected": -183.5177459716797, + "loss": 0.2437, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8080675601959229, + "rewards/margins": 2.3371920585632324, + "rewards/rejected": -4.145259857177734, + "step": 1210 + }, + { + "epoch": 1.59, + "learning_rate": 2.4462588275342773e-05, + "logits/chosen": -2.4670982360839844, + "logits/rejected": -2.412968635559082, + "logps/chosen": -186.62237548828125, + "logps/rejected": -243.04965209960938, + "loss": 0.0987, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.491491436958313, + "rewards/margins": 4.842393398284912, + "rewards/rejected": -6.333884239196777, + "step": 1211 + }, + { + "epoch": 1.59, + "learning_rate": 2.4426766910860585e-05, + "logits/chosen": -2.463569402694702, + "logits/rejected": -2.427149534225464, + "logps/chosen": -210.4026336669922, + "logps/rejected": -261.46533203125, + "loss": 0.1528, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.509092330932617, + "rewards/margins": 3.3179991245269775, + "rewards/rejected": -5.827091693878174, + "step": 1212 + }, + { + "epoch": 1.59, + "learning_rate": 2.439094672384833e-05, + "logits/chosen": -2.441802978515625, + "logits/rejected": -2.381190776824951, + "logps/chosen": -205.2437744140625, + "logps/rejected": -229.05203247070312, + "loss": 0.239, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3627405166625977, + "rewards/margins": 2.829514741897583, + "rewards/rejected": -5.192255973815918, + "step": 1213 + }, + { + "epoch": 1.59, + "learning_rate": 2.4355127787883732e-05, + "logits/chosen": -2.330437183380127, + "logits/rejected": -2.4237754344940186, + "logps/chosen": -171.38157653808594, + "logps/rejected": -239.57034301757812, + "loss": 0.1137, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.096087694168091, + "rewards/margins": 3.611819267272949, + "rewards/rejected": -5.707907199859619, + "step": 1214 + }, + { + "epoch": 1.59, + "learning_rate": 2.4319310176541958e-05, + "logits/chosen": -2.339777708053589, + "logits/rejected": -2.3736634254455566, + "logps/chosen": -186.14675903320312, + "logps/rejected": -221.11712646484375, + "loss": 0.218, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6867961883544922, + "rewards/margins": 4.029036521911621, + "rewards/rejected": -5.715832710266113, + "step": 1215 + }, + { + "epoch": 1.59, + "learning_rate": 2.428349396339547e-05, + "logits/chosen": -2.350100040435791, + "logits/rejected": -2.423497200012207, + "logps/chosen": -169.83596801757812, + "logps/rejected": -221.52064514160156, + "loss": 0.118, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2997121810913086, + "rewards/margins": 2.9824600219726562, + "rewards/rejected": -5.282171726226807, + "step": 1216 + }, + { + "epoch": 1.59, + "learning_rate": 2.424767922201381e-05, + "logits/chosen": -2.2392563819885254, + "logits/rejected": -2.282517671585083, + "logps/chosen": -172.95179748535156, + "logps/rejected": -230.56796264648438, + "loss": 0.3067, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.248317003250122, + "rewards/margins": 4.031771183013916, + "rewards/rejected": -6.280088424682617, + "step": 1217 + }, + { + "epoch": 1.59, + "learning_rate": 2.4211866025963557e-05, + "logits/chosen": -2.47070574760437, + "logits/rejected": -2.340616464614868, + "logps/chosen": -197.75460815429688, + "logps/rejected": -219.54380798339844, + "loss": 0.206, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.206007957458496, + "rewards/margins": 3.441049814224243, + "rewards/rejected": -5.64705753326416, + "step": 1218 + }, + { + "epoch": 1.6, + "learning_rate": 2.417605444880807e-05, + "logits/chosen": -2.449453830718994, + "logits/rejected": -2.486593246459961, + "logps/chosen": -248.49826049804688, + "logps/rejected": -259.3140869140625, + "loss": 0.1304, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6558809280395508, + "rewards/margins": 4.491630554199219, + "rewards/rejected": -6.147511005401611, + "step": 1219 + }, + { + "epoch": 1.6, + "learning_rate": 2.4140244564107402e-05, + "logits/chosen": -2.431633949279785, + "logits/rejected": -2.420821189880371, + "logps/chosen": -182.12156677246094, + "logps/rejected": -262.09075927734375, + "loss": 0.1415, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2471213340759277, + "rewards/margins": 4.404972076416016, + "rewards/rejected": -6.652093887329102, + "step": 1220 + }, + { + "epoch": 1.6, + "learning_rate": 2.4104436445418145e-05, + "logits/chosen": -2.4458508491516113, + "logits/rejected": -2.5268325805664062, + "logps/chosen": -195.7959442138672, + "logps/rejected": -255.20120239257812, + "loss": 0.1643, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9084779024124146, + "rewards/margins": 3.752932548522949, + "rewards/rejected": -5.661409854888916, + "step": 1221 + }, + { + "epoch": 1.6, + "learning_rate": 2.4068630166293215e-05, + "logits/chosen": -2.3241653442382812, + "logits/rejected": -2.2756261825561523, + "logps/chosen": -190.4071044921875, + "logps/rejected": -212.93801879882812, + "loss": 0.1683, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0445754528045654, + "rewards/margins": 3.6302294731140137, + "rewards/rejected": -5.6748046875, + "step": 1222 + }, + { + "epoch": 1.6, + "learning_rate": 2.4032825800281804e-05, + "logits/chosen": -2.4622201919555664, + "logits/rejected": -2.5122151374816895, + "logps/chosen": -229.62994384765625, + "logps/rejected": -266.20513916015625, + "loss": 0.2343, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7815600633621216, + "rewards/margins": 3.7686362266540527, + "rewards/rejected": -5.550196170806885, + "step": 1223 + }, + { + "epoch": 1.6, + "learning_rate": 2.3997023420929137e-05, + "logits/chosen": -2.431450128555298, + "logits/rejected": -2.4412267208099365, + "logps/chosen": -170.38693237304688, + "logps/rejected": -225.38699340820312, + "loss": 0.1432, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7164604663848877, + "rewards/margins": 4.028445243835449, + "rewards/rejected": -5.744906425476074, + "step": 1224 + }, + { + "epoch": 1.6, + "learning_rate": 2.3961223101776375e-05, + "logits/chosen": -2.480684518814087, + "logits/rejected": -2.5819950103759766, + "logps/chosen": -180.00588989257812, + "logps/rejected": -227.92233276367188, + "loss": 0.1112, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1921019554138184, + "rewards/margins": 4.109508514404297, + "rewards/rejected": -5.301610946655273, + "step": 1225 + }, + { + "epoch": 1.6, + "learning_rate": 2.392542491636045e-05, + "logits/chosen": -2.2564690113067627, + "logits/rejected": -2.4720845222473145, + "logps/chosen": -161.28631591796875, + "logps/rejected": -184.99612426757812, + "loss": 0.2086, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.5437161922454834, + "rewards/margins": 2.3674418926239014, + "rewards/rejected": -4.911158561706543, + "step": 1226 + }, + { + "epoch": 1.61, + "learning_rate": 2.3889628938213905e-05, + "logits/chosen": -2.4496514797210693, + "logits/rejected": -2.399341106414795, + "logps/chosen": -195.6259765625, + "logps/rejected": -241.14181518554688, + "loss": 0.1629, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.14137601852417, + "rewards/margins": 4.007162570953369, + "rewards/rejected": -6.148538589477539, + "step": 1227 + }, + { + "epoch": 1.61, + "learning_rate": 2.3853835240864743e-05, + "logits/chosen": -2.426723003387451, + "logits/rejected": -2.4882919788360596, + "logps/chosen": -202.53793334960938, + "logps/rejected": -264.81658935546875, + "loss": 0.0853, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5354681015014648, + "rewards/margins": 4.929140090942383, + "rewards/rejected": -6.4646077156066895, + "step": 1228 + }, + { + "epoch": 1.61, + "learning_rate": 2.381804389783628e-05, + "logits/chosen": -2.1970605850219727, + "logits/rejected": -2.3454794883728027, + "logps/chosen": -138.98629760742188, + "logps/rejected": -207.320068359375, + "loss": 0.139, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.326474666595459, + "rewards/margins": 3.5799500942230225, + "rewards/rejected": -4.9064249992370605, + "step": 1229 + }, + { + "epoch": 1.61, + "learning_rate": 2.3782254982647013e-05, + "logits/chosen": -2.420474052429199, + "logits/rejected": -2.390350341796875, + "logps/chosen": -215.09228515625, + "logps/rejected": -247.38101196289062, + "loss": 0.2326, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.475163698196411, + "rewards/margins": 2.718937635421753, + "rewards/rejected": -5.194101333618164, + "step": 1230 + }, + { + "epoch": 1.61, + "learning_rate": 2.374646856881045e-05, + "logits/chosen": -2.438455581665039, + "logits/rejected": -2.4569973945617676, + "logps/chosen": -207.00938415527344, + "logps/rejected": -248.0533447265625, + "loss": 0.158, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6947007179260254, + "rewards/margins": 4.077005863189697, + "rewards/rejected": -5.771706581115723, + "step": 1231 + }, + { + "epoch": 1.61, + "learning_rate": 2.3710684729834954e-05, + "logits/chosen": -2.3892476558685303, + "logits/rejected": -2.425858736038208, + "logps/chosen": -215.9761199951172, + "logps/rejected": -243.03236389160156, + "loss": 0.1767, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1680920124053955, + "rewards/margins": 2.8967502117156982, + "rewards/rejected": -5.064842224121094, + "step": 1232 + }, + { + "epoch": 1.61, + "learning_rate": 2.367490353922358e-05, + "logits/chosen": -2.2655513286590576, + "logits/rejected": -2.279118299484253, + "logps/chosen": -187.2826385498047, + "logps/rejected": -246.75619506835938, + "loss": 0.1181, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3723695278167725, + "rewards/margins": 3.709144115447998, + "rewards/rejected": -6.081512928009033, + "step": 1233 + }, + { + "epoch": 1.62, + "learning_rate": 2.3639125070473975e-05, + "logits/chosen": -2.3893938064575195, + "logits/rejected": -2.4231905937194824, + "logps/chosen": -159.58692932128906, + "logps/rejected": -198.72528076171875, + "loss": 0.1449, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6047216653823853, + "rewards/margins": 3.348755359649658, + "rewards/rejected": -4.953476905822754, + "step": 1234 + }, + { + "epoch": 1.62, + "learning_rate": 2.3603349397078182e-05, + "logits/chosen": -2.5526320934295654, + "logits/rejected": -2.5546908378601074, + "logps/chosen": -233.10751342773438, + "logps/rejected": -267.2059326171875, + "loss": 0.0528, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9297378063201904, + "rewards/margins": 4.423281669616699, + "rewards/rejected": -6.353020191192627, + "step": 1235 + }, + { + "epoch": 1.62, + "learning_rate": 2.3567576592522507e-05, + "logits/chosen": -2.1117677688598633, + "logits/rejected": -2.2029826641082764, + "logps/chosen": -150.48196411132812, + "logps/rejected": -209.81484985351562, + "loss": 0.1511, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6189525127410889, + "rewards/margins": 3.7750935554504395, + "rewards/rejected": -5.394045352935791, + "step": 1236 + }, + { + "epoch": 1.62, + "learning_rate": 2.3531806730287342e-05, + "logits/chosen": -2.132078170776367, + "logits/rejected": -2.2218964099884033, + "logps/chosen": -174.1147003173828, + "logps/rejected": -216.79774475097656, + "loss": 0.1611, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5399067401885986, + "rewards/margins": 3.2161121368408203, + "rewards/rejected": -4.756019115447998, + "step": 1237 + }, + { + "epoch": 1.62, + "learning_rate": 2.349603988384708e-05, + "logits/chosen": -2.2425410747528076, + "logits/rejected": -2.31071400642395, + "logps/chosen": -157.0716552734375, + "logps/rejected": -202.0670166015625, + "loss": 0.1725, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7749437093734741, + "rewards/margins": 3.3446571826934814, + "rewards/rejected": -5.119600772857666, + "step": 1238 + }, + { + "epoch": 1.62, + "learning_rate": 2.3460276126669854e-05, + "logits/chosen": -2.3171401023864746, + "logits/rejected": -2.38438081741333, + "logps/chosen": -161.9048309326172, + "logps/rejected": -238.03546142578125, + "loss": 0.0461, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7505953311920166, + "rewards/margins": 3.8352746963500977, + "rewards/rejected": -5.585869789123535, + "step": 1239 + }, + { + "epoch": 1.62, + "learning_rate": 2.342451553221752e-05, + "logits/chosen": -2.318977117538452, + "logits/rejected": -2.299112319946289, + "logps/chosen": -186.75778198242188, + "logps/rejected": -210.36260986328125, + "loss": 0.1398, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0520501136779785, + "rewards/margins": 2.9656755924224854, + "rewards/rejected": -5.017725467681885, + "step": 1240 + }, + { + "epoch": 1.62, + "learning_rate": 2.338875817394539e-05, + "logits/chosen": -2.405032157897949, + "logits/rejected": -2.467423915863037, + "logps/chosen": -197.04295349121094, + "logps/rejected": -275.3404846191406, + "loss": 0.131, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4611997604370117, + "rewards/margins": 3.7919762134552, + "rewards/rejected": -5.253175735473633, + "step": 1241 + }, + { + "epoch": 1.63, + "learning_rate": 2.3353004125302142e-05, + "logits/chosen": -2.3426947593688965, + "logits/rejected": -2.354165554046631, + "logps/chosen": -181.57960510253906, + "logps/rejected": -194.0592041015625, + "loss": 0.1492, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4765923023223877, + "rewards/margins": 2.6731584072113037, + "rewards/rejected": -5.149750709533691, + "step": 1242 + }, + { + "epoch": 1.63, + "learning_rate": 2.331725345972968e-05, + "logits/chosen": -2.530708074569702, + "logits/rejected": -2.583585739135742, + "logps/chosen": -209.6914825439453, + "logps/rejected": -298.46234130859375, + "loss": 0.0449, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1505860090255737, + "rewards/margins": 5.370021343231201, + "rewards/rejected": -6.520607948303223, + "step": 1243 + }, + { + "epoch": 1.63, + "learning_rate": 2.32815062506629e-05, + "logits/chosen": -2.2825493812561035, + "logits/rejected": -2.3804893493652344, + "logps/chosen": -208.253662109375, + "logps/rejected": -249.202392578125, + "loss": 0.1949, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.712033748626709, + "rewards/margins": 3.4290895462036133, + "rewards/rejected": -5.141123294830322, + "step": 1244 + }, + { + "epoch": 1.63, + "learning_rate": 2.3245762571529667e-05, + "logits/chosen": -2.347567558288574, + "logits/rejected": -2.3474326133728027, + "logps/chosen": -166.81358337402344, + "logps/rejected": -211.6610107421875, + "loss": 0.1883, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3718667030334473, + "rewards/margins": 2.700474262237549, + "rewards/rejected": -5.072341442108154, + "step": 1245 + }, + { + "epoch": 1.63, + "learning_rate": 2.3210022495750552e-05, + "logits/chosen": -2.2256298065185547, + "logits/rejected": -2.2157697677612305, + "logps/chosen": -166.595458984375, + "logps/rejected": -223.0438995361328, + "loss": 0.1909, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0562028884887695, + "rewards/margins": 3.8893682956695557, + "rewards/rejected": -5.945570945739746, + "step": 1246 + }, + { + "epoch": 1.63, + "learning_rate": 2.317428609673873e-05, + "logits/chosen": -2.4161572456359863, + "logits/rejected": -2.433016777038574, + "logps/chosen": -229.64622497558594, + "logps/rejected": -297.77685546875, + "loss": 0.2011, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.022761106491089, + "rewards/margins": 4.363578796386719, + "rewards/rejected": -6.3863396644592285, + "step": 1247 + }, + { + "epoch": 1.63, + "learning_rate": 2.3138553447899835e-05, + "logits/chosen": -2.347311496734619, + "logits/rejected": -2.374821662902832, + "logps/chosen": -203.28680419921875, + "logps/rejected": -227.5606231689453, + "loss": 0.1231, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7865922451019287, + "rewards/margins": 3.9671671390533447, + "rewards/rejected": -5.753759384155273, + "step": 1248 + }, + { + "epoch": 1.63, + "learning_rate": 2.3102824622631803e-05, + "logits/chosen": -2.4332902431488037, + "logits/rejected": -2.480926990509033, + "logps/chosen": -204.81556701660156, + "logps/rejected": -269.7750549316406, + "loss": 0.0878, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8985775709152222, + "rewards/margins": 5.506340026855469, + "rewards/rejected": -7.404917240142822, + "step": 1249 + }, + { + "epoch": 1.64, + "learning_rate": 2.3067099694324686e-05, + "logits/chosen": -2.1479830741882324, + "logits/rejected": -2.2253804206848145, + "logps/chosen": -164.16336059570312, + "logps/rejected": -203.90867614746094, + "loss": 0.125, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.071167230606079, + "rewards/margins": 3.142298460006714, + "rewards/rejected": -5.213465690612793, + "step": 1250 + }, + { + "epoch": 1.64, + "learning_rate": 2.3031378736360562e-05, + "logits/chosen": -2.228970527648926, + "logits/rejected": -2.30505108833313, + "logps/chosen": -181.6044921875, + "logps/rejected": -256.03192138671875, + "loss": 0.1165, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9760847091674805, + "rewards/margins": 4.596839904785156, + "rewards/rejected": -6.572924613952637, + "step": 1251 + }, + { + "epoch": 1.64, + "learning_rate": 2.299566182211333e-05, + "logits/chosen": -2.149162530899048, + "logits/rejected": -2.2387490272521973, + "logps/chosen": -170.30865478515625, + "logps/rejected": -207.37643432617188, + "loss": 0.1036, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.128716230392456, + "rewards/margins": 3.322714328765869, + "rewards/rejected": -5.451430320739746, + "step": 1252 + }, + { + "epoch": 1.64, + "learning_rate": 2.295994902494861e-05, + "logits/chosen": -2.3936662673950195, + "logits/rejected": -2.4781973361968994, + "logps/chosen": -177.29734802246094, + "logps/rejected": -223.9259033203125, + "loss": 0.1026, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.371659755706787, + "rewards/margins": 3.1970856189727783, + "rewards/rejected": -5.568745136260986, + "step": 1253 + }, + { + "epoch": 1.64, + "learning_rate": 2.292424041822355e-05, + "logits/chosen": -2.3342461585998535, + "logits/rejected": -2.3712892532348633, + "logps/chosen": -193.8314208984375, + "logps/rejected": -251.5005645751953, + "loss": 0.0865, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.419714093208313, + "rewards/margins": 4.518192768096924, + "rewards/rejected": -5.937906742095947, + "step": 1254 + }, + { + "epoch": 1.64, + "learning_rate": 2.2888536075286675e-05, + "logits/chosen": -2.3822429180145264, + "logits/rejected": -2.443021059036255, + "logps/chosen": -174.8991241455078, + "logps/rejected": -240.3403778076172, + "loss": 0.1694, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1065659523010254, + "rewards/margins": 3.122783660888672, + "rewards/rejected": -5.229349613189697, + "step": 1255 + }, + { + "epoch": 1.64, + "learning_rate": 2.2852836069477773e-05, + "logits/chosen": -2.330996036529541, + "logits/rejected": -2.399650812149048, + "logps/chosen": -154.3611297607422, + "logps/rejected": -209.98532104492188, + "loss": 0.3558, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0913143157958984, + "rewards/margins": 3.176445722579956, + "rewards/rejected": -5.267760276794434, + "step": 1256 + }, + { + "epoch": 1.65, + "learning_rate": 2.281714047412773e-05, + "logits/chosen": -2.2353427410125732, + "logits/rejected": -2.297071695327759, + "logps/chosen": -158.347900390625, + "logps/rejected": -213.35313415527344, + "loss": 0.1291, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4900450706481934, + "rewards/margins": 4.13687801361084, + "rewards/rejected": -5.626923084259033, + "step": 1257 + }, + { + "epoch": 1.65, + "learning_rate": 2.2781449362558347e-05, + "logits/chosen": -2.4674859046936035, + "logits/rejected": -2.404099702835083, + "logps/chosen": -200.00509643554688, + "logps/rejected": -236.1136474609375, + "loss": 0.1887, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1089351177215576, + "rewards/margins": 3.3504581451416016, + "rewards/rejected": -5.459392547607422, + "step": 1258 + }, + { + "epoch": 1.65, + "learning_rate": 2.2745762808082223e-05, + "logits/chosen": -2.396791458129883, + "logits/rejected": -2.413886070251465, + "logps/chosen": -194.5991973876953, + "logps/rejected": -239.6669464111328, + "loss": 0.0943, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5336006879806519, + "rewards/margins": 4.422511100769043, + "rewards/rejected": -5.956111907958984, + "step": 1259 + }, + { + "epoch": 1.65, + "learning_rate": 2.2710080884002632e-05, + "logits/chosen": -2.2711637020111084, + "logits/rejected": -2.331444025039673, + "logps/chosen": -173.2117156982422, + "logps/rejected": -221.156982421875, + "loss": 0.1955, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9484103918075562, + "rewards/margins": 3.345860719680786, + "rewards/rejected": -5.294271469116211, + "step": 1260 + }, + { + "epoch": 1.65, + "learning_rate": 2.2674403663613267e-05, + "logits/chosen": -2.2264342308044434, + "logits/rejected": -2.237760305404663, + "logps/chosen": -180.58277893066406, + "logps/rejected": -201.0824737548828, + "loss": 0.1895, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9844893217086792, + "rewards/margins": 2.5527989864349365, + "rewards/rejected": -4.537288188934326, + "step": 1261 + }, + { + "epoch": 1.65, + "learning_rate": 2.263873122019822e-05, + "logits/chosen": -2.376873731613159, + "logits/rejected": -2.4398136138916016, + "logps/chosen": -209.56585693359375, + "logps/rejected": -250.48629760742188, + "loss": 0.101, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2790703773498535, + "rewards/margins": 4.363531112670898, + "rewards/rejected": -5.642601013183594, + "step": 1262 + }, + { + "epoch": 1.65, + "learning_rate": 2.2603063627031744e-05, + "logits/chosen": -2.4660024642944336, + "logits/rejected": -2.45780611038208, + "logps/chosen": -192.86569213867188, + "logps/rejected": -225.05224609375, + "loss": 0.1482, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3924591541290283, + "rewards/margins": 3.239257574081421, + "rewards/rejected": -5.631716728210449, + "step": 1263 + }, + { + "epoch": 1.65, + "learning_rate": 2.2567400957378132e-05, + "logits/chosen": -2.1617038249969482, + "logits/rejected": -2.348353862762451, + "logps/chosen": -165.9205780029297, + "logps/rejected": -235.53817749023438, + "loss": 0.0752, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4605586528778076, + "rewards/margins": 4.106925010681152, + "rewards/rejected": -6.567483901977539, + "step": 1264 + }, + { + "epoch": 1.66, + "learning_rate": 2.253174328449158e-05, + "logits/chosen": -2.337563991546631, + "logits/rejected": -2.335984230041504, + "logps/chosen": -145.81805419921875, + "logps/rejected": -213.2615509033203, + "loss": 0.1421, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5050880908966064, + "rewards/margins": 3.914152145385742, + "rewards/rejected": -5.4192399978637695, + "step": 1265 + }, + { + "epoch": 1.66, + "learning_rate": 2.2496090681615984e-05, + "logits/chosen": -2.3932511806488037, + "logits/rejected": -2.390071392059326, + "logps/chosen": -180.98995971679688, + "logps/rejected": -240.12033081054688, + "loss": 0.1459, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.034759283065796, + "rewards/margins": 3.6752748489379883, + "rewards/rejected": -5.710034370422363, + "step": 1266 + }, + { + "epoch": 1.66, + "learning_rate": 2.246044322198486e-05, + "logits/chosen": -2.36371111869812, + "logits/rejected": -2.446136713027954, + "logps/chosen": -205.06329345703125, + "logps/rejected": -275.208251953125, + "loss": 0.0453, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8262585401535034, + "rewards/margins": 4.66420841217041, + "rewards/rejected": -6.490467071533203, + "step": 1267 + }, + { + "epoch": 1.66, + "learning_rate": 2.2424800978821146e-05, + "logits/chosen": -2.2914657592773438, + "logits/rejected": -2.257660150527954, + "logps/chosen": -183.5614776611328, + "logps/rejected": -212.4899444580078, + "loss": 0.1915, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.272273540496826, + "rewards/margins": 3.5495388507843018, + "rewards/rejected": -5.821812629699707, + "step": 1268 + }, + { + "epoch": 1.66, + "learning_rate": 2.238916402533706e-05, + "logits/chosen": -2.3114066123962402, + "logits/rejected": -2.2589988708496094, + "logps/chosen": -217.54476928710938, + "logps/rejected": -282.8260803222656, + "loss": 0.0682, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.5531392097473145, + "rewards/margins": 5.133091449737549, + "rewards/rejected": -7.686229705810547, + "step": 1269 + }, + { + "epoch": 1.66, + "learning_rate": 2.235353243473398e-05, + "logits/chosen": -2.3127739429473877, + "logits/rejected": -2.4178357124328613, + "logps/chosen": -204.81356811523438, + "logps/rejected": -234.25479125976562, + "loss": 0.0786, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7156574726104736, + "rewards/margins": 3.8326406478881836, + "rewards/rejected": -5.548297882080078, + "step": 1270 + }, + { + "epoch": 1.66, + "learning_rate": 2.231790628020222e-05, + "logits/chosen": -2.3831496238708496, + "logits/rejected": -2.439579725265503, + "logps/chosen": -174.45242309570312, + "logps/rejected": -231.3064422607422, + "loss": 0.1941, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.631500005722046, + "rewards/margins": 3.1058244705200195, + "rewards/rejected": -5.737324237823486, + "step": 1271 + }, + { + "epoch": 1.66, + "learning_rate": 2.228228563492098e-05, + "logits/chosen": -2.2605385780334473, + "logits/rejected": -2.354445695877075, + "logps/chosen": -166.48532104492188, + "logps/rejected": -237.2100830078125, + "loss": 0.1137, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8933758735656738, + "rewards/margins": 5.097873210906982, + "rewards/rejected": -6.991249084472656, + "step": 1272 + }, + { + "epoch": 1.67, + "learning_rate": 2.224667057205811e-05, + "logits/chosen": -2.3615005016326904, + "logits/rejected": -2.4618277549743652, + "logps/chosen": -171.42034912109375, + "logps/rejected": -229.09051513671875, + "loss": 0.1163, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7531858682632446, + "rewards/margins": 4.137459754943848, + "rewards/rejected": -5.890645503997803, + "step": 1273 + }, + { + "epoch": 1.67, + "learning_rate": 2.2211061164769997e-05, + "logits/chosen": -2.32946515083313, + "logits/rejected": -2.414400815963745, + "logps/chosen": -171.33204650878906, + "logps/rejected": -245.31137084960938, + "loss": 0.1297, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4534742832183838, + "rewards/margins": 4.338977336883545, + "rewards/rejected": -5.79245138168335, + "step": 1274 + }, + { + "epoch": 1.67, + "learning_rate": 2.2175457486201435e-05, + "logits/chosen": -2.363718032836914, + "logits/rejected": -2.3333187103271484, + "logps/chosen": -210.88121032714844, + "logps/rejected": -290.4528503417969, + "loss": 0.2139, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9432132244110107, + "rewards/margins": 4.5440449714660645, + "rewards/rejected": -6.4872589111328125, + "step": 1275 + }, + { + "epoch": 1.67, + "learning_rate": 2.2139859609485426e-05, + "logits/chosen": -2.479445219039917, + "logits/rejected": -2.463468313217163, + "logps/chosen": -185.5249786376953, + "logps/rejected": -236.50135803222656, + "loss": 0.1566, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8689696788787842, + "rewards/margins": 3.38698673248291, + "rewards/rejected": -5.255956172943115, + "step": 1276 + }, + { + "epoch": 1.67, + "learning_rate": 2.2104267607743057e-05, + "logits/chosen": -2.5215115547180176, + "logits/rejected": -2.5244312286376953, + "logps/chosen": -202.2774658203125, + "logps/rejected": -298.4288330078125, + "loss": 0.0628, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.681636095046997, + "rewards/margins": 4.273119926452637, + "rewards/rejected": -5.954756736755371, + "step": 1277 + }, + { + "epoch": 1.67, + "learning_rate": 2.2068681554083345e-05, + "logits/chosen": -2.3372130393981934, + "logits/rejected": -2.2861101627349854, + "logps/chosen": -186.33172607421875, + "logps/rejected": -212.44961547851562, + "loss": 0.0975, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8394958972930908, + "rewards/margins": 4.057079315185547, + "rewards/rejected": -5.896574974060059, + "step": 1278 + }, + { + "epoch": 1.67, + "learning_rate": 2.2033101521603113e-05, + "logits/chosen": -2.3356683254241943, + "logits/rejected": -2.3762290477752686, + "logps/chosen": -183.888671875, + "logps/rejected": -213.61656188964844, + "loss": 0.1375, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6777007579803467, + "rewards/margins": 3.63214111328125, + "rewards/rejected": -5.309842109680176, + "step": 1279 + }, + { + "epoch": 1.68, + "learning_rate": 2.199752758338679e-05, + "logits/chosen": -2.2114038467407227, + "logits/rejected": -2.213822603225708, + "logps/chosen": -217.16354370117188, + "logps/rejected": -244.71615600585938, + "loss": 0.0744, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5485525131225586, + "rewards/margins": 4.005685806274414, + "rewards/rejected": -5.554238796234131, + "step": 1280 + }, + { + "epoch": 1.68, + "learning_rate": 2.19619598125063e-05, + "logits/chosen": -2.31715726852417, + "logits/rejected": -2.3685436248779297, + "logps/chosen": -158.84544372558594, + "logps/rejected": -211.08718872070312, + "loss": 0.179, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8646739721298218, + "rewards/margins": 3.7703466415405273, + "rewards/rejected": -5.635020732879639, + "step": 1281 + }, + { + "epoch": 1.68, + "learning_rate": 2.192639828202089e-05, + "logits/chosen": -2.1907973289489746, + "logits/rejected": -2.3144352436065674, + "logps/chosen": -156.4062042236328, + "logps/rejected": -226.2644805908203, + "loss": 0.165, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.341085910797119, + "rewards/margins": 3.6836130619049072, + "rewards/rejected": -6.024698734283447, + "step": 1282 + }, + { + "epoch": 1.68, + "learning_rate": 2.1890843064976986e-05, + "logits/chosen": -2.3618502616882324, + "logits/rejected": -2.280946731567383, + "logps/chosen": -269.5596923828125, + "logps/rejected": -297.6914978027344, + "loss": 0.1965, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.018094062805176, + "rewards/margins": 3.8387670516967773, + "rewards/rejected": -5.856860637664795, + "step": 1283 + }, + { + "epoch": 1.68, + "learning_rate": 2.1855294234408068e-05, + "logits/chosen": -2.2608089447021484, + "logits/rejected": -2.3064064979553223, + "logps/chosen": -216.48863220214844, + "logps/rejected": -296.0035400390625, + "loss": 0.0479, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.011268377304077, + "rewards/margins": 5.205287456512451, + "rewards/rejected": -7.216555595397949, + "step": 1284 + }, + { + "epoch": 1.68, + "learning_rate": 2.181975186333448e-05, + "logits/chosen": -2.4084208011627197, + "logits/rejected": -2.374807834625244, + "logps/chosen": -215.48873901367188, + "logps/rejected": -245.37855529785156, + "loss": 0.1717, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6708320379257202, + "rewards/margins": 3.883884906768799, + "rewards/rejected": -5.554717540740967, + "step": 1285 + }, + { + "epoch": 1.68, + "learning_rate": 2.1784216024763284e-05, + "logits/chosen": -2.316883087158203, + "logits/rejected": -2.348716974258423, + "logps/chosen": -177.52059936523438, + "logps/rejected": -211.66531372070312, + "loss": 0.2091, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.326066255569458, + "rewards/margins": 2.9530606269836426, + "rewards/rejected": -5.2791266441345215, + "step": 1286 + }, + { + "epoch": 1.68, + "learning_rate": 2.1748686791688176e-05, + "logits/chosen": -2.0353989601135254, + "logits/rejected": -2.2340800762176514, + "logps/chosen": -170.76766967773438, + "logps/rejected": -247.34408569335938, + "loss": 0.1125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1162915229797363, + "rewards/margins": 3.6448287963867188, + "rewards/rejected": -5.761120796203613, + "step": 1287 + }, + { + "epoch": 1.69, + "learning_rate": 2.1713164237089203e-05, + "logits/chosen": -2.2831475734710693, + "logits/rejected": -2.4075961112976074, + "logps/chosen": -218.43325805664062, + "logps/rejected": -298.3542785644531, + "loss": 0.0354, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9271820783615112, + "rewards/margins": 5.319320201873779, + "rewards/rejected": -7.246501445770264, + "step": 1288 + }, + { + "epoch": 1.69, + "learning_rate": 2.167764843393277e-05, + "logits/chosen": -2.09723162651062, + "logits/rejected": -2.134756326675415, + "logps/chosen": -198.67884826660156, + "logps/rejected": -287.30194091796875, + "loss": 0.0243, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6649614572525024, + "rewards/margins": 5.773735523223877, + "rewards/rejected": -7.438697338104248, + "step": 1289 + }, + { + "epoch": 1.69, + "learning_rate": 2.1642139455171366e-05, + "logits/chosen": -2.425549030303955, + "logits/rejected": -2.3592019081115723, + "logps/chosen": -238.9408416748047, + "logps/rejected": -244.46258544921875, + "loss": 0.1034, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0925796031951904, + "rewards/margins": 3.8478012084960938, + "rewards/rejected": -5.940381050109863, + "step": 1290 + }, + { + "epoch": 1.69, + "learning_rate": 2.160663737374348e-05, + "logits/chosen": -2.410580635070801, + "logits/rejected": -2.388131618499756, + "logps/chosen": -182.27984619140625, + "logps/rejected": -218.9298095703125, + "loss": 0.0488, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7531981468200684, + "rewards/margins": 3.949941396713257, + "rewards/rejected": -5.703139305114746, + "step": 1291 + }, + { + "epoch": 1.69, + "learning_rate": 2.1571142262573457e-05, + "logits/chosen": -2.3673043251037598, + "logits/rejected": -2.3377084732055664, + "logps/chosen": -184.42393493652344, + "logps/rejected": -208.62554931640625, + "loss": 0.5051, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.3096277713775635, + "rewards/margins": 3.3661365509033203, + "rewards/rejected": -5.675764083862305, + "step": 1292 + }, + { + "epoch": 1.69, + "learning_rate": 2.153565419457126e-05, + "logits/chosen": -2.4195940494537354, + "logits/rejected": -2.5033788681030273, + "logps/chosen": -224.14312744140625, + "logps/rejected": -261.00238037109375, + "loss": 0.1343, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.700939178466797, + "rewards/margins": 4.364261627197266, + "rewards/rejected": -7.065200328826904, + "step": 1293 + }, + { + "epoch": 1.69, + "learning_rate": 2.1500173242632446e-05, + "logits/chosen": -2.4174857139587402, + "logits/rejected": -2.502803087234497, + "logps/chosen": -191.3031005859375, + "logps/rejected": -272.18359375, + "loss": 0.1702, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2729105949401855, + "rewards/margins": 3.6686196327209473, + "rewards/rejected": -5.941530227661133, + "step": 1294 + }, + { + "epoch": 1.7, + "learning_rate": 2.1464699479637934e-05, + "logits/chosen": -2.4822442531585693, + "logits/rejected": -2.4661717414855957, + "logps/chosen": -210.88076782226562, + "logps/rejected": -254.47091674804688, + "loss": 0.0412, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7708749771118164, + "rewards/margins": 4.22068977355957, + "rewards/rejected": -5.9915642738342285, + "step": 1295 + }, + { + "epoch": 1.7, + "learning_rate": 2.1429232978453862e-05, + "logits/chosen": -2.267918586730957, + "logits/rejected": -2.307711124420166, + "logps/chosen": -186.9233856201172, + "logps/rejected": -215.50152587890625, + "loss": 0.1894, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2469704151153564, + "rewards/margins": 2.900423765182495, + "rewards/rejected": -5.147394180297852, + "step": 1296 + }, + { + "epoch": 1.7, + "learning_rate": 2.1393773811931483e-05, + "logits/chosen": -2.3875741958618164, + "logits/rejected": -2.3467512130737305, + "logps/chosen": -236.7337188720703, + "logps/rejected": -249.71170043945312, + "loss": 0.1381, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6323497295379639, + "rewards/margins": 4.115944862365723, + "rewards/rejected": -5.748294830322266, + "step": 1297 + }, + { + "epoch": 1.7, + "learning_rate": 2.135832205290696e-05, + "logits/chosen": -2.140883445739746, + "logits/rejected": -2.140561819076538, + "logps/chosen": -154.9495086669922, + "logps/rejected": -183.38150024414062, + "loss": 0.2046, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.129896879196167, + "rewards/margins": 2.7430248260498047, + "rewards/rejected": -4.872921943664551, + "step": 1298 + }, + { + "epoch": 1.7, + "learning_rate": 2.132287777420124e-05, + "logits/chosen": -2.575924873352051, + "logits/rejected": -2.5873146057128906, + "logps/chosen": -207.08197021484375, + "logps/rejected": -276.8204650878906, + "loss": 0.1962, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9058212041854858, + "rewards/margins": 3.004783868789673, + "rewards/rejected": -4.910604953765869, + "step": 1299 + }, + { + "epoch": 1.7, + "learning_rate": 2.128744104861991e-05, + "logits/chosen": -2.3695623874664307, + "logits/rejected": -2.3930909633636475, + "logps/chosen": -198.6455841064453, + "logps/rejected": -247.45181274414062, + "loss": 0.1258, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.527387261390686, + "rewards/margins": 4.684881687164307, + "rewards/rejected": -6.212268829345703, + "step": 1300 + }, + { + "epoch": 1.7, + "learning_rate": 2.125201194895305e-05, + "logits/chosen": -2.5727107524871826, + "logits/rejected": -2.6951513290405273, + "logps/chosen": -189.20774841308594, + "logps/rejected": -238.7978515625, + "loss": 0.1884, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4452967643737793, + "rewards/margins": 3.642787456512451, + "rewards/rejected": -5.0880842208862305, + "step": 1301 + }, + { + "epoch": 1.7, + "learning_rate": 2.121659054797507e-05, + "logits/chosen": -2.428898811340332, + "logits/rejected": -2.4013214111328125, + "logps/chosen": -180.28622436523438, + "logps/rejected": -235.53460693359375, + "loss": 0.1445, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.5883967876434326, + "rewards/margins": 4.086200714111328, + "rewards/rejected": -6.67459774017334, + "step": 1302 + }, + { + "epoch": 1.71, + "learning_rate": 2.118117691844456e-05, + "logits/chosen": -2.2566795349121094, + "logits/rejected": -2.307943344116211, + "logps/chosen": -156.98703002929688, + "logps/rejected": -214.59457397460938, + "loss": 0.0783, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.781563401222229, + "rewards/margins": 4.179291248321533, + "rewards/rejected": -5.960855007171631, + "step": 1303 + }, + { + "epoch": 1.71, + "learning_rate": 2.1145771133104157e-05, + "logits/chosen": -2.356423854827881, + "logits/rejected": -2.328737735748291, + "logps/chosen": -198.40892028808594, + "logps/rejected": -247.4863739013672, + "loss": 0.1923, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.413034439086914, + "rewards/margins": 3.7330081462860107, + "rewards/rejected": -6.146042346954346, + "step": 1304 + }, + { + "epoch": 1.71, + "learning_rate": 2.111037326468037e-05, + "logits/chosen": -2.3714675903320312, + "logits/rejected": -2.4389638900756836, + "logps/chosen": -145.8385467529297, + "logps/rejected": -205.7078094482422, + "loss": 0.1328, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7978813648223877, + "rewards/margins": 3.8263258934020996, + "rewards/rejected": -5.624207019805908, + "step": 1305 + }, + { + "epoch": 1.71, + "learning_rate": 2.107498338588347e-05, + "logits/chosen": -2.4245078563690186, + "logits/rejected": -2.4312448501586914, + "logps/chosen": -203.7790985107422, + "logps/rejected": -272.0660400390625, + "loss": 0.2024, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9577341079711914, + "rewards/margins": 3.9791784286499023, + "rewards/rejected": -5.936913013458252, + "step": 1306 + }, + { + "epoch": 1.71, + "learning_rate": 2.1039601569407298e-05, + "logits/chosen": -2.5488319396972656, + "logits/rejected": -2.5804145336151123, + "logps/chosen": -188.79185485839844, + "logps/rejected": -268.08892822265625, + "loss": 0.2418, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8628536462783813, + "rewards/margins": 4.096201419830322, + "rewards/rejected": -5.959055423736572, + "step": 1307 + }, + { + "epoch": 1.71, + "learning_rate": 2.1004227887929133e-05, + "logits/chosen": -2.3830885887145996, + "logits/rejected": -2.466336488723755, + "logps/chosen": -188.71043395996094, + "logps/rejected": -249.99742126464844, + "loss": 0.1585, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.626366376876831, + "rewards/margins": 3.4328842163085938, + "rewards/rejected": -5.059250354766846, + "step": 1308 + }, + { + "epoch": 1.71, + "learning_rate": 2.0968862414109567e-05, + "logits/chosen": -2.2690930366516113, + "logits/rejected": -2.346137762069702, + "logps/chosen": -188.47799682617188, + "logps/rejected": -262.9131164550781, + "loss": 0.082, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9617085456848145, + "rewards/margins": 4.4581780433654785, + "rewards/rejected": -6.419886112213135, + "step": 1309 + }, + { + "epoch": 1.71, + "learning_rate": 2.0933505220592295e-05, + "logits/chosen": -2.3986053466796875, + "logits/rejected": -2.4108328819274902, + "logps/chosen": -205.296630859375, + "logps/rejected": -268.40216064453125, + "loss": 0.3053, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9937715530395508, + "rewards/margins": 3.6593408584594727, + "rewards/rejected": -5.653112888336182, + "step": 1310 + }, + { + "epoch": 1.72, + "learning_rate": 2.0898156380004034e-05, + "logits/chosen": -2.4971718788146973, + "logits/rejected": -2.4974489212036133, + "logps/chosen": -217.9031219482422, + "logps/rejected": -226.8319854736328, + "loss": 0.1389, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.684149146080017, + "rewards/margins": 3.9660165309906006, + "rewards/rejected": -5.650165557861328, + "step": 1311 + }, + { + "epoch": 1.72, + "learning_rate": 2.086281596495434e-05, + "logits/chosen": -2.3978464603424072, + "logits/rejected": -2.463926315307617, + "logps/chosen": -176.58837890625, + "logps/rejected": -223.87771606445312, + "loss": 0.2303, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7821848392486572, + "rewards/margins": 3.7116270065307617, + "rewards/rejected": -5.49381160736084, + "step": 1312 + }, + { + "epoch": 1.72, + "learning_rate": 2.0827484048035445e-05, + "logits/chosen": -2.300832748413086, + "logits/rejected": -2.372866153717041, + "logps/chosen": -229.6999053955078, + "logps/rejected": -272.33740234375, + "loss": 0.1505, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2939400672912598, + "rewards/margins": 3.326303482055664, + "rewards/rejected": -5.620244026184082, + "step": 1313 + }, + { + "epoch": 1.72, + "learning_rate": 2.0792160701822157e-05, + "logits/chosen": -2.4630236625671387, + "logits/rejected": -2.461514472961426, + "logps/chosen": -167.19215393066406, + "logps/rejected": -211.91494750976562, + "loss": 0.0898, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2457730770111084, + "rewards/margins": 3.322935104370117, + "rewards/rejected": -5.568708419799805, + "step": 1314 + }, + { + "epoch": 1.72, + "learning_rate": 2.0756845998871623e-05, + "logits/chosen": -2.2874443531036377, + "logits/rejected": -2.336970567703247, + "logps/chosen": -205.6305694580078, + "logps/rejected": -239.4787139892578, + "loss": 0.2971, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0412678718566895, + "rewards/margins": 3.251214027404785, + "rewards/rejected": -5.292482376098633, + "step": 1315 + }, + { + "epoch": 1.72, + "learning_rate": 2.07215400117233e-05, + "logits/chosen": -2.2303338050842285, + "logits/rejected": -2.320737838745117, + "logps/chosen": -160.9131622314453, + "logps/rejected": -189.41070556640625, + "loss": 0.1847, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7242070436477661, + "rewards/margins": 3.535355567932129, + "rewards/rejected": -5.259562969207764, + "step": 1316 + }, + { + "epoch": 1.72, + "learning_rate": 2.068624281289871e-05, + "logits/chosen": -2.1370110511779785, + "logits/rejected": -2.186886787414551, + "logps/chosen": -163.11351013183594, + "logps/rejected": -208.40513610839844, + "loss": 0.1569, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3170775175094604, + "rewards/margins": 3.180131673812866, + "rewards/rejected": -4.497209072113037, + "step": 1317 + }, + { + "epoch": 1.73, + "learning_rate": 2.065095447490131e-05, + "logits/chosen": -2.1060550212860107, + "logits/rejected": -2.094871759414673, + "logps/chosen": -180.56210327148438, + "logps/rejected": -241.83932495117188, + "loss": 0.0421, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6946886777877808, + "rewards/margins": 5.0019636154174805, + "rewards/rejected": -6.696652412414551, + "step": 1318 + }, + { + "epoch": 1.73, + "learning_rate": 2.0615675070216393e-05, + "logits/chosen": -2.4788966178894043, + "logits/rejected": -2.4797959327697754, + "logps/chosen": -212.43789672851562, + "logps/rejected": -252.77536010742188, + "loss": 0.2685, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.7684473991394043, + "rewards/margins": 3.46006178855896, + "rewards/rejected": -6.228508949279785, + "step": 1319 + }, + { + "epoch": 1.73, + "learning_rate": 2.0580404671310878e-05, + "logits/chosen": -2.417243242263794, + "logits/rejected": -2.4775075912475586, + "logps/chosen": -201.75233459472656, + "logps/rejected": -251.92681884765625, + "loss": 0.1312, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.920212984085083, + "rewards/margins": 4.148662090301514, + "rewards/rejected": -6.068874835968018, + "step": 1320 + }, + { + "epoch": 1.73, + "learning_rate": 2.0545143350633177e-05, + "logits/chosen": -2.369495153427124, + "logits/rejected": -2.4142677783966064, + "logps/chosen": -217.51817321777344, + "logps/rejected": -233.458740234375, + "loss": 0.0995, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7607673406600952, + "rewards/margins": 4.3584113121032715, + "rewards/rejected": -6.119178771972656, + "step": 1321 + }, + { + "epoch": 1.73, + "learning_rate": 2.0509891180613066e-05, + "logits/chosen": -2.3835232257843018, + "logits/rejected": -2.309004068374634, + "logps/chosen": -175.8276824951172, + "logps/rejected": -189.5276336669922, + "loss": 0.2106, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9605634212493896, + "rewards/margins": 2.4031474590301514, + "rewards/rejected": -4.363710403442383, + "step": 1322 + }, + { + "epoch": 1.73, + "learning_rate": 2.0474648233661543e-05, + "logits/chosen": -2.230945110321045, + "logits/rejected": -2.3488399982452393, + "logps/chosen": -167.26763916015625, + "logps/rejected": -238.7157440185547, + "loss": 0.1078, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8513191938400269, + "rewards/margins": 4.222087383270264, + "rewards/rejected": -6.07340669631958, + "step": 1323 + }, + { + "epoch": 1.73, + "learning_rate": 2.0439414582170628e-05, + "logits/chosen": -2.330699920654297, + "logits/rejected": -2.3560612201690674, + "logps/chosen": -206.9635009765625, + "logps/rejected": -275.25048828125, + "loss": 0.0833, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9581581354141235, + "rewards/margins": 3.9644105434417725, + "rewards/rejected": -5.9225687980651855, + "step": 1324 + }, + { + "epoch": 1.73, + "learning_rate": 2.040419029851328e-05, + "logits/chosen": -2.229811429977417, + "logits/rejected": -2.2462096214294434, + "logps/chosen": -161.61016845703125, + "logps/rejected": -199.626220703125, + "loss": 0.1821, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6891059875488281, + "rewards/margins": 3.7411978244781494, + "rewards/rejected": -5.430303573608398, + "step": 1325 + }, + { + "epoch": 1.74, + "learning_rate": 2.0368975455043178e-05, + "logits/chosen": -2.2760794162750244, + "logits/rejected": -2.326425313949585, + "logps/chosen": -167.83413696289062, + "logps/rejected": -211.81210327148438, + "loss": 0.1781, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8069965839385986, + "rewards/margins": 3.2401506900787354, + "rewards/rejected": -5.047147274017334, + "step": 1326 + }, + { + "epoch": 1.74, + "learning_rate": 2.033377012409463e-05, + "logits/chosen": -2.368178129196167, + "logits/rejected": -2.3377692699432373, + "logps/chosen": -183.60934448242188, + "logps/rejected": -213.88446044921875, + "loss": 0.2388, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7498223781585693, + "rewards/margins": 3.5527286529541016, + "rewards/rejected": -5.302550792694092, + "step": 1327 + }, + { + "epoch": 1.74, + "learning_rate": 2.0298574377982427e-05, + "logits/chosen": -2.6002888679504395, + "logits/rejected": -2.5674407482147217, + "logps/chosen": -212.8086395263672, + "logps/rejected": -261.0436096191406, + "loss": 0.1642, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0052053928375244, + "rewards/margins": 4.194888114929199, + "rewards/rejected": -6.200093746185303, + "step": 1328 + }, + { + "epoch": 1.74, + "learning_rate": 2.026338828900163e-05, + "logits/chosen": -2.2341036796569824, + "logits/rejected": -2.410188674926758, + "logps/chosen": -185.3136444091797, + "logps/rejected": -293.7235107421875, + "loss": 0.1148, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.51639986038208, + "rewards/margins": 5.255737781524658, + "rewards/rejected": -6.77213716506958, + "step": 1329 + }, + { + "epoch": 1.74, + "learning_rate": 2.022821192942749e-05, + "logits/chosen": -2.263854742050171, + "logits/rejected": -2.4330499172210693, + "logps/chosen": -233.56724548339844, + "logps/rejected": -307.1253662109375, + "loss": 0.0471, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7693581581115723, + "rewards/margins": 4.752530574798584, + "rewards/rejected": -6.521888732910156, + "step": 1330 + }, + { + "epoch": 1.74, + "learning_rate": 2.0193045371515276e-05, + "logits/chosen": -2.2136175632476807, + "logits/rejected": -2.199801445007324, + "logps/chosen": -191.3333282470703, + "logps/rejected": -231.4643096923828, + "loss": 0.0668, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.106376886367798, + "rewards/margins": 3.641350030899048, + "rewards/rejected": -5.7477264404296875, + "step": 1331 + }, + { + "epoch": 1.74, + "learning_rate": 2.015788868750009e-05, + "logits/chosen": -2.2921154499053955, + "logits/rejected": -2.4406850337982178, + "logps/chosen": -162.87600708007812, + "logps/rejected": -268.0835876464844, + "loss": 0.1511, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1696290969848633, + "rewards/margins": 4.204335689544678, + "rewards/rejected": -6.373965263366699, + "step": 1332 + }, + { + "epoch": 1.74, + "learning_rate": 2.0122741949596797e-05, + "logits/chosen": -2.3796262741088867, + "logits/rejected": -2.414687395095825, + "logps/chosen": -155.844482421875, + "logps/rejected": -240.92796325683594, + "loss": 0.1541, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3388099670410156, + "rewards/margins": 4.454137325286865, + "rewards/rejected": -6.792947769165039, + "step": 1333 + }, + { + "epoch": 1.75, + "learning_rate": 2.00876052299998e-05, + "logits/chosen": -2.27986216545105, + "logits/rejected": -2.32919979095459, + "logps/chosen": -183.34228515625, + "logps/rejected": -214.05015563964844, + "loss": 0.0465, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4341685771942139, + "rewards/margins": 4.135167598724365, + "rewards/rejected": -5.5693359375, + "step": 1334 + }, + { + "epoch": 1.75, + "learning_rate": 2.0052478600882935e-05, + "logits/chosen": -2.2123706340789795, + "logits/rejected": -2.2277722358703613, + "logps/chosen": -196.18038940429688, + "logps/rejected": -254.30184936523438, + "loss": 0.0944, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4030604362487793, + "rewards/margins": 3.7253737449645996, + "rewards/rejected": -6.128434181213379, + "step": 1335 + }, + { + "epoch": 1.75, + "learning_rate": 2.001736213439933e-05, + "logits/chosen": -2.464176654815674, + "logits/rejected": -2.388843297958374, + "logps/chosen": -221.20799255371094, + "logps/rejected": -284.9414978027344, + "loss": 0.078, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6775474548339844, + "rewards/margins": 5.273458957672119, + "rewards/rejected": -7.95100736618042, + "step": 1336 + }, + { + "epoch": 1.75, + "learning_rate": 1.9982255902681186e-05, + "logits/chosen": -2.2432444095611572, + "logits/rejected": -2.32804012298584, + "logps/chosen": -181.63099670410156, + "logps/rejected": -231.79083251953125, + "loss": 0.1585, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2732791900634766, + "rewards/margins": 3.634847402572632, + "rewards/rejected": -5.908126354217529, + "step": 1337 + }, + { + "epoch": 1.75, + "learning_rate": 1.9947159977839736e-05, + "logits/chosen": -2.3729875087738037, + "logits/rejected": -2.4305837154388428, + "logps/chosen": -184.2694091796875, + "logps/rejected": -232.55186462402344, + "loss": 0.145, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.705888271331787, + "rewards/margins": 3.7710320949554443, + "rewards/rejected": -5.476920127868652, + "step": 1338 + }, + { + "epoch": 1.75, + "learning_rate": 1.991207443196501e-05, + "logits/chosen": -2.3278250694274902, + "logits/rejected": -2.2711265087127686, + "logps/chosen": -224.13851928710938, + "logps/rejected": -224.71286010742188, + "loss": 0.1683, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9896631240844727, + "rewards/margins": 2.7939271926879883, + "rewards/rejected": -4.783590316772461, + "step": 1339 + }, + { + "epoch": 1.75, + "learning_rate": 1.987699933712573e-05, + "logits/chosen": -2.314023733139038, + "logits/rejected": -2.3875467777252197, + "logps/chosen": -169.424072265625, + "logps/rejected": -209.0259552001953, + "loss": 0.1744, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.286795139312744, + "rewards/margins": 3.2858803272247314, + "rewards/rejected": -5.572675704956055, + "step": 1340 + }, + { + "epoch": 1.76, + "learning_rate": 1.9841934765369153e-05, + "logits/chosen": -2.3745079040527344, + "logits/rejected": -2.3375463485717773, + "logps/chosen": -168.55918884277344, + "logps/rejected": -222.62391662597656, + "loss": 0.1179, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.284468650817871, + "rewards/margins": 3.7939271926879883, + "rewards/rejected": -6.078395843505859, + "step": 1341 + }, + { + "epoch": 1.76, + "learning_rate": 1.9806880788720916e-05, + "logits/chosen": -2.2378830909729004, + "logits/rejected": -2.2404918670654297, + "logps/chosen": -175.9845428466797, + "logps/rejected": -219.64984130859375, + "loss": 0.0935, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9894123077392578, + "rewards/margins": 3.7605044841766357, + "rewards/rejected": -5.749917030334473, + "step": 1342 + }, + { + "epoch": 1.76, + "learning_rate": 1.977183747918489e-05, + "logits/chosen": -2.177305221557617, + "logits/rejected": -2.3256311416625977, + "logps/chosen": -162.03976440429688, + "logps/rejected": -290.5001220703125, + "loss": 0.1344, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.050999402999878, + "rewards/margins": 3.829390525817871, + "rewards/rejected": -5.880390167236328, + "step": 1343 + }, + { + "epoch": 1.76, + "learning_rate": 1.9736804908743033e-05, + "logits/chosen": -2.24698805809021, + "logits/rejected": -2.2751224040985107, + "logps/chosen": -196.80706787109375, + "logps/rejected": -252.5970458984375, + "loss": 0.0675, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0378329753875732, + "rewards/margins": 4.594714641571045, + "rewards/rejected": -6.632546901702881, + "step": 1344 + }, + { + "epoch": 1.76, + "learning_rate": 1.9701783149355255e-05, + "logits/chosen": -2.2692227363586426, + "logits/rejected": -2.307358980178833, + "logps/chosen": -199.25857543945312, + "logps/rejected": -229.75511169433594, + "loss": 0.1442, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.6517555713653564, + "rewards/margins": 3.2444560527801514, + "rewards/rejected": -5.896211624145508, + "step": 1345 + }, + { + "epoch": 1.76, + "learning_rate": 1.9666772272959253e-05, + "logits/chosen": -2.404740333557129, + "logits/rejected": -2.446255683898926, + "logps/chosen": -167.06253051757812, + "logps/rejected": -212.9225311279297, + "loss": 0.2258, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.806255578994751, + "rewards/margins": 3.6429660320281982, + "rewards/rejected": -5.449221611022949, + "step": 1346 + }, + { + "epoch": 1.76, + "learning_rate": 1.9631772351470383e-05, + "logits/chosen": -2.3353922367095947, + "logits/rejected": -2.2711098194122314, + "logps/chosen": -220.8671112060547, + "logps/rejected": -256.802978515625, + "loss": 0.1451, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4086428880691528, + "rewards/margins": 4.241592884063721, + "rewards/rejected": -5.650235652923584, + "step": 1347 + }, + { + "epoch": 1.76, + "learning_rate": 1.959678345678146e-05, + "logits/chosen": -2.2605209350585938, + "logits/rejected": -2.2494559288024902, + "logps/chosen": -195.63803100585938, + "logps/rejected": -238.62527465820312, + "loss": 0.1201, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8433411121368408, + "rewards/margins": 4.453139781951904, + "rewards/rejected": -6.296481132507324, + "step": 1348 + }, + { + "epoch": 1.77, + "learning_rate": 1.9561805660762684e-05, + "logits/chosen": -2.3258447647094727, + "logits/rejected": -2.547119140625, + "logps/chosen": -172.7917938232422, + "logps/rejected": -234.5343780517578, + "loss": 0.1559, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3748419284820557, + "rewards/margins": 3.923091173171997, + "rewards/rejected": -6.297933101654053, + "step": 1349 + }, + { + "epoch": 1.77, + "learning_rate": 1.952683903526145e-05, + "logits/chosen": -2.420260190963745, + "logits/rejected": -2.4715757369995117, + "logps/chosen": -215.6678924560547, + "logps/rejected": -262.7197265625, + "loss": 0.1096, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.408724546432495, + "rewards/margins": 3.9322004318237305, + "rewards/rejected": -6.340925216674805, + "step": 1350 + }, + { + "epoch": 1.77, + "learning_rate": 1.9491883652102208e-05, + "logits/chosen": -2.423340082168579, + "logits/rejected": -2.3447985649108887, + "logps/chosen": -239.3814239501953, + "logps/rejected": -279.0250244140625, + "loss": 0.2108, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1968352794647217, + "rewards/margins": 4.180688858032227, + "rewards/rejected": -6.377523899078369, + "step": 1351 + }, + { + "epoch": 1.77, + "learning_rate": 1.9456939583086303e-05, + "logits/chosen": -2.379995822906494, + "logits/rejected": -2.357318639755249, + "logps/chosen": -200.75033569335938, + "logps/rejected": -252.04898071289062, + "loss": 0.036, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.724043607711792, + "rewards/margins": 4.425355911254883, + "rewards/rejected": -6.149399757385254, + "step": 1352 + }, + { + "epoch": 1.77, + "learning_rate": 1.9422006899991878e-05, + "logits/chosen": -2.4514048099517822, + "logits/rejected": -2.503966808319092, + "logps/chosen": -244.29388427734375, + "logps/rejected": -291.3765563964844, + "loss": 0.0462, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5804522037506104, + "rewards/margins": 4.783530235290527, + "rewards/rejected": -6.363982200622559, + "step": 1353 + }, + { + "epoch": 1.77, + "learning_rate": 1.9387085674573616e-05, + "logits/chosen": -2.3664278984069824, + "logits/rejected": -2.3518259525299072, + "logps/chosen": -210.26502990722656, + "logps/rejected": -272.58746337890625, + "loss": 0.0964, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0434794425964355, + "rewards/margins": 4.206425666809082, + "rewards/rejected": -6.249905109405518, + "step": 1354 + }, + { + "epoch": 1.77, + "learning_rate": 1.9352175978562736e-05, + "logits/chosen": -2.233623743057251, + "logits/rejected": -2.276262044906616, + "logps/chosen": -160.51487731933594, + "logps/rejected": -209.20216369628906, + "loss": 0.2404, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9351637363433838, + "rewards/margins": 3.199429512023926, + "rewards/rejected": -5.134593486785889, + "step": 1355 + }, + { + "epoch": 1.77, + "learning_rate": 1.9317277883666745e-05, + "logits/chosen": -2.4637691974639893, + "logits/rejected": -2.395784854888916, + "logps/chosen": -197.92547607421875, + "logps/rejected": -217.10275268554688, + "loss": 0.053, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5906208753585815, + "rewards/margins": 4.310453414916992, + "rewards/rejected": -5.901073932647705, + "step": 1356 + }, + { + "epoch": 1.78, + "learning_rate": 1.9282391461569316e-05, + "logits/chosen": -2.3923232555389404, + "logits/rejected": -2.3674304485321045, + "logps/chosen": -214.41360473632812, + "logps/rejected": -234.9254150390625, + "loss": 0.1749, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5903575420379639, + "rewards/margins": 3.334049940109253, + "rewards/rejected": -4.924407482147217, + "step": 1357 + }, + { + "epoch": 1.78, + "learning_rate": 1.924751678393017e-05, + "logits/chosen": -2.330720901489258, + "logits/rejected": -2.4127392768859863, + "logps/chosen": -216.95899963378906, + "logps/rejected": -280.90777587890625, + "loss": 0.0844, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1891772747039795, + "rewards/margins": 4.025705814361572, + "rewards/rejected": -6.214882850646973, + "step": 1358 + }, + { + "epoch": 1.78, + "learning_rate": 1.9212653922384854e-05, + "logits/chosen": -2.448625087738037, + "logits/rejected": -2.461225986480713, + "logps/chosen": -215.53018188476562, + "logps/rejected": -244.64892578125, + "loss": 0.0972, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.246009588241577, + "rewards/margins": 3.793391704559326, + "rewards/rejected": -6.039401054382324, + "step": 1359 + }, + { + "epoch": 1.78, + "learning_rate": 1.91778029485447e-05, + "logits/chosen": -2.282559633255005, + "logits/rejected": -2.3488802909851074, + "logps/chosen": -184.2056121826172, + "logps/rejected": -236.4654083251953, + "loss": 0.2638, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.6305556297302246, + "rewards/margins": 3.3888020515441895, + "rewards/rejected": -6.019358158111572, + "step": 1360 + }, + { + "epoch": 1.78, + "learning_rate": 1.914296393399659e-05, + "logits/chosen": -2.280898332595825, + "logits/rejected": -2.5184438228607178, + "logps/chosen": -203.7155303955078, + "logps/rejected": -288.10894775390625, + "loss": 0.1226, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1515896320343018, + "rewards/margins": 4.192084312438965, + "rewards/rejected": -6.3436737060546875, + "step": 1361 + }, + { + "epoch": 1.78, + "learning_rate": 1.910813695030284e-05, + "logits/chosen": -2.4364683628082275, + "logits/rejected": -2.39170503616333, + "logps/chosen": -222.26742553710938, + "logps/rejected": -226.809814453125, + "loss": 0.3164, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1154820919036865, + "rewards/margins": 2.7663145065307617, + "rewards/rejected": -4.881796836853027, + "step": 1362 + }, + { + "epoch": 1.78, + "learning_rate": 1.9073322069001075e-05, + "logits/chosen": -2.3549532890319824, + "logits/rejected": -2.3573246002197266, + "logps/chosen": -223.8654327392578, + "logps/rejected": -273.63922119140625, + "loss": 0.084, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8906347751617432, + "rewards/margins": 4.691598415374756, + "rewards/rejected": -6.58223295211792, + "step": 1363 + }, + { + "epoch": 1.79, + "learning_rate": 1.9038519361604046e-05, + "logits/chosen": -2.325507879257202, + "logits/rejected": -2.3956029415130615, + "logps/chosen": -177.0860137939453, + "logps/rejected": -251.73207092285156, + "loss": 0.0327, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4571151733398438, + "rewards/margins": 4.530646324157715, + "rewards/rejected": -6.987761974334717, + "step": 1364 + }, + { + "epoch": 1.79, + "learning_rate": 1.900372889959949e-05, + "logits/chosen": -2.552361011505127, + "logits/rejected": -2.5750346183776855, + "logps/chosen": -192.18612670898438, + "logps/rejected": -251.12831115722656, + "loss": 0.0594, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.220806121826172, + "rewards/margins": 3.9347567558288574, + "rewards/rejected": -6.155562400817871, + "step": 1365 + }, + { + "epoch": 1.79, + "learning_rate": 1.896895075445e-05, + "logits/chosen": -2.2767112255096436, + "logits/rejected": -2.2670252323150635, + "logps/chosen": -173.70436096191406, + "logps/rejected": -202.10231018066406, + "loss": 0.2313, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.5938425064086914, + "rewards/margins": 2.886263608932495, + "rewards/rejected": -5.480106353759766, + "step": 1366 + }, + { + "epoch": 1.79, + "learning_rate": 1.8934184997592866e-05, + "logits/chosen": -2.390500068664551, + "logits/rejected": -2.375262975692749, + "logps/chosen": -201.7509765625, + "logps/rejected": -245.776123046875, + "loss": 0.0582, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5441539287567139, + "rewards/margins": 4.597626686096191, + "rewards/rejected": -6.141779899597168, + "step": 1367 + }, + { + "epoch": 1.79, + "learning_rate": 1.8899431700439946e-05, + "logits/chosen": -2.4278464317321777, + "logits/rejected": -2.4439098834991455, + "logps/chosen": -180.22174072265625, + "logps/rejected": -221.05007934570312, + "loss": 0.0612, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7875794172286987, + "rewards/margins": 3.996073007583618, + "rewards/rejected": -5.783651828765869, + "step": 1368 + }, + { + "epoch": 1.79, + "learning_rate": 1.8864690934377492e-05, + "logits/chosen": -2.389608144760132, + "logits/rejected": -2.405284881591797, + "logps/chosen": -185.8512725830078, + "logps/rejected": -235.00833129882812, + "loss": 0.2345, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4995129108428955, + "rewards/margins": 3.2029199600219727, + "rewards/rejected": -5.702433109283447, + "step": 1369 + }, + { + "epoch": 1.79, + "learning_rate": 1.8829962770766003e-05, + "logits/chosen": -2.249424457550049, + "logits/rejected": -2.2341036796569824, + "logps/chosen": -204.85244750976562, + "logps/rejected": -226.37265014648438, + "loss": 0.1982, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.5774803161621094, + "rewards/margins": 3.0645623207092285, + "rewards/rejected": -5.642043113708496, + "step": 1370 + }, + { + "epoch": 1.79, + "learning_rate": 1.8795247280940108e-05, + "logits/chosen": -2.343681812286377, + "logits/rejected": -2.269914150238037, + "logps/chosen": -182.87588500976562, + "logps/rejected": -232.2654571533203, + "loss": 0.1017, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4750690460205078, + "rewards/margins": 5.233051776885986, + "rewards/rejected": -6.708120822906494, + "step": 1371 + }, + { + "epoch": 1.8, + "learning_rate": 1.876054453620841e-05, + "logits/chosen": -2.361067771911621, + "logits/rejected": -2.2843315601348877, + "logps/chosen": -192.16220092773438, + "logps/rejected": -256.955810546875, + "loss": 0.0238, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6471022367477417, + "rewards/margins": 6.301355361938477, + "rewards/rejected": -7.948457717895508, + "step": 1372 + }, + { + "epoch": 1.8, + "learning_rate": 1.872585460785332e-05, + "logits/chosen": -2.236304998397827, + "logits/rejected": -2.227062225341797, + "logps/chosen": -199.45516967773438, + "logps/rejected": -238.85398864746094, + "loss": 0.106, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2082507610321045, + "rewards/margins": 3.88494873046875, + "rewards/rejected": -6.093199729919434, + "step": 1373 + }, + { + "epoch": 1.8, + "learning_rate": 1.869117756713092e-05, + "logits/chosen": -2.2852840423583984, + "logits/rejected": -2.26584529876709, + "logps/chosen": -201.33767700195312, + "logps/rejected": -230.25816345214844, + "loss": 0.1652, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.6372015476226807, + "rewards/margins": 4.133516311645508, + "rewards/rejected": -6.770718574523926, + "step": 1374 + }, + { + "epoch": 1.8, + "learning_rate": 1.8656513485270843e-05, + "logits/chosen": -2.1882293224334717, + "logits/rejected": -2.170938730239868, + "logps/chosen": -192.08497619628906, + "logps/rejected": -231.12686157226562, + "loss": 0.2103, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2908542156219482, + "rewards/margins": 3.444204807281494, + "rewards/rejected": -5.735058784484863, + "step": 1375 + }, + { + "epoch": 1.8, + "learning_rate": 1.8621862433476054e-05, + "logits/chosen": -2.3386387825012207, + "logits/rejected": -2.328312397003174, + "logps/chosen": -211.4608612060547, + "logps/rejected": -246.96217346191406, + "loss": 0.2087, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2940046787261963, + "rewards/margins": 4.066905975341797, + "rewards/rejected": -6.360910415649414, + "step": 1376 + }, + { + "epoch": 1.8, + "learning_rate": 1.858722448292281e-05, + "logits/chosen": -2.1899824142456055, + "logits/rejected": -2.227428436279297, + "logps/chosen": -231.0190887451172, + "logps/rejected": -314.1480712890625, + "loss": 0.0888, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0735411643981934, + "rewards/margins": 5.1453657150268555, + "rewards/rejected": -7.218906879425049, + "step": 1377 + }, + { + "epoch": 1.8, + "learning_rate": 1.8552599704760424e-05, + "logits/chosen": -2.419363260269165, + "logits/rejected": -2.487983226776123, + "logps/chosen": -155.49874877929688, + "logps/rejected": -190.6211700439453, + "loss": 0.2397, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.1746416091918945, + "rewards/margins": 3.2479474544525146, + "rewards/rejected": -5.4225897789001465, + "step": 1378 + }, + { + "epoch": 1.8, + "learning_rate": 1.851798817011116e-05, + "logits/chosen": -2.375593900680542, + "logits/rejected": -2.304309606552124, + "logps/chosen": -222.7501678466797, + "logps/rejected": -235.4834747314453, + "loss": 0.2789, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2550225257873535, + "rewards/margins": 2.8650777339935303, + "rewards/rejected": -5.1200995445251465, + "step": 1379 + }, + { + "epoch": 1.81, + "learning_rate": 1.8483389950070097e-05, + "logits/chosen": -2.2030558586120605, + "logits/rejected": -2.265129804611206, + "logps/chosen": -207.5592041015625, + "logps/rejected": -261.5985107421875, + "loss": 0.1712, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.297219753265381, + "rewards/margins": 4.409007549285889, + "rewards/rejected": -6.706227779388428, + "step": 1380 + }, + { + "epoch": 1.81, + "learning_rate": 1.8448805115704903e-05, + "logits/chosen": -2.3315019607543945, + "logits/rejected": -2.340857744216919, + "logps/chosen": -196.46554565429688, + "logps/rejected": -224.2041015625, + "loss": 0.1435, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3940651416778564, + "rewards/margins": 3.7280590534210205, + "rewards/rejected": -6.122124195098877, + "step": 1381 + }, + { + "epoch": 1.81, + "learning_rate": 1.841423373805583e-05, + "logits/chosen": -2.4746859073638916, + "logits/rejected": -2.4827425479888916, + "logps/chosen": -250.61415100097656, + "logps/rejected": -275.38409423828125, + "loss": 0.0473, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.048532485961914, + "rewards/margins": 4.594575881958008, + "rewards/rejected": -6.643108367919922, + "step": 1382 + }, + { + "epoch": 1.81, + "learning_rate": 1.837967588813544e-05, + "logits/chosen": -2.165999174118042, + "logits/rejected": -2.2862932682037354, + "logps/chosen": -184.2350311279297, + "logps/rejected": -237.280517578125, + "loss": 0.1143, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2835655212402344, + "rewards/margins": 4.6342034339904785, + "rewards/rejected": -6.917769432067871, + "step": 1383 + }, + { + "epoch": 1.81, + "learning_rate": 1.8345131636928518e-05, + "logits/chosen": -2.3341617584228516, + "logits/rejected": -2.4517173767089844, + "logps/chosen": -182.92454528808594, + "logps/rejected": -259.3277587890625, + "loss": 0.0717, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.336014747619629, + "rewards/margins": 5.517913341522217, + "rewards/rejected": -7.8539276123046875, + "step": 1384 + }, + { + "epoch": 1.81, + "learning_rate": 1.8310601055391923e-05, + "logits/chosen": -2.424025535583496, + "logits/rejected": -2.393772602081299, + "logps/chosen": -243.9350128173828, + "logps/rejected": -289.0570068359375, + "loss": 0.0888, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.453885316848755, + "rewards/margins": 3.709909677505493, + "rewards/rejected": -6.16379451751709, + "step": 1385 + }, + { + "epoch": 1.81, + "learning_rate": 1.8276084214454443e-05, + "logits/chosen": -2.3735830783843994, + "logits/rejected": -2.327988862991333, + "logps/chosen": -176.32257080078125, + "logps/rejected": -187.61695861816406, + "loss": 0.291, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.0003018379211426, + "rewards/margins": 2.2487120628356934, + "rewards/rejected": -5.249013900756836, + "step": 1386 + }, + { + "epoch": 1.82, + "learning_rate": 1.8241581185016603e-05, + "logits/chosen": -2.2303643226623535, + "logits/rejected": -2.3069612979888916, + "logps/chosen": -212.10330200195312, + "logps/rejected": -248.6053009033203, + "loss": 0.0943, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.6326069831848145, + "rewards/margins": 4.7315497398376465, + "rewards/rejected": -7.364156723022461, + "step": 1387 + }, + { + "epoch": 1.82, + "learning_rate": 1.8207092037950602e-05, + "logits/chosen": -2.4394166469573975, + "logits/rejected": -2.4803974628448486, + "logps/chosen": -192.7069854736328, + "logps/rejected": -256.257568359375, + "loss": 0.0331, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1693193912506104, + "rewards/margins": 5.404998779296875, + "rewards/rejected": -7.574317932128906, + "step": 1388 + }, + { + "epoch": 1.82, + "learning_rate": 1.8172616844100096e-05, + "logits/chosen": -2.3755996227264404, + "logits/rejected": -2.3954529762268066, + "logps/chosen": -210.21246337890625, + "logps/rejected": -274.9314880371094, + "loss": 0.0777, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9619503021240234, + "rewards/margins": 4.969242572784424, + "rewards/rejected": -6.931192874908447, + "step": 1389 + }, + { + "epoch": 1.82, + "learning_rate": 1.81381556742801e-05, + "logits/chosen": -2.421814441680908, + "logits/rejected": -2.4499125480651855, + "logps/chosen": -246.77828979492188, + "logps/rejected": -271.91253662109375, + "loss": 0.1833, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.7254321575164795, + "rewards/margins": 3.572638750076294, + "rewards/rejected": -6.298070907592773, + "step": 1390 + }, + { + "epoch": 1.82, + "learning_rate": 1.8103708599276812e-05, + "logits/chosen": -2.4076273441314697, + "logits/rejected": -2.4483351707458496, + "logps/chosen": -202.81301879882812, + "logps/rejected": -277.38055419921875, + "loss": 0.1763, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.199967622756958, + "rewards/margins": 4.424561500549316, + "rewards/rejected": -6.624528884887695, + "step": 1391 + }, + { + "epoch": 1.82, + "learning_rate": 1.8069275689847466e-05, + "logits/chosen": -2.4297070503234863, + "logits/rejected": -2.43597674369812, + "logps/chosen": -221.3848876953125, + "logps/rejected": -247.5712890625, + "loss": 0.2, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.255714178085327, + "rewards/margins": 4.192302703857422, + "rewards/rejected": -6.448017120361328, + "step": 1392 + }, + { + "epoch": 1.82, + "learning_rate": 1.803485701672022e-05, + "logits/chosen": -2.155043125152588, + "logits/rejected": -2.2775719165802, + "logps/chosen": -172.92015075683594, + "logps/rejected": -254.85780334472656, + "loss": 0.1715, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.388918161392212, + "rewards/margins": 4.012326717376709, + "rewards/rejected": -6.401244640350342, + "step": 1393 + }, + { + "epoch": 1.82, + "learning_rate": 1.8000452650593976e-05, + "logits/chosen": -2.3290700912475586, + "logits/rejected": -2.35349702835083, + "logps/chosen": -183.62379455566406, + "logps/rejected": -234.48475646972656, + "loss": 0.0791, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.422823429107666, + "rewards/margins": 3.4260871410369873, + "rewards/rejected": -5.848910331726074, + "step": 1394 + }, + { + "epoch": 1.83, + "learning_rate": 1.7966062662138262e-05, + "logits/chosen": -2.2217583656311035, + "logits/rejected": -2.2541911602020264, + "logps/chosen": -213.72406005859375, + "logps/rejected": -275.35272216796875, + "loss": 0.37, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.9349758625030518, + "rewards/margins": 4.163061141967773, + "rewards/rejected": -7.098036766052246, + "step": 1395 + }, + { + "epoch": 1.83, + "learning_rate": 1.7931687121993047e-05, + "logits/chosen": -2.2696139812469482, + "logits/rejected": -2.3103439807891846, + "logps/chosen": -157.90899658203125, + "logps/rejected": -210.8253173828125, + "loss": 0.2477, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.8092381954193115, + "rewards/margins": 2.957724094390869, + "rewards/rejected": -5.766962051391602, + "step": 1396 + }, + { + "epoch": 1.83, + "learning_rate": 1.7897326100768664e-05, + "logits/chosen": -2.275118112564087, + "logits/rejected": -2.315573215484619, + "logps/chosen": -217.34156799316406, + "logps/rejected": -257.44085693359375, + "loss": 0.1969, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.7094452381134033, + "rewards/margins": 4.454812049865723, + "rewards/rejected": -7.164257049560547, + "step": 1397 + }, + { + "epoch": 1.83, + "learning_rate": 1.7862979669045566e-05, + "logits/chosen": -2.3151776790618896, + "logits/rejected": -2.4441988468170166, + "logps/chosen": -163.26112365722656, + "logps/rejected": -268.2952575683594, + "loss": 0.1011, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0306079387664795, + "rewards/margins": 3.7839698791503906, + "rewards/rejected": -5.814577579498291, + "step": 1398 + }, + { + "epoch": 1.83, + "learning_rate": 1.782864789737429e-05, + "logits/chosen": -2.1966049671173096, + "logits/rejected": -2.190493106842041, + "logps/chosen": -173.35159301757812, + "logps/rejected": -216.2772216796875, + "loss": 0.2887, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8507883548736572, + "rewards/margins": 3.7141573429107666, + "rewards/rejected": -5.564945697784424, + "step": 1399 + }, + { + "epoch": 1.83, + "learning_rate": 1.779433085627523e-05, + "logits/chosen": -2.305650472640991, + "logits/rejected": -2.4516422748565674, + "logps/chosen": -173.31443786621094, + "logps/rejected": -230.4690704345703, + "loss": 0.173, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.488478660583496, + "rewards/margins": 3.3143858909606934, + "rewards/rejected": -5.8028645515441895, + "step": 1400 + }, + { + "epoch": 1.83, + "learning_rate": 1.7760028616238535e-05, + "logits/chosen": -2.1745190620422363, + "logits/rejected": -2.2342097759246826, + "logps/chosen": -191.0387420654297, + "logps/rejected": -247.54385375976562, + "loss": 0.2329, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.31973934173584, + "rewards/margins": 4.186702728271484, + "rewards/rejected": -6.506442070007324, + "step": 1401 + }, + { + "epoch": 1.84, + "learning_rate": 1.7725741247723965e-05, + "logits/chosen": -2.2428975105285645, + "logits/rejected": -2.3456952571868896, + "logps/chosen": -190.28341674804688, + "logps/rejected": -263.6100769042969, + "loss": 0.101, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8061240911483765, + "rewards/margins": 4.451410293579102, + "rewards/rejected": -6.257534980773926, + "step": 1402 + }, + { + "epoch": 1.84, + "learning_rate": 1.769146882116068e-05, + "logits/chosen": -2.335822343826294, + "logits/rejected": -2.327932834625244, + "logps/chosen": -216.3273468017578, + "logps/rejected": -271.0181884765625, + "loss": 0.0896, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9624712467193604, + "rewards/margins": 4.33131217956543, + "rewards/rejected": -6.293783187866211, + "step": 1403 + }, + { + "epoch": 1.84, + "learning_rate": 1.7657211406947206e-05, + "logits/chosen": -2.352761745452881, + "logits/rejected": -2.3849642276763916, + "logps/chosen": -225.92799377441406, + "logps/rejected": -279.9659423828125, + "loss": 0.0981, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5281702280044556, + "rewards/margins": 4.954948902130127, + "rewards/rejected": -6.483119010925293, + "step": 1404 + }, + { + "epoch": 1.84, + "learning_rate": 1.7622969075451204e-05, + "logits/chosen": -2.426805257797241, + "logits/rejected": -2.4396920204162598, + "logps/chosen": -236.06060791015625, + "logps/rejected": -296.6875, + "loss": 0.0727, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1789348125457764, + "rewards/margins": 4.371779441833496, + "rewards/rejected": -6.55071496963501, + "step": 1405 + }, + { + "epoch": 1.84, + "learning_rate": 1.758874189700936e-05, + "logits/chosen": -2.564016580581665, + "logits/rejected": -2.57346773147583, + "logps/chosen": -277.2979431152344, + "logps/rejected": -295.3319091796875, + "loss": 0.0406, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9543590545654297, + "rewards/margins": 5.153177738189697, + "rewards/rejected": -7.107537269592285, + "step": 1406 + }, + { + "epoch": 1.84, + "learning_rate": 1.7554529941927243e-05, + "logits/chosen": -2.469637155532837, + "logits/rejected": -2.395562171936035, + "logps/chosen": -193.67764282226562, + "logps/rejected": -217.7783660888672, + "loss": 0.2628, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3800079822540283, + "rewards/margins": 3.211730480194092, + "rewards/rejected": -5.591738224029541, + "step": 1407 + }, + { + "epoch": 1.84, + "learning_rate": 1.7520333280479124e-05, + "logits/chosen": -2.5460124015808105, + "logits/rejected": -2.4323182106018066, + "logps/chosen": -208.2541961669922, + "logps/rejected": -226.3778076171875, + "loss": 0.1286, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.8007850646972656, + "rewards/margins": 4.02705717086792, + "rewards/rejected": -6.827842712402344, + "step": 1408 + }, + { + "epoch": 1.84, + "learning_rate": 1.7486151982907896e-05, + "logits/chosen": -2.4071381092071533, + "logits/rejected": -2.3857336044311523, + "logps/chosen": -206.93508911132812, + "logps/rejected": -245.7151641845703, + "loss": 0.0979, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0856006145477295, + "rewards/margins": 3.9716968536376953, + "rewards/rejected": -6.057297706604004, + "step": 1409 + }, + { + "epoch": 1.85, + "learning_rate": 1.7451986119424863e-05, + "logits/chosen": -2.3463447093963623, + "logits/rejected": -2.3490939140319824, + "logps/chosen": -208.6003875732422, + "logps/rejected": -253.23080444335938, + "loss": 0.1813, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.814742922782898, + "rewards/margins": 5.019925117492676, + "rewards/rejected": -6.834668159484863, + "step": 1410 + }, + { + "epoch": 1.85, + "learning_rate": 1.7417835760209638e-05, + "logits/chosen": -2.318296194076538, + "logits/rejected": -2.2780075073242188, + "logps/chosen": -187.7845001220703, + "logps/rejected": -241.4011688232422, + "loss": 0.1568, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4914097785949707, + "rewards/margins": 3.6839685440063477, + "rewards/rejected": -6.17537784576416, + "step": 1411 + }, + { + "epoch": 1.85, + "learning_rate": 1.738370097541e-05, + "logits/chosen": -2.2427215576171875, + "logits/rejected": -2.2862513065338135, + "logps/chosen": -199.6505126953125, + "logps/rejected": -233.87680053710938, + "loss": 0.1351, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6234967708587646, + "rewards/margins": 4.138545513153076, + "rewards/rejected": -5.76204252243042, + "step": 1412 + }, + { + "epoch": 1.85, + "learning_rate": 1.7349581835141725e-05, + "logits/chosen": -2.3899827003479004, + "logits/rejected": -2.3938510417938232, + "logps/chosen": -193.83572387695312, + "logps/rejected": -232.34912109375, + "loss": 0.1208, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.364690065383911, + "rewards/margins": 3.3923044204711914, + "rewards/rejected": -5.756994247436523, + "step": 1413 + }, + { + "epoch": 1.85, + "learning_rate": 1.7315478409488436e-05, + "logits/chosen": -2.433824300765991, + "logits/rejected": -2.5459389686584473, + "logps/chosen": -187.0819091796875, + "logps/rejected": -238.84722900390625, + "loss": 0.1516, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4814958572387695, + "rewards/margins": 3.0508153438568115, + "rewards/rejected": -5.532310962677002, + "step": 1414 + }, + { + "epoch": 1.85, + "learning_rate": 1.72813907685015e-05, + "logits/chosen": -2.438502073287964, + "logits/rejected": -2.463942050933838, + "logps/chosen": -217.55548095703125, + "logps/rejected": -313.0367431640625, + "loss": 0.0606, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8139358758926392, + "rewards/margins": 6.352237701416016, + "rewards/rejected": -8.166172981262207, + "step": 1415 + }, + { + "epoch": 1.85, + "learning_rate": 1.7247318982199862e-05, + "logits/chosen": -2.381103515625, + "logits/rejected": -2.4419844150543213, + "logps/chosen": -199.40476989746094, + "logps/rejected": -232.19622802734375, + "loss": 0.1494, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8808376789093018, + "rewards/margins": 3.400146484375, + "rewards/rejected": -5.280983924865723, + "step": 1416 + }, + { + "epoch": 1.85, + "learning_rate": 1.721326312056989e-05, + "logits/chosen": -2.250420331954956, + "logits/rejected": -2.373267889022827, + "logps/chosen": -182.4378662109375, + "logps/rejected": -249.34072875976562, + "loss": 0.1743, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.787522315979004, + "rewards/margins": 5.724884033203125, + "rewards/rejected": -7.512406349182129, + "step": 1417 + }, + { + "epoch": 1.86, + "learning_rate": 1.717922325356525e-05, + "logits/chosen": -2.431522846221924, + "logits/rejected": -2.4467411041259766, + "logps/chosen": -169.0112762451172, + "logps/rejected": -244.40225219726562, + "loss": 0.1692, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4281190633773804, + "rewards/margins": 4.815309524536133, + "rewards/rejected": -6.243429183959961, + "step": 1418 + }, + { + "epoch": 1.86, + "learning_rate": 1.7145199451106736e-05, + "logits/chosen": -2.3091917037963867, + "logits/rejected": -2.389026165008545, + "logps/chosen": -189.0026092529297, + "logps/rejected": -251.83541870117188, + "loss": 0.1599, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5159945487976074, + "rewards/margins": 3.761789560317993, + "rewards/rejected": -5.27778434753418, + "step": 1419 + }, + { + "epoch": 1.86, + "learning_rate": 1.7111191783082155e-05, + "logits/chosen": -2.411252737045288, + "logits/rejected": -2.314314365386963, + "logps/chosen": -228.8693084716797, + "logps/rejected": -256.5889892578125, + "loss": 0.075, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9669816493988037, + "rewards/margins": 4.889811038970947, + "rewards/rejected": -6.856792449951172, + "step": 1420 + }, + { + "epoch": 1.86, + "learning_rate": 1.7077200319346186e-05, + "logits/chosen": -2.302119731903076, + "logits/rejected": -2.25610613822937, + "logps/chosen": -221.1598358154297, + "logps/rejected": -251.94815063476562, + "loss": 0.1174, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.303022861480713, + "rewards/margins": 4.024908065795898, + "rewards/rejected": -6.327930450439453, + "step": 1421 + }, + { + "epoch": 1.86, + "learning_rate": 1.7043225129720207e-05, + "logits/chosen": -2.2721972465515137, + "logits/rejected": -2.242246389389038, + "logps/chosen": -188.17799377441406, + "logps/rejected": -212.06350708007812, + "loss": 0.1147, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.008423328399658, + "rewards/margins": 3.6622042655944824, + "rewards/rejected": -5.670627117156982, + "step": 1422 + }, + { + "epoch": 1.86, + "learning_rate": 1.7009266283992163e-05, + "logits/chosen": -2.3281729221343994, + "logits/rejected": -2.1807894706726074, + "logps/chosen": -172.7126922607422, + "logps/rejected": -183.3359375, + "loss": 0.2174, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0304126739501953, + "rewards/margins": 3.4383456707000732, + "rewards/rejected": -5.468757629394531, + "step": 1423 + }, + { + "epoch": 1.86, + "learning_rate": 1.6975323851916454e-05, + "logits/chosen": -2.296633005142212, + "logits/rejected": -2.383657455444336, + "logps/chosen": -211.98118591308594, + "logps/rejected": -303.9250793457031, + "loss": 0.2113, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.103783130645752, + "rewards/margins": 3.5214123725891113, + "rewards/rejected": -5.625195503234863, + "step": 1424 + }, + { + "epoch": 1.87, + "learning_rate": 1.6941397903213717e-05, + "logits/chosen": -2.400033473968506, + "logits/rejected": -2.428504467010498, + "logps/chosen": -224.45407104492188, + "logps/rejected": -260.5846252441406, + "loss": 0.0845, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8682763576507568, + "rewards/margins": 4.755299091339111, + "rewards/rejected": -6.623575210571289, + "step": 1425 + }, + { + "epoch": 1.87, + "learning_rate": 1.6907488507570786e-05, + "logits/chosen": -2.340531826019287, + "logits/rejected": -2.4645419120788574, + "logps/chosen": -217.0526123046875, + "logps/rejected": -286.190185546875, + "loss": 0.0807, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1855876445770264, + "rewards/margins": 5.310837745666504, + "rewards/rejected": -7.496426105499268, + "step": 1426 + }, + { + "epoch": 1.87, + "learning_rate": 1.6873595734640457e-05, + "logits/chosen": -2.3777904510498047, + "logits/rejected": -2.4368772506713867, + "logps/chosen": -190.16751098632812, + "logps/rejected": -225.77963256835938, + "loss": 0.1334, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3047542572021484, + "rewards/margins": 3.431431293487549, + "rewards/rejected": -5.736185550689697, + "step": 1427 + }, + { + "epoch": 1.87, + "learning_rate": 1.683971965404139e-05, + "logits/chosen": -2.234236001968384, + "logits/rejected": -2.2022533416748047, + "logps/chosen": -199.09043884277344, + "logps/rejected": -255.25946044921875, + "loss": 0.1387, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1556639671325684, + "rewards/margins": 4.284952163696289, + "rewards/rejected": -6.440616607666016, + "step": 1428 + }, + { + "epoch": 1.87, + "learning_rate": 1.6805860335357977e-05, + "logits/chosen": -2.286823272705078, + "logits/rejected": -2.4171409606933594, + "logps/chosen": -192.07984924316406, + "logps/rejected": -261.97418212890625, + "loss": 0.1029, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1071534156799316, + "rewards/margins": 3.961883544921875, + "rewards/rejected": -6.069036960601807, + "step": 1429 + }, + { + "epoch": 1.87, + "learning_rate": 1.6772017848140132e-05, + "logits/chosen": -2.277740478515625, + "logits/rejected": -2.286550521850586, + "logps/chosen": -192.15283203125, + "logps/rejected": -213.09674072265625, + "loss": 0.2975, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0977118015289307, + "rewards/margins": 2.784632682800293, + "rewards/rejected": -4.882344722747803, + "step": 1430 + }, + { + "epoch": 1.87, + "learning_rate": 1.6738192261903248e-05, + "logits/chosen": -2.350882053375244, + "logits/rejected": -2.358020782470703, + "logps/chosen": -199.02256774902344, + "logps/rejected": -211.52923583984375, + "loss": 0.0748, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.024993419647217, + "rewards/margins": 3.764868974685669, + "rewards/rejected": -5.789862632751465, + "step": 1431 + }, + { + "epoch": 1.87, + "learning_rate": 1.6704383646127973e-05, + "logits/chosen": -2.4114990234375, + "logits/rejected": -2.471489906311035, + "logps/chosen": -179.5003662109375, + "logps/rejected": -251.12921142578125, + "loss": 0.1482, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.546452760696411, + "rewards/margins": 4.133995056152344, + "rewards/rejected": -6.680447578430176, + "step": 1432 + }, + { + "epoch": 1.88, + "learning_rate": 1.6670592070260106e-05, + "logits/chosen": -2.292158842086792, + "logits/rejected": -2.3579061031341553, + "logps/chosen": -215.42198181152344, + "logps/rejected": -256.4615478515625, + "loss": 0.205, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.4778378009796143, + "rewards/margins": 3.3685197830200195, + "rewards/rejected": -5.846357345581055, + "step": 1433 + }, + { + "epoch": 1.88, + "learning_rate": 1.6636817603710437e-05, + "logits/chosen": -2.4305903911590576, + "logits/rejected": -2.4421541690826416, + "logps/chosen": -223.333984375, + "logps/rejected": -273.8779296875, + "loss": 0.1467, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.138701915740967, + "rewards/margins": 4.6371378898620605, + "rewards/rejected": -6.775839805603027, + "step": 1434 + }, + { + "epoch": 1.88, + "learning_rate": 1.660306031585463e-05, + "logits/chosen": -2.241135835647583, + "logits/rejected": -2.347278118133545, + "logps/chosen": -224.71678161621094, + "logps/rejected": -277.7406921386719, + "loss": 0.0649, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3808908462524414, + "rewards/margins": 4.80213737487793, + "rewards/rejected": -7.183028221130371, + "step": 1435 + }, + { + "epoch": 1.88, + "learning_rate": 1.6569320276033034e-05, + "logits/chosen": -2.4102020263671875, + "logits/rejected": -2.4303605556488037, + "logps/chosen": -208.3951873779297, + "logps/rejected": -266.3526611328125, + "loss": 0.1238, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.318913221359253, + "rewards/margins": 4.623273849487305, + "rewards/rejected": -6.9421868324279785, + "step": 1436 + }, + { + "epoch": 1.88, + "learning_rate": 1.653559755355058e-05, + "logits/chosen": -2.1951780319213867, + "logits/rejected": -2.2318592071533203, + "logps/chosen": -176.567626953125, + "logps/rejected": -239.26303100585938, + "loss": 0.0748, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9915611743927002, + "rewards/margins": 4.4353132247924805, + "rewards/rejected": -6.426874160766602, + "step": 1437 + }, + { + "epoch": 1.88, + "learning_rate": 1.6501892217676653e-05, + "logits/chosen": -2.3401355743408203, + "logits/rejected": -2.500912666320801, + "logps/chosen": -226.52468872070312, + "logps/rejected": -327.27764892578125, + "loss": 0.0987, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.8020706176757812, + "rewards/margins": 4.857433319091797, + "rewards/rejected": -7.65950345993042, + "step": 1438 + }, + { + "epoch": 1.88, + "learning_rate": 1.6468204337644887e-05, + "logits/chosen": -2.3637022972106934, + "logits/rejected": -2.4382131099700928, + "logps/chosen": -172.9948272705078, + "logps/rejected": -230.6323699951172, + "loss": 0.1339, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.165769577026367, + "rewards/margins": 4.19622802734375, + "rewards/rejected": -6.361997604370117, + "step": 1439 + }, + { + "epoch": 1.88, + "learning_rate": 1.643453398265309e-05, + "logits/chosen": -2.3473615646362305, + "logits/rejected": -2.4326136112213135, + "logps/chosen": -229.02716064453125, + "logps/rejected": -300.13128662109375, + "loss": 0.0614, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6115474700927734, + "rewards/margins": 4.466472625732422, + "rewards/rejected": -7.078019618988037, + "step": 1440 + }, + { + "epoch": 1.89, + "learning_rate": 1.6400881221863044e-05, + "logits/chosen": -2.192305564880371, + "logits/rejected": -2.3297770023345947, + "logps/chosen": -198.7337188720703, + "logps/rejected": -254.3240203857422, + "loss": 0.1015, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7752209901809692, + "rewards/margins": 4.9826154708862305, + "rewards/rejected": -6.757837295532227, + "step": 1441 + }, + { + "epoch": 1.89, + "learning_rate": 1.6367246124400402e-05, + "logits/chosen": -2.4509072303771973, + "logits/rejected": -2.343606948852539, + "logps/chosen": -272.90118408203125, + "logps/rejected": -256.77606201171875, + "loss": 0.0647, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2714061737060547, + "rewards/margins": 5.342903137207031, + "rewards/rejected": -7.614309310913086, + "step": 1442 + }, + { + "epoch": 1.89, + "learning_rate": 1.633362875935456e-05, + "logits/chosen": -2.4906082153320312, + "logits/rejected": -2.56166672706604, + "logps/chosen": -210.90478515625, + "logps/rejected": -278.2033386230469, + "loss": 0.0464, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2196850776672363, + "rewards/margins": 4.7737860679626465, + "rewards/rejected": -6.993471145629883, + "step": 1443 + }, + { + "epoch": 1.89, + "learning_rate": 1.6300029195778455e-05, + "logits/chosen": -2.4831748008728027, + "logits/rejected": -2.4640426635742188, + "logps/chosen": -251.76467895507812, + "logps/rejected": -291.890625, + "loss": 0.1085, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2407264709472656, + "rewards/margins": 5.016515731811523, + "rewards/rejected": -7.257242202758789, + "step": 1444 + }, + { + "epoch": 1.89, + "learning_rate": 1.626644750268847e-05, + "logits/chosen": -2.340923547744751, + "logits/rejected": -2.364927291870117, + "logps/chosen": -198.6310577392578, + "logps/rejected": -267.87506103515625, + "loss": 0.1094, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8994073867797852, + "rewards/margins": 4.561697006225586, + "rewards/rejected": -6.461103439331055, + "step": 1445 + }, + { + "epoch": 1.89, + "learning_rate": 1.62328837490643e-05, + "logits/chosen": -2.4049181938171387, + "logits/rejected": -2.4683194160461426, + "logps/chosen": -213.10623168945312, + "logps/rejected": -296.3499755859375, + "loss": 0.1234, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.042850971221924, + "rewards/margins": 4.385453701019287, + "rewards/rejected": -6.428304672241211, + "step": 1446 + }, + { + "epoch": 1.89, + "learning_rate": 1.6199338003848745e-05, + "logits/chosen": -2.329580545425415, + "logits/rejected": -2.4312570095062256, + "logps/chosen": -195.87893676757812, + "logps/rejected": -264.84075927734375, + "loss": 0.0531, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3866477012634277, + "rewards/margins": 5.031021595001221, + "rewards/rejected": -7.417669296264648, + "step": 1447 + }, + { + "epoch": 1.9, + "learning_rate": 1.6165810335947664e-05, + "logits/chosen": -2.255166530609131, + "logits/rejected": -2.41721248626709, + "logps/chosen": -168.88145446777344, + "logps/rejected": -259.59075927734375, + "loss": 0.118, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0207834243774414, + "rewards/margins": 3.352409601211548, + "rewards/rejected": -5.37319278717041, + "step": 1448 + }, + { + "epoch": 1.9, + "learning_rate": 1.6132300814229755e-05, + "logits/chosen": -2.47602915763855, + "logits/rejected": -2.5320427417755127, + "logps/chosen": -194.60694885253906, + "logps/rejected": -264.7068786621094, + "loss": 0.0885, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.452468156814575, + "rewards/margins": 4.713617324829102, + "rewards/rejected": -7.166085243225098, + "step": 1449 + }, + { + "epoch": 1.9, + "learning_rate": 1.6098809507526445e-05, + "logits/chosen": -2.501377582550049, + "logits/rejected": -2.5338125228881836, + "logps/chosen": -209.98854064941406, + "logps/rejected": -275.7145080566406, + "loss": 0.0858, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.737166166305542, + "rewards/margins": 4.94148063659668, + "rewards/rejected": -6.678646087646484, + "step": 1450 + }, + { + "epoch": 1.9, + "learning_rate": 1.606533648463177e-05, + "logits/chosen": -2.3755364418029785, + "logits/rejected": -2.404049873352051, + "logps/chosen": -207.2648468017578, + "logps/rejected": -243.770263671875, + "loss": 0.1872, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.953253746032715, + "rewards/margins": 3.190807580947876, + "rewards/rejected": -6.14406156539917, + "step": 1451 + }, + { + "epoch": 1.9, + "learning_rate": 1.603188181430216e-05, + "logits/chosen": -2.4449565410614014, + "logits/rejected": -2.4856343269348145, + "logps/chosen": -237.06878662109375, + "logps/rejected": -327.125732421875, + "loss": 0.0517, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1644299030303955, + "rewards/margins": 5.85663366317749, + "rewards/rejected": -8.021064758300781, + "step": 1452 + }, + { + "epoch": 1.9, + "learning_rate": 1.5998445565256398e-05, + "logits/chosen": -2.4012949466705322, + "logits/rejected": -2.516373872756958, + "logps/chosen": -190.51754760742188, + "logps/rejected": -254.72982788085938, + "loss": 0.06, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.806227207183838, + "rewards/margins": 4.247498512268066, + "rewards/rejected": -6.0537261962890625, + "step": 1453 + }, + { + "epoch": 1.9, + "learning_rate": 1.59650278061754e-05, + "logits/chosen": -2.5338635444641113, + "logits/rejected": -2.6262829303741455, + "logps/chosen": -213.9265899658203, + "logps/rejected": -303.3611755371094, + "loss": 0.1236, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.135544776916504, + "rewards/margins": 4.015152931213379, + "rewards/rejected": -6.150697708129883, + "step": 1454 + }, + { + "epoch": 1.9, + "learning_rate": 1.5931628605702102e-05, + "logits/chosen": -2.1860647201538086, + "logits/rejected": -2.1904773712158203, + "logps/chosen": -156.0186767578125, + "logps/rejected": -222.42051696777344, + "loss": 0.1663, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1143012046813965, + "rewards/margins": 3.9499664306640625, + "rewards/rejected": -6.064268112182617, + "step": 1455 + }, + { + "epoch": 1.91, + "learning_rate": 1.5898248032441336e-05, + "logits/chosen": -2.453688859939575, + "logits/rejected": -2.5124049186706543, + "logps/chosen": -212.0570831298828, + "logps/rejected": -288.8057861328125, + "loss": 0.091, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.561494827270508, + "rewards/margins": 4.191839694976807, + "rewards/rejected": -6.753334999084473, + "step": 1456 + }, + { + "epoch": 1.91, + "learning_rate": 1.5864886154959673e-05, + "logits/chosen": -2.337547779083252, + "logits/rejected": -2.391294002532959, + "logps/chosen": -200.18592834472656, + "logps/rejected": -251.96511840820312, + "loss": 0.0654, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3205690383911133, + "rewards/margins": 4.814243793487549, + "rewards/rejected": -7.13481330871582, + "step": 1457 + }, + { + "epoch": 1.91, + "learning_rate": 1.5831543041785247e-05, + "logits/chosen": -2.451817512512207, + "logits/rejected": -2.4509520530700684, + "logps/chosen": -204.92709350585938, + "logps/rejected": -240.1418914794922, + "loss": 0.2031, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.6564457416534424, + "rewards/margins": 4.317297458648682, + "rewards/rejected": -6.973743438720703, + "step": 1458 + }, + { + "epoch": 1.91, + "learning_rate": 1.579821876140768e-05, + "logits/chosen": -2.432459592819214, + "logits/rejected": -2.5343260765075684, + "logps/chosen": -175.5408935546875, + "logps/rejected": -247.48802185058594, + "loss": 0.0926, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5296019315719604, + "rewards/margins": 4.98807430267334, + "rewards/rejected": -6.51767635345459, + "step": 1459 + }, + { + "epoch": 1.91, + "learning_rate": 1.5764913382277903e-05, + "logits/chosen": -2.364755153656006, + "logits/rejected": -2.4252707958221436, + "logps/chosen": -188.4788818359375, + "logps/rejected": -260.39886474609375, + "loss": 0.1568, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2904212474823, + "rewards/margins": 4.224641799926758, + "rewards/rejected": -6.515063762664795, + "step": 1460 + }, + { + "epoch": 1.91, + "learning_rate": 1.5731626972808027e-05, + "logits/chosen": -2.3451716899871826, + "logits/rejected": -2.465348720550537, + "logps/chosen": -180.72216796875, + "logps/rejected": -261.1882629394531, + "loss": 0.0225, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9220457077026367, + "rewards/margins": 5.0246477127075195, + "rewards/rejected": -6.946692943572998, + "step": 1461 + }, + { + "epoch": 1.91, + "learning_rate": 1.5698359601371187e-05, + "logits/chosen": -2.3472959995269775, + "logits/rejected": -2.3777170181274414, + "logps/chosen": -204.30133056640625, + "logps/rejected": -278.4882507324219, + "loss": 0.1245, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9885146617889404, + "rewards/margins": 5.202247142791748, + "rewards/rejected": -7.190761566162109, + "step": 1462 + }, + { + "epoch": 1.91, + "learning_rate": 1.5665111336301415e-05, + "logits/chosen": -2.4170303344726562, + "logits/rejected": -2.452892541885376, + "logps/chosen": -184.4312744140625, + "logps/rejected": -258.8091125488281, + "loss": 0.0365, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2951486110687256, + "rewards/margins": 5.710271835327148, + "rewards/rejected": -8.005420684814453, + "step": 1463 + }, + { + "epoch": 1.92, + "learning_rate": 1.563188224589349e-05, + "logits/chosen": -2.381779193878174, + "logits/rejected": -2.3906126022338867, + "logps/chosen": -186.55616760253906, + "logps/rejected": -237.7046356201172, + "loss": 0.102, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0758442878723145, + "rewards/margins": 4.258363246917725, + "rewards/rejected": -6.334208011627197, + "step": 1464 + }, + { + "epoch": 1.92, + "learning_rate": 1.5598672398402835e-05, + "logits/chosen": -2.3338801860809326, + "logits/rejected": -2.271404266357422, + "logps/chosen": -195.86288452148438, + "logps/rejected": -233.76541137695312, + "loss": 0.0596, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9144065380096436, + "rewards/margins": 4.764090538024902, + "rewards/rejected": -6.678496360778809, + "step": 1465 + }, + { + "epoch": 1.92, + "learning_rate": 1.5565481862045312e-05, + "logits/chosen": -2.4293978214263916, + "logits/rejected": -2.454214334487915, + "logps/chosen": -222.53311157226562, + "logps/rejected": -271.8833923339844, + "loss": 0.0856, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2399611473083496, + "rewards/margins": 4.397007942199707, + "rewards/rejected": -6.636968612670898, + "step": 1466 + }, + { + "epoch": 1.92, + "learning_rate": 1.553231070499712e-05, + "logits/chosen": -2.3797686100006104, + "logits/rejected": -2.4712514877319336, + "logps/chosen": -183.9396514892578, + "logps/rejected": -233.15830993652344, + "loss": 0.1166, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.122859477996826, + "rewards/margins": 3.340522289276123, + "rewards/rejected": -5.463381767272949, + "step": 1467 + }, + { + "epoch": 1.92, + "learning_rate": 1.549915899539469e-05, + "logits/chosen": -2.287867784500122, + "logits/rejected": -2.3321452140808105, + "logps/chosen": -209.50926208496094, + "logps/rejected": -274.19464111328125, + "loss": 0.0597, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4684481620788574, + "rewards/margins": 5.046233654022217, + "rewards/rejected": -7.514681339263916, + "step": 1468 + }, + { + "epoch": 1.92, + "learning_rate": 1.5466026801334437e-05, + "logits/chosen": -2.2746798992156982, + "logits/rejected": -2.3161165714263916, + "logps/chosen": -200.4854736328125, + "logps/rejected": -265.2914733886719, + "loss": 0.2465, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4387340545654297, + "rewards/margins": 4.696792125701904, + "rewards/rejected": -7.135526180267334, + "step": 1469 + }, + { + "epoch": 1.92, + "learning_rate": 1.5432914190872757e-05, + "logits/chosen": -2.4928228855133057, + "logits/rejected": -2.5845017433166504, + "logps/chosen": -210.977783203125, + "logps/rejected": -323.95318603515625, + "loss": 0.1103, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8660938739776611, + "rewards/margins": 5.945382118225098, + "rewards/rejected": -7.81147575378418, + "step": 1470 + }, + { + "epoch": 1.93, + "learning_rate": 1.5399821232025786e-05, + "logits/chosen": -2.3834643363952637, + "logits/rejected": -2.254634141921997, + "logps/chosen": -193.97683715820312, + "logps/rejected": -199.76541137695312, + "loss": 0.2313, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1314117908477783, + "rewards/margins": 3.0294675827026367, + "rewards/rejected": -5.160879611968994, + "step": 1471 + }, + { + "epoch": 1.93, + "learning_rate": 1.5366747992769287e-05, + "logits/chosen": -2.1406219005584717, + "logits/rejected": -2.124356985092163, + "logps/chosen": -212.46966552734375, + "logps/rejected": -255.19540405273438, + "loss": 0.1093, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5713167190551758, + "rewards/margins": 5.360221862792969, + "rewards/rejected": -6.9315385818481445, + "step": 1472 + }, + { + "epoch": 1.93, + "learning_rate": 1.5333694541038557e-05, + "logits/chosen": -2.380096912384033, + "logits/rejected": -2.369213104248047, + "logps/chosen": -210.93670654296875, + "logps/rejected": -287.24224853515625, + "loss": 0.0758, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.420469045639038, + "rewards/margins": 4.858035087585449, + "rewards/rejected": -7.278504371643066, + "step": 1473 + }, + { + "epoch": 1.93, + "learning_rate": 1.5300660944728187e-05, + "logits/chosen": -2.320352554321289, + "logits/rejected": -2.387417793273926, + "logps/chosen": -242.43075561523438, + "logps/rejected": -348.77142333984375, + "loss": 0.0272, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9822957515716553, + "rewards/margins": 6.040739059448242, + "rewards/rejected": -8.023035049438477, + "step": 1474 + }, + { + "epoch": 1.93, + "learning_rate": 1.5267647271692036e-05, + "logits/chosen": -2.2970592975616455, + "logits/rejected": -2.3525736331939697, + "logps/chosen": -179.4069061279297, + "logps/rejected": -236.16236877441406, + "loss": 0.082, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.654468059539795, + "rewards/margins": 4.3037519454956055, + "rewards/rejected": -6.958219528198242, + "step": 1475 + }, + { + "epoch": 1.93, + "learning_rate": 1.523465358974302e-05, + "logits/chosen": -2.382110595703125, + "logits/rejected": -2.4085874557495117, + "logps/chosen": -225.62562561035156, + "logps/rejected": -252.38140869140625, + "loss": 0.2121, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.6140198707580566, + "rewards/margins": 4.331794738769531, + "rewards/rejected": -6.945814609527588, + "step": 1476 + }, + { + "epoch": 1.93, + "learning_rate": 1.5201679966652981e-05, + "logits/chosen": -2.3446455001831055, + "logits/rejected": -2.479895830154419, + "logps/chosen": -169.71682739257812, + "logps/rejected": -216.0916290283203, + "loss": 0.2449, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.056380271911621, + "rewards/margins": 3.161182403564453, + "rewards/rejected": -5.217563152313232, + "step": 1477 + }, + { + "epoch": 1.93, + "learning_rate": 1.5168726470152583e-05, + "logits/chosen": -2.4392571449279785, + "logits/rejected": -2.380384683609009, + "logps/chosen": -192.3358612060547, + "logps/rejected": -220.29931640625, + "loss": 0.2332, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.092043876647949, + "rewards/margins": 3.4863698482513428, + "rewards/rejected": -5.578413963317871, + "step": 1478 + }, + { + "epoch": 1.94, + "learning_rate": 1.5135793167931128e-05, + "logits/chosen": -2.374253988265991, + "logits/rejected": -2.4782328605651855, + "logps/chosen": -201.88290405273438, + "logps/rejected": -274.1608581542969, + "loss": 0.0695, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9295202493667603, + "rewards/margins": 4.443012237548828, + "rewards/rejected": -6.372532844543457, + "step": 1479 + }, + { + "epoch": 1.94, + "learning_rate": 1.5102880127636438e-05, + "logits/chosen": -2.341825008392334, + "logits/rejected": -2.3118302822113037, + "logps/chosen": -195.2682342529297, + "logps/rejected": -249.34971618652344, + "loss": 0.2476, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2763445377349854, + "rewards/margins": 3.5501656532287598, + "rewards/rejected": -5.826510429382324, + "step": 1480 + }, + { + "epoch": 1.94, + "learning_rate": 1.506998741687472e-05, + "logits/chosen": -2.386662244796753, + "logits/rejected": -2.387885570526123, + "logps/chosen": -191.30166625976562, + "logps/rejected": -268.77490234375, + "loss": 0.1141, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6786525249481201, + "rewards/margins": 4.7236223220825195, + "rewards/rejected": -6.4022746086120605, + "step": 1481 + }, + { + "epoch": 1.94, + "learning_rate": 1.5037115103210419e-05, + "logits/chosen": -2.4066412448883057, + "logits/rejected": -2.433255195617676, + "logps/chosen": -188.62110900878906, + "logps/rejected": -262.4204406738281, + "loss": 0.1925, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.035118818283081, + "rewards/margins": 4.498135089874268, + "rewards/rejected": -6.533254146575928, + "step": 1482 + }, + { + "epoch": 1.94, + "learning_rate": 1.5004263254166107e-05, + "logits/chosen": -2.293179512023926, + "logits/rejected": -2.3248684406280518, + "logps/chosen": -195.94442749023438, + "logps/rejected": -234.81375122070312, + "loss": 0.7615, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7963595390319824, + "rewards/margins": 2.977153778076172, + "rewards/rejected": -5.7735137939453125, + "step": 1483 + }, + { + "epoch": 1.94, + "learning_rate": 1.4971431937222283e-05, + "logits/chosen": -2.3372790813446045, + "logits/rejected": -2.432366371154785, + "logps/chosen": -172.42056274414062, + "logps/rejected": -264.4915771484375, + "loss": 0.0354, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4935553073883057, + "rewards/margins": 5.465178489685059, + "rewards/rejected": -7.958734512329102, + "step": 1484 + }, + { + "epoch": 1.94, + "learning_rate": 1.493862121981729e-05, + "logits/chosen": -2.247382402420044, + "logits/rejected": -2.2521233558654785, + "logps/chosen": -171.16897583007812, + "logps/rejected": -202.0714874267578, + "loss": 0.1567, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7494994401931763, + "rewards/margins": 3.5377261638641357, + "rewards/rejected": -5.287225723266602, + "step": 1485 + }, + { + "epoch": 1.95, + "learning_rate": 1.4905831169347145e-05, + "logits/chosen": -2.479897975921631, + "logits/rejected": -2.4930005073547363, + "logps/chosen": -178.76251220703125, + "logps/rejected": -211.6082763671875, + "loss": 0.2357, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.1313836574554443, + "rewards/margins": 2.863184928894043, + "rewards/rejected": -4.994568824768066, + "step": 1486 + }, + { + "epoch": 1.95, + "learning_rate": 1.4873061853165444e-05, + "logits/chosen": -2.4114627838134766, + "logits/rejected": -2.502809524536133, + "logps/chosen": -201.6521453857422, + "logps/rejected": -259.56207275390625, + "loss": 0.1481, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5456851720809937, + "rewards/margins": 4.57054328918457, + "rewards/rejected": -6.116229057312012, + "step": 1487 + }, + { + "epoch": 1.95, + "learning_rate": 1.4840313338583162e-05, + "logits/chosen": -2.381669044494629, + "logits/rejected": -2.502936601638794, + "logps/chosen": -211.47958374023438, + "logps/rejected": -261.13031005859375, + "loss": 0.1547, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.519897699356079, + "rewards/margins": 4.02741003036499, + "rewards/rejected": -6.547307968139648, + "step": 1488 + }, + { + "epoch": 1.95, + "learning_rate": 1.4807585692868552e-05, + "logits/chosen": -2.4341812133789062, + "logits/rejected": -2.325894832611084, + "logps/chosen": -166.9203338623047, + "logps/rejected": -200.4617462158203, + "loss": 0.2088, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.208306074142456, + "rewards/margins": 3.607130527496338, + "rewards/rejected": -5.815436840057373, + "step": 1489 + }, + { + "epoch": 1.95, + "learning_rate": 1.4774878983247026e-05, + "logits/chosen": -2.4430747032165527, + "logits/rejected": -2.509141683578491, + "logps/chosen": -192.38897705078125, + "logps/rejected": -243.39483642578125, + "loss": 0.1405, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8950132131576538, + "rewards/margins": 4.2639055252075195, + "rewards/rejected": -6.158918857574463, + "step": 1490 + }, + { + "epoch": 1.95, + "learning_rate": 1.4742193276900937e-05, + "logits/chosen": -2.3793163299560547, + "logits/rejected": -2.536176919937134, + "logps/chosen": -196.2447967529297, + "logps/rejected": -300.6761779785156, + "loss": 0.3059, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.137730360031128, + "rewards/margins": 3.47735857963562, + "rewards/rejected": -6.615088939666748, + "step": 1491 + }, + { + "epoch": 1.95, + "learning_rate": 1.4709528640969552e-05, + "logits/chosen": -2.296710729598999, + "logits/rejected": -2.3962302207946777, + "logps/chosen": -156.82606506347656, + "logps/rejected": -223.34359741210938, + "loss": 0.1771, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.40519118309021, + "rewards/margins": 3.2766478061676025, + "rewards/rejected": -5.6818389892578125, + "step": 1492 + }, + { + "epoch": 1.95, + "learning_rate": 1.4676885142548829e-05, + "logits/chosen": -2.4579343795776367, + "logits/rejected": -2.450451135635376, + "logps/chosen": -270.4036865234375, + "logps/rejected": -307.7275390625, + "loss": 0.2588, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.959599256515503, + "rewards/margins": 3.889989137649536, + "rewards/rejected": -6.849588394165039, + "step": 1493 + }, + { + "epoch": 1.96, + "learning_rate": 1.4644262848691311e-05, + "logits/chosen": -2.314662218093872, + "logits/rejected": -2.292576313018799, + "logps/chosen": -185.71998596191406, + "logps/rejected": -222.1219482421875, + "loss": 0.1945, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.801149606704712, + "rewards/margins": 3.2993435859680176, + "rewards/rejected": -5.10049295425415, + "step": 1494 + }, + { + "epoch": 1.96, + "learning_rate": 1.4611661826406004e-05, + "logits/chosen": -2.398597478866577, + "logits/rejected": -2.2957332134246826, + "logps/chosen": -230.4066162109375, + "logps/rejected": -220.38003540039062, + "loss": 0.1195, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.370292901992798, + "rewards/margins": 3.5345489978790283, + "rewards/rejected": -5.904841423034668, + "step": 1495 + }, + { + "epoch": 1.96, + "learning_rate": 1.4579082142658176e-05, + "logits/chosen": -2.365417718887329, + "logits/rejected": -2.3913309574127197, + "logps/chosen": -187.0292510986328, + "logps/rejected": -243.29855346679688, + "loss": 0.1477, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.043100118637085, + "rewards/margins": 4.389698028564453, + "rewards/rejected": -6.432798385620117, + "step": 1496 + }, + { + "epoch": 1.96, + "learning_rate": 1.4546523864369303e-05, + "logits/chosen": -2.247971534729004, + "logits/rejected": -2.265176296234131, + "logps/chosen": -194.1903076171875, + "logps/rejected": -263.1048583984375, + "loss": 0.0937, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.664482593536377, + "rewards/margins": 4.286263465881348, + "rewards/rejected": -5.950745582580566, + "step": 1497 + }, + { + "epoch": 1.96, + "learning_rate": 1.4513987058416879e-05, + "logits/chosen": -2.40185809135437, + "logits/rejected": -2.313415288925171, + "logps/chosen": -256.99554443359375, + "logps/rejected": -261.20361328125, + "loss": 0.1004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.336158275604248, + "rewards/margins": 4.115516662597656, + "rewards/rejected": -6.451674461364746, + "step": 1498 + }, + { + "epoch": 1.96, + "learning_rate": 1.448147179163431e-05, + "logits/chosen": -2.234518051147461, + "logits/rejected": -2.3311407566070557, + "logps/chosen": -188.40882873535156, + "logps/rejected": -265.4433288574219, + "loss": 0.1054, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.345661163330078, + "rewards/margins": 4.099471569061279, + "rewards/rejected": -6.445132255554199, + "step": 1499 + }, + { + "epoch": 1.96, + "learning_rate": 1.4448978130810715e-05, + "logits/chosen": -2.266646385192871, + "logits/rejected": -2.3148138523101807, + "logps/chosen": -228.14453125, + "logps/rejected": -279.6742248535156, + "loss": 0.1282, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.513174057006836, + "rewards/margins": 3.8379502296447754, + "rewards/rejected": -6.3511247634887695, + "step": 1500 + }, + { + "epoch": 1.96, + "learning_rate": 1.4416506142690889e-05, + "logits/chosen": -2.4405999183654785, + "logits/rejected": -2.3496949672698975, + "logps/chosen": -228.78981018066406, + "logps/rejected": -251.04478454589844, + "loss": 0.0708, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7194424867630005, + "rewards/margins": 4.5234832763671875, + "rewards/rejected": -6.242925643920898, + "step": 1501 + }, + { + "epoch": 1.97, + "learning_rate": 1.4384055893975051e-05, + "logits/chosen": -2.3932909965515137, + "logits/rejected": -2.377253770828247, + "logps/chosen": -205.10511779785156, + "logps/rejected": -265.08203125, + "loss": 0.0687, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0799789428710938, + "rewards/margins": 5.627187728881836, + "rewards/rejected": -7.70716667175293, + "step": 1502 + }, + { + "epoch": 1.97, + "learning_rate": 1.4351627451318821e-05, + "logits/chosen": -2.2164227962493896, + "logits/rejected": -2.1773486137390137, + "logps/chosen": -151.42803955078125, + "logps/rejected": -174.91114807128906, + "loss": 0.1874, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7035986185073853, + "rewards/margins": 2.8968112468719482, + "rewards/rejected": -4.600409984588623, + "step": 1503 + }, + { + "epoch": 1.97, + "learning_rate": 1.4319220881332979e-05, + "logits/chosen": -2.4396188259124756, + "logits/rejected": -2.3794381618499756, + "logps/chosen": -179.81802368164062, + "logps/rejected": -205.66061401367188, + "loss": 0.075, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.619563341140747, + "rewards/margins": 3.6418092250823975, + "rewards/rejected": -5.2613725662231445, + "step": 1504 + }, + { + "epoch": 1.97, + "learning_rate": 1.428683625058341e-05, + "logits/chosen": -2.3086800575256348, + "logits/rejected": -2.345752239227295, + "logps/chosen": -192.7046356201172, + "logps/rejected": -244.7045135498047, + "loss": 0.0621, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8035435676574707, + "rewards/margins": 4.999575138092041, + "rewards/rejected": -6.803118705749512, + "step": 1505 + }, + { + "epoch": 1.97, + "learning_rate": 1.4254473625590942e-05, + "logits/chosen": -2.419743061065674, + "logits/rejected": -2.438997507095337, + "logps/chosen": -208.04969787597656, + "logps/rejected": -277.6807861328125, + "loss": 0.0497, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8430733680725098, + "rewards/margins": 5.133272171020508, + "rewards/rejected": -6.976346015930176, + "step": 1506 + }, + { + "epoch": 1.97, + "learning_rate": 1.4222133072831143e-05, + "logits/chosen": -2.3248109817504883, + "logits/rejected": -2.3386592864990234, + "logps/chosen": -191.18887329101562, + "logps/rejected": -272.36041259765625, + "loss": 0.0371, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9322004318237305, + "rewards/margins": 4.829640865325928, + "rewards/rejected": -6.761841297149658, + "step": 1507 + }, + { + "epoch": 1.97, + "learning_rate": 1.4189814658734302e-05, + "logits/chosen": -2.328672409057617, + "logits/rejected": -2.430020570755005, + "logps/chosen": -145.69297790527344, + "logps/rejected": -184.90199279785156, + "loss": 0.2388, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0472798347473145, + "rewards/margins": 2.602170705795288, + "rewards/rejected": -4.649450302124023, + "step": 1508 + }, + { + "epoch": 1.98, + "learning_rate": 1.415751844968522e-05, + "logits/chosen": -2.382086992263794, + "logits/rejected": -2.398005485534668, + "logps/chosen": -204.1260986328125, + "logps/rejected": -260.9716491699219, + "loss": 0.0497, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.794833183288574, + "rewards/margins": 4.448567867279053, + "rewards/rejected": -7.243400573730469, + "step": 1509 + }, + { + "epoch": 1.98, + "learning_rate": 1.4125244512023062e-05, + "logits/chosen": -2.4380111694335938, + "logits/rejected": -2.482318639755249, + "logps/chosen": -191.6527557373047, + "logps/rejected": -231.12994384765625, + "loss": 0.2027, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7060452699661255, + "rewards/margins": 3.2779581546783447, + "rewards/rejected": -4.98400354385376, + "step": 1510 + }, + { + "epoch": 1.98, + "learning_rate": 1.4092992912041274e-05, + "logits/chosen": -2.402305841445923, + "logits/rejected": -2.5720431804656982, + "logps/chosen": -185.8068389892578, + "logps/rejected": -256.4432678222656, + "loss": 0.1323, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.64840030670166, + "rewards/margins": 4.095426559448242, + "rewards/rejected": -6.743826866149902, + "step": 1511 + }, + { + "epoch": 1.98, + "learning_rate": 1.4060763715987418e-05, + "logits/chosen": -2.305198907852173, + "logits/rejected": -2.4230222702026367, + "logps/chosen": -212.3504180908203, + "logps/rejected": -296.18792724609375, + "loss": 0.079, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8030345439910889, + "rewards/margins": 5.6656599044799805, + "rewards/rejected": -7.46869421005249, + "step": 1512 + }, + { + "epoch": 1.98, + "learning_rate": 1.4028556990063018e-05, + "logits/chosen": -2.0898091793060303, + "logits/rejected": -2.1528420448303223, + "logps/chosen": -200.12432861328125, + "logps/rejected": -263.9690856933594, + "loss": 0.3021, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2296040058135986, + "rewards/margins": 3.9368090629577637, + "rewards/rejected": -6.166413307189941, + "step": 1513 + }, + { + "epoch": 1.98, + "learning_rate": 1.399637280042344e-05, + "logits/chosen": -2.4251341819763184, + "logits/rejected": -2.4876465797424316, + "logps/chosen": -178.23699951171875, + "logps/rejected": -242.85079956054688, + "loss": 0.2219, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4770002365112305, + "rewards/margins": 3.1585631370544434, + "rewards/rejected": -5.635563373565674, + "step": 1514 + }, + { + "epoch": 1.98, + "learning_rate": 1.3964211213177777e-05, + "logits/chosen": -2.3534107208251953, + "logits/rejected": -2.3668625354766846, + "logps/chosen": -211.80165100097656, + "logps/rejected": -233.48025512695312, + "loss": 0.1395, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0712199211120605, + "rewards/margins": 4.181821346282959, + "rewards/rejected": -6.253041744232178, + "step": 1515 + }, + { + "epoch": 1.98, + "learning_rate": 1.3932072294388701e-05, + "logits/chosen": -2.4182662963867188, + "logits/rejected": -2.4597978591918945, + "logps/chosen": -171.20083618164062, + "logps/rejected": -214.0645751953125, + "loss": 0.0829, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9358714818954468, + "rewards/margins": 4.116148948669434, + "rewards/rejected": -6.052020072937012, + "step": 1516 + }, + { + "epoch": 1.99, + "learning_rate": 1.3899956110072296e-05, + "logits/chosen": -2.4960570335388184, + "logits/rejected": -2.4462080001831055, + "logps/chosen": -242.96786499023438, + "logps/rejected": -299.4893493652344, + "loss": 0.1082, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.992890477180481, + "rewards/margins": 4.823695182800293, + "rewards/rejected": -6.816585540771484, + "step": 1517 + }, + { + "epoch": 1.99, + "learning_rate": 1.386786272619795e-05, + "logits/chosen": -2.4045751094818115, + "logits/rejected": -2.4094862937927246, + "logps/chosen": -190.58834838867188, + "logps/rejected": -230.73782348632812, + "loss": 0.0929, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.293677806854248, + "rewards/margins": 3.4931743144989014, + "rewards/rejected": -5.786851406097412, + "step": 1518 + }, + { + "epoch": 1.99, + "learning_rate": 1.383579220868823e-05, + "logits/chosen": -2.5110924243927, + "logits/rejected": -2.52997088432312, + "logps/chosen": -217.063232421875, + "logps/rejected": -278.5524597167969, + "loss": 0.0505, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.895824670791626, + "rewards/margins": 4.731877326965332, + "rewards/rejected": -7.627702236175537, + "step": 1519 + }, + { + "epoch": 1.99, + "learning_rate": 1.3803744623418751e-05, + "logits/chosen": -2.425816774368286, + "logits/rejected": -2.4611082077026367, + "logps/chosen": -165.69699096679688, + "logps/rejected": -223.45858764648438, + "loss": 0.1553, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4300737380981445, + "rewards/margins": 4.136795520782471, + "rewards/rejected": -6.566868782043457, + "step": 1520 + }, + { + "epoch": 1.99, + "learning_rate": 1.3771720036217969e-05, + "logits/chosen": -2.3446614742279053, + "logits/rejected": -2.4226980209350586, + "logps/chosen": -194.88755798339844, + "logps/rejected": -258.51702880859375, + "loss": 0.1452, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9760570526123047, + "rewards/margins": 4.724256992340088, + "rewards/rejected": -6.700314521789551, + "step": 1521 + }, + { + "epoch": 1.99, + "learning_rate": 1.3739718512867151e-05, + "logits/chosen": -2.3633670806884766, + "logits/rejected": -2.429861307144165, + "logps/chosen": -208.61325073242188, + "logps/rejected": -244.76258850097656, + "loss": 0.0495, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0340147018432617, + "rewards/margins": 3.9696810245513916, + "rewards/rejected": -6.003695964813232, + "step": 1522 + }, + { + "epoch": 1.99, + "learning_rate": 1.3707740119100185e-05, + "logits/chosen": -2.5881145000457764, + "logits/rejected": -2.5587666034698486, + "logps/chosen": -202.13832092285156, + "logps/rejected": -212.1961669921875, + "loss": 0.2812, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.402574300765991, + "rewards/margins": 3.06561541557312, + "rewards/rejected": -5.4681901931762695, + "step": 1523 + }, + { + "epoch": 1.99, + "learning_rate": 1.3675784920603397e-05, + "logits/chosen": -2.2615389823913574, + "logits/rejected": -2.3032755851745605, + "logps/chosen": -195.72640991210938, + "logps/rejected": -244.49838256835938, + "loss": 0.1303, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5124311447143555, + "rewards/margins": 3.363710641860962, + "rewards/rejected": -5.876141548156738, + "step": 1524 + }, + { + "epoch": 2.0, + "learning_rate": 1.3643852983015524e-05, + "logits/chosen": -2.362877368927002, + "logits/rejected": -2.3544070720672607, + "logps/chosen": -198.101318359375, + "logps/rejected": -253.771484375, + "loss": 0.1061, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7205719947814941, + "rewards/margins": 5.395138740539551, + "rewards/rejected": -7.115710258483887, + "step": 1525 + }, + { + "epoch": 2.0, + "learning_rate": 1.3611944371927515e-05, + "logits/chosen": -2.497173309326172, + "logits/rejected": -2.5043833255767822, + "logps/chosen": -217.73866271972656, + "logps/rejected": -250.70755004882812, + "loss": 0.271, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1353023052215576, + "rewards/margins": 3.6170356273651123, + "rewards/rejected": -5.752337455749512, + "step": 1526 + }, + { + "epoch": 2.0, + "learning_rate": 1.3580059152882374e-05, + "logits/chosen": -2.341907024383545, + "logits/rejected": -2.398087739944458, + "logps/chosen": -210.10813903808594, + "logps/rejected": -271.9307861328125, + "loss": 0.1204, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.534003734588623, + "rewards/margins": 3.9333431720733643, + "rewards/rejected": -6.467347145080566, + "step": 1527 + }, + { + "epoch": 2.0, + "learning_rate": 1.3548197391375092e-05, + "logits/chosen": -2.387807607650757, + "logits/rejected": -2.3193914890289307, + "logps/chosen": -201.58016967773438, + "logps/rejected": -243.78512573242188, + "loss": 0.2009, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2035584449768066, + "rewards/margins": 4.358645915985107, + "rewards/rejected": -6.562204360961914, + "step": 1528 + }, + { + "epoch": 2.0, + "learning_rate": 1.3516359152852443e-05, + "logits/chosen": -2.5313313007354736, + "logits/rejected": -2.507538318634033, + "logps/chosen": -188.7819366455078, + "logps/rejected": -256.54669189453125, + "loss": 0.0599, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5708919763565063, + "rewards/margins": 4.414511680603027, + "rewards/rejected": -5.985404014587402, + "step": 1529 + }, + { + "epoch": 2.0, + "learning_rate": 1.348454450271292e-05, + "logits/chosen": -2.366176128387451, + "logits/rejected": -2.422241687774658, + "logps/chosen": -212.8551788330078, + "logps/rejected": -257.6839599609375, + "loss": 0.0171, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8496794700622559, + "rewards/margins": 5.041144371032715, + "rewards/rejected": -6.890823841094971, + "step": 1530 + }, + { + "epoch": 2.0, + "learning_rate": 1.345275350630652e-05, + "logits/chosen": -2.3863351345062256, + "logits/rejected": -2.448045253753662, + "logps/chosen": -179.83212280273438, + "logps/rejected": -222.12490844726562, + "loss": 0.0141, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7616995573043823, + "rewards/margins": 5.305027484893799, + "rewards/rejected": -7.066726207733154, + "step": 1531 + }, + { + "epoch": 2.01, + "learning_rate": 1.342098622893469e-05, + "logits/chosen": -2.4129581451416016, + "logits/rejected": -2.3667452335357666, + "logps/chosen": -175.57212829589844, + "logps/rejected": -231.375732421875, + "loss": 0.0174, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.115656852722168, + "rewards/margins": 6.046964645385742, + "rewards/rejected": -7.162621974945068, + "step": 1532 + }, + { + "epoch": 2.01, + "learning_rate": 1.3389242735850146e-05, + "logits/chosen": -2.2602086067199707, + "logits/rejected": -2.488478422164917, + "logps/chosen": -164.0843048095703, + "logps/rejected": -252.3877716064453, + "loss": 0.0458, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5239639282226562, + "rewards/margins": 4.917407512664795, + "rewards/rejected": -6.441370964050293, + "step": 1533 + }, + { + "epoch": 2.01, + "learning_rate": 1.3357523092256742e-05, + "logits/chosen": -2.409337282180786, + "logits/rejected": -2.466679334640503, + "logps/chosen": -176.50518798828125, + "logps/rejected": -222.38616943359375, + "loss": 0.0454, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7415494918823242, + "rewards/margins": 5.030533790588379, + "rewards/rejected": -6.772083282470703, + "step": 1534 + }, + { + "epoch": 2.01, + "learning_rate": 1.3325827363309329e-05, + "logits/chosen": -2.305659770965576, + "logits/rejected": -2.3584156036376953, + "logps/chosen": -198.14419555664062, + "logps/rejected": -243.364013671875, + "loss": 0.0515, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8096667528152466, + "rewards/margins": 4.990447998046875, + "rewards/rejected": -6.80011510848999, + "step": 1535 + }, + { + "epoch": 2.01, + "learning_rate": 1.3294155614113673e-05, + "logits/chosen": -2.310314416885376, + "logits/rejected": -2.3078436851501465, + "logps/chosen": -220.3576202392578, + "logps/rejected": -274.2171936035156, + "loss": 0.0191, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.109769582748413, + "rewards/margins": 6.850515365600586, + "rewards/rejected": -7.96028470993042, + "step": 1536 + }, + { + "epoch": 2.01, + "learning_rate": 1.3262507909726251e-05, + "logits/chosen": -2.4167308807373047, + "logits/rejected": -2.5308213233947754, + "logps/chosen": -191.4811248779297, + "logps/rejected": -271.5152893066406, + "loss": 0.0299, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.123037099838257, + "rewards/margins": 5.914687633514404, + "rewards/rejected": -8.037725448608398, + "step": 1537 + }, + { + "epoch": 2.01, + "learning_rate": 1.3230884315154163e-05, + "logits/chosen": -2.252077341079712, + "logits/rejected": -2.312941551208496, + "logps/chosen": -172.56475830078125, + "logps/rejected": -225.84812927246094, + "loss": 0.1099, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0771849155426025, + "rewards/margins": 4.0363078117370605, + "rewards/rejected": -6.113492965698242, + "step": 1538 + }, + { + "epoch": 2.01, + "learning_rate": 1.3199284895355002e-05, + "logits/chosen": -2.3245887756347656, + "logits/rejected": -2.353400468826294, + "logps/chosen": -190.2633514404297, + "logps/rejected": -271.1448059082031, + "loss": 0.019, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6137843132019043, + "rewards/margins": 6.092675685882568, + "rewards/rejected": -7.706459999084473, + "step": 1539 + }, + { + "epoch": 2.02, + "learning_rate": 1.316770971523667e-05, + "logits/chosen": -2.4438893795013428, + "logits/rejected": -2.4827592372894287, + "logps/chosen": -171.02200317382812, + "logps/rejected": -239.64212036132812, + "loss": 0.016, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.544966697692871, + "rewards/margins": 5.675688743591309, + "rewards/rejected": -7.220654487609863, + "step": 1540 + }, + { + "epoch": 2.02, + "learning_rate": 1.3136158839657287e-05, + "logits/chosen": -2.331041097640991, + "logits/rejected": -2.3731489181518555, + "logps/chosen": -182.59205627441406, + "logps/rejected": -245.08189392089844, + "loss": 0.0712, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8168766498565674, + "rewards/margins": 5.248274326324463, + "rewards/rejected": -7.065151214599609, + "step": 1541 + }, + { + "epoch": 2.02, + "learning_rate": 1.3104632333425066e-05, + "logits/chosen": -2.43971586227417, + "logits/rejected": -2.5733742713928223, + "logps/chosen": -234.81520080566406, + "logps/rejected": -323.9015808105469, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.146144390106201, + "rewards/margins": 6.549072742462158, + "rewards/rejected": -8.69521713256836, + "step": 1542 + }, + { + "epoch": 2.02, + "learning_rate": 1.3073130261298167e-05, + "logits/chosen": -2.257291555404663, + "logits/rejected": -2.3152527809143066, + "logps/chosen": -193.2762451171875, + "logps/rejected": -273.153076171875, + "loss": 0.0818, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3066201210021973, + "rewards/margins": 4.769436836242676, + "rewards/rejected": -7.076056957244873, + "step": 1543 + }, + { + "epoch": 2.02, + "learning_rate": 1.3041652687984535e-05, + "logits/chosen": -2.247779130935669, + "logits/rejected": -2.2334134578704834, + "logps/chosen": -157.1170196533203, + "logps/rejected": -193.9186553955078, + "loss": 0.0543, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5675333738327026, + "rewards/margins": 4.581010818481445, + "rewards/rejected": -6.148544788360596, + "step": 1544 + }, + { + "epoch": 2.02, + "learning_rate": 1.3010199678141793e-05, + "logits/chosen": -2.2957258224487305, + "logits/rejected": -2.413360595703125, + "logps/chosen": -175.71417236328125, + "logps/rejected": -247.52357482910156, + "loss": 0.0631, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8569958209991455, + "rewards/margins": 4.924184322357178, + "rewards/rejected": -6.781179904937744, + "step": 1545 + }, + { + "epoch": 2.02, + "learning_rate": 1.297877129637714e-05, + "logits/chosen": -2.3475942611694336, + "logits/rejected": -2.3347256183624268, + "logps/chosen": -184.77877807617188, + "logps/rejected": -234.86886596679688, + "loss": 0.0247, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.200761318206787, + "rewards/margins": 5.272454738616943, + "rewards/rejected": -7.4732160568237305, + "step": 1546 + }, + { + "epoch": 2.02, + "learning_rate": 1.2947367607247168e-05, + "logits/chosen": -2.392920732498169, + "logits/rejected": -2.5223355293273926, + "logps/chosen": -197.4448699951172, + "logps/rejected": -270.2280578613281, + "loss": 0.051, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9359830617904663, + "rewards/margins": 6.967413902282715, + "rewards/rejected": -8.903396606445312, + "step": 1547 + }, + { + "epoch": 2.03, + "learning_rate": 1.2915988675257729e-05, + "logits/chosen": -2.3832263946533203, + "logits/rejected": -2.4544239044189453, + "logps/chosen": -194.5345458984375, + "logps/rejected": -255.82579040527344, + "loss": 0.011, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0281264781951904, + "rewards/margins": 5.703969478607178, + "rewards/rejected": -7.7320966720581055, + "step": 1548 + }, + { + "epoch": 2.03, + "learning_rate": 1.2884634564863853e-05, + "logits/chosen": -2.213028907775879, + "logits/rejected": -2.3441524505615234, + "logps/chosen": -157.87869262695312, + "logps/rejected": -242.41151428222656, + "loss": 0.0496, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8369756937026978, + "rewards/margins": 5.794984340667725, + "rewards/rejected": -7.631959915161133, + "step": 1549 + }, + { + "epoch": 2.03, + "learning_rate": 1.2853305340469592e-05, + "logits/chosen": -2.3266258239746094, + "logits/rejected": -2.393909215927124, + "logps/chosen": -174.3462371826172, + "logps/rejected": -266.98583984375, + "loss": 0.0931, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0885326862335205, + "rewards/margins": 5.259206295013428, + "rewards/rejected": -7.347738742828369, + "step": 1550 + }, + { + "epoch": 2.03, + "learning_rate": 1.2822001066427818e-05, + "logits/chosen": -2.3379714488983154, + "logits/rejected": -2.3444557189941406, + "logps/chosen": -198.1367950439453, + "logps/rejected": -241.43930053710938, + "loss": 0.0597, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.078436851501465, + "rewards/margins": 5.3824849128723145, + "rewards/rejected": -7.460921764373779, + "step": 1551 + }, + { + "epoch": 2.03, + "learning_rate": 1.2790721807040216e-05, + "logits/chosen": -2.3123466968536377, + "logits/rejected": -2.280809164047241, + "logps/chosen": -193.3171844482422, + "logps/rejected": -260.4233703613281, + "loss": 0.0463, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.086230516433716, + "rewards/margins": 6.909906387329102, + "rewards/rejected": -8.996137619018555, + "step": 1552 + }, + { + "epoch": 2.03, + "learning_rate": 1.2759467626557076e-05, + "logits/chosen": -2.2128634452819824, + "logits/rejected": -2.339836597442627, + "logps/chosen": -221.15419006347656, + "logps/rejected": -270.0534973144531, + "loss": 0.0354, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1960716247558594, + "rewards/margins": 4.904435634613037, + "rewards/rejected": -7.1005072593688965, + "step": 1553 + }, + { + "epoch": 2.03, + "learning_rate": 1.2728238589177141e-05, + "logits/chosen": -2.347524642944336, + "logits/rejected": -2.2826714515686035, + "logps/chosen": -203.14903259277344, + "logps/rejected": -260.5144958496094, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7997230291366577, + "rewards/margins": 6.106321334838867, + "rewards/rejected": -7.9060444831848145, + "step": 1554 + }, + { + "epoch": 2.04, + "learning_rate": 1.2697034759047561e-05, + "logits/chosen": -2.300416946411133, + "logits/rejected": -2.3241891860961914, + "logps/chosen": -192.6947479248047, + "logps/rejected": -266.982177734375, + "loss": 0.0546, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4825968742370605, + "rewards/margins": 5.297131061553955, + "rewards/rejected": -7.779728412628174, + "step": 1555 + }, + { + "epoch": 2.04, + "learning_rate": 1.2665856200263649e-05, + "logits/chosen": -2.308593511581421, + "logits/rejected": -2.266723871231079, + "logps/chosen": -194.49267578125, + "logps/rejected": -252.2139434814453, + "loss": 0.0155, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.18524432182312, + "rewards/margins": 5.096360683441162, + "rewards/rejected": -7.281604766845703, + "step": 1556 + }, + { + "epoch": 2.04, + "learning_rate": 1.2634702976868868e-05, + "logits/chosen": -2.491757869720459, + "logits/rejected": -2.445744037628174, + "logps/chosen": -183.77328491210938, + "logps/rejected": -262.5264892578125, + "loss": 0.0119, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.481851816177368, + "rewards/margins": 6.556219100952148, + "rewards/rejected": -9.038070678710938, + "step": 1557 + }, + { + "epoch": 2.04, + "learning_rate": 1.2603575152854582e-05, + "logits/chosen": -2.332422971725464, + "logits/rejected": -2.423205852508545, + "logps/chosen": -251.45309448242188, + "logps/rejected": -316.4215393066406, + "loss": 0.048, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.858559489250183, + "rewards/margins": 6.350773811340332, + "rewards/rejected": -8.209333419799805, + "step": 1558 + }, + { + "epoch": 2.04, + "learning_rate": 1.2572472792160029e-05, + "logits/chosen": -2.329160451889038, + "logits/rejected": -2.4804186820983887, + "logps/chosen": -155.7603302001953, + "logps/rejected": -263.2673645019531, + "loss": 0.0469, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1162149906158447, + "rewards/margins": 5.903825759887695, + "rewards/rejected": -8.020040512084961, + "step": 1559 + }, + { + "epoch": 2.04, + "learning_rate": 1.2541395958672128e-05, + "logits/chosen": -2.424405574798584, + "logits/rejected": -2.4706661701202393, + "logps/chosen": -227.40451049804688, + "logps/rejected": -309.1504821777344, + "loss": 0.008, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2757644653320312, + "rewards/margins": 7.593533515930176, + "rewards/rejected": -9.869297981262207, + "step": 1560 + }, + { + "epoch": 2.04, + "learning_rate": 1.2510344716225353e-05, + "logits/chosen": -2.4445531368255615, + "logits/rejected": -2.458831310272217, + "logps/chosen": -199.35171508789062, + "logps/rejected": -263.1798095703125, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6037559509277344, + "rewards/margins": 6.655534267425537, + "rewards/rejected": -9.259289741516113, + "step": 1561 + }, + { + "epoch": 2.04, + "learning_rate": 1.247931912860161e-05, + "logits/chosen": -2.354203939437866, + "logits/rejected": -2.4863195419311523, + "logps/chosen": -194.77252197265625, + "logps/rejected": -271.1220703125, + "loss": 0.0422, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8537802696228027, + "rewards/margins": 5.8310546875, + "rewards/rejected": -8.684834480285645, + "step": 1562 + }, + { + "epoch": 2.05, + "learning_rate": 1.2448319259530129e-05, + "logits/chosen": -2.2290539741516113, + "logits/rejected": -2.2520558834075928, + "logps/chosen": -208.50885009765625, + "logps/rejected": -308.70037841796875, + "loss": 0.0581, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.532024383544922, + "rewards/margins": 6.592292785644531, + "rewards/rejected": -9.124316215515137, + "step": 1563 + }, + { + "epoch": 2.05, + "learning_rate": 1.2417345172687303e-05, + "logits/chosen": -2.2981693744659424, + "logits/rejected": -2.3568122386932373, + "logps/chosen": -231.53965759277344, + "logps/rejected": -274.71612548828125, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4461472034454346, + "rewards/margins": 5.940674304962158, + "rewards/rejected": -8.386821746826172, + "step": 1564 + }, + { + "epoch": 2.05, + "learning_rate": 1.2386396931696545e-05, + "logits/chosen": -2.3827478885650635, + "logits/rejected": -2.4525115489959717, + "logps/chosen": -200.04852294921875, + "logps/rejected": -302.6248779296875, + "loss": 0.0513, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.342930316925049, + "rewards/margins": 6.387629508972168, + "rewards/rejected": -8.730560302734375, + "step": 1565 + }, + { + "epoch": 2.05, + "learning_rate": 1.235547460012822e-05, + "logits/chosen": -2.2762413024902344, + "logits/rejected": -2.2673659324645996, + "logps/chosen": -156.23342895507812, + "logps/rejected": -223.69552612304688, + "loss": 0.1848, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.6055116653442383, + "rewards/margins": 4.303553104400635, + "rewards/rejected": -7.909064769744873, + "step": 1566 + }, + { + "epoch": 2.05, + "learning_rate": 1.2324578241499434e-05, + "logits/chosen": -2.3084819316864014, + "logits/rejected": -2.2801153659820557, + "logps/chosen": -185.6167755126953, + "logps/rejected": -260.8983154296875, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.420630931854248, + "rewards/margins": 5.761885166168213, + "rewards/rejected": -8.182516098022461, + "step": 1567 + }, + { + "epoch": 2.05, + "learning_rate": 1.2293707919273951e-05, + "logits/chosen": -2.3103654384613037, + "logits/rejected": -2.255885362625122, + "logps/chosen": -234.4677276611328, + "logps/rejected": -281.4022216796875, + "loss": 0.048, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8458998203277588, + "rewards/margins": 6.691174507141113, + "rewards/rejected": -8.537074089050293, + "step": 1568 + }, + { + "epoch": 2.05, + "learning_rate": 1.2262863696862067e-05, + "logits/chosen": -2.1341712474823, + "logits/rejected": -2.179393768310547, + "logps/chosen": -171.52239990234375, + "logps/rejected": -255.7606201171875, + "loss": 0.0893, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2721705436706543, + "rewards/margins": 6.434325218200684, + "rewards/rejected": -8.706496238708496, + "step": 1569 + }, + { + "epoch": 2.05, + "learning_rate": 1.223204563762047e-05, + "logits/chosen": -2.3797271251678467, + "logits/rejected": -2.392305374145508, + "logps/chosen": -201.71975708007812, + "logps/rejected": -281.9989013671875, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8351311683654785, + "rewards/margins": 6.801389694213867, + "rewards/rejected": -9.636520385742188, + "step": 1570 + }, + { + "epoch": 2.06, + "learning_rate": 1.2201253804852081e-05, + "logits/chosen": -2.2498581409454346, + "logits/rejected": -2.273632049560547, + "logps/chosen": -177.27818298339844, + "logps/rejected": -249.8491668701172, + "loss": 0.0353, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8289601802825928, + "rewards/margins": 6.3932414054870605, + "rewards/rejected": -9.222201347351074, + "step": 1571 + }, + { + "epoch": 2.06, + "learning_rate": 1.2170488261805978e-05, + "logits/chosen": -2.5204391479492188, + "logits/rejected": -2.470536231994629, + "logps/chosen": -200.5594024658203, + "logps/rejected": -254.46780395507812, + "loss": 0.0186, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.601285219192505, + "rewards/margins": 6.231690406799316, + "rewards/rejected": -8.832975387573242, + "step": 1572 + }, + { + "epoch": 2.06, + "learning_rate": 1.2139749071677215e-05, + "logits/chosen": -2.1959707736968994, + "logits/rejected": -2.158179759979248, + "logps/chosen": -163.62408447265625, + "logps/rejected": -258.8075866699219, + "loss": 0.0524, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2156784534454346, + "rewards/margins": 5.738166332244873, + "rewards/rejected": -7.953845024108887, + "step": 1573 + }, + { + "epoch": 2.06, + "learning_rate": 1.2109036297606733e-05, + "logits/chosen": -2.246157646179199, + "logits/rejected": -2.2448368072509766, + "logps/chosen": -241.0048065185547, + "logps/rejected": -304.39208984375, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0603792667388916, + "rewards/margins": 5.706993103027344, + "rewards/rejected": -8.767373085021973, + "step": 1574 + }, + { + "epoch": 2.06, + "learning_rate": 1.207835000268119e-05, + "logits/chosen": -2.292264223098755, + "logits/rejected": -2.406580924987793, + "logps/chosen": -194.7406768798828, + "logps/rejected": -308.48004150390625, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3975987434387207, + "rewards/margins": 6.630142688751221, + "rewards/rejected": -10.027740478515625, + "step": 1575 + }, + { + "epoch": 2.06, + "learning_rate": 1.2047690249932881e-05, + "logits/chosen": -2.4680731296539307, + "logits/rejected": -2.469062089920044, + "logps/chosen": -215.36041259765625, + "logps/rejected": -289.2483215332031, + "loss": 0.0494, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0653109550476074, + "rewards/margins": 6.796317100524902, + "rewards/rejected": -8.861627578735352, + "step": 1576 + }, + { + "epoch": 2.06, + "learning_rate": 1.2017057102339579e-05, + "logits/chosen": -2.3034727573394775, + "logits/rejected": -2.4027416706085205, + "logps/chosen": -224.2242431640625, + "logps/rejected": -324.6923828125, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.576077461242676, + "rewards/margins": 6.626518726348877, + "rewards/rejected": -9.202596664428711, + "step": 1577 + }, + { + "epoch": 2.07, + "learning_rate": 1.198645062282436e-05, + "logits/chosen": -2.3396496772766113, + "logits/rejected": -2.415276288986206, + "logps/chosen": -167.16355895996094, + "logps/rejected": -240.4926300048828, + "loss": 0.0579, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.895721435546875, + "rewards/margins": 5.79655647277832, + "rewards/rejected": -8.692277908325195, + "step": 1578 + }, + { + "epoch": 2.07, + "learning_rate": 1.1955870874255581e-05, + "logits/chosen": -2.207409381866455, + "logits/rejected": -2.326094627380371, + "logps/chosen": -215.62725830078125, + "logps/rejected": -295.116943359375, + "loss": 0.0781, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1009631156921387, + "rewards/margins": 4.741249084472656, + "rewards/rejected": -7.842212200164795, + "step": 1579 + }, + { + "epoch": 2.07, + "learning_rate": 1.1925317919446674e-05, + "logits/chosen": -2.258795976638794, + "logits/rejected": -2.319410800933838, + "logps/chosen": -195.01417541503906, + "logps/rejected": -242.4963836669922, + "loss": 0.1004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8233892917633057, + "rewards/margins": 4.003058433532715, + "rewards/rejected": -6.826447486877441, + "step": 1580 + }, + { + "epoch": 2.07, + "learning_rate": 1.189479182115601e-05, + "logits/chosen": -2.286520004272461, + "logits/rejected": -2.295222043991089, + "logps/chosen": -197.43077087402344, + "logps/rejected": -239.78627014160156, + "loss": 0.0341, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.990055561065674, + "rewards/margins": 5.583894729614258, + "rewards/rejected": -8.57395076751709, + "step": 1581 + }, + { + "epoch": 2.07, + "learning_rate": 1.1864292642086821e-05, + "logits/chosen": -2.42768931388855, + "logits/rejected": -2.488621711730957, + "logps/chosen": -197.78765869140625, + "logps/rejected": -256.57208251953125, + "loss": 0.0932, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.106519937515259, + "rewards/margins": 4.9703850746154785, + "rewards/rejected": -8.076905250549316, + "step": 1582 + }, + { + "epoch": 2.07, + "learning_rate": 1.1833820444887047e-05, + "logits/chosen": -2.25589919090271, + "logits/rejected": -2.3008480072021484, + "logps/chosen": -220.13059997558594, + "logps/rejected": -302.11822509765625, + "loss": 0.0179, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3662121295928955, + "rewards/margins": 6.320446968078613, + "rewards/rejected": -9.68665885925293, + "step": 1583 + }, + { + "epoch": 2.07, + "learning_rate": 1.1803375292149188e-05, + "logits/chosen": -2.213704824447632, + "logits/rejected": -2.3366587162017822, + "logps/chosen": -199.03379821777344, + "logps/rejected": -292.4614562988281, + "loss": 0.0533, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4834330081939697, + "rewards/margins": 5.422192573547363, + "rewards/rejected": -7.905625343322754, + "step": 1584 + }, + { + "epoch": 2.07, + "learning_rate": 1.1772957246410182e-05, + "logits/chosen": -2.2054905891418457, + "logits/rejected": -2.2827553749084473, + "logps/chosen": -164.93017578125, + "logps/rejected": -229.33767700195312, + "loss": 0.1083, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.6794934272766113, + "rewards/margins": 5.130819797515869, + "rewards/rejected": -7.8103132247924805, + "step": 1585 + }, + { + "epoch": 2.08, + "learning_rate": 1.174256637015132e-05, + "logits/chosen": -2.324032783508301, + "logits/rejected": -2.2129745483398438, + "logps/chosen": -201.4606170654297, + "logps/rejected": -243.27915954589844, + "loss": 0.0933, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.299257755279541, + "rewards/margins": 5.95577335357666, + "rewards/rejected": -8.25503158569336, + "step": 1586 + }, + { + "epoch": 2.08, + "learning_rate": 1.1712202725798072e-05, + "logits/chosen": -2.212672233581543, + "logits/rejected": -2.254991292953491, + "logps/chosen": -169.02316284179688, + "logps/rejected": -248.58642578125, + "loss": 0.0562, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.544949531555176, + "rewards/margins": 5.900118827819824, + "rewards/rejected": -8.445068359375, + "step": 1587 + }, + { + "epoch": 2.08, + "learning_rate": 1.1681866375719962e-05, + "logits/chosen": -2.2961604595184326, + "logits/rejected": -2.3075456619262695, + "logps/chosen": -231.47576904296875, + "logps/rejected": -278.3673400878906, + "loss": 0.0149, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.529031991958618, + "rewards/margins": 6.442896842956543, + "rewards/rejected": -9.971928596496582, + "step": 1588 + }, + { + "epoch": 2.08, + "learning_rate": 1.1651557382230444e-05, + "logits/chosen": -2.2988386154174805, + "logits/rejected": -2.3502931594848633, + "logps/chosen": -192.17660522460938, + "logps/rejected": -266.4827880859375, + "loss": 0.028, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.932581663131714, + "rewards/margins": 6.944507598876953, + "rewards/rejected": -9.877090454101562, + "step": 1589 + }, + { + "epoch": 2.08, + "learning_rate": 1.1621275807586799e-05, + "logits/chosen": -2.121675968170166, + "logits/rejected": -2.3211705684661865, + "logps/chosen": -154.65994262695312, + "logps/rejected": -319.36865234375, + "loss": 0.052, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.8421897888183594, + "rewards/margins": 6.399538516998291, + "rewards/rejected": -9.241728782653809, + "step": 1590 + }, + { + "epoch": 2.08, + "learning_rate": 1.1591021713989986e-05, + "logits/chosen": -2.15118145942688, + "logits/rejected": -2.205139636993408, + "logps/chosen": -203.25672912597656, + "logps/rejected": -279.9464111328125, + "loss": 0.0128, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.680593490600586, + "rewards/margins": 6.324219226837158, + "rewards/rejected": -10.004813194274902, + "step": 1591 + }, + { + "epoch": 2.08, + "learning_rate": 1.1560795163584492e-05, + "logits/chosen": -2.1271445751190186, + "logits/rejected": -2.2297892570495605, + "logps/chosen": -191.21505737304688, + "logps/rejected": -252.1796875, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4730653762817383, + "rewards/margins": 6.288280487060547, + "rewards/rejected": -9.761346817016602, + "step": 1592 + }, + { + "epoch": 2.09, + "learning_rate": 1.153059621845825e-05, + "logits/chosen": -2.1384806632995605, + "logits/rejected": -2.1799659729003906, + "logps/chosen": -209.28909301757812, + "logps/rejected": -294.0850830078125, + "loss": 0.0336, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.143935680389404, + "rewards/margins": 5.88149356842041, + "rewards/rejected": -10.025428771972656, + "step": 1593 + }, + { + "epoch": 2.09, + "learning_rate": 1.1500424940642507e-05, + "logits/chosen": -2.103912830352783, + "logits/rejected": -2.061676025390625, + "logps/chosen": -182.24667358398438, + "logps/rejected": -242.8250274658203, + "loss": 0.1188, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.8287858963012695, + "rewards/margins": 5.604012966156006, + "rewards/rejected": -8.432799339294434, + "step": 1594 + }, + { + "epoch": 2.09, + "learning_rate": 1.1470281392111611e-05, + "logits/chosen": -2.3141794204711914, + "logits/rejected": -2.380267381668091, + "logps/chosen": -226.50503540039062, + "logps/rejected": -276.2835388183594, + "loss": 0.0099, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4971554279327393, + "rewards/margins": 6.458747863769531, + "rewards/rejected": -9.955904006958008, + "step": 1595 + }, + { + "epoch": 2.09, + "learning_rate": 1.144016563478302e-05, + "logits/chosen": -2.141918182373047, + "logits/rejected": -2.029339075088501, + "logps/chosen": -249.36764526367188, + "logps/rejected": -325.52423095703125, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7844009399414062, + "rewards/margins": 7.229696750640869, + "rewards/rejected": -11.014098167419434, + "step": 1596 + }, + { + "epoch": 2.09, + "learning_rate": 1.1410077730517089e-05, + "logits/chosen": -2.190662384033203, + "logits/rejected": -2.1414177417755127, + "logps/chosen": -211.12655639648438, + "logps/rejected": -293.4756774902344, + "loss": 0.0462, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3928468227386475, + "rewards/margins": 7.117488384246826, + "rewards/rejected": -9.510336875915527, + "step": 1597 + }, + { + "epoch": 2.09, + "learning_rate": 1.1380017741116933e-05, + "logits/chosen": -2.2064993381500244, + "logits/rejected": -2.184873580932617, + "logps/chosen": -212.8834228515625, + "logps/rejected": -295.10968017578125, + "loss": 0.0461, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.782785654067993, + "rewards/margins": 7.301713943481445, + "rewards/rejected": -11.084498405456543, + "step": 1598 + }, + { + "epoch": 2.09, + "learning_rate": 1.134998572832837e-05, + "logits/chosen": -2.25199818611145, + "logits/rejected": -2.2279510498046875, + "logps/chosen": -197.3810272216797, + "logps/rejected": -248.04248046875, + "loss": 0.0491, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.389653205871582, + "rewards/margins": 6.892034530639648, + "rewards/rejected": -9.281686782836914, + "step": 1599 + }, + { + "epoch": 2.09, + "learning_rate": 1.1319981753839709e-05, + "logits/chosen": -2.1825180053710938, + "logits/rejected": -2.205697774887085, + "logps/chosen": -234.11672973632812, + "logps/rejected": -296.94512939453125, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8696060180664062, + "rewards/margins": 7.079179763793945, + "rewards/rejected": -10.948786735534668, + "step": 1600 + }, + { + "epoch": 2.1, + "learning_rate": 1.129000587928171e-05, + "logits/chosen": -1.7817847728729248, + "logits/rejected": -1.6960285902023315, + "logps/chosen": -232.90301513671875, + "logps/rejected": -320.638916015625, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.071061611175537, + "rewards/margins": 7.4023966789245605, + "rewards/rejected": -11.473458290100098, + "step": 1601 + }, + { + "epoch": 2.1, + "learning_rate": 1.1260058166227364e-05, + "logits/chosen": -2.3523454666137695, + "logits/rejected": -2.4064064025878906, + "logps/chosen": -222.3098907470703, + "logps/rejected": -267.5356750488281, + "loss": 0.0324, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.040623188018799, + "rewards/margins": 5.557378768920898, + "rewards/rejected": -9.598002433776855, + "step": 1602 + }, + { + "epoch": 2.1, + "learning_rate": 1.1230138676191857e-05, + "logits/chosen": -2.1782217025756836, + "logits/rejected": -2.2649686336517334, + "logps/chosen": -276.44940185546875, + "logps/rejected": -351.1517639160156, + "loss": 0.0545, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.3658041954040527, + "rewards/margins": 6.645700931549072, + "rewards/rejected": -10.011505126953125, + "step": 1603 + }, + { + "epoch": 2.1, + "learning_rate": 1.1200247470632393e-05, + "logits/chosen": -2.1321499347686768, + "logits/rejected": -2.137124538421631, + "logps/chosen": -232.744140625, + "logps/rejected": -296.22705078125, + "loss": 0.0569, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.346226453781128, + "rewards/margins": 6.62972354888916, + "rewards/rejected": -9.97594928741455, + "step": 1604 + }, + { + "epoch": 2.1, + "learning_rate": 1.1170384610948065e-05, + "logits/chosen": -2.153550863265991, + "logits/rejected": -2.22656512260437, + "logps/chosen": -206.77157592773438, + "logps/rejected": -298.4268798828125, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.700435161590576, + "rewards/margins": 7.368077754974365, + "rewards/rejected": -11.068513870239258, + "step": 1605 + }, + { + "epoch": 2.1, + "learning_rate": 1.1140550158479737e-05, + "logits/chosen": -1.989073634147644, + "logits/rejected": -2.158825635910034, + "logps/chosen": -188.94114685058594, + "logps/rejected": -244.51181030273438, + "loss": 0.1315, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.8820228576660156, + "rewards/margins": 5.796136379241943, + "rewards/rejected": -9.6781587600708, + "step": 1606 + }, + { + "epoch": 2.1, + "learning_rate": 1.1110744174509952e-05, + "logits/chosen": -2.1463160514831543, + "logits/rejected": -2.2772376537323, + "logps/chosen": -216.15863037109375, + "logps/rejected": -324.2829284667969, + "loss": 0.0448, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.430356025695801, + "rewards/margins": 7.027136325836182, + "rewards/rejected": -10.45749282836914, + "step": 1607 + }, + { + "epoch": 2.1, + "learning_rate": 1.1080966720262737e-05, + "logits/chosen": -2.1895132064819336, + "logits/rejected": -2.404956579208374, + "logps/chosen": -193.4120330810547, + "logps/rejected": -320.78546142578125, + "loss": 0.0521, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.4045162200927734, + "rewards/margins": 5.620763778686523, + "rewards/rejected": -9.02527904510498, + "step": 1608 + }, + { + "epoch": 2.11, + "learning_rate": 1.1051217856903551e-05, + "logits/chosen": -2.132962942123413, + "logits/rejected": -2.190340042114258, + "logps/chosen": -187.3995361328125, + "logps/rejected": -294.4993896484375, + "loss": 0.0487, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.9074325561523438, + "rewards/margins": 7.05354642868042, + "rewards/rejected": -9.960978507995605, + "step": 1609 + }, + { + "epoch": 2.11, + "learning_rate": 1.1021497645539115e-05, + "logits/chosen": -2.2757067680358887, + "logits/rejected": -2.3408169746398926, + "logps/chosen": -229.71934509277344, + "logps/rejected": -299.71826171875, + "loss": 0.0875, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.7088205814361572, + "rewards/margins": 7.171504020690918, + "rewards/rejected": -10.880324363708496, + "step": 1610 + }, + { + "epoch": 2.11, + "learning_rate": 1.0991806147217282e-05, + "logits/chosen": -2.2038729190826416, + "logits/rejected": -2.290703058242798, + "logps/chosen": -212.97055053710938, + "logps/rejected": -265.35845947265625, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1300806999206543, + "rewards/margins": 5.740922927856445, + "rewards/rejected": -8.871004104614258, + "step": 1611 + }, + { + "epoch": 2.11, + "learning_rate": 1.0962143422926929e-05, + "logits/chosen": -2.2745604515075684, + "logits/rejected": -2.3006949424743652, + "logps/chosen": -215.57911682128906, + "logps/rejected": -291.5915832519531, + "loss": 0.0513, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.8373029232025146, + "rewards/margins": 6.475310802459717, + "rewards/rejected": -10.312614440917969, + "step": 1612 + }, + { + "epoch": 2.11, + "learning_rate": 1.0932509533597843e-05, + "logits/chosen": -2.237213373184204, + "logits/rejected": -2.306109666824341, + "logps/chosen": -207.54611206054688, + "logps/rejected": -297.6869201660156, + "loss": 0.0531, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6361207962036133, + "rewards/margins": 6.58629846572876, + "rewards/rejected": -10.222419738769531, + "step": 1613 + }, + { + "epoch": 2.11, + "learning_rate": 1.0902904540100587e-05, + "logits/chosen": -2.276996612548828, + "logits/rejected": -2.2169079780578613, + "logps/chosen": -254.11712646484375, + "logps/rejected": -297.9110412597656, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4462196826934814, + "rewards/margins": 6.50248384475708, + "rewards/rejected": -9.94870376586914, + "step": 1614 + }, + { + "epoch": 2.11, + "learning_rate": 1.0873328503246336e-05, + "logits/chosen": -2.078299045562744, + "logits/rejected": -2.055584192276001, + "logps/chosen": -181.3302001953125, + "logps/rejected": -213.24034118652344, + "loss": 0.1161, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.237797260284424, + "rewards/margins": 4.867356777191162, + "rewards/rejected": -8.105154037475586, + "step": 1615 + }, + { + "epoch": 2.12, + "learning_rate": 1.0843781483786823e-05, + "logits/chosen": -2.368086576461792, + "logits/rejected": -2.4271674156188965, + "logps/chosen": -268.1463623046875, + "logps/rejected": -323.628662109375, + "loss": 0.0086, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.088112831115723, + "rewards/margins": 6.905062675476074, + "rewards/rejected": -10.993173599243164, + "step": 1616 + }, + { + "epoch": 2.12, + "learning_rate": 1.081426354241414e-05, + "logits/chosen": -2.116400718688965, + "logits/rejected": -2.2377758026123047, + "logps/chosen": -177.73162841796875, + "logps/rejected": -271.86968994140625, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6129679679870605, + "rewards/margins": 7.3211283683776855, + "rewards/rejected": -10.934096336364746, + "step": 1617 + }, + { + "epoch": 2.12, + "learning_rate": 1.0784774739760694e-05, + "logits/chosen": -2.188504457473755, + "logits/rejected": -2.2299485206604004, + "logps/chosen": -234.41473388671875, + "logps/rejected": -300.1286315917969, + "loss": 0.0469, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9869184494018555, + "rewards/margins": 5.850296497344971, + "rewards/rejected": -9.837214469909668, + "step": 1618 + }, + { + "epoch": 2.12, + "learning_rate": 1.075531513639899e-05, + "logits/chosen": -2.123745918273926, + "logits/rejected": -2.083372116088867, + "logps/chosen": -199.02845764160156, + "logps/rejected": -263.86773681640625, + "loss": 0.0527, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.8953633308410645, + "rewards/margins": 6.078271865844727, + "rewards/rejected": -8.973634719848633, + "step": 1619 + }, + { + "epoch": 2.12, + "learning_rate": 1.0725884792841598e-05, + "logits/chosen": -2.271965742111206, + "logits/rejected": -2.390068292617798, + "logps/chosen": -209.24119567871094, + "logps/rejected": -301.5091247558594, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8023877143859863, + "rewards/margins": 7.832930564880371, + "rewards/rejected": -11.635318756103516, + "step": 1620 + }, + { + "epoch": 2.12, + "learning_rate": 1.0696483769540974e-05, + "logits/chosen": -2.243886709213257, + "logits/rejected": -2.3197927474975586, + "logps/chosen": -200.3712158203125, + "logps/rejected": -293.0459289550781, + "loss": 0.0499, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.732126235961914, + "rewards/margins": 5.311313629150391, + "rewards/rejected": -9.043439865112305, + "step": 1621 + }, + { + "epoch": 2.12, + "learning_rate": 1.0667112126889314e-05, + "logits/chosen": -2.1201698780059814, + "logits/rejected": -2.174553871154785, + "logps/chosen": -221.56431579589844, + "logps/rejected": -324.44281005859375, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1623072624206543, + "rewards/margins": 6.70387601852417, + "rewards/rejected": -9.86618423461914, + "step": 1622 + }, + { + "epoch": 2.12, + "learning_rate": 1.0637769925218502e-05, + "logits/chosen": -1.9317796230316162, + "logits/rejected": -2.057620048522949, + "logps/chosen": -198.12396240234375, + "logps/rejected": -278.1123046875, + "loss": 0.0477, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.8445963859558105, + "rewards/margins": 6.424114227294922, + "rewards/rejected": -9.26871109008789, + "step": 1623 + }, + { + "epoch": 2.13, + "learning_rate": 1.0608457224799953e-05, + "logits/chosen": -2.1645169258117676, + "logits/rejected": -2.0335216522216797, + "logps/chosen": -235.41375732421875, + "logps/rejected": -255.96315002441406, + "loss": 0.0234, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2206039428710938, + "rewards/margins": 6.013018608093262, + "rewards/rejected": -9.233621597290039, + "step": 1624 + }, + { + "epoch": 2.13, + "learning_rate": 1.0579174085844442e-05, + "logits/chosen": -2.0966222286224365, + "logits/rejected": -2.2772252559661865, + "logps/chosen": -204.748046875, + "logps/rejected": -297.9283447265625, + "loss": 0.0464, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.924081802368164, + "rewards/margins": 5.931114673614502, + "rewards/rejected": -9.855195999145508, + "step": 1625 + }, + { + "epoch": 2.13, + "learning_rate": 1.0549920568502065e-05, + "logits/chosen": -2.1628050804138184, + "logits/rejected": -2.1597471237182617, + "logps/chosen": -216.96873474121094, + "logps/rejected": -281.53814697265625, + "loss": 0.0411, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.092637538909912, + "rewards/margins": 7.373514652252197, + "rewards/rejected": -10.46615219116211, + "step": 1626 + }, + { + "epoch": 2.13, + "learning_rate": 1.0520696732862057e-05, + "logits/chosen": -2.3299717903137207, + "logits/rejected": -2.4439003467559814, + "logps/chosen": -187.8339080810547, + "logps/rejected": -300.79541015625, + "loss": 0.0166, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.266471862792969, + "rewards/margins": 7.433576583862305, + "rewards/rejected": -11.700048446655273, + "step": 1627 + }, + { + "epoch": 2.13, + "learning_rate": 1.0491502638952675e-05, + "logits/chosen": -2.230226755142212, + "logits/rejected": -2.3637144565582275, + "logps/chosen": -217.8379669189453, + "logps/rejected": -293.7845764160156, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.065153121948242, + "rewards/margins": 6.4123148918151855, + "rewards/rejected": -10.477468490600586, + "step": 1628 + }, + { + "epoch": 2.13, + "learning_rate": 1.0462338346741086e-05, + "logits/chosen": -2.2081868648529053, + "logits/rejected": -2.2785487174987793, + "logps/chosen": -274.3138122558594, + "logps/rejected": -344.9634704589844, + "loss": 0.1104, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.9868292808532715, + "rewards/margins": 5.559194564819336, + "rewards/rejected": -9.546022415161133, + "step": 1629 + }, + { + "epoch": 2.13, + "learning_rate": 1.0433203916133252e-05, + "logits/chosen": -2.169698715209961, + "logits/rejected": -2.371366262435913, + "logps/chosen": -190.68496704101562, + "logps/rejected": -273.5775146484375, + "loss": 0.0097, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.32405686378479, + "rewards/margins": 7.341072082519531, + "rewards/rejected": -10.665129661560059, + "step": 1630 + }, + { + "epoch": 2.13, + "learning_rate": 1.0404099406973803e-05, + "logits/chosen": -2.2203657627105713, + "logits/rejected": -2.3042683601379395, + "logps/chosen": -188.48623657226562, + "logps/rejected": -267.5204772949219, + "loss": 0.0399, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2512853145599365, + "rewards/margins": 5.627998352050781, + "rewards/rejected": -8.879283905029297, + "step": 1631 + }, + { + "epoch": 2.14, + "learning_rate": 1.0375024879045889e-05, + "logits/chosen": -2.165238380432129, + "logits/rejected": -2.2101080417633057, + "logps/chosen": -216.67477416992188, + "logps/rejected": -320.3745422363281, + "loss": 0.0447, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.366825580596924, + "rewards/margins": 8.279644012451172, + "rewards/rejected": -11.646470069885254, + "step": 1632 + }, + { + "epoch": 2.14, + "learning_rate": 1.0345980392071073e-05, + "logits/chosen": -2.219921350479126, + "logits/rejected": -2.1141204833984375, + "logps/chosen": -207.9088134765625, + "logps/rejected": -249.02545166015625, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7133326530456543, + "rewards/margins": 6.504356861114502, + "rewards/rejected": -9.217689514160156, + "step": 1633 + }, + { + "epoch": 2.14, + "learning_rate": 1.031696600570923e-05, + "logits/chosen": -2.267820119857788, + "logits/rejected": -2.3031530380249023, + "logps/chosen": -214.98089599609375, + "logps/rejected": -302.63507080078125, + "loss": 0.0895, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.183248519897461, + "rewards/margins": 6.360177993774414, + "rewards/rejected": -10.543425559997559, + "step": 1634 + }, + { + "epoch": 2.14, + "learning_rate": 1.0287981779558411e-05, + "logits/chosen": -2.1929433345794678, + "logits/rejected": -2.2143869400024414, + "logps/chosen": -194.69989013671875, + "logps/rejected": -271.4024353027344, + "loss": 0.0707, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.120606899261475, + "rewards/margins": 6.8030266761779785, + "rewards/rejected": -10.923632621765137, + "step": 1635 + }, + { + "epoch": 2.14, + "learning_rate": 1.0259027773154681e-05, + "logits/chosen": -2.0816688537597656, + "logits/rejected": -2.1673309803009033, + "logps/chosen": -208.89109802246094, + "logps/rejected": -281.7503967285156, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.392482280731201, + "rewards/margins": 7.145538806915283, + "rewards/rejected": -10.538021087646484, + "step": 1636 + }, + { + "epoch": 2.14, + "learning_rate": 1.023010404597206e-05, + "logits/chosen": -2.0719008445739746, + "logits/rejected": -2.143127918243408, + "logps/chosen": -206.9235076904297, + "logps/rejected": -290.1211853027344, + "loss": 0.0491, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.069438934326172, + "rewards/margins": 6.125019073486328, + "rewards/rejected": -10.1944580078125, + "step": 1637 + }, + { + "epoch": 2.14, + "learning_rate": 1.0201210657422386e-05, + "logits/chosen": -2.0575828552246094, + "logits/rejected": -2.1513028144836426, + "logps/chosen": -172.28555297851562, + "logps/rejected": -283.77423095703125, + "loss": 0.045, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.409141778945923, + "rewards/margins": 7.430361747741699, + "rewards/rejected": -10.839503288269043, + "step": 1638 + }, + { + "epoch": 2.15, + "learning_rate": 1.0172347666855117e-05, + "logits/chosen": -2.294290065765381, + "logits/rejected": -2.2601540088653564, + "logps/chosen": -216.41030883789062, + "logps/rejected": -288.5289306640625, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3232598304748535, + "rewards/margins": 7.223988056182861, + "rewards/rejected": -10.547247886657715, + "step": 1639 + }, + { + "epoch": 2.15, + "learning_rate": 1.0143515133557333e-05, + "logits/chosen": -2.066789388656616, + "logits/rejected": -2.2536721229553223, + "logps/chosen": -291.7214660644531, + "logps/rejected": -376.40948486328125, + "loss": 0.0151, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.196288824081421, + "rewards/margins": 8.27435302734375, + "rewards/rejected": -11.470641136169434, + "step": 1640 + }, + { + "epoch": 2.15, + "learning_rate": 1.0114713116753533e-05, + "logits/chosen": -2.3477981090545654, + "logits/rejected": -2.3606629371643066, + "logps/chosen": -248.16506958007812, + "logps/rejected": -356.3658142089844, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3419442176818848, + "rewards/margins": 7.31946325302124, + "rewards/rejected": -10.661409378051758, + "step": 1641 + }, + { + "epoch": 2.15, + "learning_rate": 1.0085941675605517e-05, + "logits/chosen": -2.112485408782959, + "logits/rejected": -2.137420177459717, + "logps/chosen": -246.53201293945312, + "logps/rejected": -299.3148498535156, + "loss": 0.1085, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.04219913482666, + "rewards/margins": 5.816624641418457, + "rewards/rejected": -9.858823776245117, + "step": 1642 + }, + { + "epoch": 2.15, + "learning_rate": 1.0057200869212308e-05, + "logits/chosen": -2.3714053630828857, + "logits/rejected": -2.4155068397521973, + "logps/chosen": -202.87295532226562, + "logps/rejected": -294.2357177734375, + "loss": 0.0471, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.238739013671875, + "rewards/margins": 5.89993143081665, + "rewards/rejected": -9.138670921325684, + "step": 1643 + }, + { + "epoch": 2.15, + "learning_rate": 1.0028490756609971e-05, + "logits/chosen": -2.231926918029785, + "logits/rejected": -2.2947537899017334, + "logps/chosen": -202.70620727539062, + "logps/rejected": -305.67767333984375, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.23713755607605, + "rewards/margins": 8.05178165435791, + "rewards/rejected": -11.288919448852539, + "step": 1644 + }, + { + "epoch": 2.15, + "learning_rate": 9.999811396771554e-06, + "logits/chosen": -2.1920087337493896, + "logits/rejected": -2.270575761795044, + "logps/chosen": -248.9163818359375, + "logps/rejected": -301.1416015625, + "loss": 0.0262, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.135695934295654, + "rewards/margins": 6.194760322570801, + "rewards/rejected": -10.330455780029297, + "step": 1645 + }, + { + "epoch": 2.15, + "learning_rate": 9.971162848606907e-06, + "logits/chosen": -2.1534903049468994, + "logits/rejected": -2.2489120960235596, + "logps/chosen": -199.48480224609375, + "logps/rejected": -292.18505859375, + "loss": 0.0122, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2431271076202393, + "rewards/margins": 7.970319747924805, + "rewards/rejected": -11.213445663452148, + "step": 1646 + }, + { + "epoch": 2.16, + "learning_rate": 9.942545170962611e-06, + "logits/chosen": -2.2566299438476562, + "logits/rejected": -2.27413010597229, + "logps/chosen": -203.12030029296875, + "logps/rejected": -314.768798828125, + "loss": 0.0168, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.409508466720581, + "rewards/margins": 7.119698524475098, + "rewards/rejected": -10.529206275939941, + "step": 1647 + }, + { + "epoch": 2.16, + "learning_rate": 9.913958422621845e-06, + "logits/chosen": -2.0916855335235596, + "logits/rejected": -2.169645309448242, + "logps/chosen": -197.75099182128906, + "logps/rejected": -278.7314453125, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1996688842773438, + "rewards/margins": 7.370741844177246, + "rewards/rejected": -10.570409774780273, + "step": 1648 + }, + { + "epoch": 2.16, + "learning_rate": 9.885402662304222e-06, + "logits/chosen": -2.326343536376953, + "logits/rejected": -2.3084158897399902, + "logps/chosen": -199.17776489257812, + "logps/rejected": -262.3792419433594, + "loss": 0.0592, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2895028591156006, + "rewards/margins": 6.672173500061035, + "rewards/rejected": -9.961676597595215, + "step": 1649 + }, + { + "epoch": 2.16, + "learning_rate": 9.856877948665724e-06, + "logits/chosen": -1.9094470739364624, + "logits/rejected": -2.026721239089966, + "logps/chosen": -204.00096130371094, + "logps/rejected": -311.9467468261719, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7700963020324707, + "rewards/margins": 7.001936912536621, + "rewards/rejected": -10.772032737731934, + "step": 1650 + }, + { + "epoch": 2.16, + "learning_rate": 9.828384340298572e-06, + "logits/chosen": -2.1582090854644775, + "logits/rejected": -2.193516969680786, + "logps/chosen": -197.71864318847656, + "logps/rejected": -262.1219177246094, + "loss": 0.0904, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.4774627685546875, + "rewards/margins": 5.872793197631836, + "rewards/rejected": -9.350255966186523, + "step": 1651 + }, + { + "epoch": 2.16, + "learning_rate": 9.799921895731062e-06, + "logits/chosen": -1.9444248676300049, + "logits/rejected": -2.0439116954803467, + "logps/chosen": -197.35269165039062, + "logps/rejected": -282.3183898925781, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.436498641967773, + "rewards/margins": 6.666257381439209, + "rewards/rejected": -11.102755546569824, + "step": 1652 + }, + { + "epoch": 2.16, + "learning_rate": 9.771490673427508e-06, + "logits/chosen": -2.2924294471740723, + "logits/rejected": -2.2707741260528564, + "logps/chosen": -257.9192199707031, + "logps/rejected": -323.6116943359375, + "loss": 0.0107, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.3676676750183105, + "rewards/margins": 7.392003536224365, + "rewards/rejected": -11.759672164916992, + "step": 1653 + }, + { + "epoch": 2.16, + "learning_rate": 9.743090731788088e-06, + "logits/chosen": -2.143031358718872, + "logits/rejected": -2.041055679321289, + "logps/chosen": -220.89820861816406, + "logps/rejected": -307.67462158203125, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4047765731811523, + "rewards/margins": 7.765774726867676, + "rewards/rejected": -11.170552253723145, + "step": 1654 + }, + { + "epoch": 2.17, + "learning_rate": 9.714722129148705e-06, + "logits/chosen": -2.2439427375793457, + "logits/rejected": -2.2781338691711426, + "logps/chosen": -227.20025634765625, + "logps/rejected": -277.2979736328125, + "loss": 0.0533, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.589806079864502, + "rewards/margins": 6.240189075469971, + "rewards/rejected": -9.829996109008789, + "step": 1655 + }, + { + "epoch": 2.17, + "learning_rate": 9.686384923780894e-06, + "logits/chosen": -2.206939220428467, + "logits/rejected": -2.11808443069458, + "logps/chosen": -247.38851928710938, + "logps/rejected": -271.9498596191406, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1840476989746094, + "rewards/margins": 6.11014461517334, + "rewards/rejected": -9.294191360473633, + "step": 1656 + }, + { + "epoch": 2.17, + "learning_rate": 9.658079173891718e-06, + "logits/chosen": -2.1608505249023438, + "logits/rejected": -2.173807144165039, + "logps/chosen": -213.724853515625, + "logps/rejected": -266.6632080078125, + "loss": 0.0544, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.454838991165161, + "rewards/margins": 6.099588394165039, + "rewards/rejected": -9.554428100585938, + "step": 1657 + }, + { + "epoch": 2.17, + "learning_rate": 9.62980493762362e-06, + "logits/chosen": -2.188878059387207, + "logits/rejected": -2.2450056076049805, + "logps/chosen": -203.64845275878906, + "logps/rejected": -275.3282775878906, + "loss": 0.0197, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4441545009613037, + "rewards/margins": 6.198050022125244, + "rewards/rejected": -9.642206192016602, + "step": 1658 + }, + { + "epoch": 2.17, + "learning_rate": 9.60156227305429e-06, + "logits/chosen": -2.227328062057495, + "logits/rejected": -2.251100540161133, + "logps/chosen": -211.82901000976562, + "logps/rejected": -266.81378173828125, + "loss": 0.1153, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.311059951782227, + "rewards/margins": 6.11300802230835, + "rewards/rejected": -10.424068450927734, + "step": 1659 + }, + { + "epoch": 2.17, + "learning_rate": 9.573351238196598e-06, + "logits/chosen": -2.252056360244751, + "logits/rejected": -2.2614476680755615, + "logps/chosen": -231.4994659423828, + "logps/rejected": -299.8752746582031, + "loss": 0.152, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.4413654804229736, + "rewards/margins": 7.027933120727539, + "rewards/rejected": -10.469298362731934, + "step": 1660 + }, + { + "epoch": 2.17, + "learning_rate": 9.545171890998415e-06, + "logits/chosen": -2.28059458732605, + "logits/rejected": -2.3064792156219482, + "logps/chosen": -217.7825164794922, + "logps/rejected": -280.8410339355469, + "loss": 0.1021, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.502070188522339, + "rewards/margins": 6.535567283630371, + "rewards/rejected": -10.037637710571289, + "step": 1661 + }, + { + "epoch": 2.18, + "learning_rate": 9.51702428934255e-06, + "logits/chosen": -2.0654711723327637, + "logits/rejected": -2.1146602630615234, + "logps/chosen": -211.56369018554688, + "logps/rejected": -301.67156982421875, + "loss": 0.0938, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.989013671875, + "rewards/margins": 7.091141700744629, + "rewards/rejected": -10.080155372619629, + "step": 1662 + }, + { + "epoch": 2.18, + "learning_rate": 9.488908491046575e-06, + "logits/chosen": -2.2987663745880127, + "logits/rejected": -2.303044319152832, + "logps/chosen": -204.07955932617188, + "logps/rejected": -248.48583984375, + "loss": 0.0352, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.274834156036377, + "rewards/margins": 5.580770492553711, + "rewards/rejected": -7.85560417175293, + "step": 1663 + }, + { + "epoch": 2.18, + "learning_rate": 9.460824553862762e-06, + "logits/chosen": -2.1732518672943115, + "logits/rejected": -2.1585304737091064, + "logps/chosen": -228.58299255371094, + "logps/rejected": -307.8688659667969, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5977869033813477, + "rewards/margins": 7.884058952331543, + "rewards/rejected": -11.48184585571289, + "step": 1664 + }, + { + "epoch": 2.18, + "learning_rate": 9.432772535477941e-06, + "logits/chosen": -2.2695236206054688, + "logits/rejected": -2.167841911315918, + "logps/chosen": -226.73167419433594, + "logps/rejected": -275.1556701660156, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8047893047332764, + "rewards/margins": 6.394350528717041, + "rewards/rejected": -10.199140548706055, + "step": 1665 + }, + { + "epoch": 2.18, + "learning_rate": 9.40475249351333e-06, + "logits/chosen": -2.1247053146362305, + "logits/rejected": -2.1636288166046143, + "logps/chosen": -212.21188354492188, + "logps/rejected": -331.0560302734375, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0351576805114746, + "rewards/margins": 6.875593662261963, + "rewards/rejected": -9.910751342773438, + "step": 1666 + }, + { + "epoch": 2.18, + "learning_rate": 9.376764485524515e-06, + "logits/chosen": -2.289851188659668, + "logits/rejected": -2.2931480407714844, + "logps/chosen": -225.97760009765625, + "logps/rejected": -271.6870422363281, + "loss": 0.0723, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.344660997390747, + "rewards/margins": 5.824459075927734, + "rewards/rejected": -9.169119834899902, + "step": 1667 + }, + { + "epoch": 2.18, + "learning_rate": 9.348808569001272e-06, + "logits/chosen": -2.2589008808135986, + "logits/rejected": -2.261256456375122, + "logps/chosen": -197.78787231445312, + "logps/rejected": -230.6781463623047, + "loss": 0.114, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.5056469440460205, + "rewards/margins": 4.691145896911621, + "rewards/rejected": -8.196792602539062, + "step": 1668 + }, + { + "epoch": 2.18, + "learning_rate": 9.320884801367435e-06, + "logits/chosen": -2.075309991836548, + "logits/rejected": -2.196279525756836, + "logps/chosen": -203.84503173828125, + "logps/rejected": -309.2568054199219, + "loss": 0.0097, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.108717918395996, + "rewards/margins": 7.0311126708984375, + "rewards/rejected": -11.139830589294434, + "step": 1669 + }, + { + "epoch": 2.19, + "learning_rate": 9.292993239980827e-06, + "logits/chosen": -2.111889362335205, + "logits/rejected": -2.1239311695098877, + "logps/chosen": -168.93466186523438, + "logps/rejected": -261.82501220703125, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.21466064453125, + "rewards/margins": 7.567586421966553, + "rewards/rejected": -10.782247543334961, + "step": 1670 + }, + { + "epoch": 2.19, + "learning_rate": 9.265133942133115e-06, + "logits/chosen": -1.976646065711975, + "logits/rejected": -2.043689250946045, + "logps/chosen": -187.87380981445312, + "logps/rejected": -256.44744873046875, + "loss": 0.0117, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9920454025268555, + "rewards/margins": 7.2919230461120605, + "rewards/rejected": -10.283968925476074, + "step": 1671 + }, + { + "epoch": 2.19, + "learning_rate": 9.237306965049677e-06, + "logits/chosen": -2.257997989654541, + "logits/rejected": -2.3550729751586914, + "logps/chosen": -201.9134521484375, + "logps/rejected": -292.55157470703125, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.076332092285156, + "rewards/margins": 7.113800048828125, + "rewards/rejected": -11.190132141113281, + "step": 1672 + }, + { + "epoch": 2.19, + "learning_rate": 9.2095123658895e-06, + "logits/chosen": -2.1104347705841064, + "logits/rejected": -2.185384511947632, + "logps/chosen": -232.2742462158203, + "logps/rejected": -336.77496337890625, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.006392955780029, + "rewards/margins": 8.430061340332031, + "rewards/rejected": -12.436452865600586, + "step": 1673 + }, + { + "epoch": 2.19, + "learning_rate": 9.181750201745087e-06, + "logits/chosen": -2.2099478244781494, + "logits/rejected": -1.9959185123443604, + "logps/chosen": -187.31976318359375, + "logps/rejected": -222.1249237060547, + "loss": 0.021, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1507749557495117, + "rewards/margins": 6.772585391998291, + "rewards/rejected": -9.923359870910645, + "step": 1674 + }, + { + "epoch": 2.19, + "learning_rate": 9.15402052964231e-06, + "logits/chosen": -2.1142640113830566, + "logits/rejected": -2.189750909805298, + "logps/chosen": -169.11431884765625, + "logps/rejected": -254.74932861328125, + "loss": 0.0611, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.360865592956543, + "rewards/margins": 6.604044437408447, + "rewards/rejected": -9.964909553527832, + "step": 1675 + }, + { + "epoch": 2.19, + "learning_rate": 9.126323406540282e-06, + "logits/chosen": -2.185606002807617, + "logits/rejected": -2.1900880336761475, + "logps/chosen": -217.94564819335938, + "logps/rejected": -293.8155517578125, + "loss": 0.0894, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.929992198944092, + "rewards/margins": 6.3433098793029785, + "rewards/rejected": -10.27330207824707, + "step": 1676 + }, + { + "epoch": 2.2, + "learning_rate": 9.098658889331265e-06, + "logits/chosen": -2.1766843795776367, + "logits/rejected": -2.230194091796875, + "logps/chosen": -230.25975036621094, + "logps/rejected": -350.9499206542969, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.521965503692627, + "rewards/margins": 8.827539443969727, + "rewards/rejected": -13.349506378173828, + "step": 1677 + }, + { + "epoch": 2.2, + "learning_rate": 9.07102703484056e-06, + "logits/chosen": -2.188322067260742, + "logits/rejected": -2.2243635654449463, + "logps/chosen": -220.1571502685547, + "logps/rejected": -324.730712890625, + "loss": 0.0549, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9159555435180664, + "rewards/margins": 6.724329948425293, + "rewards/rejected": -10.64028549194336, + "step": 1678 + }, + { + "epoch": 2.2, + "learning_rate": 9.043427899826367e-06, + "logits/chosen": -1.9611784219741821, + "logits/rejected": -2.0099782943725586, + "logps/chosen": -194.97201538085938, + "logps/rejected": -252.364990234375, + "loss": 0.0624, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.26351261138916, + "rewards/margins": 5.814044952392578, + "rewards/rejected": -10.077558517456055, + "step": 1679 + }, + { + "epoch": 2.2, + "learning_rate": 9.015861540979667e-06, + "logits/chosen": -2.262836217880249, + "logits/rejected": -2.257211685180664, + "logps/chosen": -240.6387481689453, + "logps/rejected": -293.8360290527344, + "loss": 0.0399, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.846163034439087, + "rewards/margins": 7.1551642417907715, + "rewards/rejected": -11.001328468322754, + "step": 1680 + }, + { + "epoch": 2.2, + "learning_rate": 8.988328014924136e-06, + "logits/chosen": -2.21905779838562, + "logits/rejected": -2.220041275024414, + "logps/chosen": -204.77565002441406, + "logps/rejected": -285.16015625, + "loss": 0.0358, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4136340618133545, + "rewards/margins": 7.5210676193237305, + "rewards/rejected": -10.934701919555664, + "step": 1681 + }, + { + "epoch": 2.2, + "learning_rate": 8.960827378215994e-06, + "logits/chosen": -2.0818262100219727, + "logits/rejected": -2.146986484527588, + "logps/chosen": -211.57376098632812, + "logps/rejected": -344.95977783203125, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.177980899810791, + "rewards/margins": 8.620105743408203, + "rewards/rejected": -12.798086166381836, + "step": 1682 + }, + { + "epoch": 2.2, + "learning_rate": 8.933359687343895e-06, + "logits/chosen": -2.1399118900299072, + "logits/rejected": -2.2968056201934814, + "logps/chosen": -198.75869750976562, + "logps/rejected": -367.2664489746094, + "loss": 0.0463, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.2623679637908936, + "rewards/margins": 8.043490409851074, + "rewards/rejected": -11.305858612060547, + "step": 1683 + }, + { + "epoch": 2.2, + "learning_rate": 8.90592499872884e-06, + "logits/chosen": -2.15018892288208, + "logits/rejected": -2.1640963554382324, + "logps/chosen": -197.57192993164062, + "logps/rejected": -256.65386962890625, + "loss": 0.0461, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.4939863681793213, + "rewards/margins": 6.588541507720947, + "rewards/rejected": -10.082528114318848, + "step": 1684 + }, + { + "epoch": 2.21, + "learning_rate": 8.878523368724046e-06, + "logits/chosen": -2.40644907951355, + "logits/rejected": -2.4322493076324463, + "logps/chosen": -275.20172119140625, + "logps/rejected": -316.6629943847656, + "loss": 0.0493, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.638556957244873, + "rewards/margins": 6.466561317443848, + "rewards/rejected": -10.105119705200195, + "step": 1685 + }, + { + "epoch": 2.21, + "learning_rate": 8.851154853614788e-06, + "logits/chosen": -2.15041446685791, + "logits/rejected": -2.196890115737915, + "logps/chosen": -203.22430419921875, + "logps/rejected": -248.96475219726562, + "loss": 0.1115, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.776184320449829, + "rewards/margins": 5.448081970214844, + "rewards/rejected": -8.224266052246094, + "step": 1686 + }, + { + "epoch": 2.21, + "learning_rate": 8.823819509618364e-06, + "logits/chosen": -2.1007986068725586, + "logits/rejected": -2.0942115783691406, + "logps/chosen": -190.64520263671875, + "logps/rejected": -259.6680908203125, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6771810054779053, + "rewards/margins": 6.795676231384277, + "rewards/rejected": -10.472857475280762, + "step": 1687 + }, + { + "epoch": 2.21, + "learning_rate": 8.796517392883894e-06, + "logits/chosen": -2.0247445106506348, + "logits/rejected": -2.1265811920166016, + "logps/chosen": -166.64691162109375, + "logps/rejected": -277.727294921875, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2996060848236084, + "rewards/margins": 7.843116760253906, + "rewards/rejected": -11.142721176147461, + "step": 1688 + }, + { + "epoch": 2.21, + "learning_rate": 8.769248559492286e-06, + "logits/chosen": -2.191051721572876, + "logits/rejected": -2.1718432903289795, + "logps/chosen": -193.72390747070312, + "logps/rejected": -226.29359436035156, + "loss": 0.0978, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.9202723503112793, + "rewards/margins": 5.539116382598877, + "rewards/rejected": -8.459388732910156, + "step": 1689 + }, + { + "epoch": 2.21, + "learning_rate": 8.742013065456047e-06, + "logits/chosen": -2.2796130180358887, + "logits/rejected": -2.2604551315307617, + "logps/chosen": -187.779296875, + "logps/rejected": -272.9173889160156, + "loss": 0.1057, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.1299233436584473, + "rewards/margins": 6.76289176940918, + "rewards/rejected": -9.892814636230469, + "step": 1690 + }, + { + "epoch": 2.21, + "learning_rate": 8.714810966719225e-06, + "logits/chosen": -1.9834812879562378, + "logits/rejected": -2.027233362197876, + "logps/chosen": -186.88668823242188, + "logps/rejected": -273.3917236328125, + "loss": 0.1103, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.3214528560638428, + "rewards/margins": 6.90608024597168, + "rewards/rejected": -10.227533340454102, + "step": 1691 + }, + { + "epoch": 2.21, + "learning_rate": 8.687642319157279e-06, + "logits/chosen": -2.123330593109131, + "logits/rejected": -2.1994919776916504, + "logps/chosen": -204.09619140625, + "logps/rejected": -267.6197814941406, + "loss": 0.0827, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.219262599945068, + "rewards/margins": 5.8450493812561035, + "rewards/rejected": -10.064311981201172, + "step": 1692 + }, + { + "epoch": 2.22, + "learning_rate": 8.660507178576907e-06, + "logits/chosen": -2.235646963119507, + "logits/rejected": -2.299664258956909, + "logps/chosen": -197.2362060546875, + "logps/rejected": -271.36590576171875, + "loss": 0.0644, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.8789830207824707, + "rewards/margins": 5.628900051116943, + "rewards/rejected": -9.507883071899414, + "step": 1693 + }, + { + "epoch": 2.22, + "learning_rate": 8.633405600716035e-06, + "logits/chosen": -2.35422420501709, + "logits/rejected": -2.4411213397979736, + "logps/chosen": -217.37399291992188, + "logps/rejected": -282.7566223144531, + "loss": 0.0508, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.441631317138672, + "rewards/margins": 5.957549571990967, + "rewards/rejected": -9.399181365966797, + "step": 1694 + }, + { + "epoch": 2.22, + "learning_rate": 8.606337641243634e-06, + "logits/chosen": -2.140549898147583, + "logits/rejected": -2.2695186138153076, + "logps/chosen": -201.26034545898438, + "logps/rejected": -301.54681396484375, + "loss": 0.0467, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.001363277435303, + "rewards/margins": 7.241910934448242, + "rewards/rejected": -11.243273735046387, + "step": 1695 + }, + { + "epoch": 2.22, + "learning_rate": 8.579303355759597e-06, + "logits/chosen": -2.1433253288269043, + "logits/rejected": -2.1213161945343018, + "logps/chosen": -207.36854553222656, + "logps/rejected": -260.1400146484375, + "loss": 0.0356, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.942014217376709, + "rewards/margins": 6.184602737426758, + "rewards/rejected": -10.126616477966309, + "step": 1696 + }, + { + "epoch": 2.22, + "learning_rate": 8.552302799794675e-06, + "logits/chosen": -2.1126792430877686, + "logits/rejected": -2.27596378326416, + "logps/chosen": -195.60414123535156, + "logps/rejected": -304.55316162109375, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.663034677505493, + "rewards/margins": 7.626105308532715, + "rewards/rejected": -11.289140701293945, + "step": 1697 + }, + { + "epoch": 2.22, + "learning_rate": 8.525336028810333e-06, + "logits/chosen": -2.0849618911743164, + "logits/rejected": -2.2802860736846924, + "logps/chosen": -178.1671905517578, + "logps/rejected": -281.45343017578125, + "loss": 0.0623, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.9442617893218994, + "rewards/margins": 6.8298020362854, + "rewards/rejected": -10.774064064025879, + "step": 1698 + }, + { + "epoch": 2.22, + "learning_rate": 8.498403098198621e-06, + "logits/chosen": -1.9921576976776123, + "logits/rejected": -1.9892094135284424, + "logps/chosen": -196.30545043945312, + "logps/rejected": -263.2068786621094, + "loss": 0.017, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6652305126190186, + "rewards/margins": 7.061951637268066, + "rewards/rejected": -10.72718334197998, + "step": 1699 + }, + { + "epoch": 2.23, + "learning_rate": 8.471504063282082e-06, + "logits/chosen": -2.1942853927612305, + "logits/rejected": -2.22829532623291, + "logps/chosen": -223.79489135742188, + "logps/rejected": -295.2370300292969, + "loss": 0.0488, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.158882141113281, + "rewards/margins": 6.205422878265381, + "rewards/rejected": -10.36430549621582, + "step": 1700 + }, + { + "epoch": 2.23, + "learning_rate": 8.444638979313647e-06, + "logits/chosen": -2.1244921684265137, + "logits/rejected": -2.1036319732666016, + "logps/chosen": -175.69313049316406, + "logps/rejected": -268.51983642578125, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2730836868286133, + "rewards/margins": 7.530813217163086, + "rewards/rejected": -10.8038969039917, + "step": 1701 + }, + { + "epoch": 2.23, + "learning_rate": 8.417807901476513e-06, + "logits/chosen": -2.1803970336914062, + "logits/rejected": -2.0890371799468994, + "logps/chosen": -271.51385498046875, + "logps/rejected": -313.63311767578125, + "loss": 0.0467, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.38176155090332, + "rewards/margins": 7.268465042114258, + "rewards/rejected": -11.650225639343262, + "step": 1702 + }, + { + "epoch": 2.23, + "learning_rate": 8.391010884884008e-06, + "logits/chosen": -2.1695311069488525, + "logits/rejected": -2.185807228088379, + "logps/chosen": -204.08135986328125, + "logps/rejected": -253.77536010742188, + "loss": 0.0487, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.5758719444274902, + "rewards/margins": 6.263754367828369, + "rewards/rejected": -9.83962631225586, + "step": 1703 + }, + { + "epoch": 2.23, + "learning_rate": 8.364247984579487e-06, + "logits/chosen": -2.113870859146118, + "logits/rejected": -2.1332485675811768, + "logps/chosen": -233.40701293945312, + "logps/rejected": -329.2414245605469, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.103847980499268, + "rewards/margins": 7.6764984130859375, + "rewards/rejected": -11.780345916748047, + "step": 1704 + }, + { + "epoch": 2.23, + "learning_rate": 8.337519255536259e-06, + "logits/chosen": -2.177475690841675, + "logits/rejected": -2.208240032196045, + "logps/chosen": -226.5784454345703, + "logps/rejected": -310.8875427246094, + "loss": 0.0496, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.634860038757324, + "rewards/margins": 6.781673431396484, + "rewards/rejected": -10.416533470153809, + "step": 1705 + }, + { + "epoch": 2.23, + "learning_rate": 8.310824752657426e-06, + "logits/chosen": -2.2381484508514404, + "logits/rejected": -2.3158018589019775, + "logps/chosen": -221.65826416015625, + "logps/rejected": -298.93603515625, + "loss": 0.056, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.770693778991699, + "rewards/margins": 5.948896884918213, + "rewards/rejected": -9.71959114074707, + "step": 1706 + }, + { + "epoch": 2.23, + "learning_rate": 8.284164530775776e-06, + "logits/chosen": -2.076836347579956, + "logits/rejected": -2.1686782836914062, + "logps/chosen": -218.02001953125, + "logps/rejected": -289.7492370605469, + "loss": 0.1123, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.037287712097168, + "rewards/margins": 5.57905387878418, + "rewards/rejected": -9.616342544555664, + "step": 1707 + }, + { + "epoch": 2.24, + "learning_rate": 8.257538644653695e-06, + "logits/chosen": -2.0760207176208496, + "logits/rejected": -2.141066312789917, + "logps/chosen": -193.05023193359375, + "logps/rejected": -275.33636474609375, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.236574411392212, + "rewards/margins": 7.110628128051758, + "rewards/rejected": -10.34720230102539, + "step": 1708 + }, + { + "epoch": 2.24, + "learning_rate": 8.230947148983056e-06, + "logits/chosen": -2.102829933166504, + "logits/rejected": -2.1513314247131348, + "logps/chosen": -222.66781616210938, + "logps/rejected": -271.4768981933594, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5908262729644775, + "rewards/margins": 7.006527900695801, + "rewards/rejected": -10.597354888916016, + "step": 1709 + }, + { + "epoch": 2.24, + "learning_rate": 8.20439009838504e-06, + "logits/chosen": -2.0865116119384766, + "logits/rejected": -2.0774080753326416, + "logps/chosen": -180.08731079101562, + "logps/rejected": -223.34381103515625, + "loss": 0.0257, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8457865715026855, + "rewards/margins": 5.593956470489502, + "rewards/rejected": -9.439743041992188, + "step": 1710 + }, + { + "epoch": 2.24, + "learning_rate": 8.177867547410117e-06, + "logits/chosen": -2.084426164627075, + "logits/rejected": -2.1038851737976074, + "logps/chosen": -206.22137451171875, + "logps/rejected": -277.5307312011719, + "loss": 0.0489, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.4118704795837402, + "rewards/margins": 7.60734748840332, + "rewards/rejected": -11.019217491149902, + "step": 1711 + }, + { + "epoch": 2.24, + "learning_rate": 8.151379550537894e-06, + "logits/chosen": -2.2488396167755127, + "logits/rejected": -2.2633602619171143, + "logps/chosen": -220.5198516845703, + "logps/rejected": -293.1371154785156, + "loss": 0.0528, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9966230392456055, + "rewards/margins": 6.593157768249512, + "rewards/rejected": -10.589780807495117, + "step": 1712 + }, + { + "epoch": 2.24, + "learning_rate": 8.124926162176972e-06, + "logits/chosen": -2.072826862335205, + "logits/rejected": -2.070577621459961, + "logps/chosen": -210.2643280029297, + "logps/rejected": -270.3810119628906, + "loss": 0.0111, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.08571195602417, + "rewards/margins": 6.959323883056641, + "rewards/rejected": -10.045035362243652, + "step": 1713 + }, + { + "epoch": 2.24, + "learning_rate": 8.09850743666489e-06, + "logits/chosen": -2.1879916191101074, + "logits/rejected": -2.1982650756835938, + "logps/chosen": -253.97344970703125, + "logps/rejected": -307.61566162109375, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2880120277404785, + "rewards/margins": 6.6415114402771, + "rewards/rejected": -9.929523468017578, + "step": 1714 + }, + { + "epoch": 2.24, + "learning_rate": 8.072123428267966e-06, + "logits/chosen": -2.325287342071533, + "logits/rejected": -2.3086040019989014, + "logps/chosen": -227.01278686523438, + "logps/rejected": -301.4085388183594, + "loss": 0.008, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4240729808807373, + "rewards/margins": 7.784595012664795, + "rewards/rejected": -11.20866870880127, + "step": 1715 + }, + { + "epoch": 2.25, + "learning_rate": 8.045774191181229e-06, + "logits/chosen": -2.147463321685791, + "logits/rejected": -2.103285312652588, + "logps/chosen": -206.35748291015625, + "logps/rejected": -252.62828063964844, + "loss": 0.0175, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.055338144302368, + "rewards/margins": 5.559782028198242, + "rewards/rejected": -8.615119934082031, + "step": 1716 + }, + { + "epoch": 2.25, + "learning_rate": 8.01945977952826e-06, + "logits/chosen": -2.0845131874084473, + "logits/rejected": -2.0898427963256836, + "logps/chosen": -172.79112243652344, + "logps/rejected": -248.797607421875, + "loss": 0.0609, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.2854185104370117, + "rewards/margins": 6.905313968658447, + "rewards/rejected": -10.190733909606934, + "step": 1717 + }, + { + "epoch": 2.25, + "learning_rate": 7.993180247361117e-06, + "logits/chosen": -2.0799713134765625, + "logits/rejected": -2.1991119384765625, + "logps/chosen": -244.17735290527344, + "logps/rejected": -356.76776123046875, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.858489990234375, + "rewards/margins": 8.126222610473633, + "rewards/rejected": -12.984710693359375, + "step": 1718 + }, + { + "epoch": 2.25, + "learning_rate": 7.966935648660229e-06, + "logits/chosen": -2.242586851119995, + "logits/rejected": -2.0992231369018555, + "logps/chosen": -248.20318603515625, + "logps/rejected": -291.885498046875, + "loss": 0.0984, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.029764175415039, + "rewards/margins": 6.6830830574035645, + "rewards/rejected": -10.712845802307129, + "step": 1719 + }, + { + "epoch": 2.25, + "learning_rate": 7.940726037334237e-06, + "logits/chosen": -2.1935319900512695, + "logits/rejected": -2.33777117729187, + "logps/chosen": -223.61981201171875, + "logps/rejected": -333.50787353515625, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7222771644592285, + "rewards/margins": 8.106025695800781, + "rewards/rejected": -11.828303337097168, + "step": 1720 + }, + { + "epoch": 2.25, + "learning_rate": 7.914551467219928e-06, + "logits/chosen": -1.8459537029266357, + "logits/rejected": -1.9104955196380615, + "logps/chosen": -225.85025024414062, + "logps/rejected": -330.11334228515625, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.673243999481201, + "rewards/margins": 7.81467866897583, + "rewards/rejected": -12.487922668457031, + "step": 1721 + }, + { + "epoch": 2.25, + "learning_rate": 7.88841199208212e-06, + "logits/chosen": -2.0117578506469727, + "logits/rejected": -2.080282211303711, + "logps/chosen": -187.16709899902344, + "logps/rejected": -296.5640869140625, + "loss": 0.0481, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.788334846496582, + "rewards/margins": 6.446020126342773, + "rewards/rejected": -10.234354972839355, + "step": 1722 + }, + { + "epoch": 2.26, + "learning_rate": 7.862307665613543e-06, + "logits/chosen": -1.9193265438079834, + "logits/rejected": -2.013028144836426, + "logps/chosen": -215.90859985351562, + "logps/rejected": -289.8129577636719, + "loss": 0.0501, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.7413129806518555, + "rewards/margins": 7.53889274597168, + "rewards/rejected": -11.280205726623535, + "step": 1723 + }, + { + "epoch": 2.26, + "learning_rate": 7.836238541434709e-06, + "logits/chosen": -2.1009018421173096, + "logits/rejected": -2.122446298599243, + "logps/chosen": -219.10997009277344, + "logps/rejected": -285.4053955078125, + "loss": 0.1318, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.125407695770264, + "rewards/margins": 6.1049981117248535, + "rewards/rejected": -10.230405807495117, + "step": 1724 + }, + { + "epoch": 2.26, + "learning_rate": 7.810204673093848e-06, + "logits/chosen": -2.0991744995117188, + "logits/rejected": -2.227687358856201, + "logps/chosen": -237.60040283203125, + "logps/rejected": -340.72216796875, + "loss": 0.0186, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.2778449058532715, + "rewards/margins": 8.455217361450195, + "rewards/rejected": -12.733062744140625, + "step": 1725 + }, + { + "epoch": 2.26, + "learning_rate": 7.784206114066753e-06, + "logits/chosen": -2.1397557258605957, + "logits/rejected": -2.1359357833862305, + "logps/chosen": -204.16653442382812, + "logps/rejected": -274.5548400878906, + "loss": 0.013, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5361433029174805, + "rewards/margins": 6.830693244934082, + "rewards/rejected": -10.366836547851562, + "step": 1726 + }, + { + "epoch": 2.26, + "learning_rate": 7.758242917756683e-06, + "logits/chosen": -2.276782274246216, + "logits/rejected": -2.2250943183898926, + "logps/chosen": -189.53021240234375, + "logps/rejected": -252.00086975097656, + "loss": 0.0532, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.2140469551086426, + "rewards/margins": 5.634598731994629, + "rewards/rejected": -8.848645210266113, + "step": 1727 + }, + { + "epoch": 2.26, + "learning_rate": 7.732315137494277e-06, + "logits/chosen": -2.20760440826416, + "logits/rejected": -2.148677349090576, + "logps/chosen": -203.02822875976562, + "logps/rejected": -246.4269256591797, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.145794630050659, + "rewards/margins": 6.7561821937561035, + "rewards/rejected": -9.901976585388184, + "step": 1728 + }, + { + "epoch": 2.26, + "learning_rate": 7.706422826537435e-06, + "logits/chosen": -2.148484468460083, + "logits/rejected": -2.2669897079467773, + "logps/chosen": -152.33865356445312, + "logps/rejected": -233.62367248535156, + "loss": 0.0648, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.132147789001465, + "rewards/margins": 5.582458972930908, + "rewards/rejected": -8.714607238769531, + "step": 1729 + }, + { + "epoch": 2.26, + "learning_rate": 7.680566038071157e-06, + "logits/chosen": -1.996780514717102, + "logits/rejected": -2.0586371421813965, + "logps/chosen": -179.54551696777344, + "logps/rejected": -279.74627685546875, + "loss": 0.0309, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.321688652038574, + "rewards/margins": 7.3477277755737305, + "rewards/rejected": -11.669416427612305, + "step": 1730 + }, + { + "epoch": 2.27, + "learning_rate": 7.654744825207527e-06, + "logits/chosen": -2.154583215713501, + "logits/rejected": -2.283592462539673, + "logps/chosen": -185.02003479003906, + "logps/rejected": -300.23724365234375, + "loss": 0.0165, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.956886053085327, + "rewards/margins": 7.720510482788086, + "rewards/rejected": -11.677397727966309, + "step": 1731 + }, + { + "epoch": 2.27, + "learning_rate": 7.628959240985514e-06, + "logits/chosen": -2.106767177581787, + "logits/rejected": -2.0589888095855713, + "logps/chosen": -251.1912841796875, + "logps/rejected": -315.5182800292969, + "loss": 0.0451, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.55436635017395, + "rewards/margins": 7.869057655334473, + "rewards/rejected": -11.423423767089844, + "step": 1732 + }, + { + "epoch": 2.27, + "learning_rate": 7.6032093383709345e-06, + "logits/chosen": -2.070892810821533, + "logits/rejected": -2.235532283782959, + "logps/chosen": -192.06036376953125, + "logps/rejected": -234.05557250976562, + "loss": 0.116, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.1749696731567383, + "rewards/margins": 5.039150714874268, + "rewards/rejected": -8.214120864868164, + "step": 1733 + }, + { + "epoch": 2.27, + "learning_rate": 7.57749517025628e-06, + "logits/chosen": -1.902644395828247, + "logits/rejected": -2.063284397125244, + "logps/chosen": -206.60418701171875, + "logps/rejected": -292.2391357421875, + "loss": 0.0463, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.047153949737549, + "rewards/margins": 7.107631683349609, + "rewards/rejected": -11.154786109924316, + "step": 1734 + }, + { + "epoch": 2.27, + "learning_rate": 7.551816789460664e-06, + "logits/chosen": -2.215104103088379, + "logits/rejected": -2.2091927528381348, + "logps/chosen": -270.4341125488281, + "logps/rejected": -361.67132568359375, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8221192359924316, + "rewards/margins": 8.657204627990723, + "rewards/rejected": -12.479323387145996, + "step": 1735 + }, + { + "epoch": 2.27, + "learning_rate": 7.5261742487297e-06, + "logits/chosen": -2.13075590133667, + "logits/rejected": -2.095841407775879, + "logps/chosen": -224.40060424804688, + "logps/rejected": -327.21160888671875, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.029275894165039, + "rewards/margins": 7.33086633682251, + "rewards/rejected": -11.360143661499023, + "step": 1736 + }, + { + "epoch": 2.27, + "learning_rate": 7.5005676007353364e-06, + "logits/chosen": -1.993504524230957, + "logits/rejected": -2.035719871520996, + "logps/chosen": -194.23150634765625, + "logps/rejected": -267.42095947265625, + "loss": 0.109, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.5826520919799805, + "rewards/margins": 6.0719733238220215, + "rewards/rejected": -9.65462589263916, + "step": 1737 + }, + { + "epoch": 2.27, + "learning_rate": 7.4749968980758365e-06, + "logits/chosen": -1.983603835105896, + "logits/rejected": -1.9229538440704346, + "logps/chosen": -219.50735473632812, + "logps/rejected": -280.61383056640625, + "loss": 0.0122, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7313010692596436, + "rewards/margins": 6.371834754943848, + "rewards/rejected": -10.103137016296387, + "step": 1738 + }, + { + "epoch": 2.28, + "learning_rate": 7.449462193275628e-06, + "logits/chosen": -2.1656100749969482, + "logits/rejected": -2.166274309158325, + "logps/chosen": -233.36244201660156, + "logps/rejected": -286.607666015625, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3957180976867676, + "rewards/margins": 6.80760383605957, + "rewards/rejected": -10.20332145690918, + "step": 1739 + }, + { + "epoch": 2.28, + "learning_rate": 7.4239635387851615e-06, + "logits/chosen": -1.9828159809112549, + "logits/rejected": -1.9454889297485352, + "logps/chosen": -192.1469268798828, + "logps/rejected": -272.8314208984375, + "loss": 0.0107, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.286863088607788, + "rewards/margins": 7.123612403869629, + "rewards/rejected": -10.41047477722168, + "step": 1740 + }, + { + "epoch": 2.28, + "learning_rate": 7.398500986980877e-06, + "logits/chosen": -2.0718801021575928, + "logits/rejected": -2.1577818393707275, + "logps/chosen": -229.4145965576172, + "logps/rejected": -302.8059387207031, + "loss": 0.0462, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.6249537467956543, + "rewards/margins": 6.7777605056762695, + "rewards/rejected": -10.402714729309082, + "step": 1741 + }, + { + "epoch": 2.28, + "learning_rate": 7.373074590165041e-06, + "logits/chosen": -2.0363731384277344, + "logits/rejected": -2.1700496673583984, + "logps/chosen": -221.30569458007812, + "logps/rejected": -305.55169677734375, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.189105033874512, + "rewards/margins": 7.2156219482421875, + "rewards/rejected": -11.404727935791016, + "step": 1742 + }, + { + "epoch": 2.28, + "learning_rate": 7.347684400565646e-06, + "logits/chosen": -1.9244577884674072, + "logits/rejected": -2.0748891830444336, + "logps/chosen": -183.54324340820312, + "logps/rejected": -288.7178955078125, + "loss": 0.013, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5454869270324707, + "rewards/margins": 7.272469997406006, + "rewards/rejected": -10.817957878112793, + "step": 1743 + }, + { + "epoch": 2.28, + "learning_rate": 7.3223304703363135e-06, + "logits/chosen": -2.2031161785125732, + "logits/rejected": -2.121328353881836, + "logps/chosen": -231.85064697265625, + "logps/rejected": -253.69320678710938, + "loss": 0.0671, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.4508142471313477, + "rewards/margins": 6.465787410736084, + "rewards/rejected": -9.91660213470459, + "step": 1744 + }, + { + "epoch": 2.28, + "learning_rate": 7.297012851556198e-06, + "logits/chosen": -2.2004313468933105, + "logits/rejected": -2.2006747722625732, + "logps/chosen": -229.7209930419922, + "logps/rejected": -292.5038757324219, + "loss": 0.0637, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.7780380249023438, + "rewards/margins": 6.821466445922852, + "rewards/rejected": -10.599504470825195, + "step": 1745 + }, + { + "epoch": 2.29, + "learning_rate": 7.271731596229864e-06, + "logits/chosen": -2.20011043548584, + "logits/rejected": -2.212289810180664, + "logps/chosen": -201.83372497558594, + "logps/rejected": -278.42437744140625, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6086173057556152, + "rewards/margins": 7.120046138763428, + "rewards/rejected": -10.728663444519043, + "step": 1746 + }, + { + "epoch": 2.29, + "learning_rate": 7.2464867562871745e-06, + "logits/chosen": -2.1461029052734375, + "logits/rejected": -2.151041030883789, + "logps/chosen": -252.44137573242188, + "logps/rejected": -282.8408203125, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.325305461883545, + "rewards/margins": 6.61509895324707, + "rewards/rejected": -9.940404891967773, + "step": 1747 + }, + { + "epoch": 2.29, + "learning_rate": 7.221278383583185e-06, + "logits/chosen": -2.0986156463623047, + "logits/rejected": -2.2115676403045654, + "logps/chosen": -214.80799865722656, + "logps/rejected": -295.4984130859375, + "loss": 0.0614, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6774749755859375, + "rewards/margins": 5.940493583679199, + "rewards/rejected": -9.617968559265137, + "step": 1748 + }, + { + "epoch": 2.29, + "learning_rate": 7.1961065298980666e-06, + "logits/chosen": -1.9975063800811768, + "logits/rejected": -2.091555595397949, + "logps/chosen": -215.13330078125, + "logps/rejected": -307.3282165527344, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.527926921844482, + "rewards/margins": 7.537641525268555, + "rewards/rejected": -12.065568923950195, + "step": 1749 + }, + { + "epoch": 2.29, + "learning_rate": 7.170971246936966e-06, + "logits/chosen": -2.104663610458374, + "logits/rejected": -2.122528314590454, + "logps/chosen": -228.45932006835938, + "logps/rejected": -315.5073547363281, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4144935607910156, + "rewards/margins": 7.756294250488281, + "rewards/rejected": -11.170787811279297, + "step": 1750 + }, + { + "epoch": 2.29, + "learning_rate": 7.145872586329902e-06, + "logits/chosen": -2.11407732963562, + "logits/rejected": -2.1897659301757812, + "logps/chosen": -306.5154113769531, + "logps/rejected": -407.1827697753906, + "loss": 0.0895, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.735919237136841, + "rewards/margins": 6.975547790527344, + "rewards/rejected": -10.711467742919922, + "step": 1751 + }, + { + "epoch": 2.29, + "learning_rate": 7.12081059963168e-06, + "logits/chosen": -2.0776185989379883, + "logits/rejected": -2.076169490814209, + "logps/chosen": -200.6983184814453, + "logps/rejected": -265.2762451171875, + "loss": 0.0858, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.390803813934326, + "rewards/margins": 6.0853047370910645, + "rewards/rejected": -11.47610855102539, + "step": 1752 + }, + { + "epoch": 2.29, + "learning_rate": 7.095785338321787e-06, + "logits/chosen": -2.1468100547790527, + "logits/rejected": -2.204784393310547, + "logps/chosen": -216.55856323242188, + "logps/rejected": -313.2846374511719, + "loss": 0.0117, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9855146408081055, + "rewards/margins": 7.609063148498535, + "rewards/rejected": -11.594578742980957, + "step": 1753 + }, + { + "epoch": 2.3, + "learning_rate": 7.070796853804221e-06, + "logits/chosen": -2.2753148078918457, + "logits/rejected": -2.2854723930358887, + "logps/chosen": -208.4947509765625, + "logps/rejected": -306.0053405761719, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.921264410018921, + "rewards/margins": 7.324665069580078, + "rewards/rejected": -11.245929718017578, + "step": 1754 + }, + { + "epoch": 2.3, + "learning_rate": 7.045845197407494e-06, + "logits/chosen": -2.145781993865967, + "logits/rejected": -2.2183146476745605, + "logps/chosen": -213.462646484375, + "logps/rejected": -290.3109436035156, + "loss": 0.0533, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.63186502456665, + "rewards/margins": 5.664203643798828, + "rewards/rejected": -10.29606819152832, + "step": 1755 + }, + { + "epoch": 2.3, + "learning_rate": 7.02093042038445e-06, + "logits/chosen": -2.2121572494506836, + "logits/rejected": -2.176927089691162, + "logps/chosen": -224.22760009765625, + "logps/rejected": -305.14923095703125, + "loss": 0.052, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4724502563476562, + "rewards/margins": 5.820268154144287, + "rewards/rejected": -9.292718887329102, + "step": 1756 + }, + { + "epoch": 2.3, + "learning_rate": 6.996052573912163e-06, + "logits/chosen": -2.2328412532806396, + "logits/rejected": -2.237520694732666, + "logps/chosen": -233.8831787109375, + "logps/rejected": -300.8020935058594, + "loss": 0.0144, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.700827121734619, + "rewards/margins": 6.730297088623047, + "rewards/rejected": -11.431123733520508, + "step": 1757 + }, + { + "epoch": 2.3, + "learning_rate": 6.971211709091882e-06, + "logits/chosen": -2.1268723011016846, + "logits/rejected": -2.1312317848205566, + "logps/chosen": -217.12136840820312, + "logps/rejected": -263.36602783203125, + "loss": 0.1273, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.017413139343262, + "rewards/margins": 5.4917073249816895, + "rewards/rejected": -9.50912094116211, + "step": 1758 + }, + { + "epoch": 2.3, + "learning_rate": 6.946407876948854e-06, + "logits/chosen": -2.0834720134735107, + "logits/rejected": -2.123154878616333, + "logps/chosen": -213.0337371826172, + "logps/rejected": -265.0970764160156, + "loss": 0.0503, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.0517897605896, + "rewards/margins": 6.489593029022217, + "rewards/rejected": -10.541382789611816, + "step": 1759 + }, + { + "epoch": 2.3, + "learning_rate": 6.921641128432299e-06, + "logits/chosen": -2.2070693969726562, + "logits/rejected": -2.1769251823425293, + "logps/chosen": -239.89093017578125, + "logps/rejected": -263.4864196777344, + "loss": 0.0529, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.000592231750488, + "rewards/margins": 5.610342502593994, + "rewards/rejected": -9.610936164855957, + "step": 1760 + }, + { + "epoch": 2.3, + "learning_rate": 6.896911514415219e-06, + "logits/chosen": -2.1683695316314697, + "logits/rejected": -2.192795515060425, + "logps/chosen": -252.39651489257812, + "logps/rejected": -330.07366943359375, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.251027584075928, + "rewards/margins": 6.4402594566345215, + "rewards/rejected": -11.691286087036133, + "step": 1761 + }, + { + "epoch": 2.31, + "learning_rate": 6.872219085694376e-06, + "logits/chosen": -2.240156650543213, + "logits/rejected": -2.253934144973755, + "logps/chosen": -260.0806579589844, + "logps/rejected": -311.2843933105469, + "loss": 0.0322, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.272195339202881, + "rewards/margins": 6.815130710601807, + "rewards/rejected": -12.087327003479004, + "step": 1762 + }, + { + "epoch": 2.31, + "learning_rate": 6.8475638929901385e-06, + "logits/chosen": -2.0474612712860107, + "logits/rejected": -2.1473560333251953, + "logps/chosen": -206.079833984375, + "logps/rejected": -284.6674499511719, + "loss": 0.0419, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.4340500831604, + "rewards/margins": 6.111638069152832, + "rewards/rejected": -10.545687675476074, + "step": 1763 + }, + { + "epoch": 2.31, + "learning_rate": 6.822945986946386e-06, + "logits/chosen": -2.053663730621338, + "logits/rejected": -2.1794652938842773, + "logps/chosen": -208.49530029296875, + "logps/rejected": -291.08233642578125, + "loss": 0.0459, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.7978172302246094, + "rewards/margins": 7.258923053741455, + "rewards/rejected": -11.056741714477539, + "step": 1764 + }, + { + "epoch": 2.31, + "learning_rate": 6.798365418130395e-06, + "logits/chosen": -2.1590709686279297, + "logits/rejected": -2.24398136138916, + "logps/chosen": -198.2674102783203, + "logps/rejected": -315.79046630859375, + "loss": 0.0546, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.832282304763794, + "rewards/margins": 6.987562656402588, + "rewards/rejected": -10.819845199584961, + "step": 1765 + }, + { + "epoch": 2.31, + "learning_rate": 6.773822237032779e-06, + "logits/chosen": -2.0665082931518555, + "logits/rejected": -2.0677051544189453, + "logps/chosen": -214.24874877929688, + "logps/rejected": -261.5728759765625, + "loss": 0.092, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.335167407989502, + "rewards/margins": 6.137868881225586, + "rewards/rejected": -10.473037719726562, + "step": 1766 + }, + { + "epoch": 2.31, + "learning_rate": 6.74931649406732e-06, + "logits/chosen": -2.306215524673462, + "logits/rejected": -2.3424389362335205, + "logps/chosen": -241.4788818359375, + "logps/rejected": -348.3437194824219, + "loss": 0.0144, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.305469512939453, + "rewards/margins": 6.880587577819824, + "rewards/rejected": -11.186056137084961, + "step": 1767 + }, + { + "epoch": 2.31, + "learning_rate": 6.724848239570927e-06, + "logits/chosen": -2.140939474105835, + "logits/rejected": -2.1452903747558594, + "logps/chosen": -219.03012084960938, + "logps/rejected": -269.3534851074219, + "loss": 0.1539, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.8891329765319824, + "rewards/margins": 5.367030143737793, + "rewards/rejected": -9.256163597106934, + "step": 1768 + }, + { + "epoch": 2.32, + "learning_rate": 6.700417523803498e-06, + "logits/chosen": -2.1133172512054443, + "logits/rejected": -2.101212501525879, + "logps/chosen": -195.3477325439453, + "logps/rejected": -258.8363037109375, + "loss": 0.0453, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.436612844467163, + "rewards/margins": 7.114837646484375, + "rewards/rejected": -10.551450729370117, + "step": 1769 + }, + { + "epoch": 2.32, + "learning_rate": 6.6760243969478105e-06, + "logits/chosen": -2.1055779457092285, + "logits/rejected": -2.194345474243164, + "logps/chosen": -215.8966064453125, + "logps/rejected": -301.4342041015625, + "loss": 0.1088, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.7081544399261475, + "rewards/margins": 6.4427995681762695, + "rewards/rejected": -10.15095329284668, + "step": 1770 + }, + { + "epoch": 2.32, + "learning_rate": 6.651668909109435e-06, + "logits/chosen": -2.1403582096099854, + "logits/rejected": -2.265516519546509, + "logps/chosen": -238.2333526611328, + "logps/rejected": -346.2216796875, + "loss": 0.045, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.873538017272949, + "rewards/margins": 8.068729400634766, + "rewards/rejected": -12.942268371582031, + "step": 1771 + }, + { + "epoch": 2.32, + "learning_rate": 6.627351110316635e-06, + "logits/chosen": -2.2598462104797363, + "logits/rejected": -2.347071409225464, + "logps/chosen": -201.2434844970703, + "logps/rejected": -283.64190673828125, + "loss": 0.0904, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.5362493991851807, + "rewards/margins": 6.213850021362305, + "rewards/rejected": -9.750099182128906, + "step": 1772 + }, + { + "epoch": 2.32, + "learning_rate": 6.603071050520262e-06, + "logits/chosen": -1.8852803707122803, + "logits/rejected": -1.8770195245742798, + "logps/chosen": -209.76846313476562, + "logps/rejected": -308.1743469238281, + "loss": 0.0134, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.789165496826172, + "rewards/margins": 7.542721748352051, + "rewards/rejected": -11.331888198852539, + "step": 1773 + }, + { + "epoch": 2.32, + "learning_rate": 6.578828779593632e-06, + "logits/chosen": -1.9376863241195679, + "logits/rejected": -2.0128629207611084, + "logps/chosen": -205.37930297851562, + "logps/rejected": -284.5502624511719, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.145955562591553, + "rewards/margins": 6.896275520324707, + "rewards/rejected": -11.042230606079102, + "step": 1774 + }, + { + "epoch": 2.32, + "learning_rate": 6.554624347332458e-06, + "logits/chosen": -2.1601622104644775, + "logits/rejected": -2.1833298206329346, + "logps/chosen": -190.87811279296875, + "logps/rejected": -273.7500915527344, + "loss": 0.0317, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.168887138366699, + "rewards/margins": 6.715917110443115, + "rewards/rejected": -10.884803771972656, + "step": 1775 + }, + { + "epoch": 2.32, + "learning_rate": 6.530457803454707e-06, + "logits/chosen": -2.22094988822937, + "logits/rejected": -2.216006278991699, + "logps/chosen": -225.97747802734375, + "logps/rejected": -288.8360900878906, + "loss": 0.0618, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.52715539932251, + "rewards/margins": 5.40452766418457, + "rewards/rejected": -9.931682586669922, + "step": 1776 + }, + { + "epoch": 2.33, + "learning_rate": 6.5063291976005445e-06, + "logits/chosen": -2.1064374446868896, + "logits/rejected": -2.1741158962249756, + "logps/chosen": -167.62933349609375, + "logps/rejected": -264.98760986328125, + "loss": 0.058, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.3796675205230713, + "rewards/margins": 6.625879287719727, + "rewards/rejected": -10.005547523498535, + "step": 1777 + }, + { + "epoch": 2.33, + "learning_rate": 6.482238579332184e-06, + "logits/chosen": -2.001718521118164, + "logits/rejected": -2.041412115097046, + "logps/chosen": -194.46405029296875, + "logps/rejected": -295.02532958984375, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.654270648956299, + "rewards/margins": 6.65665340423584, + "rewards/rejected": -10.31092357635498, + "step": 1778 + }, + { + "epoch": 2.33, + "learning_rate": 6.458185998133828e-06, + "logits/chosen": -2.2002484798431396, + "logits/rejected": -2.2560224533081055, + "logps/chosen": -214.6554718017578, + "logps/rejected": -292.0528564453125, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.23142409324646, + "rewards/margins": 7.109623432159424, + "rewards/rejected": -10.341047286987305, + "step": 1779 + }, + { + "epoch": 2.33, + "learning_rate": 6.434171503411557e-06, + "logits/chosen": -2.2057886123657227, + "logits/rejected": -2.1664533615112305, + "logps/chosen": -233.0133056640625, + "logps/rejected": -273.194580078125, + "loss": 0.046, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.7067742347717285, + "rewards/margins": 6.4068708419799805, + "rewards/rejected": -10.113645553588867, + "step": 1780 + }, + { + "epoch": 2.33, + "learning_rate": 6.4101951444931725e-06, + "logits/chosen": -2.3328921794891357, + "logits/rejected": -2.2982568740844727, + "logps/chosen": -278.59051513671875, + "logps/rejected": -312.4544372558594, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.552913188934326, + "rewards/margins": 7.191999912261963, + "rewards/rejected": -10.744913101196289, + "step": 1781 + }, + { + "epoch": 2.33, + "learning_rate": 6.386256970628185e-06, + "logits/chosen": -2.1103456020355225, + "logits/rejected": -2.2298026084899902, + "logps/chosen": -227.68081665039062, + "logps/rejected": -353.94866943359375, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.168125152587891, + "rewards/margins": 7.072556018829346, + "rewards/rejected": -11.240681648254395, + "step": 1782 + }, + { + "epoch": 2.33, + "learning_rate": 6.362357030987667e-06, + "logits/chosen": -2.0208115577697754, + "logits/rejected": -2.057620048522949, + "logps/chosen": -211.07749938964844, + "logps/rejected": -317.357666015625, + "loss": 0.098, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.873246669769287, + "rewards/margins": 6.117911338806152, + "rewards/rejected": -9.991158485412598, + "step": 1783 + }, + { + "epoch": 2.34, + "learning_rate": 6.338495374664127e-06, + "logits/chosen": -1.9480476379394531, + "logits/rejected": -2.128697156906128, + "logps/chosen": -176.2564239501953, + "logps/rejected": -337.870361328125, + "loss": 0.0179, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.32150936126709, + "rewards/margins": 6.756161689758301, + "rewards/rejected": -11.07767105102539, + "step": 1784 + }, + { + "epoch": 2.34, + "learning_rate": 6.314672050671461e-06, + "logits/chosen": -1.9140262603759766, + "logits/rejected": -2.0411410331726074, + "logps/chosen": -140.927978515625, + "logps/rejected": -299.61907958984375, + "loss": 0.0278, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6733498573303223, + "rewards/margins": 7.062857151031494, + "rewards/rejected": -10.736207008361816, + "step": 1785 + }, + { + "epoch": 2.34, + "learning_rate": 6.290887107944826e-06, + "logits/chosen": -2.172755002975464, + "logits/rejected": -2.233189344406128, + "logps/chosen": -208.9151611328125, + "logps/rejected": -296.10662841796875, + "loss": 0.0876, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.058465480804443, + "rewards/margins": 8.806312561035156, + "rewards/rejected": -12.864777565002441, + "step": 1786 + }, + { + "epoch": 2.34, + "learning_rate": 6.267140595340529e-06, + "logits/chosen": -2.045510768890381, + "logits/rejected": -2.1158676147460938, + "logps/chosen": -250.94557189941406, + "logps/rejected": -366.7011413574219, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7585463523864746, + "rewards/margins": 7.752059459686279, + "rewards/rejected": -11.510605812072754, + "step": 1787 + }, + { + "epoch": 2.34, + "learning_rate": 6.243432561635934e-06, + "logits/chosen": -2.1238536834716797, + "logits/rejected": -2.099079132080078, + "logps/chosen": -199.07931518554688, + "logps/rejected": -258.5728759765625, + "loss": 0.061, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.148482322692871, + "rewards/margins": 6.015169143676758, + "rewards/rejected": -9.163650512695312, + "step": 1788 + }, + { + "epoch": 2.34, + "learning_rate": 6.219763055529384e-06, + "logits/chosen": -2.1907827854156494, + "logits/rejected": -2.333000659942627, + "logps/chosen": -188.3475341796875, + "logps/rejected": -268.57891845703125, + "loss": 0.0544, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.167145252227783, + "rewards/margins": 6.312406539916992, + "rewards/rejected": -10.47955322265625, + "step": 1789 + }, + { + "epoch": 2.34, + "learning_rate": 6.1961321256400836e-06, + "logits/chosen": -2.2008023262023926, + "logits/rejected": -2.1978447437286377, + "logps/chosen": -208.06871032714844, + "logps/rejected": -296.72540283203125, + "loss": 0.09, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.789475917816162, + "rewards/margins": 6.763521194458008, + "rewards/rejected": -10.552996635437012, + "step": 1790 + }, + { + "epoch": 2.34, + "learning_rate": 6.172539820507977e-06, + "logits/chosen": -2.1622846126556396, + "logits/rejected": -2.1091556549072266, + "logps/chosen": -232.94302368164062, + "logps/rejected": -298.225830078125, + "loss": 0.0877, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.8711302280426025, + "rewards/margins": 7.550189018249512, + "rewards/rejected": -11.421319961547852, + "step": 1791 + }, + { + "epoch": 2.35, + "learning_rate": 6.1489861885936805e-06, + "logits/chosen": -2.2897160053253174, + "logits/rejected": -2.225477457046509, + "logps/chosen": -203.59793090820312, + "logps/rejected": -269.2810363769531, + "loss": 0.041, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.657342910766602, + "rewards/margins": 5.531937599182129, + "rewards/rejected": -10.189279556274414, + "step": 1792 + }, + { + "epoch": 2.35, + "learning_rate": 6.125471278278378e-06, + "logits/chosen": -2.196854829788208, + "logits/rejected": -2.1537506580352783, + "logps/chosen": -206.58926391601562, + "logps/rejected": -249.27731323242188, + "loss": 0.0916, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.326552391052246, + "rewards/margins": 5.518029689788818, + "rewards/rejected": -8.844581604003906, + "step": 1793 + }, + { + "epoch": 2.35, + "learning_rate": 6.101995137863717e-06, + "logits/chosen": -2.1413371562957764, + "logits/rejected": -2.0484182834625244, + "logps/chosen": -211.50439453125, + "logps/rejected": -281.75872802734375, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6685571670532227, + "rewards/margins": 6.447417259216309, + "rewards/rejected": -10.115974426269531, + "step": 1794 + }, + { + "epoch": 2.35, + "learning_rate": 6.078557815571692e-06, + "logits/chosen": -2.123558521270752, + "logits/rejected": -2.0788276195526123, + "logps/chosen": -182.96690368652344, + "logps/rejected": -256.431640625, + "loss": 0.0097, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9243383407592773, + "rewards/margins": 5.789246082305908, + "rewards/rejected": -9.713583946228027, + "step": 1795 + }, + { + "epoch": 2.35, + "learning_rate": 6.055159359544579e-06, + "logits/chosen": -2.0483345985412598, + "logits/rejected": -2.150050640106201, + "logps/chosen": -293.3150634765625, + "logps/rejected": -381.803955078125, + "loss": 0.0606, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.172520160675049, + "rewards/margins": 6.860371112823486, + "rewards/rejected": -11.032891273498535, + "step": 1796 + }, + { + "epoch": 2.35, + "learning_rate": 6.03179981784483e-06, + "logits/chosen": -2.1611032485961914, + "logits/rejected": -2.1823787689208984, + "logps/chosen": -232.75491333007812, + "logps/rejected": -306.81280517578125, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9949727058410645, + "rewards/margins": 7.062458038330078, + "rewards/rejected": -11.0574312210083, + "step": 1797 + }, + { + "epoch": 2.35, + "learning_rate": 6.008479238454915e-06, + "logits/chosen": -2.1579670906066895, + "logits/rejected": -2.1914076805114746, + "logps/chosen": -226.7843017578125, + "logps/rejected": -308.40167236328125, + "loss": 0.0523, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.199453830718994, + "rewards/margins": 7.539890289306641, + "rewards/rejected": -11.73934268951416, + "step": 1798 + }, + { + "epoch": 2.35, + "learning_rate": 5.98519766927732e-06, + "logits/chosen": -2.123642921447754, + "logits/rejected": -2.2567496299743652, + "logps/chosen": -254.04586791992188, + "logps/rejected": -333.2673034667969, + "loss": 0.0467, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.96802282333374, + "rewards/margins": 6.388888359069824, + "rewards/rejected": -11.356910705566406, + "step": 1799 + }, + { + "epoch": 2.36, + "learning_rate": 5.961955158134391e-06, + "logits/chosen": -2.243257999420166, + "logits/rejected": -2.239105463027954, + "logps/chosen": -197.17347717285156, + "logps/rejected": -256.05780029296875, + "loss": 0.0493, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7219371795654297, + "rewards/margins": 5.918248176574707, + "rewards/rejected": -9.640185356140137, + "step": 1800 + }, + { + "epoch": 2.36, + "learning_rate": 5.938751752768226e-06, + "logits/chosen": -1.9114123582839966, + "logits/rejected": -2.05192232131958, + "logps/chosen": -150.3896026611328, + "logps/rejected": -231.26318359375, + "loss": 0.0529, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4007928371429443, + "rewards/margins": 6.229790687561035, + "rewards/rejected": -9.630583763122559, + "step": 1801 + }, + { + "epoch": 2.36, + "learning_rate": 5.915587500840625e-06, + "logits/chosen": -2.0898430347442627, + "logits/rejected": -2.1754465103149414, + "logps/chosen": -214.41954040527344, + "logps/rejected": -312.51507568359375, + "loss": 0.0215, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.004836082458496, + "rewards/margins": 7.19172477722168, + "rewards/rejected": -11.196561813354492, + "step": 1802 + }, + { + "epoch": 2.36, + "learning_rate": 5.892462449932928e-06, + "logits/chosen": -2.1643826961517334, + "logits/rejected": -2.1606805324554443, + "logps/chosen": -276.9612121582031, + "logps/rejected": -313.9469909667969, + "loss": 0.0463, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.348579406738281, + "rewards/margins": 7.2748212814331055, + "rewards/rejected": -11.623401641845703, + "step": 1803 + }, + { + "epoch": 2.36, + "learning_rate": 5.869376647545993e-06, + "logits/chosen": -2.0581490993499756, + "logits/rejected": -2.216388463973999, + "logps/chosen": -171.70692443847656, + "logps/rejected": -271.78564453125, + "loss": 0.0136, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.89931058883667, + "rewards/margins": 7.627808094024658, + "rewards/rejected": -10.527118682861328, + "step": 1804 + }, + { + "epoch": 2.36, + "learning_rate": 5.84633014110002e-06, + "logits/chosen": -2.058985471725464, + "logits/rejected": -2.06011962890625, + "logps/chosen": -212.3360137939453, + "logps/rejected": -267.62933349609375, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.888467788696289, + "rewards/margins": 5.834796905517578, + "rewards/rejected": -9.723264694213867, + "step": 1805 + }, + { + "epoch": 2.36, + "learning_rate": 5.82332297793452e-06, + "logits/chosen": -1.9412457942962646, + "logits/rejected": -1.9836660623550415, + "logps/chosen": -163.65220642089844, + "logps/rejected": -238.16073608398438, + "loss": 0.0807, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.8942549228668213, + "rewards/margins": 5.957305908203125, + "rewards/rejected": -9.851561546325684, + "step": 1806 + }, + { + "epoch": 2.37, + "learning_rate": 5.800355205308183e-06, + "logits/chosen": -2.0713634490966797, + "logits/rejected": -2.0610740184783936, + "logps/chosen": -254.6533966064453, + "logps/rejected": -293.9253234863281, + "loss": 0.0981, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.2671544551849365, + "rewards/margins": 6.199561595916748, + "rewards/rejected": -9.466714859008789, + "step": 1807 + }, + { + "epoch": 2.37, + "learning_rate": 5.777426870398777e-06, + "logits/chosen": -2.208155393600464, + "logits/rejected": -2.2371108531951904, + "logps/chosen": -212.6221160888672, + "logps/rejected": -268.3111572265625, + "loss": 0.0119, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.939673662185669, + "rewards/margins": 6.598185062408447, + "rewards/rejected": -10.537859916687012, + "step": 1808 + }, + { + "epoch": 2.37, + "learning_rate": 5.754538020303063e-06, + "logits/chosen": -2.1227216720581055, + "logits/rejected": -2.115285873413086, + "logps/chosen": -214.60894775390625, + "logps/rejected": -292.3565673828125, + "loss": 0.0516, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.129734992980957, + "rewards/margins": 6.104279518127441, + "rewards/rejected": -10.234013557434082, + "step": 1809 + }, + { + "epoch": 2.37, + "learning_rate": 5.731688702036717e-06, + "logits/chosen": -2.2615931034088135, + "logits/rejected": -2.1857032775878906, + "logps/chosen": -222.4258270263672, + "logps/rejected": -317.2377014160156, + "loss": 0.0444, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.3869171142578125, + "rewards/margins": 7.405966281890869, + "rewards/rejected": -10.79288387298584, + "step": 1810 + }, + { + "epoch": 2.37, + "learning_rate": 5.708878962534181e-06, + "logits/chosen": -2.1840271949768066, + "logits/rejected": -2.338179349899292, + "logps/chosen": -247.80409240722656, + "logps/rejected": -328.4097900390625, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.7568488121032715, + "rewards/margins": 7.269282341003418, + "rewards/rejected": -12.026128768920898, + "step": 1811 + }, + { + "epoch": 2.37, + "learning_rate": 5.686108848648624e-06, + "logits/chosen": -2.262352228164673, + "logits/rejected": -2.274977922439575, + "logps/chosen": -210.85458374023438, + "logps/rejected": -297.81134033203125, + "loss": 0.0531, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.134511947631836, + "rewards/margins": 6.266992092132568, + "rewards/rejected": -10.401504516601562, + "step": 1812 + }, + { + "epoch": 2.37, + "learning_rate": 5.6633784071518205e-06, + "logits/chosen": -1.9341158866882324, + "logits/rejected": -1.916247010231018, + "logps/chosen": -189.86752319335938, + "logps/rejected": -254.46963500976562, + "loss": 0.0546, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.688324451446533, + "rewards/margins": 6.383115291595459, + "rewards/rejected": -10.071439743041992, + "step": 1813 + }, + { + "epoch": 2.37, + "learning_rate": 5.640687684734039e-06, + "logits/chosen": -2.0308847427368164, + "logits/rejected": -2.06268572807312, + "logps/chosen": -182.3255157470703, + "logps/rejected": -262.5723571777344, + "loss": 0.0904, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.329672336578369, + "rewards/margins": 6.441825866699219, + "rewards/rejected": -9.77149772644043, + "step": 1814 + }, + { + "epoch": 2.38, + "learning_rate": 5.618036728003965e-06, + "logits/chosen": -2.065044403076172, + "logits/rejected": -2.2009060382843018, + "logps/chosen": -199.57066345214844, + "logps/rejected": -292.4554443359375, + "loss": 0.0863, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.339345455169678, + "rewards/margins": 6.474607944488525, + "rewards/rejected": -10.813953399658203, + "step": 1815 + }, + { + "epoch": 2.38, + "learning_rate": 5.595425583488608e-06, + "logits/chosen": -2.1938352584838867, + "logits/rejected": -2.227843761444092, + "logps/chosen": -203.678466796875, + "logps/rejected": -304.1196594238281, + "loss": 0.0091, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.499175548553467, + "rewards/margins": 6.937692642211914, + "rewards/rejected": -11.436869621276855, + "step": 1816 + }, + { + "epoch": 2.38, + "learning_rate": 5.572854297633209e-06, + "logits/chosen": -1.8822352886199951, + "logits/rejected": -1.8548696041107178, + "logps/chosen": -236.59857177734375, + "logps/rejected": -265.0214538574219, + "loss": 0.0951, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.8601155281066895, + "rewards/margins": 5.618002414703369, + "rewards/rejected": -8.478118896484375, + "step": 1817 + }, + { + "epoch": 2.38, + "learning_rate": 5.550322916801115e-06, + "logits/chosen": -2.0333809852600098, + "logits/rejected": -2.1343681812286377, + "logps/chosen": -205.2764129638672, + "logps/rejected": -323.6562194824219, + "loss": 0.0512, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.380003929138184, + "rewards/margins": 7.164127349853516, + "rewards/rejected": -11.5441312789917, + "step": 1818 + }, + { + "epoch": 2.38, + "learning_rate": 5.5278314872737105e-06, + "logits/chosen": -2.0725784301757812, + "logits/rejected": -2.0405972003936768, + "logps/chosen": -218.96310424804688, + "logps/rejected": -305.20550537109375, + "loss": 0.0121, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.223976135253906, + "rewards/margins": 7.075107097625732, + "rewards/rejected": -11.299083709716797, + "step": 1819 + }, + { + "epoch": 2.38, + "learning_rate": 5.505380055250325e-06, + "logits/chosen": -2.281785488128662, + "logits/rejected": -2.3167812824249268, + "logps/chosen": -217.0192108154297, + "logps/rejected": -308.8101501464844, + "loss": 0.0607, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.779785633087158, + "rewards/margins": 7.025328159332275, + "rewards/rejected": -10.80511474609375, + "step": 1820 + }, + { + "epoch": 2.38, + "learning_rate": 5.482968666848132e-06, + "logits/chosen": -2.3007218837738037, + "logits/rejected": -2.2076327800750732, + "logps/chosen": -233.9478759765625, + "logps/rejected": -254.18597412109375, + "loss": 0.0992, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.815403461456299, + "rewards/margins": 4.948131561279297, + "rewards/rejected": -8.763534545898438, + "step": 1821 + }, + { + "epoch": 2.38, + "learning_rate": 5.460597368102033e-06, + "logits/chosen": -2.3140413761138916, + "logits/rejected": -2.2337114810943604, + "logps/chosen": -215.36460876464844, + "logps/rejected": -273.8767395019531, + "loss": 0.0487, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.311752796173096, + "rewards/margins": 5.821850776672363, + "rewards/rejected": -10.1336030960083, + "step": 1822 + }, + { + "epoch": 2.39, + "learning_rate": 5.4382662049646036e-06, + "logits/chosen": -2.12442946434021, + "logits/rejected": -2.170098066329956, + "logps/chosen": -214.23106384277344, + "logps/rejected": -306.04620361328125, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.401787042617798, + "rewards/margins": 8.177001953125, + "rewards/rejected": -11.578789710998535, + "step": 1823 + }, + { + "epoch": 2.39, + "learning_rate": 5.4159752233059745e-06, + "logits/chosen": -1.9921551942825317, + "logits/rejected": -2.0559818744659424, + "logps/chosen": -212.55264282226562, + "logps/rejected": -321.1891174316406, + "loss": 0.0139, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.411143779754639, + "rewards/margins": 7.2184062004089355, + "rewards/rejected": -11.629549980163574, + "step": 1824 + }, + { + "epoch": 2.39, + "learning_rate": 5.393724468913713e-06, + "logits/chosen": -2.294192314147949, + "logits/rejected": -2.3457095623016357, + "logps/chosen": -247.34425354003906, + "logps/rejected": -319.82086181640625, + "loss": 0.0534, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.5188069343566895, + "rewards/margins": 6.9631147384643555, + "rewards/rejected": -11.481922149658203, + "step": 1825 + }, + { + "epoch": 2.39, + "learning_rate": 5.371513987492788e-06, + "logits/chosen": -2.074880361557007, + "logits/rejected": -2.1489012241363525, + "logps/chosen": -218.72879028320312, + "logps/rejected": -264.3954162597656, + "loss": 0.0352, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6044538021087646, + "rewards/margins": 5.694995880126953, + "rewards/rejected": -9.299449920654297, + "step": 1826 + }, + { + "epoch": 2.39, + "learning_rate": 5.34934382466544e-06, + "logits/chosen": -2.2337937355041504, + "logits/rejected": -2.25191593170166, + "logps/chosen": -254.05868530273438, + "logps/rejected": -339.0094909667969, + "loss": 0.0667, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.9696545600891113, + "rewards/margins": 8.696247100830078, + "rewards/rejected": -12.665901184082031, + "step": 1827 + }, + { + "epoch": 2.39, + "learning_rate": 5.32721402597107e-06, + "logits/chosen": -2.317227840423584, + "logits/rejected": -2.2850241661071777, + "logps/chosen": -210.19253540039062, + "logps/rejected": -273.40594482421875, + "loss": 0.0141, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1642563343048096, + "rewards/margins": 6.633707046508789, + "rewards/rejected": -9.79796314239502, + "step": 1828 + }, + { + "epoch": 2.39, + "learning_rate": 5.3051246368661965e-06, + "logits/chosen": -2.1504149436950684, + "logits/rejected": -2.185987949371338, + "logps/chosen": -241.4326629638672, + "logps/rejected": -307.8175964355469, + "loss": 0.0138, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.094233512878418, + "rewards/margins": 7.031077861785889, + "rewards/rejected": -12.125310897827148, + "step": 1829 + }, + { + "epoch": 2.4, + "learning_rate": 5.283075702724305e-06, + "logits/chosen": -2.0142877101898193, + "logits/rejected": -1.9442832469940186, + "logps/chosen": -213.77667236328125, + "logps/rejected": -287.9725341796875, + "loss": 0.0225, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.171930313110352, + "rewards/margins": 6.826648712158203, + "rewards/rejected": -10.998579025268555, + "step": 1830 + }, + { + "epoch": 2.4, + "learning_rate": 5.261067268835812e-06, + "logits/chosen": -2.175037145614624, + "logits/rejected": -2.212203025817871, + "logps/chosen": -184.75479125976562, + "logps/rejected": -272.3804931640625, + "loss": 0.1665, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.450549602508545, + "rewards/margins": 5.633615493774414, + "rewards/rejected": -9.0841646194458, + "step": 1831 + }, + { + "epoch": 2.4, + "learning_rate": 5.239099380407916e-06, + "logits/chosen": -2.1539487838745117, + "logits/rejected": -2.253438711166382, + "logps/chosen": -260.47998046875, + "logps/rejected": -349.2868347167969, + "loss": 0.131, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.51906156539917, + "rewards/margins": 6.439817905426025, + "rewards/rejected": -10.958879470825195, + "step": 1832 + }, + { + "epoch": 2.4, + "learning_rate": 5.217172082564547e-06, + "logits/chosen": -2.2008895874023438, + "logits/rejected": -2.1305854320526123, + "logps/chosen": -217.75546264648438, + "logps/rejected": -274.8553466796875, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7898471355438232, + "rewards/margins": 7.112201690673828, + "rewards/rejected": -10.90204906463623, + "step": 1833 + }, + { + "epoch": 2.4, + "learning_rate": 5.195285420346263e-06, + "logits/chosen": -2.2278761863708496, + "logits/rejected": -2.29577374458313, + "logps/chosen": -209.95132446289062, + "logps/rejected": -294.8236999511719, + "loss": 0.0519, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6230764389038086, + "rewards/margins": 6.232743263244629, + "rewards/rejected": -9.855819702148438, + "step": 1834 + }, + { + "epoch": 2.4, + "learning_rate": 5.17343943871014e-06, + "logits/chosen": -2.0527660846710205, + "logits/rejected": -2.106213092803955, + "logps/chosen": -222.8198699951172, + "logps/rejected": -309.6136169433594, + "loss": 0.0811, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.629582405090332, + "rewards/margins": 6.693077564239502, + "rewards/rejected": -11.322659492492676, + "step": 1835 + }, + { + "epoch": 2.4, + "learning_rate": 5.151634182529691e-06, + "logits/chosen": -2.2363510131835938, + "logits/rejected": -2.3908557891845703, + "logps/chosen": -223.19668579101562, + "logps/rejected": -292.28472900390625, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8910751342773438, + "rewards/margins": 6.210873603820801, + "rewards/rejected": -10.101948738098145, + "step": 1836 + }, + { + "epoch": 2.4, + "learning_rate": 5.129869696594786e-06, + "logits/chosen": -1.9200162887573242, + "logits/rejected": -2.015597105026245, + "logps/chosen": -166.8680419921875, + "logps/rejected": -259.1383361816406, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3057968616485596, + "rewards/margins": 7.60993766784668, + "rewards/rejected": -10.915735244750977, + "step": 1837 + }, + { + "epoch": 2.41, + "learning_rate": 5.108146025611554e-06, + "logits/chosen": -2.149808883666992, + "logits/rejected": -2.2070679664611816, + "logps/chosen": -210.15655517578125, + "logps/rejected": -273.7901611328125, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1777126789093018, + "rewards/margins": 7.393311500549316, + "rewards/rejected": -10.571023941040039, + "step": 1838 + }, + { + "epoch": 2.41, + "learning_rate": 5.086463214202264e-06, + "logits/chosen": -2.139334201812744, + "logits/rejected": -2.168809413909912, + "logps/chosen": -211.06936645507812, + "logps/rejected": -286.35272216796875, + "loss": 0.0602, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5051779747009277, + "rewards/margins": 7.504420757293701, + "rewards/rejected": -11.009599685668945, + "step": 1839 + }, + { + "epoch": 2.41, + "learning_rate": 5.064821306905288e-06, + "logits/chosen": -2.3208675384521484, + "logits/rejected": -2.4141125679016113, + "logps/chosen": -262.73638916015625, + "logps/rejected": -333.20159912109375, + "loss": 0.0568, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.194512844085693, + "rewards/margins": 6.03032112121582, + "rewards/rejected": -11.224833488464355, + "step": 1840 + }, + { + "epoch": 2.41, + "learning_rate": 5.043220348174945e-06, + "logits/chosen": -2.197161912918091, + "logits/rejected": -2.178490400314331, + "logps/chosen": -251.76217651367188, + "logps/rejected": -364.5456848144531, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.281266689300537, + "rewards/margins": 8.348335266113281, + "rewards/rejected": -13.62960433959961, + "step": 1841 + }, + { + "epoch": 2.41, + "learning_rate": 5.021660382381457e-06, + "logits/chosen": -2.2058050632476807, + "logits/rejected": -2.2069003582000732, + "logps/chosen": -234.16897583007812, + "logps/rejected": -254.03367614746094, + "loss": 0.0582, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.6393985748291016, + "rewards/margins": 5.1353535652160645, + "rewards/rejected": -8.774752616882324, + "step": 1842 + }, + { + "epoch": 2.41, + "learning_rate": 5.000141453810847e-06, + "logits/chosen": -2.03346848487854, + "logits/rejected": -2.0008981227874756, + "logps/chosen": -182.87733459472656, + "logps/rejected": -230.134521484375, + "loss": 0.1136, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.548121929168701, + "rewards/margins": 5.3795318603515625, + "rewards/rejected": -8.927654266357422, + "step": 1843 + }, + { + "epoch": 2.41, + "learning_rate": 4.9786636066648436e-06, + "logits/chosen": -1.8051122426986694, + "logits/rejected": -1.7872240543365479, + "logps/chosen": -187.14004516601562, + "logps/rejected": -252.6630401611328, + "loss": 0.0507, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.258450508117676, + "rewards/margins": 7.363024711608887, + "rewards/rejected": -10.621475219726562, + "step": 1844 + }, + { + "epoch": 2.41, + "learning_rate": 4.957226885060779e-06, + "logits/chosen": -2.083578109741211, + "logits/rejected": -2.0582897663116455, + "logps/chosen": -209.77432250976562, + "logps/rejected": -292.0636901855469, + "loss": 0.0432, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.913299322128296, + "rewards/margins": 6.929133892059326, + "rewards/rejected": -10.84243392944336, + "step": 1845 + }, + { + "epoch": 2.42, + "learning_rate": 4.935831333031527e-06, + "logits/chosen": -1.87041437625885, + "logits/rejected": -1.8491570949554443, + "logps/chosen": -164.29840087890625, + "logps/rejected": -232.9718475341797, + "loss": 0.0557, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.0937681198120117, + "rewards/margins": 5.704786777496338, + "rewards/rejected": -8.798555374145508, + "step": 1846 + }, + { + "epoch": 2.42, + "learning_rate": 4.914476994525372e-06, + "logits/chosen": -2.036473035812378, + "logits/rejected": -2.14975643157959, + "logps/chosen": -194.5894317626953, + "logps/rejected": -271.552490234375, + "loss": 0.0304, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.0837812423706055, + "rewards/margins": 6.59035587310791, + "rewards/rejected": -10.6741361618042, + "step": 1847 + }, + { + "epoch": 2.42, + "learning_rate": 4.893163913405971e-06, + "logits/chosen": -2.035865306854248, + "logits/rejected": -2.083787441253662, + "logps/chosen": -226.96177673339844, + "logps/rejected": -301.60595703125, + "loss": 0.0666, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.396075248718262, + "rewards/margins": 7.272006511688232, + "rewards/rejected": -11.668082237243652, + "step": 1848 + }, + { + "epoch": 2.42, + "learning_rate": 4.871892133452211e-06, + "logits/chosen": -2.1103556156158447, + "logits/rejected": -2.068713665008545, + "logps/chosen": -232.10317993164062, + "logps/rejected": -272.7130432128906, + "loss": 0.0491, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.226951837539673, + "rewards/margins": 6.436667442321777, + "rewards/rejected": -9.663619041442871, + "step": 1849 + }, + { + "epoch": 2.42, + "learning_rate": 4.850661698358156e-06, + "logits/chosen": -2.2079739570617676, + "logits/rejected": -2.2390694618225098, + "logps/chosen": -199.2862548828125, + "logps/rejected": -302.1933898925781, + "loss": 0.0546, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.73276424407959, + "rewards/margins": 6.339703559875488, + "rewards/rejected": -10.072466850280762, + "step": 1850 + }, + { + "epoch": 2.42, + "learning_rate": 4.8294726517329496e-06, + "logits/chosen": -2.2845871448516846, + "logits/rejected": -2.249790906906128, + "logps/chosen": -280.6891784667969, + "logps/rejected": -319.7981872558594, + "loss": 0.0901, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.029819965362549, + "rewards/margins": 5.86156702041626, + "rewards/rejected": -9.891386032104492, + "step": 1851 + }, + { + "epoch": 2.42, + "learning_rate": 4.808325037100691e-06, + "logits/chosen": -2.1286332607269287, + "logits/rejected": -2.1560935974121094, + "logps/chosen": -232.47000122070312, + "logps/rejected": -290.2018737792969, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8344550132751465, + "rewards/margins": 7.7763800621032715, + "rewards/rejected": -11.610835075378418, + "step": 1852 + }, + { + "epoch": 2.43, + "learning_rate": 4.787218897900403e-06, + "logits/chosen": -2.200514316558838, + "logits/rejected": -2.30204176902771, + "logps/chosen": -220.23886108398438, + "logps/rejected": -326.209228515625, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3958640098571777, + "rewards/margins": 7.256298542022705, + "rewards/rejected": -10.652162551879883, + "step": 1853 + }, + { + "epoch": 2.43, + "learning_rate": 4.766154277485915e-06, + "logits/chosen": -2.2365496158599854, + "logits/rejected": -2.300928831100464, + "logps/chosen": -237.87591552734375, + "logps/rejected": -280.73968505859375, + "loss": 0.0609, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.0166633129119873, + "rewards/margins": 5.886605262756348, + "rewards/rejected": -8.903267860412598, + "step": 1854 + }, + { + "epoch": 2.43, + "learning_rate": 4.745131219125748e-06, + "logits/chosen": -2.039705991744995, + "logits/rejected": -2.09029221534729, + "logps/chosen": -220.57838439941406, + "logps/rejected": -335.3861083984375, + "loss": 0.0474, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.9333319664001465, + "rewards/margins": 7.484780788421631, + "rewards/rejected": -12.418112754821777, + "step": 1855 + }, + { + "epoch": 2.43, + "learning_rate": 4.7241497660030744e-06, + "logits/chosen": -2.1581056118011475, + "logits/rejected": -2.1779823303222656, + "logps/chosen": -219.03445434570312, + "logps/rejected": -272.8111267089844, + "loss": 0.1063, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.065601348876953, + "rewards/margins": 5.467005729675293, + "rewards/rejected": -9.532607078552246, + "step": 1856 + }, + { + "epoch": 2.43, + "learning_rate": 4.703209961215607e-06, + "logits/chosen": -2.322453498840332, + "logits/rejected": -2.313636541366577, + "logps/chosen": -230.64517211914062, + "logps/rejected": -318.17388916015625, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.623886823654175, + "rewards/margins": 7.339909553527832, + "rewards/rejected": -10.96379566192627, + "step": 1857 + }, + { + "epoch": 2.43, + "learning_rate": 4.682311847775489e-06, + "logits/chosen": -2.083808422088623, + "logits/rejected": -2.1724205017089844, + "logps/chosen": -195.5645294189453, + "logps/rejected": -294.87261962890625, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6428382396698, + "rewards/margins": 7.440382480621338, + "rewards/rejected": -11.083220481872559, + "step": 1858 + }, + { + "epoch": 2.43, + "learning_rate": 4.661455468609235e-06, + "logits/chosen": -2.13789963722229, + "logits/rejected": -2.2410500049591064, + "logps/chosen": -232.37168884277344, + "logps/rejected": -326.5816345214844, + "loss": 0.0609, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.526942729949951, + "rewards/margins": 5.357692718505859, + "rewards/rejected": -9.884634971618652, + "step": 1859 + }, + { + "epoch": 2.43, + "learning_rate": 4.640640866557644e-06, + "logits/chosen": -2.2538199424743652, + "logits/rejected": -2.243154764175415, + "logps/chosen": -244.44183349609375, + "logps/rejected": -315.3876953125, + "loss": 0.0503, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.397599220275879, + "rewards/margins": 6.828386306762695, + "rewards/rejected": -11.225984573364258, + "step": 1860 + }, + { + "epoch": 2.44, + "learning_rate": 4.6198680843756975e-06, + "logits/chosen": -2.1034634113311768, + "logits/rejected": -2.0983917713165283, + "logps/chosen": -176.01702880859375, + "logps/rejected": -260.7190246582031, + "loss": 0.0528, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6103034019470215, + "rewards/margins": 6.3850250244140625, + "rewards/rejected": -9.995328903198242, + "step": 1861 + }, + { + "epoch": 2.44, + "learning_rate": 4.599137164732464e-06, + "logits/chosen": -2.1416175365448, + "logits/rejected": -2.1432993412017822, + "logps/chosen": -234.75640869140625, + "logps/rejected": -292.698486328125, + "loss": 0.045, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.422138690948486, + "rewards/margins": 7.162553787231445, + "rewards/rejected": -11.584692001342773, + "step": 1862 + }, + { + "epoch": 2.44, + "learning_rate": 4.578448150211026e-06, + "logits/chosen": -2.2309107780456543, + "logits/rejected": -2.215064287185669, + "logps/chosen": -275.6539306640625, + "logps/rejected": -332.3753356933594, + "loss": 0.1363, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.8252062797546387, + "rewards/margins": 5.361887454986572, + "rewards/rejected": -9.187093734741211, + "step": 1863 + }, + { + "epoch": 2.44, + "learning_rate": 4.557801083308403e-06, + "logits/chosen": -2.086691379547119, + "logits/rejected": -2.214555501937866, + "logps/chosen": -179.25845336914062, + "logps/rejected": -309.36676025390625, + "loss": 0.0912, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.7035117149353027, + "rewards/margins": 7.860903739929199, + "rewards/rejected": -11.564414978027344, + "step": 1864 + }, + { + "epoch": 2.44, + "learning_rate": 4.53719600643544e-06, + "logits/chosen": -1.9977816343307495, + "logits/rejected": -2.0475850105285645, + "logps/chosen": -233.91769409179688, + "logps/rejected": -273.3837890625, + "loss": 0.0612, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.841904640197754, + "rewards/margins": 5.550585746765137, + "rewards/rejected": -10.392489433288574, + "step": 1865 + }, + { + "epoch": 2.44, + "learning_rate": 4.516632961916722e-06, + "logits/chosen": -2.2113523483276367, + "logits/rejected": -2.1286487579345703, + "logps/chosen": -204.07412719726562, + "logps/rejected": -298.1314697265625, + "loss": 0.0085, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.31687593460083, + "rewards/margins": 8.184849739074707, + "rewards/rejected": -12.501725196838379, + "step": 1866 + }, + { + "epoch": 2.44, + "learning_rate": 4.496111991990518e-06, + "logits/chosen": -2.2053539752960205, + "logits/rejected": -2.2924482822418213, + "logps/chosen": -228.98101806640625, + "logps/rejected": -321.0423278808594, + "loss": 0.0346, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.674072027206421, + "rewards/margins": 7.283987045288086, + "rewards/rejected": -10.958059310913086, + "step": 1867 + }, + { + "epoch": 2.45, + "learning_rate": 4.475633138808663e-06, + "logits/chosen": -1.9363011121749878, + "logits/rejected": -1.9873145818710327, + "logps/chosen": -165.1034393310547, + "logps/rejected": -216.7107391357422, + "loss": 0.1014, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.013890266418457, + "rewards/margins": 5.167486667633057, + "rewards/rejected": -9.181377410888672, + "step": 1868 + }, + { + "epoch": 2.45, + "learning_rate": 4.45519644443646e-06, + "logits/chosen": -2.1952757835388184, + "logits/rejected": -2.2331581115722656, + "logps/chosen": -225.0584716796875, + "logps/rejected": -300.4390563964844, + "loss": 0.0914, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.084003448486328, + "rewards/margins": 6.949990272521973, + "rewards/rejected": -11.033992767333984, + "step": 1869 + }, + { + "epoch": 2.45, + "learning_rate": 4.434801950852644e-06, + "logits/chosen": -2.349788188934326, + "logits/rejected": -2.2885594367980957, + "logps/chosen": -217.0651397705078, + "logps/rejected": -284.756591796875, + "loss": 0.0557, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.3623266220092773, + "rewards/margins": 6.815611362457275, + "rewards/rejected": -10.177937507629395, + "step": 1870 + }, + { + "epoch": 2.45, + "learning_rate": 4.414449699949255e-06, + "logits/chosen": -2.01163387298584, + "logits/rejected": -2.0794479846954346, + "logps/chosen": -205.29641723632812, + "logps/rejected": -270.76806640625, + "loss": 0.0672, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4594311714172363, + "rewards/margins": 6.427774429321289, + "rewards/rejected": -9.887205123901367, + "step": 1871 + }, + { + "epoch": 2.45, + "learning_rate": 4.394139733531555e-06, + "logits/chosen": -1.8879884481430054, + "logits/rejected": -2.0123119354248047, + "logps/chosen": -181.33779907226562, + "logps/rejected": -244.18841552734375, + "loss": 0.0808, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.287167549133301, + "rewards/margins": 5.1526970863342285, + "rewards/rejected": -9.439864158630371, + "step": 1872 + }, + { + "epoch": 2.45, + "learning_rate": 4.373872093317965e-06, + "logits/chosen": -2.0780861377716064, + "logits/rejected": -2.073702573776245, + "logps/chosen": -190.65650939941406, + "logps/rejected": -220.1215362548828, + "loss": 0.136, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.566168785095215, + "rewards/margins": 5.389325141906738, + "rewards/rejected": -7.9554948806762695, + "step": 1873 + }, + { + "epoch": 2.45, + "learning_rate": 4.353646820939944e-06, + "logits/chosen": -2.2936534881591797, + "logits/rejected": -2.285000801086426, + "logps/chosen": -180.38958740234375, + "logps/rejected": -240.47410583496094, + "loss": 0.0255, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.39701247215271, + "rewards/margins": 5.853748321533203, + "rewards/rejected": -9.250761032104492, + "step": 1874 + }, + { + "epoch": 2.45, + "learning_rate": 4.333463957941952e-06, + "logits/chosen": -2.1267402172088623, + "logits/rejected": -2.1798155307769775, + "logps/chosen": -219.91506958007812, + "logps/rejected": -271.61383056640625, + "loss": 0.0195, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.224079608917236, + "rewards/margins": 5.692914962768555, + "rewards/rejected": -9.91699504852295, + "step": 1875 + }, + { + "epoch": 2.46, + "learning_rate": 4.313323545781306e-06, + "logits/chosen": -2.109710931777954, + "logits/rejected": -2.1083834171295166, + "logps/chosen": -223.51898193359375, + "logps/rejected": -299.0489807128906, + "loss": 0.0466, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.537987232208252, + "rewards/margins": 7.322035789489746, + "rewards/rejected": -11.860023498535156, + "step": 1876 + }, + { + "epoch": 2.46, + "learning_rate": 4.293225625828143e-06, + "logits/chosen": -2.1878747940063477, + "logits/rejected": -2.1590054035186768, + "logps/chosen": -207.21958923339844, + "logps/rejected": -266.81854248046875, + "loss": 0.041, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.3527703285217285, + "rewards/margins": 6.260322093963623, + "rewards/rejected": -10.613092422485352, + "step": 1877 + }, + { + "epoch": 2.46, + "learning_rate": 4.273170239365323e-06, + "logits/chosen": -1.992726445198059, + "logits/rejected": -2.107708692550659, + "logps/chosen": -196.45310974121094, + "logps/rejected": -290.7101135253906, + "loss": 0.0187, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.52007794380188, + "rewards/margins": 7.199146747589111, + "rewards/rejected": -10.71922492980957, + "step": 1878 + }, + { + "epoch": 2.46, + "learning_rate": 4.253157427588325e-06, + "logits/chosen": -2.169482469558716, + "logits/rejected": -2.23740816116333, + "logps/chosen": -227.9417724609375, + "logps/rejected": -315.59869384765625, + "loss": 0.0206, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9731407165527344, + "rewards/margins": 7.469448089599609, + "rewards/rejected": -11.442588806152344, + "step": 1879 + }, + { + "epoch": 2.46, + "learning_rate": 4.233187231605173e-06, + "logits/chosen": -2.2113475799560547, + "logits/rejected": -2.294445037841797, + "logps/chosen": -218.82135009765625, + "logps/rejected": -317.256103515625, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6115753650665283, + "rewards/margins": 7.621359825134277, + "rewards/rejected": -11.232934951782227, + "step": 1880 + }, + { + "epoch": 2.46, + "learning_rate": 4.213259692436367e-06, + "logits/chosen": -2.0652737617492676, + "logits/rejected": -2.215181827545166, + "logps/chosen": -207.25869750976562, + "logps/rejected": -307.0806884765625, + "loss": 0.0542, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.270017147064209, + "rewards/margins": 6.618261337280273, + "rewards/rejected": -10.88827896118164, + "step": 1881 + }, + { + "epoch": 2.46, + "learning_rate": 4.193374851014789e-06, + "logits/chosen": -1.949131965637207, + "logits/rejected": -2.18892240524292, + "logps/chosen": -203.45437622070312, + "logps/rejected": -297.58172607421875, + "loss": 0.0167, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.3908915519714355, + "rewards/margins": 7.496838569641113, + "rewards/rejected": -11.887728691101074, + "step": 1882 + }, + { + "epoch": 2.46, + "learning_rate": 4.1735327481855965e-06, + "logits/chosen": -2.106013298034668, + "logits/rejected": -2.169212579727173, + "logps/chosen": -217.0201873779297, + "logps/rejected": -294.45233154296875, + "loss": 0.0484, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.394174814224243, + "rewards/margins": 6.548811912536621, + "rewards/rejected": -9.942986488342285, + "step": 1883 + }, + { + "epoch": 2.47, + "learning_rate": 4.153733424706183e-06, + "logits/chosen": -2.1933975219726562, + "logits/rejected": -2.367788791656494, + "logps/chosen": -221.85044860839844, + "logps/rejected": -306.055419921875, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.990512847900391, + "rewards/margins": 7.562135219573975, + "rewards/rejected": -12.552647590637207, + "step": 1884 + }, + { + "epoch": 2.47, + "learning_rate": 4.13397692124605e-06, + "logits/chosen": -2.1423697471618652, + "logits/rejected": -2.155214548110962, + "logps/chosen": -166.47132873535156, + "logps/rejected": -234.875244140625, + "loss": 0.0554, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.6059443950653076, + "rewards/margins": 6.1131415367126465, + "rewards/rejected": -9.719085693359375, + "step": 1885 + }, + { + "epoch": 2.47, + "learning_rate": 4.114263278386743e-06, + "logits/chosen": -2.2024450302124023, + "logits/rejected": -2.214484691619873, + "logps/chosen": -239.01707458496094, + "logps/rejected": -311.5782775878906, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.257002353668213, + "rewards/margins": 6.260740280151367, + "rewards/rejected": -10.517742156982422, + "step": 1886 + }, + { + "epoch": 2.47, + "learning_rate": 4.09459253662178e-06, + "logits/chosen": -2.194406270980835, + "logits/rejected": -2.19130277633667, + "logps/chosen": -259.03558349609375, + "logps/rejected": -298.7030334472656, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.031529426574707, + "rewards/margins": 7.081670761108398, + "rewards/rejected": -11.113200187683105, + "step": 1887 + }, + { + "epoch": 2.47, + "learning_rate": 4.074964736356563e-06, + "logits/chosen": -2.1347687244415283, + "logits/rejected": -2.1638143062591553, + "logps/chosen": -223.41014099121094, + "logps/rejected": -296.030029296875, + "loss": 0.1773, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.8601789474487305, + "rewards/margins": 6.453425407409668, + "rewards/rejected": -11.313605308532715, + "step": 1888 + }, + { + "epoch": 2.47, + "learning_rate": 4.055379917908258e-06, + "logits/chosen": -2.199756622314453, + "logits/rejected": -2.2278687953948975, + "logps/chosen": -216.8646240234375, + "logps/rejected": -284.65582275390625, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.242045879364014, + "rewards/margins": 6.611310958862305, + "rewards/rejected": -10.85335636138916, + "step": 1889 + }, + { + "epoch": 2.47, + "learning_rate": 4.035838121505778e-06, + "logits/chosen": -2.117622137069702, + "logits/rejected": -2.110386610031128, + "logps/chosen": -198.87899780273438, + "logps/rejected": -273.65032958984375, + "loss": 0.1022, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.7314319610595703, + "rewards/margins": 5.809887409210205, + "rewards/rejected": -9.541318893432617, + "step": 1890 + }, + { + "epoch": 2.48, + "learning_rate": 4.016339387289636e-06, + "logits/chosen": -2.0531842708587646, + "logits/rejected": -2.182842969894409, + "logps/chosen": -188.74774169921875, + "logps/rejected": -319.1669921875, + "loss": 0.0435, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8636474609375, + "rewards/margins": 7.400137901306152, + "rewards/rejected": -11.263785362243652, + "step": 1891 + }, + { + "epoch": 2.48, + "learning_rate": 3.996883755311917e-06, + "logits/chosen": -2.0855536460876465, + "logits/rejected": -2.086881160736084, + "logps/chosen": -200.0166473388672, + "logps/rejected": -285.82470703125, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.857919931411743, + "rewards/margins": 7.221430778503418, + "rewards/rejected": -11.079350471496582, + "step": 1892 + }, + { + "epoch": 2.48, + "learning_rate": 3.977471265536142e-06, + "logits/chosen": -2.2221086025238037, + "logits/rejected": -2.2463083267211914, + "logps/chosen": -170.12353515625, + "logps/rejected": -256.7975769042969, + "loss": 0.0248, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.535106658935547, + "rewards/margins": 6.626214504241943, + "rewards/rejected": -11.161319732666016, + "step": 1893 + }, + { + "epoch": 2.48, + "learning_rate": 3.95810195783724e-06, + "logits/chosen": -2.061293601989746, + "logits/rejected": -2.150324821472168, + "logps/chosen": -192.5421600341797, + "logps/rejected": -289.6590270996094, + "loss": 0.0204, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2623133659362793, + "rewards/margins": 7.353786468505859, + "rewards/rejected": -10.616100311279297, + "step": 1894 + }, + { + "epoch": 2.48, + "learning_rate": 3.938775872001441e-06, + "logits/chosen": -2.112722396850586, + "logits/rejected": -2.166233777999878, + "logps/chosen": -203.79443359375, + "logps/rejected": -288.2057800292969, + "loss": 0.1395, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.261386394500732, + "rewards/margins": 5.453499794006348, + "rewards/rejected": -9.714886665344238, + "step": 1895 + }, + { + "epoch": 2.48, + "learning_rate": 3.919493047726156e-06, + "logits/chosen": -2.3003602027893066, + "logits/rejected": -2.379110336303711, + "logps/chosen": -244.69813537597656, + "logps/rejected": -320.92449951171875, + "loss": 0.0472, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.002113342285156, + "rewards/margins": 6.668060302734375, + "rewards/rejected": -10.670173645019531, + "step": 1896 + }, + { + "epoch": 2.48, + "learning_rate": 3.900253524619973e-06, + "logits/chosen": -2.013737440109253, + "logits/rejected": -2.1030960083007812, + "logps/chosen": -189.7953643798828, + "logps/rejected": -281.83917236328125, + "loss": 0.0478, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.178432941436768, + "rewards/margins": 7.2240376472473145, + "rewards/rejected": -11.402470588684082, + "step": 1897 + }, + { + "epoch": 2.48, + "learning_rate": 3.881057342202532e-06, + "logits/chosen": -2.089158058166504, + "logits/rejected": -2.138014078140259, + "logps/chosen": -201.237548828125, + "logps/rejected": -275.27056884765625, + "loss": 0.0535, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.8264353275299072, + "rewards/margins": 6.348036766052246, + "rewards/rejected": -10.174470901489258, + "step": 1898 + }, + { + "epoch": 2.49, + "learning_rate": 3.861904539904421e-06, + "logits/chosen": -2.116814374923706, + "logits/rejected": -2.049565315246582, + "logps/chosen": -254.93804931640625, + "logps/rejected": -324.2756042480469, + "loss": 0.0472, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.7399725914001465, + "rewards/margins": 8.12868595123291, + "rewards/rejected": -12.868659019470215, + "step": 1899 + }, + { + "epoch": 2.49, + "learning_rate": 3.842795157067147e-06, + "logits/chosen": -2.2074708938598633, + "logits/rejected": -2.1448240280151367, + "logps/chosen": -227.035400390625, + "logps/rejected": -306.4009094238281, + "loss": 0.0461, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5230255126953125, + "rewards/margins": 6.875339984893799, + "rewards/rejected": -10.398365020751953, + "step": 1900 + }, + { + "epoch": 2.49, + "learning_rate": 3.823729232943027e-06, + "logits/chosen": -2.3444838523864746, + "logits/rejected": -2.345331907272339, + "logps/chosen": -245.95098876953125, + "logps/rejected": -288.42559814453125, + "loss": 0.0157, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.37188720703125, + "rewards/margins": 5.866891860961914, + "rewards/rejected": -10.238780975341797, + "step": 1901 + }, + { + "epoch": 2.49, + "learning_rate": 3.804706806695099e-06, + "logits/chosen": -2.2394521236419678, + "logits/rejected": -2.2196478843688965, + "logps/chosen": -266.4293518066406, + "logps/rejected": -318.98577880859375, + "loss": 0.091, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.012628555297852, + "rewards/margins": 6.09891939163208, + "rewards/rejected": -11.11154842376709, + "step": 1902 + }, + { + "epoch": 2.49, + "learning_rate": 3.785727917397047e-06, + "logits/chosen": -2.049072742462158, + "logits/rejected": -2.058758497238159, + "logps/chosen": -193.92555236816406, + "logps/rejected": -263.79443359375, + "loss": 0.0292, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.309211730957031, + "rewards/margins": 6.7394914627075195, + "rewards/rejected": -12.04870319366455, + "step": 1903 + }, + { + "epoch": 2.49, + "learning_rate": 3.7667926040331507e-06, + "logits/chosen": -1.9702086448669434, + "logits/rejected": -2.0834591388702393, + "logps/chosen": -209.8311004638672, + "logps/rejected": -278.60272216796875, + "loss": 0.0392, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9955968856811523, + "rewards/margins": 6.438357353210449, + "rewards/rejected": -10.433953285217285, + "step": 1904 + }, + { + "epoch": 2.49, + "learning_rate": 3.7479009054981667e-06, + "logits/chosen": -2.0072498321533203, + "logits/rejected": -1.9878771305084229, + "logps/chosen": -206.0680389404297, + "logps/rejected": -264.1946716308594, + "loss": 0.0639, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.3403706550598145, + "rewards/margins": 5.70688009262085, + "rewards/rejected": -10.047250747680664, + "step": 1905 + }, + { + "epoch": 2.49, + "learning_rate": 3.7290528605972625e-06, + "logits/chosen": -2.19431734085083, + "logits/rejected": -2.1953868865966797, + "logps/chosen": -207.38722229003906, + "logps/rejected": -310.5050354003906, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.520511627197266, + "rewards/margins": 7.738292217254639, + "rewards/rejected": -12.258804321289062, + "step": 1906 + }, + { + "epoch": 2.5, + "learning_rate": 3.7102485080459328e-06, + "logits/chosen": -2.2219109535217285, + "logits/rejected": -2.4010887145996094, + "logps/chosen": -238.17190551757812, + "logps/rejected": -338.6058044433594, + "loss": 0.0593, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.577810049057007, + "rewards/margins": 6.7860822677612305, + "rewards/rejected": -10.363893508911133, + "step": 1907 + }, + { + "epoch": 2.5, + "learning_rate": 3.6914878864699326e-06, + "logits/chosen": -2.216526508331299, + "logits/rejected": -2.2307300567626953, + "logps/chosen": -249.19908142089844, + "logps/rejected": -337.4971618652344, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.253111839294434, + "rewards/margins": 7.2695817947387695, + "rewards/rejected": -12.52269172668457, + "step": 1908 + }, + { + "epoch": 2.5, + "learning_rate": 3.672771034405195e-06, + "logits/chosen": -2.0850324630737305, + "logits/rejected": -2.1441049575805664, + "logps/chosen": -208.41366577148438, + "logps/rejected": -290.0233459472656, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.735658645629883, + "rewards/margins": 6.897973537445068, + "rewards/rejected": -10.633630752563477, + "step": 1909 + }, + { + "epoch": 2.5, + "learning_rate": 3.654097990297731e-06, + "logits/chosen": -1.9736182689666748, + "logits/rejected": -1.986024022102356, + "logps/chosen": -218.3415069580078, + "logps/rejected": -287.7013854980469, + "loss": 0.1002, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.8704919815063477, + "rewards/margins": 7.1873321533203125, + "rewards/rejected": -11.05782413482666, + "step": 1910 + }, + { + "epoch": 2.5, + "learning_rate": 3.6354687925035743e-06, + "logits/chosen": -2.2040674686431885, + "logits/rejected": -2.199704885482788, + "logps/chosen": -235.47389221191406, + "logps/rejected": -320.45123291015625, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.39183235168457, + "rewards/margins": 7.688186168670654, + "rewards/rejected": -12.08001708984375, + "step": 1911 + }, + { + "epoch": 2.5, + "learning_rate": 3.6168834792887103e-06, + "logits/chosen": -2.087803840637207, + "logits/rejected": -2.1227338314056396, + "logps/chosen": -211.21353149414062, + "logps/rejected": -297.54248046875, + "loss": 0.0194, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.358908176422119, + "rewards/margins": 8.122173309326172, + "rewards/rejected": -11.481081008911133, + "step": 1912 + }, + { + "epoch": 2.5, + "learning_rate": 3.598342088828943e-06, + "logits/chosen": -1.997459053993225, + "logits/rejected": -2.028629779815674, + "logps/chosen": -236.18214416503906, + "logps/rejected": -311.0181579589844, + "loss": 0.0148, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.55603289604187, + "rewards/margins": 7.4721574783325195, + "rewards/rejected": -11.028191566467285, + "step": 1913 + }, + { + "epoch": 2.51, + "learning_rate": 3.5798446592098883e-06, + "logits/chosen": -2.1438562870025635, + "logits/rejected": -2.171473741531372, + "logps/chosen": -218.49282836914062, + "logps/rejected": -310.1673278808594, + "loss": 0.0471, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.263566970825195, + "rewards/margins": 7.407224178314209, + "rewards/rejected": -11.670791625976562, + "step": 1914 + }, + { + "epoch": 2.51, + "learning_rate": 3.561391228426861e-06, + "logits/chosen": -2.077690839767456, + "logits/rejected": -2.088977336883545, + "logps/chosen": -236.00921630859375, + "logps/rejected": -332.857666015625, + "loss": 0.0475, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.714725017547607, + "rewards/margins": 7.768087863922119, + "rewards/rejected": -12.482812881469727, + "step": 1915 + }, + { + "epoch": 2.51, + "learning_rate": 3.542981834384776e-06, + "logits/chosen": -2.1037631034851074, + "logits/rejected": -2.1485438346862793, + "logps/chosen": -194.81832885742188, + "logps/rejected": -251.98809814453125, + "loss": 0.0741, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.424944877624512, + "rewards/margins": 6.0854172706604, + "rewards/rejected": -10.510361671447754, + "step": 1916 + }, + { + "epoch": 2.51, + "learning_rate": 3.5246165148981214e-06, + "logits/chosen": -2.0972046852111816, + "logits/rejected": -2.029261827468872, + "logps/chosen": -242.43527221679688, + "logps/rejected": -287.77203369140625, + "loss": 0.0133, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.314417362213135, + "rewards/margins": 7.230639934539795, + "rewards/rejected": -11.54505729675293, + "step": 1917 + }, + { + "epoch": 2.51, + "learning_rate": 3.5062953076908268e-06, + "logits/chosen": -2.366607189178467, + "logits/rejected": -2.4461746215820312, + "logps/chosen": -224.57760620117188, + "logps/rejected": -321.8349914550781, + "loss": 0.0466, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.106081008911133, + "rewards/margins": 7.173934459686279, + "rewards/rejected": -11.280014991760254, + "step": 1918 + }, + { + "epoch": 2.51, + "learning_rate": 3.488018250396233e-06, + "logits/chosen": -1.9970202445983887, + "logits/rejected": -2.0337016582489014, + "logps/chosen": -193.3861083984375, + "logps/rejected": -325.51397705078125, + "loss": 0.0447, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.675048351287842, + "rewards/margins": 8.534703254699707, + "rewards/rejected": -12.20975112915039, + "step": 1919 + }, + { + "epoch": 2.51, + "learning_rate": 3.4697853805569696e-06, + "logits/chosen": -2.04309344291687, + "logits/rejected": -2.0823137760162354, + "logps/chosen": -222.3388671875, + "logps/rejected": -308.24945068359375, + "loss": 0.0926, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.982977867126465, + "rewards/margins": 6.360529899597168, + "rewards/rejected": -11.343506813049316, + "step": 1920 + }, + { + "epoch": 2.51, + "learning_rate": 3.4515967356249263e-06, + "logits/chosen": -2.175962209701538, + "logits/rejected": -2.1958234310150146, + "logps/chosen": -227.48289489746094, + "logps/rejected": -344.162353515625, + "loss": 0.0885, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.055968761444092, + "rewards/margins": 6.053489685058594, + "rewards/rejected": -10.109458923339844, + "step": 1921 + }, + { + "epoch": 2.52, + "learning_rate": 3.4334523529611416e-06, + "logits/chosen": -2.1930103302001953, + "logits/rejected": -2.190291404724121, + "logps/chosen": -212.89968872070312, + "logps/rejected": -310.3575439453125, + "loss": 0.0791, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.8515625, + "rewards/margins": 6.565982818603516, + "rewards/rejected": -11.4175443649292, + "step": 1922 + }, + { + "epoch": 2.52, + "learning_rate": 3.415352269835731e-06, + "logits/chosen": -2.1003472805023193, + "logits/rejected": -2.0964856147766113, + "logps/chosen": -201.11459350585938, + "logps/rejected": -289.0037841796875, + "loss": 0.0235, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.3380255699157715, + "rewards/margins": 7.608311176300049, + "rewards/rejected": -11.946335792541504, + "step": 1923 + }, + { + "epoch": 2.52, + "learning_rate": 3.3972965234278065e-06, + "logits/chosen": -2.1426749229431152, + "logits/rejected": -2.140812397003174, + "logps/chosen": -232.2127227783203, + "logps/rejected": -322.78814697265625, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.508618354797363, + "rewards/margins": 7.715066909790039, + "rewards/rejected": -12.223686218261719, + "step": 1924 + }, + { + "epoch": 2.52, + "learning_rate": 3.379285150825434e-06, + "logits/chosen": -2.1275901794433594, + "logits/rejected": -2.2409205436706543, + "logps/chosen": -250.38487243652344, + "logps/rejected": -335.5484924316406, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.848855972290039, + "rewards/margins": 8.123422622680664, + "rewards/rejected": -12.972278594970703, + "step": 1925 + }, + { + "epoch": 2.52, + "learning_rate": 3.3613181890255056e-06, + "logits/chosen": -2.218322277069092, + "logits/rejected": -2.279414653778076, + "logps/chosen": -217.46316528320312, + "logps/rejected": -303.641357421875, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.651366710662842, + "rewards/margins": 6.0683979988098145, + "rewards/rejected": -10.719764709472656, + "step": 1926 + }, + { + "epoch": 2.52, + "learning_rate": 3.343395674933711e-06, + "logits/chosen": -2.207975387573242, + "logits/rejected": -2.250619888305664, + "logps/chosen": -273.9842834472656, + "logps/rejected": -385.20855712890625, + "loss": 0.046, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.535407543182373, + "rewards/margins": 7.137181282043457, + "rewards/rejected": -11.672589302062988, + "step": 1927 + }, + { + "epoch": 2.52, + "learning_rate": 3.325517645364429e-06, + "logits/chosen": -2.0839622020721436, + "logits/rejected": -2.0372018814086914, + "logps/chosen": -218.240234375, + "logps/rejected": -282.0450134277344, + "loss": 0.0134, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.900965452194214, + "rewards/margins": 7.21649169921875, + "rewards/rejected": -11.117456436157227, + "step": 1928 + }, + { + "epoch": 2.52, + "learning_rate": 3.3076841370406674e-06, + "logits/chosen": -2.256783962249756, + "logits/rejected": -2.3672468662261963, + "logps/chosen": -199.17889404296875, + "logps/rejected": -248.9525146484375, + "loss": 0.066, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.5688915252685547, + "rewards/margins": 5.722840309143066, + "rewards/rejected": -9.291731834411621, + "step": 1929 + }, + { + "epoch": 2.53, + "learning_rate": 3.289895186593972e-06, + "logits/chosen": -2.164548873901367, + "logits/rejected": -2.243574619293213, + "logps/chosen": -221.25128173828125, + "logps/rejected": -292.24310302734375, + "loss": 0.0204, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.508374214172363, + "rewards/margins": 6.047191143035889, + "rewards/rejected": -10.555566787719727, + "step": 1930 + }, + { + "epoch": 2.53, + "learning_rate": 3.27215083056438e-06, + "logits/chosen": -2.067539691925049, + "logits/rejected": -2.1059985160827637, + "logps/chosen": -208.03504943847656, + "logps/rejected": -274.39300537109375, + "loss": 0.0836, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.032598972320557, + "rewards/margins": 6.347752571105957, + "rewards/rejected": -10.380351066589355, + "step": 1931 + }, + { + "epoch": 2.53, + "learning_rate": 3.2544511054003246e-06, + "logits/chosen": -2.2770469188690186, + "logits/rejected": -2.4603452682495117, + "logps/chosen": -252.45327758789062, + "logps/rejected": -338.2505187988281, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.37544059753418, + "rewards/margins": 6.901148796081543, + "rewards/rejected": -11.276590347290039, + "step": 1932 + }, + { + "epoch": 2.53, + "learning_rate": 3.2367960474585458e-06, + "logits/chosen": -2.323967456817627, + "logits/rejected": -2.369455099105835, + "logps/chosen": -197.98849487304688, + "logps/rejected": -272.9342346191406, + "loss": 0.0307, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.09578800201416, + "rewards/margins": 7.210512161254883, + "rewards/rejected": -11.30630111694336, + "step": 1933 + }, + { + "epoch": 2.53, + "learning_rate": 3.2191856930040646e-06, + "logits/chosen": -2.162247657775879, + "logits/rejected": -2.2363877296447754, + "logps/chosen": -223.33108520507812, + "logps/rejected": -308.9804382324219, + "loss": 0.0449, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.032862186431885, + "rewards/margins": 7.454715728759766, + "rewards/rejected": -11.487578392028809, + "step": 1934 + }, + { + "epoch": 2.53, + "learning_rate": 3.2016200782100436e-06, + "logits/chosen": -2.0275230407714844, + "logits/rejected": -2.0601468086242676, + "logps/chosen": -143.40357971191406, + "logps/rejected": -229.19822692871094, + "loss": 0.0771, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.8151357173919678, + "rewards/margins": 6.494118690490723, + "rewards/rejected": -9.309255599975586, + "step": 1935 + }, + { + "epoch": 2.53, + "learning_rate": 3.18409923915777e-06, + "logits/chosen": -1.8332370519638062, + "logits/rejected": -1.8421392440795898, + "logps/chosen": -244.0946502685547, + "logps/rejected": -273.56085205078125, + "loss": 0.0729, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1983587741851807, + "rewards/margins": 6.372551918029785, + "rewards/rejected": -8.570911407470703, + "step": 1936 + }, + { + "epoch": 2.54, + "learning_rate": 3.1666232118365474e-06, + "logits/chosen": -2.1268603801727295, + "logits/rejected": -2.1130568981170654, + "logps/chosen": -267.3658752441406, + "logps/rejected": -297.03607177734375, + "loss": 0.0796, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.861048936843872, + "rewards/margins": 6.060363292694092, + "rewards/rejected": -9.921411514282227, + "step": 1937 + }, + { + "epoch": 2.54, + "learning_rate": 3.1491920321436303e-06, + "logits/chosen": -2.1559815406799316, + "logits/rejected": -2.2063589096069336, + "logps/chosen": -203.36326599121094, + "logps/rejected": -293.6430969238281, + "loss": 0.0378, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.1123948097229, + "rewards/margins": 7.340630531311035, + "rewards/rejected": -11.453025817871094, + "step": 1938 + }, + { + "epoch": 2.54, + "learning_rate": 3.1318057358841745e-06, + "logits/chosen": -2.042067527770996, + "logits/rejected": -2.122093915939331, + "logps/chosen": -260.86175537109375, + "logps/rejected": -385.980224609375, + "loss": 0.0548, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.0958757400512695, + "rewards/margins": 8.177687644958496, + "rewards/rejected": -13.273563385009766, + "step": 1939 + }, + { + "epoch": 2.54, + "learning_rate": 3.114464358771102e-06, + "logits/chosen": -2.13277268409729, + "logits/rejected": -2.1633195877075195, + "logps/chosen": -218.29551696777344, + "logps/rejected": -292.3998718261719, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.922913074493408, + "rewards/margins": 7.652833938598633, + "rewards/rejected": -11.575746536254883, + "step": 1940 + }, + { + "epoch": 2.54, + "learning_rate": 3.097167936425094e-06, + "logits/chosen": -2.0178704261779785, + "logits/rejected": -2.0704665184020996, + "logps/chosen": -270.4171142578125, + "logps/rejected": -330.67401123046875, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.412740230560303, + "rewards/margins": 6.659230709075928, + "rewards/rejected": -11.071971893310547, + "step": 1941 + }, + { + "epoch": 2.54, + "learning_rate": 3.079916504374494e-06, + "logits/chosen": -2.149261474609375, + "logits/rejected": -2.1316537857055664, + "logps/chosen": -211.03335571289062, + "logps/rejected": -296.0220947265625, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.65706467628479, + "rewards/margins": 7.388676643371582, + "rewards/rejected": -11.04574203491211, + "step": 1942 + }, + { + "epoch": 2.54, + "learning_rate": 3.0627100980552133e-06, + "logits/chosen": -2.018838882446289, + "logits/rejected": -2.114900588989258, + "logps/chosen": -155.8693389892578, + "logps/rejected": -229.74928283691406, + "loss": 0.1927, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.544346332550049, + "rewards/margins": 5.97032356262207, + "rewards/rejected": -9.514669418334961, + "step": 1943 + }, + { + "epoch": 2.54, + "learning_rate": 3.045548752810687e-06, + "logits/chosen": -2.1688249111175537, + "logits/rejected": -2.299863338470459, + "logps/chosen": -274.62591552734375, + "logps/rejected": -355.90509033203125, + "loss": 0.0503, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.920033931732178, + "rewards/margins": 6.292994976043701, + "rewards/rejected": -11.213028907775879, + "step": 1944 + }, + { + "epoch": 2.55, + "learning_rate": 3.028432503891801e-06, + "logits/chosen": -2.1515274047851562, + "logits/rejected": -2.136507272720337, + "logps/chosen": -273.89544677734375, + "logps/rejected": -335.6448974609375, + "loss": 0.0975, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.763400077819824, + "rewards/margins": 6.956304550170898, + "rewards/rejected": -11.719703674316406, + "step": 1945 + }, + { + "epoch": 2.55, + "learning_rate": 3.01136138645679e-06, + "logits/chosen": -2.158633232116699, + "logits/rejected": -2.1321115493774414, + "logps/chosen": -276.24169921875, + "logps/rejected": -337.2641296386719, + "loss": 0.0493, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.528733730316162, + "rewards/margins": 6.0531907081604, + "rewards/rejected": -11.581924438476562, + "step": 1946 + }, + { + "epoch": 2.55, + "learning_rate": 2.9943354355711884e-06, + "logits/chosen": -2.28364896774292, + "logits/rejected": -2.2799689769744873, + "logps/chosen": -258.55963134765625, + "logps/rejected": -316.2432556152344, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.101343154907227, + "rewards/margins": 7.162715911865234, + "rewards/rejected": -11.264059066772461, + "step": 1947 + }, + { + "epoch": 2.55, + "learning_rate": 2.9773546862077617e-06, + "logits/chosen": -2.0789194107055664, + "logits/rejected": -2.054330587387085, + "logps/chosen": -165.2958984375, + "logps/rejected": -287.51580810546875, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4139468669891357, + "rewards/margins": 8.76666259765625, + "rewards/rejected": -12.180608749389648, + "step": 1948 + }, + { + "epoch": 2.55, + "learning_rate": 2.960419173246437e-06, + "logits/chosen": -2.1714565753936768, + "logits/rejected": -2.192408323287964, + "logps/chosen": -206.03785705566406, + "logps/rejected": -293.42401123046875, + "loss": 0.0092, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.23584508895874, + "rewards/margins": 6.573352336883545, + "rewards/rejected": -10.809196472167969, + "step": 1949 + }, + { + "epoch": 2.55, + "learning_rate": 2.9435289314742015e-06, + "logits/chosen": -2.1490747928619385, + "logits/rejected": -2.260983467102051, + "logps/chosen": -229.38429260253906, + "logps/rejected": -335.409423828125, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.311609745025635, + "rewards/margins": 7.348052978515625, + "rewards/rejected": -11.659663200378418, + "step": 1950 + }, + { + "epoch": 2.55, + "learning_rate": 2.926683995585053e-06, + "logits/chosen": -2.0619702339172363, + "logits/rejected": -2.111379623413086, + "logps/chosen": -233.77268981933594, + "logps/rejected": -308.01812744140625, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7177836894989014, + "rewards/margins": 7.628304958343506, + "rewards/rejected": -11.346088409423828, + "step": 1951 + }, + { + "epoch": 2.55, + "learning_rate": 2.9098844001799407e-06, + "logits/chosen": -2.2158334255218506, + "logits/rejected": -2.204169273376465, + "logps/chosen": -256.19866943359375, + "logps/rejected": -286.0683898925781, + "loss": 0.0376, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.717142105102539, + "rewards/margins": 6.895275115966797, + "rewards/rejected": -10.612417221069336, + "step": 1952 + }, + { + "epoch": 2.56, + "learning_rate": 2.8931301797666844e-06, + "logits/chosen": -2.1415328979492188, + "logits/rejected": -2.119919776916504, + "logps/chosen": -196.79298400878906, + "logps/rejected": -258.55987548828125, + "loss": 0.0121, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8779635429382324, + "rewards/margins": 6.7579193115234375, + "rewards/rejected": -10.635883331298828, + "step": 1953 + }, + { + "epoch": 2.56, + "learning_rate": 2.8764213687598713e-06, + "logits/chosen": -1.9052060842514038, + "logits/rejected": -2.028146266937256, + "logps/chosen": -233.23736572265625, + "logps/rejected": -352.845703125, + "loss": 0.0257, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.083035469055176, + "rewards/margins": 7.8727827072143555, + "rewards/rejected": -12.955819129943848, + "step": 1954 + }, + { + "epoch": 2.56, + "learning_rate": 2.85975800148085e-06, + "logits/chosen": -2.160583734512329, + "logits/rejected": -2.103926181793213, + "logps/chosen": -221.9737548828125, + "logps/rejected": -239.5202178955078, + "loss": 0.0327, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.72629976272583, + "rewards/margins": 5.202835559844971, + "rewards/rejected": -9.9291353225708, + "step": 1955 + }, + { + "epoch": 2.56, + "learning_rate": 2.843140112157594e-06, + "logits/chosen": -2.1782562732696533, + "logits/rejected": -2.2178544998168945, + "logps/chosen": -200.49549865722656, + "logps/rejected": -256.28692626953125, + "loss": 0.0176, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.254265785217285, + "rewards/margins": 6.321405410766602, + "rewards/rejected": -9.575671195983887, + "step": 1956 + }, + { + "epoch": 2.56, + "learning_rate": 2.8265677349246735e-06, + "logits/chosen": -2.0274906158447266, + "logits/rejected": -2.0620315074920654, + "logps/chosen": -257.68817138671875, + "logps/rejected": -353.147216796875, + "loss": 0.0448, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.044673919677734, + "rewards/margins": 7.667537212371826, + "rewards/rejected": -11.712210655212402, + "step": 1957 + }, + { + "epoch": 2.56, + "learning_rate": 2.8100409038231746e-06, + "logits/chosen": -2.1786317825317383, + "logits/rejected": -2.052286386489868, + "logps/chosen": -233.5760040283203, + "logps/rejected": -284.4718933105469, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.581871271133423, + "rewards/margins": 7.291482925415039, + "rewards/rejected": -10.8733549118042, + "step": 1958 + }, + { + "epoch": 2.56, + "learning_rate": 2.793559652800631e-06, + "logits/chosen": -2.05732798576355, + "logits/rejected": -2.136399507522583, + "logps/chosen": -217.72947692871094, + "logps/rejected": -300.5487976074219, + "loss": 0.0739, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.1660027503967285, + "rewards/margins": 7.415482997894287, + "rewards/rejected": -12.581485748291016, + "step": 1959 + }, + { + "epoch": 2.57, + "learning_rate": 2.7771240157109355e-06, + "logits/chosen": -1.9800173044204712, + "logits/rejected": -2.0252513885498047, + "logps/chosen": -211.98147583007812, + "logps/rejected": -304.8021545410156, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4743568897247314, + "rewards/margins": 7.9838104248046875, + "rewards/rejected": -11.45816707611084, + "step": 1960 + }, + { + "epoch": 2.57, + "learning_rate": 2.7607340263143073e-06, + "logits/chosen": -2.134765148162842, + "logits/rejected": -2.1380696296691895, + "logps/chosen": -211.37094116210938, + "logps/rejected": -270.9632263183594, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.567144870758057, + "rewards/margins": 6.111141204833984, + "rewards/rejected": -10.6782865524292, + "step": 1961 + }, + { + "epoch": 2.57, + "learning_rate": 2.7443897182771794e-06, + "logits/chosen": -2.178759813308716, + "logits/rejected": -2.3176801204681396, + "logps/chosen": -207.14418029785156, + "logps/rejected": -316.44708251953125, + "loss": 0.0548, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.8760452270507812, + "rewards/margins": 6.602855682373047, + "rewards/rejected": -10.478901863098145, + "step": 1962 + }, + { + "epoch": 2.57, + "learning_rate": 2.7280911251721748e-06, + "logits/chosen": -2.363405466079712, + "logits/rejected": -2.316239356994629, + "logps/chosen": -206.2725830078125, + "logps/rejected": -253.55917358398438, + "loss": 0.0503, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.9529001712799072, + "rewards/margins": 7.212587833404541, + "rewards/rejected": -10.165487289428711, + "step": 1963 + }, + { + "epoch": 2.57, + "learning_rate": 2.711838280477988e-06, + "logits/chosen": -2.192681312561035, + "logits/rejected": -2.1529693603515625, + "logps/chosen": -201.1187286376953, + "logps/rejected": -238.6221466064453, + "loss": 0.1008, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.321526527404785, + "rewards/margins": 5.079824447631836, + "rewards/rejected": -10.401350975036621, + "step": 1964 + }, + { + "epoch": 2.57, + "learning_rate": 2.6956312175793613e-06, + "logits/chosen": -2.146167039871216, + "logits/rejected": -2.2076475620269775, + "logps/chosen": -233.29090881347656, + "logps/rejected": -312.9189758300781, + "loss": 0.0511, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.2652974128723145, + "rewards/margins": 6.636852264404297, + "rewards/rejected": -10.90215015411377, + "step": 1965 + }, + { + "epoch": 2.57, + "learning_rate": 2.679469969767001e-06, + "logits/chosen": -2.233349084854126, + "logits/rejected": -2.220684766769409, + "logps/chosen": -243.32542419433594, + "logps/rejected": -299.36517333984375, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.597517728805542, + "rewards/margins": 6.295795440673828, + "rewards/rejected": -9.89331340789795, + "step": 1966 + }, + { + "epoch": 2.57, + "learning_rate": 2.663354570237481e-06, + "logits/chosen": -2.2344844341278076, + "logits/rejected": -2.227917432785034, + "logps/chosen": -227.46246337890625, + "logps/rejected": -299.5288391113281, + "loss": 0.0502, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.381045341491699, + "rewards/margins": 6.389766693115234, + "rewards/rejected": -10.770811080932617, + "step": 1967 + }, + { + "epoch": 2.58, + "learning_rate": 2.647285052093218e-06, + "logits/chosen": -2.145289421081543, + "logits/rejected": -2.1941628456115723, + "logps/chosen": -264.63287353515625, + "logps/rejected": -291.396728515625, + "loss": 0.0107, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.752795696258545, + "rewards/margins": 6.148702621459961, + "rewards/rejected": -10.901496887207031, + "step": 1968 + }, + { + "epoch": 2.58, + "learning_rate": 2.631261448342387e-06, + "logits/chosen": -2.1281938552856445, + "logits/rejected": -2.2055163383483887, + "logps/chosen": -193.5919189453125, + "logps/rejected": -284.9674987792969, + "loss": 0.1184, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.3006486892700195, + "rewards/margins": 7.561243057250977, + "rewards/rejected": -11.86189079284668, + "step": 1969 + }, + { + "epoch": 2.58, + "learning_rate": 2.615283791898837e-06, + "logits/chosen": -2.080514907836914, + "logits/rejected": -2.1863436698913574, + "logps/chosen": -202.62644958496094, + "logps/rejected": -291.22418212890625, + "loss": 0.0468, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.090460777282715, + "rewards/margins": 6.816705703735352, + "rewards/rejected": -10.907167434692383, + "step": 1970 + }, + { + "epoch": 2.58, + "learning_rate": 2.599352115582046e-06, + "logits/chosen": -1.8590359687805176, + "logits/rejected": -1.9023526906967163, + "logps/chosen": -190.2621612548828, + "logps/rejected": -247.3043670654297, + "loss": 0.0154, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8165645599365234, + "rewards/margins": 7.015137672424316, + "rewards/rejected": -10.83170223236084, + "step": 1971 + }, + { + "epoch": 2.58, + "learning_rate": 2.5834664521170504e-06, + "logits/chosen": -2.2179200649261475, + "logits/rejected": -2.2618539333343506, + "logps/chosen": -245.14952087402344, + "logps/rejected": -327.466064453125, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.565230846405029, + "rewards/margins": 7.709439277648926, + "rewards/rejected": -12.274670600891113, + "step": 1972 + }, + { + "epoch": 2.58, + "learning_rate": 2.5676268341343622e-06, + "logits/chosen": -2.2276315689086914, + "logits/rejected": -2.2667124271392822, + "logps/chosen": -227.38421630859375, + "logps/rejected": -318.4751892089844, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.067168712615967, + "rewards/margins": 6.87885856628418, + "rewards/rejected": -10.946027755737305, + "step": 1973 + }, + { + "epoch": 2.58, + "learning_rate": 2.5518332941699056e-06, + "logits/chosen": -2.111288547515869, + "logits/rejected": -2.1012141704559326, + "logps/chosen": -224.7599334716797, + "logps/rejected": -277.36724853515625, + "loss": 0.0935, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.625319957733154, + "rewards/margins": 4.988708972930908, + "rewards/rejected": -9.614028930664062, + "step": 1974 + }, + { + "epoch": 2.59, + "learning_rate": 2.5360858646649722e-06, + "logits/chosen": -2.1115424633026123, + "logits/rejected": -2.1978673934936523, + "logps/chosen": -214.7374725341797, + "logps/rejected": -298.4267883300781, + "loss": 0.0888, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.30065393447876, + "rewards/margins": 6.984291076660156, + "rewards/rejected": -11.284944534301758, + "step": 1975 + }, + { + "epoch": 2.59, + "learning_rate": 2.520384577966142e-06, + "logits/chosen": -2.13796329498291, + "logits/rejected": -2.2352454662323, + "logps/chosen": -214.27926635742188, + "logps/rejected": -310.12548828125, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.689536094665527, + "rewards/margins": 7.723046779632568, + "rewards/rejected": -12.412583351135254, + "step": 1976 + }, + { + "epoch": 2.59, + "learning_rate": 2.5047294663251953e-06, + "logits/chosen": -1.8943133354187012, + "logits/rejected": -1.8863542079925537, + "logps/chosen": -202.632568359375, + "logps/rejected": -283.7858581542969, + "loss": 0.0277, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.304527282714844, + "rewards/margins": 6.5835723876953125, + "rewards/rejected": -10.888099670410156, + "step": 1977 + }, + { + "epoch": 2.59, + "learning_rate": 2.4891205618990666e-06, + "logits/chosen": -2.2262890338897705, + "logits/rejected": -2.2817721366882324, + "logps/chosen": -205.2901611328125, + "logps/rejected": -313.1614074707031, + "loss": 0.0244, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7924742698669434, + "rewards/margins": 7.6968865394592285, + "rewards/rejected": -11.489360809326172, + "step": 1978 + }, + { + "epoch": 2.59, + "learning_rate": 2.4735578967497953e-06, + "logits/chosen": -2.2580699920654297, + "logits/rejected": -2.271095037460327, + "logps/chosen": -228.66651916503906, + "logps/rejected": -313.8482360839844, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5053930282592773, + "rewards/margins": 6.693525791168213, + "rewards/rejected": -10.198918342590332, + "step": 1979 + }, + { + "epoch": 2.59, + "learning_rate": 2.4580415028444326e-06, + "logits/chosen": -2.1963253021240234, + "logits/rejected": -2.225990056991577, + "logps/chosen": -220.40390014648438, + "logps/rejected": -302.4560546875, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.035942077636719, + "rewards/margins": 7.159281253814697, + "rewards/rejected": -11.195222854614258, + "step": 1980 + }, + { + "epoch": 2.59, + "learning_rate": 2.4425714120549726e-06, + "logits/chosen": -2.1135385036468506, + "logits/rejected": -2.1836442947387695, + "logps/chosen": -229.04840087890625, + "logps/rejected": -297.4963684082031, + "loss": 0.0536, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.9167304039001465, + "rewards/margins": 6.263636589050293, + "rewards/rejected": -10.180366516113281, + "step": 1981 + }, + { + "epoch": 2.59, + "learning_rate": 2.42714765615831e-06, + "logits/chosen": -2.150205612182617, + "logits/rejected": -2.124638795852661, + "logps/chosen": -191.8637237548828, + "logps/rejected": -334.6326599121094, + "loss": 0.1384, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.8502745628356934, + "rewards/margins": 6.817922115325928, + "rewards/rejected": -10.668197631835938, + "step": 1982 + }, + { + "epoch": 2.6, + "learning_rate": 2.4117702668361777e-06, + "logits/chosen": -2.2239620685577393, + "logits/rejected": -2.2690749168395996, + "logps/chosen": -189.5276336669922, + "logps/rejected": -279.3620300292969, + "loss": 0.0576, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.5610768795013428, + "rewards/margins": 7.047041416168213, + "rewards/rejected": -10.60811710357666, + "step": 1983 + }, + { + "epoch": 2.6, + "learning_rate": 2.3964392756750276e-06, + "logits/chosen": -2.1364612579345703, + "logits/rejected": -2.2472097873687744, + "logps/chosen": -168.07904052734375, + "logps/rejected": -220.91741943359375, + "loss": 0.1491, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.8274848461151123, + "rewards/margins": 4.70515775680542, + "rewards/rejected": -8.532642364501953, + "step": 1984 + }, + { + "epoch": 2.6, + "learning_rate": 2.381154714166045e-06, + "logits/chosen": -2.2876319885253906, + "logits/rejected": -2.2506895065307617, + "logps/chosen": -254.5384521484375, + "logps/rejected": -337.84161376953125, + "loss": 0.02, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.346930027008057, + "rewards/margins": 7.682336330413818, + "rewards/rejected": -12.029267311096191, + "step": 1985 + }, + { + "epoch": 2.6, + "learning_rate": 2.3659166137050297e-06, + "logits/chosen": -2.1608285903930664, + "logits/rejected": -2.2598462104797363, + "logps/chosen": -225.83119201660156, + "logps/rejected": -306.9623107910156, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6818244457244873, + "rewards/margins": 7.314521312713623, + "rewards/rejected": -10.996345520019531, + "step": 1986 + }, + { + "epoch": 2.6, + "learning_rate": 2.3507250055923384e-06, + "logits/chosen": -2.114039897918701, + "logits/rejected": -2.137077808380127, + "logps/chosen": -188.9156951904297, + "logps/rejected": -261.7002868652344, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7715907096862793, + "rewards/margins": 7.349464416503906, + "rewards/rejected": -11.121053695678711, + "step": 1987 + }, + { + "epoch": 2.6, + "learning_rate": 2.335579921032849e-06, + "logits/chosen": -2.098381280899048, + "logits/rejected": -2.0801186561584473, + "logps/chosen": -227.4667510986328, + "logps/rejected": -282.9793395996094, + "loss": 0.0462, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.089412689208984, + "rewards/margins": 7.122097015380859, + "rewards/rejected": -11.211509704589844, + "step": 1988 + }, + { + "epoch": 2.6, + "learning_rate": 2.3204813911358535e-06, + "logits/chosen": -2.156756639480591, + "logits/rejected": -2.2026946544647217, + "logps/chosen": -214.17596435546875, + "logps/rejected": -287.5680847167969, + "loss": 0.0299, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8633828163146973, + "rewards/margins": 7.400090217590332, + "rewards/rejected": -11.263472557067871, + "step": 1989 + }, + { + "epoch": 2.6, + "learning_rate": 2.305429446915036e-06, + "logits/chosen": -2.052035093307495, + "logits/rejected": -2.110588312149048, + "logps/chosen": -203.76332092285156, + "logps/rejected": -261.7690734863281, + "loss": 0.0984, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.9598305225372314, + "rewards/margins": 5.059589385986328, + "rewards/rejected": -9.019420623779297, + "step": 1990 + }, + { + "epoch": 2.61, + "learning_rate": 2.2904241192883703e-06, + "logits/chosen": -2.180089235305786, + "logits/rejected": -2.179062604904175, + "logps/chosen": -231.613037109375, + "logps/rejected": -335.9823303222656, + "loss": 0.0177, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7551474571228027, + "rewards/margins": 9.129593849182129, + "rewards/rejected": -12.88474178314209, + "step": 1991 + }, + { + "epoch": 2.61, + "learning_rate": 2.2754654390780924e-06, + "logits/chosen": -2.2093710899353027, + "logits/rejected": -2.2714219093322754, + "logps/chosen": -187.42982482910156, + "logps/rejected": -282.70550537109375, + "loss": 0.0165, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.000192642211914, + "rewards/margins": 6.893460273742676, + "rewards/rejected": -10.89365291595459, + "step": 1992 + }, + { + "epoch": 2.61, + "learning_rate": 2.260553437010621e-06, + "logits/chosen": -2.4081714153289795, + "logits/rejected": -2.382913827896118, + "logps/chosen": -185.78472900390625, + "logps/rejected": -273.45294189453125, + "loss": 0.0513, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.414534330368042, + "rewards/margins": 6.871214389801025, + "rewards/rejected": -10.285750389099121, + "step": 1993 + }, + { + "epoch": 2.61, + "learning_rate": 2.245688143716476e-06, + "logits/chosen": -2.0556302070617676, + "logits/rejected": -2.0775372982025146, + "logps/chosen": -199.91546630859375, + "logps/rejected": -244.66751098632812, + "loss": 0.0585, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.456460952758789, + "rewards/margins": 5.888851642608643, + "rewards/rejected": -10.345312118530273, + "step": 1994 + }, + { + "epoch": 2.61, + "learning_rate": 2.2308695897302472e-06, + "logits/chosen": -2.170102834701538, + "logits/rejected": -2.0720081329345703, + "logps/chosen": -187.82437133789062, + "logps/rejected": -254.10223388671875, + "loss": 0.0468, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.3889241218566895, + "rewards/margins": 7.232866287231445, + "rewards/rejected": -10.621789932250977, + "step": 1995 + }, + { + "epoch": 2.61, + "learning_rate": 2.216097805490516e-06, + "logits/chosen": -1.9916555881500244, + "logits/rejected": -2.0511884689331055, + "logps/chosen": -189.18374633789062, + "logps/rejected": -304.3656311035156, + "loss": 0.0232, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.122269630432129, + "rewards/margins": 6.872952938079834, + "rewards/rejected": -10.995223045349121, + "step": 1996 + }, + { + "epoch": 2.61, + "learning_rate": 2.2013728213398006e-06, + "logits/chosen": -2.206115961074829, + "logits/rejected": -2.2186381816864014, + "logps/chosen": -199.43597412109375, + "logps/rejected": -281.40008544921875, + "loss": 0.0099, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.699346542358398, + "rewards/margins": 6.290205955505371, + "rewards/rejected": -10.98955249786377, + "step": 1997 + }, + { + "epoch": 2.62, + "learning_rate": 2.1866946675244692e-06, + "logits/chosen": -1.9953958988189697, + "logits/rejected": -2.0668516159057617, + "logps/chosen": -191.92825317382812, + "logps/rejected": -315.6248779296875, + "loss": 0.0211, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.754812240600586, + "rewards/margins": 9.015074729919434, + "rewards/rejected": -13.76988697052002, + "step": 1998 + }, + { + "epoch": 2.62, + "learning_rate": 2.1720633741947187e-06, + "logits/chosen": -2.1922409534454346, + "logits/rejected": -2.2489383220672607, + "logps/chosen": -236.48086547851562, + "logps/rejected": -337.6635437011719, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.471558570861816, + "rewards/margins": 7.329318046569824, + "rewards/rejected": -11.800875663757324, + "step": 1999 + }, + { + "epoch": 2.62, + "learning_rate": 2.157478971404478e-06, + "logits/chosen": -1.9427260160446167, + "logits/rejected": -2.019679307937622, + "logps/chosen": -208.48928833007812, + "logps/rejected": -307.0675964355469, + "loss": 0.0887, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.904735565185547, + "rewards/margins": 6.429941177368164, + "rewards/rejected": -11.334676742553711, + "step": 2000 + } + ], + "logging_steps": 1, + "max_steps": 2292, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}