diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4566 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.998451213216314, + "eval_steps": 100, + "global_step": 2904, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 1.7182130584192438e-09, + "logits/chosen": -2.41489839553833, + "logits/rejected": -2.313730239868164, + "logps/chosen": -426.6319580078125, + "logps/rejected": -209.72433471679688, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.01, + "learning_rate": 1.718213058419244e-08, + "logits/chosen": -2.736762762069702, + "logits/rejected": -2.701097249984741, + "logps/chosen": -275.3837890625, + "logps/rejected": -267.7837219238281, + "loss": 0.696, + "rewards/accuracies": 0.4236111044883728, + "rewards/chosen": -0.0019075096352025867, + "rewards/margins": -0.0022293850779533386, + "rewards/rejected": 0.00032187564647756517, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 3.436426116838488e-08, + "logits/chosen": -2.6771740913391113, + "logits/rejected": -2.614168643951416, + "logps/chosen": -282.4807434082031, + "logps/rejected": -230.01553344726562, + "loss": 0.6902, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": 0.00013565353583544493, + "rewards/margins": 0.002631585579365492, + "rewards/rejected": -0.0024959323927760124, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 5.154639175257731e-08, + "logits/chosen": -2.670668363571167, + "logits/rejected": -2.628671169281006, + "logps/chosen": -254.06240844726562, + "logps/rejected": -235.0716094970703, + "loss": 0.6828, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.0025966805405914783, + "rewards/margins": 0.00959115568548441, + "rewards/rejected": -0.006994475610554218, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 6.872852233676976e-08, + "logits/chosen": -2.7003836631774902, + "logits/rejected": -2.615793228149414, + "logps/chosen": -238.7230682373047, + "logps/rejected": -180.0827178955078, + "loss": 0.6629, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.025214945897459984, + "rewards/margins": 0.05416213348507881, + "rewards/rejected": -0.02894718386232853, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 8.59106529209622e-08, + "logits/chosen": -2.6934826374053955, + "logits/rejected": -2.6144022941589355, + "logps/chosen": -251.5126495361328, + "logps/rejected": -197.26048278808594, + "loss": 0.6399, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.046762533485889435, + "rewards/margins": 0.12699802219867706, + "rewards/rejected": -0.08023548126220703, + "step": 50 + }, + { + "epoch": 0.06, + "learning_rate": 1.0309278350515462e-07, + "logits/chosen": -2.692603349685669, + "logits/rejected": -2.6785025596618652, + "logps/chosen": -257.05914306640625, + "logps/rejected": -247.1437530517578, + "loss": 0.6155, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.04687836393713951, + "rewards/margins": 0.17602364718914032, + "rewards/rejected": -0.1291452944278717, + "step": 60 + }, + { + "epoch": 0.07, + "learning_rate": 1.202749140893471e-07, + "logits/chosen": -2.737905979156494, + "logits/rejected": -2.623173952102661, + "logps/chosen": -316.3946228027344, + "logps/rejected": -238.3878936767578, + "loss": 0.5916, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": 0.10290606319904327, + "rewards/margins": 0.3630576729774475, + "rewards/rejected": -0.26015162467956543, + "step": 70 + }, + { + "epoch": 0.08, + "learning_rate": 1.3745704467353952e-07, + "logits/chosen": -2.6619038581848145, + "logits/rejected": -2.6211159229278564, + "logps/chosen": -279.396728515625, + "logps/rejected": -232.0877685546875, + "loss": 0.5671, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.05718129128217697, + "rewards/margins": 0.43765443563461304, + "rewards/rejected": -0.38047313690185547, + "step": 80 + }, + { + "epoch": 0.09, + "learning_rate": 1.5463917525773197e-07, + "logits/chosen": -2.6507420539855957, + "logits/rejected": -2.575387954711914, + "logps/chosen": -272.7942199707031, + "logps/rejected": -227.7272491455078, + "loss": 0.5676, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.008855322375893593, + "rewards/margins": 0.3766869902610779, + "rewards/rejected": -0.3855423331260681, + "step": 90 + }, + { + "epoch": 0.1, + "learning_rate": 1.718213058419244e-07, + "logits/chosen": -2.6757609844207764, + "logits/rejected": -2.5984952449798584, + "logps/chosen": -278.7255554199219, + "logps/rejected": -253.88668823242188, + "loss": 0.575, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": 0.0012895159889012575, + "rewards/margins": 0.500220775604248, + "rewards/rejected": -0.49893131852149963, + "step": 100 + }, + { + "epoch": 0.1, + "eval_logits/chosen": -2.745777130126953, + "eval_logits/rejected": -2.6524808406829834, + "eval_logps/chosen": -273.94866943359375, + "eval_logps/rejected": -235.7017822265625, + "eval_loss": 0.5309013724327087, + "eval_rewards/accuracies": 0.7459999918937683, + "eval_rewards/chosen": -0.01014842838048935, + "eval_rewards/margins": 0.593262791633606, + "eval_rewards/rejected": -0.6034111976623535, + "eval_runtime": 330.0041, + "eval_samples_per_second": 6.061, + "eval_steps_per_second": 0.379, + "step": 100 + }, + { + "epoch": 0.11, + "learning_rate": 1.8900343642611682e-07, + "logits/chosen": -2.731081485748291, + "logits/rejected": -2.6355175971984863, + "logps/chosen": -276.24908447265625, + "logps/rejected": -247.2675018310547, + "loss": 0.5464, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.021220406517386436, + "rewards/margins": 0.625217854976654, + "rewards/rejected": -0.6464383006095886, + "step": 110 + }, + { + "epoch": 0.12, + "learning_rate": 2.0618556701030925e-07, + "logits/chosen": -2.715306520462036, + "logits/rejected": -2.6525187492370605, + "logps/chosen": -274.8995361328125, + "logps/rejected": -230.00900268554688, + "loss": 0.5043, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.022734543308615685, + "rewards/margins": 0.7808176279067993, + "rewards/rejected": -0.8035521507263184, + "step": 120 + }, + { + "epoch": 0.13, + "learning_rate": 2.2336769759450173e-07, + "logits/chosen": -2.720980405807495, + "logits/rejected": -2.6762051582336426, + "logps/chosen": -258.5883483886719, + "logps/rejected": -226.6556396484375, + "loss": 0.5365, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.060701239854097366, + "rewards/margins": 0.8721252679824829, + "rewards/rejected": -0.9328263998031616, + "step": 130 + }, + { + "epoch": 0.14, + "learning_rate": 2.405498281786942e-07, + "logits/chosen": -2.7193636894226074, + "logits/rejected": -2.6467044353485107, + "logps/chosen": -283.06817626953125, + "logps/rejected": -243.6209716796875, + "loss": 0.5378, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.13080577552318573, + "rewards/margins": 0.6716828942298889, + "rewards/rejected": -0.8024886250495911, + "step": 140 + }, + { + "epoch": 0.15, + "learning_rate": 2.5773195876288655e-07, + "logits/chosen": -2.7493207454681396, + "logits/rejected": -2.6163530349731445, + "logps/chosen": -282.48358154296875, + "logps/rejected": -222.87710571289062, + "loss": 0.522, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.12233565002679825, + "rewards/margins": 0.6884902119636536, + "rewards/rejected": -0.8108257055282593, + "step": 150 + }, + { + "epoch": 0.17, + "learning_rate": 2.7491408934707903e-07, + "logits/chosen": -2.7176194190979004, + "logits/rejected": -2.6076390743255615, + "logps/chosen": -283.25506591796875, + "logps/rejected": -229.7573699951172, + "loss": 0.5103, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.0403318852186203, + "rewards/margins": 0.8281108140945435, + "rewards/rejected": -0.868442714214325, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 2.9209621993127146e-07, + "logits/chosen": -2.715559482574463, + "logits/rejected": -2.630178689956665, + "logps/chosen": -270.50274658203125, + "logps/rejected": -247.9048614501953, + "loss": 0.5238, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.018013525754213333, + "rewards/margins": 0.9226928949356079, + "rewards/rejected": -0.9407063722610474, + "step": 170 + }, + { + "epoch": 0.19, + "learning_rate": 3.0927835051546394e-07, + "logits/chosen": -2.668119430541992, + "logits/rejected": -2.601747989654541, + "logps/chosen": -248.54458618164062, + "logps/rejected": -225.4590606689453, + "loss": 0.4972, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2065100222826004, + "rewards/margins": 0.7541019320487976, + "rewards/rejected": -0.9606119394302368, + "step": 180 + }, + { + "epoch": 0.2, + "learning_rate": 3.2646048109965636e-07, + "logits/chosen": -2.6944541931152344, + "logits/rejected": -2.607551097869873, + "logps/chosen": -268.26678466796875, + "logps/rejected": -233.5041961669922, + "loss": 0.498, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1957332193851471, + "rewards/margins": 0.9389954805374146, + "rewards/rejected": -1.1347286701202393, + "step": 190 + }, + { + "epoch": 0.21, + "learning_rate": 3.436426116838488e-07, + "logits/chosen": -2.779195785522461, + "logits/rejected": -2.6563549041748047, + "logps/chosen": -318.3896484375, + "logps/rejected": -265.0843505859375, + "loss": 0.4759, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.09323607385158539, + "rewards/margins": 1.1875323057174683, + "rewards/rejected": -1.2807685136795044, + "step": 200 + }, + { + "epoch": 0.21, + "eval_logits/chosen": -2.800610303878784, + "eval_logits/rejected": -2.7065932750701904, + "eval_logps/chosen": -274.48919677734375, + "eval_logps/rejected": -240.49658203125, + "eval_loss": 0.49432528018951416, + "eval_rewards/accuracies": 0.75, + "eval_rewards/chosen": -0.06420008093118668, + "eval_rewards/margins": 1.0186885595321655, + "eval_rewards/rejected": -1.0828887224197388, + "eval_runtime": 330.0896, + "eval_samples_per_second": 6.059, + "eval_steps_per_second": 0.379, + "step": 200 + }, + { + "epoch": 0.22, + "learning_rate": 3.608247422680412e-07, + "logits/chosen": -2.7947614192962646, + "logits/rejected": -2.7344536781311035, + "logps/chosen": -279.47857666015625, + "logps/rejected": -244.9560546875, + "loss": 0.5212, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.10790084302425385, + "rewards/margins": 0.8937327265739441, + "rewards/rejected": -1.0016335248947144, + "step": 210 + }, + { + "epoch": 0.23, + "learning_rate": 3.7800687285223364e-07, + "logits/chosen": -2.730440855026245, + "logits/rejected": -2.670431613922119, + "logps/chosen": -239.8012237548828, + "logps/rejected": -226.3561248779297, + "loss": 0.4978, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.10349278151988983, + "rewards/margins": 1.0418349504470825, + "rewards/rejected": -1.1453276872634888, + "step": 220 + }, + { + "epoch": 0.24, + "learning_rate": 3.9518900343642607e-07, + "logits/chosen": -2.798637866973877, + "logits/rejected": -2.733868360519409, + "logps/chosen": -326.19427490234375, + "logps/rejected": -257.0804138183594, + "loss": 0.5144, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.14514626562595367, + "rewards/margins": 1.034011960029602, + "rewards/rejected": -1.1791582107543945, + "step": 230 + }, + { + "epoch": 0.25, + "learning_rate": 4.123711340206185e-07, + "logits/chosen": -2.7531001567840576, + "logits/rejected": -2.6891307830810547, + "logps/chosen": -270.3514404296875, + "logps/rejected": -258.6917724609375, + "loss": 0.4934, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -0.23920175433158875, + "rewards/margins": 1.1313306093215942, + "rewards/rejected": -1.3705322742462158, + "step": 240 + }, + { + "epoch": 0.26, + "learning_rate": 4.2955326460481097e-07, + "logits/chosen": -2.6519603729248047, + "logits/rejected": -2.569204807281494, + "logps/chosen": -292.78662109375, + "logps/rejected": -245.4613800048828, + "loss": 0.5315, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.37750571966171265, + "rewards/margins": 0.8689467310905457, + "rewards/rejected": -1.2464525699615479, + "step": 250 + }, + { + "epoch": 0.27, + "learning_rate": 4.4673539518900345e-07, + "logits/chosen": -2.683892011642456, + "logits/rejected": -2.599489450454712, + "logps/chosen": -291.04229736328125, + "logps/rejected": -281.9589538574219, + "loss": 0.4881, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.4090940058231354, + "rewards/margins": 1.0680254697799683, + "rewards/rejected": -1.4771194458007812, + "step": 260 + }, + { + "epoch": 0.28, + "learning_rate": 4.639175257731959e-07, + "logits/chosen": -2.6438565254211426, + "logits/rejected": -2.536273241043091, + "logps/chosen": -297.5185241699219, + "logps/rejected": -231.0972137451172, + "loss": 0.4995, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.3678213357925415, + "rewards/margins": 0.9588603973388672, + "rewards/rejected": -1.3266817331314087, + "step": 270 + }, + { + "epoch": 0.29, + "learning_rate": 4.810996563573884e-07, + "logits/chosen": -2.6681864261627197, + "logits/rejected": -2.6065673828125, + "logps/chosen": -239.76901245117188, + "logps/rejected": -237.3007354736328, + "loss": 0.5027, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.25205284357070923, + "rewards/margins": 1.061631679534912, + "rewards/rejected": -1.3136845827102661, + "step": 280 + }, + { + "epoch": 0.3, + "learning_rate": 4.982817869415807e-07, + "logits/chosen": -2.6735098361968994, + "logits/rejected": -2.5926923751831055, + "logps/chosen": -266.5799255371094, + "logps/rejected": -243.5729217529297, + "loss": 0.4921, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.22308111190795898, + "rewards/margins": 1.2045137882232666, + "rewards/rejected": -1.4275949001312256, + "step": 290 + }, + { + "epoch": 0.31, + "learning_rate": 4.982778415614236e-07, + "logits/chosen": -2.7693233489990234, + "logits/rejected": -2.703303575515747, + "logps/chosen": -270.84503173828125, + "logps/rejected": -263.55548095703125, + "loss": 0.5022, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.21006159484386444, + "rewards/margins": 1.251680850982666, + "rewards/rejected": -1.4617425203323364, + "step": 300 + }, + { + "epoch": 0.31, + "eval_logits/chosen": -2.8225364685058594, + "eval_logits/rejected": -2.7362353801727295, + "eval_logps/chosen": -275.37347412109375, + "eval_logps/rejected": -242.1844940185547, + "eval_loss": 0.4824218451976776, + "eval_rewards/accuracies": 0.7620000243186951, + "eval_rewards/chosen": -0.15262925624847412, + "eval_rewards/margins": 1.0990517139434814, + "eval_rewards/rejected": -1.2516810894012451, + "eval_runtime": 329.0169, + "eval_samples_per_second": 6.079, + "eval_steps_per_second": 0.38, + "step": 300 + }, + { + "epoch": 0.32, + "learning_rate": 4.963643321852277e-07, + "logits/chosen": -2.657869815826416, + "logits/rejected": -2.604085683822632, + "logps/chosen": -269.7090759277344, + "logps/rejected": -248.0914306640625, + "loss": 0.4821, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.2833268642425537, + "rewards/margins": 1.0022751092910767, + "rewards/rejected": -1.2856018543243408, + "step": 310 + }, + { + "epoch": 0.33, + "learning_rate": 4.944508228090318e-07, + "logits/chosen": -2.753511905670166, + "logits/rejected": -2.669100284576416, + "logps/chosen": -282.27386474609375, + "logps/rejected": -246.5732879638672, + "loss": 0.4972, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.3067387044429779, + "rewards/margins": 1.3396486043930054, + "rewards/rejected": -1.6463874578475952, + "step": 320 + }, + { + "epoch": 0.34, + "learning_rate": 4.925373134328357e-07, + "logits/chosen": -2.638120412826538, + "logits/rejected": -2.5762674808502197, + "logps/chosen": -251.3295135498047, + "logps/rejected": -223.2657470703125, + "loss": 0.4817, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.36169368028640747, + "rewards/margins": 1.3353826999664307, + "rewards/rejected": -1.697076439857483, + "step": 330 + }, + { + "epoch": 0.35, + "learning_rate": 4.906238040566398e-07, + "logits/chosen": -2.7591946125030518, + "logits/rejected": -2.647918939590454, + "logps/chosen": -253.46444702148438, + "logps/rejected": -215.0161590576172, + "loss": 0.4821, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -0.3391885757446289, + "rewards/margins": 1.2555291652679443, + "rewards/rejected": -1.5947177410125732, + "step": 340 + }, + { + "epoch": 0.36, + "learning_rate": 4.887102946804438e-07, + "logits/chosen": -2.755331039428711, + "logits/rejected": -2.664771556854248, + "logps/chosen": -278.7284851074219, + "logps/rejected": -265.8031311035156, + "loss": 0.5166, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.29750093817710876, + "rewards/margins": 1.3157111406326294, + "rewards/rejected": -1.61321222782135, + "step": 350 + }, + { + "epoch": 0.37, + "learning_rate": 4.867967853042479e-07, + "logits/chosen": -2.7414841651916504, + "logits/rejected": -2.649625778198242, + "logps/chosen": -289.3199768066406, + "logps/rejected": -261.6228942871094, + "loss": 0.507, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.4183623790740967, + "rewards/margins": 1.3235833644866943, + "rewards/rejected": -1.7419458627700806, + "step": 360 + }, + { + "epoch": 0.38, + "learning_rate": 4.84883275928052e-07, + "logits/chosen": -2.6831367015838623, + "logits/rejected": -2.6130385398864746, + "logps/chosen": -243.5470428466797, + "logps/rejected": -231.25369262695312, + "loss": 0.5619, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7277100086212158, + "rewards/margins": 0.7260710597038269, + "rewards/rejected": -1.4537811279296875, + "step": 370 + }, + { + "epoch": 0.39, + "learning_rate": 4.82969766551856e-07, + "logits/chosen": -2.79364275932312, + "logits/rejected": -2.704852342605591, + "logps/chosen": -284.8489685058594, + "logps/rejected": -241.8092041015625, + "loss": 0.4803, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.4155152440071106, + "rewards/margins": 1.2566057443618774, + "rewards/rejected": -1.6721210479736328, + "step": 380 + }, + { + "epoch": 0.4, + "learning_rate": 4.810562571756601e-07, + "logits/chosen": -2.7232635021209717, + "logits/rejected": -2.6678431034088135, + "logps/chosen": -290.93505859375, + "logps/rejected": -260.04241943359375, + "loss": 0.4719, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3851390480995178, + "rewards/margins": 1.2824820280075073, + "rewards/rejected": -1.6676212549209595, + "step": 390 + }, + { + "epoch": 0.41, + "learning_rate": 4.791427477994642e-07, + "logits/chosen": -2.7520861625671387, + "logits/rejected": -2.6794071197509766, + "logps/chosen": -318.5746765136719, + "logps/rejected": -269.6056823730469, + "loss": 0.5282, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.6232264637947083, + "rewards/margins": 0.9759466052055359, + "rewards/rejected": -1.5991729497909546, + "step": 400 + }, + { + "epoch": 0.41, + "eval_logits/chosen": -2.792391061782837, + "eval_logits/rejected": -2.702341079711914, + "eval_logps/chosen": -280.6413269042969, + "eval_logps/rejected": -249.08758544921875, + "eval_loss": 0.4877680838108063, + "eval_rewards/accuracies": 0.7839999794960022, + "eval_rewards/chosen": -0.6794139742851257, + "eval_rewards/margins": 1.262575387954712, + "eval_rewards/rejected": -1.9419893026351929, + "eval_runtime": 327.8949, + "eval_samples_per_second": 6.1, + "eval_steps_per_second": 0.381, + "step": 400 + }, + { + "epoch": 0.42, + "learning_rate": 4.772292384232682e-07, + "logits/chosen": -2.709923028945923, + "logits/rejected": -2.662933588027954, + "logps/chosen": -258.0174560546875, + "logps/rejected": -243.55264282226562, + "loss": 0.506, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.7414777278900146, + "rewards/margins": 1.040825605392456, + "rewards/rejected": -1.7823032140731812, + "step": 410 + }, + { + "epoch": 0.43, + "learning_rate": 4.753157290470723e-07, + "logits/chosen": -2.770792245864868, + "logits/rejected": -2.7004318237304688, + "logps/chosen": -301.0741271972656, + "logps/rejected": -260.6761779785156, + "loss": 0.5656, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5612724423408508, + "rewards/margins": 0.8177105188369751, + "rewards/rejected": -1.3789829015731812, + "step": 420 + }, + { + "epoch": 0.44, + "learning_rate": 4.7340221967087635e-07, + "logits/chosen": -2.6355745792388916, + "logits/rejected": -2.5235517024993896, + "logps/chosen": -259.4550476074219, + "logps/rejected": -241.081298828125, + "loss": 0.5152, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -0.33019572496414185, + "rewards/margins": 1.2212638854980469, + "rewards/rejected": -1.5514596700668335, + "step": 430 + }, + { + "epoch": 0.45, + "learning_rate": 4.714887102946804e-07, + "logits/chosen": -2.6858983039855957, + "logits/rejected": -2.606168031692505, + "logps/chosen": -286.8215026855469, + "logps/rejected": -261.89642333984375, + "loss": 0.4437, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.27932292222976685, + "rewards/margins": 1.2575178146362305, + "rewards/rejected": -1.536840796470642, + "step": 440 + }, + { + "epoch": 0.46, + "learning_rate": 4.6957520091848447e-07, + "logits/chosen": -2.6906371116638184, + "logits/rejected": -2.62721586227417, + "logps/chosen": -282.0963439941406, + "logps/rejected": -252.3018035888672, + "loss": 0.4949, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.45128607749938965, + "rewards/margins": 1.2710622549057007, + "rewards/rejected": -1.7223484516143799, + "step": 450 + }, + { + "epoch": 0.47, + "learning_rate": 4.6766169154228853e-07, + "logits/chosen": -2.6066861152648926, + "logits/rejected": -2.541717767715454, + "logps/chosen": -287.3714904785156, + "logps/rejected": -264.9255065917969, + "loss": 0.4911, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.575259804725647, + "rewards/margins": 1.08199143409729, + "rewards/rejected": -1.6572513580322266, + "step": 460 + }, + { + "epoch": 0.49, + "learning_rate": 4.657481821660926e-07, + "logits/chosen": -2.634578227996826, + "logits/rejected": -2.5506088733673096, + "logps/chosen": -291.7644348144531, + "logps/rejected": -266.2176513671875, + "loss": 0.4912, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.5489755868911743, + "rewards/margins": 1.124472975730896, + "rewards/rejected": -1.6734485626220703, + "step": 470 + }, + { + "epoch": 0.5, + "learning_rate": 4.6383467278989666e-07, + "logits/chosen": -2.7003040313720703, + "logits/rejected": -2.5828516483306885, + "logps/chosen": -315.20574951171875, + "logps/rejected": -249.669677734375, + "loss": 0.4455, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.2965030074119568, + "rewards/margins": 1.5941083431243896, + "rewards/rejected": -1.8906112909317017, + "step": 480 + }, + { + "epoch": 0.51, + "learning_rate": 4.6192116341370067e-07, + "logits/chosen": -2.6843831539154053, + "logits/rejected": -2.560138702392578, + "logps/chosen": -286.26092529296875, + "logps/rejected": -245.45443725585938, + "loss": 0.5024, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -0.48061037063598633, + "rewards/margins": 1.3923174142837524, + "rewards/rejected": -1.8729279041290283, + "step": 490 + }, + { + "epoch": 0.52, + "learning_rate": 4.6000765403750473e-07, + "logits/chosen": -2.7150895595550537, + "logits/rejected": -2.6805529594421387, + "logps/chosen": -259.2522888183594, + "logps/rejected": -249.19253540039062, + "loss": 0.5179, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.38437995314598083, + "rewards/margins": 0.9828270077705383, + "rewards/rejected": -1.3672068119049072, + "step": 500 + }, + { + "epoch": 0.52, + "eval_logits/chosen": -2.7630860805511475, + "eval_logits/rejected": -2.677306652069092, + "eval_logps/chosen": -276.4918212890625, + "eval_logps/rejected": -244.1531524658203, + "eval_loss": 0.4804608225822449, + "eval_rewards/accuracies": 0.7760000228881836, + "eval_rewards/chosen": -0.26446640491485596, + "eval_rewards/margins": 1.1840814352035522, + "eval_rewards/rejected": -1.4485478401184082, + "eval_runtime": 328.3333, + "eval_samples_per_second": 6.091, + "eval_steps_per_second": 0.381, + "step": 500 + }, + { + "epoch": 0.53, + "learning_rate": 4.580941446613088e-07, + "logits/chosen": -2.771284580230713, + "logits/rejected": -2.660457134246826, + "logps/chosen": -299.34197998046875, + "logps/rejected": -261.97998046875, + "loss": 0.4769, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -0.30852824449539185, + "rewards/margins": 1.3942599296569824, + "rewards/rejected": -1.7027881145477295, + "step": 510 + }, + { + "epoch": 0.54, + "learning_rate": 4.5618063528511285e-07, + "logits/chosen": -2.7847771644592285, + "logits/rejected": -2.7074437141418457, + "logps/chosen": -247.3426513671875, + "logps/rejected": -230.1765899658203, + "loss": 0.457, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.6353321075439453, + "rewards/margins": 1.1674182415008545, + "rewards/rejected": -1.8027503490447998, + "step": 520 + }, + { + "epoch": 0.55, + "learning_rate": 4.542671259089169e-07, + "logits/chosen": -2.6685898303985596, + "logits/rejected": -2.611987590789795, + "logps/chosen": -291.04827880859375, + "logps/rejected": -270.1742248535156, + "loss": 0.4972, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.6013702154159546, + "rewards/margins": 1.2920764684677124, + "rewards/rejected": -1.893446922302246, + "step": 530 + }, + { + "epoch": 0.56, + "learning_rate": 4.52353616532721e-07, + "logits/chosen": -2.691497802734375, + "logits/rejected": -2.620311737060547, + "logps/chosen": -271.4696044921875, + "logps/rejected": -264.66278076171875, + "loss": 0.5215, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5857911109924316, + "rewards/margins": 1.2745181322097778, + "rewards/rejected": -1.860309362411499, + "step": 540 + }, + { + "epoch": 0.57, + "learning_rate": 4.5044010715652504e-07, + "logits/chosen": -2.656611919403076, + "logits/rejected": -2.6252033710479736, + "logps/chosen": -273.7254943847656, + "logps/rejected": -247.2091827392578, + "loss": 0.4766, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7783125042915344, + "rewards/margins": 0.9800532460212708, + "rewards/rejected": -1.7583658695220947, + "step": 550 + }, + { + "epoch": 0.58, + "learning_rate": 4.485265977803291e-07, + "logits/chosen": -2.7592997550964355, + "logits/rejected": -2.657989978790283, + "logps/chosen": -281.6728820800781, + "logps/rejected": -246.2255859375, + "loss": 0.4732, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.4413899779319763, + "rewards/margins": 1.3485950231552124, + "rewards/rejected": -1.7899850606918335, + "step": 560 + }, + { + "epoch": 0.59, + "learning_rate": 4.4661308840413316e-07, + "logits/chosen": -2.7648417949676514, + "logits/rejected": -2.6974472999572754, + "logps/chosen": -249.6786346435547, + "logps/rejected": -236.48934936523438, + "loss": 0.4813, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.43044313788414, + "rewards/margins": 1.4435796737670898, + "rewards/rejected": -1.8740227222442627, + "step": 570 + }, + { + "epoch": 0.6, + "learning_rate": 4.446995790279372e-07, + "logits/chosen": -2.756700038909912, + "logits/rejected": -2.6791555881500244, + "logps/chosen": -291.4740295410156, + "logps/rejected": -262.89801025390625, + "loss": 0.4691, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.6896177530288696, + "rewards/margins": 1.4548016786575317, + "rewards/rejected": -2.1444194316864014, + "step": 580 + }, + { + "epoch": 0.61, + "learning_rate": 4.4278606965174123e-07, + "logits/chosen": -2.6673600673675537, + "logits/rejected": -2.6058640480041504, + "logps/chosen": -262.8567810058594, + "logps/rejected": -247.41561889648438, + "loss": 0.509, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.3871522545814514, + "rewards/margins": 1.3774772882461548, + "rewards/rejected": -1.764629602432251, + "step": 590 + }, + { + "epoch": 0.62, + "learning_rate": 4.408725602755453e-07, + "logits/chosen": -2.745607376098633, + "logits/rejected": -2.6397905349731445, + "logps/chosen": -289.14971923828125, + "logps/rejected": -248.13156127929688, + "loss": 0.4705, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -0.5186043977737427, + "rewards/margins": 1.4924787282943726, + "rewards/rejected": -2.0110831260681152, + "step": 600 + }, + { + "epoch": 0.62, + "eval_logits/chosen": -2.783844232559204, + "eval_logits/rejected": -2.7009334564208984, + "eval_logps/chosen": -276.8629150390625, + "eval_logps/rejected": -245.43368530273438, + "eval_loss": 0.47145330905914307, + "eval_rewards/accuracies": 0.7559999823570251, + "eval_rewards/chosen": -0.30157405138015747, + "eval_rewards/margins": 1.2750270366668701, + "eval_rewards/rejected": -1.5766010284423828, + "eval_runtime": 329.209, + "eval_samples_per_second": 6.075, + "eval_steps_per_second": 0.38, + "step": 600 + }, + { + "epoch": 0.63, + "learning_rate": 4.3895905089934936e-07, + "logits/chosen": -2.776756525039673, + "logits/rejected": -2.627607822418213, + "logps/chosen": -299.1159362792969, + "logps/rejected": -242.3809051513672, + "loss": 0.4517, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.31977617740631104, + "rewards/margins": 1.6483633518218994, + "rewards/rejected": -1.968139410018921, + "step": 610 + }, + { + "epoch": 0.64, + "learning_rate": 4.370455415231534e-07, + "logits/chosen": -2.73073148727417, + "logits/rejected": -2.6580116748809814, + "logps/chosen": -263.01519775390625, + "logps/rejected": -232.265625, + "loss": 0.4977, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.4189836084842682, + "rewards/margins": 1.2726702690124512, + "rewards/rejected": -1.691653847694397, + "step": 620 + }, + { + "epoch": 0.65, + "learning_rate": 4.351320321469575e-07, + "logits/chosen": -2.7166030406951904, + "logits/rejected": -2.6070265769958496, + "logps/chosen": -264.41058349609375, + "logps/rejected": -236.72128295898438, + "loss": 0.4824, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.27821844816207886, + "rewards/margins": 1.1915299892425537, + "rewards/rejected": -1.4697484970092773, + "step": 630 + }, + { + "epoch": 0.66, + "learning_rate": 4.3321852277076154e-07, + "logits/chosen": -2.611860513687134, + "logits/rejected": -2.5530788898468018, + "logps/chosen": -286.221923828125, + "logps/rejected": -246.6705780029297, + "loss": 0.4779, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.16525471210479736, + "rewards/margins": 1.415501594543457, + "rewards/rejected": -1.5807561874389648, + "step": 640 + }, + { + "epoch": 0.67, + "learning_rate": 4.313050133945656e-07, + "logits/chosen": -2.626971960067749, + "logits/rejected": -2.5745906829833984, + "logps/chosen": -255.57138061523438, + "logps/rejected": -252.6260528564453, + "loss": 0.5389, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.09872700273990631, + "rewards/margins": 1.0981225967407227, + "rewards/rejected": -1.1968495845794678, + "step": 650 + }, + { + "epoch": 0.68, + "learning_rate": 4.2939150401836967e-07, + "logits/chosen": -2.596104145050049, + "logits/rejected": -2.5283610820770264, + "logps/chosen": -255.2698516845703, + "logps/rejected": -223.36605834960938, + "loss": 0.5229, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.45211920142173767, + "rewards/margins": 0.9975794553756714, + "rewards/rejected": -1.4496986865997314, + "step": 660 + }, + { + "epoch": 0.69, + "learning_rate": 4.2747799464217373e-07, + "logits/chosen": -2.6542000770568848, + "logits/rejected": -2.574998378753662, + "logps/chosen": -261.0472717285156, + "logps/rejected": -231.40328979492188, + "loss": 0.5082, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.492902934551239, + "rewards/margins": 1.1550512313842773, + "rewards/rejected": -1.647953987121582, + "step": 670 + }, + { + "epoch": 0.7, + "learning_rate": 4.255644852659778e-07, + "logits/chosen": -2.6648573875427246, + "logits/rejected": -2.5788636207580566, + "logps/chosen": -303.67791748046875, + "logps/rejected": -276.10723876953125, + "loss": 0.4633, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.35903650522232056, + "rewards/margins": 1.5083674192428589, + "rewards/rejected": -1.8674037456512451, + "step": 680 + }, + { + "epoch": 0.71, + "learning_rate": 4.236509758897818e-07, + "logits/chosen": -2.6442172527313232, + "logits/rejected": -2.5383548736572266, + "logps/chosen": -322.7700500488281, + "logps/rejected": -257.04278564453125, + "loss": 0.503, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.48487988114356995, + "rewards/margins": 1.269715666770935, + "rewards/rejected": -1.7545955181121826, + "step": 690 + }, + { + "epoch": 0.72, + "learning_rate": 4.2173746651358586e-07, + "logits/chosen": -2.6078996658325195, + "logits/rejected": -2.502384901046753, + "logps/chosen": -276.53558349609375, + "logps/rejected": -268.16131591796875, + "loss": 0.5038, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.3312338888645172, + "rewards/margins": 1.3475643396377563, + "rewards/rejected": -1.6787983179092407, + "step": 700 + }, + { + "epoch": 0.72, + "eval_logits/chosen": -2.6268980503082275, + "eval_logits/rejected": -2.5408694744110107, + "eval_logps/chosen": -276.9665832519531, + "eval_logps/rejected": -245.39862060546875, + "eval_loss": 0.47904127836227417, + "eval_rewards/accuracies": 0.7680000066757202, + "eval_rewards/chosen": -0.31194165349006653, + "eval_rewards/margins": 1.261155366897583, + "eval_rewards/rejected": -1.5730971097946167, + "eval_runtime": 329.3542, + "eval_samples_per_second": 6.072, + "eval_steps_per_second": 0.38, + "step": 700 + }, + { + "epoch": 0.73, + "learning_rate": 4.198239571373899e-07, + "logits/chosen": -2.565584182739258, + "logits/rejected": -2.4798641204833984, + "logps/chosen": -289.7725830078125, + "logps/rejected": -241.417724609375, + "loss": 0.4679, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.13149698078632355, + "rewards/margins": 1.4788200855255127, + "rewards/rejected": -1.6103169918060303, + "step": 710 + }, + { + "epoch": 0.74, + "learning_rate": 4.17910447761194e-07, + "logits/chosen": -2.5125203132629395, + "logits/rejected": -2.440331220626831, + "logps/chosen": -243.36306762695312, + "logps/rejected": -215.3681640625, + "loss": 0.4894, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -0.39927834272384644, + "rewards/margins": 1.3379909992218018, + "rewards/rejected": -1.737269401550293, + "step": 720 + }, + { + "epoch": 0.75, + "learning_rate": 4.1599693838499805e-07, + "logits/chosen": -2.523902177810669, + "logits/rejected": -2.4464023113250732, + "logps/chosen": -322.8412170410156, + "logps/rejected": -260.85198974609375, + "loss": 0.5377, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4920814633369446, + "rewards/margins": 1.0238711833953857, + "rewards/rejected": -1.515952467918396, + "step": 730 + }, + { + "epoch": 0.76, + "learning_rate": 4.140834290088021e-07, + "logits/chosen": -2.4797842502593994, + "logits/rejected": -2.4306349754333496, + "logps/chosen": -260.5240173339844, + "logps/rejected": -235.9158172607422, + "loss": 0.4844, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.26274144649505615, + "rewards/margins": 1.1417067050933838, + "rewards/rejected": -1.40444815158844, + "step": 740 + }, + { + "epoch": 0.77, + "learning_rate": 4.121699196326062e-07, + "logits/chosen": -2.5010387897491455, + "logits/rejected": -2.4390175342559814, + "logps/chosen": -267.5348815917969, + "logps/rejected": -265.01861572265625, + "loss": 0.5134, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.3145067095756531, + "rewards/margins": 1.411447286605835, + "rewards/rejected": -1.7259540557861328, + "step": 750 + }, + { + "epoch": 0.78, + "learning_rate": 4.1025641025641024e-07, + "logits/chosen": -2.502624988555908, + "logits/rejected": -2.4160873889923096, + "logps/chosen": -248.45449829101562, + "logps/rejected": -230.887939453125, + "loss": 0.5244, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.5923348665237427, + "rewards/margins": 1.0817146301269531, + "rewards/rejected": -1.6740491390228271, + "step": 760 + }, + { + "epoch": 0.8, + "learning_rate": 4.083429008802143e-07, + "logits/chosen": -2.600817918777466, + "logits/rejected": -2.5673515796661377, + "logps/chosen": -283.1528015136719, + "logps/rejected": -261.5489501953125, + "loss": 0.5599, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4175918698310852, + "rewards/margins": 1.0563184022903442, + "rewards/rejected": -1.4739103317260742, + "step": 770 + }, + { + "epoch": 0.81, + "learning_rate": 4.0642939150401836e-07, + "logits/chosen": -2.5576181411743164, + "logits/rejected": -2.5258378982543945, + "logps/chosen": -274.6518859863281, + "logps/rejected": -244.93185424804688, + "loss": 0.4387, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.4217241406440735, + "rewards/margins": 1.273429036140442, + "rewards/rejected": -1.695152997970581, + "step": 780 + }, + { + "epoch": 0.82, + "learning_rate": 4.0451588212782237e-07, + "logits/chosen": -2.5924856662750244, + "logits/rejected": -2.5202670097351074, + "logps/chosen": -269.2760009765625, + "logps/rejected": -251.0695037841797, + "loss": 0.4782, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.3948896527290344, + "rewards/margins": 1.6062942743301392, + "rewards/rejected": -2.0011839866638184, + "step": 790 + }, + { + "epoch": 0.83, + "learning_rate": 4.0260237275162643e-07, + "logits/chosen": -2.500338315963745, + "logits/rejected": -2.4486918449401855, + "logps/chosen": -275.6775817871094, + "logps/rejected": -254.4421844482422, + "loss": 0.4418, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.3929779827594757, + "rewards/margins": 1.565860390663147, + "rewards/rejected": -1.9588382244110107, + "step": 800 + }, + { + "epoch": 0.83, + "eval_logits/chosen": -2.5635573863983154, + "eval_logits/rejected": -2.4833929538726807, + "eval_logps/chosen": -278.41131591796875, + "eval_logps/rejected": -249.84423828125, + "eval_loss": 0.46652939915657043, + "eval_rewards/accuracies": 0.7799999713897705, + "eval_rewards/chosen": -0.45641571283340454, + "eval_rewards/margins": 1.5612393617630005, + "eval_rewards/rejected": -2.01765513420105, + "eval_runtime": 328.8068, + "eval_samples_per_second": 6.083, + "eval_steps_per_second": 0.38, + "step": 800 + }, + { + "epoch": 0.84, + "learning_rate": 4.006888633754305e-07, + "logits/chosen": -2.521207332611084, + "logits/rejected": -2.4697537422180176, + "logps/chosen": -286.6387634277344, + "logps/rejected": -234.17514038085938, + "loss": 0.5215, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.6284313201904297, + "rewards/margins": 1.2665073871612549, + "rewards/rejected": -1.8949388265609741, + "step": 810 + }, + { + "epoch": 0.85, + "learning_rate": 3.9877535399923456e-07, + "logits/chosen": -2.600959539413452, + "logits/rejected": -2.547930955886841, + "logps/chosen": -279.1971435546875, + "logps/rejected": -271.48529052734375, + "loss": 0.5072, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5025497674942017, + "rewards/margins": 1.4657728672027588, + "rewards/rejected": -1.968322515487671, + "step": 820 + }, + { + "epoch": 0.86, + "learning_rate": 3.968618446230386e-07, + "logits/chosen": -2.5383434295654297, + "logits/rejected": -2.4821691513061523, + "logps/chosen": -276.9917907714844, + "logps/rejected": -230.35025024414062, + "loss": 0.4449, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.5041875243186951, + "rewards/margins": 1.4996968507766724, + "rewards/rejected": -2.0038845539093018, + "step": 830 + }, + { + "epoch": 0.87, + "learning_rate": 3.949483352468427e-07, + "logits/chosen": -2.568638563156128, + "logits/rejected": -2.5122299194335938, + "logps/chosen": -234.30862426757812, + "logps/rejected": -238.3922119140625, + "loss": 0.4666, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.5311821103096008, + "rewards/margins": 1.3636810779571533, + "rewards/rejected": -1.894863486289978, + "step": 840 + }, + { + "epoch": 0.88, + "learning_rate": 3.9303482587064674e-07, + "logits/chosen": -2.550352096557617, + "logits/rejected": -2.4809679985046387, + "logps/chosen": -277.41363525390625, + "logps/rejected": -261.5442810058594, + "loss": 0.4991, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -0.4109998345375061, + "rewards/margins": 1.6612545251846313, + "rewards/rejected": -2.072254180908203, + "step": 850 + }, + { + "epoch": 0.89, + "learning_rate": 3.911213164944508e-07, + "logits/chosen": -2.5133216381073, + "logits/rejected": -2.4779887199401855, + "logps/chosen": -280.0914001464844, + "logps/rejected": -264.759765625, + "loss": 0.4767, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.3346596360206604, + "rewards/margins": 1.4379736185073853, + "rewards/rejected": -1.7726333141326904, + "step": 860 + }, + { + "epoch": 0.9, + "learning_rate": 3.8920780711825487e-07, + "logits/chosen": -2.558295488357544, + "logits/rejected": -2.4871633052825928, + "logps/chosen": -284.81866455078125, + "logps/rejected": -264.2315368652344, + "loss": 0.5071, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.68181312084198, + "rewards/margins": 1.149623155593872, + "rewards/rejected": -1.8314363956451416, + "step": 870 + }, + { + "epoch": 0.91, + "learning_rate": 3.8729429774205893e-07, + "logits/chosen": -2.5324788093566895, + "logits/rejected": -2.458425998687744, + "logps/chosen": -279.7908935546875, + "logps/rejected": -257.31512451171875, + "loss": 0.4992, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.35828351974487305, + "rewards/margins": 1.2599681615829468, + "rewards/rejected": -1.6182515621185303, + "step": 880 + }, + { + "epoch": 0.92, + "learning_rate": 3.8538078836586294e-07, + "logits/chosen": -2.4853129386901855, + "logits/rejected": -2.4347476959228516, + "logps/chosen": -281.7766418457031, + "logps/rejected": -250.4887237548828, + "loss": 0.4881, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3670748770236969, + "rewards/margins": 1.4128227233886719, + "rewards/rejected": -1.7798974514007568, + "step": 890 + }, + { + "epoch": 0.93, + "learning_rate": 3.83467278989667e-07, + "logits/chosen": -2.498248338699341, + "logits/rejected": -2.4675116539001465, + "logps/chosen": -293.01519775390625, + "logps/rejected": -260.99517822265625, + "loss": 0.5155, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.6357588768005371, + "rewards/margins": 1.182974100112915, + "rewards/rejected": -1.8187328577041626, + "step": 900 + }, + { + "epoch": 0.93, + "eval_logits/chosen": -2.592735528945923, + "eval_logits/rejected": -2.511793851852417, + "eval_logps/chosen": -277.5622253417969, + "eval_logps/rejected": -246.7467803955078, + "eval_loss": 0.47698700428009033, + "eval_rewards/accuracies": 0.7739999890327454, + "eval_rewards/chosen": -0.3715021014213562, + "eval_rewards/margins": 1.3364099264144897, + "eval_rewards/rejected": -1.7079118490219116, + "eval_runtime": 329.3477, + "eval_samples_per_second": 6.073, + "eval_steps_per_second": 0.38, + "step": 900 + }, + { + "epoch": 0.94, + "learning_rate": 3.8155376961347106e-07, + "logits/chosen": -2.5742714405059814, + "logits/rejected": -2.529768466949463, + "logps/chosen": -299.46148681640625, + "logps/rejected": -274.9568176269531, + "loss": 0.5162, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.5742639303207397, + "rewards/margins": 1.0656424760818481, + "rewards/rejected": -1.6399062871932983, + "step": 910 + }, + { + "epoch": 0.95, + "learning_rate": 3.796402602372751e-07, + "logits/chosen": -2.591820001602173, + "logits/rejected": -2.5311317443847656, + "logps/chosen": -250.81991577148438, + "logps/rejected": -271.74761962890625, + "loss": 0.4654, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.5318310260772705, + "rewards/margins": 1.5595569610595703, + "rewards/rejected": -2.09138822555542, + "step": 920 + }, + { + "epoch": 0.96, + "learning_rate": 3.777267508610792e-07, + "logits/chosen": -2.615499496459961, + "logits/rejected": -2.5490453243255615, + "logps/chosen": -298.67694091796875, + "logps/rejected": -250.24130249023438, + "loss": 0.4747, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -0.4868291914463043, + "rewards/margins": 1.7270475625991821, + "rewards/rejected": -2.213876247406006, + "step": 930 + }, + { + "epoch": 0.97, + "learning_rate": 3.7581324148488325e-07, + "logits/chosen": -2.6219589710235596, + "logits/rejected": -2.500911235809326, + "logps/chosen": -284.215576171875, + "logps/rejected": -231.4106903076172, + "loss": 0.4837, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.46450090408325195, + "rewards/margins": 1.5634416341781616, + "rewards/rejected": -2.027942657470703, + "step": 940 + }, + { + "epoch": 0.98, + "learning_rate": 3.738997321086873e-07, + "logits/chosen": -2.5877745151519775, + "logits/rejected": -2.5523974895477295, + "logps/chosen": -296.5566101074219, + "logps/rejected": -282.413818359375, + "loss": 0.4916, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.7169415354728699, + "rewards/margins": 1.3798969984054565, + "rewards/rejected": -2.0968384742736816, + "step": 950 + }, + { + "epoch": 0.99, + "learning_rate": 3.7198622273249137e-07, + "logits/chosen": -2.5522520542144775, + "logits/rejected": -2.5006377696990967, + "logps/chosen": -271.15631103515625, + "logps/rejected": -253.3765411376953, + "loss": 0.4431, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6804525852203369, + "rewards/margins": 1.3519657850265503, + "rewards/rejected": -2.0324184894561768, + "step": 960 + }, + { + "epoch": 1.0, + "learning_rate": 3.7007271335629544e-07, + "logits/chosen": -2.5357367992401123, + "logits/rejected": -2.434713363647461, + "logps/chosen": -301.4870300292969, + "logps/rejected": -240.8806915283203, + "loss": 0.5095, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.6089678406715393, + "rewards/margins": 1.4054759740829468, + "rewards/rejected": -2.014443874359131, + "step": 970 + }, + { + "epoch": 1.01, + "learning_rate": 3.681592039800995e-07, + "logits/chosen": -2.637885570526123, + "logits/rejected": -2.5964674949645996, + "logps/chosen": -285.35125732421875, + "logps/rejected": -281.14105224609375, + "loss": 0.3956, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.23545095324516296, + "rewards/margins": 2.0617527961730957, + "rewards/rejected": -2.29720401763916, + "step": 980 + }, + { + "epoch": 1.02, + "learning_rate": 3.662456946039035e-07, + "logits/chosen": -2.633040428161621, + "logits/rejected": -2.5896546840667725, + "logps/chosen": -291.3595886230469, + "logps/rejected": -263.8270263671875, + "loss": 0.3593, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.06208040565252304, + "rewards/margins": 2.071259021759033, + "rewards/rejected": -2.1333394050598145, + "step": 990 + }, + { + "epoch": 1.03, + "learning_rate": 3.6433218522770757e-07, + "logits/chosen": -2.621001958847046, + "logits/rejected": -2.5748400688171387, + "logps/chosen": -238.26870727539062, + "logps/rejected": -245.1639404296875, + "loss": 0.3463, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.22620248794555664, + "rewards/margins": 1.9476579427719116, + "rewards/rejected": -2.1738600730895996, + "step": 1000 + }, + { + "epoch": 1.03, + "eval_logits/chosen": -2.7082529067993164, + "eval_logits/rejected": -2.6281511783599854, + "eval_logps/chosen": -279.15203857421875, + "eval_logps/rejected": -247.93055725097656, + "eval_loss": 0.47547972202301025, + "eval_rewards/accuracies": 0.7680000066757202, + "eval_rewards/chosen": -0.5304849743843079, + "eval_rewards/margins": 1.2958035469055176, + "eval_rewards/rejected": -1.8262888193130493, + "eval_runtime": 328.7321, + "eval_samples_per_second": 6.084, + "eval_steps_per_second": 0.38, + "step": 1000 + }, + { + "epoch": 1.04, + "learning_rate": 3.6241867585151163e-07, + "logits/chosen": -2.6334519386291504, + "logits/rejected": -2.5576975345611572, + "logps/chosen": -237.9059295654297, + "logps/rejected": -202.5619659423828, + "loss": 0.2682, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -0.20000645518302917, + "rewards/margins": 1.985250473022461, + "rewards/rejected": -2.1852567195892334, + "step": 1010 + }, + { + "epoch": 1.05, + "learning_rate": 3.605051664753157e-07, + "logits/chosen": -2.6050827503204346, + "logits/rejected": -2.5590696334838867, + "logps/chosen": -263.57232666015625, + "logps/rejected": -226.03652954101562, + "loss": 0.2637, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 0.14606496691703796, + "rewards/margins": 2.4412872791290283, + "rewards/rejected": -2.295222520828247, + "step": 1020 + }, + { + "epoch": 1.06, + "learning_rate": 3.5859165709911975e-07, + "logits/chosen": -2.592588424682617, + "logits/rejected": -2.560858726501465, + "logps/chosen": -251.0801239013672, + "logps/rejected": -268.08868408203125, + "loss": 0.2414, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.16298413276672363, + "rewards/margins": 2.507159948348999, + "rewards/rejected": -2.3441758155822754, + "step": 1030 + }, + { + "epoch": 1.07, + "learning_rate": 3.566781477229238e-07, + "logits/chosen": -2.623647451400757, + "logits/rejected": -2.5220494270324707, + "logps/chosen": -319.46319580078125, + "logps/rejected": -263.6154479980469, + "loss": 0.2081, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 0.14128056168556213, + "rewards/margins": 2.827650785446167, + "rewards/rejected": -2.6863701343536377, + "step": 1040 + }, + { + "epoch": 1.08, + "learning_rate": 3.547646383467279e-07, + "logits/chosen": -2.5959651470184326, + "logits/rejected": -2.5497353076934814, + "logps/chosen": -283.4841613769531, + "logps/rejected": -247.54867553710938, + "loss": 0.1888, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.1350892335176468, + "rewards/margins": 2.7649712562561035, + "rewards/rejected": -2.6298820972442627, + "step": 1050 + }, + { + "epoch": 1.09, + "learning_rate": 3.5285112897053194e-07, + "logits/chosen": -2.559783697128296, + "logits/rejected": -2.5014069080352783, + "logps/chosen": -270.575927734375, + "logps/rejected": -260.4879455566406, + "loss": 0.1804, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 0.1834518164396286, + "rewards/margins": 3.1269893646240234, + "rewards/rejected": -2.943537712097168, + "step": 1060 + }, + { + "epoch": 1.1, + "learning_rate": 3.50937619594336e-07, + "logits/chosen": -2.627401113510132, + "logits/rejected": -2.5442299842834473, + "logps/chosen": -265.7518310546875, + "logps/rejected": -277.06048583984375, + "loss": 0.1877, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.1024879664182663, + "rewards/margins": 2.9685001373291016, + "rewards/rejected": -2.8660120964050293, + "step": 1070 + }, + { + "epoch": 1.12, + "learning_rate": 3.4902411021814007e-07, + "logits/chosen": -2.631279706954956, + "logits/rejected": -2.5586652755737305, + "logps/chosen": -291.9482116699219, + "logps/rejected": -279.103759765625, + "loss": 0.1456, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.08294292539358139, + "rewards/margins": 3.53105092048645, + "rewards/rejected": -3.448107957839966, + "step": 1080 + }, + { + "epoch": 1.13, + "learning_rate": 3.4711060084194413e-07, + "logits/chosen": -2.5913405418395996, + "logits/rejected": -2.5348830223083496, + "logps/chosen": -256.9413757324219, + "logps/rejected": -251.29953002929688, + "loss": 0.1252, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.05543314665555954, + "rewards/margins": 3.7001495361328125, + "rewards/rejected": -3.644716262817383, + "step": 1090 + }, + { + "epoch": 1.14, + "learning_rate": 3.4519709146574814e-07, + "logits/chosen": -2.631477117538452, + "logits/rejected": -2.6130545139312744, + "logps/chosen": -256.7820739746094, + "logps/rejected": -247.29861450195312, + "loss": 0.1266, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.08383353799581528, + "rewards/margins": 3.7477478981018066, + "rewards/rejected": -3.663914203643799, + "step": 1100 + }, + { + "epoch": 1.14, + "eval_logits/chosen": -2.6430113315582275, + "eval_logits/rejected": -2.5584213733673096, + "eval_logps/chosen": -283.978271484375, + "eval_logps/rejected": -258.3182067871094, + "eval_loss": 0.49244019389152527, + "eval_rewards/accuracies": 0.7739999890327454, + "eval_rewards/chosen": -1.0131123065948486, + "eval_rewards/margins": 1.8519433736801147, + "eval_rewards/rejected": -2.8650553226470947, + "eval_runtime": 328.4358, + "eval_samples_per_second": 6.089, + "eval_steps_per_second": 0.381, + "step": 1100 + }, + { + "epoch": 1.15, + "learning_rate": 3.432835820895522e-07, + "logits/chosen": -2.6089444160461426, + "logits/rejected": -2.569436550140381, + "logps/chosen": -277.25665283203125, + "logps/rejected": -279.2236633300781, + "loss": 0.1245, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.09174539893865585, + "rewards/margins": 3.8159255981445312, + "rewards/rejected": -3.9076714515686035, + "step": 1110 + }, + { + "epoch": 1.16, + "learning_rate": 3.4137007271335626e-07, + "logits/chosen": -2.6246414184570312, + "logits/rejected": -2.5093507766723633, + "logps/chosen": -296.4078674316406, + "logps/rejected": -253.695068359375, + "loss": 0.1108, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.04911733418703079, + "rewards/margins": 3.8723056316375732, + "rewards/rejected": -3.823188304901123, + "step": 1120 + }, + { + "epoch": 1.17, + "learning_rate": 3.394565633371603e-07, + "logits/chosen": -2.5909955501556396, + "logits/rejected": -2.5098001956939697, + "logps/chosen": -271.1025390625, + "logps/rejected": -267.0798645019531, + "loss": 0.1042, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.018337160348892212, + "rewards/margins": 4.38796329498291, + "rewards/rejected": -4.369626522064209, + "step": 1130 + }, + { + "epoch": 1.18, + "learning_rate": 3.375430539609644e-07, + "logits/chosen": -2.603445291519165, + "logits/rejected": -2.5317625999450684, + "logps/chosen": -267.75177001953125, + "logps/rejected": -276.5166931152344, + "loss": 0.0957, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.007721236441284418, + "rewards/margins": 4.616903781890869, + "rewards/rejected": -4.624624252319336, + "step": 1140 + }, + { + "epoch": 1.19, + "learning_rate": 3.3562954458476845e-07, + "logits/chosen": -2.5505504608154297, + "logits/rejected": -2.4800028800964355, + "logps/chosen": -244.16177368164062, + "logps/rejected": -260.59527587890625, + "loss": 0.0964, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.06246214359998703, + "rewards/margins": 4.644533634185791, + "rewards/rejected": -4.582070827484131, + "step": 1150 + }, + { + "epoch": 1.2, + "learning_rate": 3.337160352085725e-07, + "logits/chosen": -2.5717084407806396, + "logits/rejected": -2.48624587059021, + "logps/chosen": -284.52362060546875, + "logps/rejected": -286.84625244140625, + "loss": 0.0821, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.03565800562500954, + "rewards/margins": 4.872807502746582, + "rewards/rejected": -4.837149620056152, + "step": 1160 + }, + { + "epoch": 1.21, + "learning_rate": 3.3180252583237657e-07, + "logits/chosen": -2.61575984954834, + "logits/rejected": -2.5330493450164795, + "logps/chosen": -302.53533935546875, + "logps/rejected": -303.67889404296875, + "loss": 0.0761, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.09367243200540543, + "rewards/margins": 5.462395668029785, + "rewards/rejected": -5.5560688972473145, + "step": 1170 + }, + { + "epoch": 1.22, + "learning_rate": 3.2988901645618063e-07, + "logits/chosen": -2.6104063987731934, + "logits/rejected": -2.5553605556488037, + "logps/chosen": -269.8067932128906, + "logps/rejected": -275.9356994628906, + "loss": 0.0647, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 0.17088812589645386, + "rewards/margins": 5.2629804611206055, + "rewards/rejected": -5.092092990875244, + "step": 1180 + }, + { + "epoch": 1.23, + "learning_rate": 3.279755070799847e-07, + "logits/chosen": -2.594054698944092, + "logits/rejected": -2.5422348976135254, + "logps/chosen": -262.38433837890625, + "logps/rejected": -284.35272216796875, + "loss": 0.0645, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.2543705999851227, + "rewards/margins": 5.296311378479004, + "rewards/rejected": -5.550681114196777, + "step": 1190 + }, + { + "epoch": 1.24, + "learning_rate": 3.260619977037887e-07, + "logits/chosen": -2.606248140335083, + "logits/rejected": -2.5383098125457764, + "logps/chosen": -306.80389404296875, + "logps/rejected": -289.07110595703125, + "loss": 0.0751, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.4288966655731201, + "rewards/margins": 5.881345272064209, + "rewards/rejected": -5.45244836807251, + "step": 1200 + }, + { + "epoch": 1.24, + "eval_logits/chosen": -2.6450486183166504, + "eval_logits/rejected": -2.5574116706848145, + "eval_logps/chosen": -288.3549499511719, + "eval_logps/rejected": -266.3139343261719, + "eval_loss": 0.5208475589752197, + "eval_rewards/accuracies": 0.7760000228881836, + "eval_rewards/chosen": -1.4507769346237183, + "eval_rewards/margins": 2.213848352432251, + "eval_rewards/rejected": -3.6646251678466797, + "eval_runtime": 328.5316, + "eval_samples_per_second": 6.088, + "eval_steps_per_second": 0.38, + "step": 1200 + }, + { + "epoch": 1.25, + "learning_rate": 3.2414848832759277e-07, + "logits/chosen": -2.569955348968506, + "logits/rejected": -2.5366978645324707, + "logps/chosen": -271.14764404296875, + "logps/rejected": -311.7731628417969, + "loss": 0.0516, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.0714511051774025, + "rewards/margins": 6.200972557067871, + "rewards/rejected": -6.129521369934082, + "step": 1210 + }, + { + "epoch": 1.26, + "learning_rate": 3.2223497895139683e-07, + "logits/chosen": -2.552734851837158, + "logits/rejected": -2.444171905517578, + "logps/chosen": -288.9889221191406, + "logps/rejected": -297.237548828125, + "loss": 0.0629, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27881333231925964, + "rewards/margins": 6.696805000305176, + "rewards/rejected": -6.417990684509277, + "step": 1220 + }, + { + "epoch": 1.27, + "learning_rate": 3.203214695752009e-07, + "logits/chosen": -2.5759170055389404, + "logits/rejected": -2.5034148693084717, + "logps/chosen": -285.6966857910156, + "logps/rejected": -329.5125427246094, + "loss": 0.0629, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.15722158551216125, + "rewards/margins": 6.698854923248291, + "rewards/rejected": -6.541632652282715, + "step": 1230 + }, + { + "epoch": 1.28, + "learning_rate": 3.1840796019900495e-07, + "logits/chosen": -2.555694818496704, + "logits/rejected": -2.485347270965576, + "logps/chosen": -288.07611083984375, + "logps/rejected": -292.7718505859375, + "loss": 0.0525, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.205997496843338, + "rewards/margins": 6.743833065032959, + "rewards/rejected": -6.53783655166626, + "step": 1240 + }, + { + "epoch": 1.29, + "learning_rate": 3.16494450822809e-07, + "logits/chosen": -2.576767683029175, + "logits/rejected": -2.534345865249634, + "logps/chosen": -238.9010467529297, + "logps/rejected": -278.7056579589844, + "loss": 0.0425, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": 0.13358671963214874, + "rewards/margins": 6.311589241027832, + "rewards/rejected": -6.178002834320068, + "step": 1250 + }, + { + "epoch": 1.3, + "learning_rate": 3.145809414466131e-07, + "logits/chosen": -2.576357126235962, + "logits/rejected": -2.497936248779297, + "logps/chosen": -266.5452880859375, + "logps/rejected": -300.4448547363281, + "loss": 0.0417, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.3052632808685303, + "rewards/margins": 7.074311256408691, + "rewards/rejected": -6.769047737121582, + "step": 1260 + }, + { + "epoch": 1.31, + "learning_rate": 3.1266743207041714e-07, + "logits/chosen": -2.5541863441467285, + "logits/rejected": -2.496516704559326, + "logps/chosen": -262.9737854003906, + "logps/rejected": -312.977783203125, + "loss": 0.047, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": 0.3359752893447876, + "rewards/margins": 7.000231742858887, + "rewards/rejected": -6.664257049560547, + "step": 1270 + }, + { + "epoch": 1.32, + "learning_rate": 3.107539226942212e-07, + "logits/chosen": -2.4729394912719727, + "logits/rejected": -2.4231820106506348, + "logps/chosen": -249.3036651611328, + "logps/rejected": -301.2051696777344, + "loss": 0.0501, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.4446950852870941, + "rewards/margins": 6.993998050689697, + "rewards/rejected": -6.549304008483887, + "step": 1280 + }, + { + "epoch": 1.33, + "learning_rate": 3.0884041331802526e-07, + "logits/chosen": -2.563807964324951, + "logits/rejected": -2.496666431427002, + "logps/chosen": -284.2848205566406, + "logps/rejected": -292.50872802734375, + "loss": 0.0458, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 0.3111470341682434, + "rewards/margins": 7.219347953796387, + "rewards/rejected": -6.908200740814209, + "step": 1290 + }, + { + "epoch": 1.34, + "learning_rate": 3.0692690394182927e-07, + "logits/chosen": -2.505225658416748, + "logits/rejected": -2.4466845989227295, + "logps/chosen": -245.6407470703125, + "logps/rejected": -272.4444274902344, + "loss": 0.0306, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -0.265463650226593, + "rewards/margins": 6.57799768447876, + "rewards/rejected": -6.843461513519287, + "step": 1300 + }, + { + "epoch": 1.34, + "eval_logits/chosen": -2.5864531993865967, + "eval_logits/rejected": -2.4956560134887695, + "eval_logps/chosen": -295.3101806640625, + "eval_logps/rejected": -277.1172180175781, + "eval_loss": 0.577923059463501, + "eval_rewards/accuracies": 0.7580000162124634, + "eval_rewards/chosen": -2.1463019847869873, + "eval_rewards/margins": 2.5986533164978027, + "eval_rewards/rejected": -4.744955539703369, + "eval_runtime": 327.6113, + "eval_samples_per_second": 6.105, + "eval_steps_per_second": 0.382, + "step": 1300 + }, + { + "epoch": 1.35, + "learning_rate": 3.0501339456563334e-07, + "logits/chosen": -2.530522584915161, + "logits/rejected": -2.4422647953033447, + "logps/chosen": -260.3827819824219, + "logps/rejected": -274.53424072265625, + "loss": 0.0479, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.38277196884155273, + "rewards/margins": 6.145764350891113, + "rewards/rejected": -6.52853536605835, + "step": 1310 + }, + { + "epoch": 1.36, + "learning_rate": 3.030998851894374e-07, + "logits/chosen": -2.530467987060547, + "logits/rejected": -2.457545757293701, + "logps/chosen": -280.2557678222656, + "logps/rejected": -312.9700622558594, + "loss": 0.0435, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.0774858146905899, + "rewards/margins": 7.1998748779296875, + "rewards/rejected": -7.122389316558838, + "step": 1320 + }, + { + "epoch": 1.37, + "learning_rate": 3.0118637581324146e-07, + "logits/chosen": -2.5862784385681152, + "logits/rejected": -2.4979710578918457, + "logps/chosen": -265.94195556640625, + "logps/rejected": -322.9318542480469, + "loss": 0.0529, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.03257175534963608, + "rewards/margins": 7.4844207763671875, + "rewards/rejected": -7.451849460601807, + "step": 1330 + }, + { + "epoch": 1.38, + "learning_rate": 2.992728664370455e-07, + "logits/chosen": -2.529265880584717, + "logits/rejected": -2.472142457962036, + "logps/chosen": -241.907958984375, + "logps/rejected": -295.9644775390625, + "loss": 0.0453, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -0.2552769184112549, + "rewards/margins": 7.376172065734863, + "rewards/rejected": -7.631448268890381, + "step": 1340 + }, + { + "epoch": 1.39, + "learning_rate": 2.973593570608496e-07, + "logits/chosen": -2.6861939430236816, + "logits/rejected": -2.5757153034210205, + "logps/chosen": -304.3006286621094, + "logps/rejected": -301.52545166015625, + "loss": 0.0396, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 0.03746197372674942, + "rewards/margins": 7.219727993011475, + "rewards/rejected": -7.1822662353515625, + "step": 1350 + }, + { + "epoch": 1.4, + "learning_rate": 2.9544584768465365e-07, + "logits/chosen": -2.54135799407959, + "logits/rejected": -2.515650749206543, + "logps/chosen": -278.438232421875, + "logps/rejected": -317.2010498046875, + "loss": 0.0398, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.3339845538139343, + "rewards/margins": 7.23375940322876, + "rewards/rejected": -7.56774377822876, + "step": 1360 + }, + { + "epoch": 1.41, + "learning_rate": 2.935323383084577e-07, + "logits/chosen": -2.599730968475342, + "logits/rejected": -2.531221628189087, + "logps/chosen": -311.6480712890625, + "logps/rejected": -324.45635986328125, + "loss": 0.0447, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -0.26491448283195496, + "rewards/margins": 7.551673889160156, + "rewards/rejected": -7.816588401794434, + "step": 1370 + }, + { + "epoch": 1.42, + "learning_rate": 2.9161882893226177e-07, + "logits/chosen": -2.5754849910736084, + "logits/rejected": -2.5394327640533447, + "logps/chosen": -259.94171142578125, + "logps/rejected": -315.7677001953125, + "loss": 0.0517, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.3456391394138336, + "rewards/margins": 6.892177581787109, + "rewards/rejected": -7.23781681060791, + "step": 1380 + }, + { + "epoch": 1.44, + "learning_rate": 2.8970531955606583e-07, + "logits/chosen": -2.6177515983581543, + "logits/rejected": -2.5629687309265137, + "logps/chosen": -286.0957336425781, + "logps/rejected": -307.2629699707031, + "loss": 0.0425, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.2400694638490677, + "rewards/margins": 6.7607903480529785, + "rewards/rejected": -6.520721435546875, + "step": 1390 + }, + { + "epoch": 1.45, + "learning_rate": 2.8779181017986984e-07, + "logits/chosen": -2.5180513858795166, + "logits/rejected": -2.4203319549560547, + "logps/chosen": -262.3363952636719, + "logps/rejected": -293.3941955566406, + "loss": 0.031, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -0.30498212575912476, + "rewards/margins": 6.864785671234131, + "rewards/rejected": -7.1697678565979, + "step": 1400 + }, + { + "epoch": 1.45, + "eval_logits/chosen": -2.6050970554351807, + "eval_logits/rejected": -2.5157244205474854, + "eval_logps/chosen": -300.5773620605469, + "eval_logps/rejected": -282.7791748046875, + "eval_loss": 0.5992786288261414, + "eval_rewards/accuracies": 0.7580000162124634, + "eval_rewards/chosen": -2.673020124435425, + "eval_rewards/margins": 2.638129711151123, + "eval_rewards/rejected": -5.311149597167969, + "eval_runtime": 329.4856, + "eval_samples_per_second": 6.07, + "eval_steps_per_second": 0.379, + "step": 1400 + }, + { + "epoch": 1.46, + "learning_rate": 2.858783008036739e-07, + "logits/chosen": -2.6051602363586426, + "logits/rejected": -2.492736339569092, + "logps/chosen": -296.25274658203125, + "logps/rejected": -316.0076599121094, + "loss": 0.0344, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -0.7290128469467163, + "rewards/margins": 6.831245422363281, + "rewards/rejected": -7.560257911682129, + "step": 1410 + }, + { + "epoch": 1.47, + "learning_rate": 2.8396479142747797e-07, + "logits/chosen": -2.5280001163482666, + "logits/rejected": -2.476745367050171, + "logps/chosen": -287.8592529296875, + "logps/rejected": -325.2400207519531, + "loss": 0.0638, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.8896492123603821, + "rewards/margins": 7.580406188964844, + "rewards/rejected": -8.47005558013916, + "step": 1420 + }, + { + "epoch": 1.48, + "learning_rate": 2.8205128205128203e-07, + "logits/chosen": -2.591068744659424, + "logits/rejected": -2.5293643474578857, + "logps/chosen": -284.336181640625, + "logps/rejected": -330.9443359375, + "loss": 0.0486, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.8629849553108215, + "rewards/margins": 6.952902317047119, + "rewards/rejected": -7.815886497497559, + "step": 1430 + }, + { + "epoch": 1.49, + "learning_rate": 2.801377726750861e-07, + "logits/chosen": -2.597510814666748, + "logits/rejected": -2.549468517303467, + "logps/chosen": -293.65545654296875, + "logps/rejected": -315.900146484375, + "loss": 0.0487, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -0.5767217874526978, + "rewards/margins": 7.4031982421875, + "rewards/rejected": -7.97991943359375, + "step": 1440 + }, + { + "epoch": 1.5, + "learning_rate": 2.7822426329889015e-07, + "logits/chosen": -2.6672866344451904, + "logits/rejected": -2.5381336212158203, + "logps/chosen": -321.498291015625, + "logps/rejected": -318.640869140625, + "loss": 0.0441, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.2594658434391022, + "rewards/margins": 7.167167663574219, + "rewards/rejected": -7.426634311676025, + "step": 1450 + }, + { + "epoch": 1.51, + "learning_rate": 2.763107539226942e-07, + "logits/chosen": -2.6042659282684326, + "logits/rejected": -2.4887306690216064, + "logps/chosen": -276.0786437988281, + "logps/rejected": -292.72967529296875, + "loss": 0.0589, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.5728877186775208, + "rewards/margins": 6.894124507904053, + "rewards/rejected": -7.467011451721191, + "step": 1460 + }, + { + "epoch": 1.52, + "learning_rate": 2.743972445464983e-07, + "logits/chosen": -2.5835888385772705, + "logits/rejected": -2.523806571960449, + "logps/chosen": -274.3123474121094, + "logps/rejected": -314.17633056640625, + "loss": 0.0726, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -0.20469942688941956, + "rewards/margins": 6.882570743560791, + "rewards/rejected": -7.087271213531494, + "step": 1470 + }, + { + "epoch": 1.53, + "learning_rate": 2.7248373517030234e-07, + "logits/chosen": -2.5750460624694824, + "logits/rejected": -2.498121976852417, + "logps/chosen": -270.62261962890625, + "logps/rejected": -296.22247314453125, + "loss": 0.0533, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.46651512384414673, + "rewards/margins": 6.364560604095459, + "rewards/rejected": -6.831076145172119, + "step": 1480 + }, + { + "epoch": 1.54, + "learning_rate": 2.705702257941064e-07, + "logits/chosen": -2.6253905296325684, + "logits/rejected": -2.530066967010498, + "logps/chosen": -249.2847442626953, + "logps/rejected": -282.79571533203125, + "loss": 0.0408, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.562870979309082, + "rewards/margins": 6.45670223236084, + "rewards/rejected": -7.019574165344238, + "step": 1490 + }, + { + "epoch": 1.55, + "learning_rate": 2.686567164179104e-07, + "logits/chosen": -2.5404858589172363, + "logits/rejected": -2.4926059246063232, + "logps/chosen": -304.41253662109375, + "logps/rejected": -347.6630554199219, + "loss": 0.0535, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.6575719714164734, + "rewards/margins": 7.0210371017456055, + "rewards/rejected": -7.678609371185303, + "step": 1500 + }, + { + "epoch": 1.55, + "eval_logits/chosen": -2.652937650680542, + "eval_logits/rejected": -2.5616180896759033, + "eval_logps/chosen": -295.47467041015625, + "eval_logps/rejected": -277.6109924316406, + "eval_loss": 0.573060154914856, + "eval_rewards/accuracies": 0.75, + "eval_rewards/chosen": -2.162747621536255, + "eval_rewards/margins": 2.631584882736206, + "eval_rewards/rejected": -4.794332981109619, + "eval_runtime": 329.0517, + "eval_samples_per_second": 6.078, + "eval_steps_per_second": 0.38, + "step": 1500 + }, + { + "epoch": 1.56, + "learning_rate": 2.6674320704171447e-07, + "logits/chosen": -2.6182427406311035, + "logits/rejected": -2.5373194217681885, + "logps/chosen": -272.0426940917969, + "logps/rejected": -314.56256103515625, + "loss": 0.0406, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.302041620016098, + "rewards/margins": 6.977307319641113, + "rewards/rejected": -7.279348850250244, + "step": 1510 + }, + { + "epoch": 1.57, + "learning_rate": 2.6482969766551853e-07, + "logits/chosen": -2.5645289421081543, + "logits/rejected": -2.5248210430145264, + "logps/chosen": -267.35479736328125, + "logps/rejected": -293.21136474609375, + "loss": 0.0381, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.3300275206565857, + "rewards/margins": 6.987812042236328, + "rewards/rejected": -7.317839622497559, + "step": 1520 + }, + { + "epoch": 1.58, + "learning_rate": 2.629161882893226e-07, + "logits/chosen": -2.588085889816284, + "logits/rejected": -2.4837424755096436, + "logps/chosen": -276.7251281738281, + "logps/rejected": -307.32916259765625, + "loss": 0.0313, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.410992294549942, + "rewards/margins": 6.828526496887207, + "rewards/rejected": -7.239518165588379, + "step": 1530 + }, + { + "epoch": 1.59, + "learning_rate": 2.6100267891312666e-07, + "logits/chosen": -2.5293657779693604, + "logits/rejected": -2.4737491607666016, + "logps/chosen": -249.6378936767578, + "logps/rejected": -282.2840881347656, + "loss": 0.0485, + "rewards/accuracies": 0.96875, + "rewards/chosen": -1.0298715829849243, + "rewards/margins": 6.555671691894531, + "rewards/rejected": -7.58554220199585, + "step": 1540 + }, + { + "epoch": 1.6, + "learning_rate": 2.590891695369307e-07, + "logits/chosen": -2.5496673583984375, + "logits/rejected": -2.4890804290771484, + "logps/chosen": -300.34918212890625, + "logps/rejected": -333.11688232421875, + "loss": 0.0441, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.0020015239715576, + "rewards/margins": 6.947371006011963, + "rewards/rejected": -7.949372291564941, + "step": 1550 + }, + { + "epoch": 1.61, + "learning_rate": 2.571756601607348e-07, + "logits/chosen": -2.464364528656006, + "logits/rejected": -2.387753963470459, + "logps/chosen": -264.74920654296875, + "logps/rejected": -290.9281005859375, + "loss": 0.0492, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.6402736902236938, + "rewards/margins": 6.474742889404297, + "rewards/rejected": -7.115015983581543, + "step": 1560 + }, + { + "epoch": 1.62, + "learning_rate": 2.5526215078453884e-07, + "logits/chosen": -2.5369412899017334, + "logits/rejected": -2.425361156463623, + "logps/chosen": -284.7231750488281, + "logps/rejected": -296.4128112792969, + "loss": 0.0491, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -0.9682533144950867, + "rewards/margins": 6.315736770629883, + "rewards/rejected": -7.28399133682251, + "step": 1570 + }, + { + "epoch": 1.63, + "learning_rate": 2.533486414083429e-07, + "logits/chosen": -2.563443660736084, + "logits/rejected": -2.441769599914551, + "logps/chosen": -313.6112365722656, + "logps/rejected": -305.52239990234375, + "loss": 0.0388, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -0.46426552534103394, + "rewards/margins": 6.881335258483887, + "rewards/rejected": -7.3456010818481445, + "step": 1580 + }, + { + "epoch": 1.64, + "learning_rate": 2.5143513203214697e-07, + "logits/chosen": -2.5470073223114014, + "logits/rejected": -2.4438252449035645, + "logps/chosen": -251.0168914794922, + "logps/rejected": -280.91961669921875, + "loss": 0.0562, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.10129622370004654, + "rewards/margins": 6.691439151763916, + "rewards/rejected": -6.7927350997924805, + "step": 1590 + }, + { + "epoch": 1.65, + "learning_rate": 2.49521622655951e-07, + "logits/chosen": -2.610071897506714, + "logits/rejected": -2.5157220363616943, + "logps/chosen": -281.8477783203125, + "logps/rejected": -294.8487243652344, + "loss": 0.063, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.2210036963224411, + "rewards/margins": 6.306001663208008, + "rewards/rejected": -6.527005195617676, + "step": 1600 + }, + { + "epoch": 1.65, + "eval_logits/chosen": -2.5985054969787598, + "eval_logits/rejected": -2.503840923309326, + "eval_logps/chosen": -293.67022705078125, + "eval_logps/rejected": -275.4324645996094, + "eval_loss": 0.543339192867279, + "eval_rewards/accuracies": 0.7580000162124634, + "eval_rewards/chosen": -1.9823036193847656, + "eval_rewards/margins": 2.594174861907959, + "eval_rewards/rejected": -4.576478481292725, + "eval_runtime": 329.1368, + "eval_samples_per_second": 6.077, + "eval_steps_per_second": 0.38, + "step": 1600 + }, + { + "epoch": 1.66, + "learning_rate": 2.4760811327975504e-07, + "logits/chosen": -2.5374481678009033, + "logits/rejected": -2.4631876945495605, + "logps/chosen": -272.7527770996094, + "logps/rejected": -301.7461242675781, + "loss": 0.0515, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -0.12996195256710052, + "rewards/margins": 7.1182541847229, + "rewards/rejected": -7.248216152191162, + "step": 1610 + }, + { + "epoch": 1.67, + "learning_rate": 2.456946039035591e-07, + "logits/chosen": -2.5310726165771484, + "logits/rejected": -2.470857620239258, + "logps/chosen": -264.4376525878906, + "logps/rejected": -309.92205810546875, + "loss": 0.0466, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.09321962296962738, + "rewards/margins": 6.667203426361084, + "rewards/rejected": -6.760423183441162, + "step": 1620 + }, + { + "epoch": 1.68, + "learning_rate": 2.4378109452736316e-07, + "logits/chosen": -2.4673683643341064, + "logits/rejected": -2.4015889167785645, + "logps/chosen": -245.9351348876953, + "logps/rejected": -268.8101806640625, + "loss": 0.054, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.7791624665260315, + "rewards/margins": 6.342289924621582, + "rewards/rejected": -7.121452331542969, + "step": 1630 + }, + { + "epoch": 1.69, + "learning_rate": 2.418675851511672e-07, + "logits/chosen": -2.560784101486206, + "logits/rejected": -2.449129343032837, + "logps/chosen": -273.04229736328125, + "logps/rejected": -294.8717346191406, + "loss": 0.0389, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.1826784312725067, + "rewards/margins": 6.658658504486084, + "rewards/rejected": -6.841336727142334, + "step": 1640 + }, + { + "epoch": 1.7, + "learning_rate": 2.399540757749713e-07, + "logits/chosen": -2.5664353370666504, + "logits/rejected": -2.452826738357544, + "logps/chosen": -303.6127624511719, + "logps/rejected": -331.66033935546875, + "loss": 0.0434, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.40071630477905273, + "rewards/margins": 6.958899021148682, + "rewards/rejected": -7.359615325927734, + "step": 1650 + }, + { + "epoch": 1.71, + "learning_rate": 2.3804056639877535e-07, + "logits/chosen": -2.535295009613037, + "logits/rejected": -2.417152166366577, + "logps/chosen": -333.24072265625, + "logps/rejected": -316.44830322265625, + "loss": 0.0342, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.7423610091209412, + "rewards/margins": 7.322349548339844, + "rewards/rejected": -8.06471061706543, + "step": 1660 + }, + { + "epoch": 1.72, + "learning_rate": 2.361270570225794e-07, + "logits/chosen": -2.564420700073242, + "logits/rejected": -2.4815382957458496, + "logps/chosen": -262.5311279296875, + "logps/rejected": -331.90570068359375, + "loss": 0.0324, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -0.6712821125984192, + "rewards/margins": 7.437242031097412, + "rewards/rejected": -8.108525276184082, + "step": 1670 + }, + { + "epoch": 1.73, + "learning_rate": 2.3421354764638345e-07, + "logits/chosen": -2.515321969985962, + "logits/rejected": -2.3945698738098145, + "logps/chosen": -290.71563720703125, + "logps/rejected": -294.52008056640625, + "loss": 0.0412, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -1.4271801710128784, + "rewards/margins": 6.557519435882568, + "rewards/rejected": -7.984699249267578, + "step": 1680 + }, + { + "epoch": 1.74, + "learning_rate": 2.323000382701875e-07, + "logits/chosen": -2.523240566253662, + "logits/rejected": -2.432344436645508, + "logps/chosen": -275.192138671875, + "logps/rejected": -287.0332336425781, + "loss": 0.0567, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -1.1197443008422852, + "rewards/margins": 6.0831122398376465, + "rewards/rejected": -7.202856540679932, + "step": 1690 + }, + { + "epoch": 1.76, + "learning_rate": 2.3038652889399157e-07, + "logits/chosen": -2.4755594730377197, + "logits/rejected": -2.389221668243408, + "logps/chosen": -303.61181640625, + "logps/rejected": -317.91302490234375, + "loss": 0.0423, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.6061100959777832, + "rewards/margins": 7.471484184265137, + "rewards/rejected": -8.077593803405762, + "step": 1700 + }, + { + "epoch": 1.76, + "eval_logits/chosen": -2.5654473304748535, + "eval_logits/rejected": -2.4635632038116455, + "eval_logps/chosen": -300.39990234375, + "eval_logps/rejected": -283.8501892089844, + "eval_loss": 0.5821015238761902, + "eval_rewards/accuracies": 0.7540000081062317, + "eval_rewards/chosen": -2.655275344848633, + "eval_rewards/margins": 2.762974739074707, + "eval_rewards/rejected": -5.41825008392334, + "eval_runtime": 329.0804, + "eval_samples_per_second": 6.078, + "eval_steps_per_second": 0.38, + "step": 1700 + }, + { + "epoch": 1.77, + "learning_rate": 2.2847301951779563e-07, + "logits/chosen": -2.4770474433898926, + "logits/rejected": -2.4310386180877686, + "logps/chosen": -263.2637939453125, + "logps/rejected": -285.5542297363281, + "loss": 0.0644, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.271353542804718, + "rewards/margins": 6.415966987609863, + "rewards/rejected": -6.687320709228516, + "step": 1710 + }, + { + "epoch": 1.78, + "learning_rate": 2.265595101415997e-07, + "logits/chosen": -2.52775239944458, + "logits/rejected": -2.4429614543914795, + "logps/chosen": -280.1533203125, + "logps/rejected": -329.3421936035156, + "loss": 0.0626, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.26627108454704285, + "rewards/margins": 7.076508522033691, + "rewards/rejected": -7.342779636383057, + "step": 1720 + }, + { + "epoch": 1.79, + "learning_rate": 2.2464600076540373e-07, + "logits/chosen": -2.5030529499053955, + "logits/rejected": -2.4013702869415283, + "logps/chosen": -240.1175537109375, + "logps/rejected": -286.1576232910156, + "loss": 0.0656, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.5509620904922485, + "rewards/margins": 6.837267875671387, + "rewards/rejected": -7.388230323791504, + "step": 1730 + }, + { + "epoch": 1.8, + "learning_rate": 2.227324913892078e-07, + "logits/chosen": -2.5704593658447266, + "logits/rejected": -2.5203540325164795, + "logps/chosen": -287.09075927734375, + "logps/rejected": -313.4488830566406, + "loss": 0.0569, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.18775661289691925, + "rewards/margins": 6.396711826324463, + "rewards/rejected": -6.584468841552734, + "step": 1740 + }, + { + "epoch": 1.81, + "learning_rate": 2.2081898201301186e-07, + "logits/chosen": -2.455457925796509, + "logits/rejected": -2.402740716934204, + "logps/chosen": -269.8533630371094, + "logps/rejected": -295.97845458984375, + "loss": 0.0409, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.4922047257423401, + "rewards/margins": 6.239272117614746, + "rewards/rejected": -6.7314772605896, + "step": 1750 + }, + { + "epoch": 1.82, + "learning_rate": 2.1890547263681592e-07, + "logits/chosen": -2.5787675380706787, + "logits/rejected": -2.5012478828430176, + "logps/chosen": -270.3778381347656, + "logps/rejected": -303.08624267578125, + "loss": 0.0471, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.5367101430892944, + "rewards/margins": 6.749762058258057, + "rewards/rejected": -7.286472320556641, + "step": 1760 + }, + { + "epoch": 1.83, + "learning_rate": 2.1699196326061998e-07, + "logits/chosen": -2.498349189758301, + "logits/rejected": -2.4476866722106934, + "logps/chosen": -277.01446533203125, + "logps/rejected": -307.1866760253906, + "loss": 0.0436, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -0.36937451362609863, + "rewards/margins": 6.952645778656006, + "rewards/rejected": -7.322020053863525, + "step": 1770 + }, + { + "epoch": 1.84, + "learning_rate": 2.1507845388442402e-07, + "logits/chosen": -2.5487923622131348, + "logits/rejected": -2.4616153240203857, + "logps/chosen": -280.83990478515625, + "logps/rejected": -283.40008544921875, + "loss": 0.0603, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.3574257791042328, + "rewards/margins": 6.618617057800293, + "rewards/rejected": -6.976043701171875, + "step": 1780 + }, + { + "epoch": 1.85, + "learning_rate": 2.1316494450822808e-07, + "logits/chosen": -2.508967876434326, + "logits/rejected": -2.4427499771118164, + "logps/chosen": -282.5861511230469, + "logps/rejected": -310.12091064453125, + "loss": 0.059, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.5728462934494019, + "rewards/margins": 6.127859115600586, + "rewards/rejected": -6.700705051422119, + "step": 1790 + }, + { + "epoch": 1.86, + "learning_rate": 2.1125143513203214e-07, + "logits/chosen": -2.4674155712127686, + "logits/rejected": -2.4133477210998535, + "logps/chosen": -274.87060546875, + "logps/rejected": -287.4228210449219, + "loss": 0.0559, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.9308744668960571, + "rewards/margins": 6.335507869720459, + "rewards/rejected": -7.266383171081543, + "step": 1800 + }, + { + "epoch": 1.86, + "eval_logits/chosen": -2.5741446018218994, + "eval_logits/rejected": -2.4843475818634033, + "eval_logps/chosen": -299.6483154296875, + "eval_logps/rejected": -282.3105773925781, + "eval_loss": 0.5656670928001404, + "eval_rewards/accuracies": 0.7519999742507935, + "eval_rewards/chosen": -2.580115556716919, + "eval_rewards/margins": 2.684171438217163, + "eval_rewards/rejected": -5.264286518096924, + "eval_runtime": 330.2789, + "eval_samples_per_second": 6.055, + "eval_steps_per_second": 0.378, + "step": 1800 + }, + { + "epoch": 1.87, + "learning_rate": 2.093379257558362e-07, + "logits/chosen": -2.5033411979675293, + "logits/rejected": -2.432813882827759, + "logps/chosen": -247.942626953125, + "logps/rejected": -305.20867919921875, + "loss": 0.0516, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -0.7584810853004456, + "rewards/margins": 6.978280067443848, + "rewards/rejected": -7.73676061630249, + "step": 1810 + }, + { + "epoch": 1.88, + "learning_rate": 2.0742441637964026e-07, + "logits/chosen": -2.499147653579712, + "logits/rejected": -2.4221835136413574, + "logps/chosen": -284.1277160644531, + "logps/rejected": -317.18463134765625, + "loss": 0.0549, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.9464308619499207, + "rewards/margins": 6.537860870361328, + "rewards/rejected": -7.484292507171631, + "step": 1820 + }, + { + "epoch": 1.89, + "learning_rate": 2.055109070034443e-07, + "logits/chosen": -2.519296169281006, + "logits/rejected": -2.4646060466766357, + "logps/chosen": -279.44268798828125, + "logps/rejected": -308.8426513671875, + "loss": 0.05, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.49726948142051697, + "rewards/margins": 6.457852840423584, + "rewards/rejected": -6.9551215171813965, + "step": 1830 + }, + { + "epoch": 1.9, + "learning_rate": 2.0359739762724836e-07, + "logits/chosen": -2.5203187465667725, + "logits/rejected": -2.459474802017212, + "logps/chosen": -275.2139892578125, + "logps/rejected": -314.4609680175781, + "loss": 0.0526, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.601524293422699, + "rewards/margins": 6.1087565422058105, + "rewards/rejected": -6.7102813720703125, + "step": 1840 + }, + { + "epoch": 1.91, + "learning_rate": 2.0168388825105242e-07, + "logits/chosen": -2.523761034011841, + "logits/rejected": -2.4565975666046143, + "logps/chosen": -288.87469482421875, + "logps/rejected": -316.4368896484375, + "loss": 0.0576, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.4725467562675476, + "rewards/margins": 6.452563285827637, + "rewards/rejected": -6.92510986328125, + "step": 1850 + }, + { + "epoch": 1.92, + "learning_rate": 1.997703788748565e-07, + "logits/chosen": -2.49338960647583, + "logits/rejected": -2.41786527633667, + "logps/chosen": -295.960205078125, + "logps/rejected": -308.38922119140625, + "loss": 0.0607, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.5486451387405396, + "rewards/margins": 6.254899024963379, + "rewards/rejected": -6.803544044494629, + "step": 1860 + }, + { + "epoch": 1.93, + "learning_rate": 1.9785686949866055e-07, + "logits/chosen": -2.4441514015197754, + "logits/rejected": -2.4188904762268066, + "logps/chosen": -280.95782470703125, + "logps/rejected": -308.5982360839844, + "loss": 0.0612, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.31675153970718384, + "rewards/margins": 6.945546627044678, + "rewards/rejected": -7.262298583984375, + "step": 1870 + }, + { + "epoch": 1.94, + "learning_rate": 1.9594336012246458e-07, + "logits/chosen": -2.518660545349121, + "logits/rejected": -2.458202838897705, + "logps/chosen": -288.21527099609375, + "logps/rejected": -335.2455139160156, + "loss": 0.0436, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.16030362248420715, + "rewards/margins": 6.654341697692871, + "rewards/rejected": -6.814645290374756, + "step": 1880 + }, + { + "epoch": 1.95, + "learning_rate": 1.9402985074626865e-07, + "logits/chosen": -2.5323879718780518, + "logits/rejected": -2.4646458625793457, + "logps/chosen": -253.74435424804688, + "logps/rejected": -308.44464111328125, + "loss": 0.0475, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.6830048561096191, + "rewards/margins": 6.6035966873168945, + "rewards/rejected": -7.286602020263672, + "step": 1890 + }, + { + "epoch": 1.96, + "learning_rate": 1.921163413700727e-07, + "logits/chosen": -2.5229687690734863, + "logits/rejected": -2.4853615760803223, + "logps/chosen": -314.48468017578125, + "logps/rejected": -311.384521484375, + "loss": 0.0468, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8841110467910767, + "rewards/margins": 6.407034873962402, + "rewards/rejected": -7.291146278381348, + "step": 1900 + }, + { + "epoch": 1.96, + "eval_logits/chosen": -2.539219379425049, + "eval_logits/rejected": -2.449059009552002, + "eval_logps/chosen": -298.44427490234375, + "eval_logps/rejected": -282.5741882324219, + "eval_loss": 0.5758858919143677, + "eval_rewards/accuracies": 0.7480000257492065, + "eval_rewards/chosen": -2.4597108364105225, + "eval_rewards/margins": 2.8309402465820312, + "eval_rewards/rejected": -5.290651321411133, + "eval_runtime": 329.9748, + "eval_samples_per_second": 6.061, + "eval_steps_per_second": 0.379, + "step": 1900 + }, + { + "epoch": 1.97, + "learning_rate": 1.9020283199387677e-07, + "logits/chosen": -2.4514074325561523, + "logits/rejected": -2.3297629356384277, + "logps/chosen": -287.038330078125, + "logps/rejected": -299.0102233886719, + "loss": 0.0499, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.0671433210372925, + "rewards/margins": 5.865691184997559, + "rewards/rejected": -6.932834625244141, + "step": 1910 + }, + { + "epoch": 1.98, + "learning_rate": 1.8828932261768083e-07, + "logits/chosen": -2.4341206550598145, + "logits/rejected": -2.371324300765991, + "logps/chosen": -282.197998046875, + "logps/rejected": -301.5996398925781, + "loss": 0.0463, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.2245758771896362, + "rewards/margins": 6.24572229385376, + "rewards/rejected": -7.470297813415527, + "step": 1920 + }, + { + "epoch": 1.99, + "learning_rate": 1.8637581324148487e-07, + "logits/chosen": -2.438136577606201, + "logits/rejected": -2.3593366146087646, + "logps/chosen": -286.82855224609375, + "logps/rejected": -316.28240966796875, + "loss": 0.0489, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.0146198272705078, + "rewards/margins": 6.5871477127075195, + "rewards/rejected": -7.601768493652344, + "step": 1930 + }, + { + "epoch": 2.0, + "learning_rate": 1.8446230386528893e-07, + "logits/chosen": -2.383092164993286, + "logits/rejected": -2.2963826656341553, + "logps/chosen": -294.37200927734375, + "logps/rejected": -295.9538879394531, + "loss": 0.0617, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.6222116947174072, + "rewards/margins": 6.908046722412109, + "rewards/rejected": -7.530259132385254, + "step": 1940 + }, + { + "epoch": 2.01, + "learning_rate": 1.82548794489093e-07, + "logits/chosen": -2.4570600986480713, + "logits/rejected": -2.4170963764190674, + "logps/chosen": -294.67333984375, + "logps/rejected": -328.25335693359375, + "loss": 0.0567, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.6761181950569153, + "rewards/margins": 6.681130886077881, + "rewards/rejected": -7.3572492599487305, + "step": 1950 + }, + { + "epoch": 2.02, + "learning_rate": 1.8063528511289706e-07, + "logits/chosen": -2.4279465675354004, + "logits/rejected": -2.3740410804748535, + "logps/chosen": -305.5331726074219, + "logps/rejected": -321.54974365234375, + "loss": 0.0629, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.977795422077179, + "rewards/margins": 6.151275634765625, + "rewards/rejected": -7.129071235656738, + "step": 1960 + }, + { + "epoch": 2.03, + "learning_rate": 1.7872177573670112e-07, + "logits/chosen": -2.4284489154815674, + "logits/rejected": -2.3778624534606934, + "logps/chosen": -238.752685546875, + "logps/rejected": -279.09161376953125, + "loss": 0.0655, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.1264437437057495, + "rewards/margins": 5.814446926116943, + "rewards/rejected": -6.940890312194824, + "step": 1970 + }, + { + "epoch": 2.04, + "learning_rate": 1.7680826636050515e-07, + "logits/chosen": -2.4329612255096436, + "logits/rejected": -2.374427080154419, + "logps/chosen": -259.0732116699219, + "logps/rejected": -254.0245361328125, + "loss": 0.0516, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.238419771194458, + "rewards/margins": 5.634509086608887, + "rewards/rejected": -6.872928619384766, + "step": 1980 + }, + { + "epoch": 2.05, + "learning_rate": 1.7489475698430921e-07, + "logits/chosen": -2.435520648956299, + "logits/rejected": -2.4173076152801514, + "logps/chosen": -271.8101501464844, + "logps/rejected": -295.2347412109375, + "loss": 0.0426, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -0.8866006731987, + "rewards/margins": 6.229605674743652, + "rewards/rejected": -7.116206169128418, + "step": 1990 + }, + { + "epoch": 2.07, + "learning_rate": 1.7298124760811328e-07, + "logits/chosen": -2.4372620582580566, + "logits/rejected": -2.3834497928619385, + "logps/chosen": -262.3973388671875, + "logps/rejected": -304.9111022949219, + "loss": 0.0576, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.2331212759017944, + "rewards/margins": 5.813534259796143, + "rewards/rejected": -7.046655178070068, + "step": 2000 + }, + { + "epoch": 2.07, + "eval_logits/chosen": -2.5015718936920166, + "eval_logits/rejected": -2.413215160369873, + "eval_logps/chosen": -299.8445739746094, + "eval_logps/rejected": -282.8996887207031, + "eval_loss": 0.5613510608673096, + "eval_rewards/accuracies": 0.7620000243186951, + "eval_rewards/chosen": -2.5997402667999268, + "eval_rewards/margins": 2.7234604358673096, + "eval_rewards/rejected": -5.323200702667236, + "eval_runtime": 329.9108, + "eval_samples_per_second": 6.062, + "eval_steps_per_second": 0.379, + "step": 2000 + }, + { + "epoch": 2.08, + "learning_rate": 1.7106773823191734e-07, + "logits/chosen": -2.474686622619629, + "logits/rejected": -2.3782389163970947, + "logps/chosen": -324.77459716796875, + "logps/rejected": -299.6976623535156, + "loss": 0.0394, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -0.9665130376815796, + "rewards/margins": 6.133960723876953, + "rewards/rejected": -7.100473880767822, + "step": 2010 + }, + { + "epoch": 2.09, + "learning_rate": 1.691542288557214e-07, + "logits/chosen": -2.4789416790008545, + "logits/rejected": -2.4072928428649902, + "logps/chosen": -298.6253356933594, + "logps/rejected": -285.716064453125, + "loss": 0.0399, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -1.0751765966415405, + "rewards/margins": 5.9020466804504395, + "rewards/rejected": -6.9772233963012695, + "step": 2020 + }, + { + "epoch": 2.1, + "learning_rate": 1.6724071947952544e-07, + "logits/chosen": -2.4440250396728516, + "logits/rejected": -2.360992670059204, + "logps/chosen": -285.7022705078125, + "logps/rejected": -315.54937744140625, + "loss": 0.0295, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.0514458417892456, + "rewards/margins": 6.273679733276367, + "rewards/rejected": -7.325125694274902, + "step": 2030 + }, + { + "epoch": 2.11, + "learning_rate": 1.653272101033295e-07, + "logits/chosen": -2.4839930534362793, + "logits/rejected": -2.404003620147705, + "logps/chosen": -268.47308349609375, + "logps/rejected": -310.1471252441406, + "loss": 0.0317, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.2032978534698486, + "rewards/margins": 5.9852614402771, + "rewards/rejected": -7.188559532165527, + "step": 2040 + }, + { + "epoch": 2.12, + "learning_rate": 1.6341370072713356e-07, + "logits/chosen": -2.52724289894104, + "logits/rejected": -2.4334981441497803, + "logps/chosen": -312.8106689453125, + "logps/rejected": -323.5155334472656, + "loss": 0.0278, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.9176046252250671, + "rewards/margins": 6.5511651039123535, + "rewards/rejected": -7.4687700271606445, + "step": 2050 + }, + { + "epoch": 2.13, + "learning_rate": 1.6150019135093762e-07, + "logits/chosen": -2.4501614570617676, + "logits/rejected": -2.4063010215759277, + "logps/chosen": -260.9747314453125, + "logps/rejected": -286.7646179199219, + "loss": 0.0239, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.3017902374267578, + "rewards/margins": 6.399510860443115, + "rewards/rejected": -7.701300144195557, + "step": 2060 + }, + { + "epoch": 2.14, + "learning_rate": 1.5958668197474169e-07, + "logits/chosen": -2.491021156311035, + "logits/rejected": -2.484483480453491, + "logps/chosen": -273.3662109375, + "logps/rejected": -292.72528076171875, + "loss": 0.0237, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.281099557876587, + "rewards/margins": 6.190659523010254, + "rewards/rejected": -7.471758842468262, + "step": 2070 + }, + { + "epoch": 2.15, + "learning_rate": 1.5767317259854572e-07, + "logits/chosen": -2.4878249168395996, + "logits/rejected": -2.434704542160034, + "logps/chosen": -300.8520812988281, + "logps/rejected": -315.6512756347656, + "loss": 0.0222, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3595197200775146, + "rewards/margins": 6.689205169677734, + "rewards/rejected": -8.048724174499512, + "step": 2080 + }, + { + "epoch": 2.16, + "learning_rate": 1.5575966322234978e-07, + "logits/chosen": -2.46006441116333, + "logits/rejected": -2.346818447113037, + "logps/chosen": -302.2090759277344, + "logps/rejected": -295.3673095703125, + "loss": 0.0235, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1756342649459839, + "rewards/margins": 6.565530300140381, + "rewards/rejected": -7.7411651611328125, + "step": 2090 + }, + { + "epoch": 2.17, + "learning_rate": 1.5384615384615385e-07, + "logits/chosen": -2.456608295440674, + "logits/rejected": -2.355076313018799, + "logps/chosen": -282.78973388671875, + "logps/rejected": -310.10906982421875, + "loss": 0.0135, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.484881043434143, + "rewards/margins": 7.286185264587402, + "rewards/rejected": -8.771065711975098, + "step": 2100 + }, + { + "epoch": 2.17, + "eval_logits/chosen": -2.5039713382720947, + "eval_logits/rejected": -2.4052250385284424, + "eval_logps/chosen": -305.83544921875, + "eval_logps/rejected": -293.5166320800781, + "eval_loss": 0.6182354688644409, + "eval_rewards/accuracies": 0.7639999985694885, + "eval_rewards/chosen": -3.198823928833008, + "eval_rewards/margins": 3.186070442199707, + "eval_rewards/rejected": -6.384894371032715, + "eval_runtime": 330.4934, + "eval_samples_per_second": 6.052, + "eval_steps_per_second": 0.378, + "step": 2100 + }, + { + "epoch": 2.18, + "learning_rate": 1.519326444699579e-07, + "logits/chosen": -2.447910785675049, + "logits/rejected": -2.3610103130340576, + "logps/chosen": -282.10418701171875, + "logps/rejected": -316.7162780761719, + "loss": 0.0157, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.4366261959075928, + "rewards/margins": 7.56749963760376, + "rewards/rejected": -9.004125595092773, + "step": 2110 + }, + { + "epoch": 2.19, + "learning_rate": 1.5001913509376197e-07, + "logits/chosen": -2.4108386039733887, + "logits/rejected": -2.3353419303894043, + "logps/chosen": -258.7259826660156, + "logps/rejected": -306.69049072265625, + "loss": 0.0193, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.2959132194519043, + "rewards/margins": 7.697202205657959, + "rewards/rejected": -8.993115425109863, + "step": 2120 + }, + { + "epoch": 2.2, + "learning_rate": 1.4810562571756603e-07, + "logits/chosen": -2.437985897064209, + "logits/rejected": -2.352027416229248, + "logps/chosen": -304.39508056640625, + "logps/rejected": -358.80987548828125, + "loss": 0.0139, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8380666971206665, + "rewards/margins": 8.16486930847168, + "rewards/rejected": -10.002935409545898, + "step": 2130 + }, + { + "epoch": 2.21, + "learning_rate": 1.4619211634137007e-07, + "logits/chosen": -2.495406150817871, + "logits/rejected": -2.417428970336914, + "logps/chosen": -327.63592529296875, + "logps/rejected": -334.3495788574219, + "loss": 0.0151, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.7903884649276733, + "rewards/margins": 7.90988302230835, + "rewards/rejected": -9.700272560119629, + "step": 2140 + }, + { + "epoch": 2.22, + "learning_rate": 1.4427860696517413e-07, + "logits/chosen": -2.4637489318847656, + "logits/rejected": -2.3969979286193848, + "logps/chosen": -277.05206298828125, + "logps/rejected": -316.2105712890625, + "loss": 0.0135, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.583605170249939, + "rewards/margins": 8.034161567687988, + "rewards/rejected": -9.617767333984375, + "step": 2150 + }, + { + "epoch": 2.23, + "learning_rate": 1.423650975889782e-07, + "logits/chosen": -2.4575247764587402, + "logits/rejected": -2.3824923038482666, + "logps/chosen": -284.96319580078125, + "logps/rejected": -330.1680908203125, + "loss": 0.0118, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.8364570140838623, + "rewards/margins": 8.434205055236816, + "rewards/rejected": -10.270662307739258, + "step": 2160 + }, + { + "epoch": 2.24, + "learning_rate": 1.4045158821278225e-07, + "logits/chosen": -2.43703031539917, + "logits/rejected": -2.3671228885650635, + "logps/chosen": -336.13751220703125, + "logps/rejected": -354.32635498046875, + "loss": 0.0176, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.1960761547088623, + "rewards/margins": 8.524799346923828, + "rewards/rejected": -9.720874786376953, + "step": 2170 + }, + { + "epoch": 2.25, + "learning_rate": 1.3853807883658632e-07, + "logits/chosen": -2.3543310165405273, + "logits/rejected": -2.3394277095794678, + "logps/chosen": -272.6856384277344, + "logps/rejected": -339.7895812988281, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8999465703964233, + "rewards/margins": 8.533658981323242, + "rewards/rejected": -10.433606147766113, + "step": 2180 + }, + { + "epoch": 2.26, + "learning_rate": 1.3662456946039035e-07, + "logits/chosen": -2.386059522628784, + "logits/rejected": -2.2526228427886963, + "logps/chosen": -311.24481201171875, + "logps/rejected": -337.0218811035156, + "loss": 0.0138, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3375542163848877, + "rewards/margins": 9.32975959777832, + "rewards/rejected": -10.667314529418945, + "step": 2190 + }, + { + "epoch": 2.27, + "learning_rate": 1.3471106008419441e-07, + "logits/chosen": -2.4342336654663086, + "logits/rejected": -2.3418126106262207, + "logps/chosen": -309.2164611816406, + "logps/rejected": -374.22576904296875, + "loss": 0.0149, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8751029968261719, + "rewards/margins": 9.406895637512207, + "rewards/rejected": -11.281997680664062, + "step": 2200 + }, + { + "epoch": 2.27, + "eval_logits/chosen": -2.4494409561157227, + "eval_logits/rejected": -2.353502035140991, + "eval_logps/chosen": -319.8072204589844, + "eval_logps/rejected": -311.62286376953125, + "eval_loss": 0.7074651122093201, + "eval_rewards/accuracies": 0.7419999837875366, + "eval_rewards/chosen": -4.59600305557251, + "eval_rewards/margins": 3.599517345428467, + "eval_rewards/rejected": -8.195520401000977, + "eval_runtime": 330.3156, + "eval_samples_per_second": 6.055, + "eval_steps_per_second": 0.378, + "step": 2200 + }, + { + "epoch": 2.28, + "learning_rate": 1.3279755070799848e-07, + "logits/chosen": -2.337085247039795, + "logits/rejected": -2.296482563018799, + "logps/chosen": -290.27789306640625, + "logps/rejected": -344.79730224609375, + "loss": 0.0096, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.79486083984375, + "rewards/margins": 9.907821655273438, + "rewards/rejected": -11.702681541442871, + "step": 2210 + }, + { + "epoch": 2.29, + "learning_rate": 1.3088404133180254e-07, + "logits/chosen": -2.3764395713806152, + "logits/rejected": -2.3238234519958496, + "logps/chosen": -271.51763916015625, + "logps/rejected": -333.38580322265625, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.068006992340088, + "rewards/margins": 9.185879707336426, + "rewards/rejected": -11.253885269165039, + "step": 2220 + }, + { + "epoch": 2.3, + "learning_rate": 1.289705319556066e-07, + "logits/chosen": -2.3385708332061768, + "logits/rejected": -2.2395923137664795, + "logps/chosen": -283.2237854003906, + "logps/rejected": -340.7456970214844, + "loss": 0.0137, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.4410890340805054, + "rewards/margins": 9.802583694458008, + "rewards/rejected": -11.243673324584961, + "step": 2230 + }, + { + "epoch": 2.31, + "learning_rate": 1.2705702257941064e-07, + "logits/chosen": -2.3051886558532715, + "logits/rejected": -2.232696771621704, + "logps/chosen": -284.79730224609375, + "logps/rejected": -361.62896728515625, + "loss": 0.0135, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.2478265762329102, + "rewards/margins": 9.604982376098633, + "rewards/rejected": -10.85280990600586, + "step": 2240 + }, + { + "epoch": 2.32, + "learning_rate": 1.251435132032147e-07, + "logits/chosen": -2.236691474914551, + "logits/rejected": -2.1974494457244873, + "logps/chosen": -269.29443359375, + "logps/rejected": -338.17822265625, + "loss": 0.0139, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.1111952066421509, + "rewards/margins": 9.45576000213623, + "rewards/rejected": -10.566953659057617, + "step": 2250 + }, + { + "epoch": 2.33, + "learning_rate": 1.2323000382701873e-07, + "logits/chosen": -2.350980520248413, + "logits/rejected": -2.2944483757019043, + "logps/chosen": -288.3419494628906, + "logps/rejected": -323.5022277832031, + "loss": 0.0118, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3432337045669556, + "rewards/margins": 9.604742050170898, + "rewards/rejected": -10.947977066040039, + "step": 2260 + }, + { + "epoch": 2.34, + "learning_rate": 1.213164944508228e-07, + "logits/chosen": -2.3275721073150635, + "logits/rejected": -2.25993013381958, + "logps/chosen": -260.2298889160156, + "logps/rejected": -314.8516845703125, + "loss": 0.0103, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.1889066696166992, + "rewards/margins": 8.959939956665039, + "rewards/rejected": -10.148846626281738, + "step": 2270 + }, + { + "epoch": 2.35, + "learning_rate": 1.1940298507462686e-07, + "logits/chosen": -2.3323237895965576, + "logits/rejected": -2.2373764514923096, + "logps/chosen": -270.145751953125, + "logps/rejected": -313.8666076660156, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1815932989120483, + "rewards/margins": 9.265039443969727, + "rewards/rejected": -10.446632385253906, + "step": 2280 + }, + { + "epoch": 2.36, + "learning_rate": 1.1748947569843092e-07, + "logits/chosen": -2.3373894691467285, + "logits/rejected": -2.2577757835388184, + "logps/chosen": -290.20556640625, + "logps/rejected": -351.8134765625, + "loss": 0.0088, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.011973261833191, + "rewards/margins": 9.961956977844238, + "rewards/rejected": -10.973930358886719, + "step": 2290 + }, + { + "epoch": 2.37, + "learning_rate": 1.1557596632223497e-07, + "logits/chosen": -2.344902515411377, + "logits/rejected": -2.2504470348358154, + "logps/chosen": -266.8149719238281, + "logps/rejected": -353.9951477050781, + "loss": 0.0095, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1190001964569092, + "rewards/margins": 10.073897361755371, + "rewards/rejected": -11.192898750305176, + "step": 2300 + }, + { + "epoch": 2.37, + "eval_logits/chosen": -2.397207736968994, + "eval_logits/rejected": -2.2942862510681152, + "eval_logps/chosen": -315.9493408203125, + "eval_logps/rejected": -307.45587158203125, + "eval_loss": 0.7116624712944031, + "eval_rewards/accuracies": 0.7540000081062317, + "eval_rewards/chosen": -4.210216999053955, + "eval_rewards/margins": 3.5686044692993164, + "eval_rewards/rejected": -7.778822422027588, + "eval_runtime": 329.6201, + "eval_samples_per_second": 6.068, + "eval_steps_per_second": 0.379, + "step": 2300 + }, + { + "epoch": 2.39, + "learning_rate": 1.1366245694603903e-07, + "logits/chosen": -2.30245304107666, + "logits/rejected": -2.233797073364258, + "logps/chosen": -265.81402587890625, + "logps/rejected": -341.81341552734375, + "loss": 0.0119, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.4100592136383057, + "rewards/margins": 10.045531272888184, + "rewards/rejected": -11.455589294433594, + "step": 2310 + }, + { + "epoch": 2.4, + "learning_rate": 1.1174894756984308e-07, + "logits/chosen": -2.3736674785614014, + "logits/rejected": -2.2465882301330566, + "logps/chosen": -302.5309143066406, + "logps/rejected": -323.94378662109375, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3777319192886353, + "rewards/margins": 9.126462936401367, + "rewards/rejected": -10.504194259643555, + "step": 2320 + }, + { + "epoch": 2.41, + "learning_rate": 1.0983543819364714e-07, + "logits/chosen": -2.2658305168151855, + "logits/rejected": -2.225450038909912, + "logps/chosen": -304.5056457519531, + "logps/rejected": -348.7767639160156, + "loss": 0.0114, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.0617547035217285, + "rewards/margins": 9.54466724395752, + "rewards/rejected": -10.60642147064209, + "step": 2330 + }, + { + "epoch": 2.42, + "learning_rate": 1.079219288174512e-07, + "logits/chosen": -2.2534327507019043, + "logits/rejected": -2.2105424404144287, + "logps/chosen": -306.9606018066406, + "logps/rejected": -354.7109375, + "loss": 0.0128, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.238394021987915, + "rewards/margins": 9.64238166809082, + "rewards/rejected": -10.880776405334473, + "step": 2340 + }, + { + "epoch": 2.43, + "learning_rate": 1.0600841944125525e-07, + "logits/chosen": -2.3249001502990723, + "logits/rejected": -2.2780890464782715, + "logps/chosen": -268.6153564453125, + "logps/rejected": -346.42047119140625, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2436628341674805, + "rewards/margins": 9.205244064331055, + "rewards/rejected": -10.448905944824219, + "step": 2350 + }, + { + "epoch": 2.44, + "learning_rate": 1.0409491006505931e-07, + "logits/chosen": -2.3749475479125977, + "logits/rejected": -2.277677059173584, + "logps/chosen": -300.11895751953125, + "logps/rejected": -348.1705017089844, + "loss": 0.0061, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.0702159404754639, + "rewards/margins": 9.171914100646973, + "rewards/rejected": -10.2421293258667, + "step": 2360 + }, + { + "epoch": 2.45, + "learning_rate": 1.0218140068886336e-07, + "logits/chosen": -2.2845263481140137, + "logits/rejected": -2.197556972503662, + "logps/chosen": -277.10174560546875, + "logps/rejected": -324.47216796875, + "loss": 0.01, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.2265087366104126, + "rewards/margins": 9.168752670288086, + "rewards/rejected": -10.395261764526367, + "step": 2370 + }, + { + "epoch": 2.46, + "learning_rate": 1.0026789131266743e-07, + "logits/chosen": -2.3598082065582275, + "logits/rejected": -2.265746593475342, + "logps/chosen": -294.85296630859375, + "logps/rejected": -352.6213073730469, + "loss": 0.0095, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5622330904006958, + "rewards/margins": 9.054221153259277, + "rewards/rejected": -10.616453170776367, + "step": 2380 + }, + { + "epoch": 2.47, + "learning_rate": 9.835438193647149e-08, + "logits/chosen": -2.3179988861083984, + "logits/rejected": -2.2776644229888916, + "logps/chosen": -301.02130126953125, + "logps/rejected": -350.03765869140625, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5659319162368774, + "rewards/margins": 9.565340995788574, + "rewards/rejected": -11.13127326965332, + "step": 2390 + }, + { + "epoch": 2.48, + "learning_rate": 9.644087256027554e-08, + "logits/chosen": -2.3582870960235596, + "logits/rejected": -2.267897367477417, + "logps/chosen": -298.76849365234375, + "logps/rejected": -371.87237548828125, + "loss": 0.0104, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.6058155298233032, + "rewards/margins": 9.32790470123291, + "rewards/rejected": -10.93371868133545, + "step": 2400 + }, + { + "epoch": 2.48, + "eval_logits/chosen": -2.409748077392578, + "eval_logits/rejected": -2.3097164630889893, + "eval_logps/chosen": -317.2179870605469, + "eval_logps/rejected": -308.91986083984375, + "eval_loss": 0.7131382822990417, + "eval_rewards/accuracies": 0.7540000081062317, + "eval_rewards/chosen": -4.337080955505371, + "eval_rewards/margins": 3.5881383419036865, + "eval_rewards/rejected": -7.92521858215332, + "eval_runtime": 330.8382, + "eval_samples_per_second": 6.045, + "eval_steps_per_second": 0.378, + "step": 2400 + }, + { + "epoch": 2.49, + "learning_rate": 9.45273631840796e-08, + "logits/chosen": -2.34238862991333, + "logits/rejected": -2.2983975410461426, + "logps/chosen": -308.1969299316406, + "logps/rejected": -335.4678955078125, + "loss": 0.008, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.8870359659194946, + "rewards/margins": 9.149595260620117, + "rewards/rejected": -11.03663158416748, + "step": 2410 + }, + { + "epoch": 2.5, + "learning_rate": 9.261385380788366e-08, + "logits/chosen": -2.4131252765655518, + "logits/rejected": -2.2457823753356934, + "logps/chosen": -327.6688537597656, + "logps/rejected": -347.8465576171875, + "loss": 0.0113, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.6184027194976807, + "rewards/margins": 9.20851993560791, + "rewards/rejected": -10.826921463012695, + "step": 2420 + }, + { + "epoch": 2.51, + "learning_rate": 9.070034443168771e-08, + "logits/chosen": -2.3476271629333496, + "logits/rejected": -2.217395067214966, + "logps/chosen": -293.73785400390625, + "logps/rejected": -336.8843994140625, + "loss": 0.0077, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.0833864212036133, + "rewards/margins": 9.448251724243164, + "rewards/rejected": -11.531637191772461, + "step": 2430 + }, + { + "epoch": 2.52, + "learning_rate": 8.878683505549177e-08, + "logits/chosen": -2.3457190990448, + "logits/rejected": -2.294241428375244, + "logps/chosen": -289.23876953125, + "logps/rejected": -358.06390380859375, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4211084842681885, + "rewards/margins": 9.614474296569824, + "rewards/rejected": -11.035581588745117, + "step": 2440 + }, + { + "epoch": 2.53, + "learning_rate": 8.687332567929582e-08, + "logits/chosen": -2.321498394012451, + "logits/rejected": -2.2159245014190674, + "logps/chosen": -276.77105712890625, + "logps/rejected": -323.2955627441406, + "loss": 0.0122, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.7652084827423096, + "rewards/margins": 8.866913795471191, + "rewards/rejected": -10.632121086120605, + "step": 2450 + }, + { + "epoch": 2.54, + "learning_rate": 8.495981630309988e-08, + "logits/chosen": -2.3643240928649902, + "logits/rejected": -2.2679874897003174, + "logps/chosen": -282.6573181152344, + "logps/rejected": -344.65411376953125, + "loss": 0.0067, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.9658873081207275, + "rewards/margins": 9.22823715209961, + "rewards/rejected": -11.194124221801758, + "step": 2460 + }, + { + "epoch": 2.55, + "learning_rate": 8.304630692690395e-08, + "logits/chosen": -2.252459764480591, + "logits/rejected": -2.1973116397857666, + "logps/chosen": -299.83819580078125, + "logps/rejected": -377.33172607421875, + "loss": 0.0138, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -1.9341270923614502, + "rewards/margins": 9.789133071899414, + "rewards/rejected": -11.723260879516602, + "step": 2470 + }, + { + "epoch": 2.56, + "learning_rate": 8.1132797550708e-08, + "logits/chosen": -2.3481366634368896, + "logits/rejected": -2.285489320755005, + "logps/chosen": -295.62939453125, + "logps/rejected": -349.37347412109375, + "loss": 0.0124, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.7781429290771484, + "rewards/margins": 9.523946762084961, + "rewards/rejected": -11.302090644836426, + "step": 2480 + }, + { + "epoch": 2.57, + "learning_rate": 7.921928817451206e-08, + "logits/chosen": -2.2330024242401123, + "logits/rejected": -2.1658129692077637, + "logps/chosen": -266.7092590332031, + "logps/rejected": -324.52203369140625, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.702344298362732, + "rewards/margins": 9.572081565856934, + "rewards/rejected": -11.274426460266113, + "step": 2490 + }, + { + "epoch": 2.58, + "learning_rate": 7.73057787983161e-08, + "logits/chosen": -2.310683488845825, + "logits/rejected": -2.223453998565674, + "logps/chosen": -284.54583740234375, + "logps/rejected": -350.00469970703125, + "loss": 0.008, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.8044601678848267, + "rewards/margins": 9.359917640686035, + "rewards/rejected": -11.164377212524414, + "step": 2500 + }, + { + "epoch": 2.58, + "eval_logits/chosen": -2.376404047012329, + "eval_logits/rejected": -2.2755658626556396, + "eval_logps/chosen": -318.2084045410156, + "eval_logps/rejected": -311.3636474609375, + "eval_loss": 0.7328027486801147, + "eval_rewards/accuracies": 0.7519999742507935, + "eval_rewards/chosen": -4.436122894287109, + "eval_rewards/margins": 3.7334771156311035, + "eval_rewards/rejected": -8.169599533081055, + "eval_runtime": 330.2894, + "eval_samples_per_second": 6.055, + "eval_steps_per_second": 0.378, + "step": 2500 + }, + { + "epoch": 2.59, + "learning_rate": 7.539226942212017e-08, + "logits/chosen": -2.2334113121032715, + "logits/rejected": -2.171754837036133, + "logps/chosen": -278.3266906738281, + "logps/rejected": -326.1250915527344, + "loss": 0.0105, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.8271163702011108, + "rewards/margins": 9.274404525756836, + "rewards/rejected": -11.101519584655762, + "step": 2510 + }, + { + "epoch": 2.6, + "learning_rate": 7.347876004592423e-08, + "logits/chosen": -2.2726237773895264, + "logits/rejected": -2.2070040702819824, + "logps/chosen": -294.99591064453125, + "logps/rejected": -365.6657409667969, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.223010778427124, + "rewards/margins": 9.6091890335083, + "rewards/rejected": -11.832199096679688, + "step": 2520 + }, + { + "epoch": 2.61, + "learning_rate": 7.156525066972828e-08, + "logits/chosen": -2.232114553451538, + "logits/rejected": -2.1668314933776855, + "logps/chosen": -302.2503662109375, + "logps/rejected": -340.13568115234375, + "loss": 0.0112, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.7910659313201904, + "rewards/margins": 9.791200637817383, + "rewards/rejected": -11.582267761230469, + "step": 2530 + }, + { + "epoch": 2.62, + "learning_rate": 6.965174129353234e-08, + "logits/chosen": -2.3309519290924072, + "logits/rejected": -2.1995091438293457, + "logps/chosen": -290.02789306640625, + "logps/rejected": -338.7962341308594, + "loss": 0.0089, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.841522216796875, + "rewards/margins": 9.228216171264648, + "rewards/rejected": -11.069738388061523, + "step": 2540 + }, + { + "epoch": 2.63, + "learning_rate": 6.773823191733639e-08, + "logits/chosen": -2.3205971717834473, + "logits/rejected": -2.2115180492401123, + "logps/chosen": -326.6658020019531, + "logps/rejected": -337.70684814453125, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.852582573890686, + "rewards/margins": 9.197256088256836, + "rewards/rejected": -11.049838066101074, + "step": 2550 + }, + { + "epoch": 2.64, + "learning_rate": 6.582472254114045e-08, + "logits/chosen": -2.294541835784912, + "logits/rejected": -2.1916394233703613, + "logps/chosen": -262.17401123046875, + "logps/rejected": -323.60498046875, + "loss": 0.0139, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.84153151512146, + "rewards/margins": 9.305184364318848, + "rewards/rejected": -11.14671516418457, + "step": 2560 + }, + { + "epoch": 2.65, + "learning_rate": 6.391121316494451e-08, + "logits/chosen": -2.3827064037323, + "logits/rejected": -2.2606282234191895, + "logps/chosen": -295.3243408203125, + "logps/rejected": -343.00860595703125, + "loss": 0.0082, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.9150327444076538, + "rewards/margins": 9.581887245178223, + "rewards/rejected": -11.496919631958008, + "step": 2570 + }, + { + "epoch": 2.66, + "learning_rate": 6.199770378874856e-08, + "logits/chosen": -2.2721750736236572, + "logits/rejected": -2.233607292175293, + "logps/chosen": -286.4488220214844, + "logps/rejected": -361.966552734375, + "loss": 0.01, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6838099956512451, + "rewards/margins": 10.286967277526855, + "rewards/rejected": -11.97077751159668, + "step": 2580 + }, + { + "epoch": 2.67, + "learning_rate": 6.008419441255262e-08, + "logits/chosen": -2.2622904777526855, + "logits/rejected": -2.212773084640503, + "logps/chosen": -274.4309997558594, + "logps/rejected": -340.4100646972656, + "loss": 0.0179, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.554031252861023, + "rewards/margins": 9.818443298339844, + "rewards/rejected": -11.372475624084473, + "step": 2590 + }, + { + "epoch": 2.68, + "learning_rate": 5.817068503635668e-08, + "logits/chosen": -2.3351638317108154, + "logits/rejected": -2.226699113845825, + "logps/chosen": -269.5257568359375, + "logps/rejected": -310.54815673828125, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2515082359313965, + "rewards/margins": 8.843351364135742, + "rewards/rejected": -11.094860076904297, + "step": 2600 + }, + { + "epoch": 2.68, + "eval_logits/chosen": -2.418520927429199, + "eval_logits/rejected": -2.3138442039489746, + "eval_logps/chosen": -316.7310791015625, + "eval_logps/rejected": -309.56011962890625, + "eval_loss": 0.7192761898040771, + "eval_rewards/accuracies": 0.7599999904632568, + "eval_rewards/chosen": -4.288391590118408, + "eval_rewards/margins": 3.700854778289795, + "eval_rewards/rejected": -7.989245891571045, + "eval_runtime": 330.2015, + "eval_samples_per_second": 6.057, + "eval_steps_per_second": 0.379, + "step": 2600 + }, + { + "epoch": 2.69, + "learning_rate": 5.6257175660160735e-08, + "logits/chosen": -2.3566582202911377, + "logits/rejected": -2.2767062187194824, + "logps/chosen": -286.93414306640625, + "logps/rejected": -343.0436706542969, + "loss": 0.0041, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.3818962574005127, + "rewards/margins": 9.171338081359863, + "rewards/rejected": -10.553235054016113, + "step": 2610 + }, + { + "epoch": 2.71, + "learning_rate": 5.4343666283964784e-08, + "logits/chosen": -2.4049510955810547, + "logits/rejected": -2.256659746170044, + "logps/chosen": -307.3303527832031, + "logps/rejected": -365.7108459472656, + "loss": 0.0107, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.3092598915100098, + "rewards/margins": 9.412524223327637, + "rewards/rejected": -10.721784591674805, + "step": 2620 + }, + { + "epoch": 2.72, + "learning_rate": 5.243015690776884e-08, + "logits/chosen": -2.4375126361846924, + "logits/rejected": -2.30391001701355, + "logps/chosen": -343.348388671875, + "logps/rejected": -355.0252990722656, + "loss": 0.0072, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.578098177909851, + "rewards/margins": 9.311563491821289, + "rewards/rejected": -10.88966178894043, + "step": 2630 + }, + { + "epoch": 2.73, + "learning_rate": 5.05166475315729e-08, + "logits/chosen": -2.3526530265808105, + "logits/rejected": -2.2642931938171387, + "logps/chosen": -269.9921875, + "logps/rejected": -338.1278381347656, + "loss": 0.0048, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.4594029188156128, + "rewards/margins": 9.540541648864746, + "rewards/rejected": -10.999944686889648, + "step": 2640 + }, + { + "epoch": 2.74, + "learning_rate": 4.860313815537696e-08, + "logits/chosen": -2.3303122520446777, + "logits/rejected": -2.2322537899017334, + "logps/chosen": -304.40924072265625, + "logps/rejected": -339.898681640625, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.252328395843506, + "rewards/margins": 9.010406494140625, + "rewards/rejected": -11.262735366821289, + "step": 2650 + }, + { + "epoch": 2.75, + "learning_rate": 4.668962877918101e-08, + "logits/chosen": -2.300126552581787, + "logits/rejected": -2.1978871822357178, + "logps/chosen": -281.3654479980469, + "logps/rejected": -307.08197021484375, + "loss": 0.0129, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.039625644683838, + "rewards/margins": 8.738824844360352, + "rewards/rejected": -10.778450012207031, + "step": 2660 + }, + { + "epoch": 2.76, + "learning_rate": 4.477611940298507e-08, + "logits/chosen": -2.3153433799743652, + "logits/rejected": -2.2383222579956055, + "logps/chosen": -313.5261535644531, + "logps/rejected": -362.248779296875, + "loss": 0.0111, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.031290054321289, + "rewards/margins": 9.838370323181152, + "rewards/rejected": -11.869660377502441, + "step": 2670 + }, + { + "epoch": 2.77, + "learning_rate": 4.2862610026789124e-08, + "logits/chosen": -2.3045172691345215, + "logits/rejected": -2.2299160957336426, + "logps/chosen": -275.14276123046875, + "logps/rejected": -345.14617919921875, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9210354089736938, + "rewards/margins": 9.726446151733398, + "rewards/rejected": -11.647480964660645, + "step": 2680 + }, + { + "epoch": 2.78, + "learning_rate": 4.0949100650593186e-08, + "logits/chosen": -2.360119104385376, + "logits/rejected": -2.2811567783355713, + "logps/chosen": -291.0294494628906, + "logps/rejected": -362.3855285644531, + "loss": 0.009, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.178312063217163, + "rewards/margins": 9.444883346557617, + "rewards/rejected": -11.623196601867676, + "step": 2690 + }, + { + "epoch": 2.79, + "learning_rate": 3.903559127439724e-08, + "logits/chosen": -2.2991082668304443, + "logits/rejected": -2.2151143550872803, + "logps/chosen": -280.21710205078125, + "logps/rejected": -339.8370056152344, + "loss": 0.0089, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.628218173980713, + "rewards/margins": 10.084501266479492, + "rewards/rejected": -12.712719917297363, + "step": 2700 + }, + { + "epoch": 2.79, + "eval_logits/chosen": -2.395979881286621, + "eval_logits/rejected": -2.2942333221435547, + "eval_logps/chosen": -322.83795166015625, + "eval_logps/rejected": -316.2196044921875, + "eval_loss": 0.738807737827301, + "eval_rewards/accuracies": 0.765999972820282, + "eval_rewards/chosen": -4.899077892303467, + "eval_rewards/margins": 3.7561144828796387, + "eval_rewards/rejected": -8.655191421508789, + "eval_runtime": 330.5088, + "eval_samples_per_second": 6.051, + "eval_steps_per_second": 0.378, + "step": 2700 + }, + { + "epoch": 2.8, + "learning_rate": 3.71220818982013e-08, + "logits/chosen": -2.360029935836792, + "logits/rejected": -2.322143077850342, + "logps/chosen": -303.7354736328125, + "logps/rejected": -367.031005859375, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0700087547302246, + "rewards/margins": 9.75294017791748, + "rewards/rejected": -11.822949409484863, + "step": 2710 + }, + { + "epoch": 2.81, + "learning_rate": 3.520857252200535e-08, + "logits/chosen": -2.272301197052002, + "logits/rejected": -2.2256054878234863, + "logps/chosen": -273.448486328125, + "logps/rejected": -336.57305908203125, + "loss": 0.0366, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.192540168762207, + "rewards/margins": 9.447149276733398, + "rewards/rejected": -11.639688491821289, + "step": 2720 + }, + { + "epoch": 2.82, + "learning_rate": 3.3295063145809414e-08, + "logits/chosen": -2.3102471828460693, + "logits/rejected": -2.2527964115142822, + "logps/chosen": -293.8779602050781, + "logps/rejected": -362.58563232421875, + "loss": 0.0113, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.485677719116211, + "rewards/margins": 9.276286125183105, + "rewards/rejected": -11.761963844299316, + "step": 2730 + }, + { + "epoch": 2.83, + "learning_rate": 3.138155376961347e-08, + "logits/chosen": -2.342339038848877, + "logits/rejected": -2.276078462600708, + "logps/chosen": -300.9256286621094, + "logps/rejected": -342.9737243652344, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9704710245132446, + "rewards/margins": 9.322904586791992, + "rewards/rejected": -11.293375015258789, + "step": 2740 + }, + { + "epoch": 2.84, + "learning_rate": 2.9468044393417525e-08, + "logits/chosen": -2.323369264602661, + "logits/rejected": -2.2281460762023926, + "logps/chosen": -298.3914794921875, + "logps/rejected": -334.6033020019531, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0311081409454346, + "rewards/margins": 9.329480171203613, + "rewards/rejected": -11.360588073730469, + "step": 2750 + }, + { + "epoch": 2.85, + "learning_rate": 2.755453501722158e-08, + "logits/chosen": -2.251509189605713, + "logits/rejected": -2.2324776649475098, + "logps/chosen": -290.03106689453125, + "logps/rejected": -339.38238525390625, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.435882091522217, + "rewards/margins": 9.06352710723877, + "rewards/rejected": -11.499407768249512, + "step": 2760 + }, + { + "epoch": 2.86, + "learning_rate": 2.564102564102564e-08, + "logits/chosen": -2.2469379901885986, + "logits/rejected": -2.1791176795959473, + "logps/chosen": -287.87298583984375, + "logps/rejected": -333.5580139160156, + "loss": 0.0113, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.3411865234375, + "rewards/margins": 9.410177230834961, + "rewards/rejected": -11.751363754272461, + "step": 2770 + }, + { + "epoch": 2.87, + "learning_rate": 2.3727516264829695e-08, + "logits/chosen": -2.291459560394287, + "logits/rejected": -2.226825475692749, + "logps/chosen": -268.06280517578125, + "logps/rejected": -356.3489990234375, + "loss": 0.0061, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.386735677719116, + "rewards/margins": 9.646692276000977, + "rewards/rejected": -12.033427238464355, + "step": 2780 + }, + { + "epoch": 2.88, + "learning_rate": 2.1814006888633754e-08, + "logits/chosen": -2.290907382965088, + "logits/rejected": -2.2021796703338623, + "logps/chosen": -306.6482849121094, + "logps/rejected": -370.08038330078125, + "loss": 0.005, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.622554302215576, + "rewards/margins": 9.027674674987793, + "rewards/rejected": -11.650228500366211, + "step": 2790 + }, + { + "epoch": 2.89, + "learning_rate": 1.990049751243781e-08, + "logits/chosen": -2.3071415424346924, + "logits/rejected": -2.198819637298584, + "logps/chosen": -302.01904296875, + "logps/rejected": -338.0803527832031, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.463911294937134, + "rewards/margins": 9.179804801940918, + "rewards/rejected": -11.643716812133789, + "step": 2800 + }, + { + "epoch": 2.89, + "eval_logits/chosen": -2.364900588989258, + "eval_logits/rejected": -2.2619574069976807, + "eval_logps/chosen": -321.83087158203125, + "eval_logps/rejected": -316.2637939453125, + "eval_loss": 0.7342348694801331, + "eval_rewards/accuracies": 0.7639999985694885, + "eval_rewards/chosen": -4.798369407653809, + "eval_rewards/margins": 3.8612406253814697, + "eval_rewards/rejected": -8.659610748291016, + "eval_runtime": 330.2098, + "eval_samples_per_second": 6.057, + "eval_steps_per_second": 0.379, + "step": 2800 + }, + { + "epoch": 2.9, + "learning_rate": 1.7986988136241865e-08, + "logits/chosen": -2.27158784866333, + "logits/rejected": -2.264709949493408, + "logps/chosen": -289.5348205566406, + "logps/rejected": -370.21295166015625, + "loss": 0.0065, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.4264960289001465, + "rewards/margins": 8.935715675354004, + "rewards/rejected": -11.362211227416992, + "step": 2810 + }, + { + "epoch": 2.91, + "learning_rate": 1.6073478760045924e-08, + "logits/chosen": -2.2701802253723145, + "logits/rejected": -2.182992458343506, + "logps/chosen": -302.4941101074219, + "logps/rejected": -356.8041687011719, + "loss": 0.0059, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.402383804321289, + "rewards/margins": 8.9962739944458, + "rewards/rejected": -11.398656845092773, + "step": 2820 + }, + { + "epoch": 2.92, + "learning_rate": 1.4159969383849981e-08, + "logits/chosen": -2.266104221343994, + "logits/rejected": -2.209751605987549, + "logps/chosen": -311.89373779296875, + "logps/rejected": -364.10321044921875, + "loss": 0.0079, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.8278541564941406, + "rewards/margins": 9.237452507019043, + "rewards/rejected": -11.06530475616455, + "step": 2830 + }, + { + "epoch": 2.93, + "learning_rate": 1.2246460007654037e-08, + "logits/chosen": -2.2711033821105957, + "logits/rejected": -2.213872194290161, + "logps/chosen": -296.39361572265625, + "logps/rejected": -334.15985107421875, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3090121746063232, + "rewards/margins": 9.236477851867676, + "rewards/rejected": -11.545488357543945, + "step": 2840 + }, + { + "epoch": 2.94, + "learning_rate": 1.0332950631458094e-08, + "logits/chosen": -2.3495352268218994, + "logits/rejected": -2.28285813331604, + "logps/chosen": -316.610595703125, + "logps/rejected": -394.0874938964844, + "loss": 0.0116, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.016587495803833, + "rewards/margins": 9.51962947845459, + "rewards/rejected": -11.536214828491211, + "step": 2850 + }, + { + "epoch": 2.95, + "learning_rate": 8.419441255262151e-09, + "logits/chosen": -2.369030237197876, + "logits/rejected": -2.2844557762145996, + "logps/chosen": -275.51312255859375, + "logps/rejected": -343.92144775390625, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.705946445465088, + "rewards/margins": 8.749194145202637, + "rewards/rejected": -11.455141067504883, + "step": 2860 + }, + { + "epoch": 2.96, + "learning_rate": 6.505931879066207e-09, + "logits/chosen": -2.3274118900299072, + "logits/rejected": -2.265143632888794, + "logps/chosen": -333.2660827636719, + "logps/rejected": -350.2222900390625, + "loss": 0.0108, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.5637712478637695, + "rewards/margins": 8.87217903137207, + "rewards/rejected": -11.435951232910156, + "step": 2870 + }, + { + "epoch": 2.97, + "learning_rate": 4.592422502870264e-09, + "logits/chosen": -2.3058080673217773, + "logits/rejected": -2.24003267288208, + "logps/chosen": -289.5403747558594, + "logps/rejected": -346.329345703125, + "loss": 0.0151, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.4935154914855957, + "rewards/margins": 8.715460777282715, + "rewards/rejected": -11.208975791931152, + "step": 2880 + }, + { + "epoch": 2.98, + "learning_rate": 2.6789131266743202e-09, + "logits/chosen": -2.305800437927246, + "logits/rejected": -2.183337926864624, + "logps/chosen": -303.07989501953125, + "logps/rejected": -338.078125, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6788885593414307, + "rewards/margins": 8.81675910949707, + "rewards/rejected": -11.495648384094238, + "step": 2890 + }, + { + "epoch": 2.99, + "learning_rate": 7.654037504783773e-10, + "logits/chosen": -2.2422897815704346, + "logits/rejected": -2.167163848876953, + "logps/chosen": -308.4017639160156, + "logps/rejected": -364.2879333496094, + "loss": 0.0094, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -2.076625347137451, + "rewards/margins": 9.432092666625977, + "rewards/rejected": -11.50871753692627, + "step": 2900 + }, + { + "epoch": 2.99, + "eval_logits/chosen": -2.3625237941741943, + "eval_logits/rejected": -2.259509325027466, + "eval_logps/chosen": -321.42047119140625, + "eval_logps/rejected": -315.8360595703125, + "eval_loss": 0.7374239563941956, + "eval_rewards/accuracies": 0.7580000162124634, + "eval_rewards/chosen": -4.75732946395874, + "eval_rewards/margins": 3.859508514404297, + "eval_rewards/rejected": -8.616838455200195, + "eval_runtime": 329.7799, + "eval_samples_per_second": 6.065, + "eval_steps_per_second": 0.379, + "step": 2900 + }, + { + "epoch": 3.0, + "step": 2904, + "total_flos": 0.0, + "train_loss": 0.20145473461829064, + "train_runtime": 66730.9037, + "train_samples_per_second": 2.786, + "train_steps_per_second": 0.044 + } + ], + "logging_steps": 10, + "max_steps": 2904, + "num_train_epochs": 3, + "save_steps": 500, + "total_flos": 0.0, + "trial_name": null, + "trial_params": null +}