diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,10280 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.998972954467648, + "eval_steps": 100, + "global_step": 6570, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 4.5662100456621e-10, + "logits/chosen": -1.5087523460388184, + "logits/rejected": -1.5035094022750854, + "logps/chosen": -43.74905014038086, + "logps/rejected": -77.60680389404297, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 4.5662100456621e-09, + "logits/chosen": -1.545806646347046, + "logits/rejected": -1.455157995223999, + "logps/chosen": -55.80635452270508, + "logps/rejected": -55.2034912109375, + "loss": 0.6932, + "rewards/accuracies": 0.4305555522441864, + "rewards/chosen": 0.014816111885011196, + "rewards/margins": -0.003535893280059099, + "rewards/rejected": 0.018352005630731583, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 9.1324200913242e-09, + "logits/chosen": -1.5560485124588013, + "logits/rejected": -1.4880516529083252, + "logps/chosen": -54.789825439453125, + "logps/rejected": -61.110069274902344, + "loss": 0.6964, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.001987707568332553, + "rewards/margins": 0.002111446810886264, + "rewards/rejected": -0.00012373924255371094, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 1.36986301369863e-08, + "logits/chosen": -1.5476592779159546, + "logits/rejected": -1.460582971572876, + "logps/chosen": -53.77582931518555, + "logps/rejected": -60.30269241333008, + "loss": 0.7011, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.00331380357965827, + "rewards/margins": -0.014126944355666637, + "rewards/rejected": 0.010813141241669655, + "step": 30 + }, + { + "epoch": 0.02, + "learning_rate": 1.82648401826484e-08, + "logits/chosen": -1.5262000560760498, + "logits/rejected": -1.4483455419540405, + "logps/chosen": -59.4591064453125, + "logps/rejected": -60.53277587890625, + "loss": 0.6914, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.01075730286538601, + "rewards/margins": 0.009579157456755638, + "rewards/rejected": -0.020336460322141647, + "step": 40 + }, + { + "epoch": 0.02, + "learning_rate": 2.28310502283105e-08, + "logits/chosen": -1.5269324779510498, + "logits/rejected": -1.4619429111480713, + "logps/chosen": -48.80940628051758, + "logps/rejected": -54.6522102355957, + "loss": 0.6844, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.007998323068022728, + "rewards/margins": 0.05441845580935478, + "rewards/rejected": -0.0464201346039772, + "step": 50 + }, + { + "epoch": 0.03, + "learning_rate": 2.73972602739726e-08, + "logits/chosen": -1.5598713159561157, + "logits/rejected": -1.4761707782745361, + "logps/chosen": -56.5130615234375, + "logps/rejected": -59.2298698425293, + "loss": 0.6669, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.0005574465030804276, + "rewards/margins": 0.07874791324138641, + "rewards/rejected": -0.07930536568164825, + "step": 60 + }, + { + "epoch": 0.03, + "learning_rate": 3.19634703196347e-08, + "logits/chosen": -1.5540889501571655, + "logits/rejected": -1.4865127801895142, + "logps/chosen": -55.98405838012695, + "logps/rejected": -60.471275329589844, + "loss": 0.6547, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.005887323524802923, + "rewards/margins": 0.08136871457099915, + "rewards/rejected": -0.0872560366988182, + "step": 70 + }, + { + "epoch": 0.04, + "learning_rate": 3.65296803652968e-08, + "logits/chosen": -1.5670634508132935, + "logits/rejected": -1.5008996725082397, + "logps/chosen": -52.0002555847168, + "logps/rejected": -56.97963333129883, + "loss": 0.6275, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.018660223111510277, + "rewards/margins": 0.12297725677490234, + "rewards/rejected": -0.14163747429847717, + "step": 80 + }, + { + "epoch": 0.04, + "learning_rate": 4.10958904109589e-08, + "logits/chosen": -1.5634491443634033, + "logits/rejected": -1.4947015047073364, + "logps/chosen": -50.369789123535156, + "logps/rejected": -58.014564514160156, + "loss": 0.587, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.005921304225921631, + "rewards/margins": 0.2610793709754944, + "rewards/rejected": -0.25515809655189514, + "step": 90 + }, + { + "epoch": 0.05, + "learning_rate": 4.5662100456621e-08, + "logits/chosen": -1.5410099029541016, + "logits/rejected": -1.460561990737915, + "logps/chosen": -54.32032012939453, + "logps/rejected": -58.245704650878906, + "loss": 0.5577, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.007982229813933372, + "rewards/margins": 0.3241470456123352, + "rewards/rejected": -0.3321292996406555, + "step": 100 + }, + { + "epoch": 0.05, + "eval_logits/chosen": -1.3715848922729492, + "eval_logits/rejected": -1.307630181312561, + "eval_logps/chosen": -76.15989685058594, + "eval_logps/rejected": -60.609840393066406, + "eval_loss": 0.5743366479873657, + "eval_rewards/accuracies": 0.9022346138954163, + "eval_rewards/chosen": -0.08896368741989136, + "eval_rewards/margins": 0.2638399302959442, + "eval_rewards/rejected": -0.35280361771583557, + "eval_runtime": 185.6917, + "eval_samples_per_second": 15.413, + "eval_steps_per_second": 0.964, + "step": 100 + }, + { + "epoch": 0.05, + "learning_rate": 5.02283105022831e-08, + "logits/chosen": -1.5511906147003174, + "logits/rejected": -1.4669945240020752, + "logps/chosen": -55.648841857910156, + "logps/rejected": -60.24755859375, + "loss": 0.5325, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.00021359921083785594, + "rewards/margins": 0.4047005772590637, + "rewards/rejected": -0.4044870436191559, + "step": 110 + }, + { + "epoch": 0.05, + "learning_rate": 5.47945205479452e-08, + "logits/chosen": -1.5656968355178833, + "logits/rejected": -1.4793637990951538, + "logps/chosen": -56.9489631652832, + "logps/rejected": -60.153228759765625, + "loss": 0.4736, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.05813983827829361, + "rewards/margins": 0.514919638633728, + "rewards/rejected": -0.5730594396591187, + "step": 120 + }, + { + "epoch": 0.06, + "learning_rate": 5.93607305936073e-08, + "logits/chosen": -1.582545280456543, + "logits/rejected": -1.4928141832351685, + "logps/chosen": -57.66066360473633, + "logps/rejected": -60.691490173339844, + "loss": 0.3961, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.06978967040777206, + "rewards/margins": 0.7058674693107605, + "rewards/rejected": -0.7756571769714355, + "step": 130 + }, + { + "epoch": 0.06, + "learning_rate": 6.39269406392694e-08, + "logits/chosen": -1.6073917150497437, + "logits/rejected": -1.529598593711853, + "logps/chosen": -55.62971878051758, + "logps/rejected": -63.5817985534668, + "loss": 0.3421, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.046302586793899536, + "rewards/margins": 0.9670504331588745, + "rewards/rejected": -1.0133531093597412, + "step": 140 + }, + { + "epoch": 0.07, + "learning_rate": 6.84931506849315e-08, + "logits/chosen": -1.5868146419525146, + "logits/rejected": -1.5070278644561768, + "logps/chosen": -55.41338348388672, + "logps/rejected": -60.58258819580078, + "loss": 0.311, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.06830114126205444, + "rewards/margins": 1.1008093357086182, + "rewards/rejected": -1.1691104173660278, + "step": 150 + }, + { + "epoch": 0.07, + "learning_rate": 7.30593607305936e-08, + "logits/chosen": -1.568574070930481, + "logits/rejected": -1.4987123012542725, + "logps/chosen": -54.731719970703125, + "logps/rejected": -63.13419723510742, + "loss": 0.2781, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.09268207848072052, + "rewards/margins": 1.240229845046997, + "rewards/rejected": -1.3329120874404907, + "step": 160 + }, + { + "epoch": 0.08, + "learning_rate": 7.76255707762557e-08, + "logits/chosen": -1.583616018295288, + "logits/rejected": -1.5008718967437744, + "logps/chosen": -58.21760940551758, + "logps/rejected": -61.46492385864258, + "loss": 0.2618, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1754956990480423, + "rewards/margins": 1.261420488357544, + "rewards/rejected": -1.4369161128997803, + "step": 170 + }, + { + "epoch": 0.08, + "learning_rate": 8.21917808219178e-08, + "logits/chosen": -1.6093565225601196, + "logits/rejected": -1.516689658164978, + "logps/chosen": -54.5604248046875, + "logps/rejected": -61.80358123779297, + "loss": 0.233, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13866546750068665, + "rewards/margins": 1.441631555557251, + "rewards/rejected": -1.5802969932556152, + "step": 180 + }, + { + "epoch": 0.09, + "learning_rate": 8.67579908675799e-08, + "logits/chosen": -1.571337342262268, + "logits/rejected": -1.4844709634780884, + "logps/chosen": -52.31999969482422, + "logps/rejected": -58.7751350402832, + "loss": 0.1923, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13308748602867126, + "rewards/margins": 1.6981117725372314, + "rewards/rejected": -1.831199288368225, + "step": 190 + }, + { + "epoch": 0.09, + "learning_rate": 9.1324200913242e-08, + "logits/chosen": -1.5900229215621948, + "logits/rejected": -1.516791820526123, + "logps/chosen": -49.58915710449219, + "logps/rejected": -60.3437385559082, + "loss": 0.1502, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15844210982322693, + "rewards/margins": 2.0099329948425293, + "rewards/rejected": -2.168375015258789, + "step": 200 + }, + { + "epoch": 0.09, + "eval_logits/chosen": -1.4090580940246582, + "eval_logits/rejected": -1.3397127389907837, + "eval_logps/chosen": -77.1548080444336, + "eval_logps/rejected": -64.89435577392578, + "eval_loss": 0.17612037062644958, + "eval_rewards/accuracies": 0.9804469347000122, + "eval_rewards/chosen": -0.5864141583442688, + "eval_rewards/margins": 1.9086493253707886, + "eval_rewards/rejected": -2.495063543319702, + "eval_runtime": 171.9625, + "eval_samples_per_second": 16.643, + "eval_steps_per_second": 1.041, + "step": 200 + }, + { + "epoch": 0.1, + "learning_rate": 9.58904109589041e-08, + "logits/chosen": -1.5708708763122559, + "logits/rejected": -1.4990869760513306, + "logps/chosen": -54.06782150268555, + "logps/rejected": -65.75396728515625, + "loss": 0.1042, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.27950865030288696, + "rewards/margins": 2.414898157119751, + "rewards/rejected": -2.694406747817993, + "step": 210 + }, + { + "epoch": 0.1, + "learning_rate": 1.004566210045662e-07, + "logits/chosen": -1.6292283535003662, + "logits/rejected": -1.5556135177612305, + "logps/chosen": -53.032798767089844, + "logps/rejected": -62.6429328918457, + "loss": 0.0878, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.28332996368408203, + "rewards/margins": 2.6439812183380127, + "rewards/rejected": -2.927311420440674, + "step": 220 + }, + { + "epoch": 0.1, + "learning_rate": 1.050228310502283e-07, + "logits/chosen": -1.6413395404815674, + "logits/rejected": -1.5558950901031494, + "logps/chosen": -60.82377243041992, + "logps/rejected": -66.6085205078125, + "loss": 0.0685, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32246023416519165, + "rewards/margins": 3.061707019805908, + "rewards/rejected": -3.384167194366455, + "step": 230 + }, + { + "epoch": 0.11, + "learning_rate": 1.095890410958904e-07, + "logits/chosen": -1.6124732494354248, + "logits/rejected": -1.5318291187286377, + "logps/chosen": -55.524559020996094, + "logps/rejected": -63.96097946166992, + "loss": 0.0697, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33981430530548096, + "rewards/margins": 3.162278175354004, + "rewards/rejected": -3.5020923614501953, + "step": 240 + }, + { + "epoch": 0.11, + "learning_rate": 1.141552511415525e-07, + "logits/chosen": -1.6445833444595337, + "logits/rejected": -1.5500088930130005, + "logps/chosen": -54.410560607910156, + "logps/rejected": -62.597938537597656, + "loss": 0.0588, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3053959012031555, + "rewards/margins": 3.180203914642334, + "rewards/rejected": -3.485599994659424, + "step": 250 + }, + { + "epoch": 0.12, + "learning_rate": 1.187214611872146e-07, + "logits/chosen": -1.634615182876587, + "logits/rejected": -1.548903226852417, + "logps/chosen": -58.29247283935547, + "logps/rejected": -65.41529846191406, + "loss": 0.0517, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42053937911987305, + "rewards/margins": 3.6800742149353027, + "rewards/rejected": -4.100613594055176, + "step": 260 + }, + { + "epoch": 0.12, + "learning_rate": 1.232876712328767e-07, + "logits/chosen": -1.6498243808746338, + "logits/rejected": -1.5349724292755127, + "logps/chosen": -57.144081115722656, + "logps/rejected": -66.8178939819336, + "loss": 0.0509, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36212849617004395, + "rewards/margins": 3.6991469860076904, + "rewards/rejected": -4.061275482177734, + "step": 270 + }, + { + "epoch": 0.13, + "learning_rate": 1.278538812785388e-07, + "logits/chosen": -1.639831304550171, + "logits/rejected": -1.569330096244812, + "logps/chosen": -54.149169921875, + "logps/rejected": -68.57421112060547, + "loss": 0.0364, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.41847267746925354, + "rewards/margins": 4.1883344650268555, + "rewards/rejected": -4.606807708740234, + "step": 280 + }, + { + "epoch": 0.13, + "learning_rate": 1.324200913242009e-07, + "logits/chosen": -1.6218324899673462, + "logits/rejected": -1.5499851703643799, + "logps/chosen": -56.719825744628906, + "logps/rejected": -68.20590209960938, + "loss": 0.0339, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4609605371952057, + "rewards/margins": 4.013735771179199, + "rewards/rejected": -4.474696636199951, + "step": 290 + }, + { + "epoch": 0.14, + "learning_rate": 1.36986301369863e-07, + "logits/chosen": -1.6696264743804932, + "logits/rejected": -1.5758763551712036, + "logps/chosen": -54.80878829956055, + "logps/rejected": -65.65101623535156, + "loss": 0.0367, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.43391790986061096, + "rewards/margins": 3.7488112449645996, + "rewards/rejected": -4.182730197906494, + "step": 300 + }, + { + "epoch": 0.14, + "eval_logits/chosen": -1.44279146194458, + "eval_logits/rejected": -1.368519902229309, + "eval_logps/chosen": -78.34496307373047, + "eval_logps/rejected": -69.59746551513672, + "eval_loss": 0.06399906426668167, + "eval_rewards/accuracies": 0.9860334992408752, + "eval_rewards/chosen": -1.1814894676208496, + "eval_rewards/margins": 3.665130138397217, + "eval_rewards/rejected": -4.846619606018066, + "eval_runtime": 135.8833, + "eval_samples_per_second": 21.062, + "eval_steps_per_second": 1.317, + "step": 300 + }, + { + "epoch": 0.14, + "learning_rate": 1.415525114155251e-07, + "logits/chosen": -1.6536544561386108, + "logits/rejected": -1.5657002925872803, + "logps/chosen": -52.22998046875, + "logps/rejected": -65.63001251220703, + "loss": 0.0392, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.43238696455955505, + "rewards/margins": 4.57238245010376, + "rewards/rejected": -5.004769325256348, + "step": 310 + }, + { + "epoch": 0.15, + "learning_rate": 1.461187214611872e-07, + "logits/chosen": -1.6720914840698242, + "logits/rejected": -1.5804774761199951, + "logps/chosen": -55.632659912109375, + "logps/rejected": -70.00082397460938, + "loss": 0.0281, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4426132142543793, + "rewards/margins": 4.479395866394043, + "rewards/rejected": -4.922008991241455, + "step": 320 + }, + { + "epoch": 0.15, + "learning_rate": 1.506849315068493e-07, + "logits/chosen": -1.6406399011611938, + "logits/rejected": -1.5606629848480225, + "logps/chosen": -54.38983154296875, + "logps/rejected": -70.45909881591797, + "loss": 0.031, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.520280659198761, + "rewards/margins": 4.549806594848633, + "rewards/rejected": -5.070087432861328, + "step": 330 + }, + { + "epoch": 0.16, + "learning_rate": 1.552511415525114e-07, + "logits/chosen": -1.6501238346099854, + "logits/rejected": -1.5617876052856445, + "logps/chosen": -53.52082443237305, + "logps/rejected": -66.49567413330078, + "loss": 0.0284, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5217021703720093, + "rewards/margins": 4.813857555389404, + "rewards/rejected": -5.335558891296387, + "step": 340 + }, + { + "epoch": 0.16, + "learning_rate": 1.598173515981735e-07, + "logits/chosen": -1.6586627960205078, + "logits/rejected": -1.579021692276001, + "logps/chosen": -53.56574249267578, + "logps/rejected": -70.02999114990234, + "loss": 0.0247, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.5866273641586304, + "rewards/margins": 4.987410068511963, + "rewards/rejected": -5.574038505554199, + "step": 350 + }, + { + "epoch": 0.16, + "learning_rate": 1.643835616438356e-07, + "logits/chosen": -1.6780459880828857, + "logits/rejected": -1.596045732498169, + "logps/chosen": -54.865150451660156, + "logps/rejected": -70.90138244628906, + "loss": 0.0238, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6191946864128113, + "rewards/margins": 4.894677639007568, + "rewards/rejected": -5.513872146606445, + "step": 360 + }, + { + "epoch": 0.17, + "learning_rate": 1.689497716894977e-07, + "logits/chosen": -1.6677402257919312, + "logits/rejected": -1.57552170753479, + "logps/chosen": -55.3704833984375, + "logps/rejected": -69.22010803222656, + "loss": 0.0296, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.7166622281074524, + "rewards/margins": 4.735496520996094, + "rewards/rejected": -5.452158451080322, + "step": 370 + }, + { + "epoch": 0.17, + "learning_rate": 1.735159817351598e-07, + "logits/chosen": -1.6523818969726562, + "logits/rejected": -1.5386160612106323, + "logps/chosen": -55.9133186340332, + "logps/rejected": -67.37703704833984, + "loss": 0.0178, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5371387004852295, + "rewards/margins": 5.164801597595215, + "rewards/rejected": -5.701941013336182, + "step": 380 + }, + { + "epoch": 0.18, + "learning_rate": 1.780821917808219e-07, + "logits/chosen": -1.6681104898452759, + "logits/rejected": -1.5768417119979858, + "logps/chosen": -54.644615173339844, + "logps/rejected": -68.48939514160156, + "loss": 0.0166, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.5451260805130005, + "rewards/margins": 5.735184669494629, + "rewards/rejected": -6.280310153961182, + "step": 390 + }, + { + "epoch": 0.18, + "learning_rate": 1.82648401826484e-07, + "logits/chosen": -1.6817266941070557, + "logits/rejected": -1.6053158044815063, + "logps/chosen": -50.52559280395508, + "logps/rejected": -69.53555297851562, + "loss": 0.0195, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5777836441993713, + "rewards/margins": 5.324347019195557, + "rewards/rejected": -5.902129650115967, + "step": 400 + }, + { + "epoch": 0.18, + "eval_logits/chosen": -1.4647938013076782, + "eval_logits/rejected": -1.387487769126892, + "eval_logps/chosen": -79.24311828613281, + "eval_logps/rejected": -72.7348403930664, + "eval_loss": 0.0418909415602684, + "eval_rewards/accuracies": 0.9832402467727661, + "eval_rewards/chosen": -1.6305665969848633, + "eval_rewards/margins": 4.784738063812256, + "eval_rewards/rejected": -6.415303707122803, + "eval_runtime": 177.0186, + "eval_samples_per_second": 16.168, + "eval_steps_per_second": 1.011, + "step": 400 + }, + { + "epoch": 0.19, + "learning_rate": 1.872146118721461e-07, + "logits/chosen": -1.7017066478729248, + "logits/rejected": -1.5788252353668213, + "logps/chosen": -61.74540328979492, + "logps/rejected": -69.6572036743164, + "loss": 0.0174, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.7025364637374878, + "rewards/margins": 5.482548713684082, + "rewards/rejected": -6.185084342956543, + "step": 410 + }, + { + "epoch": 0.19, + "learning_rate": 1.917808219178082e-07, + "logits/chosen": -1.6518733501434326, + "logits/rejected": -1.574467420578003, + "logps/chosen": -53.831336975097656, + "logps/rejected": -72.37210083007812, + "loss": 0.0178, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.8208284378051758, + "rewards/margins": 5.8398590087890625, + "rewards/rejected": -6.660687446594238, + "step": 420 + }, + { + "epoch": 0.2, + "learning_rate": 1.963470319634703e-07, + "logits/chosen": -1.6632076501846313, + "logits/rejected": -1.5873596668243408, + "logps/chosen": -53.70698928833008, + "logps/rejected": -68.99622344970703, + "loss": 0.0207, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.964774489402771, + "rewards/margins": 5.707071304321289, + "rewards/rejected": -6.67184591293335, + "step": 430 + }, + { + "epoch": 0.2, + "learning_rate": 2.009132420091324e-07, + "logits/chosen": -1.6617752313613892, + "logits/rejected": -1.5743049383163452, + "logps/chosen": -53.155479431152344, + "logps/rejected": -70.70321655273438, + "loss": 0.0138, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.732166051864624, + "rewards/margins": 5.958271026611328, + "rewards/rejected": -6.690437316894531, + "step": 440 + }, + { + "epoch": 0.21, + "learning_rate": 2.054794520547945e-07, + "logits/chosen": -1.7175381183624268, + "logits/rejected": -1.6154544353485107, + "logps/chosen": -58.01020431518555, + "logps/rejected": -69.6778564453125, + "loss": 0.0144, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8646949529647827, + "rewards/margins": 6.164088249206543, + "rewards/rejected": -7.028783321380615, + "step": 450 + }, + { + "epoch": 0.21, + "learning_rate": 2.100456621004566e-07, + "logits/chosen": -1.6831591129302979, + "logits/rejected": -1.5844804048538208, + "logps/chosen": -60.1660270690918, + "logps/rejected": -73.73701477050781, + "loss": 0.0165, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.0043270587921143, + "rewards/margins": 6.397038459777832, + "rewards/rejected": -7.401365756988525, + "step": 460 + }, + { + "epoch": 0.21, + "learning_rate": 2.146118721461187e-07, + "logits/chosen": -1.6989357471466064, + "logits/rejected": -1.5939372777938843, + "logps/chosen": -59.41900634765625, + "logps/rejected": -74.23500061035156, + "loss": 0.0147, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7950853109359741, + "rewards/margins": 6.795047760009766, + "rewards/rejected": -7.5901336669921875, + "step": 470 + }, + { + "epoch": 0.22, + "learning_rate": 2.191780821917808e-07, + "logits/chosen": -1.6680141687393188, + "logits/rejected": -1.578650951385498, + "logps/chosen": -56.234466552734375, + "logps/rejected": -71.89026641845703, + "loss": 0.0156, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.2601807117462158, + "rewards/margins": 6.4187421798706055, + "rewards/rejected": -7.6789231300354, + "step": 480 + }, + { + "epoch": 0.22, + "learning_rate": 2.237442922374429e-07, + "logits/chosen": -1.6936092376708984, + "logits/rejected": -1.59926438331604, + "logps/chosen": -56.20717239379883, + "logps/rejected": -73.23368072509766, + "loss": 0.0127, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8995521664619446, + "rewards/margins": 6.879524230957031, + "rewards/rejected": -7.77907657623291, + "step": 490 + }, + { + "epoch": 0.23, + "learning_rate": 2.28310502283105e-07, + "logits/chosen": -1.698009729385376, + "logits/rejected": -1.5847851037979126, + "logps/chosen": -57.49848175048828, + "logps/rejected": -72.68367004394531, + "loss": 0.0128, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.065097451210022, + "rewards/margins": 6.5358781814575195, + "rewards/rejected": -7.600974082946777, + "step": 500 + }, + { + "epoch": 0.23, + "eval_logits/chosen": -1.4846999645233154, + "eval_logits/rejected": -1.4044886827468872, + "eval_logps/chosen": -80.25221252441406, + "eval_logps/rejected": -75.9832534790039, + "eval_loss": 0.03207468241453171, + "eval_rewards/accuracies": 0.9860334992408752, + "eval_rewards/chosen": -2.1351206302642822, + "eval_rewards/margins": 5.904390811920166, + "eval_rewards/rejected": -8.039511680603027, + "eval_runtime": 165.0524, + "eval_samples_per_second": 17.34, + "eval_steps_per_second": 1.085, + "step": 500 + }, + { + "epoch": 0.23, + "learning_rate": 2.328767123287671e-07, + "logits/chosen": -1.6877530813217163, + "logits/rejected": -1.5861059427261353, + "logps/chosen": -54.69840621948242, + "logps/rejected": -72.17831420898438, + "loss": 0.0128, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0449682474136353, + "rewards/margins": 6.904466152191162, + "rewards/rejected": -7.949434757232666, + "step": 510 + }, + { + "epoch": 0.24, + "learning_rate": 2.374429223744292e-07, + "logits/chosen": -1.7049137353897095, + "logits/rejected": -1.5926499366760254, + "logps/chosen": -59.61528778076172, + "logps/rejected": -75.29037475585938, + "loss": 0.009, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.9810761213302612, + "rewards/margins": 7.4216814041137695, + "rewards/rejected": -8.40275764465332, + "step": 520 + }, + { + "epoch": 0.24, + "learning_rate": 2.420091324200913e-07, + "logits/chosen": -1.6999527215957642, + "logits/rejected": -1.6045364141464233, + "logps/chosen": -57.0565185546875, + "logps/rejected": -77.7638168334961, + "loss": 0.0128, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.02046537399292, + "rewards/margins": 7.695591926574707, + "rewards/rejected": -8.716057777404785, + "step": 530 + }, + { + "epoch": 0.25, + "learning_rate": 2.465753424657534e-07, + "logits/chosen": -1.6926349401474, + "logits/rejected": -1.5947034358978271, + "logps/chosen": -59.54913330078125, + "logps/rejected": -80.18620300292969, + "loss": 0.0087, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.0159438848495483, + "rewards/margins": 7.910098075866699, + "rewards/rejected": -8.926042556762695, + "step": 540 + }, + { + "epoch": 0.25, + "learning_rate": 2.511415525114155e-07, + "logits/chosen": -1.684308648109436, + "logits/rejected": -1.5966044664382935, + "logps/chosen": -60.59437942504883, + "logps/rejected": -73.6009292602539, + "loss": 0.011, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3125141859054565, + "rewards/margins": 7.294568061828613, + "rewards/rejected": -8.607081413269043, + "step": 550 + }, + { + "epoch": 0.26, + "learning_rate": 2.557077625570776e-07, + "logits/chosen": -1.7317125797271729, + "logits/rejected": -1.647165298461914, + "logps/chosen": -56.182151794433594, + "logps/rejected": -75.43659973144531, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1344718933105469, + "rewards/margins": 7.353419303894043, + "rewards/rejected": -8.487890243530273, + "step": 560 + }, + { + "epoch": 0.26, + "learning_rate": 2.602739726027397e-07, + "logits/chosen": -1.7006349563598633, + "logits/rejected": -1.6112979650497437, + "logps/chosen": -55.806793212890625, + "logps/rejected": -76.87861633300781, + "loss": 0.0138, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0782018899917603, + "rewards/margins": 7.8718671798706055, + "rewards/rejected": -8.950068473815918, + "step": 570 + }, + { + "epoch": 0.26, + "learning_rate": 2.648401826484018e-07, + "logits/chosen": -1.7162210941314697, + "logits/rejected": -1.6245304346084595, + "logps/chosen": -55.56931686401367, + "logps/rejected": -73.07711791992188, + "loss": 0.0085, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2571735382080078, + "rewards/margins": 7.612096309661865, + "rewards/rejected": -8.869270324707031, + "step": 580 + }, + { + "epoch": 0.27, + "learning_rate": 2.694063926940639e-07, + "logits/chosen": -1.6970678567886353, + "logits/rejected": -1.6222326755523682, + "logps/chosen": -56.684051513671875, + "logps/rejected": -79.91156005859375, + "loss": 0.0111, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2310670614242554, + "rewards/margins": 8.0743989944458, + "rewards/rejected": -9.305466651916504, + "step": 590 + }, + { + "epoch": 0.27, + "learning_rate": 2.73972602739726e-07, + "logits/chosen": -1.7427692413330078, + "logits/rejected": -1.6350624561309814, + "logps/chosen": -55.236839294433594, + "logps/rejected": -76.8140869140625, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1100609302520752, + "rewards/margins": 7.973969459533691, + "rewards/rejected": -9.084030151367188, + "step": 600 + }, + { + "epoch": 0.27, + "eval_logits/chosen": -1.4985913038253784, + "eval_logits/rejected": -1.4162646532058716, + "eval_logps/chosen": -81.62905883789062, + "eval_logps/rejected": -79.30266571044922, + "eval_loss": 0.029448172077536583, + "eval_rewards/accuracies": 0.9860334992408752, + "eval_rewards/chosen": -2.8235411643981934, + "eval_rewards/margins": 6.87567663192749, + "eval_rewards/rejected": -9.699217796325684, + "eval_runtime": 162.0935, + "eval_samples_per_second": 17.656, + "eval_steps_per_second": 1.104, + "step": 600 + }, + { + "epoch": 0.28, + "learning_rate": 2.785388127853881e-07, + "logits/chosen": -1.7095897197723389, + "logits/rejected": -1.6108310222625732, + "logps/chosen": -57.466880798339844, + "logps/rejected": -76.33137512207031, + "loss": 0.0094, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2477577924728394, + "rewards/margins": 8.338712692260742, + "rewards/rejected": -9.586469650268555, + "step": 610 + }, + { + "epoch": 0.28, + "learning_rate": 2.831050228310502e-07, + "logits/chosen": -1.694031000137329, + "logits/rejected": -1.6111032962799072, + "logps/chosen": -54.303466796875, + "logps/rejected": -79.16754150390625, + "loss": 0.0112, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.403235673904419, + "rewards/margins": 8.552541732788086, + "rewards/rejected": -9.955777168273926, + "step": 620 + }, + { + "epoch": 0.29, + "learning_rate": 2.876712328767123e-07, + "logits/chosen": -1.7127506732940674, + "logits/rejected": -1.619024634361267, + "logps/chosen": -56.50312423706055, + "logps/rejected": -79.96643829345703, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.282273530960083, + "rewards/margins": 8.758012771606445, + "rewards/rejected": -10.040285110473633, + "step": 630 + }, + { + "epoch": 0.29, + "learning_rate": 2.922374429223744e-07, + "logits/chosen": -1.7468054294586182, + "logits/rejected": -1.6339282989501953, + "logps/chosen": -58.642433166503906, + "logps/rejected": -79.05645751953125, + "loss": 0.0137, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.4662420749664307, + "rewards/margins": 8.245123863220215, + "rewards/rejected": -9.711365699768066, + "step": 640 + }, + { + "epoch": 0.3, + "learning_rate": 2.968036529680365e-07, + "logits/chosen": -1.731302261352539, + "logits/rejected": -1.627869963645935, + "logps/chosen": -58.152130126953125, + "logps/rejected": -76.38700866699219, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5161360502243042, + "rewards/margins": 8.61723518371582, + "rewards/rejected": -10.133370399475098, + "step": 650 + }, + { + "epoch": 0.3, + "learning_rate": 2.998477929984779e-07, + "logits/chosen": -1.7117973566055298, + "logits/rejected": -1.6044315099716187, + "logps/chosen": -58.26856231689453, + "logps/rejected": -81.69371032714844, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4945389032363892, + "rewards/margins": 9.201311111450195, + "rewards/rejected": -10.69584846496582, + "step": 660 + }, + { + "epoch": 0.31, + "learning_rate": 2.993404363267377e-07, + "logits/chosen": -1.7394225597381592, + "logits/rejected": -1.6316182613372803, + "logps/chosen": -57.263755798339844, + "logps/rejected": -75.65992736816406, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1733144521713257, + "rewards/margins": 8.887094497680664, + "rewards/rejected": -10.060407638549805, + "step": 670 + }, + { + "epoch": 0.31, + "learning_rate": 2.9883307965499743e-07, + "logits/chosen": -1.6869474649429321, + "logits/rejected": -1.5928945541381836, + "logps/chosen": -57.73834228515625, + "logps/rejected": -81.07805633544922, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3280141353607178, + "rewards/margins": 9.038291931152344, + "rewards/rejected": -10.36630630493164, + "step": 680 + }, + { + "epoch": 0.31, + "learning_rate": 2.983257229832572e-07, + "logits/chosen": -1.7139816284179688, + "logits/rejected": -1.6298580169677734, + "logps/chosen": -55.35132598876953, + "logps/rejected": -78.6609878540039, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2809817790985107, + "rewards/margins": 8.990970611572266, + "rewards/rejected": -10.271952629089355, + "step": 690 + }, + { + "epoch": 0.32, + "learning_rate": 2.9781836631151696e-07, + "logits/chosen": -1.7303400039672852, + "logits/rejected": -1.6253429651260376, + "logps/chosen": -59.9201545715332, + "logps/rejected": -81.42010498046875, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3820278644561768, + "rewards/margins": 9.210229873657227, + "rewards/rejected": -10.592256546020508, + "step": 700 + }, + { + "epoch": 0.32, + "eval_logits/chosen": -1.5078723430633545, + "eval_logits/rejected": -1.4251151084899902, + "eval_logps/chosen": -81.525634765625, + "eval_logps/rejected": -81.45867919921875, + "eval_loss": 0.017705164849758148, + "eval_rewards/accuracies": 0.9832402467727661, + "eval_rewards/chosen": -2.7718329429626465, + "eval_rewards/margins": 8.005391120910645, + "eval_rewards/rejected": -10.777223587036133, + "eval_runtime": 165.2049, + "eval_samples_per_second": 17.324, + "eval_steps_per_second": 1.084, + "step": 700 + }, + { + "epoch": 0.32, + "learning_rate": 2.9731100963977676e-07, + "logits/chosen": -1.715698003768921, + "logits/rejected": -1.6322933435440063, + "logps/chosen": -57.885459899902344, + "logps/rejected": -84.03385925292969, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5874685049057007, + "rewards/margins": 9.31218433380127, + "rewards/rejected": -10.899652481079102, + "step": 710 + }, + { + "epoch": 0.33, + "learning_rate": 2.968036529680365e-07, + "logits/chosen": -1.7445329427719116, + "logits/rejected": -1.6722770929336548, + "logps/chosen": -54.62650680541992, + "logps/rejected": -83.03822326660156, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.637279748916626, + "rewards/margins": 9.309844017028809, + "rewards/rejected": -10.947123527526855, + "step": 720 + }, + { + "epoch": 0.33, + "learning_rate": 2.962962962962963e-07, + "logits/chosen": -1.7035659551620483, + "logits/rejected": -1.6001815795898438, + "logps/chosen": -55.35626983642578, + "logps/rejected": -77.52360534667969, + "loss": 0.0089, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8972182273864746, + "rewards/margins": 9.760783195495605, + "rewards/rejected": -10.658002853393555, + "step": 730 + }, + { + "epoch": 0.34, + "learning_rate": 2.9578893962455603e-07, + "logits/chosen": -1.7120774984359741, + "logits/rejected": -1.6191877126693726, + "logps/chosen": -56.72313690185547, + "logps/rejected": -78.64081573486328, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2180434465408325, + "rewards/margins": 9.50749397277832, + "rewards/rejected": -10.72553825378418, + "step": 740 + }, + { + "epoch": 0.34, + "learning_rate": 2.952815829528158e-07, + "logits/chosen": -1.7385778427124023, + "logits/rejected": -1.6423228979110718, + "logps/chosen": -56.050682067871094, + "logps/rejected": -79.77486419677734, + "loss": 0.0091, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8531892895698547, + "rewards/margins": 9.420397758483887, + "rewards/rejected": -10.273588180541992, + "step": 750 + }, + { + "epoch": 0.35, + "learning_rate": 2.9477422628107556e-07, + "logits/chosen": -1.7303050756454468, + "logits/rejected": -1.6287094354629517, + "logps/chosen": -61.40581512451172, + "logps/rejected": -78.7645263671875, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1273159980773926, + "rewards/margins": 9.553644180297852, + "rewards/rejected": -10.680959701538086, + "step": 760 + }, + { + "epoch": 0.35, + "learning_rate": 2.9426686960933536e-07, + "logits/chosen": -1.7399718761444092, + "logits/rejected": -1.6434978246688843, + "logps/chosen": -53.90864181518555, + "logps/rejected": -78.7162094116211, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8057798147201538, + "rewards/margins": 10.01326847076416, + "rewards/rejected": -10.819047927856445, + "step": 770 + }, + { + "epoch": 0.36, + "learning_rate": 2.937595129375951e-07, + "logits/chosen": -1.727130651473999, + "logits/rejected": -1.6336214542388916, + "logps/chosen": -59.56471633911133, + "logps/rejected": -78.61094665527344, + "loss": 0.0065, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.0801204442977905, + "rewards/margins": 9.437311172485352, + "rewards/rejected": -10.51743221282959, + "step": 780 + }, + { + "epoch": 0.36, + "learning_rate": 2.932521562658549e-07, + "logits/chosen": -1.7236578464508057, + "logits/rejected": -1.637286901473999, + "logps/chosen": -58.87115478515625, + "logps/rejected": -81.21770477294922, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2567155361175537, + "rewards/margins": 10.11281967163086, + "rewards/rejected": -11.369535446166992, + "step": 790 + }, + { + "epoch": 0.37, + "learning_rate": 2.9274479959411463e-07, + "logits/chosen": -1.7196662425994873, + "logits/rejected": -1.6370540857315063, + "logps/chosen": -58.21757888793945, + "logps/rejected": -84.54280090332031, + "loss": 0.0051, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.156782865524292, + "rewards/margins": 10.224605560302734, + "rewards/rejected": -11.381387710571289, + "step": 800 + }, + { + "epoch": 0.37, + "eval_logits/chosen": -1.518147587776184, + "eval_logits/rejected": -1.4352678060531616, + "eval_logps/chosen": -80.94294738769531, + "eval_logps/rejected": -82.54000091552734, + "eval_loss": 0.014408529736101627, + "eval_rewards/accuracies": 0.9832402467727661, + "eval_rewards/chosen": -2.4804840087890625, + "eval_rewards/margins": 8.837401390075684, + "eval_rewards/rejected": -11.317886352539062, + "eval_runtime": 169.3625, + "eval_samples_per_second": 16.899, + "eval_steps_per_second": 1.057, + "step": 800 + }, + { + "epoch": 0.37, + "learning_rate": 2.922374429223744e-07, + "logits/chosen": -1.708937644958496, + "logits/rejected": -1.6160999536514282, + "logps/chosen": -52.5054931640625, + "logps/rejected": -78.92777252197266, + "loss": 0.0136, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7234916090965271, + "rewards/margins": 10.119041442871094, + "rewards/rejected": -10.842533111572266, + "step": 810 + }, + { + "epoch": 0.37, + "learning_rate": 2.9173008625063416e-07, + "logits/chosen": -1.722770094871521, + "logits/rejected": -1.6324989795684814, + "logps/chosen": -53.77677536010742, + "logps/rejected": -81.64833068847656, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8361567258834839, + "rewards/margins": 10.269185066223145, + "rewards/rejected": -11.105340957641602, + "step": 820 + }, + { + "epoch": 0.38, + "learning_rate": 2.9122272957889396e-07, + "logits/chosen": -1.7464574575424194, + "logits/rejected": -1.641235589981079, + "logps/chosen": -54.421730041503906, + "logps/rejected": -77.16957092285156, + "loss": 0.0067, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.5370277166366577, + "rewards/margins": 10.162129402160645, + "rewards/rejected": -10.69915771484375, + "step": 830 + }, + { + "epoch": 0.38, + "learning_rate": 2.907153729071537e-07, + "logits/chosen": -1.7367918491363525, + "logits/rejected": -1.635994553565979, + "logps/chosen": -54.7552375793457, + "logps/rejected": -81.75818634033203, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.014847755432129, + "rewards/margins": 10.362683296203613, + "rewards/rejected": -11.377530097961426, + "step": 840 + }, + { + "epoch": 0.39, + "learning_rate": 2.902080162354135e-07, + "logits/chosen": -1.7175086736679077, + "logits/rejected": -1.6312010288238525, + "logps/chosen": -55.76726150512695, + "logps/rejected": -80.70665740966797, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0785677433013916, + "rewards/margins": 10.594701766967773, + "rewards/rejected": -11.673269271850586, + "step": 850 + }, + { + "epoch": 0.39, + "learning_rate": 2.8970065956367323e-07, + "logits/chosen": -1.7384506464004517, + "logits/rejected": -1.6417887210845947, + "logps/chosen": -59.0673713684082, + "logps/rejected": -82.13581848144531, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1794620752334595, + "rewards/margins": 10.776390075683594, + "rewards/rejected": -11.955851554870605, + "step": 860 + }, + { + "epoch": 0.4, + "learning_rate": 2.89193302891933e-07, + "logits/chosen": -1.7018225193023682, + "logits/rejected": -1.6194498538970947, + "logps/chosen": -57.1175651550293, + "logps/rejected": -85.069091796875, + "loss": 0.0085, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2594497203826904, + "rewards/margins": 11.165175437927246, + "rewards/rejected": -12.424626350402832, + "step": 870 + }, + { + "epoch": 0.4, + "learning_rate": 2.8868594622019276e-07, + "logits/chosen": -1.7552284002304077, + "logits/rejected": -1.6478898525238037, + "logps/chosen": -60.715362548828125, + "logps/rejected": -81.6158676147461, + "loss": 0.0072, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.4402903318405151, + "rewards/margins": 10.465791702270508, + "rewards/rejected": -11.906082153320312, + "step": 880 + }, + { + "epoch": 0.41, + "learning_rate": 2.8817858954845256e-07, + "logits/chosen": -1.710700273513794, + "logits/rejected": -1.6423050165176392, + "logps/chosen": -53.98700714111328, + "logps/rejected": -82.07879638671875, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2296615839004517, + "rewards/margins": 10.90865707397461, + "rewards/rejected": -12.138318061828613, + "step": 890 + }, + { + "epoch": 0.41, + "learning_rate": 2.876712328767123e-07, + "logits/chosen": -1.7296634912490845, + "logits/rejected": -1.6472947597503662, + "logps/chosen": -55.60344314575195, + "logps/rejected": -81.44587707519531, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.296541690826416, + "rewards/margins": 11.358396530151367, + "rewards/rejected": -12.654939651489258, + "step": 900 + }, + { + "epoch": 0.41, + "eval_logits/chosen": -1.526100516319275, + "eval_logits/rejected": -1.4421030282974243, + "eval_logps/chosen": -81.6524658203125, + "eval_logps/rejected": -84.46768951416016, + "eval_loss": 0.016024084761738777, + "eval_rewards/accuracies": 0.9860334992408752, + "eval_rewards/chosen": -2.835247278213501, + "eval_rewards/margins": 9.446483612060547, + "eval_rewards/rejected": -12.281729698181152, + "eval_runtime": 149.6015, + "eval_samples_per_second": 19.131, + "eval_steps_per_second": 1.197, + "step": 900 + }, + { + "epoch": 0.42, + "learning_rate": 2.871638762049721e-07, + "logits/chosen": -1.7292686700820923, + "logits/rejected": -1.6218318939208984, + "logps/chosen": -55.28586959838867, + "logps/rejected": -83.52758026123047, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8380565643310547, + "rewards/margins": 11.400309562683105, + "rewards/rejected": -12.238367080688477, + "step": 910 + }, + { + "epoch": 0.42, + "learning_rate": 2.8665651953323183e-07, + "logits/chosen": -1.7416365146636963, + "logits/rejected": -1.6355617046356201, + "logps/chosen": -57.684844970703125, + "logps/rejected": -81.35444641113281, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8555625677108765, + "rewards/margins": 11.20235538482666, + "rewards/rejected": -12.057916641235352, + "step": 920 + }, + { + "epoch": 0.42, + "learning_rate": 2.861491628614916e-07, + "logits/chosen": -1.7450590133666992, + "logits/rejected": -1.6474605798721313, + "logps/chosen": -58.1208381652832, + "logps/rejected": -82.04862976074219, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2038339376449585, + "rewards/margins": 10.757715225219727, + "rewards/rejected": -11.961549758911133, + "step": 930 + }, + { + "epoch": 0.43, + "learning_rate": 2.8564180618975136e-07, + "logits/chosen": -1.756439208984375, + "logits/rejected": -1.6617294549942017, + "logps/chosen": -59.64165496826172, + "logps/rejected": -86.30843353271484, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3430993556976318, + "rewards/margins": 11.532323837280273, + "rewards/rejected": -12.8754243850708, + "step": 940 + }, + { + "epoch": 0.43, + "learning_rate": 2.8513444951801116e-07, + "logits/chosen": -1.732020378112793, + "logits/rejected": -1.643958330154419, + "logps/chosen": -55.513648986816406, + "logps/rejected": -84.36043548583984, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1080124378204346, + "rewards/margins": 11.76207160949707, + "rewards/rejected": -12.870083808898926, + "step": 950 + }, + { + "epoch": 0.44, + "learning_rate": 2.846270928462709e-07, + "logits/chosen": -1.7512563467025757, + "logits/rejected": -1.6647357940673828, + "logps/chosen": -57.02693557739258, + "logps/rejected": -85.93479919433594, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.300955057144165, + "rewards/margins": 11.752635955810547, + "rewards/rejected": -13.053593635559082, + "step": 960 + }, + { + "epoch": 0.44, + "learning_rate": 2.841197361745307e-07, + "logits/chosen": -1.7503582239151, + "logits/rejected": -1.6578338146209717, + "logps/chosen": -54.23548126220703, + "logps/rejected": -82.74885559082031, + "loss": 0.0069, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.0630229711532593, + "rewards/margins": 11.578569412231445, + "rewards/rejected": -12.641592025756836, + "step": 970 + }, + { + "epoch": 0.45, + "learning_rate": 2.8361237950279043e-07, + "logits/chosen": -1.7624849081039429, + "logits/rejected": -1.6492741107940674, + "logps/chosen": -59.6468391418457, + "logps/rejected": -83.25942993164062, + "loss": 0.0132, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0746952295303345, + "rewards/margins": 12.109354972839355, + "rewards/rejected": -13.184049606323242, + "step": 980 + }, + { + "epoch": 0.45, + "learning_rate": 2.831050228310502e-07, + "logits/chosen": -1.7709052562713623, + "logits/rejected": -1.6669490337371826, + "logps/chosen": -58.214073181152344, + "logps/rejected": -80.26509094238281, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2640316486358643, + "rewards/margins": 10.607865333557129, + "rewards/rejected": -11.871896743774414, + "step": 990 + }, + { + "epoch": 0.46, + "learning_rate": 2.8259766615930996e-07, + "logits/chosen": -1.7690212726593018, + "logits/rejected": -1.664385199546814, + "logps/chosen": -58.22309494018555, + "logps/rejected": -80.5656967163086, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4453684091567993, + "rewards/margins": 11.335880279541016, + "rewards/rejected": -12.781248092651367, + "step": 1000 + }, + { + "epoch": 0.46, + "eval_logits/chosen": -1.5345439910888672, + "eval_logits/rejected": -1.4514362812042236, + "eval_logps/chosen": -81.75653076171875, + "eval_logps/rejected": -85.9760971069336, + "eval_loss": 0.012221959419548512, + "eval_rewards/accuracies": 0.9860334992408752, + "eval_rewards/chosen": -2.887273073196411, + "eval_rewards/margins": 10.148656845092773, + "eval_rewards/rejected": -13.035929679870605, + "eval_runtime": 144.624, + "eval_samples_per_second": 19.789, + "eval_steps_per_second": 1.238, + "step": 1000 + }, + { + "epoch": 0.46, + "learning_rate": 2.8209030948756976e-07, + "logits/chosen": -1.755025863647461, + "logits/rejected": -1.6674007177352905, + "logps/chosen": -57.46387481689453, + "logps/rejected": -87.47645568847656, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1511553525924683, + "rewards/margins": 12.040731430053711, + "rewards/rejected": -13.191886901855469, + "step": 1010 + }, + { + "epoch": 0.47, + "learning_rate": 2.815829528158295e-07, + "logits/chosen": -1.7087234258651733, + "logits/rejected": -1.6328926086425781, + "logps/chosen": -51.928794860839844, + "logps/rejected": -79.58134460449219, + "loss": 0.0057, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.155023217201233, + "rewards/margins": 11.272875785827637, + "rewards/rejected": -12.427899360656738, + "step": 1020 + }, + { + "epoch": 0.47, + "learning_rate": 2.810755961440893e-07, + "logits/chosen": -1.7401186227798462, + "logits/rejected": -1.653926134109497, + "logps/chosen": -54.856727600097656, + "logps/rejected": -85.71138000488281, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2455310821533203, + "rewards/margins": 11.82475757598877, + "rewards/rejected": -13.070286750793457, + "step": 1030 + }, + { + "epoch": 0.47, + "learning_rate": 2.8056823947234903e-07, + "logits/chosen": -1.7602002620697021, + "logits/rejected": -1.6599136590957642, + "logps/chosen": -57.8088493347168, + "logps/rejected": -83.03340911865234, + "loss": 0.0082, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.2695609331130981, + "rewards/margins": 11.275420188903809, + "rewards/rejected": -12.544981002807617, + "step": 1040 + }, + { + "epoch": 0.48, + "learning_rate": 2.800608828006088e-07, + "logits/chosen": -1.7425765991210938, + "logits/rejected": -1.6693546772003174, + "logps/chosen": -54.41240310668945, + "logps/rejected": -84.59600830078125, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3567270040512085, + "rewards/margins": 11.407785415649414, + "rewards/rejected": -12.76451301574707, + "step": 1050 + }, + { + "epoch": 0.48, + "learning_rate": 2.7955352612886856e-07, + "logits/chosen": -1.7381328344345093, + "logits/rejected": -1.6626803874969482, + "logps/chosen": -55.707252502441406, + "logps/rejected": -82.98629760742188, + "loss": 0.0086, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.7733808755874634, + "rewards/margins": 11.287147521972656, + "rewards/rejected": -13.060528755187988, + "step": 1060 + }, + { + "epoch": 0.49, + "learning_rate": 2.7904616945712836e-07, + "logits/chosen": -1.7508115768432617, + "logits/rejected": -1.6619138717651367, + "logps/chosen": -57.57696533203125, + "logps/rejected": -82.67385864257812, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5290658473968506, + "rewards/margins": 12.146486282348633, + "rewards/rejected": -13.675552368164062, + "step": 1070 + }, + { + "epoch": 0.49, + "learning_rate": 2.785388127853881e-07, + "logits/chosen": -1.738328218460083, + "logits/rejected": -1.6436212062835693, + "logps/chosen": -54.58185577392578, + "logps/rejected": -80.33912658691406, + "loss": 0.0073, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.134696364402771, + "rewards/margins": 11.768366813659668, + "rewards/rejected": -12.90306282043457, + "step": 1080 + }, + { + "epoch": 0.5, + "learning_rate": 2.780314561136479e-07, + "logits/chosen": -1.7240383625030518, + "logits/rejected": -1.6409051418304443, + "logps/chosen": -55.646934509277344, + "logps/rejected": -87.40071105957031, + "loss": 0.0027, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.4618747234344482, + "rewards/margins": 12.264945983886719, + "rewards/rejected": -13.726821899414062, + "step": 1090 + }, + { + "epoch": 0.5, + "learning_rate": 2.7752409944190763e-07, + "logits/chosen": -1.7554023265838623, + "logits/rejected": -1.6604242324829102, + "logps/chosen": -57.77976608276367, + "logps/rejected": -83.35955810546875, + "loss": 0.0107, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.751634955406189, + "rewards/margins": 11.482076644897461, + "rewards/rejected": -13.233711242675781, + "step": 1100 + }, + { + "epoch": 0.5, + "eval_logits/chosen": -1.5333517789840698, + "eval_logits/rejected": -1.4505860805511475, + "eval_logps/chosen": -81.65863037109375, + "eval_logps/rejected": -86.0610580444336, + "eval_loss": 0.011035408824682236, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": -2.838331937789917, + "eval_rewards/margins": 10.240079879760742, + "eval_rewards/rejected": -13.078413009643555, + "eval_runtime": 146.4725, + "eval_samples_per_second": 19.54, + "eval_steps_per_second": 1.222, + "step": 1100 + }, + { + "epoch": 0.51, + "learning_rate": 2.770167427701674e-07, + "logits/chosen": -1.749803900718689, + "logits/rejected": -1.6486237049102783, + "logps/chosen": -58.17445755004883, + "logps/rejected": -84.91439056396484, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9937120676040649, + "rewards/margins": 11.985600471496582, + "rewards/rejected": -12.9793119430542, + "step": 1110 + }, + { + "epoch": 0.51, + "learning_rate": 2.7650938609842716e-07, + "logits/chosen": -1.7548671960830688, + "logits/rejected": -1.6473737955093384, + "logps/chosen": -56.917808532714844, + "logps/rejected": -81.3094711303711, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2224924564361572, + "rewards/margins": 11.221006393432617, + "rewards/rejected": -12.443498611450195, + "step": 1120 + }, + { + "epoch": 0.52, + "learning_rate": 2.7600202942668696e-07, + "logits/chosen": -1.757595419883728, + "logits/rejected": -1.6543792486190796, + "logps/chosen": -62.02849578857422, + "logps/rejected": -84.46827697753906, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3594145774841309, + "rewards/margins": 11.177923202514648, + "rewards/rejected": -12.537336349487305, + "step": 1130 + }, + { + "epoch": 0.52, + "learning_rate": 2.754946727549467e-07, + "logits/chosen": -1.7476240396499634, + "logits/rejected": -1.6613715887069702, + "logps/chosen": -58.78446578979492, + "logps/rejected": -89.38286590576172, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4613271951675415, + "rewards/margins": 11.705554008483887, + "rewards/rejected": -13.16688060760498, + "step": 1140 + }, + { + "epoch": 0.52, + "learning_rate": 2.749873160832065e-07, + "logits/chosen": -1.7503944635391235, + "logits/rejected": -1.6537967920303345, + "logps/chosen": -54.70173263549805, + "logps/rejected": -82.47148132324219, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9892008900642395, + "rewards/margins": 12.35951042175293, + "rewards/rejected": -13.348711013793945, + "step": 1150 + }, + { + "epoch": 0.53, + "learning_rate": 2.7447995941146623e-07, + "logits/chosen": -1.7634122371673584, + "logits/rejected": -1.6714951992034912, + "logps/chosen": -60.8097038269043, + "logps/rejected": -87.31422424316406, + "loss": 0.0108, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.9673036336898804, + "rewards/margins": 12.482806205749512, + "rewards/rejected": -14.450109481811523, + "step": 1160 + }, + { + "epoch": 0.53, + "learning_rate": 2.73972602739726e-07, + "logits/chosen": -1.7594560384750366, + "logits/rejected": -1.6858173608779907, + "logps/chosen": -55.721031188964844, + "logps/rejected": -90.5, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6576344966888428, + "rewards/margins": 12.7664213180542, + "rewards/rejected": -14.424057006835938, + "step": 1170 + }, + { + "epoch": 0.54, + "learning_rate": 2.7346524606798576e-07, + "logits/chosen": -1.7588996887207031, + "logits/rejected": -1.6747125387191772, + "logps/chosen": -57.83014678955078, + "logps/rejected": -88.86564636230469, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9523566961288452, + "rewards/margins": 12.802734375, + "rewards/rejected": -14.755091667175293, + "step": 1180 + }, + { + "epoch": 0.54, + "learning_rate": 2.7295788939624556e-07, + "logits/chosen": -1.7592281103134155, + "logits/rejected": -1.6504974365234375, + "logps/chosen": -58.18562698364258, + "logps/rejected": -86.49019622802734, + "loss": 0.008, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.525193452835083, + "rewards/margins": 12.773050308227539, + "rewards/rejected": -14.298243522644043, + "step": 1190 + }, + { + "epoch": 0.55, + "learning_rate": 2.724505327245053e-07, + "logits/chosen": -1.7617738246917725, + "logits/rejected": -1.666863203048706, + "logps/chosen": -59.43085861206055, + "logps/rejected": -85.90283203125, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.331127405166626, + "rewards/margins": 12.19595718383789, + "rewards/rejected": -13.527084350585938, + "step": 1200 + }, + { + "epoch": 0.55, + "eval_logits/chosen": -1.544122576713562, + "eval_logits/rejected": -1.4603493213653564, + "eval_logps/chosen": -82.71837615966797, + "eval_logps/rejected": -87.875732421875, + "eval_loss": 0.013027088716626167, + "eval_rewards/accuracies": 0.9860334992408752, + "eval_rewards/chosen": -3.368196725845337, + "eval_rewards/margins": 10.61755084991455, + "eval_rewards/rejected": -13.985747337341309, + "eval_runtime": 148.8985, + "eval_samples_per_second": 19.221, + "eval_steps_per_second": 1.202, + "step": 1200 + }, + { + "epoch": 0.55, + "learning_rate": 2.719431760527651e-07, + "logits/chosen": -1.7558351755142212, + "logits/rejected": -1.6655076742172241, + "logps/chosen": -56.12122344970703, + "logps/rejected": -84.51030731201172, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5430134534835815, + "rewards/margins": 12.190717697143555, + "rewards/rejected": -13.733731269836426, + "step": 1210 + }, + { + "epoch": 0.56, + "learning_rate": 2.7143581938102483e-07, + "logits/chosen": -1.7553596496582031, + "logits/rejected": -1.6564127206802368, + "logps/chosen": -56.277931213378906, + "logps/rejected": -83.83023834228516, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0945639610290527, + "rewards/margins": 12.596158981323242, + "rewards/rejected": -13.69072151184082, + "step": 1220 + }, + { + "epoch": 0.56, + "learning_rate": 2.709284627092846e-07, + "logits/chosen": -1.742206335067749, + "logits/rejected": -1.6503639221191406, + "logps/chosen": -58.675819396972656, + "logps/rejected": -89.0281982421875, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.533455491065979, + "rewards/margins": 13.204513549804688, + "rewards/rejected": -14.737970352172852, + "step": 1230 + }, + { + "epoch": 0.57, + "learning_rate": 2.7042110603754436e-07, + "logits/chosen": -1.7599554061889648, + "logits/rejected": -1.6664212942123413, + "logps/chosen": -62.36484909057617, + "logps/rejected": -86.59840393066406, + "loss": 0.0068, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.790294885635376, + "rewards/margins": 12.219128608703613, + "rewards/rejected": -14.009424209594727, + "step": 1240 + }, + { + "epoch": 0.57, + "learning_rate": 2.6991374936580416e-07, + "logits/chosen": -1.7607377767562866, + "logits/rejected": -1.6740095615386963, + "logps/chosen": -58.570457458496094, + "logps/rejected": -90.27938842773438, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5715006589889526, + "rewards/margins": 13.148752212524414, + "rewards/rejected": -14.720251083374023, + "step": 1250 + }, + { + "epoch": 0.58, + "learning_rate": 2.694063926940639e-07, + "logits/chosen": -1.7583293914794922, + "logits/rejected": -1.656664252281189, + "logps/chosen": -61.866050720214844, + "logps/rejected": -85.34736633300781, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.522429347038269, + "rewards/margins": 12.270319938659668, + "rewards/rejected": -13.792750358581543, + "step": 1260 + }, + { + "epoch": 0.58, + "learning_rate": 2.688990360223237e-07, + "logits/chosen": -1.7451664209365845, + "logits/rejected": -1.6569017171859741, + "logps/chosen": -54.42702102661133, + "logps/rejected": -87.1197509765625, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1746667623519897, + "rewards/margins": 12.838618278503418, + "rewards/rejected": -14.013284683227539, + "step": 1270 + }, + { + "epoch": 0.58, + "learning_rate": 2.6839167935058343e-07, + "logits/chosen": -1.7624620199203491, + "logits/rejected": -1.6860120296478271, + "logps/chosen": -56.553749084472656, + "logps/rejected": -87.63340759277344, + "loss": 0.0103, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.627769112586975, + "rewards/margins": 13.027348518371582, + "rewards/rejected": -14.655117988586426, + "step": 1280 + }, + { + "epoch": 0.59, + "learning_rate": 2.678843226788432e-07, + "logits/chosen": -1.7679738998413086, + "logits/rejected": -1.6850473880767822, + "logps/chosen": -55.26356887817383, + "logps/rejected": -90.05108642578125, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4258941411972046, + "rewards/margins": 13.58891487121582, + "rewards/rejected": -15.014808654785156, + "step": 1290 + }, + { + "epoch": 0.59, + "learning_rate": 2.6737696600710296e-07, + "logits/chosen": -1.7583335638046265, + "logits/rejected": -1.6575883626937866, + "logps/chosen": -59.30186080932617, + "logps/rejected": -90.12716674804688, + "loss": 0.0054, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.90740168094635, + "rewards/margins": 13.73097038269043, + "rewards/rejected": -15.638374328613281, + "step": 1300 + }, + { + "epoch": 0.59, + "eval_logits/chosen": -1.5403498411178589, + "eval_logits/rejected": -1.4576407670974731, + "eval_logps/chosen": -83.19163513183594, + "eval_logps/rejected": -89.70408630371094, + "eval_loss": 0.0123166274279356, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": -3.604830503463745, + "eval_rewards/margins": 11.295095443725586, + "eval_rewards/rejected": -14.899927139282227, + "eval_runtime": 166.689, + "eval_samples_per_second": 17.17, + "eval_steps_per_second": 1.074, + "step": 1300 + }, + { + "epoch": 0.6, + "learning_rate": 2.6686960933536276e-07, + "logits/chosen": -1.7511463165283203, + "logits/rejected": -1.6742098331451416, + "logps/chosen": -56.27610397338867, + "logps/rejected": -87.4818344116211, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8156147003173828, + "rewards/margins": 13.021186828613281, + "rewards/rejected": -14.836801528930664, + "step": 1310 + }, + { + "epoch": 0.6, + "learning_rate": 2.663622526636225e-07, + "logits/chosen": -1.749943733215332, + "logits/rejected": -1.6831376552581787, + "logps/chosen": -55.67545700073242, + "logps/rejected": -88.80065155029297, + "loss": 0.0103, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.1307406425476074, + "rewards/margins": 12.877218246459961, + "rewards/rejected": -15.007959365844727, + "step": 1320 + }, + { + "epoch": 0.61, + "learning_rate": 2.658548959918823e-07, + "logits/chosen": -1.7757232189178467, + "logits/rejected": -1.6820189952850342, + "logps/chosen": -62.95960235595703, + "logps/rejected": -90.3966293334961, + "loss": 0.0057, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.706180214881897, + "rewards/margins": 13.415826797485352, + "rewards/rejected": -15.1220064163208, + "step": 1330 + }, + { + "epoch": 0.61, + "learning_rate": 2.6534753932014203e-07, + "logits/chosen": -1.7566606998443604, + "logits/rejected": -1.6727615594863892, + "logps/chosen": -55.65472412109375, + "logps/rejected": -86.75607299804688, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4121570587158203, + "rewards/margins": 13.099316596984863, + "rewards/rejected": -14.511472702026367, + "step": 1340 + }, + { + "epoch": 0.62, + "learning_rate": 2.648401826484018e-07, + "logits/chosen": -1.7660220861434937, + "logits/rejected": -1.6494724750518799, + "logps/chosen": -60.838462829589844, + "logps/rejected": -88.93028259277344, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3854892253875732, + "rewards/margins": 13.644729614257812, + "rewards/rejected": -15.030218124389648, + "step": 1350 + }, + { + "epoch": 0.62, + "learning_rate": 2.6433282597666156e-07, + "logits/chosen": -1.7497011423110962, + "logits/rejected": -1.646283507347107, + "logps/chosen": -60.607208251953125, + "logps/rejected": -86.80403900146484, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3235530853271484, + "rewards/margins": 13.664321899414062, + "rewards/rejected": -14.987874031066895, + "step": 1360 + }, + { + "epoch": 0.63, + "learning_rate": 2.6382546930492135e-07, + "logits/chosen": -1.7312263250350952, + "logits/rejected": -1.6483138799667358, + "logps/chosen": -55.373199462890625, + "logps/rejected": -89.51940155029297, + "loss": 0.0078, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.6450560092926025, + "rewards/margins": 13.383737564086914, + "rewards/rejected": -15.02879524230957, + "step": 1370 + }, + { + "epoch": 0.63, + "learning_rate": 2.633181126331811e-07, + "logits/chosen": -1.7593389749526978, + "logits/rejected": -1.690319299697876, + "logps/chosen": -52.8946647644043, + "logps/rejected": -86.19906616210938, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.703823447227478, + "rewards/margins": 12.947980880737305, + "rewards/rejected": -14.651806831359863, + "step": 1380 + }, + { + "epoch": 0.63, + "learning_rate": 2.628107559614409e-07, + "logits/chosen": -1.7741191387176514, + "logits/rejected": -1.6717126369476318, + "logps/chosen": -59.35906982421875, + "logps/rejected": -84.93653869628906, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5824410915374756, + "rewards/margins": 12.609251022338867, + "rewards/rejected": -14.191693305969238, + "step": 1390 + }, + { + "epoch": 0.64, + "learning_rate": 2.6230339928970063e-07, + "logits/chosen": -1.7360626459121704, + "logits/rejected": -1.667319655418396, + "logps/chosen": -55.236122131347656, + "logps/rejected": -87.0063705444336, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5939099788665771, + "rewards/margins": 13.322172164916992, + "rewards/rejected": -14.916082382202148, + "step": 1400 + }, + { + "epoch": 0.64, + "eval_logits/chosen": -1.54180109500885, + "eval_logits/rejected": -1.4597798585891724, + "eval_logps/chosen": -82.61723327636719, + "eval_logps/rejected": -90.00525665283203, + "eval_loss": 0.009081164374947548, + "eval_rewards/accuracies": 0.9860334992408752, + "eval_rewards/chosen": -3.3176283836364746, + "eval_rewards/margins": 11.732884407043457, + "eval_rewards/rejected": -15.050512313842773, + "eval_runtime": 149.6939, + "eval_samples_per_second": 19.119, + "eval_steps_per_second": 1.196, + "step": 1400 + }, + { + "epoch": 0.64, + "learning_rate": 2.617960426179604e-07, + "logits/chosen": -1.77938973903656, + "logits/rejected": -1.6862428188323975, + "logps/chosen": -60.312339782714844, + "logps/rejected": -90.48917388916016, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3172271251678467, + "rewards/margins": 13.71961784362793, + "rewards/rejected": -15.036844253540039, + "step": 1410 + }, + { + "epoch": 0.65, + "learning_rate": 2.6128868594622016e-07, + "logits/chosen": -1.7564280033111572, + "logits/rejected": -1.6784827709197998, + "logps/chosen": -56.887779235839844, + "logps/rejected": -87.00199890136719, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7040510177612305, + "rewards/margins": 13.034492492675781, + "rewards/rejected": -14.738543510437012, + "step": 1420 + }, + { + "epoch": 0.65, + "learning_rate": 2.6078132927447995e-07, + "logits/chosen": -1.7650423049926758, + "logits/rejected": -1.6766407489776611, + "logps/chosen": -57.2898063659668, + "logps/rejected": -85.12565612792969, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.521803617477417, + "rewards/margins": 13.181356430053711, + "rewards/rejected": -14.703161239624023, + "step": 1430 + }, + { + "epoch": 0.66, + "learning_rate": 2.602739726027397e-07, + "logits/chosen": -1.7596886157989502, + "logits/rejected": -1.654207468032837, + "logps/chosen": -56.0603141784668, + "logps/rejected": -87.854736328125, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3883931636810303, + "rewards/margins": 13.825471878051758, + "rewards/rejected": -15.21386432647705, + "step": 1440 + }, + { + "epoch": 0.66, + "learning_rate": 2.597666159309995e-07, + "logits/chosen": -1.7648146152496338, + "logits/rejected": -1.67717707157135, + "logps/chosen": -56.35142135620117, + "logps/rejected": -87.81778717041016, + "loss": 0.0092, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.6936101913452148, + "rewards/margins": 13.936601638793945, + "rewards/rejected": -15.630210876464844, + "step": 1450 + }, + { + "epoch": 0.67, + "learning_rate": 2.5925925925925923e-07, + "logits/chosen": -1.7566111087799072, + "logits/rejected": -1.6657116413116455, + "logps/chosen": -58.238059997558594, + "logps/rejected": -87.4339370727539, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4842593669891357, + "rewards/margins": 12.764392852783203, + "rewards/rejected": -14.248652458190918, + "step": 1460 + }, + { + "epoch": 0.67, + "learning_rate": 2.58751902587519e-07, + "logits/chosen": -1.777064561843872, + "logits/rejected": -1.6611353158950806, + "logps/chosen": -58.111549377441406, + "logps/rejected": -84.71275329589844, + "loss": 0.0059, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.0342987775802612, + "rewards/margins": 13.157496452331543, + "rewards/rejected": -14.191795349121094, + "step": 1470 + }, + { + "epoch": 0.68, + "learning_rate": 2.5824454591577876e-07, + "logits/chosen": -1.7620245218276978, + "logits/rejected": -1.662798523902893, + "logps/chosen": -57.60813522338867, + "logps/rejected": -85.7889175415039, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4033831357955933, + "rewards/margins": 13.611114501953125, + "rewards/rejected": -15.014495849609375, + "step": 1480 + }, + { + "epoch": 0.68, + "learning_rate": 2.5773718924403855e-07, + "logits/chosen": -1.7630466222763062, + "logits/rejected": -1.6711034774780273, + "logps/chosen": -55.47661590576172, + "logps/rejected": -86.72578430175781, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5360097885131836, + "rewards/margins": 13.896036148071289, + "rewards/rejected": -15.432044982910156, + "step": 1490 + }, + { + "epoch": 0.68, + "learning_rate": 2.572298325722983e-07, + "logits/chosen": -1.7507346868515015, + "logits/rejected": -1.6613788604736328, + "logps/chosen": -60.52473068237305, + "logps/rejected": -88.97389221191406, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.761684775352478, + "rewards/margins": 13.575399398803711, + "rewards/rejected": -15.33708667755127, + "step": 1500 + }, + { + "epoch": 0.68, + "eval_logits/chosen": -1.549391269683838, + "eval_logits/rejected": -1.467103362083435, + "eval_logps/chosen": -82.59818267822266, + "eval_logps/rejected": -91.0326919555664, + "eval_loss": 0.008652918040752411, + "eval_rewards/accuracies": 0.9860334992408752, + "eval_rewards/chosen": -3.308105707168579, + "eval_rewards/margins": 12.256126403808594, + "eval_rewards/rejected": -15.564231872558594, + "eval_runtime": 155.0871, + "eval_samples_per_second": 18.454, + "eval_steps_per_second": 1.154, + "step": 1500 + }, + { + "epoch": 0.69, + "learning_rate": 2.567224759005581e-07, + "logits/chosen": -1.7494996786117554, + "logits/rejected": -1.6485868692398071, + "logps/chosen": -58.14558792114258, + "logps/rejected": -89.62889862060547, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5420677661895752, + "rewards/margins": 14.456304550170898, + "rewards/rejected": -15.998372077941895, + "step": 1510 + }, + { + "epoch": 0.69, + "learning_rate": 2.5621511922881783e-07, + "logits/chosen": -1.7610740661621094, + "logits/rejected": -1.6540189981460571, + "logps/chosen": -57.69807052612305, + "logps/rejected": -91.73440551757812, + "loss": 0.0105, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.6863176822662354, + "rewards/margins": 14.371050834655762, + "rewards/rejected": -16.057369232177734, + "step": 1520 + }, + { + "epoch": 0.7, + "learning_rate": 2.557077625570776e-07, + "logits/chosen": -1.7764087915420532, + "logits/rejected": -1.6975984573364258, + "logps/chosen": -54.03931427001953, + "logps/rejected": -88.61404418945312, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.407801866531372, + "rewards/margins": 13.469766616821289, + "rewards/rejected": -14.877568244934082, + "step": 1530 + }, + { + "epoch": 0.7, + "learning_rate": 2.5520040588533736e-07, + "logits/chosen": -1.7674833536148071, + "logits/rejected": -1.6577078104019165, + "logps/chosen": -59.08375930786133, + "logps/rejected": -85.60472106933594, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8764028549194336, + "rewards/margins": 13.601232528686523, + "rewards/rejected": -14.477635383605957, + "step": 1540 + }, + { + "epoch": 0.71, + "learning_rate": 2.5469304921359715e-07, + "logits/chosen": -1.7571929693222046, + "logits/rejected": -1.6718075275421143, + "logps/chosen": -56.97377395629883, + "logps/rejected": -88.74199676513672, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.215511679649353, + "rewards/margins": 13.31176471710205, + "rewards/rejected": -14.527276992797852, + "step": 1550 + }, + { + "epoch": 0.71, + "learning_rate": 2.541856925418569e-07, + "logits/chosen": -1.7460705041885376, + "logits/rejected": -1.6669387817382812, + "logps/chosen": -55.67962646484375, + "logps/rejected": -87.57090759277344, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.260811686515808, + "rewards/margins": 13.849576950073242, + "rewards/rejected": -15.110387802124023, + "step": 1560 + }, + { + "epoch": 0.72, + "learning_rate": 2.536783358701167e-07, + "logits/chosen": -1.7633936405181885, + "logits/rejected": -1.656254529953003, + "logps/chosen": -57.3261604309082, + "logps/rejected": -88.50245666503906, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2457377910614014, + "rewards/margins": 13.728734970092773, + "rewards/rejected": -14.974472045898438, + "step": 1570 + }, + { + "epoch": 0.72, + "learning_rate": 2.5317097919837643e-07, + "logits/chosen": -1.745391845703125, + "logits/rejected": -1.672279715538025, + "logps/chosen": -57.052711486816406, + "logps/rejected": -92.32408142089844, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.403147578239441, + "rewards/margins": 14.200845718383789, + "rewards/rejected": -15.60399341583252, + "step": 1580 + }, + { + "epoch": 0.73, + "learning_rate": 2.526636225266362e-07, + "logits/chosen": -1.7615629434585571, + "logits/rejected": -1.6586211919784546, + "logps/chosen": -58.931182861328125, + "logps/rejected": -93.81259155273438, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3059899806976318, + "rewards/margins": 14.414894104003906, + "rewards/rejected": -15.720884323120117, + "step": 1590 + }, + { + "epoch": 0.73, + "learning_rate": 2.5215626585489596e-07, + "logits/chosen": -1.7530238628387451, + "logits/rejected": -1.6644384860992432, + "logps/chosen": -54.5128173828125, + "logps/rejected": -87.1929702758789, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7006027698516846, + "rewards/margins": 13.703947067260742, + "rewards/rejected": -15.404550552368164, + "step": 1600 + }, + { + "epoch": 0.73, + "eval_logits/chosen": -1.5559909343719482, + "eval_logits/rejected": -1.4721708297729492, + "eval_logps/chosen": -83.0450668334961, + "eval_logps/rejected": -92.46697235107422, + "eval_loss": 0.00909368135035038, + "eval_rewards/accuracies": 0.9860334992408752, + "eval_rewards/chosen": -3.5315442085266113, + "eval_rewards/margins": 12.749824523925781, + "eval_rewards/rejected": -16.2813663482666, + "eval_runtime": 153.9516, + "eval_samples_per_second": 18.59, + "eval_steps_per_second": 1.163, + "step": 1600 + }, + { + "epoch": 0.73, + "learning_rate": 2.5164890918315575e-07, + "logits/chosen": -1.752454400062561, + "logits/rejected": -1.6705706119537354, + "logps/chosen": -57.991455078125, + "logps/rejected": -91.4581527709961, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0661113262176514, + "rewards/margins": 14.307818412780762, + "rewards/rejected": -16.373929977416992, + "step": 1610 + }, + { + "epoch": 0.74, + "learning_rate": 2.511415525114155e-07, + "logits/chosen": -1.7848793268203735, + "logits/rejected": -1.7035505771636963, + "logps/chosen": -56.839599609375, + "logps/rejected": -88.28500366210938, + "loss": 0.0037, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.018306255340576, + "rewards/margins": 13.947771072387695, + "rewards/rejected": -15.96607780456543, + "step": 1620 + }, + { + "epoch": 0.74, + "learning_rate": 2.506341958396753e-07, + "logits/chosen": -1.776012659072876, + "logits/rejected": -1.6706695556640625, + "logps/chosen": -58.133758544921875, + "logps/rejected": -88.62023162841797, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8454831838607788, + "rewards/margins": 14.76240062713623, + "rewards/rejected": -16.60788345336914, + "step": 1630 + }, + { + "epoch": 0.75, + "learning_rate": 2.5012683916793503e-07, + "logits/chosen": -1.780495047569275, + "logits/rejected": -1.6756473779678345, + "logps/chosen": -63.57946014404297, + "logps/rejected": -92.46826171875, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.026796579360962, + "rewards/margins": 14.724100112915039, + "rewards/rejected": -16.75089454650879, + "step": 1640 + }, + { + "epoch": 0.75, + "learning_rate": 2.496194824961948e-07, + "logits/chosen": -1.759849190711975, + "logits/rejected": -1.6756088733673096, + "logps/chosen": -56.996368408203125, + "logps/rejected": -93.39848327636719, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.910292625427246, + "rewards/margins": 15.01073932647705, + "rewards/rejected": -16.921031951904297, + "step": 1650 + }, + { + "epoch": 0.76, + "learning_rate": 2.4911212582445456e-07, + "logits/chosen": -1.7654517889022827, + "logits/rejected": -1.6813764572143555, + "logps/chosen": -55.60560989379883, + "logps/rejected": -92.54562377929688, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3576942682266235, + "rewards/margins": 15.633010864257812, + "rewards/rejected": -16.990703582763672, + "step": 1660 + }, + { + "epoch": 0.76, + "learning_rate": 2.4860476915271435e-07, + "logits/chosen": -1.767221450805664, + "logits/rejected": -1.6690009832382202, + "logps/chosen": -54.5999870300293, + "logps/rejected": -93.59965515136719, + "loss": 0.0123, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9985862970352173, + "rewards/margins": 15.221014022827148, + "rewards/rejected": -16.219600677490234, + "step": 1670 + }, + { + "epoch": 0.77, + "learning_rate": 2.480974124809741e-07, + "logits/chosen": -1.7726835012435913, + "logits/rejected": -1.6497215032577515, + "logps/chosen": -58.1947021484375, + "logps/rejected": -90.62200164794922, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9589114189147949, + "rewards/margins": 14.714686393737793, + "rewards/rejected": -15.673599243164062, + "step": 1680 + }, + { + "epoch": 0.77, + "learning_rate": 2.475900558092339e-07, + "logits/chosen": -1.7549190521240234, + "logits/rejected": -1.6590404510498047, + "logps/chosen": -58.07526779174805, + "logps/rejected": -87.36026763916016, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1645383834838867, + "rewards/margins": 14.172067642211914, + "rewards/rejected": -15.3366060256958, + "step": 1690 + }, + { + "epoch": 0.78, + "learning_rate": 2.4708269913749363e-07, + "logits/chosen": -1.768355131149292, + "logits/rejected": -1.6747773885726929, + "logps/chosen": -57.80949783325195, + "logps/rejected": -87.95934295654297, + "loss": 0.0035, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.4355525970458984, + "rewards/margins": 14.236753463745117, + "rewards/rejected": -15.672307014465332, + "step": 1700 + }, + { + "epoch": 0.78, + "eval_logits/chosen": -1.5480554103851318, + "eval_logits/rejected": -1.466413140296936, + "eval_logps/chosen": -82.27859497070312, + "eval_logps/rejected": -91.71222686767578, + "eval_loss": 0.007809256669133902, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": -3.1483092308044434, + "eval_rewards/margins": 12.755687713623047, + "eval_rewards/rejected": -15.903995513916016, + "eval_runtime": 150.663, + "eval_samples_per_second": 18.996, + "eval_steps_per_second": 1.188, + "step": 1700 + }, + { + "epoch": 0.78, + "learning_rate": 2.465753424657534e-07, + "logits/chosen": -1.7380459308624268, + "logits/rejected": -1.644565224647522, + "logps/chosen": -56.472679138183594, + "logps/rejected": -91.74127960205078, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2027267217636108, + "rewards/margins": 14.890820503234863, + "rewards/rejected": -16.093547821044922, + "step": 1710 + }, + { + "epoch": 0.79, + "learning_rate": 2.4606798579401316e-07, + "logits/chosen": -1.7520453929901123, + "logits/rejected": -1.6536376476287842, + "logps/chosen": -61.30470657348633, + "logps/rejected": -90.20903778076172, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4567862749099731, + "rewards/margins": 14.035242080688477, + "rewards/rejected": -15.492027282714844, + "step": 1720 + }, + { + "epoch": 0.79, + "learning_rate": 2.4556062912227295e-07, + "logits/chosen": -1.7642204761505127, + "logits/rejected": -1.6550449132919312, + "logps/chosen": -56.6153450012207, + "logps/rejected": -85.96258544921875, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2349016666412354, + "rewards/margins": 14.066560745239258, + "rewards/rejected": -15.301460266113281, + "step": 1730 + }, + { + "epoch": 0.79, + "learning_rate": 2.450532724505327e-07, + "logits/chosen": -1.7625477313995361, + "logits/rejected": -1.6494197845458984, + "logps/chosen": -58.4631462097168, + "logps/rejected": -90.04280853271484, + "loss": 0.0056, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.411668300628662, + "rewards/margins": 14.800588607788086, + "rewards/rejected": -16.21225929260254, + "step": 1740 + }, + { + "epoch": 0.8, + "learning_rate": 2.445459157787925e-07, + "logits/chosen": -1.7740110158920288, + "logits/rejected": -1.6718931198120117, + "logps/chosen": -56.41655731201172, + "logps/rejected": -88.89642333984375, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9772757291793823, + "rewards/margins": 14.739130020141602, + "rewards/rejected": -15.716405868530273, + "step": 1750 + }, + { + "epoch": 0.8, + "learning_rate": 2.4403855910705223e-07, + "logits/chosen": -1.7601970434188843, + "logits/rejected": -1.6815147399902344, + "logps/chosen": -55.41786575317383, + "logps/rejected": -90.36604309082031, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3811933994293213, + "rewards/margins": 14.588228225708008, + "rewards/rejected": -15.96942138671875, + "step": 1760 + }, + { + "epoch": 0.81, + "learning_rate": 2.43531202435312e-07, + "logits/chosen": -1.773404836654663, + "logits/rejected": -1.6569006443023682, + "logps/chosen": -57.115379333496094, + "logps/rejected": -87.33353424072266, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.131402850151062, + "rewards/margins": 14.441953659057617, + "rewards/rejected": -15.573358535766602, + "step": 1770 + }, + { + "epoch": 0.81, + "learning_rate": 2.4302384576357176e-07, + "logits/chosen": -1.7684681415557861, + "logits/rejected": -1.6918935775756836, + "logps/chosen": -59.896827697753906, + "logps/rejected": -94.4011001586914, + "loss": 0.0013, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.6811656951904297, + "rewards/margins": 15.493830680847168, + "rewards/rejected": -17.174999237060547, + "step": 1780 + }, + { + "epoch": 0.82, + "learning_rate": 2.4251648909183155e-07, + "logits/chosen": -1.7643531560897827, + "logits/rejected": -1.678773283958435, + "logps/chosen": -57.124267578125, + "logps/rejected": -93.88394927978516, + "loss": 0.0052, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.4721136093139648, + "rewards/margins": 15.75408935546875, + "rewards/rejected": -17.2262020111084, + "step": 1790 + }, + { + "epoch": 0.82, + "learning_rate": 2.420091324200913e-07, + "logits/chosen": -1.7517344951629639, + "logits/rejected": -1.6720161437988281, + "logps/chosen": -60.31678009033203, + "logps/rejected": -93.91047668457031, + "loss": 0.0094, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.1854450702667236, + "rewards/margins": 14.294087409973145, + "rewards/rejected": -16.479534149169922, + "step": 1800 + }, + { + "epoch": 0.82, + "eval_logits/chosen": -1.5389713048934937, + "eval_logits/rejected": -1.4571911096572876, + "eval_logps/chosen": -81.96665954589844, + "eval_logps/rejected": -91.73914337158203, + "eval_loss": 0.0071415891870856285, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": -2.992342710494995, + "eval_rewards/margins": 12.925115585327148, + "eval_rewards/rejected": -15.917457580566406, + "eval_runtime": 165.3945, + "eval_samples_per_second": 17.304, + "eval_steps_per_second": 1.082, + "step": 1800 + }, + { + "epoch": 0.83, + "learning_rate": 2.415017757483511e-07, + "logits/chosen": -1.7610645294189453, + "logits/rejected": -1.6734730005264282, + "logps/chosen": -58.1474494934082, + "logps/rejected": -91.43634033203125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2573091983795166, + "rewards/margins": 15.121930122375488, + "rewards/rejected": -16.37923812866211, + "step": 1810 + }, + { + "epoch": 0.83, + "learning_rate": 2.409944190766108e-07, + "logits/chosen": -1.7458274364471436, + "logits/rejected": -1.6527506113052368, + "logps/chosen": -57.40966796875, + "logps/rejected": -93.22349548339844, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.624754548072815, + "rewards/margins": 15.039660453796387, + "rewards/rejected": -16.66441535949707, + "step": 1820 + }, + { + "epoch": 0.84, + "learning_rate": 2.404870624048706e-07, + "logits/chosen": -1.7511749267578125, + "logits/rejected": -1.666269302368164, + "logps/chosen": -58.3302001953125, + "logps/rejected": -90.82698059082031, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2160614728927612, + "rewards/margins": 14.482437133789062, + "rewards/rejected": -15.69849681854248, + "step": 1830 + }, + { + "epoch": 0.84, + "learning_rate": 2.3997970573313036e-07, + "logits/chosen": -1.7466942071914673, + "logits/rejected": -1.659889578819275, + "logps/chosen": -55.64778518676758, + "logps/rejected": -86.6668701171875, + "loss": 0.0026, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.3703393936157227, + "rewards/margins": 14.488919258117676, + "rewards/rejected": -15.859257698059082, + "step": 1840 + }, + { + "epoch": 0.84, + "learning_rate": 2.3947234906139015e-07, + "logits/chosen": -1.7588704824447632, + "logits/rejected": -1.664973497390747, + "logps/chosen": -58.49451446533203, + "logps/rejected": -91.96329498291016, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.37942636013031, + "rewards/margins": 15.162734985351562, + "rewards/rejected": -16.542163848876953, + "step": 1850 + }, + { + "epoch": 0.85, + "learning_rate": 2.389649923896499e-07, + "logits/chosen": -1.7728341817855835, + "logits/rejected": -1.6813457012176514, + "logps/chosen": -55.9604377746582, + "logps/rejected": -89.51544952392578, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.612262487411499, + "rewards/margins": 14.787466049194336, + "rewards/rejected": -16.399730682373047, + "step": 1860 + }, + { + "epoch": 0.85, + "learning_rate": 2.384576357179097e-07, + "logits/chosen": -1.7437846660614014, + "logits/rejected": -1.6534225940704346, + "logps/chosen": -56.951820373535156, + "logps/rejected": -91.35940551757812, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.487562894821167, + "rewards/margins": 14.857897758483887, + "rewards/rejected": -16.345458984375, + "step": 1870 + }, + { + "epoch": 0.86, + "learning_rate": 2.3795027904616943e-07, + "logits/chosen": -1.7715113162994385, + "logits/rejected": -1.6715120077133179, + "logps/chosen": -59.91661834716797, + "logps/rejected": -89.1376953125, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7439546585083008, + "rewards/margins": 15.043264389038086, + "rewards/rejected": -16.787220001220703, + "step": 1880 + }, + { + "epoch": 0.86, + "learning_rate": 2.374429223744292e-07, + "logits/chosen": -1.7696430683135986, + "logits/rejected": -1.689231514930725, + "logps/chosen": -56.52924346923828, + "logps/rejected": -90.79057312011719, + "loss": 0.0051, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.5264157056808472, + "rewards/margins": 14.896077156066895, + "rewards/rejected": -16.4224910736084, + "step": 1890 + }, + { + "epoch": 0.87, + "learning_rate": 2.3693556570268896e-07, + "logits/chosen": -1.769017219543457, + "logits/rejected": -1.684442162513733, + "logps/chosen": -56.181922912597656, + "logps/rejected": -93.59736633300781, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1465259790420532, + "rewards/margins": 15.11158561706543, + "rewards/rejected": -16.258113861083984, + "step": 1900 + }, + { + "epoch": 0.87, + "eval_logits/chosen": -1.551081657409668, + "eval_logits/rejected": -1.4689964056015015, + "eval_logps/chosen": -81.9542007446289, + "eval_logps/rejected": -92.96185302734375, + "eval_loss": 0.006640641484409571, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": -2.986114025115967, + "eval_rewards/margins": 13.542698860168457, + "eval_rewards/rejected": -16.5288143157959, + "eval_runtime": 132.2326, + "eval_samples_per_second": 21.644, + "eval_steps_per_second": 1.354, + "step": 1900 + }, + { + "epoch": 0.87, + "learning_rate": 2.3642820903094873e-07, + "logits/chosen": -1.7713333368301392, + "logits/rejected": -1.6674282550811768, + "logps/chosen": -58.21760177612305, + "logps/rejected": -97.25029754638672, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0102754831314087, + "rewards/margins": 16.225215911865234, + "rewards/rejected": -17.235490798950195, + "step": 1910 + }, + { + "epoch": 0.88, + "learning_rate": 2.359208523592085e-07, + "logits/chosen": -1.7865272760391235, + "logits/rejected": -1.6899009943008423, + "logps/chosen": -57.2186164855957, + "logps/rejected": -89.74848175048828, + "loss": 0.0134, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.849109411239624, + "rewards/margins": 14.458150863647461, + "rewards/rejected": -16.307260513305664, + "step": 1920 + }, + { + "epoch": 0.88, + "learning_rate": 2.3541349568746826e-07, + "logits/chosen": -1.7519928216934204, + "logits/rejected": -1.6507072448730469, + "logps/chosen": -54.916954040527344, + "logps/rejected": -85.49049377441406, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1282603740692139, + "rewards/margins": 14.200277328491211, + "rewards/rejected": -15.328536987304688, + "step": 1930 + }, + { + "epoch": 0.89, + "learning_rate": 2.3490613901572803e-07, + "logits/chosen": -1.7461013793945312, + "logits/rejected": -1.6764867305755615, + "logps/chosen": -55.79955291748047, + "logps/rejected": -89.12510681152344, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.656272292137146, + "rewards/margins": 13.735760688781738, + "rewards/rejected": -15.392033576965332, + "step": 1940 + }, + { + "epoch": 0.89, + "learning_rate": 2.343987823439878e-07, + "logits/chosen": -1.750739336013794, + "logits/rejected": -1.6435902118682861, + "logps/chosen": -57.835601806640625, + "logps/rejected": -88.36417388916016, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1759008169174194, + "rewards/margins": 14.037012100219727, + "rewards/rejected": -15.212911605834961, + "step": 1950 + }, + { + "epoch": 0.89, + "learning_rate": 2.3389142567224756e-07, + "logits/chosen": -1.7469285726547241, + "logits/rejected": -1.675733208656311, + "logps/chosen": -59.03168487548828, + "logps/rejected": -93.41059112548828, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.538738489151001, + "rewards/margins": 14.39984130859375, + "rewards/rejected": -15.938581466674805, + "step": 1960 + }, + { + "epoch": 0.9, + "learning_rate": 2.3338406900050733e-07, + "logits/chosen": -1.7523949146270752, + "logits/rejected": -1.6582969427108765, + "logps/chosen": -57.25901412963867, + "logps/rejected": -92.3466796875, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8285577893257141, + "rewards/margins": 15.199548721313477, + "rewards/rejected": -16.028104782104492, + "step": 1970 + }, + { + "epoch": 0.9, + "learning_rate": 2.328767123287671e-07, + "logits/chosen": -1.7480812072753906, + "logits/rejected": -1.6603069305419922, + "logps/chosen": -58.4476203918457, + "logps/rejected": -91.93983459472656, + "loss": 0.004, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.3551121950149536, + "rewards/margins": 14.631512641906738, + "rewards/rejected": -15.986623764038086, + "step": 1980 + }, + { + "epoch": 0.91, + "learning_rate": 2.3236935565702686e-07, + "logits/chosen": -1.7778419256210327, + "logits/rejected": -1.6857637166976929, + "logps/chosen": -59.96107864379883, + "logps/rejected": -92.36823272705078, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.472414255142212, + "rewards/margins": 14.517046928405762, + "rewards/rejected": -15.989463806152344, + "step": 1990 + }, + { + "epoch": 0.91, + "learning_rate": 2.3186199898528663e-07, + "logits/chosen": -1.750738501548767, + "logits/rejected": -1.6730880737304688, + "logps/chosen": -56.4324951171875, + "logps/rejected": -91.2624740600586, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4734715223312378, + "rewards/margins": 14.515901565551758, + "rewards/rejected": -15.989372253417969, + "step": 2000 + }, + { + "epoch": 0.91, + "eval_logits/chosen": -1.539143681526184, + "eval_logits/rejected": -1.4576748609542847, + "eval_logps/chosen": -82.55216217041016, + "eval_logps/rejected": -91.96436309814453, + "eval_loss": 0.007588522508740425, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": -3.2850937843322754, + "eval_rewards/margins": 12.74497127532959, + "eval_rewards/rejected": -16.030065536499023, + "eval_runtime": 169.8211, + "eval_samples_per_second": 16.853, + "eval_steps_per_second": 1.054, + "step": 2000 + }, + { + "epoch": 0.92, + "learning_rate": 2.313546423135464e-07, + "logits/chosen": -1.787705659866333, + "logits/rejected": -1.6910228729248047, + "logps/chosen": -60.171051025390625, + "logps/rejected": -88.08454132080078, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.567155361175537, + "rewards/margins": 13.747461318969727, + "rewards/rejected": -15.314616203308105, + "step": 2010 + }, + { + "epoch": 0.92, + "learning_rate": 2.3084728564180616e-07, + "logits/chosen": -1.7449241876602173, + "logits/rejected": -1.6786353588104248, + "logps/chosen": -55.51226043701172, + "logps/rejected": -90.74034881591797, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3218858242034912, + "rewards/margins": 14.683317184448242, + "rewards/rejected": -16.005203247070312, + "step": 2020 + }, + { + "epoch": 0.93, + "learning_rate": 2.3033992897006593e-07, + "logits/chosen": -1.757450819015503, + "logits/rejected": -1.6583503484725952, + "logps/chosen": -62.15929412841797, + "logps/rejected": -90.96791076660156, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4464218616485596, + "rewards/margins": 14.97315502166748, + "rewards/rejected": -16.41957664489746, + "step": 2030 + }, + { + "epoch": 0.93, + "learning_rate": 2.298325722983257e-07, + "logits/chosen": -1.7604230642318726, + "logits/rejected": -1.6536738872528076, + "logps/chosen": -59.402137756347656, + "logps/rejected": -92.2740707397461, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6296403408050537, + "rewards/margins": 15.20836353302002, + "rewards/rejected": -16.838003158569336, + "step": 2040 + }, + { + "epoch": 0.94, + "learning_rate": 2.2932521562658546e-07, + "logits/chosen": -1.7548086643218994, + "logits/rejected": -1.657857894897461, + "logps/chosen": -56.833946228027344, + "logps/rejected": -89.49751281738281, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.196375846862793, + "rewards/margins": 14.8095064163208, + "rewards/rejected": -16.00588035583496, + "step": 2050 + }, + { + "epoch": 0.94, + "learning_rate": 2.2881785895484523e-07, + "logits/chosen": -1.747240424156189, + "logits/rejected": -1.6511905193328857, + "logps/chosen": -58.68681716918945, + "logps/rejected": -88.55622863769531, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3744940757751465, + "rewards/margins": 14.411099433898926, + "rewards/rejected": -15.785593032836914, + "step": 2060 + }, + { + "epoch": 0.94, + "learning_rate": 2.28310502283105e-07, + "logits/chosen": -1.7425874471664429, + "logits/rejected": -1.6658105850219727, + "logps/chosen": -54.89007568359375, + "logps/rejected": -89.45921325683594, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7904685735702515, + "rewards/margins": 14.850664138793945, + "rewards/rejected": -16.641132354736328, + "step": 2070 + }, + { + "epoch": 0.95, + "learning_rate": 2.2780314561136476e-07, + "logits/chosen": -1.7678744792938232, + "logits/rejected": -1.6782909631729126, + "logps/chosen": -56.872894287109375, + "logps/rejected": -89.02281951904297, + "loss": 0.0018, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.518134355545044, + "rewards/margins": 15.083781242370605, + "rewards/rejected": -16.601917266845703, + "step": 2080 + }, + { + "epoch": 0.95, + "learning_rate": 2.2729578893962453e-07, + "logits/chosen": -1.7603305578231812, + "logits/rejected": -1.6793501377105713, + "logps/chosen": -59.30863571166992, + "logps/rejected": -93.47074890136719, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8892009258270264, + "rewards/margins": 15.660491943359375, + "rewards/rejected": -17.549694061279297, + "step": 2090 + }, + { + "epoch": 0.96, + "learning_rate": 2.267884322678843e-07, + "logits/chosen": -1.7693204879760742, + "logits/rejected": -1.6628926992416382, + "logps/chosen": -54.2353630065918, + "logps/rejected": -87.59960174560547, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0670645236968994, + "rewards/margins": 15.38727855682373, + "rewards/rejected": -16.454341888427734, + "step": 2100 + }, + { + "epoch": 0.96, + "eval_logits/chosen": -1.5439651012420654, + "eval_logits/rejected": -1.4616644382476807, + "eval_logps/chosen": -82.66470336914062, + "eval_logps/rejected": -93.65453338623047, + "eval_loss": 0.006382639519870281, + "eval_rewards/accuracies": 0.994413435459137, + "eval_rewards/chosen": -3.3413615226745605, + "eval_rewards/margins": 13.533792495727539, + "eval_rewards/rejected": -16.87515640258789, + "eval_runtime": 153.9099, + "eval_samples_per_second": 18.595, + "eval_steps_per_second": 1.163, + "step": 2100 + }, + { + "epoch": 0.96, + "learning_rate": 2.2628107559614406e-07, + "logits/chosen": -1.7645127773284912, + "logits/rejected": -1.6967523097991943, + "logps/chosen": -59.54228973388672, + "logps/rejected": -93.48818969726562, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.765673279762268, + "rewards/margins": 15.079645156860352, + "rewards/rejected": -16.845317840576172, + "step": 2110 + }, + { + "epoch": 0.97, + "learning_rate": 2.2577371892440383e-07, + "logits/chosen": -1.7425806522369385, + "logits/rejected": -1.658601999282837, + "logps/chosen": -55.3408088684082, + "logps/rejected": -92.50639343261719, + "loss": 0.0019, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.3979063034057617, + "rewards/margins": 15.3561429977417, + "rewards/rejected": -16.75404930114746, + "step": 2120 + }, + { + "epoch": 0.97, + "learning_rate": 2.252663622526636e-07, + "logits/chosen": -1.7674560546875, + "logits/rejected": -1.6785967350006104, + "logps/chosen": -59.449562072753906, + "logps/rejected": -94.57528686523438, + "loss": 0.0058, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.4636263847351074, + "rewards/margins": 15.344259262084961, + "rewards/rejected": -16.807886123657227, + "step": 2130 + }, + { + "epoch": 0.98, + "learning_rate": 2.2475900558092336e-07, + "logits/chosen": -1.7418906688690186, + "logits/rejected": -1.6623594760894775, + "logps/chosen": -57.31889724731445, + "logps/rejected": -90.8071060180664, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.710715889930725, + "rewards/margins": 14.743158340454102, + "rewards/rejected": -16.453876495361328, + "step": 2140 + }, + { + "epoch": 0.98, + "learning_rate": 2.2425164890918313e-07, + "logits/chosen": -1.7725855112075806, + "logits/rejected": -1.6637465953826904, + "logps/chosen": -64.91510772705078, + "logps/rejected": -95.10658264160156, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4892213344573975, + "rewards/margins": 15.396344184875488, + "rewards/rejected": -16.885562896728516, + "step": 2150 + }, + { + "epoch": 0.99, + "learning_rate": 2.237442922374429e-07, + "logits/chosen": -1.7567058801651, + "logits/rejected": -1.6639419794082642, + "logps/chosen": -55.096824645996094, + "logps/rejected": -89.47212219238281, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8129856586456299, + "rewards/margins": 14.918745040893555, + "rewards/rejected": -16.731731414794922, + "step": 2160 + }, + { + "epoch": 0.99, + "learning_rate": 2.2323693556570266e-07, + "logits/chosen": -1.7617263793945312, + "logits/rejected": -1.658855676651001, + "logps/chosen": -58.96918869018555, + "logps/rejected": -95.55270385742188, + "loss": 0.0048, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.275040626525879, + "rewards/margins": 15.53173828125, + "rewards/rejected": -16.806777954101562, + "step": 2170 + }, + { + "epoch": 1.0, + "learning_rate": 2.2272957889396242e-07, + "logits/chosen": -1.7455657720565796, + "logits/rejected": -1.6588319540023804, + "logps/chosen": -57.32484817504883, + "logps/rejected": -94.15316009521484, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5650216341018677, + "rewards/margins": 15.224810600280762, + "rewards/rejected": -16.789833068847656, + "step": 2180 + }, + { + "epoch": 1.0, + "learning_rate": 2.222222222222222e-07, + "logits/chosen": -1.7567503452301025, + "logits/rejected": -1.6721036434173584, + "logps/chosen": -55.82500457763672, + "logps/rejected": -87.40864562988281, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7859405279159546, + "rewards/margins": 14.830865859985352, + "rewards/rejected": -16.616806030273438, + "step": 2190 + }, + { + "epoch": 1.0, + "learning_rate": 2.2171486555048196e-07, + "logits/chosen": -1.780106544494629, + "logits/rejected": -1.6833308935165405, + "logps/chosen": -59.21906661987305, + "logps/rejected": -97.84840393066406, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1965326070785522, + "rewards/margins": 16.255271911621094, + "rewards/rejected": -17.45180320739746, + "step": 2200 + }, + { + "epoch": 1.0, + "eval_logits/chosen": -1.5444055795669556, + "eval_logits/rejected": -1.4629565477371216, + "eval_logps/chosen": -82.37531280517578, + "eval_logps/rejected": -93.55461883544922, + "eval_loss": 0.005960206501185894, + "eval_rewards/accuracies": 0.994413435459137, + "eval_rewards/chosen": -3.1966702938079834, + "eval_rewards/margins": 13.628522872924805, + "eval_rewards/rejected": -16.825193405151367, + "eval_runtime": 141.3335, + "eval_samples_per_second": 20.25, + "eval_steps_per_second": 1.267, + "step": 2200 + }, + { + "epoch": 1.01, + "learning_rate": 2.2120750887874172e-07, + "logits/chosen": -1.7576357126235962, + "logits/rejected": -1.6710838079452515, + "logps/chosen": -59.391929626464844, + "logps/rejected": -92.55592346191406, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5112824440002441, + "rewards/margins": 15.496170043945312, + "rewards/rejected": -17.0074520111084, + "step": 2210 + }, + { + "epoch": 1.01, + "learning_rate": 2.207001522070015e-07, + "logits/chosen": -1.7698043584823608, + "logits/rejected": -1.681131362915039, + "logps/chosen": -56.185142517089844, + "logps/rejected": -92.93355560302734, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5311224460601807, + "rewards/margins": 15.489564895629883, + "rewards/rejected": -17.02068519592285, + "step": 2220 + }, + { + "epoch": 1.02, + "learning_rate": 2.2019279553526126e-07, + "logits/chosen": -1.749076247215271, + "logits/rejected": -1.6408894062042236, + "logps/chosen": -58.44233322143555, + "logps/rejected": -85.44766998291016, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5151691436767578, + "rewards/margins": 14.953633308410645, + "rewards/rejected": -16.468799591064453, + "step": 2230 + }, + { + "epoch": 1.02, + "learning_rate": 2.1968543886352102e-07, + "logits/chosen": -1.7423725128173828, + "logits/rejected": -1.660238265991211, + "logps/chosen": -56.360084533691406, + "logps/rejected": -90.83768463134766, + "loss": 0.0036, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.8068774938583374, + "rewards/margins": 15.062307357788086, + "rewards/rejected": -16.869184494018555, + "step": 2240 + }, + { + "epoch": 1.03, + "learning_rate": 2.191780821917808e-07, + "logits/chosen": -1.745189905166626, + "logits/rejected": -1.669600248336792, + "logps/chosen": -56.39533615112305, + "logps/rejected": -91.84968566894531, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8725141286849976, + "rewards/margins": 14.959602355957031, + "rewards/rejected": -16.832117080688477, + "step": 2250 + }, + { + "epoch": 1.03, + "learning_rate": 2.1867072552004056e-07, + "logits/chosen": -1.7463161945343018, + "logits/rejected": -1.6642965078353882, + "logps/chosen": -57.425537109375, + "logps/rejected": -91.81517028808594, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.558890461921692, + "rewards/margins": 15.99829387664795, + "rewards/rejected": -17.557186126708984, + "step": 2260 + }, + { + "epoch": 1.04, + "learning_rate": 2.1816336884830032e-07, + "logits/chosen": -1.7645819187164307, + "logits/rejected": -1.6817333698272705, + "logps/chosen": -55.57514190673828, + "logps/rejected": -91.8698501586914, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1787233352661133, + "rewards/margins": 16.134780883789062, + "rewards/rejected": -17.31350326538086, + "step": 2270 + }, + { + "epoch": 1.04, + "learning_rate": 2.176560121765601e-07, + "logits/chosen": -1.7538105249404907, + "logits/rejected": -1.6650365591049194, + "logps/chosen": -59.37980270385742, + "logps/rejected": -94.99974060058594, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2272608280181885, + "rewards/margins": 15.5308198928833, + "rewards/rejected": -16.758081436157227, + "step": 2280 + }, + { + "epoch": 1.05, + "learning_rate": 2.1714865550481986e-07, + "logits/chosen": -1.7885656356811523, + "logits/rejected": -1.6814218759536743, + "logps/chosen": -55.93397903442383, + "logps/rejected": -94.24886322021484, + "loss": 0.0045, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.3392460346221924, + "rewards/margins": 16.647480010986328, + "rewards/rejected": -17.98672866821289, + "step": 2290 + }, + { + "epoch": 1.05, + "learning_rate": 2.1664129883307962e-07, + "logits/chosen": -1.728960633277893, + "logits/rejected": -1.662274718284607, + "logps/chosen": -57.202308654785156, + "logps/rejected": -98.20774841308594, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6883776187896729, + "rewards/margins": 16.555070877075195, + "rewards/rejected": -18.24344825744629, + "step": 2300 + }, + { + "epoch": 1.05, + "eval_logits/chosen": -1.5466911792755127, + "eval_logits/rejected": -1.4644973278045654, + "eval_logps/chosen": -83.1010513305664, + "eval_logps/rejected": -95.12531280517578, + "eval_loss": 0.006303800735622644, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": -3.5595388412475586, + "eval_rewards/margins": 14.051005363464355, + "eval_rewards/rejected": -17.610544204711914, + "eval_runtime": 151.7854, + "eval_samples_per_second": 18.856, + "eval_steps_per_second": 1.179, + "step": 2300 + }, + { + "epoch": 1.05, + "learning_rate": 2.161339421613394e-07, + "logits/chosen": -1.7632097005844116, + "logits/rejected": -1.6761329174041748, + "logps/chosen": -57.0994873046875, + "logps/rejected": -96.46761322021484, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.760319709777832, + "rewards/margins": 16.308773040771484, + "rewards/rejected": -18.069091796875, + "step": 2310 + }, + { + "epoch": 1.06, + "learning_rate": 2.1562658548959916e-07, + "logits/chosen": -1.7615985870361328, + "logits/rejected": -1.6536455154418945, + "logps/chosen": -61.092079162597656, + "logps/rejected": -94.53724670410156, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4904835224151611, + "rewards/margins": 16.616823196411133, + "rewards/rejected": -18.1073055267334, + "step": 2320 + }, + { + "epoch": 1.06, + "learning_rate": 2.1511922881785892e-07, + "logits/chosen": -1.7435195446014404, + "logits/rejected": -1.6646251678466797, + "logps/chosen": -55.811553955078125, + "logps/rejected": -95.2570571899414, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7704877853393555, + "rewards/margins": 15.985902786254883, + "rewards/rejected": -17.756389617919922, + "step": 2330 + }, + { + "epoch": 1.07, + "learning_rate": 2.146118721461187e-07, + "logits/chosen": -1.7589401006698608, + "logits/rejected": -1.663800835609436, + "logps/chosen": -60.44816207885742, + "logps/rejected": -98.11687469482422, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8989307880401611, + "rewards/margins": 16.624099731445312, + "rewards/rejected": -18.523029327392578, + "step": 2340 + }, + { + "epoch": 1.07, + "learning_rate": 2.1410451547437846e-07, + "logits/chosen": -1.753797173500061, + "logits/rejected": -1.6788421869277954, + "logps/chosen": -56.67557907104492, + "logps/rejected": -98.96099090576172, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7312290668487549, + "rewards/margins": 16.829326629638672, + "rewards/rejected": -18.560556411743164, + "step": 2350 + }, + { + "epoch": 1.08, + "learning_rate": 2.1359715880263822e-07, + "logits/chosen": -1.7726614475250244, + "logits/rejected": -1.6805782318115234, + "logps/chosen": -57.83234786987305, + "logps/rejected": -95.62266540527344, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7165930271148682, + "rewards/margins": 16.315860748291016, + "rewards/rejected": -18.032451629638672, + "step": 2360 + }, + { + "epoch": 1.08, + "learning_rate": 2.13089802130898e-07, + "logits/chosen": -1.759373664855957, + "logits/rejected": -1.6703014373779297, + "logps/chosen": -54.686668395996094, + "logps/rejected": -95.24959564208984, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6667108535766602, + "rewards/margins": 17.19411849975586, + "rewards/rejected": -18.860828399658203, + "step": 2370 + }, + { + "epoch": 1.09, + "learning_rate": 2.1258244545915776e-07, + "logits/chosen": -1.7582851648330688, + "logits/rejected": -1.6722888946533203, + "logps/chosen": -58.01994705200195, + "logps/rejected": -97.08244323730469, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7737096548080444, + "rewards/margins": 16.389488220214844, + "rewards/rejected": -18.163196563720703, + "step": 2380 + }, + { + "epoch": 1.09, + "learning_rate": 2.1207508878741752e-07, + "logits/chosen": -1.7361366748809814, + "logits/rejected": -1.6514666080474854, + "logps/chosen": -56.37322998046875, + "logps/rejected": -87.77371978759766, + "loss": 0.0044, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.5976836681365967, + "rewards/margins": 15.749250411987305, + "rewards/rejected": -18.346933364868164, + "step": 2390 + }, + { + "epoch": 1.1, + "learning_rate": 2.115677321156773e-07, + "logits/chosen": -1.7475221157073975, + "logits/rejected": -1.6648136377334595, + "logps/chosen": -60.138282775878906, + "logps/rejected": -98.83917236328125, + "loss": 0.0055, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.728782057762146, + "rewards/margins": 17.035816192626953, + "rewards/rejected": -18.764598846435547, + "step": 2400 + }, + { + "epoch": 1.1, + "eval_logits/chosen": -1.5441038608551025, + "eval_logits/rejected": -1.4605567455291748, + "eval_logps/chosen": -84.07401275634766, + "eval_logps/rejected": -97.23653411865234, + "eval_loss": 0.007000993005931377, + "eval_rewards/accuracies": 0.994413435459137, + "eval_rewards/chosen": -4.046020030975342, + "eval_rewards/margins": 14.620133399963379, + "eval_rewards/rejected": -18.666152954101562, + "eval_runtime": 146.16, + "eval_samples_per_second": 19.581, + "eval_steps_per_second": 1.225, + "step": 2400 + }, + { + "epoch": 1.1, + "learning_rate": 2.1106037544393706e-07, + "logits/chosen": -1.764243483543396, + "logits/rejected": -1.6860980987548828, + "logps/chosen": -56.706016540527344, + "logps/rejected": -101.9083251953125, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.575025200843811, + "rewards/margins": 17.899951934814453, + "rewards/rejected": -19.474979400634766, + "step": 2410 + }, + { + "epoch": 1.1, + "learning_rate": 2.1055301877219682e-07, + "logits/chosen": -1.787570595741272, + "logits/rejected": -1.6699540615081787, + "logps/chosen": -62.099884033203125, + "logps/rejected": -95.29823303222656, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.684003472328186, + "rewards/margins": 17.041250228881836, + "rewards/rejected": -18.725255966186523, + "step": 2420 + }, + { + "epoch": 1.11, + "learning_rate": 2.100456621004566e-07, + "logits/chosen": -1.7343931198120117, + "logits/rejected": -1.6694958209991455, + "logps/chosen": -53.398345947265625, + "logps/rejected": -93.80493927001953, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8752937316894531, + "rewards/margins": 17.16379165649414, + "rewards/rejected": -19.03908348083496, + "step": 2430 + }, + { + "epoch": 1.11, + "learning_rate": 2.0953830542871636e-07, + "logits/chosen": -1.7479832172393799, + "logits/rejected": -1.6645724773406982, + "logps/chosen": -59.2391242980957, + "logps/rejected": -97.51466369628906, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8753879070281982, + "rewards/margins": 16.62053871154785, + "rewards/rejected": -18.495925903320312, + "step": 2440 + }, + { + "epoch": 1.12, + "learning_rate": 2.0903094875697612e-07, + "logits/chosen": -1.7523056268692017, + "logits/rejected": -1.642364263534546, + "logps/chosen": -62.688987731933594, + "logps/rejected": -97.38086700439453, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8174155950546265, + "rewards/margins": 17.393173217773438, + "rewards/rejected": -19.210586547851562, + "step": 2450 + }, + { + "epoch": 1.12, + "learning_rate": 2.085235920852359e-07, + "logits/chosen": -1.7349144220352173, + "logits/rejected": -1.6552202701568604, + "logps/chosen": -58.827049255371094, + "logps/rejected": -96.5774917602539, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0927205085754395, + "rewards/margins": 17.134923934936523, + "rewards/rejected": -19.22764778137207, + "step": 2460 + }, + { + "epoch": 1.13, + "learning_rate": 2.0801623541349566e-07, + "logits/chosen": -1.7906172275543213, + "logits/rejected": -1.6924254894256592, + "logps/chosen": -55.4560661315918, + "logps/rejected": -94.42463684082031, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2818658351898193, + "rewards/margins": 16.96918296813965, + "rewards/rejected": -18.251049041748047, + "step": 2470 + }, + { + "epoch": 1.13, + "learning_rate": 2.0750887874175542e-07, + "logits/chosen": -1.7605549097061157, + "logits/rejected": -1.677120566368103, + "logps/chosen": -55.66670608520508, + "logps/rejected": -99.32123565673828, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1595937013626099, + "rewards/margins": 17.582664489746094, + "rewards/rejected": -18.742259979248047, + "step": 2480 + }, + { + "epoch": 1.14, + "learning_rate": 2.070015220700152e-07, + "logits/chosen": -1.7633788585662842, + "logits/rejected": -1.6667747497558594, + "logps/chosen": -57.516387939453125, + "logps/rejected": -93.65599822998047, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4830613136291504, + "rewards/margins": 16.72182846069336, + "rewards/rejected": -18.204891204833984, + "step": 2490 + }, + { + "epoch": 1.14, + "learning_rate": 2.0649416539827496e-07, + "logits/chosen": -1.7710567712783813, + "logits/rejected": -1.6819353103637695, + "logps/chosen": -57.67155075073242, + "logps/rejected": -93.34293365478516, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1700657606124878, + "rewards/margins": 16.193981170654297, + "rewards/rejected": -17.364046096801758, + "step": 2500 + }, + { + "epoch": 1.14, + "eval_logits/chosen": -1.550737977027893, + "eval_logits/rejected": -1.4679088592529297, + "eval_logps/chosen": -82.61907196044922, + "eval_logps/rejected": -95.11019134521484, + "eval_loss": 0.006657297257333994, + "eval_rewards/accuracies": 0.994413435459137, + "eval_rewards/chosen": -3.3185434341430664, + "eval_rewards/margins": 14.284436225891113, + "eval_rewards/rejected": -17.602981567382812, + "eval_runtime": 155.6342, + "eval_samples_per_second": 18.389, + "eval_steps_per_second": 1.15, + "step": 2500 + }, + { + "epoch": 1.15, + "learning_rate": 2.0598680872653472e-07, + "logits/chosen": -1.7823295593261719, + "logits/rejected": -1.68827223777771, + "logps/chosen": -58.17741775512695, + "logps/rejected": -93.47938537597656, + "loss": 0.0039, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.1728184223175049, + "rewards/margins": 16.308216094970703, + "rewards/rejected": -17.48103141784668, + "step": 2510 + }, + { + "epoch": 1.15, + "learning_rate": 2.054794520547945e-07, + "logits/chosen": -1.7661288976669312, + "logits/rejected": -1.6686477661132812, + "logps/chosen": -54.59541702270508, + "logps/rejected": -92.54632568359375, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8793942332267761, + "rewards/margins": 16.468664169311523, + "rewards/rejected": -17.34805679321289, + "step": 2520 + }, + { + "epoch": 1.15, + "learning_rate": 2.0497209538305426e-07, + "logits/chosen": -1.7487903833389282, + "logits/rejected": -1.6530921459197998, + "logps/chosen": -56.20011520385742, + "logps/rejected": -93.74838256835938, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4012267589569092, + "rewards/margins": 17.030271530151367, + "rewards/rejected": -18.431495666503906, + "step": 2530 + }, + { + "epoch": 1.16, + "learning_rate": 2.0446473871131402e-07, + "logits/chosen": -1.7581230401992798, + "logits/rejected": -1.6596683263778687, + "logps/chosen": -60.82426071166992, + "logps/rejected": -99.79011535644531, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3703958988189697, + "rewards/margins": 16.5430965423584, + "rewards/rejected": -17.91349220275879, + "step": 2540 + }, + { + "epoch": 1.16, + "learning_rate": 2.039573820395738e-07, + "logits/chosen": -1.7701380252838135, + "logits/rejected": -1.6606664657592773, + "logps/chosen": -59.037353515625, + "logps/rejected": -94.49665069580078, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0363215208053589, + "rewards/margins": 16.670988082885742, + "rewards/rejected": -17.70730972290039, + "step": 2550 + }, + { + "epoch": 1.17, + "learning_rate": 2.0345002536783356e-07, + "logits/chosen": -1.749725341796875, + "logits/rejected": -1.6626144647598267, + "logps/chosen": -55.888938903808594, + "logps/rejected": -90.47606658935547, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.187700629234314, + "rewards/margins": 16.13033676147461, + "rewards/rejected": -17.318037033081055, + "step": 2560 + }, + { + "epoch": 1.17, + "learning_rate": 2.0294266869609332e-07, + "logits/chosen": -1.7732551097869873, + "logits/rejected": -1.6830087900161743, + "logps/chosen": -56.351524353027344, + "logps/rejected": -92.32308197021484, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5203639268875122, + "rewards/margins": 16.521289825439453, + "rewards/rejected": -18.04165267944336, + "step": 2570 + }, + { + "epoch": 1.18, + "learning_rate": 2.024353120243531e-07, + "logits/chosen": -1.7540162801742554, + "logits/rejected": -1.673253059387207, + "logps/chosen": -52.99707794189453, + "logps/rejected": -97.1292724609375, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9613077044487, + "rewards/margins": 16.9852237701416, + "rewards/rejected": -17.946529388427734, + "step": 2580 + }, + { + "epoch": 1.18, + "learning_rate": 2.0192795535261286e-07, + "logits/chosen": -1.7799053192138672, + "logits/rejected": -1.6866953372955322, + "logps/chosen": -57.35761642456055, + "logps/rejected": -97.79578399658203, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8344005346298218, + "rewards/margins": 18.256929397583008, + "rewards/rejected": -19.091327667236328, + "step": 2590 + }, + { + "epoch": 1.19, + "learning_rate": 2.0142059868087262e-07, + "logits/chosen": -1.7495887279510498, + "logits/rejected": -1.670689344406128, + "logps/chosen": -57.3239860534668, + "logps/rejected": -94.37218475341797, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2658514976501465, + "rewards/margins": 16.88041114807129, + "rewards/rejected": -18.14626121520996, + "step": 2600 + }, + { + "epoch": 1.19, + "eval_logits/chosen": -1.5500638484954834, + "eval_logits/rejected": -1.466713309288025, + "eval_logps/chosen": -82.79623413085938, + "eval_logps/rejected": -96.38542175292969, + "eval_loss": 0.006424016784876585, + "eval_rewards/accuracies": 0.994413435459137, + "eval_rewards/chosen": -3.4071311950683594, + "eval_rewards/margins": 14.833463668823242, + "eval_rewards/rejected": -18.240596771240234, + "eval_runtime": 153.5802, + "eval_samples_per_second": 18.635, + "eval_steps_per_second": 1.166, + "step": 2600 + }, + { + "epoch": 1.19, + "learning_rate": 2.009132420091324e-07, + "logits/chosen": -1.7762126922607422, + "logits/rejected": -1.6902281045913696, + "logps/chosen": -53.2180290222168, + "logps/rejected": -96.89413452148438, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0934245586395264, + "rewards/margins": 17.65363883972168, + "rewards/rejected": -18.74706268310547, + "step": 2610 + }, + { + "epoch": 1.2, + "learning_rate": 2.0040588533739216e-07, + "logits/chosen": -1.7808773517608643, + "logits/rejected": -1.6828012466430664, + "logps/chosen": -59.0372428894043, + "logps/rejected": -96.43038177490234, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2977206707000732, + "rewards/margins": 17.016761779785156, + "rewards/rejected": -18.31447982788086, + "step": 2620 + }, + { + "epoch": 1.2, + "learning_rate": 1.9989852866565192e-07, + "logits/chosen": -1.734175443649292, + "logits/rejected": -1.6448099613189697, + "logps/chosen": -57.11977005004883, + "logps/rejected": -95.9421615600586, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2607675790786743, + "rewards/margins": 17.131710052490234, + "rewards/rejected": -18.39247703552246, + "step": 2630 + }, + { + "epoch": 1.21, + "learning_rate": 1.993911719939117e-07, + "logits/chosen": -1.7802501916885376, + "logits/rejected": -1.6809139251708984, + "logps/chosen": -55.82944869995117, + "logps/rejected": -91.70013427734375, + "loss": 0.0042, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.4456136226654053, + "rewards/margins": 16.31406021118164, + "rewards/rejected": -17.759674072265625, + "step": 2640 + }, + { + "epoch": 1.21, + "learning_rate": 1.9888381532217146e-07, + "logits/chosen": -1.7335536479949951, + "logits/rejected": -1.661350965499878, + "logps/chosen": -53.10137939453125, + "logps/rejected": -95.90177154541016, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.789583444595337, + "rewards/margins": 17.921833038330078, + "rewards/rejected": -19.711416244506836, + "step": 2650 + }, + { + "epoch": 1.21, + "learning_rate": 1.9837645865043122e-07, + "logits/chosen": -1.750737190246582, + "logits/rejected": -1.6568177938461304, + "logps/chosen": -55.086997985839844, + "logps/rejected": -95.12551879882812, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4792072772979736, + "rewards/margins": 18.116641998291016, + "rewards/rejected": -19.595848083496094, + "step": 2660 + }, + { + "epoch": 1.22, + "learning_rate": 1.97869101978691e-07, + "logits/chosen": -1.7495743036270142, + "logits/rejected": -1.6731303930282593, + "logps/chosen": -55.61684036254883, + "logps/rejected": -96.10287475585938, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7537901401519775, + "rewards/margins": 17.751415252685547, + "rewards/rejected": -19.505207061767578, + "step": 2670 + }, + { + "epoch": 1.22, + "learning_rate": 1.9736174530695076e-07, + "logits/chosen": -1.7520612478256226, + "logits/rejected": -1.6678498983383179, + "logps/chosen": -55.18487548828125, + "logps/rejected": -96.980224609375, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.197737455368042, + "rewards/margins": 18.08832550048828, + "rewards/rejected": -19.286062240600586, + "step": 2680 + }, + { + "epoch": 1.23, + "learning_rate": 1.9685438863521052e-07, + "logits/chosen": -1.7610679864883423, + "logits/rejected": -1.6806461811065674, + "logps/chosen": -55.430091857910156, + "logps/rejected": -97.28132629394531, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.306195855140686, + "rewards/margins": 18.163005828857422, + "rewards/rejected": -19.469202041625977, + "step": 2690 + }, + { + "epoch": 1.23, + "learning_rate": 1.963470319634703e-07, + "logits/chosen": -1.8031642436981201, + "logits/rejected": -1.7114074230194092, + "logps/chosen": -59.132850646972656, + "logps/rejected": -95.17094421386719, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.488271951675415, + "rewards/margins": 17.14899253845215, + "rewards/rejected": -18.637264251708984, + "step": 2700 + }, + { + "epoch": 1.23, + "eval_logits/chosen": -1.5495989322662354, + "eval_logits/rejected": -1.4647303819656372, + "eval_logps/chosen": -84.65057373046875, + "eval_logps/rejected": -99.30116271972656, + "eval_loss": 0.00904226116836071, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": -4.334300518035889, + "eval_rewards/margins": 15.36416244506836, + "eval_rewards/rejected": -19.69846534729004, + "eval_runtime": 147.385, + "eval_samples_per_second": 19.419, + "eval_steps_per_second": 1.215, + "step": 2700 + }, + { + "epoch": 1.24, + "learning_rate": 1.9583967529173006e-07, + "logits/chosen": -1.7910597324371338, + "logits/rejected": -1.6921383142471313, + "logps/chosen": -58.234352111816406, + "logps/rejected": -97.64838409423828, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6721891164779663, + "rewards/margins": 17.89757537841797, + "rewards/rejected": -19.56976318359375, + "step": 2710 + }, + { + "epoch": 1.24, + "learning_rate": 1.9533231861998982e-07, + "logits/chosen": -1.7883247137069702, + "logits/rejected": -1.6830103397369385, + "logps/chosen": -61.363319396972656, + "logps/rejected": -100.30858612060547, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0055155754089355, + "rewards/margins": 18.509708404541016, + "rewards/rejected": -20.51522445678711, + "step": 2720 + }, + { + "epoch": 1.25, + "learning_rate": 1.948249619482496e-07, + "logits/chosen": -1.751837968826294, + "logits/rejected": -1.6661630868911743, + "logps/chosen": -55.377769470214844, + "logps/rejected": -95.15009307861328, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.795588731765747, + "rewards/margins": 18.258634567260742, + "rewards/rejected": -20.054224014282227, + "step": 2730 + }, + { + "epoch": 1.25, + "learning_rate": 1.9431760527650936e-07, + "logits/chosen": -1.7812420129776, + "logits/rejected": -1.6635338068008423, + "logps/chosen": -62.19695281982422, + "logps/rejected": -100.60261535644531, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6461451053619385, + "rewards/margins": 18.79757308959961, + "rewards/rejected": -20.44371795654297, + "step": 2740 + }, + { + "epoch": 1.26, + "learning_rate": 1.9381024860476912e-07, + "logits/chosen": -1.7579082250595093, + "logits/rejected": -1.6530691385269165, + "logps/chosen": -57.74163818359375, + "logps/rejected": -98.6023178100586, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6989234685897827, + "rewards/margins": 19.26529312133789, + "rewards/rejected": -20.964216232299805, + "step": 2750 + }, + { + "epoch": 1.26, + "learning_rate": 1.933028919330289e-07, + "logits/chosen": -1.7814481258392334, + "logits/rejected": -1.6904300451278687, + "logps/chosen": -57.6041374206543, + "logps/rejected": -98.92882537841797, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9272903203964233, + "rewards/margins": 19.180126190185547, + "rewards/rejected": -21.107418060302734, + "step": 2760 + }, + { + "epoch": 1.26, + "learning_rate": 1.9279553526128866e-07, + "logits/chosen": -1.7679035663604736, + "logits/rejected": -1.6667778491973877, + "logps/chosen": -59.31825637817383, + "logps/rejected": -99.1368408203125, + "loss": 0.0043, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.5422855615615845, + "rewards/margins": 19.345226287841797, + "rewards/rejected": -20.88751220703125, + "step": 2770 + }, + { + "epoch": 1.27, + "learning_rate": 1.9228817858954842e-07, + "logits/chosen": -1.770992636680603, + "logits/rejected": -1.6852118968963623, + "logps/chosen": -56.10600662231445, + "logps/rejected": -98.84684753417969, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3405952453613281, + "rewards/margins": 18.240341186523438, + "rewards/rejected": -19.580936431884766, + "step": 2780 + }, + { + "epoch": 1.27, + "learning_rate": 1.917808219178082e-07, + "logits/chosen": -1.748457908630371, + "logits/rejected": -1.6733148097991943, + "logps/chosen": -53.041015625, + "logps/rejected": -100.11251068115234, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6683629751205444, + "rewards/margins": 18.404541015625, + "rewards/rejected": -20.072906494140625, + "step": 2790 + }, + { + "epoch": 1.28, + "learning_rate": 1.9127346524606796e-07, + "logits/chosen": -1.778543472290039, + "logits/rejected": -1.6833875179290771, + "logps/chosen": -57.812828063964844, + "logps/rejected": -96.30428314208984, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4683773517608643, + "rewards/margins": 17.575725555419922, + "rewards/rejected": -19.04410171508789, + "step": 2800 + }, + { + "epoch": 1.28, + "eval_logits/chosen": -1.5407549142837524, + "eval_logits/rejected": -1.4569265842437744, + "eval_logps/chosen": -85.26310729980469, + "eval_logps/rejected": -99.38053131103516, + "eval_loss": 0.011327545158565044, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": -4.640564918518066, + "eval_rewards/margins": 15.097585678100586, + "eval_rewards/rejected": -19.73814582824707, + "eval_runtime": 149.334, + "eval_samples_per_second": 19.165, + "eval_steps_per_second": 1.199, + "step": 2800 + }, + { + "epoch": 1.28, + "learning_rate": 1.9076610857432772e-07, + "logits/chosen": -1.7536699771881104, + "logits/rejected": -1.6619449853897095, + "logps/chosen": -56.8665771484375, + "logps/rejected": -99.80836486816406, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7938029766082764, + "rewards/margins": 18.09140396118164, + "rewards/rejected": -19.885204315185547, + "step": 2810 + }, + { + "epoch": 1.29, + "learning_rate": 1.902587519025875e-07, + "logits/chosen": -1.7796272039413452, + "logits/rejected": -1.673616647720337, + "logps/chosen": -61.139610290527344, + "logps/rejected": -97.94349670410156, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6869022846221924, + "rewards/margins": 18.66249656677246, + "rewards/rejected": -20.34939956665039, + "step": 2820 + }, + { + "epoch": 1.29, + "learning_rate": 1.8975139523084726e-07, + "logits/chosen": -1.7451549768447876, + "logits/rejected": -1.665014624595642, + "logps/chosen": -59.975379943847656, + "logps/rejected": -101.70437622070312, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.056490421295166, + "rewards/margins": 18.601909637451172, + "rewards/rejected": -20.658397674560547, + "step": 2830 + }, + { + "epoch": 1.3, + "learning_rate": 1.8924403855910702e-07, + "logits/chosen": -1.7664998769760132, + "logits/rejected": -1.6723295450210571, + "logps/chosen": -57.23393630981445, + "logps/rejected": -94.91930389404297, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.884436011314392, + "rewards/margins": 17.427194595336914, + "rewards/rejected": -19.311628341674805, + "step": 2840 + }, + { + "epoch": 1.3, + "learning_rate": 1.887366818873668e-07, + "logits/chosen": -1.7805742025375366, + "logits/rejected": -1.6844866275787354, + "logps/chosen": -58.0041618347168, + "logps/rejected": -97.0829849243164, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.111764907836914, + "rewards/margins": 17.889118194580078, + "rewards/rejected": -19.00088119506836, + "step": 2850 + }, + { + "epoch": 1.31, + "learning_rate": 1.8822932521562656e-07, + "logits/chosen": -1.7825946807861328, + "logits/rejected": -1.6918452978134155, + "logps/chosen": -56.925636291503906, + "logps/rejected": -97.35308074951172, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9460649490356445, + "rewards/margins": 17.18486976623535, + "rewards/rejected": -18.130931854248047, + "step": 2860 + }, + { + "epoch": 1.31, + "learning_rate": 1.8772196854388632e-07, + "logits/chosen": -1.7390912771224976, + "logits/rejected": -1.6564744710922241, + "logps/chosen": -57.98912811279297, + "logps/rejected": -96.44007873535156, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5826243162155151, + "rewards/margins": 16.839244842529297, + "rewards/rejected": -18.4218692779541, + "step": 2870 + }, + { + "epoch": 1.31, + "learning_rate": 1.872146118721461e-07, + "logits/chosen": -1.7849754095077515, + "logits/rejected": -1.672852873802185, + "logps/chosen": -61.340476989746094, + "logps/rejected": -93.8963851928711, + "loss": 0.0011, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.613435983657837, + "rewards/margins": 17.021175384521484, + "rewards/rejected": -18.634611129760742, + "step": 2880 + }, + { + "epoch": 1.32, + "learning_rate": 1.8670725520040586e-07, + "logits/chosen": -1.750396728515625, + "logits/rejected": -1.6511199474334717, + "logps/chosen": -58.67247772216797, + "logps/rejected": -96.80357360839844, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.338128685951233, + "rewards/margins": 17.583751678466797, + "rewards/rejected": -18.9218807220459, + "step": 2890 + }, + { + "epoch": 1.32, + "learning_rate": 1.8619989852866562e-07, + "logits/chosen": -1.7634893655776978, + "logits/rejected": -1.6787548065185547, + "logps/chosen": -55.73259353637695, + "logps/rejected": -94.75193786621094, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5130577087402344, + "rewards/margins": 18.2349853515625, + "rewards/rejected": -19.748043060302734, + "step": 2900 + }, + { + "epoch": 1.32, + "eval_logits/chosen": -1.544886589050293, + "eval_logits/rejected": -1.4612011909484863, + "eval_logps/chosen": -83.85013580322266, + "eval_logps/rejected": -98.73179626464844, + "eval_loss": 0.006982807535678148, + "eval_rewards/accuracies": 0.994413435459137, + "eval_rewards/chosen": -3.9340834617614746, + "eval_rewards/margins": 15.479698181152344, + "eval_rewards/rejected": -19.413782119750977, + "eval_runtime": 144.0378, + "eval_samples_per_second": 19.87, + "eval_steps_per_second": 1.243, + "step": 2900 + }, + { + "epoch": 1.33, + "learning_rate": 1.856925418569254e-07, + "logits/chosen": -1.7577152252197266, + "logits/rejected": -1.6612701416015625, + "logps/chosen": -61.57184982299805, + "logps/rejected": -98.47500610351562, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2696821689605713, + "rewards/margins": 18.28770637512207, + "rewards/rejected": -19.557388305664062, + "step": 2910 + }, + { + "epoch": 1.33, + "learning_rate": 1.8518518518518516e-07, + "logits/chosen": -1.744296669960022, + "logits/rejected": -1.671971321105957, + "logps/chosen": -54.99235153198242, + "logps/rejected": -90.80326843261719, + "loss": 0.0068, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.0770174264907837, + "rewards/margins": 15.502540588378906, + "rewards/rejected": -16.579557418823242, + "step": 2920 + }, + { + "epoch": 1.34, + "learning_rate": 1.8467782851344492e-07, + "logits/chosen": -1.7668946981430054, + "logits/rejected": -1.6761682033538818, + "logps/chosen": -54.510459899902344, + "logps/rejected": -91.04711151123047, + "loss": 0.0019, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.2146661281585693, + "rewards/margins": 15.932766914367676, + "rewards/rejected": -17.147432327270508, + "step": 2930 + }, + { + "epoch": 1.34, + "learning_rate": 1.841704718417047e-07, + "logits/chosen": -1.7573566436767578, + "logits/rejected": -1.654294729232788, + "logps/chosen": -58.834869384765625, + "logps/rejected": -94.41200256347656, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.412942886352539, + "rewards/margins": 16.644380569458008, + "rewards/rejected": -18.057323455810547, + "step": 2940 + }, + { + "epoch": 1.35, + "learning_rate": 1.8366311516996446e-07, + "logits/chosen": -1.7762553691864014, + "logits/rejected": -1.682163953781128, + "logps/chosen": -54.3316764831543, + "logps/rejected": -90.17029571533203, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1894292831420898, + "rewards/margins": 16.16569709777832, + "rewards/rejected": -17.355127334594727, + "step": 2950 + }, + { + "epoch": 1.35, + "learning_rate": 1.8315575849822422e-07, + "logits/chosen": -1.7757704257965088, + "logits/rejected": -1.684913992881775, + "logps/chosen": -56.261322021484375, + "logps/rejected": -95.15665435791016, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1488529443740845, + "rewards/margins": 16.14897346496582, + "rewards/rejected": -17.297826766967773, + "step": 2960 + }, + { + "epoch": 1.36, + "learning_rate": 1.82648401826484e-07, + "logits/chosen": -1.7451854944229126, + "logits/rejected": -1.6552518606185913, + "logps/chosen": -57.798240661621094, + "logps/rejected": -90.9861068725586, + "loss": 0.0034, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.4594552516937256, + "rewards/margins": 16.58235740661621, + "rewards/rejected": -18.041812896728516, + "step": 2970 + }, + { + "epoch": 1.36, + "learning_rate": 1.8214104515474375e-07, + "logits/chosen": -1.7445427179336548, + "logits/rejected": -1.6590522527694702, + "logps/chosen": -59.61603927612305, + "logps/rejected": -95.45381164550781, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.726776361465454, + "rewards/margins": 17.064167022705078, + "rewards/rejected": -18.790943145751953, + "step": 2980 + }, + { + "epoch": 1.36, + "learning_rate": 1.8163368848300352e-07, + "logits/chosen": -1.7461668252944946, + "logits/rejected": -1.6662800312042236, + "logps/chosen": -56.46452713012695, + "logps/rejected": -97.12699890136719, + "loss": 0.0076, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.519303560256958, + "rewards/margins": 16.730981826782227, + "rewards/rejected": -18.25028419494629, + "step": 2990 + }, + { + "epoch": 1.37, + "learning_rate": 1.811263318112633e-07, + "logits/chosen": -1.761304259300232, + "logits/rejected": -1.683823585510254, + "logps/chosen": -56.9975700378418, + "logps/rejected": -94.61572265625, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3946937322616577, + "rewards/margins": 16.722742080688477, + "rewards/rejected": -18.117435455322266, + "step": 3000 + }, + { + "epoch": 1.37, + "eval_logits/chosen": -1.539894461631775, + "eval_logits/rejected": -1.4586848020553589, + "eval_logps/chosen": -83.39830780029297, + "eval_logps/rejected": -96.94595336914062, + "eval_loss": 0.006562211085110903, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": -3.708167791366577, + "eval_rewards/margins": 14.81269645690918, + "eval_rewards/rejected": -18.520864486694336, + "eval_runtime": 151.5133, + "eval_samples_per_second": 18.889, + "eval_steps_per_second": 1.181, + "step": 3000 + }, + { + "epoch": 1.37, + "learning_rate": 1.8061897513952305e-07, + "logits/chosen": -1.7640758752822876, + "logits/rejected": -1.6685714721679688, + "logps/chosen": -59.41301727294922, + "logps/rejected": -95.84724426269531, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2242387533187866, + "rewards/margins": 17.188039779663086, + "rewards/rejected": -18.412277221679688, + "step": 3010 + }, + { + "epoch": 1.38, + "learning_rate": 1.8011161846778282e-07, + "logits/chosen": -1.7586253881454468, + "logits/rejected": -1.6649078130722046, + "logps/chosen": -58.984031677246094, + "logps/rejected": -94.05524444580078, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5664620399475098, + "rewards/margins": 16.676197052001953, + "rewards/rejected": -18.242660522460938, + "step": 3020 + }, + { + "epoch": 1.38, + "learning_rate": 1.796042617960426e-07, + "logits/chosen": -1.773471474647522, + "logits/rejected": -1.6692659854888916, + "logps/chosen": -61.39658737182617, + "logps/rejected": -95.90094757080078, + "loss": 0.0095, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9589765071868896, + "rewards/margins": 17.034753799438477, + "rewards/rejected": -18.993728637695312, + "step": 3030 + }, + { + "epoch": 1.39, + "learning_rate": 1.7909690512430235e-07, + "logits/chosen": -1.759385108947754, + "logits/rejected": -1.662372350692749, + "logps/chosen": -60.836090087890625, + "logps/rejected": -96.40751647949219, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8023784160614014, + "rewards/margins": 17.685691833496094, + "rewards/rejected": -19.488067626953125, + "step": 3040 + }, + { + "epoch": 1.39, + "learning_rate": 1.7858954845256212e-07, + "logits/chosen": -1.7491194009780884, + "logits/rejected": -1.6780580282211304, + "logps/chosen": -56.577880859375, + "logps/rejected": -98.60438537597656, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.453829050064087, + "rewards/margins": 17.57693862915039, + "rewards/rejected": -19.03076934814453, + "step": 3050 + }, + { + "epoch": 1.4, + "learning_rate": 1.780821917808219e-07, + "logits/chosen": -1.760709524154663, + "logits/rejected": -1.6798269748687744, + "logps/chosen": -58.53704833984375, + "logps/rejected": -99.60615539550781, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9645904302597046, + "rewards/margins": 17.165122985839844, + "rewards/rejected": -19.129714965820312, + "step": 3060 + }, + { + "epoch": 1.4, + "learning_rate": 1.7757483510908165e-07, + "logits/chosen": -1.7557331323623657, + "logits/rejected": -1.6769014596939087, + "logps/chosen": -55.544944763183594, + "logps/rejected": -99.79969787597656, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5912768840789795, + "rewards/margins": 18.37584114074707, + "rewards/rejected": -19.96711540222168, + "step": 3070 + }, + { + "epoch": 1.41, + "learning_rate": 1.7706747843734142e-07, + "logits/chosen": -1.7606086730957031, + "logits/rejected": -1.6541544198989868, + "logps/chosen": -56.68511199951172, + "logps/rejected": -91.82845306396484, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4311389923095703, + "rewards/margins": 17.00944709777832, + "rewards/rejected": -18.44058609008789, + "step": 3080 + }, + { + "epoch": 1.41, + "learning_rate": 1.765601217656012e-07, + "logits/chosen": -1.7437763214111328, + "logits/rejected": -1.653381586074829, + "logps/chosen": -58.844627380371094, + "logps/rejected": -94.74607849121094, + "loss": 0.0056, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.6268081665039062, + "rewards/margins": 16.87070083618164, + "rewards/rejected": -18.497509002685547, + "step": 3090 + }, + { + "epoch": 1.42, + "learning_rate": 1.7605276509386095e-07, + "logits/chosen": -1.7557626962661743, + "logits/rejected": -1.670015573501587, + "logps/chosen": -56.841880798339844, + "logps/rejected": -94.67625427246094, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3221409320831299, + "rewards/margins": 17.122955322265625, + "rewards/rejected": -18.445096969604492, + "step": 3100 + }, + { + "epoch": 1.42, + "eval_logits/chosen": -1.5296785831451416, + "eval_logits/rejected": -1.4479854106903076, + "eval_logps/chosen": -83.32076263427734, + "eval_logps/rejected": -97.17174530029297, + "eval_loss": 0.006353565026074648, + "eval_rewards/accuracies": 0.9972066879272461, + "eval_rewards/chosen": -3.6693906784057617, + "eval_rewards/margins": 14.964364051818848, + "eval_rewards/rejected": -18.633752822875977, + "eval_runtime": 170.5936, + "eval_samples_per_second": 16.777, + "eval_steps_per_second": 1.049, + "step": 3100 + }, + { + "epoch": 1.42, + "learning_rate": 1.7554540842212072e-07, + "logits/chosen": -1.7814935445785522, + "logits/rejected": -1.6783158779144287, + "logps/chosen": -60.32609939575195, + "logps/rejected": -93.3768081665039, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.453685998916626, + "rewards/margins": 16.54983901977539, + "rewards/rejected": -18.00352668762207, + "step": 3110 + }, + { + "epoch": 1.42, + "learning_rate": 1.750380517503805e-07, + "logits/chosen": -1.7732906341552734, + "logits/rejected": -1.6722043752670288, + "logps/chosen": -58.196807861328125, + "logps/rejected": -94.85267639160156, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4471790790557861, + "rewards/margins": 17.081787109375, + "rewards/rejected": -18.52896499633789, + "step": 3120 + }, + { + "epoch": 1.43, + "learning_rate": 1.7453069507864025e-07, + "logits/chosen": -1.7158924341201782, + "logits/rejected": -1.6260392665863037, + "logps/chosen": -57.551734924316406, + "logps/rejected": -91.12532043457031, + "loss": 0.0088, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.9693899154663086, + "rewards/margins": 16.378599166870117, + "rewards/rejected": -18.347990036010742, + "step": 3130 + }, + { + "epoch": 1.43, + "learning_rate": 1.7402333840690002e-07, + "logits/chosen": -1.7346522808074951, + "logits/rejected": -1.6474847793579102, + "logps/chosen": -57.03557205200195, + "logps/rejected": -95.57264709472656, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7559372186660767, + "rewards/margins": 17.30267906188965, + "rewards/rejected": -19.05861473083496, + "step": 3140 + }, + { + "epoch": 1.44, + "learning_rate": 1.735159817351598e-07, + "logits/chosen": -1.7555586099624634, + "logits/rejected": -1.6755212545394897, + "logps/chosen": -55.34453201293945, + "logps/rejected": -100.09938049316406, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3288910388946533, + "rewards/margins": 18.491540908813477, + "rewards/rejected": -19.820430755615234, + "step": 3150 + }, + { + "epoch": 1.44, + "learning_rate": 1.7300862506341955e-07, + "logits/chosen": -1.772899866104126, + "logits/rejected": -1.6719576120376587, + "logps/chosen": -60.80910110473633, + "logps/rejected": -97.62386322021484, + "loss": 0.0065, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.793237328529358, + "rewards/margins": 18.663257598876953, + "rewards/rejected": -20.456493377685547, + "step": 3160 + }, + { + "epoch": 1.45, + "learning_rate": 1.7250126839167932e-07, + "logits/chosen": -1.7909915447235107, + "logits/rejected": -1.6983429193496704, + "logps/chosen": -61.983421325683594, + "logps/rejected": -97.15614318847656, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7546970844268799, + "rewards/margins": 17.051715850830078, + "rewards/rejected": -18.806411743164062, + "step": 3170 + }, + { + "epoch": 1.45, + "learning_rate": 1.719939117199391e-07, + "logits/chosen": -1.7587168216705322, + "logits/rejected": -1.6700305938720703, + "logps/chosen": -55.97304153442383, + "logps/rejected": -97.49143981933594, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.573572039604187, + "rewards/margins": 17.24924087524414, + "rewards/rejected": -18.82281494140625, + "step": 3180 + }, + { + "epoch": 1.46, + "learning_rate": 1.7148655504819885e-07, + "logits/chosen": -1.72994065284729, + "logits/rejected": -1.6405225992202759, + "logps/chosen": -56.26521682739258, + "logps/rejected": -98.17828369140625, + "loss": 0.0025, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.3642914295196533, + "rewards/margins": 18.429710388183594, + "rewards/rejected": -19.794002532958984, + "step": 3190 + }, + { + "epoch": 1.46, + "learning_rate": 1.7097919837645862e-07, + "logits/chosen": -1.7546837329864502, + "logits/rejected": -1.6676174402236938, + "logps/chosen": -59.02223587036133, + "logps/rejected": -97.63322448730469, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8880170583724976, + "rewards/margins": 18.1130313873291, + "rewards/rejected": -20.001047134399414, + "step": 3200 + }, + { + "epoch": 1.46, + "eval_logits/chosen": -1.5306702852249146, + "eval_logits/rejected": -1.4482632875442505, + "eval_logps/chosen": -83.45709991455078, + "eval_logps/rejected": -98.23886108398438, + "eval_loss": 0.005945264827460051, + "eval_rewards/accuracies": 0.994413435459137, + "eval_rewards/chosen": -3.737560510635376, + "eval_rewards/margins": 15.429756164550781, + "eval_rewards/rejected": -19.167316436767578, + "eval_runtime": 153.8728, + "eval_samples_per_second": 18.6, + "eval_steps_per_second": 1.163, + "step": 3200 + }, + { + "epoch": 1.47, + "learning_rate": 1.704718417047184e-07, + "logits/chosen": -1.7227180004119873, + "logits/rejected": -1.6549056768417358, + "logps/chosen": -55.83765411376953, + "logps/rejected": -93.55619812011719, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1712257862091064, + "rewards/margins": 16.469789505004883, + "rewards/rejected": -18.641014099121094, + "step": 3210 + }, + { + "epoch": 1.47, + "learning_rate": 1.6996448503297815e-07, + "logits/chosen": -1.7486251592636108, + "logits/rejected": -1.669952154159546, + "logps/chosen": -53.65233612060547, + "logps/rejected": -96.1492691040039, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5661886930465698, + "rewards/margins": 17.999963760375977, + "rewards/rejected": -19.566152572631836, + "step": 3220 + }, + { + "epoch": 1.47, + "learning_rate": 1.6945712836123792e-07, + "logits/chosen": -1.7423101663589478, + "logits/rejected": -1.652051329612732, + "logps/chosen": -55.4273681640625, + "logps/rejected": -98.6847915649414, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.360779881477356, + "rewards/margins": 18.79489517211914, + "rewards/rejected": -20.155675888061523, + "step": 3230 + }, + { + "epoch": 1.48, + "learning_rate": 1.689497716894977e-07, + "logits/chosen": -1.7299396991729736, + "logits/rejected": -1.6439214944839478, + "logps/chosen": -57.0401611328125, + "logps/rejected": -98.83965301513672, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5103360414505005, + "rewards/margins": 18.032917022705078, + "rewards/rejected": -19.543251037597656, + "step": 3240 + }, + { + "epoch": 1.48, + "learning_rate": 1.6844241501775745e-07, + "logits/chosen": -1.7509998083114624, + "logits/rejected": -1.6693347692489624, + "logps/chosen": -59.036094665527344, + "logps/rejected": -98.47119903564453, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9223308563232422, + "rewards/margins": 17.635066986083984, + "rewards/rejected": -19.557395935058594, + "step": 3250 + }, + { + "epoch": 1.49, + "learning_rate": 1.6793505834601722e-07, + "logits/chosen": -1.738722562789917, + "logits/rejected": -1.6587598323822021, + "logps/chosen": -53.109886169433594, + "logps/rejected": -97.91802978515625, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6686608791351318, + "rewards/margins": 18.069690704345703, + "rewards/rejected": -19.73834991455078, + "step": 3260 + }, + { + "epoch": 1.49, + "learning_rate": 1.67427701674277e-07, + "logits/chosen": -1.7291982173919678, + "logits/rejected": -1.6407372951507568, + "logps/chosen": -54.598365783691406, + "logps/rejected": -95.91632080078125, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.455635905265808, + "rewards/margins": 17.710803985595703, + "rewards/rejected": -19.166439056396484, + "step": 3270 + }, + { + "epoch": 1.5, + "learning_rate": 1.6692034500253675e-07, + "logits/chosen": -1.761309027671814, + "logits/rejected": -1.6706691980361938, + "logps/chosen": -56.527313232421875, + "logps/rejected": -98.2834243774414, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.34516179561615, + "rewards/margins": 18.49872589111328, + "rewards/rejected": -19.843887329101562, + "step": 3280 + }, + { + "epoch": 1.5, + "learning_rate": 1.6641298833079652e-07, + "logits/chosen": -1.741758942604065, + "logits/rejected": -1.6503000259399414, + "logps/chosen": -57.42247772216797, + "logps/rejected": -94.12330627441406, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4599244594573975, + "rewards/margins": 17.578405380249023, + "rewards/rejected": -19.038328170776367, + "step": 3290 + }, + { + "epoch": 1.51, + "learning_rate": 1.659056316590563e-07, + "logits/chosen": -1.7459741830825806, + "logits/rejected": -1.6614172458648682, + "logps/chosen": -53.6883659362793, + "logps/rejected": -90.2201156616211, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5786361694335938, + "rewards/margins": 17.19680404663086, + "rewards/rejected": -18.775440216064453, + "step": 3300 + }, + { + "epoch": 1.51, + "eval_logits/chosen": -1.5285367965698242, + "eval_logits/rejected": -1.4458637237548828, + "eval_logps/chosen": -83.92906188964844, + "eval_logps/rejected": -99.31780242919922, + "eval_loss": 0.006135547533631325, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": -3.9735474586486816, + "eval_rewards/margins": 15.733240127563477, + "eval_rewards/rejected": -19.706789016723633, + "eval_runtime": 144.8765, + "eval_samples_per_second": 19.755, + "eval_steps_per_second": 1.236, + "step": 3300 + }, + { + "epoch": 1.51, + "learning_rate": 1.6539827498731605e-07, + "logits/chosen": -1.741248369216919, + "logits/rejected": -1.6535362005233765, + "logps/chosen": -58.149566650390625, + "logps/rejected": -93.27287292480469, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4163057804107666, + "rewards/margins": 18.03692054748535, + "rewards/rejected": -19.453222274780273, + "step": 3310 + }, + { + "epoch": 1.52, + "learning_rate": 1.6489091831557582e-07, + "logits/chosen": -1.7589938640594482, + "logits/rejected": -1.67257559299469, + "logps/chosen": -58.156166076660156, + "logps/rejected": -98.0214614868164, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6777223348617554, + "rewards/margins": 17.951839447021484, + "rewards/rejected": -19.629560470581055, + "step": 3320 + }, + { + "epoch": 1.52, + "learning_rate": 1.643835616438356e-07, + "logits/chosen": -1.7588249444961548, + "logits/rejected": -1.677903175354004, + "logps/chosen": -54.59656524658203, + "logps/rejected": -98.17218780517578, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.640683889389038, + "rewards/margins": 18.058780670166016, + "rewards/rejected": -19.699462890625, + "step": 3330 + }, + { + "epoch": 1.52, + "learning_rate": 1.6387620497209535e-07, + "logits/chosen": -1.750461220741272, + "logits/rejected": -1.669263482093811, + "logps/chosen": -55.287437438964844, + "logps/rejected": -98.08964538574219, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6613962650299072, + "rewards/margins": 18.753292083740234, + "rewards/rejected": -20.414690017700195, + "step": 3340 + }, + { + "epoch": 1.53, + "learning_rate": 1.6336884830035512e-07, + "logits/chosen": -1.737192153930664, + "logits/rejected": -1.6521022319793701, + "logps/chosen": -56.506439208984375, + "logps/rejected": -98.63439178466797, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.836482048034668, + "rewards/margins": 18.692378997802734, + "rewards/rejected": -20.528860092163086, + "step": 3350 + }, + { + "epoch": 1.53, + "learning_rate": 1.6286149162861489e-07, + "logits/chosen": -1.745640754699707, + "logits/rejected": -1.648800253868103, + "logps/chosen": -57.18326950073242, + "logps/rejected": -99.80830383300781, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6478908061981201, + "rewards/margins": 19.097484588623047, + "rewards/rejected": -20.74537467956543, + "step": 3360 + }, + { + "epoch": 1.54, + "learning_rate": 1.6235413495687465e-07, + "logits/chosen": -1.7396637201309204, + "logits/rejected": -1.6458467245101929, + "logps/chosen": -55.3143310546875, + "logps/rejected": -93.59313201904297, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.841915488243103, + "rewards/margins": 18.101560592651367, + "rewards/rejected": -19.943477630615234, + "step": 3370 + }, + { + "epoch": 1.54, + "learning_rate": 1.6184677828513442e-07, + "logits/chosen": -1.7592235803604126, + "logits/rejected": -1.6574302911758423, + "logps/chosen": -60.5203971862793, + "logps/rejected": -97.92893981933594, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1750316619873047, + "rewards/margins": 17.71821403503418, + "rewards/rejected": -19.89324378967285, + "step": 3380 + }, + { + "epoch": 1.55, + "learning_rate": 1.613394216133942e-07, + "logits/chosen": -1.7654197216033936, + "logits/rejected": -1.6703176498413086, + "logps/chosen": -58.678627014160156, + "logps/rejected": -97.8316650390625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7995555400848389, + "rewards/margins": 18.441692352294922, + "rewards/rejected": -20.241247177124023, + "step": 3390 + }, + { + "epoch": 1.55, + "learning_rate": 1.6083206494165398e-07, + "logits/chosen": -1.7394158840179443, + "logits/rejected": -1.6462080478668213, + "logps/chosen": -56.487518310546875, + "logps/rejected": -99.09591674804688, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0049827098846436, + "rewards/margins": 18.279010772705078, + "rewards/rejected": -20.28399658203125, + "step": 3400 + }, + { + "epoch": 1.55, + "eval_logits/chosen": -1.524535059928894, + "eval_logits/rejected": -1.4411815404891968, + "eval_logps/chosen": -84.63044738769531, + "eval_logps/rejected": -100.86541748046875, + "eval_loss": 0.006590413860976696, + "eval_rewards/accuracies": 0.994413435459137, + "eval_rewards/chosen": -4.324239730834961, + "eval_rewards/margins": 16.15635871887207, + "eval_rewards/rejected": -20.480600357055664, + "eval_runtime": 144.5579, + "eval_samples_per_second": 19.798, + "eval_steps_per_second": 1.238, + "step": 3400 + }, + { + "epoch": 1.56, + "learning_rate": 1.6032470826991375e-07, + "logits/chosen": -1.7354533672332764, + "logits/rejected": -1.6529903411865234, + "logps/chosen": -54.403541564941406, + "logps/rejected": -96.82051849365234, + "loss": 0.0033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.3602700233459473, + "rewards/margins": 17.785980224609375, + "rewards/rejected": -20.146251678466797, + "step": 3410 + }, + { + "epoch": 1.56, + "learning_rate": 1.598173515981735e-07, + "logits/chosen": -1.739316701889038, + "logits/rejected": -1.6420605182647705, + "logps/chosen": -57.6550407409668, + "logps/rejected": -97.14595794677734, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8596147298812866, + "rewards/margins": 17.877527236938477, + "rewards/rejected": -19.73714256286621, + "step": 3420 + }, + { + "epoch": 1.57, + "learning_rate": 1.5930999492643328e-07, + "logits/chosen": -1.7393652200698853, + "logits/rejected": -1.6525812149047852, + "logps/chosen": -59.345985412597656, + "logps/rejected": -100.73359680175781, + "loss": 0.0027, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.462760329246521, + "rewards/margins": 19.496845245361328, + "rewards/rejected": -20.959606170654297, + "step": 3430 + }, + { + "epoch": 1.57, + "learning_rate": 1.5880263825469305e-07, + "logits/chosen": -1.7402441501617432, + "logits/rejected": -1.6511075496673584, + "logps/chosen": -58.53057098388672, + "logps/rejected": -99.9593734741211, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.192598819732666, + "rewards/margins": 19.186471939086914, + "rewards/rejected": -21.379072189331055, + "step": 3440 + }, + { + "epoch": 1.57, + "learning_rate": 1.582952815829528e-07, + "logits/chosen": -1.757459044456482, + "logits/rejected": -1.6511247158050537, + "logps/chosen": -60.21623611450195, + "logps/rejected": -100.17884826660156, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.050049304962158, + "rewards/margins": 18.79250717163086, + "rewards/rejected": -20.842559814453125, + "step": 3450 + }, + { + "epoch": 1.58, + "learning_rate": 1.5778792491121258e-07, + "logits/chosen": -1.716695785522461, + "logits/rejected": -1.6268866062164307, + "logps/chosen": -60.640830993652344, + "logps/rejected": -101.121337890625, + "loss": 0.0033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.0517373085021973, + "rewards/margins": 19.40340232849121, + "rewards/rejected": -21.45513916015625, + "step": 3460 + }, + { + "epoch": 1.58, + "learning_rate": 1.5728056823947235e-07, + "logits/chosen": -1.7543067932128906, + "logits/rejected": -1.6606168746948242, + "logps/chosen": -55.8422737121582, + "logps/rejected": -100.81498718261719, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2675364017486572, + "rewards/margins": 19.677331924438477, + "rewards/rejected": -21.944866180419922, + "step": 3470 + }, + { + "epoch": 1.59, + "learning_rate": 1.567732115677321e-07, + "logits/chosen": -1.7218773365020752, + "logits/rejected": -1.645035982131958, + "logps/chosen": -56.04047393798828, + "logps/rejected": -106.46551513671875, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6016311645507812, + "rewards/margins": 20.23841667175293, + "rewards/rejected": -21.84004783630371, + "step": 3480 + }, + { + "epoch": 1.59, + "learning_rate": 1.5626585489599188e-07, + "logits/chosen": -1.7585750818252563, + "logits/rejected": -1.667877197265625, + "logps/chosen": -60.819419860839844, + "logps/rejected": -102.30345153808594, + "loss": 0.0024, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.7390552759170532, + "rewards/margins": 19.786039352416992, + "rewards/rejected": -21.525094985961914, + "step": 3490 + }, + { + "epoch": 1.6, + "learning_rate": 1.5575849822425165e-07, + "logits/chosen": -1.7323825359344482, + "logits/rejected": -1.6479371786117554, + "logps/chosen": -55.760154724121094, + "logps/rejected": -97.13644409179688, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.751079797744751, + "rewards/margins": 18.547962188720703, + "rewards/rejected": -20.299041748046875, + "step": 3500 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -1.5145334005355835, + "eval_logits/rejected": -1.4307856559753418, + "eval_logps/chosen": -85.55133819580078, + "eval_logps/rejected": -101.94500732421875, + "eval_loss": 0.009321879595518112, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": -4.784679889678955, + "eval_rewards/margins": 16.235702514648438, + "eval_rewards/rejected": -21.0203857421875, + "eval_runtime": 162.7265, + "eval_samples_per_second": 17.588, + "eval_steps_per_second": 1.1, + "step": 3500 + }, + { + "epoch": 1.6, + "learning_rate": 1.552511415525114e-07, + "logits/chosen": -1.7493646144866943, + "logits/rejected": -1.6622288227081299, + "logps/chosen": -57.598350524902344, + "logps/rejected": -92.99654388427734, + "loss": 0.0046, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.8141578435897827, + "rewards/margins": 17.225921630859375, + "rewards/rejected": -19.040081024169922, + "step": 3510 + }, + { + "epoch": 1.61, + "learning_rate": 1.5474378488077118e-07, + "logits/chosen": -1.7772926092147827, + "logits/rejected": -1.686802864074707, + "logps/chosen": -58.4640007019043, + "logps/rejected": -99.5447769165039, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.02579927444458, + "rewards/margins": 18.582260131835938, + "rewards/rejected": -20.60805892944336, + "step": 3520 + }, + { + "epoch": 1.61, + "learning_rate": 1.5423642820903095e-07, + "logits/chosen": -1.7540311813354492, + "logits/rejected": -1.6697025299072266, + "logps/chosen": -58.2379150390625, + "logps/rejected": -98.92530822753906, + "loss": 0.0035, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.79413640499115, + "rewards/margins": 18.22900390625, + "rewards/rejected": -20.02313995361328, + "step": 3530 + }, + { + "epoch": 1.62, + "learning_rate": 1.537290715372907e-07, + "logits/chosen": -1.7464195489883423, + "logits/rejected": -1.656534194946289, + "logps/chosen": -55.99895095825195, + "logps/rejected": -97.24131774902344, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6266868114471436, + "rewards/margins": 18.616073608398438, + "rewards/rejected": -20.242759704589844, + "step": 3540 + }, + { + "epoch": 1.62, + "learning_rate": 1.5322171486555048e-07, + "logits/chosen": -1.7702693939208984, + "logits/rejected": -1.6523029804229736, + "logps/chosen": -62.9988899230957, + "logps/rejected": -101.58716583251953, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6101036071777344, + "rewards/margins": 19.229171752929688, + "rewards/rejected": -20.83927345275879, + "step": 3550 + }, + { + "epoch": 1.63, + "learning_rate": 1.5271435819381025e-07, + "logits/chosen": -1.7666019201278687, + "logits/rejected": -1.6752361059188843, + "logps/chosen": -62.15509796142578, + "logps/rejected": -101.52202606201172, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9190824031829834, + "rewards/margins": 18.65341567993164, + "rewards/rejected": -20.572498321533203, + "step": 3560 + }, + { + "epoch": 1.63, + "learning_rate": 1.5220700152207e-07, + "logits/chosen": -1.7496618032455444, + "logits/rejected": -1.6386051177978516, + "logps/chosen": -59.79640579223633, + "logps/rejected": -99.09815216064453, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0094802379608154, + "rewards/margins": 19.474361419677734, + "rewards/rejected": -21.483840942382812, + "step": 3570 + }, + { + "epoch": 1.63, + "learning_rate": 1.5169964485032978e-07, + "logits/chosen": -1.7477951049804688, + "logits/rejected": -1.6757123470306396, + "logps/chosen": -54.9462890625, + "logps/rejected": -98.96672058105469, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8885055780410767, + "rewards/margins": 18.964462280273438, + "rewards/rejected": -20.85296630859375, + "step": 3580 + }, + { + "epoch": 1.64, + "learning_rate": 1.5119228817858955e-07, + "logits/chosen": -1.7456642389297485, + "logits/rejected": -1.6490224599838257, + "logps/chosen": -59.340980529785156, + "logps/rejected": -104.44132995605469, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.894884467124939, + "rewards/margins": 19.993732452392578, + "rewards/rejected": -21.888614654541016, + "step": 3590 + }, + { + "epoch": 1.64, + "learning_rate": 1.506849315068493e-07, + "logits/chosen": -1.7418420314788818, + "logits/rejected": -1.6429531574249268, + "logps/chosen": -60.49907684326172, + "logps/rejected": -99.40533447265625, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.947064757347107, + "rewards/margins": 18.43744468688965, + "rewards/rejected": -20.384510040283203, + "step": 3600 + }, + { + "epoch": 1.64, + "eval_logits/chosen": -1.5208775997161865, + "eval_logits/rejected": -1.4372811317443848, + "eval_logps/chosen": -85.1228256225586, + "eval_logps/rejected": -101.82320404052734, + "eval_loss": 0.00761109683662653, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": -4.570422649383545, + "eval_rewards/margins": 16.38906478881836, + "eval_rewards/rejected": -20.95948600769043, + "eval_runtime": 160.6097, + "eval_samples_per_second": 17.82, + "eval_steps_per_second": 1.115, + "step": 3600 + }, + { + "epoch": 1.65, + "learning_rate": 1.5017757483510908e-07, + "logits/chosen": -1.744821548461914, + "logits/rejected": -1.6511716842651367, + "logps/chosen": -57.97832107543945, + "logps/rejected": -104.1939468383789, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5856506824493408, + "rewards/margins": 20.175537109375, + "rewards/rejected": -21.761188507080078, + "step": 3610 + }, + { + "epoch": 1.65, + "learning_rate": 1.4967021816336885e-07, + "logits/chosen": -1.7192729711532593, + "logits/rejected": -1.6354973316192627, + "logps/chosen": -52.93940353393555, + "logps/rejected": -93.718994140625, + "loss": 0.0033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.6425870656967163, + "rewards/margins": 18.601680755615234, + "rewards/rejected": -20.244266510009766, + "step": 3620 + }, + { + "epoch": 1.66, + "learning_rate": 1.491628614916286e-07, + "logits/chosen": -1.7456947565078735, + "logits/rejected": -1.662951111793518, + "logps/chosen": -60.57215118408203, + "logps/rejected": -105.87451171875, + "loss": 0.0055, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.069582223892212, + "rewards/margins": 19.797704696655273, + "rewards/rejected": -21.86728858947754, + "step": 3630 + }, + { + "epoch": 1.66, + "learning_rate": 1.4865550481988838e-07, + "logits/chosen": -1.7416263818740845, + "logits/rejected": -1.659208059310913, + "logps/chosen": -57.92128372192383, + "logps/rejected": -102.81998443603516, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9102433919906616, + "rewards/margins": 19.905513763427734, + "rewards/rejected": -21.81575584411621, + "step": 3640 + }, + { + "epoch": 1.67, + "learning_rate": 1.4814814814814815e-07, + "logits/chosen": -1.7522609233856201, + "logits/rejected": -1.6549276113510132, + "logps/chosen": -61.67973709106445, + "logps/rejected": -102.40510559082031, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2504985332489014, + "rewards/margins": 18.94046401977539, + "rewards/rejected": -21.19096565246582, + "step": 3650 + }, + { + "epoch": 1.67, + "learning_rate": 1.476407914764079e-07, + "logits/chosen": -1.7534595727920532, + "logits/rejected": -1.6623141765594482, + "logps/chosen": -63.60465621948242, + "logps/rejected": -101.19085693359375, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.382418394088745, + "rewards/margins": 18.830825805664062, + "rewards/rejected": -21.21324348449707, + "step": 3660 + }, + { + "epoch": 1.68, + "learning_rate": 1.4713343480466768e-07, + "logits/chosen": -1.755079984664917, + "logits/rejected": -1.6616798639297485, + "logps/chosen": -57.2696418762207, + "logps/rejected": -102.25415802001953, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1452555656433105, + "rewards/margins": 19.918466567993164, + "rewards/rejected": -22.063720703125, + "step": 3670 + }, + { + "epoch": 1.68, + "learning_rate": 1.4662607813292745e-07, + "logits/chosen": -1.758702278137207, + "logits/rejected": -1.657207727432251, + "logps/chosen": -58.540130615234375, + "logps/rejected": -99.57029724121094, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.671474814414978, + "rewards/margins": 19.523405075073242, + "rewards/rejected": -21.19487953186035, + "step": 3680 + }, + { + "epoch": 1.68, + "learning_rate": 1.461187214611872e-07, + "logits/chosen": -1.7256863117218018, + "logits/rejected": -1.639664888381958, + "logps/chosen": -58.728057861328125, + "logps/rejected": -101.06281280517578, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.20875883102417, + "rewards/margins": 19.420764923095703, + "rewards/rejected": -21.629526138305664, + "step": 3690 + }, + { + "epoch": 1.69, + "learning_rate": 1.4561136478944698e-07, + "logits/chosen": -1.7542873620986938, + "logits/rejected": -1.6643635034561157, + "logps/chosen": -57.53455352783203, + "logps/rejected": -104.0309066772461, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9398162364959717, + "rewards/margins": 20.248397827148438, + "rewards/rejected": -22.188215255737305, + "step": 3700 + }, + { + "epoch": 1.69, + "eval_logits/chosen": -1.5147947072982788, + "eval_logits/rejected": -1.4300434589385986, + "eval_logps/chosen": -85.57495880126953, + "eval_logps/rejected": -103.20862579345703, + "eval_loss": 0.008725001476705074, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": -4.79649019241333, + "eval_rewards/margins": 16.855709075927734, + "eval_rewards/rejected": -21.652198791503906, + "eval_runtime": 152.7457, + "eval_samples_per_second": 18.737, + "eval_steps_per_second": 1.172, + "step": 3700 + }, + { + "epoch": 1.69, + "learning_rate": 1.4510400811770675e-07, + "logits/chosen": -1.727120041847229, + "logits/rejected": -1.611891746520996, + "logps/chosen": -63.399688720703125, + "logps/rejected": -101.63905334472656, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4259730577468872, + "rewards/margins": 19.92839241027832, + "rewards/rejected": -21.35436248779297, + "step": 3710 + }, + { + "epoch": 1.7, + "learning_rate": 1.445966514459665e-07, + "logits/chosen": -1.731498122215271, + "logits/rejected": -1.6396141052246094, + "logps/chosen": -57.56293869018555, + "logps/rejected": -103.0668716430664, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1499602794647217, + "rewards/margins": 19.873432159423828, + "rewards/rejected": -22.023391723632812, + "step": 3720 + }, + { + "epoch": 1.7, + "learning_rate": 1.4408929477422628e-07, + "logits/chosen": -1.7379181385040283, + "logits/rejected": -1.645946741104126, + "logps/chosen": -58.8451042175293, + "logps/rejected": -103.89896392822266, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7194750308990479, + "rewards/margins": 20.302326202392578, + "rewards/rejected": -22.021800994873047, + "step": 3730 + }, + { + "epoch": 1.71, + "learning_rate": 1.4358193810248604e-07, + "logits/chosen": -1.7280566692352295, + "logits/rejected": -1.6349445581436157, + "logps/chosen": -60.61736297607422, + "logps/rejected": -102.3194351196289, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.244107484817505, + "rewards/margins": 20.2021541595459, + "rewards/rejected": -22.446266174316406, + "step": 3740 + }, + { + "epoch": 1.71, + "learning_rate": 1.430745814307458e-07, + "logits/chosen": -1.7542724609375, + "logits/rejected": -1.6561269760131836, + "logps/chosen": -61.44940948486328, + "logps/rejected": -101.87255096435547, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5370864868164062, + "rewards/margins": 20.14695930480957, + "rewards/rejected": -22.68404769897461, + "step": 3750 + }, + { + "epoch": 1.72, + "learning_rate": 1.4256722475900558e-07, + "logits/chosen": -1.7171341180801392, + "logits/rejected": -1.6243823766708374, + "logps/chosen": -55.97332000732422, + "logps/rejected": -101.05335998535156, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7775256633758545, + "rewards/margins": 20.30779457092285, + "rewards/rejected": -22.08531951904297, + "step": 3760 + }, + { + "epoch": 1.72, + "learning_rate": 1.4205986808726534e-07, + "logits/chosen": -1.7553892135620117, + "logits/rejected": -1.6701765060424805, + "logps/chosen": -58.353965759277344, + "logps/rejected": -108.2108383178711, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4113121032714844, + "rewards/margins": 20.59480094909668, + "rewards/rejected": -23.006113052368164, + "step": 3770 + }, + { + "epoch": 1.73, + "learning_rate": 1.415525114155251e-07, + "logits/chosen": -1.7134288549423218, + "logits/rejected": -1.6168136596679688, + "logps/chosen": -61.3768424987793, + "logps/rejected": -104.5031509399414, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.53882098197937, + "rewards/margins": 20.721431732177734, + "rewards/rejected": -23.260251998901367, + "step": 3780 + }, + { + "epoch": 1.73, + "learning_rate": 1.4104515474378488e-07, + "logits/chosen": -1.7239723205566406, + "logits/rejected": -1.6319774389266968, + "logps/chosen": -61.295196533203125, + "logps/rejected": -101.477783203125, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6111342906951904, + "rewards/margins": 19.920166015625, + "rewards/rejected": -22.531299591064453, + "step": 3790 + }, + { + "epoch": 1.73, + "learning_rate": 1.4053779807204464e-07, + "logits/chosen": -1.7131726741790771, + "logits/rejected": -1.6239140033721924, + "logps/chosen": -58.66071701049805, + "logps/rejected": -100.33702087402344, + "loss": 0.0056, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.3244271278381348, + "rewards/margins": 19.755664825439453, + "rewards/rejected": -22.08009147644043, + "step": 3800 + }, + { + "epoch": 1.73, + "eval_logits/chosen": -1.505843162536621, + "eval_logits/rejected": -1.4213422536849976, + "eval_logps/chosen": -86.23440551757812, + "eval_logps/rejected": -104.42259216308594, + "eval_loss": 0.009252488613128662, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": -5.1262125968933105, + "eval_rewards/margins": 17.132970809936523, + "eval_rewards/rejected": -22.25918197631836, + "eval_runtime": 143.7786, + "eval_samples_per_second": 19.906, + "eval_steps_per_second": 1.245, + "step": 3800 + }, + { + "epoch": 1.74, + "learning_rate": 1.400304414003044e-07, + "logits/chosen": -1.7435963153839111, + "logits/rejected": -1.6586761474609375, + "logps/chosen": -57.895790100097656, + "logps/rejected": -104.03873443603516, + "loss": 0.0012, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.4536798000335693, + "rewards/margins": 20.30872917175293, + "rewards/rejected": -22.762409210205078, + "step": 3810 + }, + { + "epoch": 1.74, + "learning_rate": 1.3952308472856418e-07, + "logits/chosen": -1.7410093545913696, + "logits/rejected": -1.6523460149765015, + "logps/chosen": -56.3589973449707, + "logps/rejected": -98.3576431274414, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9067256450653076, + "rewards/margins": 19.53439712524414, + "rewards/rejected": -21.441125869750977, + "step": 3820 + }, + { + "epoch": 1.75, + "learning_rate": 1.3901572805682394e-07, + "logits/chosen": -1.7533388137817383, + "logits/rejected": -1.6592824459075928, + "logps/chosen": -60.925758361816406, + "logps/rejected": -102.49827575683594, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.274437427520752, + "rewards/margins": 19.64658546447754, + "rewards/rejected": -21.921022415161133, + "step": 3830 + }, + { + "epoch": 1.75, + "learning_rate": 1.385083713850837e-07, + "logits/chosen": -1.7348659038543701, + "logits/rejected": -1.6244924068450928, + "logps/chosen": -58.76271438598633, + "logps/rejected": -102.5107192993164, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.965003252029419, + "rewards/margins": 20.968101501464844, + "rewards/rejected": -22.93310546875, + "step": 3840 + }, + { + "epoch": 1.76, + "learning_rate": 1.3800101471334348e-07, + "logits/chosen": -1.7477362155914307, + "logits/rejected": -1.6386569738388062, + "logps/chosen": -60.78839874267578, + "logps/rejected": -95.99488067626953, + "loss": 0.0056, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.53136944770813, + "rewards/margins": 18.4868221282959, + "rewards/rejected": -21.018192291259766, + "step": 3850 + }, + { + "epoch": 1.76, + "learning_rate": 1.3749365804160324e-07, + "logits/chosen": -1.7221263647079468, + "logits/rejected": -1.623956322669983, + "logps/chosen": -59.84740447998047, + "logps/rejected": -100.80842590332031, + "loss": 0.0033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.1374335289001465, + "rewards/margins": 19.245466232299805, + "rewards/rejected": -21.38290023803711, + "step": 3860 + }, + { + "epoch": 1.77, + "learning_rate": 1.36986301369863e-07, + "logits/chosen": -1.7168346643447876, + "logits/rejected": -1.6347538232803345, + "logps/chosen": -59.019287109375, + "logps/rejected": -106.46458435058594, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.109846591949463, + "rewards/margins": 20.929542541503906, + "rewards/rejected": -23.03938865661621, + "step": 3870 + }, + { + "epoch": 1.77, + "learning_rate": 1.3647894469812278e-07, + "logits/chosen": -1.7261329889297485, + "logits/rejected": -1.631757378578186, + "logps/chosen": -61.130950927734375, + "logps/rejected": -105.9852523803711, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3457083702087402, + "rewards/margins": 21.277233123779297, + "rewards/rejected": -23.622940063476562, + "step": 3880 + }, + { + "epoch": 1.78, + "learning_rate": 1.3597158802638254e-07, + "logits/chosen": -1.7206604480743408, + "logits/rejected": -1.634374976158142, + "logps/chosen": -56.732521057128906, + "logps/rejected": -104.9426498413086, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.531989097595215, + "rewards/margins": 20.72577476501465, + "rewards/rejected": -23.25776481628418, + "step": 3890 + }, + { + "epoch": 1.78, + "learning_rate": 1.354642313546423e-07, + "logits/chosen": -1.710542917251587, + "logits/rejected": -1.6287927627563477, + "logps/chosen": -60.89078903198242, + "logps/rejected": -100.89933013916016, + "loss": 0.0024, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.0867960453033447, + "rewards/margins": 19.906475067138672, + "rewards/rejected": -22.993270874023438, + "step": 3900 + }, + { + "epoch": 1.78, + "eval_logits/chosen": -1.4855576753616333, + "eval_logits/rejected": -1.4014440774917603, + "eval_logps/chosen": -87.70219421386719, + "eval_logps/rejected": -107.43189239501953, + "eval_loss": 0.011312047950923443, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": -5.860104560852051, + "eval_rewards/margins": 17.90372085571289, + "eval_rewards/rejected": -23.76382827758789, + "eval_runtime": 140.5169, + "eval_samples_per_second": 20.368, + "eval_steps_per_second": 1.274, + "step": 3900 + }, + { + "epoch": 1.78, + "learning_rate": 1.3495687468290208e-07, + "logits/chosen": -1.7028968334197998, + "logits/rejected": -1.6030282974243164, + "logps/chosen": -59.007225036621094, + "logps/rejected": -101.65214538574219, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.518394947052002, + "rewards/margins": 20.77285385131836, + "rewards/rejected": -23.291250228881836, + "step": 3910 + }, + { + "epoch": 1.79, + "learning_rate": 1.3444951801116184e-07, + "logits/chosen": -1.7411220073699951, + "logits/rejected": -1.6353130340576172, + "logps/chosen": -60.95904541015625, + "logps/rejected": -99.79039001464844, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8324027061462402, + "rewards/margins": 20.231990814208984, + "rewards/rejected": -23.06439208984375, + "step": 3920 + }, + { + "epoch": 1.79, + "learning_rate": 1.339421613394216e-07, + "logits/chosen": -1.739633560180664, + "logits/rejected": -1.6502149105072021, + "logps/chosen": -61.77033233642578, + "logps/rejected": -108.11653137207031, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.709038734436035, + "rewards/margins": 21.357595443725586, + "rewards/rejected": -24.066635131835938, + "step": 3930 + }, + { + "epoch": 1.8, + "learning_rate": 1.3343480466768138e-07, + "logits/chosen": -1.713693380355835, + "logits/rejected": -1.620425820350647, + "logps/chosen": -57.366050720214844, + "logps/rejected": -103.63282775878906, + "loss": 0.0073, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.4770348072052, + "rewards/margins": 20.998950958251953, + "rewards/rejected": -23.475988388061523, + "step": 3940 + }, + { + "epoch": 1.8, + "learning_rate": 1.3292744799594114e-07, + "logits/chosen": -1.7242887020111084, + "logits/rejected": -1.635079026222229, + "logps/chosen": -54.93780517578125, + "logps/rejected": -101.4769515991211, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7494869232177734, + "rewards/margins": 20.059358596801758, + "rewards/rejected": -21.80884552001953, + "step": 3950 + }, + { + "epoch": 1.81, + "learning_rate": 1.324200913242009e-07, + "logits/chosen": -1.7211713790893555, + "logits/rejected": -1.635608434677124, + "logps/chosen": -59.47595977783203, + "logps/rejected": -102.97447204589844, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3539652824401855, + "rewards/margins": 20.05697250366211, + "rewards/rejected": -22.41093635559082, + "step": 3960 + }, + { + "epoch": 1.81, + "learning_rate": 1.3191273465246068e-07, + "logits/chosen": -1.712767243385315, + "logits/rejected": -1.6329208612442017, + "logps/chosen": -57.88202667236328, + "logps/rejected": -103.71693420410156, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4641976356506348, + "rewards/margins": 20.41353416442871, + "rewards/rejected": -22.87773323059082, + "step": 3970 + }, + { + "epoch": 1.82, + "learning_rate": 1.3140537798072044e-07, + "logits/chosen": -1.7149088382720947, + "logits/rejected": -1.6296100616455078, + "logps/chosen": -56.60906982421875, + "logps/rejected": -98.81111907958984, + "loss": 0.0045, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.384639263153076, + "rewards/margins": 19.819398880004883, + "rewards/rejected": -22.204036712646484, + "step": 3980 + }, + { + "epoch": 1.82, + "learning_rate": 1.308980213089802e-07, + "logits/chosen": -1.7182649374008179, + "logits/rejected": -1.6282594203948975, + "logps/chosen": -59.917518615722656, + "logps/rejected": -104.40673828125, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4817817211151123, + "rewards/margins": 20.250804901123047, + "rewards/rejected": -22.732585906982422, + "step": 3990 + }, + { + "epoch": 1.83, + "learning_rate": 1.3039066463723998e-07, + "logits/chosen": -1.7276928424835205, + "logits/rejected": -1.6250627040863037, + "logps/chosen": -61.18939971923828, + "logps/rejected": -102.0118408203125, + "loss": 0.0034, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.5209269523620605, + "rewards/margins": 20.70689582824707, + "rewards/rejected": -23.22782325744629, + "step": 4000 + }, + { + "epoch": 1.83, + "eval_logits/chosen": -1.5083593130111694, + "eval_logits/rejected": -1.4252426624298096, + "eval_logps/chosen": -85.39736938476562, + "eval_logps/rejected": -104.9570083618164, + "eval_loss": 0.005603944417089224, + "eval_rewards/accuracies": 0.994413435459137, + "eval_rewards/chosen": -4.707695484161377, + "eval_rewards/margins": 17.818696975708008, + "eval_rewards/rejected": -22.526391983032227, + "eval_runtime": 146.6093, + "eval_samples_per_second": 19.521, + "eval_steps_per_second": 1.221, + "step": 4000 + }, + { + "epoch": 1.83, + "learning_rate": 1.2988330796549974e-07, + "logits/chosen": -1.7435764074325562, + "logits/rejected": -1.6490623950958252, + "logps/chosen": -59.32027053833008, + "logps/rejected": -98.74386596679688, + "loss": 0.0066, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.657484769821167, + "rewards/margins": 18.735923767089844, + "rewards/rejected": -21.393407821655273, + "step": 4010 + }, + { + "epoch": 1.83, + "learning_rate": 1.293759512937595e-07, + "logits/chosen": -1.711301565170288, + "logits/rejected": -1.632124900817871, + "logps/chosen": -59.0091667175293, + "logps/rejected": -103.40666198730469, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.394582509994507, + "rewards/margins": 19.536239624023438, + "rewards/rejected": -21.93082618713379, + "step": 4020 + }, + { + "epoch": 1.84, + "learning_rate": 1.2886859462201928e-07, + "logits/chosen": -1.7019872665405273, + "logits/rejected": -1.6248067617416382, + "logps/chosen": -59.6156120300293, + "logps/rejected": -102.67716979980469, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.609593629837036, + "rewards/margins": 19.72539710998535, + "rewards/rejected": -22.334989547729492, + "step": 4030 + }, + { + "epoch": 1.84, + "learning_rate": 1.2836123795027904e-07, + "logits/chosen": -1.707863450050354, + "logits/rejected": -1.6396872997283936, + "logps/chosen": -55.025291442871094, + "logps/rejected": -102.29816436767578, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0787899494171143, + "rewards/margins": 19.629016876220703, + "rewards/rejected": -21.707805633544922, + "step": 4040 + }, + { + "epoch": 1.85, + "learning_rate": 1.278538812785388e-07, + "logits/chosen": -1.72417414188385, + "logits/rejected": -1.6330257654190063, + "logps/chosen": -61.2284049987793, + "logps/rejected": -106.95091247558594, + "loss": 0.0033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.1726553440093994, + "rewards/margins": 20.560657501220703, + "rewards/rejected": -22.733312606811523, + "step": 4050 + }, + { + "epoch": 1.85, + "learning_rate": 1.2734652460679858e-07, + "logits/chosen": -1.750223159790039, + "logits/rejected": -1.658272385597229, + "logps/chosen": -61.496177673339844, + "logps/rejected": -100.18705749511719, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0673086643218994, + "rewards/margins": 18.649917602539062, + "rewards/rejected": -20.717227935791016, + "step": 4060 + }, + { + "epoch": 1.86, + "learning_rate": 1.2683916793505834e-07, + "logits/chosen": -1.6931970119476318, + "logits/rejected": -1.624407410621643, + "logps/chosen": -55.50725173950195, + "logps/rejected": -103.68687438964844, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.344270944595337, + "rewards/margins": 20.012542724609375, + "rewards/rejected": -22.356815338134766, + "step": 4070 + }, + { + "epoch": 1.86, + "learning_rate": 1.263318112633181e-07, + "logits/chosen": -1.7359850406646729, + "logits/rejected": -1.6276299953460693, + "logps/chosen": -61.17974090576172, + "logps/rejected": -101.77490234375, + "loss": 0.0025, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.0892181396484375, + "rewards/margins": 18.614036560058594, + "rewards/rejected": -20.70325469970703, + "step": 4080 + }, + { + "epoch": 1.87, + "learning_rate": 1.2582445459157788e-07, + "logits/chosen": -1.7262541055679321, + "logits/rejected": -1.6360960006713867, + "logps/chosen": -60.631103515625, + "logps/rejected": -103.457275390625, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3985886573791504, + "rewards/margins": 19.82919692993164, + "rewards/rejected": -22.227787017822266, + "step": 4090 + }, + { + "epoch": 1.87, + "learning_rate": 1.2531709791983764e-07, + "logits/chosen": -1.7521579265594482, + "logits/rejected": -1.6497102975845337, + "logps/chosen": -58.76811599731445, + "logps/rejected": -104.64798736572266, + "loss": 0.0044, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.8879072666168213, + "rewards/margins": 19.89638328552246, + "rewards/rejected": -21.784290313720703, + "step": 4100 + }, + { + "epoch": 1.87, + "eval_logits/chosen": -1.516519546508789, + "eval_logits/rejected": -1.4342458248138428, + "eval_logps/chosen": -84.54878234863281, + "eval_logps/rejected": -103.28939056396484, + "eval_loss": 0.005533331073820591, + "eval_rewards/accuracies": 0.9972066879272461, + "eval_rewards/chosen": -4.283407688140869, + "eval_rewards/margins": 17.4091796875, + "eval_rewards/rejected": -21.69258689880371, + "eval_runtime": 131.2471, + "eval_samples_per_second": 21.806, + "eval_steps_per_second": 1.364, + "step": 4100 + }, + { + "epoch": 1.88, + "learning_rate": 1.248097412480974e-07, + "logits/chosen": -1.7417571544647217, + "logits/rejected": -1.6413519382476807, + "logps/chosen": -60.33240509033203, + "logps/rejected": -102.51177978515625, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6107101440429688, + "rewards/margins": 19.836740493774414, + "rewards/rejected": -21.447450637817383, + "step": 4110 + }, + { + "epoch": 1.88, + "learning_rate": 1.2430238457635718e-07, + "logits/chosen": -1.7408193349838257, + "logits/rejected": -1.6514530181884766, + "logps/chosen": -59.55634307861328, + "logps/rejected": -103.1875, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4238922595977783, + "rewards/margins": 20.01588249206543, + "rewards/rejected": -22.43977165222168, + "step": 4120 + }, + { + "epoch": 1.89, + "learning_rate": 1.2379502790461694e-07, + "logits/chosen": -1.7198892831802368, + "logits/rejected": -1.6175673007965088, + "logps/chosen": -57.6531982421875, + "logps/rejected": -95.41826629638672, + "loss": 0.0054, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.287550449371338, + "rewards/margins": 18.77586555480957, + "rewards/rejected": -21.06341552734375, + "step": 4130 + }, + { + "epoch": 1.89, + "learning_rate": 1.232876712328767e-07, + "logits/chosen": -1.7502750158309937, + "logits/rejected": -1.6409406661987305, + "logps/chosen": -58.13480758666992, + "logps/rejected": -101.2226791381836, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8727871179580688, + "rewards/margins": 20.0989990234375, + "rewards/rejected": -21.971782684326172, + "step": 4140 + }, + { + "epoch": 1.89, + "learning_rate": 1.2278031456113648e-07, + "logits/chosen": -1.759542465209961, + "logits/rejected": -1.6358810663223267, + "logps/chosen": -66.03750610351562, + "logps/rejected": -101.20765686035156, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.005901575088501, + "rewards/margins": 19.78945541381836, + "rewards/rejected": -21.79535675048828, + "step": 4150 + }, + { + "epoch": 1.9, + "learning_rate": 1.2227295788939624e-07, + "logits/chosen": -1.7395435571670532, + "logits/rejected": -1.6474955081939697, + "logps/chosen": -59.45258331298828, + "logps/rejected": -103.9498062133789, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4537131786346436, + "rewards/margins": 20.302892684936523, + "rewards/rejected": -22.75660514831543, + "step": 4160 + }, + { + "epoch": 1.9, + "learning_rate": 1.21765601217656e-07, + "logits/chosen": -1.7050672769546509, + "logits/rejected": -1.6305850744247437, + "logps/chosen": -55.50453567504883, + "logps/rejected": -102.51220703125, + "loss": 0.0034, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.9435291290283203, + "rewards/margins": 19.749547958374023, + "rewards/rejected": -21.693078994750977, + "step": 4170 + }, + { + "epoch": 1.91, + "learning_rate": 1.2125824454591578e-07, + "logits/chosen": -1.7227487564086914, + "logits/rejected": -1.6449857950210571, + "logps/chosen": -60.23795700073242, + "logps/rejected": -103.9252700805664, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.536226511001587, + "rewards/margins": 20.490659713745117, + "rewards/rejected": -23.02688980102539, + "step": 4180 + }, + { + "epoch": 1.91, + "learning_rate": 1.2075088787417554e-07, + "logits/chosen": -1.7252013683319092, + "logits/rejected": -1.6343914270401, + "logps/chosen": -58.06959915161133, + "logps/rejected": -102.8636703491211, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.519500732421875, + "rewards/margins": 20.217092514038086, + "rewards/rejected": -22.73659324645996, + "step": 4190 + }, + { + "epoch": 1.92, + "learning_rate": 1.202435312024353e-07, + "logits/chosen": -1.7138278484344482, + "logits/rejected": -1.6251713037490845, + "logps/chosen": -59.78468704223633, + "logps/rejected": -101.74442291259766, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3976869583129883, + "rewards/margins": 19.762615203857422, + "rewards/rejected": -22.160301208496094, + "step": 4200 + }, + { + "epoch": 1.92, + "eval_logits/chosen": -1.5051515102386475, + "eval_logits/rejected": -1.4219218492507935, + "eval_logps/chosen": -86.4904556274414, + "eval_logps/rejected": -106.72370147705078, + "eval_loss": 0.006755765061825514, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": -5.254235744476318, + "eval_rewards/margins": 18.15550422668457, + "eval_rewards/rejected": -23.409738540649414, + "eval_runtime": 167.7976, + "eval_samples_per_second": 17.056, + "eval_steps_per_second": 1.067, + "step": 4200 + }, + { + "epoch": 1.92, + "learning_rate": 1.1973617453069508e-07, + "logits/chosen": -1.7196204662322998, + "logits/rejected": -1.6182119846343994, + "logps/chosen": -57.03583908081055, + "logps/rejected": -105.01580810546875, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0129735469818115, + "rewards/margins": 22.04193687438965, + "rewards/rejected": -24.05491065979004, + "step": 4210 + }, + { + "epoch": 1.93, + "learning_rate": 1.1922881785895484e-07, + "logits/chosen": -1.7210935354232788, + "logits/rejected": -1.6417878866195679, + "logps/chosen": -59.10405349731445, + "logps/rejected": -108.11570739746094, + "loss": 0.0022, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.621901035308838, + "rewards/margins": 21.172082901000977, + "rewards/rejected": -23.79398536682129, + "step": 4220 + }, + { + "epoch": 1.93, + "learning_rate": 1.187214611872146e-07, + "logits/chosen": -1.6935851573944092, + "logits/rejected": -1.6083828210830688, + "logps/chosen": -60.400779724121094, + "logps/rejected": -110.29762268066406, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7976603507995605, + "rewards/margins": 22.258953094482422, + "rewards/rejected": -25.05661392211914, + "step": 4230 + }, + { + "epoch": 1.94, + "learning_rate": 1.1821410451547436e-07, + "logits/chosen": -1.731496810913086, + "logits/rejected": -1.6574420928955078, + "logps/chosen": -56.34038162231445, + "logps/rejected": -107.848876953125, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.164360523223877, + "rewards/margins": 21.40178680419922, + "rewards/rejected": -23.56614875793457, + "step": 4240 + }, + { + "epoch": 1.94, + "learning_rate": 1.1770674784373413e-07, + "logits/chosen": -1.6939624547958374, + "logits/rejected": -1.6362526416778564, + "logps/chosen": -56.355072021484375, + "logps/rejected": -105.94290924072266, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9071943759918213, + "rewards/margins": 21.0939884185791, + "rewards/rejected": -24.001184463500977, + "step": 4250 + }, + { + "epoch": 1.94, + "learning_rate": 1.171993911719939e-07, + "logits/chosen": -1.716327428817749, + "logits/rejected": -1.6449180841445923, + "logps/chosen": -57.47583770751953, + "logps/rejected": -104.41899108886719, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8491833209991455, + "rewards/margins": 20.430660247802734, + "rewards/rejected": -23.279842376708984, + "step": 4260 + }, + { + "epoch": 1.95, + "learning_rate": 1.1669203450025366e-07, + "logits/chosen": -1.721514105796814, + "logits/rejected": -1.6436516046524048, + "logps/chosen": -58.92315673828125, + "logps/rejected": -103.16642761230469, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3837337493896484, + "rewards/margins": 20.6968936920166, + "rewards/rejected": -23.080623626708984, + "step": 4270 + }, + { + "epoch": 1.95, + "learning_rate": 1.1618467782851343e-07, + "logits/chosen": -1.7137353420257568, + "logits/rejected": -1.6262519359588623, + "logps/chosen": -57.92986297607422, + "logps/rejected": -106.00459289550781, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.288856029510498, + "rewards/margins": 21.330066680908203, + "rewards/rejected": -23.61892318725586, + "step": 4280 + }, + { + "epoch": 1.96, + "learning_rate": 1.156773211567732e-07, + "logits/chosen": -1.6893755197525024, + "logits/rejected": -1.5906169414520264, + "logps/chosen": -59.3648796081543, + "logps/rejected": -101.29659271240234, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7958602905273438, + "rewards/margins": 20.5319881439209, + "rewards/rejected": -23.327848434448242, + "step": 4290 + }, + { + "epoch": 1.96, + "learning_rate": 1.1516996448503296e-07, + "logits/chosen": -1.6958662271499634, + "logits/rejected": -1.611880898475647, + "logps/chosen": -59.040428161621094, + "logps/rejected": -108.2059326171875, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1087820529937744, + "rewards/margins": 21.529190063476562, + "rewards/rejected": -23.63797378540039, + "step": 4300 + }, + { + "epoch": 1.96, + "eval_logits/chosen": -1.4921066761016846, + "eval_logits/rejected": -1.4097992181777954, + "eval_logps/chosen": -86.48042297363281, + "eval_logps/rejected": -106.46904754638672, + "eval_loss": 0.007482536602765322, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": -5.2492241859436035, + "eval_rewards/margins": 18.03318214416504, + "eval_rewards/rejected": -23.28240394592285, + "eval_runtime": 152.1579, + "eval_samples_per_second": 18.809, + "eval_steps_per_second": 1.176, + "step": 4300 + }, + { + "epoch": 1.97, + "learning_rate": 1.1466260781329273e-07, + "logits/chosen": -1.7025010585784912, + "logits/rejected": -1.6234486103057861, + "logps/chosen": -58.45012283325195, + "logps/rejected": -104.7151870727539, + "loss": 0.0022, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.6991915702819824, + "rewards/margins": 20.52659034729004, + "rewards/rejected": -23.22578239440918, + "step": 4310 + }, + { + "epoch": 1.97, + "learning_rate": 1.141552511415525e-07, + "logits/chosen": -1.720330834388733, + "logits/rejected": -1.6162750720977783, + "logps/chosen": -61.7385139465332, + "logps/rejected": -103.19317626953125, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2113752365112305, + "rewards/margins": 20.69656753540039, + "rewards/rejected": -22.907943725585938, + "step": 4320 + }, + { + "epoch": 1.98, + "learning_rate": 1.1364789446981226e-07, + "logits/chosen": -1.7113571166992188, + "logits/rejected": -1.620640516281128, + "logps/chosen": -65.27436828613281, + "logps/rejected": -107.57862854003906, + "loss": 0.0048, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.797924518585205, + "rewards/margins": 20.757877349853516, + "rewards/rejected": -23.555797576904297, + "step": 4330 + }, + { + "epoch": 1.98, + "learning_rate": 1.1314053779807203e-07, + "logits/chosen": -1.7076358795166016, + "logits/rejected": -1.6240257024765015, + "logps/chosen": -54.62898635864258, + "logps/rejected": -103.197509765625, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.441157341003418, + "rewards/margins": 20.72103500366211, + "rewards/rejected": -23.162189483642578, + "step": 4340 + }, + { + "epoch": 1.99, + "learning_rate": 1.126331811263318e-07, + "logits/chosen": -1.733656644821167, + "logits/rejected": -1.6405130624771118, + "logps/chosen": -62.487953186035156, + "logps/rejected": -102.05244445800781, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0095598697662354, + "rewards/margins": 19.754032135009766, + "rewards/rejected": -21.763591766357422, + "step": 4350 + }, + { + "epoch": 1.99, + "learning_rate": 1.1212582445459156e-07, + "logits/chosen": -1.7253767251968384, + "logits/rejected": -1.6205635070800781, + "logps/chosen": -63.746116638183594, + "logps/rejected": -108.1665267944336, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.041841506958008, + "rewards/margins": 20.789043426513672, + "rewards/rejected": -23.830883026123047, + "step": 4360 + }, + { + "epoch": 1.99, + "learning_rate": 1.1161846778285133e-07, + "logits/chosen": -1.7052295207977295, + "logits/rejected": -1.6197770833969116, + "logps/chosen": -53.97678756713867, + "logps/rejected": -105.70039367675781, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.049503803253174, + "rewards/margins": 21.83470916748047, + "rewards/rejected": -23.884212493896484, + "step": 4370 + }, + { + "epoch": 2.0, + "learning_rate": 1.111111111111111e-07, + "logits/chosen": -1.7089096307754517, + "logits/rejected": -1.6089298725128174, + "logps/chosen": -60.19716262817383, + "logps/rejected": -108.61988830566406, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3397135734558105, + "rewards/margins": 22.69954490661621, + "rewards/rejected": -25.03925895690918, + "step": 4380 + }, + { + "epoch": 2.0, + "learning_rate": 1.1060375443937086e-07, + "logits/chosen": -1.7114375829696655, + "logits/rejected": -1.6113615036010742, + "logps/chosen": -59.75043487548828, + "logps/rejected": -101.7315673828125, + "loss": 0.0044, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.9044723510742188, + "rewards/margins": 20.202335357666016, + "rewards/rejected": -23.106807708740234, + "step": 4390 + }, + { + "epoch": 2.01, + "learning_rate": 1.1009639776763063e-07, + "logits/chosen": -1.7055928707122803, + "logits/rejected": -1.6417690515518188, + "logps/chosen": -56.58637619018555, + "logps/rejected": -108.21183013916016, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.384255886077881, + "rewards/margins": 21.54519271850586, + "rewards/rejected": -23.9294490814209, + "step": 4400 + }, + { + "epoch": 2.01, + "eval_logits/chosen": -1.49055814743042, + "eval_logits/rejected": -1.4087382555007935, + "eval_logps/chosen": -87.22195434570312, + "eval_logps/rejected": -107.77252960205078, + "eval_loss": 0.008222967386245728, + "eval_rewards/accuracies": 0.994413435459137, + "eval_rewards/chosen": -5.619990348815918, + "eval_rewards/margins": 18.314165115356445, + "eval_rewards/rejected": -23.93415641784668, + "eval_runtime": 162.4003, + "eval_samples_per_second": 17.623, + "eval_steps_per_second": 1.102, + "step": 4400 + }, + { + "epoch": 2.01, + "learning_rate": 1.095890410958904e-07, + "logits/chosen": -1.6955058574676514, + "logits/rejected": -1.615828514099121, + "logps/chosen": -61.4653205871582, + "logps/rejected": -108.96707916259766, + "loss": 0.0043, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.2798190116882324, + "rewards/margins": 20.68682098388672, + "rewards/rejected": -23.966638565063477, + "step": 4410 + }, + { + "epoch": 2.02, + "learning_rate": 1.0908168442415016e-07, + "logits/chosen": -1.7015609741210938, + "logits/rejected": -1.6179053783416748, + "logps/chosen": -60.08527755737305, + "logps/rejected": -104.529296875, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.195782423019409, + "rewards/margins": 21.137939453125, + "rewards/rejected": -24.333721160888672, + "step": 4420 + }, + { + "epoch": 2.02, + "learning_rate": 1.0857432775240993e-07, + "logits/chosen": -1.6842849254608154, + "logits/rejected": -1.6135200262069702, + "logps/chosen": -59.04445266723633, + "logps/rejected": -106.99456787109375, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8437340259552, + "rewards/margins": 21.239042282104492, + "rewards/rejected": -24.082775115966797, + "step": 4430 + }, + { + "epoch": 2.03, + "learning_rate": 1.080669710806697e-07, + "logits/chosen": -1.7014793157577515, + "logits/rejected": -1.6122283935546875, + "logps/chosen": -57.35608673095703, + "logps/rejected": -107.4156723022461, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6503283977508545, + "rewards/margins": 22.468738555908203, + "rewards/rejected": -25.119068145751953, + "step": 4440 + }, + { + "epoch": 2.03, + "learning_rate": 1.0755961440892946e-07, + "logits/chosen": -1.6996396780014038, + "logits/rejected": -1.6209256649017334, + "logps/chosen": -56.53656005859375, + "logps/rejected": -108.75687408447266, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6461291313171387, + "rewards/margins": 22.431716918945312, + "rewards/rejected": -25.07784652709961, + "step": 4450 + }, + { + "epoch": 2.04, + "learning_rate": 1.0705225773718923e-07, + "logits/chosen": -1.6660305261611938, + "logits/rejected": -1.5947883129119873, + "logps/chosen": -58.48649215698242, + "logps/rejected": -106.2381591796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.893306016921997, + "rewards/margins": 21.230649948120117, + "rewards/rejected": -24.123958587646484, + "step": 4460 + }, + { + "epoch": 2.04, + "learning_rate": 1.06544901065449e-07, + "logits/chosen": -1.702347993850708, + "logits/rejected": -1.608782410621643, + "logps/chosen": -62.32471466064453, + "logps/rejected": -104.51692199707031, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5780274868011475, + "rewards/margins": 20.563426971435547, + "rewards/rejected": -23.14145278930664, + "step": 4470 + }, + { + "epoch": 2.04, + "learning_rate": 1.0603754439370876e-07, + "logits/chosen": -1.7274497747421265, + "logits/rejected": -1.617231011390686, + "logps/chosen": -61.200843811035156, + "logps/rejected": -107.774169921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7115633487701416, + "rewards/margins": 22.002866744995117, + "rewards/rejected": -24.714427947998047, + "step": 4480 + }, + { + "epoch": 2.05, + "learning_rate": 1.0553018772196853e-07, + "logits/chosen": -1.6974376440048218, + "logits/rejected": -1.6113479137420654, + "logps/chosen": -58.778778076171875, + "logps/rejected": -106.168701171875, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8068671226501465, + "rewards/margins": 21.432415008544922, + "rewards/rejected": -24.239282608032227, + "step": 4490 + }, + { + "epoch": 2.05, + "learning_rate": 1.050228310502283e-07, + "logits/chosen": -1.6986557245254517, + "logits/rejected": -1.6170848608016968, + "logps/chosen": -56.63262176513672, + "logps/rejected": -107.97099304199219, + "loss": 0.0033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.7931902408599854, + "rewards/margins": 20.993011474609375, + "rewards/rejected": -23.78619956970215, + "step": 4500 + }, + { + "epoch": 2.05, + "eval_logits/chosen": -1.485693097114563, + "eval_logits/rejected": -1.403558373451233, + "eval_logps/chosen": -87.8786849975586, + "eval_logps/rejected": -109.02555084228516, + "eval_loss": 0.009082186035811901, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": -5.948355197906494, + "eval_rewards/margins": 18.612306594848633, + "eval_rewards/rejected": -24.56066131591797, + "eval_runtime": 165.5502, + "eval_samples_per_second": 17.288, + "eval_steps_per_second": 1.081, + "step": 4500 + }, + { + "epoch": 2.06, + "learning_rate": 1.0451547437848806e-07, + "logits/chosen": -1.7064087390899658, + "logits/rejected": -1.6112060546875, + "logps/chosen": -61.651649475097656, + "logps/rejected": -103.2201919555664, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2393550872802734, + "rewards/margins": 21.777130126953125, + "rewards/rejected": -24.01648712158203, + "step": 4510 + }, + { + "epoch": 2.06, + "learning_rate": 1.0400811770674783e-07, + "logits/chosen": -1.6898605823516846, + "logits/rejected": -1.6058692932128906, + "logps/chosen": -54.533302307128906, + "logps/rejected": -103.07552337646484, + "loss": 0.0065, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.6751835346221924, + "rewards/margins": 21.617483139038086, + "rewards/rejected": -24.292667388916016, + "step": 4520 + }, + { + "epoch": 2.07, + "learning_rate": 1.035007610350076e-07, + "logits/chosen": -1.7024545669555664, + "logits/rejected": -1.6204640865325928, + "logps/chosen": -60.74576950073242, + "logps/rejected": -104.13387298583984, + "loss": 0.0044, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.8922746181488037, + "rewards/margins": 20.929811477661133, + "rewards/rejected": -23.822086334228516, + "step": 4530 + }, + { + "epoch": 2.07, + "learning_rate": 1.0299340436326736e-07, + "logits/chosen": -1.6919069290161133, + "logits/rejected": -1.6228523254394531, + "logps/chosen": -58.68035125732422, + "logps/rejected": -105.86567687988281, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.017421007156372, + "rewards/margins": 20.6372013092041, + "rewards/rejected": -23.65462303161621, + "step": 4540 + }, + { + "epoch": 2.08, + "learning_rate": 1.0248604769152713e-07, + "logits/chosen": -1.6782621145248413, + "logits/rejected": -1.6088836193084717, + "logps/chosen": -56.09760665893555, + "logps/rejected": -108.61997985839844, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.794865131378174, + "rewards/margins": 21.76509666442871, + "rewards/rejected": -24.559961318969727, + "step": 4550 + }, + { + "epoch": 2.08, + "learning_rate": 1.019786910197869e-07, + "logits/chosen": -1.705232858657837, + "logits/rejected": -1.6264293193817139, + "logps/chosen": -61.69176483154297, + "logps/rejected": -104.30326843261719, + "loss": 0.0065, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.840217113494873, + "rewards/margins": 20.118412017822266, + "rewards/rejected": -23.958629608154297, + "step": 4560 + }, + { + "epoch": 2.09, + "learning_rate": 1.0147133434804666e-07, + "logits/chosen": -1.6915639638900757, + "logits/rejected": -1.5887486934661865, + "logps/chosen": -60.4460334777832, + "logps/rejected": -107.79622650146484, + "loss": 0.0054, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.5117955207824707, + "rewards/margins": 22.500534057617188, + "rewards/rejected": -25.0123291015625, + "step": 4570 + }, + { + "epoch": 2.09, + "learning_rate": 1.0096397767630643e-07, + "logits/chosen": -1.692584753036499, + "logits/rejected": -1.5952820777893066, + "logps/chosen": -60.45515060424805, + "logps/rejected": -107.60456848144531, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6986634731292725, + "rewards/margins": 22.19002914428711, + "rewards/rejected": -24.888689041137695, + "step": 4580 + }, + { + "epoch": 2.1, + "learning_rate": 1.004566210045662e-07, + "logits/chosen": -1.6760507822036743, + "logits/rejected": -1.6011130809783936, + "logps/chosen": -57.7957763671875, + "logps/rejected": -107.33087158203125, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.696836471557617, + "rewards/margins": 21.473825454711914, + "rewards/rejected": -24.1706600189209, + "step": 4590 + }, + { + "epoch": 2.1, + "learning_rate": 9.994926433282596e-08, + "logits/chosen": -1.6986204385757446, + "logits/rejected": -1.613415002822876, + "logps/chosen": -60.96204376220703, + "logps/rejected": -112.97148132324219, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.995871067047119, + "rewards/margins": 22.890216827392578, + "rewards/rejected": -25.88608741760254, + "step": 4600 + }, + { + "epoch": 2.1, + "eval_logits/chosen": -1.480384349822998, + "eval_logits/rejected": -1.3979898691177368, + "eval_logps/chosen": -88.09606170654297, + "eval_logps/rejected": -109.98899841308594, + "eval_loss": 0.009054499678313732, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": -6.057043552398682, + "eval_rewards/margins": 18.985332489013672, + "eval_rewards/rejected": -25.04237937927246, + "eval_runtime": 145.1115, + "eval_samples_per_second": 19.723, + "eval_steps_per_second": 1.234, + "step": 4600 + }, + { + "epoch": 2.1, + "learning_rate": 9.944190766108573e-08, + "logits/chosen": -1.7015920877456665, + "logits/rejected": -1.6078838109970093, + "logps/chosen": -60.95823287963867, + "logps/rejected": -107.05985260009766, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7135543823242188, + "rewards/margins": 21.817249298095703, + "rewards/rejected": -24.530803680419922, + "step": 4610 + }, + { + "epoch": 2.11, + "learning_rate": 9.89345509893455e-08, + "logits/chosen": -1.6830532550811768, + "logits/rejected": -1.6148555278778076, + "logps/chosen": -57.418052673339844, + "logps/rejected": -108.89680480957031, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.040402412414551, + "rewards/margins": 22.17599105834961, + "rewards/rejected": -25.21639060974121, + "step": 4620 + }, + { + "epoch": 2.11, + "learning_rate": 9.842719431760526e-08, + "logits/chosen": -1.6680667400360107, + "logits/rejected": -1.575258493423462, + "logps/chosen": -61.173545837402344, + "logps/rejected": -110.5925521850586, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2073662281036377, + "rewards/margins": 22.849517822265625, + "rewards/rejected": -26.056884765625, + "step": 4630 + }, + { + "epoch": 2.12, + "learning_rate": 9.791983764586503e-08, + "logits/chosen": -1.6535594463348389, + "logits/rejected": -1.560339331626892, + "logps/chosen": -58.0858154296875, + "logps/rejected": -102.92674255371094, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0235629081726074, + "rewards/margins": 21.722715377807617, + "rewards/rejected": -24.746280670166016, + "step": 4640 + }, + { + "epoch": 2.12, + "learning_rate": 9.74124809741248e-08, + "logits/chosen": -1.686806082725525, + "logits/rejected": -1.5998756885528564, + "logps/chosen": -58.31231689453125, + "logps/rejected": -111.29328918457031, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.88480544090271, + "rewards/margins": 23.29157257080078, + "rewards/rejected": -26.176376342773438, + "step": 4650 + }, + { + "epoch": 2.13, + "learning_rate": 9.690512430238456e-08, + "logits/chosen": -1.6860908269882202, + "logits/rejected": -1.591294527053833, + "logps/chosen": -60.230491638183594, + "logps/rejected": -108.77268981933594, + "loss": 0.0043, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.4025626182556152, + "rewards/margins": 22.170820236206055, + "rewards/rejected": -25.57338523864746, + "step": 4660 + }, + { + "epoch": 2.13, + "learning_rate": 9.639776763064433e-08, + "logits/chosen": -1.6946262121200562, + "logits/rejected": -1.6074960231781006, + "logps/chosen": -62.180450439453125, + "logps/rejected": -111.79603576660156, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5009357929229736, + "rewards/margins": 22.072834014892578, + "rewards/rejected": -25.57377052307129, + "step": 4670 + }, + { + "epoch": 2.14, + "learning_rate": 9.58904109589041e-08, + "logits/chosen": -1.692679762840271, + "logits/rejected": -1.5875680446624756, + "logps/chosen": -60.60527420043945, + "logps/rejected": -106.55836486816406, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.20111083984375, + "rewards/margins": 22.19775390625, + "rewards/rejected": -25.398862838745117, + "step": 4680 + }, + { + "epoch": 2.14, + "learning_rate": 9.538305428716386e-08, + "logits/chosen": -1.677522897720337, + "logits/rejected": -1.5916025638580322, + "logps/chosen": -64.4634017944336, + "logps/rejected": -106.9753189086914, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.455249786376953, + "rewards/margins": 21.74876594543457, + "rewards/rejected": -25.204017639160156, + "step": 4690 + }, + { + "epoch": 2.15, + "learning_rate": 9.487569761542363e-08, + "logits/chosen": -1.680101752281189, + "logits/rejected": -1.5896421670913696, + "logps/chosen": -60.176551818847656, + "logps/rejected": -102.62559509277344, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.885277271270752, + "rewards/margins": 21.611709594726562, + "rewards/rejected": -24.49698829650879, + "step": 4700 + }, + { + "epoch": 2.15, + "eval_logits/chosen": -1.4732141494750977, + "eval_logits/rejected": -1.3907400369644165, + "eval_logps/chosen": -88.74838256835938, + "eval_logps/rejected": -111.12362670898438, + "eval_loss": 0.01002542581409216, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": -6.383204460144043, + "eval_rewards/margins": 19.226491928100586, + "eval_rewards/rejected": -25.609697341918945, + "eval_runtime": 175.6871, + "eval_samples_per_second": 16.29, + "eval_steps_per_second": 1.019, + "step": 4700 + }, + { + "epoch": 2.15, + "learning_rate": 9.43683409436834e-08, + "logits/chosen": -1.672640085220337, + "logits/rejected": -1.5956499576568604, + "logps/chosen": -60.055267333984375, + "logps/rejected": -114.6338119506836, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.480705738067627, + "rewards/margins": 23.14187240600586, + "rewards/rejected": -26.62257957458496, + "step": 4710 + }, + { + "epoch": 2.15, + "learning_rate": 9.386098427194316e-08, + "logits/chosen": -1.6616106033325195, + "logits/rejected": -1.575148105621338, + "logps/chosen": -61.344154357910156, + "logps/rejected": -107.2572250366211, + "loss": 0.0043, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.7119224071502686, + "rewards/margins": 21.411645889282227, + "rewards/rejected": -25.12356948852539, + "step": 4720 + }, + { + "epoch": 2.16, + "learning_rate": 9.335362760020293e-08, + "logits/chosen": -1.6699577569961548, + "logits/rejected": -1.5892879962921143, + "logps/chosen": -58.083099365234375, + "logps/rejected": -106.17130279541016, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9335007667541504, + "rewards/margins": 21.795217514038086, + "rewards/rejected": -25.72871971130371, + "step": 4730 + }, + { + "epoch": 2.16, + "learning_rate": 9.28462709284627e-08, + "logits/chosen": -1.6922982931137085, + "logits/rejected": -1.59556245803833, + "logps/chosen": -60.62724685668945, + "logps/rejected": -108.93511962890625, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.105750322341919, + "rewards/margins": 22.99467658996582, + "rewards/rejected": -26.10042381286621, + "step": 4740 + }, + { + "epoch": 2.17, + "learning_rate": 9.233891425672246e-08, + "logits/chosen": -1.7342430353164673, + "logits/rejected": -1.6406952142715454, + "logps/chosen": -60.60820388793945, + "logps/rejected": -106.60917663574219, + "loss": 0.0054, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.723804235458374, + "rewards/margins": 21.95320701599121, + "rewards/rejected": -24.677011489868164, + "step": 4750 + }, + { + "epoch": 2.17, + "learning_rate": 9.183155758498223e-08, + "logits/chosen": -1.6949208974838257, + "logits/rejected": -1.6064071655273438, + "logps/chosen": -59.212013244628906, + "logps/rejected": -105.79365539550781, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6465213298797607, + "rewards/margins": 21.635541915893555, + "rewards/rejected": -24.28206443786621, + "step": 4760 + }, + { + "epoch": 2.18, + "learning_rate": 9.1324200913242e-08, + "logits/chosen": -1.725602388381958, + "logits/rejected": -1.6477651596069336, + "logps/chosen": -61.2446174621582, + "logps/rejected": -111.3396987915039, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.967522621154785, + "rewards/margins": 22.574275970458984, + "rewards/rejected": -25.541797637939453, + "step": 4770 + }, + { + "epoch": 2.18, + "learning_rate": 9.081684424150176e-08, + "logits/chosen": -1.6925163269042969, + "logits/rejected": -1.6027542352676392, + "logps/chosen": -59.8698844909668, + "logps/rejected": -109.35646057128906, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.84021258354187, + "rewards/margins": 22.394512176513672, + "rewards/rejected": -25.234722137451172, + "step": 4780 + }, + { + "epoch": 2.19, + "learning_rate": 9.030948756976153e-08, + "logits/chosen": -1.6867091655731201, + "logits/rejected": -1.5830605030059814, + "logps/chosen": -61.847023010253906, + "logps/rejected": -107.93465423583984, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9739956855773926, + "rewards/margins": 22.876094818115234, + "rewards/rejected": -25.850088119506836, + "step": 4790 + }, + { + "epoch": 2.19, + "learning_rate": 8.98021308980213e-08, + "logits/chosen": -1.7063744068145752, + "logits/rejected": -1.6185452938079834, + "logps/chosen": -60.459068298339844, + "logps/rejected": -110.65385437011719, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.613921880722046, + "rewards/margins": 23.015316009521484, + "rewards/rejected": -25.629236221313477, + "step": 4800 + }, + { + "epoch": 2.19, + "eval_logits/chosen": -1.482709288597107, + "eval_logits/rejected": -1.4006223678588867, + "eval_logps/chosen": -87.56159210205078, + "eval_logps/rejected": -110.17630767822266, + "eval_loss": 0.007255359552800655, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": -5.789804935455322, + "eval_rewards/margins": 19.346235275268555, + "eval_rewards/rejected": -25.13603973388672, + "eval_runtime": 148.1957, + "eval_samples_per_second": 19.312, + "eval_steps_per_second": 1.208, + "step": 4800 + }, + { + "epoch": 2.2, + "learning_rate": 8.929477422628106e-08, + "logits/chosen": -1.7169044017791748, + "logits/rejected": -1.6037685871124268, + "logps/chosen": -66.4024429321289, + "logps/rejected": -109.7977294921875, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.555539608001709, + "rewards/margins": 21.82910919189453, + "rewards/rejected": -25.3846492767334, + "step": 4810 + }, + { + "epoch": 2.2, + "learning_rate": 8.878741755454083e-08, + "logits/chosen": -1.7067817449569702, + "logits/rejected": -1.6083142757415771, + "logps/chosen": -62.67310333251953, + "logps/rejected": -111.0739517211914, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.06396746635437, + "rewards/margins": 22.700790405273438, + "rewards/rejected": -25.764759063720703, + "step": 4820 + }, + { + "epoch": 2.2, + "learning_rate": 8.82800608828006e-08, + "logits/chosen": -1.7074460983276367, + "logits/rejected": -1.624869704246521, + "logps/chosen": -62.67158889770508, + "logps/rejected": -105.26263427734375, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1344337463378906, + "rewards/margins": 21.621511459350586, + "rewards/rejected": -24.75594711303711, + "step": 4830 + }, + { + "epoch": 2.21, + "learning_rate": 8.777270421106036e-08, + "logits/chosen": -1.704567313194275, + "logits/rejected": -1.6185743808746338, + "logps/chosen": -57.15745162963867, + "logps/rejected": -110.69510650634766, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8284034729003906, + "rewards/margins": 23.138578414916992, + "rewards/rejected": -25.966983795166016, + "step": 4840 + }, + { + "epoch": 2.21, + "learning_rate": 8.726534753932013e-08, + "logits/chosen": -1.6907163858413696, + "logits/rejected": -1.6202579736709595, + "logps/chosen": -56.169822692871094, + "logps/rejected": -107.8416748046875, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6681342124938965, + "rewards/margins": 22.314682006835938, + "rewards/rejected": -24.98281478881836, + "step": 4850 + }, + { + "epoch": 2.22, + "learning_rate": 8.67579908675799e-08, + "logits/chosen": -1.6673482656478882, + "logits/rejected": -1.5913320779800415, + "logps/chosen": -57.061622619628906, + "logps/rejected": -108.7569351196289, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0472772121429443, + "rewards/margins": 22.588138580322266, + "rewards/rejected": -25.63541603088379, + "step": 4860 + }, + { + "epoch": 2.22, + "learning_rate": 8.625063419583966e-08, + "logits/chosen": -1.6885020732879639, + "logits/rejected": -1.6073684692382812, + "logps/chosen": -58.192726135253906, + "logps/rejected": -104.97102355957031, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7794158458709717, + "rewards/margins": 21.73690414428711, + "rewards/rejected": -24.516321182250977, + "step": 4870 + }, + { + "epoch": 2.23, + "learning_rate": 8.574327752409943e-08, + "logits/chosen": -1.6919105052947998, + "logits/rejected": -1.6105105876922607, + "logps/chosen": -54.97904586791992, + "logps/rejected": -108.90132141113281, + "loss": 0.0022, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.928136110305786, + "rewards/margins": 22.817455291748047, + "rewards/rejected": -25.745594024658203, + "step": 4880 + }, + { + "epoch": 2.23, + "learning_rate": 8.52359208523592e-08, + "logits/chosen": -1.7173068523406982, + "logits/rejected": -1.6142711639404297, + "logps/chosen": -58.7863883972168, + "logps/rejected": -109.75135803222656, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4348838329315186, + "rewards/margins": 23.071659088134766, + "rewards/rejected": -25.506546020507812, + "step": 4890 + }, + { + "epoch": 2.24, + "learning_rate": 8.472856418061896e-08, + "logits/chosen": -1.698009729385376, + "logits/rejected": -1.5966373682022095, + "logps/chosen": -62.571998596191406, + "logps/rejected": -109.10028076171875, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0530714988708496, + "rewards/margins": 22.67581558227539, + "rewards/rejected": -25.7288875579834, + "step": 4900 + }, + { + "epoch": 2.24, + "eval_logits/chosen": -1.473187804222107, + "eval_logits/rejected": -1.3907272815704346, + "eval_logps/chosen": -88.25776672363281, + "eval_logps/rejected": -111.77100372314453, + "eval_loss": 0.009084388613700867, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": -6.137899875640869, + "eval_rewards/margins": 19.795488357543945, + "eval_rewards/rejected": -25.933391571044922, + "eval_runtime": 177.6801, + "eval_samples_per_second": 16.108, + "eval_steps_per_second": 1.007, + "step": 4900 + }, + { + "epoch": 2.24, + "learning_rate": 8.422120750887873e-08, + "logits/chosen": -1.6893600225448608, + "logits/rejected": -1.5956366062164307, + "logps/chosen": -60.0640869140625, + "logps/rejected": -111.0639877319336, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.929564952850342, + "rewards/margins": 22.824703216552734, + "rewards/rejected": -25.7542667388916, + "step": 4910 + }, + { + "epoch": 2.25, + "learning_rate": 8.37138508371385e-08, + "logits/chosen": -1.700185775756836, + "logits/rejected": -1.6091382503509521, + "logps/chosen": -61.7586784362793, + "logps/rejected": -114.5706558227539, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8398773670196533, + "rewards/margins": 23.557147979736328, + "rewards/rejected": -26.397024154663086, + "step": 4920 + }, + { + "epoch": 2.25, + "learning_rate": 8.320649416539826e-08, + "logits/chosen": -1.6746511459350586, + "logits/rejected": -1.587571382522583, + "logps/chosen": -62.784088134765625, + "logps/rejected": -110.8236312866211, + "loss": 0.0033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.5711326599121094, + "rewards/margins": 22.47551155090332, + "rewards/rejected": -26.046642303466797, + "step": 4930 + }, + { + "epoch": 2.25, + "learning_rate": 8.269913749365803e-08, + "logits/chosen": -1.7054109573364258, + "logits/rejected": -1.6170246601104736, + "logps/chosen": -64.3830337524414, + "logps/rejected": -114.76332092285156, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3856863975524902, + "rewards/margins": 23.24374771118164, + "rewards/rejected": -26.629430770874023, + "step": 4940 + }, + { + "epoch": 2.26, + "learning_rate": 8.21917808219178e-08, + "logits/chosen": -1.696123480796814, + "logits/rejected": -1.6094690561294556, + "logps/chosen": -61.13164138793945, + "logps/rejected": -114.35484313964844, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.768718957901001, + "rewards/margins": 24.020790100097656, + "rewards/rejected": -26.789508819580078, + "step": 4950 + }, + { + "epoch": 2.26, + "learning_rate": 8.168442415017756e-08, + "logits/chosen": -1.6941993236541748, + "logits/rejected": -1.6031553745269775, + "logps/chosen": -59.2587890625, + "logps/rejected": -110.17204284667969, + "loss": 0.0043, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.780848741531372, + "rewards/margins": 22.9312744140625, + "rewards/rejected": -25.71212387084961, + "step": 4960 + }, + { + "epoch": 2.27, + "learning_rate": 8.117706747843733e-08, + "logits/chosen": -1.6594960689544678, + "logits/rejected": -1.5845571756362915, + "logps/chosen": -59.62993621826172, + "logps/rejected": -109.40238952636719, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7585835456848145, + "rewards/margins": 22.709556579589844, + "rewards/rejected": -25.468141555786133, + "step": 4970 + }, + { + "epoch": 2.27, + "learning_rate": 8.06697108066971e-08, + "logits/chosen": -1.6639039516448975, + "logits/rejected": -1.5775517225265503, + "logps/chosen": -59.787498474121094, + "logps/rejected": -110.11994934082031, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1483092308044434, + "rewards/margins": 23.41049575805664, + "rewards/rejected": -26.558801651000977, + "step": 4980 + }, + { + "epoch": 2.28, + "learning_rate": 8.016235413495687e-08, + "logits/chosen": -1.6792118549346924, + "logits/rejected": -1.6143678426742554, + "logps/chosen": -58.59711456298828, + "logps/rejected": -112.262939453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5294151306152344, + "rewards/margins": 23.087398529052734, + "rewards/rejected": -26.6168155670166, + "step": 4990 + }, + { + "epoch": 2.28, + "learning_rate": 7.965499746321664e-08, + "logits/chosen": -1.6774799823760986, + "logits/rejected": -1.5791685581207275, + "logps/chosen": -61.466697692871094, + "logps/rejected": -119.5277328491211, + "loss": 0.0022, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.4334099292755127, + "rewards/margins": 24.16744613647461, + "rewards/rejected": -27.600854873657227, + "step": 5000 + }, + { + "epoch": 2.28, + "eval_logits/chosen": -1.4564272165298462, + "eval_logits/rejected": -1.3738222122192383, + "eval_logps/chosen": -90.7276611328125, + "eval_logps/rejected": -115.12030792236328, + "eval_loss": 0.014685509726405144, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": -7.372840881347656, + "eval_rewards/margins": 20.23519515991211, + "eval_rewards/rejected": -27.608036041259766, + "eval_runtime": 159.2397, + "eval_samples_per_second": 17.973, + "eval_steps_per_second": 1.124, + "step": 5000 + }, + { + "epoch": 2.29, + "learning_rate": 7.91476407914764e-08, + "logits/chosen": -1.6650855541229248, + "logits/rejected": -1.5862457752227783, + "logps/chosen": -61.99729537963867, + "logps/rejected": -112.56124114990234, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6777865886688232, + "rewards/margins": 23.270479202270508, + "rewards/rejected": -26.94826316833496, + "step": 5010 + }, + { + "epoch": 2.29, + "learning_rate": 7.864028411973617e-08, + "logits/chosen": -1.6611073017120361, + "logits/rejected": -1.579276204109192, + "logps/chosen": -61.78557586669922, + "logps/rejected": -111.4010009765625, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6312317848205566, + "rewards/margins": 23.880470275878906, + "rewards/rejected": -27.511699676513672, + "step": 5020 + }, + { + "epoch": 2.3, + "learning_rate": 7.813292744799594e-08, + "logits/chosen": -1.6593017578125, + "logits/rejected": -1.576297640800476, + "logps/chosen": -61.967369079589844, + "logps/rejected": -114.94383239746094, + "loss": 0.0022, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.7899672985076904, + "rewards/margins": 24.285675048828125, + "rewards/rejected": -28.075641632080078, + "step": 5030 + }, + { + "epoch": 2.3, + "learning_rate": 7.76255707762557e-08, + "logits/chosen": -1.6597697734832764, + "logits/rejected": -1.576095461845398, + "logps/chosen": -60.80706787109375, + "logps/rejected": -112.26513671875, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6342098712921143, + "rewards/margins": 23.36855697631836, + "rewards/rejected": -27.002765655517578, + "step": 5040 + }, + { + "epoch": 2.31, + "learning_rate": 7.711821410451547e-08, + "logits/chosen": -1.6522419452667236, + "logits/rejected": -1.563639760017395, + "logps/chosen": -63.43516159057617, + "logps/rejected": -111.27690124511719, + "loss": 0.0022, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.502595901489258, + "rewards/margins": 23.448060989379883, + "rewards/rejected": -26.95065689086914, + "step": 5050 + }, + { + "epoch": 2.31, + "learning_rate": 7.661085743277524e-08, + "logits/chosen": -1.6515191793441772, + "logits/rejected": -1.5625699758529663, + "logps/chosen": -64.24688720703125, + "logps/rejected": -117.6348876953125, + "loss": 0.0011, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.006891250610352, + "rewards/margins": 24.36283302307129, + "rewards/rejected": -28.369726181030273, + "step": 5060 + }, + { + "epoch": 2.31, + "learning_rate": 7.6103500761035e-08, + "logits/chosen": -1.6504627466201782, + "logits/rejected": -1.5722054243087769, + "logps/chosen": -59.683631896972656, + "logps/rejected": -113.41046142578125, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.637004852294922, + "rewards/margins": 24.63381004333496, + "rewards/rejected": -28.270816802978516, + "step": 5070 + }, + { + "epoch": 2.32, + "learning_rate": 7.559614408929477e-08, + "logits/chosen": -1.6522117853164673, + "logits/rejected": -1.560707926750183, + "logps/chosen": -62.32175827026367, + "logps/rejected": -116.25910949707031, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.994751214981079, + "rewards/margins": 25.31358528137207, + "rewards/rejected": -28.308338165283203, + "step": 5080 + }, + { + "epoch": 2.32, + "learning_rate": 7.508878741755454e-08, + "logits/chosen": -1.6741764545440674, + "logits/rejected": -1.5819628238677979, + "logps/chosen": -61.825042724609375, + "logps/rejected": -112.41102600097656, + "loss": 0.0054, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.4560928344726562, + "rewards/margins": 24.269001007080078, + "rewards/rejected": -27.725093841552734, + "step": 5090 + }, + { + "epoch": 2.33, + "learning_rate": 7.45814307458143e-08, + "logits/chosen": -1.678259253501892, + "logits/rejected": -1.5955337285995483, + "logps/chosen": -62.3753547668457, + "logps/rejected": -110.7182846069336, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.586951732635498, + "rewards/margins": 23.020858764648438, + "rewards/rejected": -26.60780906677246, + "step": 5100 + }, + { + "epoch": 2.33, + "eval_logits/chosen": -1.4603537321090698, + "eval_logits/rejected": -1.3779700994491577, + "eval_logps/chosen": -89.79312896728516, + "eval_logps/rejected": -114.51570129394531, + "eval_loss": 0.011963811703026295, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": -6.905576705932617, + "eval_rewards/margins": 20.400161743164062, + "eval_rewards/rejected": -27.305734634399414, + "eval_runtime": 137.8862, + "eval_samples_per_second": 20.756, + "eval_steps_per_second": 1.298, + "step": 5100 + }, + { + "epoch": 2.33, + "learning_rate": 7.407407407407407e-08, + "logits/chosen": -1.6667464971542358, + "logits/rejected": -1.590036392211914, + "logps/chosen": -62.1322021484375, + "logps/rejected": -112.29280090332031, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8729019165039062, + "rewards/margins": 23.566265106201172, + "rewards/rejected": -27.43916893005371, + "step": 5110 + }, + { + "epoch": 2.34, + "learning_rate": 7.356671740233384e-08, + "logits/chosen": -1.6505603790283203, + "logits/rejected": -1.551260232925415, + "logps/chosen": -60.57634735107422, + "logps/rejected": -110.75213623046875, + "loss": 0.0033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.6815459728240967, + "rewards/margins": 23.88407325744629, + "rewards/rejected": -27.56561851501465, + "step": 5120 + }, + { + "epoch": 2.34, + "learning_rate": 7.30593607305936e-08, + "logits/chosen": -1.6690067052841187, + "logits/rejected": -1.5922582149505615, + "logps/chosen": -63.007774353027344, + "logps/rejected": -111.46229553222656, + "loss": 0.0022, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.688744068145752, + "rewards/margins": 22.96770477294922, + "rewards/rejected": -26.656448364257812, + "step": 5130 + }, + { + "epoch": 2.35, + "learning_rate": 7.255200405885337e-08, + "logits/chosen": -1.669487714767456, + "logits/rejected": -1.5800920724868774, + "logps/chosen": -60.822509765625, + "logps/rejected": -113.46736907958984, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.038383722305298, + "rewards/margins": 23.952011108398438, + "rewards/rejected": -26.990392684936523, + "step": 5140 + }, + { + "epoch": 2.35, + "learning_rate": 7.204464738711314e-08, + "logits/chosen": -1.6490485668182373, + "logits/rejected": -1.565785527229309, + "logps/chosen": -61.78581619262695, + "logps/rejected": -113.68495178222656, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.121267318725586, + "rewards/margins": 24.656448364257812, + "rewards/rejected": -27.7777156829834, + "step": 5150 + }, + { + "epoch": 2.36, + "learning_rate": 7.15372907153729e-08, + "logits/chosen": -1.689867615699768, + "logits/rejected": -1.5955573320388794, + "logps/chosen": -61.59331130981445, + "logps/rejected": -114.4730224609375, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.735996723175049, + "rewards/margins": 24.27120590209961, + "rewards/rejected": -28.0072021484375, + "step": 5160 + }, + { + "epoch": 2.36, + "learning_rate": 7.102993404363267e-08, + "logits/chosen": -1.6986421346664429, + "logits/rejected": -1.6016403436660767, + "logps/chosen": -62.81281661987305, + "logps/rejected": -111.57080078125, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.166194438934326, + "rewards/margins": 24.02010726928711, + "rewards/rejected": -27.18630599975586, + "step": 5170 + }, + { + "epoch": 2.36, + "learning_rate": 7.052257737189244e-08, + "logits/chosen": -1.664933443069458, + "logits/rejected": -1.5854079723358154, + "logps/chosen": -60.731178283691406, + "logps/rejected": -115.13871765136719, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7711570262908936, + "rewards/margins": 24.53057861328125, + "rewards/rejected": -28.301733016967773, + "step": 5180 + }, + { + "epoch": 2.37, + "learning_rate": 7.00152207001522e-08, + "logits/chosen": -1.676743507385254, + "logits/rejected": -1.5855991840362549, + "logps/chosen": -60.43587112426758, + "logps/rejected": -114.07183837890625, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.224400758743286, + "rewards/margins": 23.96244239807129, + "rewards/rejected": -27.186847686767578, + "step": 5190 + }, + { + "epoch": 2.37, + "learning_rate": 6.950786402841197e-08, + "logits/chosen": -1.651254415512085, + "logits/rejected": -1.5747812986373901, + "logps/chosen": -64.29228210449219, + "logps/rejected": -112.44258117675781, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9219841957092285, + "rewards/margins": 23.39896011352539, + "rewards/rejected": -27.32094383239746, + "step": 5200 + }, + { + "epoch": 2.37, + "eval_logits/chosen": -1.4592654705047607, + "eval_logits/rejected": -1.377195954322815, + "eval_logps/chosen": -89.17169189453125, + "eval_logps/rejected": -115.13497161865234, + "eval_loss": 0.009668777696788311, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": -6.594855308532715, + "eval_rewards/margins": 21.02051544189453, + "eval_rewards/rejected": -27.615367889404297, + "eval_runtime": 161.4474, + "eval_samples_per_second": 17.727, + "eval_steps_per_second": 1.109, + "step": 5200 + }, + { + "epoch": 2.38, + "learning_rate": 6.900050735667174e-08, + "logits/chosen": -1.6961199045181274, + "logits/rejected": -1.6116136312484741, + "logps/chosen": -61.264503479003906, + "logps/rejected": -113.85749816894531, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.226783275604248, + "rewards/margins": 24.32256317138672, + "rewards/rejected": -27.549346923828125, + "step": 5210 + }, + { + "epoch": 2.38, + "learning_rate": 6.84931506849315e-08, + "logits/chosen": -1.6688201427459717, + "logits/rejected": -1.5917457342147827, + "logps/chosen": -60.019065856933594, + "logps/rejected": -117.906005859375, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.143035411834717, + "rewards/margins": 25.059246063232422, + "rewards/rejected": -28.202281951904297, + "step": 5220 + }, + { + "epoch": 2.39, + "learning_rate": 6.798579401319127e-08, + "logits/chosen": -1.6733934879302979, + "logits/rejected": -1.5905344486236572, + "logps/chosen": -60.477500915527344, + "logps/rejected": -115.54353332519531, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.429901599884033, + "rewards/margins": 24.198131561279297, + "rewards/rejected": -27.628036499023438, + "step": 5230 + }, + { + "epoch": 2.39, + "learning_rate": 6.747843734145104e-08, + "logits/chosen": -1.6754209995269775, + "logits/rejected": -1.5958062410354614, + "logps/chosen": -57.3133430480957, + "logps/rejected": -111.15318298339844, + "loss": 0.0034, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.7517170906066895, + "rewards/margins": 24.133460998535156, + "rewards/rejected": -26.885177612304688, + "step": 5240 + }, + { + "epoch": 2.4, + "learning_rate": 6.69710806697108e-08, + "logits/chosen": -1.6434913873672485, + "logits/rejected": -1.5607589483261108, + "logps/chosen": -61.31553268432617, + "logps/rejected": -114.73616027832031, + "loss": 0.0043, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.1473183631896973, + "rewards/margins": 24.592918395996094, + "rewards/rejected": -27.740238189697266, + "step": 5250 + }, + { + "epoch": 2.4, + "learning_rate": 6.646372399797057e-08, + "logits/chosen": -1.679810881614685, + "logits/rejected": -1.5854202508926392, + "logps/chosen": -60.2682991027832, + "logps/rejected": -108.10420227050781, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6345019340515137, + "rewards/margins": 23.013126373291016, + "rewards/rejected": -26.647632598876953, + "step": 5260 + }, + { + "epoch": 2.41, + "learning_rate": 6.595636732623034e-08, + "logits/chosen": -1.6663167476654053, + "logits/rejected": -1.583310842514038, + "logps/chosen": -61.53295135498047, + "logps/rejected": -111.26729583740234, + "loss": 0.0011, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.6133663654327393, + "rewards/margins": 23.47526741027832, + "rewards/rejected": -27.0886287689209, + "step": 5270 + }, + { + "epoch": 2.41, + "learning_rate": 6.54490106544901e-08, + "logits/chosen": -1.6865384578704834, + "logits/rejected": -1.6003679037094116, + "logps/chosen": -60.403114318847656, + "logps/rejected": -114.53379821777344, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2604928016662598, + "rewards/margins": 25.53091049194336, + "rewards/rejected": -28.79140281677246, + "step": 5280 + }, + { + "epoch": 2.41, + "learning_rate": 6.494165398274987e-08, + "logits/chosen": -1.6446622610092163, + "logits/rejected": -1.5624706745147705, + "logps/chosen": -62.8109130859375, + "logps/rejected": -112.4520263671875, + "loss": 0.0022, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.6601951122283936, + "rewards/margins": 23.925983428955078, + "rewards/rejected": -27.5861759185791, + "step": 5290 + }, + { + "epoch": 2.42, + "learning_rate": 6.443429731100964e-08, + "logits/chosen": -1.6465861797332764, + "logits/rejected": -1.563546061515808, + "logps/chosen": -62.8968505859375, + "logps/rejected": -117.574951171875, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.694208860397339, + "rewards/margins": 25.211589813232422, + "rewards/rejected": -28.905797958374023, + "step": 5300 + }, + { + "epoch": 2.42, + "eval_logits/chosen": -1.4465004205703735, + "eval_logits/rejected": -1.3646644353866577, + "eval_logps/chosen": -91.00645446777344, + "eval_logps/rejected": -117.21986389160156, + "eval_loss": 0.015215998515486717, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": -7.512238502502441, + "eval_rewards/margins": 21.14558219909668, + "eval_rewards/rejected": -28.657821655273438, + "eval_runtime": 149.4468, + "eval_samples_per_second": 19.151, + "eval_steps_per_second": 1.198, + "step": 5300 + }, + { + "epoch": 2.42, + "learning_rate": 6.39269406392694e-08, + "logits/chosen": -1.641343355178833, + "logits/rejected": -1.5607117414474487, + "logps/chosen": -62.00952911376953, + "logps/rejected": -118.48858642578125, + "loss": 0.0044, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.9190986156463623, + "rewards/margins": 25.884418487548828, + "rewards/rejected": -29.80352210998535, + "step": 5310 + }, + { + "epoch": 2.43, + "learning_rate": 6.341958396752917e-08, + "logits/chosen": -1.6571009159088135, + "logits/rejected": -1.5537234544754028, + "logps/chosen": -63.40632247924805, + "logps/rejected": -119.00166320800781, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7866835594177246, + "rewards/margins": 25.64019775390625, + "rewards/rejected": -29.4268798828125, + "step": 5320 + }, + { + "epoch": 2.43, + "learning_rate": 6.291222729578894e-08, + "logits/chosen": -1.67231023311615, + "logits/rejected": -1.5877420902252197, + "logps/chosen": -63.801239013671875, + "logps/rejected": -115.84706115722656, + "loss": 0.0033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.1915388107299805, + "rewards/margins": 24.520641326904297, + "rewards/rejected": -28.71217918395996, + "step": 5330 + }, + { + "epoch": 2.44, + "learning_rate": 6.24048706240487e-08, + "logits/chosen": -1.6602236032485962, + "logits/rejected": -1.552490472793579, + "logps/chosen": -67.17897033691406, + "logps/rejected": -118.0072021484375, + "loss": 0.0054, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.39713191986084, + "rewards/margins": 24.628887176513672, + "rewards/rejected": -29.02602195739746, + "step": 5340 + }, + { + "epoch": 2.44, + "learning_rate": 6.189751395230847e-08, + "logits/chosen": -1.6501667499542236, + "logits/rejected": -1.5565497875213623, + "logps/chosen": -62.2840690612793, + "logps/rejected": -115.02555847167969, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.614900588989258, + "rewards/margins": 25.629772186279297, + "rewards/rejected": -29.244674682617188, + "step": 5350 + }, + { + "epoch": 2.45, + "learning_rate": 6.139015728056824e-08, + "logits/chosen": -1.6319433450698853, + "logits/rejected": -1.5535074472427368, + "logps/chosen": -60.017303466796875, + "logps/rejected": -115.55560302734375, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5083460807800293, + "rewards/margins": 24.990381240844727, + "rewards/rejected": -28.498727798461914, + "step": 5360 + }, + { + "epoch": 2.45, + "learning_rate": 6.0882800608828e-08, + "logits/chosen": -1.6377484798431396, + "logits/rejected": -1.567611575126648, + "logps/chosen": -59.57817840576172, + "logps/rejected": -112.98590087890625, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.69633412361145, + "rewards/margins": 25.142475128173828, + "rewards/rejected": -28.83881187438965, + "step": 5370 + }, + { + "epoch": 2.46, + "learning_rate": 6.037544393708777e-08, + "logits/chosen": -1.6721665859222412, + "logits/rejected": -1.5867160558700562, + "logps/chosen": -62.4495735168457, + "logps/rejected": -119.78460693359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2411720752716064, + "rewards/margins": 25.996212005615234, + "rewards/rejected": -29.237384796142578, + "step": 5380 + }, + { + "epoch": 2.46, + "learning_rate": 5.986808726534754e-08, + "logits/chosen": -1.6507494449615479, + "logits/rejected": -1.5618906021118164, + "logps/chosen": -62.99871826171875, + "logps/rejected": -118.33638763427734, + "loss": 0.0057, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.6402747631073, + "rewards/margins": 26.580303192138672, + "rewards/rejected": -30.2205810546875, + "step": 5390 + }, + { + "epoch": 2.46, + "learning_rate": 5.93607305936073e-08, + "logits/chosen": -1.6390947103500366, + "logits/rejected": -1.5543744564056396, + "logps/chosen": -59.22716522216797, + "logps/rejected": -118.3710708618164, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3072409629821777, + "rewards/margins": 26.537982940673828, + "rewards/rejected": -29.845226287841797, + "step": 5400 + }, + { + "epoch": 2.46, + "eval_logits/chosen": -1.4331419467926025, + "eval_logits/rejected": -1.3514832258224487, + "eval_logps/chosen": -91.3964614868164, + "eval_logps/rejected": -118.7977066040039, + "eval_loss": 0.014859071001410484, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": -7.707246780395508, + "eval_rewards/margins": 21.739484786987305, + "eval_rewards/rejected": -29.446731567382812, + "eval_runtime": 148.897, + "eval_samples_per_second": 19.221, + "eval_steps_per_second": 1.202, + "step": 5400 + }, + { + "epoch": 2.47, + "learning_rate": 5.8853373921867065e-08, + "logits/chosen": -1.6490676403045654, + "logits/rejected": -1.5844876766204834, + "logps/chosen": -60.4987907409668, + "logps/rejected": -123.09349060058594, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7090137004852295, + "rewards/margins": 26.693166732788086, + "rewards/rejected": -30.40218162536621, + "step": 5410 + }, + { + "epoch": 2.47, + "learning_rate": 5.834601725012683e-08, + "logits/chosen": -1.6431608200073242, + "logits/rejected": -1.556091547012329, + "logps/chosen": -60.43384552001953, + "logps/rejected": -116.97358703613281, + "loss": 0.0011, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.6738152503967285, + "rewards/margins": 25.790332794189453, + "rewards/rejected": -29.464147567749023, + "step": 5420 + }, + { + "epoch": 2.48, + "learning_rate": 5.78386605783866e-08, + "logits/chosen": -1.6185407638549805, + "logits/rejected": -1.5349016189575195, + "logps/chosen": -61.325828552246094, + "logps/rejected": -115.5124740600586, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4237639904022217, + "rewards/margins": 26.009695053100586, + "rewards/rejected": -29.433462142944336, + "step": 5430 + }, + { + "epoch": 2.48, + "learning_rate": 5.7331303906646365e-08, + "logits/chosen": -1.6398437023162842, + "logits/rejected": -1.5696852207183838, + "logps/chosen": -60.71503829956055, + "logps/rejected": -117.88394927978516, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8025786876678467, + "rewards/margins": 26.124065399169922, + "rewards/rejected": -29.926645278930664, + "step": 5440 + }, + { + "epoch": 2.49, + "learning_rate": 5.682394723490613e-08, + "logits/chosen": -1.642251968383789, + "logits/rejected": -1.5555260181427002, + "logps/chosen": -65.87081909179688, + "logps/rejected": -118.69813537597656, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.45894718170166, + "rewards/margins": 24.65566635131836, + "rewards/rejected": -29.114614486694336, + "step": 5450 + }, + { + "epoch": 2.49, + "learning_rate": 5.63165905631659e-08, + "logits/chosen": -1.626678466796875, + "logits/rejected": -1.54049551486969, + "logps/chosen": -65.37655639648438, + "logps/rejected": -124.11677551269531, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.931981325149536, + "rewards/margins": 27.34768295288086, + "rewards/rejected": -31.2796573638916, + "step": 5460 + }, + { + "epoch": 2.5, + "learning_rate": 5.5809233891425665e-08, + "logits/chosen": -1.622873067855835, + "logits/rejected": -1.536217212677002, + "logps/chosen": -60.86760711669922, + "logps/rejected": -113.6754379272461, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9700331687927246, + "rewards/margins": 25.673553466796875, + "rewards/rejected": -29.643585205078125, + "step": 5470 + }, + { + "epoch": 2.5, + "learning_rate": 5.530187721968543e-08, + "logits/chosen": -1.6449658870697021, + "logits/rejected": -1.557802677154541, + "logps/chosen": -65.7143783569336, + "logps/rejected": -118.18770599365234, + "loss": 0.0022, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.456841945648193, + "rewards/margins": 25.29494285583496, + "rewards/rejected": -29.751785278320312, + "step": 5480 + }, + { + "epoch": 2.51, + "learning_rate": 5.47945205479452e-08, + "logits/chosen": -1.6212965250015259, + "logits/rejected": -1.5396636724472046, + "logps/chosen": -60.645843505859375, + "logps/rejected": -120.55546569824219, + "loss": 0.0033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.6203675270080566, + "rewards/margins": 26.316967010498047, + "rewards/rejected": -29.937335968017578, + "step": 5490 + }, + { + "epoch": 2.51, + "learning_rate": 5.4287163876204964e-08, + "logits/chosen": -1.640913963317871, + "logits/rejected": -1.5578389167785645, + "logps/chosen": -61.227508544921875, + "logps/rejected": -118.57939147949219, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.417339324951172, + "rewards/margins": 26.617385864257812, + "rewards/rejected": -30.034725189208984, + "step": 5500 + }, + { + "epoch": 2.51, + "eval_logits/chosen": -1.4292577505111694, + "eval_logits/rejected": -1.3483442068099976, + "eval_logps/chosen": -91.32805633544922, + "eval_logps/rejected": -118.79887390136719, + "eval_loss": 0.013684802688658237, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": -7.673043251037598, + "eval_rewards/margins": 21.774276733398438, + "eval_rewards/rejected": -29.44732093811035, + "eval_runtime": 165.9448, + "eval_samples_per_second": 17.247, + "eval_steps_per_second": 1.079, + "step": 5500 + }, + { + "epoch": 2.52, + "learning_rate": 5.377980720446473e-08, + "logits/chosen": -1.6323438882827759, + "logits/rejected": -1.5507663488388062, + "logps/chosen": -61.16468048095703, + "logps/rejected": -113.951904296875, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.240023612976074, + "rewards/margins": 25.353132247924805, + "rewards/rejected": -29.593158721923828, + "step": 5510 + }, + { + "epoch": 2.52, + "learning_rate": 5.32724505327245e-08, + "logits/chosen": -1.6588122844696045, + "logits/rejected": -1.5675216913223267, + "logps/chosen": -61.83917999267578, + "logps/rejected": -117.0645751953125, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4707107543945312, + "rewards/margins": 25.591796875, + "rewards/rejected": -29.062509536743164, + "step": 5520 + }, + { + "epoch": 2.52, + "learning_rate": 5.2765093860984264e-08, + "logits/chosen": -1.6362457275390625, + "logits/rejected": -1.5562450885772705, + "logps/chosen": -62.32781219482422, + "logps/rejected": -117.09416198730469, + "loss": 0.0033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.09111213684082, + "rewards/margins": 25.149341583251953, + "rewards/rejected": -29.24045753479004, + "step": 5530 + }, + { + "epoch": 2.53, + "learning_rate": 5.225773718924403e-08, + "logits/chosen": -1.6216081380844116, + "logits/rejected": -1.5349209308624268, + "logps/chosen": -59.554840087890625, + "logps/rejected": -116.4613265991211, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.461890459060669, + "rewards/margins": 26.002004623413086, + "rewards/rejected": -29.46389389038086, + "step": 5540 + }, + { + "epoch": 2.53, + "learning_rate": 5.17503805175038e-08, + "logits/chosen": -1.647017478942871, + "logits/rejected": -1.570915699005127, + "logps/chosen": -59.53449249267578, + "logps/rejected": -115.95018005371094, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.120753049850464, + "rewards/margins": 26.011632919311523, + "rewards/rejected": -29.13239097595215, + "step": 5550 + }, + { + "epoch": 2.54, + "learning_rate": 5.1243023845763564e-08, + "logits/chosen": -1.635332465171814, + "logits/rejected": -1.5419440269470215, + "logps/chosen": -62.53186798095703, + "logps/rejected": -117.53709411621094, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5881824493408203, + "rewards/margins": 25.784774780273438, + "rewards/rejected": -29.372955322265625, + "step": 5560 + }, + { + "epoch": 2.54, + "learning_rate": 5.073566717402333e-08, + "logits/chosen": -1.6254523992538452, + "logits/rejected": -1.5545673370361328, + "logps/chosen": -58.76667404174805, + "logps/rejected": -119.00218200683594, + "loss": 0.0076, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.654752731323242, + "rewards/margins": 26.570575714111328, + "rewards/rejected": -30.225322723388672, + "step": 5570 + }, + { + "epoch": 2.55, + "learning_rate": 5.02283105022831e-08, + "logits/chosen": -1.6368452310562134, + "logits/rejected": -1.53933584690094, + "logps/chosen": -65.74815368652344, + "logps/rejected": -116.06086730957031, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.214093208312988, + "rewards/margins": 25.33750343322754, + "rewards/rejected": -29.55159568786621, + "step": 5580 + }, + { + "epoch": 2.55, + "learning_rate": 4.9720953830542864e-08, + "logits/chosen": -1.6262636184692383, + "logits/rejected": -1.5578850507736206, + "logps/chosen": -62.19173049926758, + "logps/rejected": -117.39012145996094, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.056756496429443, + "rewards/margins": 25.51288414001465, + "rewards/rejected": -29.56964111328125, + "step": 5590 + }, + { + "epoch": 2.56, + "learning_rate": 4.921359715880263e-08, + "logits/chosen": -1.6535956859588623, + "logits/rejected": -1.5675952434539795, + "logps/chosen": -60.3506965637207, + "logps/rejected": -115.2103271484375, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7057414054870605, + "rewards/margins": 24.840530395507812, + "rewards/rejected": -28.546274185180664, + "step": 5600 + }, + { + "epoch": 2.56, + "eval_logits/chosen": -1.4299277067184448, + "eval_logits/rejected": -1.3484517335891724, + "eval_logps/chosen": -91.37982177734375, + "eval_logps/rejected": -119.24150085449219, + "eval_loss": 0.013339003548026085, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": -7.698925495147705, + "eval_rewards/margins": 21.969703674316406, + "eval_rewards/rejected": -29.668628692626953, + "eval_runtime": 149.7582, + "eval_samples_per_second": 19.111, + "eval_steps_per_second": 1.195, + "step": 5600 + }, + { + "epoch": 2.56, + "learning_rate": 4.87062404870624e-08, + "logits/chosen": -1.6384315490722656, + "logits/rejected": -1.5487849712371826, + "logps/chosen": -63.22198486328125, + "logps/rejected": -117.51546478271484, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.881399154663086, + "rewards/margins": 26.307037353515625, + "rewards/rejected": -30.18843650817871, + "step": 5610 + }, + { + "epoch": 2.57, + "learning_rate": 4.8198883815322164e-08, + "logits/chosen": -1.6279752254486084, + "logits/rejected": -1.5550925731658936, + "logps/chosen": -62.881736755371094, + "logps/rejected": -119.9935302734375, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.314352989196777, + "rewards/margins": 25.769826889038086, + "rewards/rejected": -30.084178924560547, + "step": 5620 + }, + { + "epoch": 2.57, + "learning_rate": 4.769152714358193e-08, + "logits/chosen": -1.6415107250213623, + "logits/rejected": -1.5622230768203735, + "logps/chosen": -60.204978942871094, + "logps/rejected": -118.11802673339844, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6665031909942627, + "rewards/margins": 26.238367080688477, + "rewards/rejected": -29.904865264892578, + "step": 5630 + }, + { + "epoch": 2.57, + "learning_rate": 4.71841704718417e-08, + "logits/chosen": -1.624925971031189, + "logits/rejected": -1.5330023765563965, + "logps/chosen": -58.52467727661133, + "logps/rejected": -111.4828872680664, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.563800096511841, + "rewards/margins": 24.683853149414062, + "rewards/rejected": -28.24765396118164, + "step": 5640 + }, + { + "epoch": 2.58, + "learning_rate": 4.6676813800101464e-08, + "logits/chosen": -1.630906343460083, + "logits/rejected": -1.5402790307998657, + "logps/chosen": -60.6591682434082, + "logps/rejected": -115.6097183227539, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.623584032058716, + "rewards/margins": 26.047557830810547, + "rewards/rejected": -29.671142578125, + "step": 5650 + }, + { + "epoch": 2.58, + "learning_rate": 4.616945712836123e-08, + "logits/chosen": -1.6545413732528687, + "logits/rejected": -1.5670019388198853, + "logps/chosen": -63.90033721923828, + "logps/rejected": -112.67887878417969, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.858156681060791, + "rewards/margins": 24.520545959472656, + "rewards/rejected": -28.378704071044922, + "step": 5660 + }, + { + "epoch": 2.59, + "learning_rate": 4.5662100456621e-08, + "logits/chosen": -1.6373335123062134, + "logits/rejected": -1.5495936870574951, + "logps/chosen": -60.8790397644043, + "logps/rejected": -119.51078796386719, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7924132347106934, + "rewards/margins": 27.08707618713379, + "rewards/rejected": -30.87948989868164, + "step": 5670 + }, + { + "epoch": 2.59, + "learning_rate": 4.5154743784880764e-08, + "logits/chosen": -1.6513662338256836, + "logits/rejected": -1.583567500114441, + "logps/chosen": -59.11711883544922, + "logps/rejected": -120.10084533691406, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.343040943145752, + "rewards/margins": 26.122615814208984, + "rewards/rejected": -29.46565818786621, + "step": 5680 + }, + { + "epoch": 2.6, + "learning_rate": 4.464738711314053e-08, + "logits/chosen": -1.6662410497665405, + "logits/rejected": -1.5812619924545288, + "logps/chosen": -61.8986930847168, + "logps/rejected": -110.57454681396484, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.756959915161133, + "rewards/margins": 23.829944610595703, + "rewards/rejected": -27.586902618408203, + "step": 5690 + }, + { + "epoch": 2.6, + "learning_rate": 4.41400304414003e-08, + "logits/chosen": -1.6431344747543335, + "logits/rejected": -1.5499104261398315, + "logps/chosen": -62.566795349121094, + "logps/rejected": -118.3983383178711, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.171024799346924, + "rewards/margins": 27.057300567626953, + "rewards/rejected": -30.22832679748535, + "step": 5700 + }, + { + "epoch": 2.6, + "eval_logits/chosen": -1.4365618228912354, + "eval_logits/rejected": -1.3552583456039429, + "eval_logps/chosen": -89.70028686523438, + "eval_logps/rejected": -117.83853149414062, + "eval_loss": 0.009456491097807884, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": -6.859156131744385, + "eval_rewards/margins": 22.108001708984375, + "eval_rewards/rejected": -28.9671573638916, + "eval_runtime": 162.7705, + "eval_samples_per_second": 17.583, + "eval_steps_per_second": 1.1, + "step": 5700 + }, + { + "epoch": 2.61, + "learning_rate": 4.3632673769660064e-08, + "logits/chosen": -1.6342830657958984, + "logits/rejected": -1.5472590923309326, + "logps/chosen": -63.207054138183594, + "logps/rejected": -120.1373291015625, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.573979616165161, + "rewards/margins": 25.88643455505371, + "rewards/rejected": -29.46041488647461, + "step": 5710 + }, + { + "epoch": 2.61, + "learning_rate": 4.312531709791983e-08, + "logits/chosen": -1.681544303894043, + "logits/rejected": -1.6141388416290283, + "logps/chosen": -63.15888214111328, + "logps/rejected": -116.94087982177734, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5882973670959473, + "rewards/margins": 24.538061141967773, + "rewards/rejected": -28.126361846923828, + "step": 5720 + }, + { + "epoch": 2.62, + "learning_rate": 4.26179604261796e-08, + "logits/chosen": -1.655426263809204, + "logits/rejected": -1.5624696016311646, + "logps/chosen": -56.862220764160156, + "logps/rejected": -109.0649185180664, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.455822467803955, + "rewards/margins": 25.6435489654541, + "rewards/rejected": -28.0993709564209, + "step": 5730 + }, + { + "epoch": 2.62, + "learning_rate": 4.2110603754439363e-08, + "logits/chosen": -1.6815840005874634, + "logits/rejected": -1.5926878452301025, + "logps/chosen": -59.546897888183594, + "logps/rejected": -115.58638763427734, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8910863399505615, + "rewards/margins": 25.475738525390625, + "rewards/rejected": -28.3668270111084, + "step": 5740 + }, + { + "epoch": 2.62, + "learning_rate": 4.160324708269913e-08, + "logits/chosen": -1.6645225286483765, + "logits/rejected": -1.5795118808746338, + "logps/chosen": -60.3840446472168, + "logps/rejected": -118.16209411621094, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7638068199157715, + "rewards/margins": 26.152246475219727, + "rewards/rejected": -28.91605567932129, + "step": 5750 + }, + { + "epoch": 2.63, + "learning_rate": 4.10958904109589e-08, + "logits/chosen": -1.6419804096221924, + "logits/rejected": -1.5658048391342163, + "logps/chosen": -55.76360321044922, + "logps/rejected": -111.81380462646484, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.067939043045044, + "rewards/margins": 24.284969329833984, + "rewards/rejected": -27.352909088134766, + "step": 5760 + }, + { + "epoch": 2.63, + "learning_rate": 4.0588533739218663e-08, + "logits/chosen": -1.6323308944702148, + "logits/rejected": -1.5415849685668945, + "logps/chosen": -57.9756965637207, + "logps/rejected": -110.98262786865234, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.81278920173645, + "rewards/margins": 25.500110626220703, + "rewards/rejected": -28.312902450561523, + "step": 5770 + }, + { + "epoch": 2.64, + "learning_rate": 4.0081177067478437e-08, + "logits/chosen": -1.6647393703460693, + "logits/rejected": -1.5759618282318115, + "logps/chosen": -62.323204040527344, + "logps/rejected": -111.48100280761719, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.236069917678833, + "rewards/margins": 24.6293888092041, + "rewards/rejected": -27.86545753479004, + "step": 5780 + }, + { + "epoch": 2.64, + "learning_rate": 3.95738203957382e-08, + "logits/chosen": -1.6411035060882568, + "logits/rejected": -1.5813742876052856, + "logps/chosen": -60.639862060546875, + "logps/rejected": -116.2388916015625, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.28511381149292, + "rewards/margins": 24.081039428710938, + "rewards/rejected": -28.36614990234375, + "step": 5790 + }, + { + "epoch": 2.65, + "learning_rate": 3.906646372399797e-08, + "logits/chosen": -1.678734540939331, + "logits/rejected": -1.5850093364715576, + "logps/chosen": -61.819984436035156, + "logps/rejected": -111.3331527709961, + "loss": 0.0054, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.188535213470459, + "rewards/margins": 24.102962493896484, + "rewards/rejected": -27.291500091552734, + "step": 5800 + }, + { + "epoch": 2.65, + "eval_logits/chosen": -1.445048213005066, + "eval_logits/rejected": -1.3636606931686401, + "eval_logps/chosen": -88.80926513671875, + "eval_logps/rejected": -116.7530746459961, + "eval_loss": 0.00770636135712266, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": -6.413644313812256, + "eval_rewards/margins": 22.010774612426758, + "eval_rewards/rejected": -28.424423217773438, + "eval_runtime": 172.6366, + "eval_samples_per_second": 16.578, + "eval_steps_per_second": 1.037, + "step": 5800 + }, + { + "epoch": 2.65, + "learning_rate": 3.8559107052257736e-08, + "logits/chosen": -1.634439468383789, + "logits/rejected": -1.5573443174362183, + "logps/chosen": -57.23991775512695, + "logps/rejected": -114.6131591796875, + "loss": 0.0033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.1852035522460938, + "rewards/margins": 25.9045352935791, + "rewards/rejected": -29.089736938476562, + "step": 5810 + }, + { + "epoch": 2.66, + "learning_rate": 3.80517503805175e-08, + "logits/chosen": -1.6653354167938232, + "logits/rejected": -1.5760657787322998, + "logps/chosen": -63.47956466674805, + "logps/rejected": -118.31854248046875, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.382948637008667, + "rewards/margins": 25.50372886657715, + "rewards/rejected": -28.886676788330078, + "step": 5820 + }, + { + "epoch": 2.66, + "learning_rate": 3.754439370877727e-08, + "logits/chosen": -1.6479460000991821, + "logits/rejected": -1.5641560554504395, + "logps/chosen": -61.220802307128906, + "logps/rejected": -117.40779876708984, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3633739948272705, + "rewards/margins": 25.617687225341797, + "rewards/rejected": -28.981060028076172, + "step": 5830 + }, + { + "epoch": 2.67, + "learning_rate": 3.7037037037037036e-08, + "logits/chosen": -1.6705583333969116, + "logits/rejected": -1.5847426652908325, + "logps/chosen": -58.475196838378906, + "logps/rejected": -117.71073913574219, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.079245090484619, + "rewards/margins": 26.204748153686523, + "rewards/rejected": -29.28399658203125, + "step": 5840 + }, + { + "epoch": 2.67, + "learning_rate": 3.65296803652968e-08, + "logits/chosen": -1.6485599279403687, + "logits/rejected": -1.56297767162323, + "logps/chosen": -60.04380416870117, + "logps/rejected": -109.9533920288086, + "loss": 0.0055, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.4925427436828613, + "rewards/margins": 23.410324096679688, + "rewards/rejected": -26.902868270874023, + "step": 5850 + }, + { + "epoch": 2.67, + "learning_rate": 3.602232369355657e-08, + "logits/chosen": -1.6266456842422485, + "logits/rejected": -1.5563054084777832, + "logps/chosen": -59.59053421020508, + "logps/rejected": -114.15860748291016, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7324161529541016, + "rewards/margins": 25.718669891357422, + "rewards/rejected": -29.45108985900879, + "step": 5860 + }, + { + "epoch": 2.68, + "learning_rate": 3.5514967021816336e-08, + "logits/chosen": -1.6728761196136475, + "logits/rejected": -1.5769294500350952, + "logps/chosen": -62.21794509887695, + "logps/rejected": -121.44730377197266, + "loss": 0.0033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.6361191272735596, + "rewards/margins": 26.636241912841797, + "rewards/rejected": -30.27235984802246, + "step": 5870 + }, + { + "epoch": 2.68, + "learning_rate": 3.50076103500761e-08, + "logits/chosen": -1.6521060466766357, + "logits/rejected": -1.5682947635650635, + "logps/chosen": -64.18666076660156, + "logps/rejected": -119.81550598144531, + "loss": 0.0044, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.000080585479736, + "rewards/margins": 25.79050064086914, + "rewards/rejected": -29.790578842163086, + "step": 5880 + }, + { + "epoch": 2.69, + "learning_rate": 3.450025367833587e-08, + "logits/chosen": -1.6358667612075806, + "logits/rejected": -1.5582091808319092, + "logps/chosen": -63.92961502075195, + "logps/rejected": -118.2952880859375, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9304556846618652, + "rewards/margins": 26.11741065979004, + "rewards/rejected": -30.047870635986328, + "step": 5890 + }, + { + "epoch": 2.69, + "learning_rate": 3.3992897006595636e-08, + "logits/chosen": -1.6305748224258423, + "logits/rejected": -1.5501872301101685, + "logps/chosen": -62.05702590942383, + "logps/rejected": -121.21634674072266, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6642932891845703, + "rewards/margins": 26.954849243164062, + "rewards/rejected": -30.6191463470459, + "step": 5900 + }, + { + "epoch": 2.69, + "eval_logits/chosen": -1.4208369255065918, + "eval_logits/rejected": -1.3400059938430786, + "eval_logps/chosen": -91.27997589111328, + "eval_logps/rejected": -120.20849609375, + "eval_loss": 0.011545187793672085, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": -7.648996829986572, + "eval_rewards/margins": 22.503141403198242, + "eval_rewards/rejected": -30.152135848999023, + "eval_runtime": 161.2749, + "eval_samples_per_second": 17.746, + "eval_steps_per_second": 1.11, + "step": 5900 + }, + { + "epoch": 2.7, + "learning_rate": 3.34855403348554e-08, + "logits/chosen": -1.6295617818832397, + "logits/rejected": -1.5396859645843506, + "logps/chosen": -68.30640411376953, + "logps/rejected": -116.1508560180664, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.036105155944824, + "rewards/margins": 25.045873641967773, + "rewards/rejected": -29.081979751586914, + "step": 5910 + }, + { + "epoch": 2.7, + "learning_rate": 3.297818366311517e-08, + "logits/chosen": -1.6219854354858398, + "logits/rejected": -1.55465829372406, + "logps/chosen": -62.98735427856445, + "logps/rejected": -118.68704986572266, + "loss": 0.0043, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.5867815017700195, + "rewards/margins": 25.48343849182129, + "rewards/rejected": -30.07021713256836, + "step": 5920 + }, + { + "epoch": 2.71, + "learning_rate": 3.2470826991374936e-08, + "logits/chosen": -1.637002944946289, + "logits/rejected": -1.551239252090454, + "logps/chosen": -65.75910949707031, + "logps/rejected": -118.2501449584961, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.5694169998168945, + "rewards/margins": 25.793041229248047, + "rewards/rejected": -30.362462997436523, + "step": 5930 + }, + { + "epoch": 2.71, + "learning_rate": 3.19634703196347e-08, + "logits/chosen": -1.6292927265167236, + "logits/rejected": -1.5572412014007568, + "logps/chosen": -62.1704216003418, + "logps/rejected": -121.44587707519531, + "loss": 0.0011, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.114914894104004, + "rewards/margins": 26.280200958251953, + "rewards/rejected": -30.395116806030273, + "step": 5940 + }, + { + "epoch": 2.72, + "learning_rate": 3.145611364789447e-08, + "logits/chosen": -1.6549896001815796, + "logits/rejected": -1.567518949508667, + "logps/chosen": -62.098243713378906, + "logps/rejected": -116.75553131103516, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8145833015441895, + "rewards/margins": 25.686513900756836, + "rewards/rejected": -29.5010986328125, + "step": 5950 + }, + { + "epoch": 2.72, + "learning_rate": 3.0948756976154236e-08, + "logits/chosen": -1.6517982482910156, + "logits/rejected": -1.5621912479400635, + "logps/chosen": -61.54521560668945, + "logps/rejected": -115.05535888671875, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3965892791748047, + "rewards/margins": 25.722614288330078, + "rewards/rejected": -29.11920166015625, + "step": 5960 + }, + { + "epoch": 2.73, + "learning_rate": 3.0441400304414e-08, + "logits/chosen": -1.6520181894302368, + "logits/rejected": -1.576278805732727, + "logps/chosen": -63.23960494995117, + "logps/rejected": -118.0584487915039, + "loss": 0.0022, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.735790967941284, + "rewards/margins": 25.305919647216797, + "rewards/rejected": -29.04170799255371, + "step": 5970 + }, + { + "epoch": 2.73, + "learning_rate": 2.993404363267377e-08, + "logits/chosen": -1.630038857460022, + "logits/rejected": -1.5673866271972656, + "logps/chosen": -54.789329528808594, + "logps/rejected": -119.5348129272461, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0025506019592285, + "rewards/margins": 27.198843002319336, + "rewards/rejected": -30.20139503479004, + "step": 5980 + }, + { + "epoch": 2.73, + "learning_rate": 2.9426686960933532e-08, + "logits/chosen": -1.6585489511489868, + "logits/rejected": -1.5535162687301636, + "logps/chosen": -66.03738403320312, + "logps/rejected": -121.28465270996094, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.421753406524658, + "rewards/margins": 26.9661922454834, + "rewards/rejected": -30.3879451751709, + "step": 5990 + }, + { + "epoch": 2.74, + "learning_rate": 2.89193302891933e-08, + "logits/chosen": -1.670650839805603, + "logits/rejected": -1.5895731449127197, + "logps/chosen": -65.10139465332031, + "logps/rejected": -117.02482604980469, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6346237659454346, + "rewards/margins": 25.722705841064453, + "rewards/rejected": -29.35733413696289, + "step": 6000 + }, + { + "epoch": 2.74, + "eval_logits/chosen": -1.4316580295562744, + "eval_logits/rejected": -1.3509966135025024, + "eval_logps/chosen": -89.68942260742188, + "eval_logps/rejected": -118.18567657470703, + "eval_loss": 0.008585677482187748, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": -6.853721618652344, + "eval_rewards/margins": 22.28700065612793, + "eval_rewards/rejected": -29.140722274780273, + "eval_runtime": 143.5018, + "eval_samples_per_second": 19.944, + "eval_steps_per_second": 1.247, + "step": 6000 + }, + { + "epoch": 2.74, + "learning_rate": 2.8411973617453066e-08, + "logits/chosen": -1.6392771005630493, + "logits/rejected": -1.5661704540252686, + "logps/chosen": -59.24951171875, + "logps/rejected": -116.7738265991211, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.125824213027954, + "rewards/margins": 26.19602394104004, + "rewards/rejected": -29.321847915649414, + "step": 6010 + }, + { + "epoch": 2.75, + "learning_rate": 2.7904616945712832e-08, + "logits/chosen": -1.6743838787078857, + "logits/rejected": -1.567811369895935, + "logps/chosen": -65.45565795898438, + "logps/rejected": -115.02339172363281, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.311161994934082, + "rewards/margins": 24.980754852294922, + "rewards/rejected": -29.291919708251953, + "step": 6020 + }, + { + "epoch": 2.75, + "learning_rate": 2.73972602739726e-08, + "logits/chosen": -1.6575504541397095, + "logits/rejected": -1.5639588832855225, + "logps/chosen": -61.6038818359375, + "logps/rejected": -113.43119812011719, + "loss": 0.0033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.067666530609131, + "rewards/margins": 25.01435661315918, + "rewards/rejected": -29.0820255279541, + "step": 6030 + }, + { + "epoch": 2.76, + "learning_rate": 2.6889903602232366e-08, + "logits/chosen": -1.6164219379425049, + "logits/rejected": -1.5363751649856567, + "logps/chosen": -62.669212341308594, + "logps/rejected": -122.5501480102539, + "loss": 0.0043, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.584050416946411, + "rewards/margins": 26.209972381591797, + "rewards/rejected": -29.794025421142578, + "step": 6040 + }, + { + "epoch": 2.76, + "learning_rate": 2.6382546930492132e-08, + "logits/chosen": -1.6212742328643799, + "logits/rejected": -1.531227946281433, + "logps/chosen": -63.161460876464844, + "logps/rejected": -114.44987487792969, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7137718200683594, + "rewards/margins": 25.035747528076172, + "rewards/rejected": -28.7495174407959, + "step": 6050 + }, + { + "epoch": 2.77, + "learning_rate": 2.58751902587519e-08, + "logits/chosen": -1.6605415344238281, + "logits/rejected": -1.5648448467254639, + "logps/chosen": -61.281028747558594, + "logps/rejected": -115.41572570800781, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.519256591796875, + "rewards/margins": 25.869674682617188, + "rewards/rejected": -29.388927459716797, + "step": 6060 + }, + { + "epoch": 2.77, + "learning_rate": 2.5367833587011665e-08, + "logits/chosen": -1.6297121047973633, + "logits/rejected": -1.5470812320709229, + "logps/chosen": -64.13361358642578, + "logps/rejected": -118.23075103759766, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.01121187210083, + "rewards/margins": 26.0483341217041, + "rewards/rejected": -30.059545516967773, + "step": 6070 + }, + { + "epoch": 2.78, + "learning_rate": 2.4860476915271432e-08, + "logits/chosen": -1.6354061365127563, + "logits/rejected": -1.5343222618103027, + "logps/chosen": -64.9552230834961, + "logps/rejected": -117.0967788696289, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.105961561203003, + "rewards/margins": 26.618520736694336, + "rewards/rejected": -29.724477767944336, + "step": 6080 + }, + { + "epoch": 2.78, + "learning_rate": 2.43531202435312e-08, + "logits/chosen": -1.6334095001220703, + "logits/rejected": -1.559619665145874, + "logps/chosen": -55.2518196105957, + "logps/rejected": -114.39505767822266, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2193074226379395, + "rewards/margins": 25.336313247680664, + "rewards/rejected": -28.555622100830078, + "step": 6090 + }, + { + "epoch": 2.78, + "learning_rate": 2.3845763571790965e-08, + "logits/chosen": -1.6395479440689087, + "logits/rejected": -1.560088872909546, + "logps/chosen": -62.160560607910156, + "logps/rejected": -113.92082214355469, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.746182918548584, + "rewards/margins": 24.4161434173584, + "rewards/rejected": -28.162328720092773, + "step": 6100 + }, + { + "epoch": 2.78, + "eval_logits/chosen": -1.4256998300552368, + "eval_logits/rejected": -1.345211386680603, + "eval_logps/chosen": -90.22210693359375, + "eval_logps/rejected": -119.16900634765625, + "eval_loss": 0.009460356086492538, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": -7.120062828063965, + "eval_rewards/margins": 22.5123233795166, + "eval_rewards/rejected": -29.632389068603516, + "eval_runtime": 175.1008, + "eval_samples_per_second": 16.345, + "eval_steps_per_second": 1.022, + "step": 6100 + }, + { + "epoch": 2.79, + "learning_rate": 2.3338406900050732e-08, + "logits/chosen": -1.6284040212631226, + "logits/rejected": -1.541839361190796, + "logps/chosen": -60.50825119018555, + "logps/rejected": -111.8641128540039, + "loss": 0.0043, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.3182640075683594, + "rewards/margins": 24.865325927734375, + "rewards/rejected": -28.1835880279541, + "step": 6110 + }, + { + "epoch": 2.79, + "learning_rate": 2.28310502283105e-08, + "logits/chosen": -1.6222755908966064, + "logits/rejected": -1.5430351495742798, + "logps/chosen": -62.3880729675293, + "logps/rejected": -120.25767517089844, + "loss": 0.0022, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.532604694366455, + "rewards/margins": 26.64389419555664, + "rewards/rejected": -30.176494598388672, + "step": 6120 + }, + { + "epoch": 2.8, + "learning_rate": 2.2323693556570265e-08, + "logits/chosen": -1.6467183828353882, + "logits/rejected": -1.5764975547790527, + "logps/chosen": -58.1132698059082, + "logps/rejected": -115.12342834472656, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.736299991607666, + "rewards/margins": 25.90152359008789, + "rewards/rejected": -29.6378231048584, + "step": 6130 + }, + { + "epoch": 2.8, + "learning_rate": 2.1816336884830032e-08, + "logits/chosen": -1.6268088817596436, + "logits/rejected": -1.5481709241867065, + "logps/chosen": -61.39824295043945, + "logps/rejected": -114.4950942993164, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7331886291503906, + "rewards/margins": 24.886098861694336, + "rewards/rejected": -28.619287490844727, + "step": 6140 + }, + { + "epoch": 2.81, + "learning_rate": 2.13089802130898e-08, + "logits/chosen": -1.6436065435409546, + "logits/rejected": -1.532901406288147, + "logps/chosen": -64.28096008300781, + "logps/rejected": -116.76171875, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1694931983947754, + "rewards/margins": 26.952239990234375, + "rewards/rejected": -30.121734619140625, + "step": 6150 + }, + { + "epoch": 2.81, + "learning_rate": 2.0801623541349565e-08, + "logits/chosen": -1.6422874927520752, + "logits/rejected": -1.574110984802246, + "logps/chosen": -63.52398681640625, + "logps/rejected": -121.7143325805664, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.430600643157959, + "rewards/margins": 25.789005279541016, + "rewards/rejected": -29.219608306884766, + "step": 6160 + }, + { + "epoch": 2.82, + "learning_rate": 2.0294266869609332e-08, + "logits/chosen": -1.656306266784668, + "logits/rejected": -1.5762187242507935, + "logps/chosen": -64.04621887207031, + "logps/rejected": -115.48970031738281, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.133012294769287, + "rewards/margins": 25.261951446533203, + "rewards/rejected": -29.394962310791016, + "step": 6170 + }, + { + "epoch": 2.82, + "learning_rate": 1.97869101978691e-08, + "logits/chosen": -1.622867226600647, + "logits/rejected": -1.537571668624878, + "logps/chosen": -62.96330642700195, + "logps/rejected": -118.26654052734375, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.150825262069702, + "rewards/margins": 26.657018661499023, + "rewards/rejected": -29.807846069335938, + "step": 6180 + }, + { + "epoch": 2.83, + "learning_rate": 1.9279553526128868e-08, + "logits/chosen": -1.657539963722229, + "logits/rejected": -1.5573053359985352, + "logps/chosen": -61.9826774597168, + "logps/rejected": -121.66966247558594, + "loss": 0.0022, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.1908092498779297, + "rewards/margins": 27.3090877532959, + "rewards/rejected": -30.499902725219727, + "step": 6190 + }, + { + "epoch": 2.83, + "learning_rate": 1.8772196854388635e-08, + "logits/chosen": -1.6334965229034424, + "logits/rejected": -1.5571749210357666, + "logps/chosen": -58.16144943237305, + "logps/rejected": -113.46431732177734, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1828455924987793, + "rewards/margins": 24.975894927978516, + "rewards/rejected": -28.158737182617188, + "step": 6200 + }, + { + "epoch": 2.83, + "eval_logits/chosen": -1.4334616661071777, + "eval_logits/rejected": -1.3530946969985962, + "eval_logps/chosen": -89.77032470703125, + "eval_logps/rejected": -118.23873901367188, + "eval_loss": 0.008627377450466156, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": -6.894176006317139, + "eval_rewards/margins": 22.2730770111084, + "eval_rewards/rejected": -29.167253494262695, + "eval_runtime": 167.5336, + "eval_samples_per_second": 17.083, + "eval_steps_per_second": 1.068, + "step": 6200 + }, + { + "epoch": 2.83, + "learning_rate": 1.82648401826484e-08, + "logits/chosen": -1.630913496017456, + "logits/rejected": -1.5411865711212158, + "logps/chosen": -63.88469314575195, + "logps/rejected": -117.57832336425781, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7570133209228516, + "rewards/margins": 26.40604591369629, + "rewards/rejected": -30.16305923461914, + "step": 6210 + }, + { + "epoch": 2.84, + "learning_rate": 1.7757483510908168e-08, + "logits/chosen": -1.659031867980957, + "logits/rejected": -1.5792479515075684, + "logps/chosen": -57.74346923828125, + "logps/rejected": -116.78248596191406, + "loss": 0.0022, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.2035319805145264, + "rewards/margins": 25.624927520751953, + "rewards/rejected": -28.828460693359375, + "step": 6220 + }, + { + "epoch": 2.84, + "learning_rate": 1.7250126839167935e-08, + "logits/chosen": -1.6171165704727173, + "logits/rejected": -1.5536924600601196, + "logps/chosen": -56.93315505981445, + "logps/rejected": -116.37396240234375, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.59100604057312, + "rewards/margins": 25.44740867614746, + "rewards/rejected": -29.03841781616211, + "step": 6230 + }, + { + "epoch": 2.85, + "learning_rate": 1.67427701674277e-08, + "logits/chosen": -1.6382725238800049, + "logits/rejected": -1.5492085218429565, + "logps/chosen": -60.00202178955078, + "logps/rejected": -117.9487075805664, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.313509702682495, + "rewards/margins": 26.29458236694336, + "rewards/rejected": -29.608089447021484, + "step": 6240 + }, + { + "epoch": 2.85, + "learning_rate": 1.6235413495687468e-08, + "logits/chosen": -1.6389650106430054, + "logits/rejected": -1.5683645009994507, + "logps/chosen": -61.87580490112305, + "logps/rejected": -117.61279296875, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.534771680831909, + "rewards/margins": 25.421100616455078, + "rewards/rejected": -28.95587158203125, + "step": 6250 + }, + { + "epoch": 2.86, + "learning_rate": 1.5728056823947235e-08, + "logits/chosen": -1.6355327367782593, + "logits/rejected": -1.5543670654296875, + "logps/chosen": -61.89397048950195, + "logps/rejected": -118.14009857177734, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3398423194885254, + "rewards/margins": 25.748271942138672, + "rewards/rejected": -29.088115692138672, + "step": 6260 + }, + { + "epoch": 2.86, + "learning_rate": 1.5220700152207e-08, + "logits/chosen": -1.6374504566192627, + "logits/rejected": -1.5645629167556763, + "logps/chosen": -57.43635940551758, + "logps/rejected": -117.78645324707031, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0963313579559326, + "rewards/margins": 26.56585693359375, + "rewards/rejected": -29.662189483642578, + "step": 6270 + }, + { + "epoch": 2.87, + "learning_rate": 1.4713343480466766e-08, + "logits/chosen": -1.6323999166488647, + "logits/rejected": -1.5513784885406494, + "logps/chosen": -63.46538162231445, + "logps/rejected": -117.1316909790039, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.105284690856934, + "rewards/margins": 25.079740524291992, + "rewards/rejected": -29.18502426147461, + "step": 6280 + }, + { + "epoch": 2.87, + "learning_rate": 1.4205986808726533e-08, + "logits/chosen": -1.6455732583999634, + "logits/rejected": -1.5496267080307007, + "logps/chosen": -63.40629196166992, + "logps/rejected": -116.4112777709961, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5596485137939453, + "rewards/margins": 26.04227638244629, + "rewards/rejected": -29.601924896240234, + "step": 6290 + }, + { + "epoch": 2.88, + "learning_rate": 1.36986301369863e-08, + "logits/chosen": -1.6346752643585205, + "logits/rejected": -1.5575860738754272, + "logps/chosen": -60.238128662109375, + "logps/rejected": -114.30989837646484, + "loss": 0.0013, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.088661193847656, + "rewards/margins": 24.86785888671875, + "rewards/rejected": -28.956518173217773, + "step": 6300 + }, + { + "epoch": 2.88, + "eval_logits/chosen": -1.4349452257156372, + "eval_logits/rejected": -1.3543336391448975, + "eval_logps/chosen": -89.65511322021484, + "eval_logps/rejected": -117.97098541259766, + "eval_loss": 0.00864337757229805, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": -6.83656644821167, + "eval_rewards/margins": 22.196807861328125, + "eval_rewards/rejected": -29.033374786376953, + "eval_runtime": 154.2565, + "eval_samples_per_second": 18.554, + "eval_steps_per_second": 1.16, + "step": 6300 + }, + { + "epoch": 2.88, + "learning_rate": 1.3191273465246066e-08, + "logits/chosen": -1.637908935546875, + "logits/rejected": -1.5516226291656494, + "logps/chosen": -60.553466796875, + "logps/rejected": -114.1862564086914, + "loss": 0.0022, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.289581775665283, + "rewards/margins": 24.721349716186523, + "rewards/rejected": -28.010934829711914, + "step": 6310 + }, + { + "epoch": 2.88, + "learning_rate": 1.2683916793505833e-08, + "logits/chosen": -1.662811279296875, + "logits/rejected": -1.5722968578338623, + "logps/chosen": -63.365997314453125, + "logps/rejected": -122.7975082397461, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5013763904571533, + "rewards/margins": 27.253204345703125, + "rewards/rejected": -30.754581451416016, + "step": 6320 + }, + { + "epoch": 2.89, + "learning_rate": 1.21765601217656e-08, + "logits/chosen": -1.6395689249038696, + "logits/rejected": -1.5583178997039795, + "logps/chosen": -57.32929611206055, + "logps/rejected": -112.72383117675781, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.334056854248047, + "rewards/margins": 25.49137306213379, + "rewards/rejected": -28.8254337310791, + "step": 6330 + }, + { + "epoch": 2.89, + "learning_rate": 1.1669203450025366e-08, + "logits/chosen": -1.6697086095809937, + "logits/rejected": -1.5900932550430298, + "logps/chosen": -62.0523796081543, + "logps/rejected": -119.71087646484375, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4745171070098877, + "rewards/margins": 25.79860496520996, + "rewards/rejected": -29.273122787475586, + "step": 6340 + }, + { + "epoch": 2.9, + "learning_rate": 1.1161846778285133e-08, + "logits/chosen": -1.6300337314605713, + "logits/rejected": -1.5525095462799072, + "logps/chosen": -61.092987060546875, + "logps/rejected": -115.71488189697266, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.913750410079956, + "rewards/margins": 25.886764526367188, + "rewards/rejected": -29.800512313842773, + "step": 6350 + }, + { + "epoch": 2.9, + "learning_rate": 1.06544901065449e-08, + "logits/chosen": -1.643493413925171, + "logits/rejected": -1.5590834617614746, + "logps/chosen": -60.048858642578125, + "logps/rejected": -115.4676513671875, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.94218111038208, + "rewards/margins": 26.0200138092041, + "rewards/rejected": -28.962194442749023, + "step": 6360 + }, + { + "epoch": 2.91, + "learning_rate": 1.0147133434804666e-08, + "logits/chosen": -1.641506552696228, + "logits/rejected": -1.548807144165039, + "logps/chosen": -63.835044860839844, + "logps/rejected": -116.1264877319336, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6523032188415527, + "rewards/margins": 25.005321502685547, + "rewards/rejected": -28.657623291015625, + "step": 6370 + }, + { + "epoch": 2.91, + "learning_rate": 9.639776763064434e-09, + "logits/chosen": -1.6641569137573242, + "logits/rejected": -1.5705759525299072, + "logps/chosen": -58.545570373535156, + "logps/rejected": -114.12187194824219, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1781678199768066, + "rewards/margins": 26.338916778564453, + "rewards/rejected": -29.5170841217041, + "step": 6380 + }, + { + "epoch": 2.92, + "learning_rate": 9.1324200913242e-09, + "logits/chosen": -1.6400539875030518, + "logits/rejected": -1.5575604438781738, + "logps/chosen": -60.43939208984375, + "logps/rejected": -114.89385986328125, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.559497833251953, + "rewards/margins": 25.163372039794922, + "rewards/rejected": -28.72286605834961, + "step": 6390 + }, + { + "epoch": 2.92, + "learning_rate": 8.625063419583967e-09, + "logits/chosen": -1.6212533712387085, + "logits/rejected": -1.5395368337631226, + "logps/chosen": -59.00303268432617, + "logps/rejected": -113.10612487792969, + "loss": 0.0033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.4267821311950684, + "rewards/margins": 25.679149627685547, + "rewards/rejected": -29.105932235717773, + "step": 6400 + }, + { + "epoch": 2.92, + "eval_logits/chosen": -1.4303452968597412, + "eval_logits/rejected": -1.3493685722351074, + "eval_logps/chosen": -89.9966049194336, + "eval_logps/rejected": -118.4869155883789, + "eval_loss": 0.009599537588655949, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": -7.007308483123779, + "eval_rewards/margins": 22.284032821655273, + "eval_rewards/rejected": -29.291339874267578, + "eval_runtime": 148.4489, + "eval_samples_per_second": 19.279, + "eval_steps_per_second": 1.206, + "step": 6400 + }, + { + "epoch": 2.93, + "learning_rate": 8.117706747843734e-09, + "logits/chosen": -1.6490615606307983, + "logits/rejected": -1.5711501836776733, + "logps/chosen": -61.677642822265625, + "logps/rejected": -119.9886703491211, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.11264705657959, + "rewards/margins": 27.286340713500977, + "rewards/rejected": -30.398983001708984, + "step": 6410 + }, + { + "epoch": 2.93, + "learning_rate": 7.6103500761035e-09, + "logits/chosen": -1.631784439086914, + "logits/rejected": -1.5532324314117432, + "logps/chosen": -59.434417724609375, + "logps/rejected": -115.86344909667969, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2527592182159424, + "rewards/margins": 26.518869400024414, + "rewards/rejected": -29.77162742614746, + "step": 6420 + }, + { + "epoch": 2.94, + "learning_rate": 7.1029934043632664e-09, + "logits/chosen": -1.6500422954559326, + "logits/rejected": -1.5633658170700073, + "logps/chosen": -61.51234817504883, + "logps/rejected": -113.0541763305664, + "loss": 0.0033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.301517963409424, + "rewards/margins": 25.33908462524414, + "rewards/rejected": -28.64060401916504, + "step": 6430 + }, + { + "epoch": 2.94, + "learning_rate": 6.595636732623033e-09, + "logits/chosen": -1.6449449062347412, + "logits/rejected": -1.5771965980529785, + "logps/chosen": -54.834747314453125, + "logps/rejected": -118.65677642822266, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.671351671218872, + "rewards/margins": 26.696313858032227, + "rewards/rejected": -29.367666244506836, + "step": 6440 + }, + { + "epoch": 2.94, + "learning_rate": 6.0882800608828e-09, + "logits/chosen": -1.62326979637146, + "logits/rejected": -1.5562350749969482, + "logps/chosen": -59.994285583496094, + "logps/rejected": -118.3582992553711, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.617910861968994, + "rewards/margins": 25.71333885192871, + "rewards/rejected": -29.331247329711914, + "step": 6450 + }, + { + "epoch": 2.95, + "learning_rate": 5.580923389142566e-09, + "logits/chosen": -1.66298508644104, + "logits/rejected": -1.575870394706726, + "logps/chosen": -58.43715286254883, + "logps/rejected": -119.3067855834961, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6894166469573975, + "rewards/margins": 27.552875518798828, + "rewards/rejected": -30.242290496826172, + "step": 6460 + }, + { + "epoch": 2.95, + "learning_rate": 5.073566717402333e-09, + "logits/chosen": -1.6346296072006226, + "logits/rejected": -1.5523085594177246, + "logps/chosen": -62.90636444091797, + "logps/rejected": -115.53157043457031, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4161829948425293, + "rewards/margins": 24.956478118896484, + "rewards/rejected": -28.372661590576172, + "step": 6470 + }, + { + "epoch": 2.96, + "learning_rate": 4.5662100456621e-09, + "logits/chosen": -1.6323788166046143, + "logits/rejected": -1.5435467958450317, + "logps/chosen": -61.47710418701172, + "logps/rejected": -117.0919418334961, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.35223388671875, + "rewards/margins": 26.055648803710938, + "rewards/rejected": -29.407886505126953, + "step": 6480 + }, + { + "epoch": 2.96, + "learning_rate": 4.058853373921867e-09, + "logits/chosen": -1.6424974203109741, + "logits/rejected": -1.547910451889038, + "logps/chosen": -63.641624450683594, + "logps/rejected": -116.34974670410156, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9447073936462402, + "rewards/margins": 26.343042373657227, + "rewards/rejected": -29.28774642944336, + "step": 6490 + }, + { + "epoch": 2.97, + "learning_rate": 3.5514967021816332e-09, + "logits/chosen": -1.6433073282241821, + "logits/rejected": -1.5683605670928955, + "logps/chosen": -58.81917190551758, + "logps/rejected": -115.32568359375, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.370504856109619, + "rewards/margins": 25.766164779663086, + "rewards/rejected": -29.136669158935547, + "step": 6500 + }, + { + "epoch": 2.97, + "eval_logits/chosen": -1.4297113418579102, + "eval_logits/rejected": -1.3493608236312866, + "eval_logps/chosen": -89.93760681152344, + "eval_logps/rejected": -118.577392578125, + "eval_loss": 0.009225493296980858, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": -6.977811813354492, + "eval_rewards/margins": 22.3587703704834, + "eval_rewards/rejected": -29.336578369140625, + "eval_runtime": 145.0292, + "eval_samples_per_second": 19.734, + "eval_steps_per_second": 1.234, + "step": 6500 + }, + { + "epoch": 2.97, + "learning_rate": 3.0441400304414e-09, + "logits/chosen": -1.641736626625061, + "logits/rejected": -1.5471439361572266, + "logps/chosen": -61.745521545410156, + "logps/rejected": -118.31556701660156, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6029326915740967, + "rewards/margins": 26.154644012451172, + "rewards/rejected": -29.757577896118164, + "step": 6510 + }, + { + "epoch": 2.98, + "learning_rate": 2.5367833587011665e-09, + "logits/chosen": -1.6372482776641846, + "logits/rejected": -1.5472410917282104, + "logps/chosen": -64.36847686767578, + "logps/rejected": -118.65006256103516, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.840372085571289, + "rewards/margins": 26.441211700439453, + "rewards/rejected": -30.28158950805664, + "step": 6520 + }, + { + "epoch": 2.98, + "learning_rate": 2.0294266869609335e-09, + "logits/chosen": -1.622945785522461, + "logits/rejected": -1.5590341091156006, + "logps/chosen": -57.7742919921875, + "logps/rejected": -116.918212890625, + "loss": 0.0044, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.9200615882873535, + "rewards/margins": 24.792682647705078, + "rewards/rejected": -28.712743759155273, + "step": 6530 + }, + { + "epoch": 2.99, + "learning_rate": 1.5220700152207e-09, + "logits/chosen": -1.6107476949691772, + "logits/rejected": -1.5307183265686035, + "logps/chosen": -57.80412673950195, + "logps/rejected": -114.65202331542969, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.344606399536133, + "rewards/margins": 25.22000503540039, + "rewards/rejected": -28.56460952758789, + "step": 6540 + }, + { + "epoch": 2.99, + "learning_rate": 1.0147133434804667e-09, + "logits/chosen": -1.650368332862854, + "logits/rejected": -1.5417366027832031, + "logps/chosen": -62.948486328125, + "logps/rejected": -114.2308578491211, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.576667070388794, + "rewards/margins": 25.144075393676758, + "rewards/rejected": -28.720739364624023, + "step": 6550 + }, + { + "epoch": 2.99, + "learning_rate": 5.073566717402334e-10, + "logits/chosen": -1.6579023599624634, + "logits/rejected": -1.574951410293579, + "logps/chosen": -62.673072814941406, + "logps/rejected": -113.37846374511719, + "loss": 0.0011, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -3.956240177154541, + "rewards/margins": 25.087797164916992, + "rewards/rejected": -29.04403305053711, + "step": 6560 + }, + { + "epoch": 3.0, + "learning_rate": 0.0, + "logits/chosen": -1.6318897008895874, + "logits/rejected": -1.5588319301605225, + "logps/chosen": -58.1288948059082, + "logps/rejected": -115.969482421875, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4750983715057373, + "rewards/margins": 26.056133270263672, + "rewards/rejected": -29.531230926513672, + "step": 6570 + }, + { + "epoch": 3.0, + "step": 6570, + "total_flos": 0.0, + "train_loss": 0.019671504644591626, + "train_runtime": 60672.7973, + "train_samples_per_second": 6.932, + "train_steps_per_second": 0.108 + } + ], + "logging_steps": 10, + "max_steps": 6570, + "num_train_epochs": 3, + "save_steps": 500, + "total_flos": 0.0, + "trial_name": null, + "trial_params": null +}