diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,9104 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 100, + "global_step": 5811, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 8.591065292096219e-10, + "logits/chosen": -2.810119152069092, + "logits/rejected": -2.8539578914642334, + "logps/chosen": -108.88716125488281, + "logps/rejected": -104.7931137084961, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.01, + "learning_rate": 8.59106529209622e-09, + "logits/chosen": -3.0777981281280518, + "logits/rejected": -3.0556678771972656, + "logps/chosen": -324.0378112792969, + "logps/rejected": -248.84950256347656, + "loss": 0.6931, + "rewards/accuracies": 0.4444444477558136, + "rewards/chosen": 0.004365404602140188, + "rewards/margins": 0.002458281349390745, + "rewards/rejected": 0.0019071230199187994, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 1.718213058419244e-08, + "logits/chosen": -3.031554698944092, + "logits/rejected": -2.9927072525024414, + "logps/chosen": -246.7428741455078, + "logps/rejected": -176.8910675048828, + "loss": 0.6913, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.003744876477867365, + "rewards/margins": -0.005839090794324875, + "rewards/rejected": 0.0020942138507962227, + "step": 20 + }, + { + "epoch": 0.02, + "learning_rate": 2.5773195876288656e-08, + "logits/chosen": -3.0333802700042725, + "logits/rejected": -3.027919292449951, + "logps/chosen": -308.7424621582031, + "logps/rejected": -265.59039306640625, + "loss": 0.6904, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.005396988708525896, + "rewards/margins": 0.00396696338430047, + "rewards/rejected": 0.001430025091394782, + "step": 30 + }, + { + "epoch": 0.02, + "learning_rate": 3.436426116838488e-08, + "logits/chosen": -2.9431519508361816, + "logits/rejected": -2.97038197517395, + "logps/chosen": -315.57135009765625, + "logps/rejected": -228.1820068359375, + "loss": 0.682, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.0027233201544731855, + "rewards/margins": 0.021390482783317566, + "rewards/rejected": -0.018667161464691162, + "step": 40 + }, + { + "epoch": 0.03, + "learning_rate": 4.29553264604811e-08, + "logits/chosen": -3.1190497875213623, + "logits/rejected": -3.098569393157959, + "logps/chosen": -262.5951232910156, + "logps/rejected": -206.78384399414062, + "loss": 0.6735, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.01897308975458145, + "rewards/margins": 0.03295915946364403, + "rewards/rejected": -0.013986068777740002, + "step": 50 + }, + { + "epoch": 0.03, + "learning_rate": 5.154639175257731e-08, + "logits/chosen": -3.0057833194732666, + "logits/rejected": -2.970109224319458, + "logps/chosen": -257.2474365234375, + "logps/rejected": -242.32852172851562, + "loss": 0.665, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.03294278308749199, + "rewards/margins": 0.05787893012166023, + "rewards/rejected": -0.024936143308877945, + "step": 60 + }, + { + "epoch": 0.04, + "learning_rate": 6.013745704467354e-08, + "logits/chosen": -3.073664426803589, + "logits/rejected": -3.024251699447632, + "logps/chosen": -308.711669921875, + "logps/rejected": -248.31295776367188, + "loss": 0.6557, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.04562750831246376, + "rewards/margins": 0.09679891914129257, + "rewards/rejected": -0.051171403378248215, + "step": 70 + }, + { + "epoch": 0.04, + "learning_rate": 6.872852233676976e-08, + "logits/chosen": -3.061481475830078, + "logits/rejected": -3.0335938930511475, + "logps/chosen": -289.26007080078125, + "logps/rejected": -253.003173828125, + "loss": 0.6305, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.061396338045597076, + "rewards/margins": 0.12416829913854599, + "rewards/rejected": -0.06277195364236832, + "step": 80 + }, + { + "epoch": 0.05, + "learning_rate": 7.731958762886598e-08, + "logits/chosen": -3.0796360969543457, + "logits/rejected": -3.086453914642334, + "logps/chosen": -309.4039611816406, + "logps/rejected": -254.5652618408203, + "loss": 0.6248, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.048405859619379044, + "rewards/margins": 0.1540774703025818, + "rewards/rejected": -0.10567160695791245, + "step": 90 + }, + { + "epoch": 0.05, + "learning_rate": 8.59106529209622e-08, + "logits/chosen": -2.9399971961975098, + "logits/rejected": -2.9413506984710693, + "logps/chosen": -268.4250793457031, + "logps/rejected": -196.02508544921875, + "loss": 0.6144, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.03557150438427925, + "rewards/margins": 0.18206295371055603, + "rewards/rejected": -0.14649145305156708, + "step": 100 + }, + { + "epoch": 0.05, + "eval_logits/chosen": -3.018644332885742, + "eval_logits/rejected": -3.0044751167297363, + "eval_logps/chosen": -270.5843200683594, + "eval_logps/rejected": -230.69760131835938, + "eval_loss": 0.5937883257865906, + "eval_rewards/accuracies": 0.722000002861023, + "eval_rewards/chosen": 0.05668351799249649, + "eval_rewards/margins": 0.2780429720878601, + "eval_rewards/rejected": -0.22135944664478302, + "eval_runtime": 299.8772, + "eval_samples_per_second": 6.669, + "eval_steps_per_second": 0.417, + "step": 100 + }, + { + "epoch": 0.06, + "learning_rate": 9.450171821305841e-08, + "logits/chosen": -3.010746717453003, + "logits/rejected": -2.995668888092041, + "logps/chosen": -268.6666259765625, + "logps/rejected": -245.5867156982422, + "loss": 0.6057, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.011533960700035095, + "rewards/margins": 0.28680604696273804, + "rewards/rejected": -0.2752721309661865, + "step": 110 + }, + { + "epoch": 0.06, + "learning_rate": 1.0309278350515462e-07, + "logits/chosen": -3.0465500354766846, + "logits/rejected": -3.0174379348754883, + "logps/chosen": -226.70651245117188, + "logps/rejected": -212.8759002685547, + "loss": 0.5731, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.026695668697357178, + "rewards/margins": 0.24723270535469055, + "rewards/rejected": -0.2739284038543701, + "step": 120 + }, + { + "epoch": 0.07, + "learning_rate": 1.1168384879725086e-07, + "logits/chosen": -3.089245319366455, + "logits/rejected": -3.1066346168518066, + "logps/chosen": -315.1956481933594, + "logps/rejected": -240.29312133789062, + "loss": 0.5804, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.13610997796058655, + "rewards/margins": 0.4461982846260071, + "rewards/rejected": -0.31008821725845337, + "step": 130 + }, + { + "epoch": 0.07, + "learning_rate": 1.202749140893471e-07, + "logits/chosen": -3.0325512886047363, + "logits/rejected": -2.9741909503936768, + "logps/chosen": -294.88067626953125, + "logps/rejected": -272.0750427246094, + "loss": 0.5569, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.009326432831585407, + "rewards/margins": 0.5915461182594299, + "rewards/rejected": -0.5822197198867798, + "step": 140 + }, + { + "epoch": 0.08, + "learning_rate": 1.2886597938144328e-07, + "logits/chosen": -3.084364414215088, + "logits/rejected": -3.02875018119812, + "logps/chosen": -287.40118408203125, + "logps/rejected": -246.5475616455078, + "loss": 0.5471, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.1425856351852417, + "rewards/margins": 0.7196646928787231, + "rewards/rejected": -0.5770790576934814, + "step": 150 + }, + { + "epoch": 0.08, + "learning_rate": 1.3745704467353952e-07, + "logits/chosen": -2.9751551151275635, + "logits/rejected": -2.9700520038604736, + "logps/chosen": -297.21075439453125, + "logps/rejected": -244.6047821044922, + "loss": 0.5369, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.10728351771831512, + "rewards/margins": 0.6044571399688721, + "rewards/rejected": -0.49717360734939575, + "step": 160 + }, + { + "epoch": 0.09, + "learning_rate": 1.4604810996563573e-07, + "logits/chosen": -3.058936595916748, + "logits/rejected": -3.021108627319336, + "logps/chosen": -282.1290283203125, + "logps/rejected": -237.5272979736328, + "loss": 0.4839, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.15872260928153992, + "rewards/margins": 0.9850108027458191, + "rewards/rejected": -0.826288104057312, + "step": 170 + }, + { + "epoch": 0.09, + "learning_rate": 1.5463917525773197e-07, + "logits/chosen": -3.039640426635742, + "logits/rejected": -2.9874491691589355, + "logps/chosen": -234.4727325439453, + "logps/rejected": -210.0829620361328, + "loss": 0.508, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05743076652288437, + "rewards/margins": 0.7193694114685059, + "rewards/rejected": -0.6619385480880737, + "step": 180 + }, + { + "epoch": 0.1, + "learning_rate": 1.6323024054982818e-07, + "logits/chosen": -2.985429525375366, + "logits/rejected": -2.9715933799743652, + "logps/chosen": -263.69964599609375, + "logps/rejected": -266.5009765625, + "loss": 0.5669, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.04090666025876999, + "rewards/margins": 0.6870480179786682, + "rewards/rejected": -0.6461412906646729, + "step": 190 + }, + { + "epoch": 0.1, + "learning_rate": 1.718213058419244e-07, + "logits/chosen": -2.942009449005127, + "logits/rejected": -2.9293673038482666, + "logps/chosen": -269.390625, + "logps/rejected": -192.42050170898438, + "loss": 0.4957, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.21698591113090515, + "rewards/margins": 0.8756136894226074, + "rewards/rejected": -0.6586278080940247, + "step": 200 + }, + { + "epoch": 0.1, + "eval_logits/chosen": -2.9714457988739014, + "eval_logits/rejected": -2.955587148666382, + "eval_logps/chosen": -270.5447998046875, + "eval_logps/rejected": -235.9661102294922, + "eval_loss": 0.5132176280021667, + "eval_rewards/accuracies": 0.7459999918937683, + "eval_rewards/chosen": 0.060630541294813156, + "eval_rewards/margins": 0.8088454008102417, + "eval_rewards/rejected": -0.7482149004936218, + "eval_runtime": 301.0289, + "eval_samples_per_second": 6.644, + "eval_steps_per_second": 0.415, + "step": 200 + }, + { + "epoch": 0.11, + "learning_rate": 1.804123711340206e-07, + "logits/chosen": -2.9341390132904053, + "logits/rejected": -2.880955696105957, + "logps/chosen": -255.9530029296875, + "logps/rejected": -239.8656768798828, + "loss": 0.5635, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.19516435265541077, + "rewards/margins": 0.5975160598754883, + "rewards/rejected": -0.7926804423332214, + "step": 210 + }, + { + "epoch": 0.11, + "learning_rate": 1.8900343642611682e-07, + "logits/chosen": -3.0241286754608154, + "logits/rejected": -2.9803478717803955, + "logps/chosen": -285.34161376953125, + "logps/rejected": -238.00674438476562, + "loss": 0.5215, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.06667652726173401, + "rewards/margins": 0.7439771890640259, + "rewards/rejected": -0.8106536865234375, + "step": 220 + }, + { + "epoch": 0.12, + "learning_rate": 1.9759450171821303e-07, + "logits/chosen": -3.0200541019439697, + "logits/rejected": -2.9749975204467773, + "logps/chosen": -290.6888427734375, + "logps/rejected": -253.60012817382812, + "loss": 0.5113, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.03626754879951477, + "rewards/margins": 0.693548321723938, + "rewards/rejected": -0.7298158407211304, + "step": 230 + }, + { + "epoch": 0.12, + "learning_rate": 2.0618556701030925e-07, + "logits/chosen": -3.031228542327881, + "logits/rejected": -2.997079610824585, + "logps/chosen": -332.36529541015625, + "logps/rejected": -227.00833129882812, + "loss": 0.5376, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07132077217102051, + "rewards/margins": 0.716058075428009, + "rewards/rejected": -0.7873787879943848, + "step": 240 + }, + { + "epoch": 0.13, + "learning_rate": 2.1477663230240549e-07, + "logits/chosen": -3.1058244705200195, + "logits/rejected": -3.0761656761169434, + "logps/chosen": -272.7492980957031, + "logps/rejected": -253.9503173828125, + "loss": 0.5161, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.07308956235647202, + "rewards/margins": 0.9320653676986694, + "rewards/rejected": -0.8589757680892944, + "step": 250 + }, + { + "epoch": 0.13, + "learning_rate": 2.2336769759450173e-07, + "logits/chosen": -3.094557285308838, + "logits/rejected": -3.033235549926758, + "logps/chosen": -298.0885925292969, + "logps/rejected": -246.3491973876953, + "loss": 0.524, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.17855298519134521, + "rewards/margins": 0.9676022529602051, + "rewards/rejected": -0.7890492081642151, + "step": 260 + }, + { + "epoch": 0.14, + "learning_rate": 2.3195876288659794e-07, + "logits/chosen": -3.0374608039855957, + "logits/rejected": -3.03932523727417, + "logps/chosen": -274.2452697753906, + "logps/rejected": -235.6359100341797, + "loss": 0.4844, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.02701665833592415, + "rewards/margins": 0.7995314002037048, + "rewards/rejected": -0.772514820098877, + "step": 270 + }, + { + "epoch": 0.14, + "learning_rate": 2.405498281786942e-07, + "logits/chosen": -2.99436354637146, + "logits/rejected": -2.9718544483184814, + "logps/chosen": -309.51019287109375, + "logps/rejected": -250.88412475585938, + "loss": 0.5128, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.08803629130125046, + "rewards/margins": 1.0304720401763916, + "rewards/rejected": -0.9424357414245605, + "step": 280 + }, + { + "epoch": 0.15, + "learning_rate": 2.4914089347079036e-07, + "logits/chosen": -2.9639785289764404, + "logits/rejected": -2.948111057281494, + "logps/chosen": -290.60498046875, + "logps/rejected": -282.6380615234375, + "loss": 0.5165, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.08333762735128403, + "rewards/margins": 0.8697144389152527, + "rewards/rejected": -0.7863768339157104, + "step": 290 + }, + { + "epoch": 0.15, + "learning_rate": 2.5773195876288655e-07, + "logits/chosen": -3.0124526023864746, + "logits/rejected": -3.0208213329315186, + "logps/chosen": -260.4809875488281, + "logps/rejected": -248.8876953125, + "loss": 0.5257, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.06663007289171219, + "rewards/margins": 1.012616515159607, + "rewards/rejected": -1.0792466402053833, + "step": 300 + }, + { + "epoch": 0.15, + "eval_logits/chosen": -2.998927116394043, + "eval_logits/rejected": -2.98532772064209, + "eval_logps/chosen": -271.51171875, + "eval_logps/rejected": -238.74551391601562, + "eval_loss": 0.4975211024284363, + "eval_rewards/accuracies": 0.7519999742507935, + "eval_rewards/chosen": -0.03605831041932106, + "eval_rewards/margins": 0.9900941252708435, + "eval_rewards/rejected": -1.026152491569519, + "eval_runtime": 299.8592, + "eval_samples_per_second": 6.67, + "eval_steps_per_second": 0.417, + "step": 300 + }, + { + "epoch": 0.16, + "learning_rate": 2.663230240549828e-07, + "logits/chosen": -3.0704574584960938, + "logits/rejected": -3.070812702178955, + "logps/chosen": -263.6881408691406, + "logps/rejected": -226.4467010498047, + "loss": 0.5101, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.10588334500789642, + "rewards/margins": 0.8894448280334473, + "rewards/rejected": -0.9953282475471497, + "step": 310 + }, + { + "epoch": 0.17, + "learning_rate": 2.7491408934707903e-07, + "logits/chosen": -3.0514838695526123, + "logits/rejected": -3.0334386825561523, + "logps/chosen": -296.7230224609375, + "logps/rejected": -235.6595916748047, + "loss": 0.4537, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.14300963282585144, + "rewards/margins": 1.31166672706604, + "rewards/rejected": -1.4546763896942139, + "step": 320 + }, + { + "epoch": 0.17, + "learning_rate": 2.835051546391752e-07, + "logits/chosen": -3.0409178733825684, + "logits/rejected": -3.0209853649139404, + "logps/chosen": -302.88775634765625, + "logps/rejected": -250.03475952148438, + "loss": 0.4813, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.18107786774635315, + "rewards/margins": 1.172987699508667, + "rewards/rejected": -1.3540656566619873, + "step": 330 + }, + { + "epoch": 0.18, + "learning_rate": 2.9209621993127146e-07, + "logits/chosen": -3.000317335128784, + "logits/rejected": -2.9957940578460693, + "logps/chosen": -285.11346435546875, + "logps/rejected": -246.8708953857422, + "loss": 0.5306, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.17257550358772278, + "rewards/margins": 0.7849529981613159, + "rewards/rejected": -0.9575284719467163, + "step": 340 + }, + { + "epoch": 0.18, + "learning_rate": 3.006872852233677e-07, + "logits/chosen": -3.086730480194092, + "logits/rejected": -3.0460643768310547, + "logps/chosen": -232.62631225585938, + "logps/rejected": -228.4468231201172, + "loss": 0.4653, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.13442695140838623, + "rewards/margins": 1.164433240890503, + "rewards/rejected": -1.2988600730895996, + "step": 350 + }, + { + "epoch": 0.19, + "learning_rate": 3.0927835051546394e-07, + "logits/chosen": -3.075042247772217, + "logits/rejected": -3.080418109893799, + "logps/chosen": -265.05126953125, + "logps/rejected": -218.3927001953125, + "loss": 0.4768, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.11520049721002579, + "rewards/margins": 1.164591908454895, + "rewards/rejected": -1.0493913888931274, + "step": 360 + }, + { + "epoch": 0.19, + "learning_rate": 3.178694158075601e-07, + "logits/chosen": -3.0801005363464355, + "logits/rejected": -3.0352022647857666, + "logps/chosen": -252.08358764648438, + "logps/rejected": -205.0348663330078, + "loss": 0.4763, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.09602154791355133, + "rewards/margins": 1.2616920471191406, + "rewards/rejected": -1.16567063331604, + "step": 370 + }, + { + "epoch": 0.2, + "learning_rate": 3.2646048109965636e-07, + "logits/chosen": -3.02718448638916, + "logits/rejected": -3.018253803253174, + "logps/chosen": -240.01785278320312, + "logps/rejected": -218.6881103515625, + "loss": 0.5578, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.08774475753307343, + "rewards/margins": 0.9482590556144714, + "rewards/rejected": -1.036003828048706, + "step": 380 + }, + { + "epoch": 0.2, + "learning_rate": 3.3505154639175255e-07, + "logits/chosen": -3.1347148418426514, + "logits/rejected": -3.1124050617218018, + "logps/chosen": -255.76754760742188, + "logps/rejected": -214.763427734375, + "loss": 0.4347, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.12095893919467926, + "rewards/margins": 1.100426435470581, + "rewards/rejected": -1.2213853597640991, + "step": 390 + }, + { + "epoch": 0.21, + "learning_rate": 3.436426116838488e-07, + "logits/chosen": -3.159269332885742, + "logits/rejected": -3.1348633766174316, + "logps/chosen": -253.93325805664062, + "logps/rejected": -193.1920928955078, + "loss": 0.556, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3844020664691925, + "rewards/margins": 0.7617012858390808, + "rewards/rejected": -1.1461035013198853, + "step": 400 + }, + { + "epoch": 0.21, + "eval_logits/chosen": -3.0931246280670166, + "eval_logits/rejected": -3.084690809249878, + "eval_logps/chosen": -272.16705322265625, + "eval_logps/rejected": -240.4776153564453, + "eval_loss": 0.4935062527656555, + "eval_rewards/accuracies": 0.7760000228881836, + "eval_rewards/chosen": -0.10159354656934738, + "eval_rewards/margins": 1.0977704524993896, + "eval_rewards/rejected": -1.1993640661239624, + "eval_runtime": 295.6794, + "eval_samples_per_second": 6.764, + "eval_steps_per_second": 0.423, + "step": 400 + }, + { + "epoch": 0.21, + "learning_rate": 3.5223367697594503e-07, + "logits/chosen": -3.0230696201324463, + "logits/rejected": -2.964069128036499, + "logps/chosen": -306.8080749511719, + "logps/rejected": -221.2211151123047, + "loss": 0.4107, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.09494096785783768, + "rewards/margins": 1.1914355754852295, + "rewards/rejected": -1.2863763570785522, + "step": 410 + }, + { + "epoch": 0.22, + "learning_rate": 3.608247422680412e-07, + "logits/chosen": -3.0272717475891113, + "logits/rejected": -2.9719436168670654, + "logps/chosen": -280.74859619140625, + "logps/rejected": -237.41921997070312, + "loss": 0.4879, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.24850067496299744, + "rewards/margins": 1.2745717763900757, + "rewards/rejected": -1.523072361946106, + "step": 420 + }, + { + "epoch": 0.22, + "learning_rate": 3.6941580756013745e-07, + "logits/chosen": -3.0856499671936035, + "logits/rejected": -3.03281831741333, + "logps/chosen": -242.9888916015625, + "logps/rejected": -221.92031860351562, + "loss": 0.4766, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.6125269532203674, + "rewards/margins": 1.3152192831039429, + "rewards/rejected": -1.9277461767196655, + "step": 430 + }, + { + "epoch": 0.23, + "learning_rate": 3.7800687285223364e-07, + "logits/chosen": -3.0084025859832764, + "logits/rejected": -3.060877561569214, + "logps/chosen": -287.5694885253906, + "logps/rejected": -285.35040283203125, + "loss": 0.5522, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.3822043538093567, + "rewards/margins": 1.0044233798980713, + "rewards/rejected": -1.3866277933120728, + "step": 440 + }, + { + "epoch": 0.23, + "learning_rate": 3.865979381443299e-07, + "logits/chosen": -3.0907702445983887, + "logits/rejected": -3.085932970046997, + "logps/chosen": -262.8836975097656, + "logps/rejected": -250.1670379638672, + "loss": 0.4844, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.1354902982711792, + "rewards/margins": 0.8856312036514282, + "rewards/rejected": -1.0211213827133179, + "step": 450 + }, + { + "epoch": 0.24, + "learning_rate": 3.9518900343642607e-07, + "logits/chosen": -3.0575408935546875, + "logits/rejected": -3.0283780097961426, + "logps/chosen": -269.05120849609375, + "logps/rejected": -255.0241241455078, + "loss": 0.5529, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.3627804219722748, + "rewards/margins": 1.0761009454727173, + "rewards/rejected": -1.438881278038025, + "step": 460 + }, + { + "epoch": 0.24, + "learning_rate": 4.037800687285223e-07, + "logits/chosen": -3.123262405395508, + "logits/rejected": -3.0892791748046875, + "logps/chosen": -317.90850830078125, + "logps/rejected": -207.1751708984375, + "loss": 0.4478, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.31438368558883667, + "rewards/margins": 1.2641030550003052, + "rewards/rejected": -1.578486680984497, + "step": 470 + }, + { + "epoch": 0.25, + "learning_rate": 4.123711340206185e-07, + "logits/chosen": -3.19368314743042, + "logits/rejected": -3.1357719898223877, + "logps/chosen": -285.27130126953125, + "logps/rejected": -249.72097778320312, + "loss": 0.482, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.15568087995052338, + "rewards/margins": 1.13084077835083, + "rewards/rejected": -1.2865216732025146, + "step": 480 + }, + { + "epoch": 0.25, + "learning_rate": 4.209621993127148e-07, + "logits/chosen": -2.9999001026153564, + "logits/rejected": -2.989119052886963, + "logps/chosen": -263.2889099121094, + "logps/rejected": -240.6525421142578, + "loss": 0.4095, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3779570460319519, + "rewards/margins": 1.3387038707733154, + "rewards/rejected": -1.7166610956192017, + "step": 490 + }, + { + "epoch": 0.26, + "learning_rate": 4.2955326460481097e-07, + "logits/chosen": -3.199814558029175, + "logits/rejected": -3.170172691345215, + "logps/chosen": -274.55523681640625, + "logps/rejected": -257.8343505859375, + "loss": 0.5409, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5398232936859131, + "rewards/margins": 0.9405809640884399, + "rewards/rejected": -1.4804041385650635, + "step": 500 + }, + { + "epoch": 0.26, + "eval_logits/chosen": -3.076664686203003, + "eval_logits/rejected": -3.0543596744537354, + "eval_logps/chosen": -275.1524963378906, + "eval_logps/rejected": -244.3592071533203, + "eval_loss": 0.4952601194381714, + "eval_rewards/accuracies": 0.777999997138977, + "eval_rewards/chosen": -0.40013551712036133, + "eval_rewards/margins": 1.187387466430664, + "eval_rewards/rejected": -1.5875229835510254, + "eval_runtime": 298.7439, + "eval_samples_per_second": 6.695, + "eval_steps_per_second": 0.418, + "step": 500 + }, + { + "epoch": 0.26, + "learning_rate": 4.381443298969072e-07, + "logits/chosen": -3.0252814292907715, + "logits/rejected": -3.044193744659424, + "logps/chosen": -288.96246337890625, + "logps/rejected": -249.6056671142578, + "loss": 0.5222, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.6331709027290344, + "rewards/margins": 0.7817397117614746, + "rewards/rejected": -1.4149106740951538, + "step": 510 + }, + { + "epoch": 0.27, + "learning_rate": 4.4673539518900345e-07, + "logits/chosen": -2.9890036582946777, + "logits/rejected": -2.975595474243164, + "logps/chosen": -250.694580078125, + "logps/rejected": -223.31900024414062, + "loss": 0.5092, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.5249029397964478, + "rewards/margins": 1.4869499206542969, + "rewards/rejected": -2.011852741241455, + "step": 520 + }, + { + "epoch": 0.27, + "learning_rate": 4.5532646048109964e-07, + "logits/chosen": -3.036130428314209, + "logits/rejected": -3.005101442337036, + "logps/chosen": -279.5271911621094, + "logps/rejected": -233.37759399414062, + "loss": 0.5129, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4594394266605377, + "rewards/margins": 1.0354315042495728, + "rewards/rejected": -1.494870901107788, + "step": 530 + }, + { + "epoch": 0.28, + "learning_rate": 4.639175257731959e-07, + "logits/chosen": -3.071089267730713, + "logits/rejected": -3.0566036701202393, + "logps/chosen": -280.0428161621094, + "logps/rejected": -255.1620635986328, + "loss": 0.5469, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6389147043228149, + "rewards/margins": 0.8219264149665833, + "rewards/rejected": -1.4608410596847534, + "step": 540 + }, + { + "epoch": 0.28, + "learning_rate": 4.7250859106529206e-07, + "logits/chosen": -3.052964687347412, + "logits/rejected": -3.027625560760498, + "logps/chosen": -269.45050048828125, + "logps/rejected": -242.0845489501953, + "loss": 0.5201, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.5871966481208801, + "rewards/margins": 1.219684362411499, + "rewards/rejected": -1.8068811893463135, + "step": 550 + }, + { + "epoch": 0.29, + "learning_rate": 4.810996563573884e-07, + "logits/chosen": -3.0290699005126953, + "logits/rejected": -2.9819164276123047, + "logps/chosen": -308.82049560546875, + "logps/rejected": -262.578369140625, + "loss": 0.5215, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.4544792175292969, + "rewards/margins": 1.2229182720184326, + "rewards/rejected": -1.67739737033844, + "step": 560 + }, + { + "epoch": 0.29, + "learning_rate": 4.896907216494845e-07, + "logits/chosen": -3.095975160598755, + "logits/rejected": -3.038904905319214, + "logps/chosen": -278.1103515625, + "logps/rejected": -263.58197021484375, + "loss": 0.4939, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.7382952570915222, + "rewards/margins": 0.9173868894577026, + "rewards/rejected": -1.6556819677352905, + "step": 570 + }, + { + "epoch": 0.3, + "learning_rate": 4.982817869415807e-07, + "logits/chosen": -3.1059720516204834, + "logits/rejected": -3.0128486156463623, + "logps/chosen": -273.25616455078125, + "logps/rejected": -216.08547973632812, + "loss": 0.545, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.3375282287597656, + "rewards/margins": 1.2573668956756592, + "rewards/rejected": -1.5948951244354248, + "step": 580 + }, + { + "epoch": 0.3, + "learning_rate": 4.992350353796136e-07, + "logits/chosen": -3.0131020545959473, + "logits/rejected": -3.0127346515655518, + "logps/chosen": -247.0398406982422, + "logps/rejected": -247.3001708984375, + "loss": 0.4945, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.33573365211486816, + "rewards/margins": 1.4024873971939087, + "rewards/rejected": -1.7382211685180664, + "step": 590 + }, + { + "epoch": 0.31, + "learning_rate": 4.982788296041308e-07, + "logits/chosen": -3.1070754528045654, + "logits/rejected": -3.0217490196228027, + "logps/chosen": -250.99771118164062, + "logps/rejected": -224.89303588867188, + "loss": 0.5161, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5898759961128235, + "rewards/margins": 1.1816720962524414, + "rewards/rejected": -1.7715480327606201, + "step": 600 + }, + { + "epoch": 0.31, + "eval_logits/chosen": -3.0461089611053467, + "eval_logits/rejected": -3.0234925746917725, + "eval_logps/chosen": -274.2987976074219, + "eval_logps/rejected": -242.63465881347656, + "eval_loss": 0.5194836854934692, + "eval_rewards/accuracies": 0.7419999837875366, + "eval_rewards/chosen": -0.31476885080337524, + "eval_rewards/margins": 1.100299596786499, + "eval_rewards/rejected": -1.4150683879852295, + "eval_runtime": 299.932, + "eval_samples_per_second": 6.668, + "eval_steps_per_second": 0.417, + "step": 600 + }, + { + "epoch": 0.31, + "learning_rate": 4.973226238286479e-07, + "logits/chosen": -3.0412392616271973, + "logits/rejected": -3.001218318939209, + "logps/chosen": -323.3726501464844, + "logps/rejected": -270.83984375, + "loss": 0.5421, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.24728074669837952, + "rewards/margins": 1.267622947692871, + "rewards/rejected": -1.5149036645889282, + "step": 610 + }, + { + "epoch": 0.32, + "learning_rate": 4.96366418053165e-07, + "logits/chosen": -3.121735095977783, + "logits/rejected": -3.0834250450134277, + "logps/chosen": -280.3814392089844, + "logps/rejected": -269.0967712402344, + "loss": 0.5492, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.1741463840007782, + "rewards/margins": 0.844623863697052, + "rewards/rejected": -1.0187702178955078, + "step": 620 + }, + { + "epoch": 0.33, + "learning_rate": 4.954102122776821e-07, + "logits/chosen": -3.1227710247039795, + "logits/rejected": -3.003854513168335, + "logps/chosen": -253.03939819335938, + "logps/rejected": -200.9890899658203, + "loss": 0.4995, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.1886899173259735, + "rewards/margins": 1.2468225955963135, + "rewards/rejected": -1.4355127811431885, + "step": 630 + }, + { + "epoch": 0.33, + "learning_rate": 4.944540065021993e-07, + "logits/chosen": -2.9064483642578125, + "logits/rejected": -2.925783634185791, + "logps/chosen": -238.77737426757812, + "logps/rejected": -206.6617431640625, + "loss": 0.4772, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.456102192401886, + "rewards/margins": 1.4002044200897217, + "rewards/rejected": -1.856306791305542, + "step": 640 + }, + { + "epoch": 0.34, + "learning_rate": 4.934978007267163e-07, + "logits/chosen": -3.025247812271118, + "logits/rejected": -2.9976277351379395, + "logps/chosen": -279.1222839355469, + "logps/rejected": -253.5683135986328, + "loss": 0.8206, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.023301448673009872, + "rewards/margins": 1.3701492547988892, + "rewards/rejected": -1.3934507369995117, + "step": 650 + }, + { + "epoch": 0.34, + "learning_rate": 4.925415949512335e-07, + "logits/chosen": -2.996875524520874, + "logits/rejected": -2.918454170227051, + "logps/chosen": -331.8418884277344, + "logps/rejected": -253.6525115966797, + "loss": 0.5038, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.24123434722423553, + "rewards/margins": 1.4405347108840942, + "rewards/rejected": -1.6817691326141357, + "step": 660 + }, + { + "epoch": 0.35, + "learning_rate": 4.915853891757506e-07, + "logits/chosen": -2.8935065269470215, + "logits/rejected": -2.898484706878662, + "logps/chosen": -197.3555908203125, + "logps/rejected": -241.32046508789062, + "loss": 0.5704, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.24199140071868896, + "rewards/margins": 0.7882445454597473, + "rewards/rejected": -1.030236005783081, + "step": 670 + }, + { + "epoch": 0.35, + "learning_rate": 4.906291834002677e-07, + "logits/chosen": -2.962968349456787, + "logits/rejected": -2.9006264209747314, + "logps/chosen": -281.5236511230469, + "logps/rejected": -256.7275085449219, + "loss": 0.5252, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.608171820640564, + "rewards/margins": 0.9431388974189758, + "rewards/rejected": -1.5513107776641846, + "step": 680 + }, + { + "epoch": 0.36, + "learning_rate": 4.896729776247848e-07, + "logits/chosen": -2.9835548400878906, + "logits/rejected": -2.9527204036712646, + "logps/chosen": -289.92987060546875, + "logps/rejected": -251.1087188720703, + "loss": 0.4655, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.38817405700683594, + "rewards/margins": 1.28240168094635, + "rewards/rejected": -1.6705758571624756, + "step": 690 + }, + { + "epoch": 0.36, + "learning_rate": 4.88716771849302e-07, + "logits/chosen": -3.0054280757904053, + "logits/rejected": -2.911468029022217, + "logps/chosen": -329.260009765625, + "logps/rejected": -274.3800964355469, + "loss": 0.4913, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.265338271856308, + "rewards/margins": 1.6260671615600586, + "rewards/rejected": -1.8914053440093994, + "step": 700 + }, + { + "epoch": 0.36, + "eval_logits/chosen": -2.9585862159729004, + "eval_logits/rejected": -2.9301576614379883, + "eval_logps/chosen": -277.00439453125, + "eval_logps/rejected": -247.15345764160156, + "eval_loss": 0.5227752923965454, + "eval_rewards/accuracies": 0.7799999713897705, + "eval_rewards/chosen": -0.5853266716003418, + "eval_rewards/margins": 1.2816225290298462, + "eval_rewards/rejected": -1.866949200630188, + "eval_runtime": 297.2392, + "eval_samples_per_second": 6.729, + "eval_steps_per_second": 0.421, + "step": 700 + }, + { + "epoch": 0.37, + "learning_rate": 4.87760566073819e-07, + "logits/chosen": -2.9115710258483887, + "logits/rejected": -2.9119551181793213, + "logps/chosen": -302.12396240234375, + "logps/rejected": -242.7657470703125, + "loss": 0.5125, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.7263463139533997, + "rewards/margins": 1.2942359447479248, + "rewards/rejected": -2.020582675933838, + "step": 710 + }, + { + "epoch": 0.37, + "learning_rate": 4.868043602983362e-07, + "logits/chosen": -3.0313706398010254, + "logits/rejected": -2.987967014312744, + "logps/chosen": -302.0757751464844, + "logps/rejected": -299.59466552734375, + "loss": 0.5112, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.8701627850532532, + "rewards/margins": 1.6429879665374756, + "rewards/rejected": -2.513150691986084, + "step": 720 + }, + { + "epoch": 0.38, + "learning_rate": 4.858481545228533e-07, + "logits/chosen": -3.0496833324432373, + "logits/rejected": -2.9578769207000732, + "logps/chosen": -324.0188293457031, + "logps/rejected": -285.79510498046875, + "loss": 0.485, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.7672210931777954, + "rewards/margins": 1.4055955410003662, + "rewards/rejected": -2.172816514968872, + "step": 730 + }, + { + "epoch": 0.38, + "learning_rate": 4.848919487473704e-07, + "logits/chosen": -2.9866185188293457, + "logits/rejected": -2.962374687194824, + "logps/chosen": -300.18084716796875, + "logps/rejected": -295.59954833984375, + "loss": 0.5883, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.8200961947441101, + "rewards/margins": 1.3473241329193115, + "rewards/rejected": -2.1674201488494873, + "step": 740 + }, + { + "epoch": 0.39, + "learning_rate": 4.839357429718875e-07, + "logits/chosen": -3.005702257156372, + "logits/rejected": -3.033315658569336, + "logps/chosen": -272.52850341796875, + "logps/rejected": -243.59927368164062, + "loss": 0.5489, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6764557957649231, + "rewards/margins": 1.2463300228118896, + "rewards/rejected": -1.922785997390747, + "step": 750 + }, + { + "epoch": 0.39, + "learning_rate": 4.829795371964047e-07, + "logits/chosen": -3.028440237045288, + "logits/rejected": -2.97921085357666, + "logps/chosen": -304.7507019042969, + "logps/rejected": -267.3523254394531, + "loss": 0.5266, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.5715482831001282, + "rewards/margins": 1.3217369318008423, + "rewards/rejected": -1.8932850360870361, + "step": 760 + }, + { + "epoch": 0.4, + "learning_rate": 4.820233314209217e-07, + "logits/chosen": -2.8253636360168457, + "logits/rejected": -2.8096349239349365, + "logps/chosen": -256.4940490722656, + "logps/rejected": -226.732666015625, + "loss": 0.5891, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.7520522475242615, + "rewards/margins": 1.1242681741714478, + "rewards/rejected": -1.876320242881775, + "step": 770 + }, + { + "epoch": 0.4, + "learning_rate": 4.810671256454389e-07, + "logits/chosen": -2.8110451698303223, + "logits/rejected": -2.8266806602478027, + "logps/chosen": -302.3857116699219, + "logps/rejected": -285.738525390625, + "loss": 0.4987, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.6822640895843506, + "rewards/margins": 1.6061044931411743, + "rewards/rejected": -2.2883687019348145, + "step": 780 + }, + { + "epoch": 0.41, + "learning_rate": 4.80110919869956e-07, + "logits/chosen": -2.858147382736206, + "logits/rejected": -2.80432391166687, + "logps/chosen": -298.38995361328125, + "logps/rejected": -229.829345703125, + "loss": 0.5486, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6031957864761353, + "rewards/margins": 1.3144261837005615, + "rewards/rejected": -1.9176222085952759, + "step": 790 + }, + { + "epoch": 0.41, + "learning_rate": 4.791547140944731e-07, + "logits/chosen": -2.7301101684570312, + "logits/rejected": -2.7294387817382812, + "logps/chosen": -222.55526733398438, + "logps/rejected": -228.2888641357422, + "loss": 0.4724, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.5060822367668152, + "rewards/margins": 1.4019149541854858, + "rewards/rejected": -1.9079973697662354, + "step": 800 + }, + { + "epoch": 0.41, + "eval_logits/chosen": -2.8297252655029297, + "eval_logits/rejected": -2.798793315887451, + "eval_logps/chosen": -277.2220764160156, + "eval_logps/rejected": -249.0490264892578, + "eval_loss": 0.514238178730011, + "eval_rewards/accuracies": 0.7620000243186951, + "eval_rewards/chosen": -0.6070927977561951, + "eval_rewards/margins": 1.4494118690490723, + "eval_rewards/rejected": -2.056504487991333, + "eval_runtime": 297.9515, + "eval_samples_per_second": 6.713, + "eval_steps_per_second": 0.42, + "step": 800 + }, + { + "epoch": 0.42, + "learning_rate": 4.781985083189902e-07, + "logits/chosen": -2.810962677001953, + "logits/rejected": -2.766624927520752, + "logps/chosen": -255.8228759765625, + "logps/rejected": -262.8910217285156, + "loss": 0.5184, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.5636113286018372, + "rewards/margins": 1.295597791671753, + "rewards/rejected": -1.8592088222503662, + "step": 810 + }, + { + "epoch": 0.42, + "learning_rate": 4.772423025435074e-07, + "logits/chosen": -2.8432140350341797, + "logits/rejected": -2.8048148155212402, + "logps/chosen": -279.9248962402344, + "logps/rejected": -271.9587707519531, + "loss": 0.5416, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6159058213233948, + "rewards/margins": 0.8555160760879517, + "rewards/rejected": -1.4714218378067017, + "step": 820 + }, + { + "epoch": 0.43, + "learning_rate": 4.762860967680244e-07, + "logits/chosen": -2.9207639694213867, + "logits/rejected": -2.9197988510131836, + "logps/chosen": -242.93972778320312, + "logps/rejected": -200.70346069335938, + "loss": 0.5519, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5735063552856445, + "rewards/margins": 1.0570865869522095, + "rewards/rejected": -1.630592703819275, + "step": 830 + }, + { + "epoch": 0.43, + "learning_rate": 4.7532989099254154e-07, + "logits/chosen": -2.903390407562256, + "logits/rejected": -2.811583995819092, + "logps/chosen": -262.1546936035156, + "logps/rejected": -235.65078735351562, + "loss": 0.5592, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7549904584884644, + "rewards/margins": 0.8606253862380981, + "rewards/rejected": -1.6156158447265625, + "step": 840 + }, + { + "epoch": 0.44, + "learning_rate": 4.7437368521705866e-07, + "logits/chosen": -2.9836201667785645, + "logits/rejected": -2.9336819648742676, + "logps/chosen": -252.79244995117188, + "logps/rejected": -262.43499755859375, + "loss": 0.5252, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.45973125100135803, + "rewards/margins": 1.5895912647247314, + "rewards/rejected": -2.0493226051330566, + "step": 850 + }, + { + "epoch": 0.44, + "learning_rate": 4.7341747944157577e-07, + "logits/chosen": -2.9847192764282227, + "logits/rejected": -2.968533754348755, + "logps/chosen": -274.5762023925781, + "logps/rejected": -252.141357421875, + "loss": 0.5585, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.6592631340026855, + "rewards/margins": 1.054240107536316, + "rewards/rejected": -1.7135032415390015, + "step": 860 + }, + { + "epoch": 0.45, + "learning_rate": 4.724612736660929e-07, + "logits/chosen": -2.8903841972351074, + "logits/rejected": -2.8222813606262207, + "logps/chosen": -276.86968994140625, + "logps/rejected": -245.29885864257812, + "loss": 0.4805, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.3650582730770111, + "rewards/margins": 1.8204562664031982, + "rewards/rejected": -2.1855146884918213, + "step": 870 + }, + { + "epoch": 0.45, + "learning_rate": 4.7150506789061006e-07, + "logits/chosen": -2.984184741973877, + "logits/rejected": -2.9287283420562744, + "logps/chosen": -291.3039855957031, + "logps/rejected": -292.13543701171875, + "loss": 0.503, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7998818159103394, + "rewards/margins": 1.5341354608535767, + "rewards/rejected": -2.334017515182495, + "step": 880 + }, + { + "epoch": 0.46, + "learning_rate": 4.7054886211512717e-07, + "logits/chosen": -2.9736297130584717, + "logits/rejected": -2.954500913619995, + "logps/chosen": -281.38250732421875, + "logps/rejected": -250.498779296875, + "loss": 0.5356, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.46949291229248047, + "rewards/margins": 1.1354753971099854, + "rewards/rejected": -1.6049684286117554, + "step": 890 + }, + { + "epoch": 0.46, + "learning_rate": 4.695926563396443e-07, + "logits/chosen": -3.040417432785034, + "logits/rejected": -2.989995002746582, + "logps/chosen": -276.5390930175781, + "logps/rejected": -237.54824829101562, + "loss": 0.5157, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8211701512336731, + "rewards/margins": 0.9814871549606323, + "rewards/rejected": -1.8026573657989502, + "step": 900 + }, + { + "epoch": 0.46, + "eval_logits/chosen": -2.9777884483337402, + "eval_logits/rejected": -2.946300506591797, + "eval_logps/chosen": -277.0157470703125, + "eval_logps/rejected": -246.65028381347656, + "eval_loss": 0.5049863457679749, + "eval_rewards/accuracies": 0.765999972820282, + "eval_rewards/chosen": -0.5864599347114563, + "eval_rewards/margins": 1.230170488357544, + "eval_rewards/rejected": -1.8166306018829346, + "eval_runtime": 297.0482, + "eval_samples_per_second": 6.733, + "eval_steps_per_second": 0.421, + "step": 900 + }, + { + "epoch": 0.47, + "learning_rate": 4.686364505641614e-07, + "logits/chosen": -2.992205858230591, + "logits/rejected": -2.9321579933166504, + "logps/chosen": -276.30682373046875, + "logps/rejected": -238.8515167236328, + "loss": 0.4941, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.6449284553527832, + "rewards/margins": 1.4866305589675903, + "rewards/rejected": -2.131558895111084, + "step": 910 + }, + { + "epoch": 0.47, + "learning_rate": 4.676802447886785e-07, + "logits/chosen": -2.90757155418396, + "logits/rejected": -2.8897616863250732, + "logps/chosen": -264.0755310058594, + "logps/rejected": -240.0446014404297, + "loss": 0.5385, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5407442450523376, + "rewards/margins": 1.37613844871521, + "rewards/rejected": -1.916882872581482, + "step": 920 + }, + { + "epoch": 0.48, + "learning_rate": 4.6672403901319564e-07, + "logits/chosen": -2.92992901802063, + "logits/rejected": -2.9072909355163574, + "logps/chosen": -258.2342224121094, + "logps/rejected": -241.68197631835938, + "loss": 0.4592, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.22730322182178497, + "rewards/margins": 1.7852048873901367, + "rewards/rejected": -2.012507915496826, + "step": 930 + }, + { + "epoch": 0.49, + "learning_rate": 4.6576783323771275e-07, + "logits/chosen": -2.8399574756622314, + "logits/rejected": -2.8085784912109375, + "logps/chosen": -229.1579132080078, + "logps/rejected": -221.8878173828125, + "loss": 0.4922, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.6926137804985046, + "rewards/margins": 1.2121039628982544, + "rewards/rejected": -1.9047178030014038, + "step": 940 + }, + { + "epoch": 0.49, + "learning_rate": 4.6481162746222987e-07, + "logits/chosen": -2.837979793548584, + "logits/rejected": -2.820255994796753, + "logps/chosen": -288.9910583496094, + "logps/rejected": -254.75588989257812, + "loss": 0.5064, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.9452236294746399, + "rewards/margins": 1.558538794517517, + "rewards/rejected": -2.503762722015381, + "step": 950 + }, + { + "epoch": 0.5, + "learning_rate": 4.63855421686747e-07, + "logits/chosen": -2.897672176361084, + "logits/rejected": -2.847825527191162, + "logps/chosen": -291.0904846191406, + "logps/rejected": -248.66873168945312, + "loss": 0.5254, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.5009971261024475, + "rewards/margins": 1.568277359008789, + "rewards/rejected": -2.069274663925171, + "step": 960 + }, + { + "epoch": 0.5, + "learning_rate": 4.628992159112641e-07, + "logits/chosen": -2.8176021575927734, + "logits/rejected": -2.863546848297119, + "logps/chosen": -266.3712158203125, + "logps/rejected": -269.98291015625, + "loss": 0.5196, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.17499732971191406, + "rewards/margins": 1.6475900411605835, + "rewards/rejected": -1.822587251663208, + "step": 970 + }, + { + "epoch": 0.51, + "learning_rate": 4.6194301013578116e-07, + "logits/chosen": -2.889280319213867, + "logits/rejected": -2.88883376121521, + "logps/chosen": -320.40277099609375, + "logps/rejected": -254.3000946044922, + "loss": 0.5441, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.4573570191860199, + "rewards/margins": 1.5372388362884521, + "rewards/rejected": -1.9945958852767944, + "step": 980 + }, + { + "epoch": 0.51, + "learning_rate": 4.609868043602983e-07, + "logits/chosen": -2.9552786350250244, + "logits/rejected": -2.889835834503174, + "logps/chosen": -253.4173583984375, + "logps/rejected": -232.3767852783203, + "loss": 0.4735, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4632430672645569, + "rewards/margins": 1.0684535503387451, + "rewards/rejected": -1.5316965579986572, + "step": 990 + }, + { + "epoch": 0.52, + "learning_rate": 4.600305985848154e-07, + "logits/chosen": -2.904284954071045, + "logits/rejected": -2.867764472961426, + "logps/chosen": -245.16519165039062, + "logps/rejected": -240.11782836914062, + "loss": 0.4641, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.47291144728660583, + "rewards/margins": 1.0328925848007202, + "rewards/rejected": -1.5058040618896484, + "step": 1000 + }, + { + "epoch": 0.52, + "eval_logits/chosen": -2.92161226272583, + "eval_logits/rejected": -2.891594171524048, + "eval_logps/chosen": -276.3018798828125, + "eval_logps/rejected": -248.4610595703125, + "eval_loss": 0.5090581774711609, + "eval_rewards/accuracies": 0.7580000162124634, + "eval_rewards/chosen": -0.515073835849762, + "eval_rewards/margins": 1.4826339483261108, + "eval_rewards/rejected": -1.997707724571228, + "eval_runtime": 299.1136, + "eval_samples_per_second": 6.686, + "eval_steps_per_second": 0.418, + "step": 1000 + }, + { + "epoch": 0.52, + "learning_rate": 4.590743928093325e-07, + "logits/chosen": -2.7496089935302734, + "logits/rejected": -2.7067320346832275, + "logps/chosen": -304.1459655761719, + "logps/rejected": -259.58782958984375, + "loss": 0.5741, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.7305110692977905, + "rewards/margins": 1.1352968215942383, + "rewards/rejected": -1.8658077716827393, + "step": 1010 + }, + { + "epoch": 0.53, + "learning_rate": 4.581181870338497e-07, + "logits/chosen": -2.8158769607543945, + "logits/rejected": -2.7898595333099365, + "logps/chosen": -316.39007568359375, + "logps/rejected": -285.505615234375, + "loss": 0.4861, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.3617403507232666, + "rewards/margins": 1.6614677906036377, + "rewards/rejected": -2.0232081413269043, + "step": 1020 + }, + { + "epoch": 0.53, + "learning_rate": 4.571619812583668e-07, + "logits/chosen": -2.8434338569641113, + "logits/rejected": -2.816493272781372, + "logps/chosen": -268.98223876953125, + "logps/rejected": -274.27459716796875, + "loss": 0.4956, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6085302233695984, + "rewards/margins": 1.38003671169281, + "rewards/rejected": -1.9885669946670532, + "step": 1030 + }, + { + "epoch": 0.54, + "learning_rate": 4.562057754828839e-07, + "logits/chosen": -2.8416907787323, + "logits/rejected": -2.7812013626098633, + "logps/chosen": -283.7867431640625, + "logps/rejected": -249.9698028564453, + "loss": 0.551, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.5633155703544617, + "rewards/margins": 1.5839368104934692, + "rewards/rejected": -2.1472525596618652, + "step": 1040 + }, + { + "epoch": 0.54, + "learning_rate": 4.55249569707401e-07, + "logits/chosen": -2.910600185394287, + "logits/rejected": -2.8899829387664795, + "logps/chosen": -235.5374298095703, + "logps/rejected": -255.29824829101562, + "loss": 0.4963, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.8693474531173706, + "rewards/margins": 1.2028529644012451, + "rewards/rejected": -2.072200298309326, + "step": 1050 + }, + { + "epoch": 0.55, + "learning_rate": 4.5429336393191814e-07, + "logits/chosen": -2.8421430587768555, + "logits/rejected": -2.862175703048706, + "logps/chosen": -250.13607788085938, + "logps/rejected": -224.20864868164062, + "loss": 0.5393, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6483901739120483, + "rewards/margins": 1.0726064443588257, + "rewards/rejected": -1.720996618270874, + "step": 1060 + }, + { + "epoch": 0.55, + "learning_rate": 4.5333715815643525e-07, + "logits/chosen": -2.9626762866973877, + "logits/rejected": -2.9626893997192383, + "logps/chosen": -299.9131774902344, + "logps/rejected": -270.77386474609375, + "loss": 0.5598, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.0119389295578003, + "rewards/margins": 1.3936216831207275, + "rewards/rejected": -2.4055607318878174, + "step": 1070 + }, + { + "epoch": 0.56, + "learning_rate": 4.5238095238095237e-07, + "logits/chosen": -2.9695396423339844, + "logits/rejected": -2.9188308715820312, + "logps/chosen": -283.611328125, + "logps/rejected": -250.3421630859375, + "loss": 0.5169, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1383975744247437, + "rewards/margins": 1.2842782735824585, + "rewards/rejected": -2.422675609588623, + "step": 1080 + }, + { + "epoch": 0.56, + "learning_rate": 4.514247466054695e-07, + "logits/chosen": -2.906083345413208, + "logits/rejected": -2.8836545944213867, + "logps/chosen": -230.08450317382812, + "logps/rejected": -224.95791625976562, + "loss": 0.5159, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.9879177808761597, + "rewards/margins": 1.4775643348693848, + "rewards/rejected": -2.465482234954834, + "step": 1090 + }, + { + "epoch": 0.57, + "learning_rate": 4.504685408299866e-07, + "logits/chosen": -2.810950756072998, + "logits/rejected": -2.805164098739624, + "logps/chosen": -303.5326843261719, + "logps/rejected": -280.25927734375, + "loss": 0.5558, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0075418949127197, + "rewards/margins": 1.3667891025543213, + "rewards/rejected": -2.374330759048462, + "step": 1100 + }, + { + "epoch": 0.57, + "eval_logits/chosen": -2.8914411067962646, + "eval_logits/rejected": -2.860114336013794, + "eval_logps/chosen": -279.2667541503906, + "eval_logps/rejected": -249.60362243652344, + "eval_loss": 0.49709653854370117, + "eval_rewards/accuracies": 0.7699999809265137, + "eval_rewards/chosen": -0.8115612268447876, + "eval_rewards/margins": 1.3004019260406494, + "eval_rewards/rejected": -2.1119627952575684, + "eval_runtime": 296.2613, + "eval_samples_per_second": 6.751, + "eval_steps_per_second": 0.422, + "step": 1100 + }, + { + "epoch": 0.57, + "learning_rate": 4.495123350545037e-07, + "logits/chosen": -2.894946336746216, + "logits/rejected": -2.880427122116089, + "logps/chosen": -302.75360107421875, + "logps/rejected": -287.68927001953125, + "loss": 0.5009, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.7256338596343994, + "rewards/margins": 1.2386404275894165, + "rewards/rejected": -1.9642740488052368, + "step": 1110 + }, + { + "epoch": 0.58, + "learning_rate": 4.4855612927902083e-07, + "logits/chosen": -2.8273870944976807, + "logits/rejected": -2.8010201454162598, + "logps/chosen": -305.2731628417969, + "logps/rejected": -255.99533081054688, + "loss": 0.5916, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2193191051483154, + "rewards/margins": 1.1791822910308838, + "rewards/rejected": -2.398501396179199, + "step": 1120 + }, + { + "epoch": 0.58, + "learning_rate": 4.4759992350353795e-07, + "logits/chosen": -2.8475892543792725, + "logits/rejected": -2.7523796558380127, + "logps/chosen": -290.9092712402344, + "logps/rejected": -231.7282257080078, + "loss": 0.4341, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.9197354316711426, + "rewards/margins": 1.6278235912322998, + "rewards/rejected": -2.5475590229034424, + "step": 1130 + }, + { + "epoch": 0.59, + "learning_rate": 4.46643717728055e-07, + "logits/chosen": -2.773775815963745, + "logits/rejected": -2.7185730934143066, + "logps/chosen": -249.8427734375, + "logps/rejected": -257.61114501953125, + "loss": 0.5095, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.9906557202339172, + "rewards/margins": 1.4055380821228027, + "rewards/rejected": -2.396193742752075, + "step": 1140 + }, + { + "epoch": 0.59, + "learning_rate": 4.4568751195257213e-07, + "logits/chosen": -2.783116340637207, + "logits/rejected": -2.7151153087615967, + "logps/chosen": -330.0415344238281, + "logps/rejected": -256.75262451171875, + "loss": 0.5247, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.5400577783584595, + "rewards/margins": 1.7343899011611938, + "rewards/rejected": -2.2744476795196533, + "step": 1150 + }, + { + "epoch": 0.6, + "learning_rate": 4.447313061770893e-07, + "logits/chosen": -2.7966766357421875, + "logits/rejected": -2.7340331077575684, + "logps/chosen": -263.78033447265625, + "logps/rejected": -267.43609619140625, + "loss": 0.5293, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.573074460029602, + "rewards/margins": 1.5856841802597046, + "rewards/rejected": -2.1587586402893066, + "step": 1160 + }, + { + "epoch": 0.6, + "learning_rate": 4.437751004016064e-07, + "logits/chosen": -2.684180498123169, + "logits/rejected": -2.628032684326172, + "logps/chosen": -229.15811157226562, + "logps/rejected": -241.14602661132812, + "loss": 0.5069, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.6570131778717041, + "rewards/margins": 1.2180211544036865, + "rewards/rejected": -1.8750343322753906, + "step": 1170 + }, + { + "epoch": 0.61, + "learning_rate": 4.4281889462612353e-07, + "logits/chosen": -2.8334476947784424, + "logits/rejected": -2.8391318321228027, + "logps/chosen": -269.18353271484375, + "logps/rejected": -228.5656280517578, + "loss": 0.4841, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.5598064064979553, + "rewards/margins": 1.3947551250457764, + "rewards/rejected": -1.9545615911483765, + "step": 1180 + }, + { + "epoch": 0.61, + "learning_rate": 4.4186268885064064e-07, + "logits/chosen": -2.858812093734741, + "logits/rejected": -2.861037492752075, + "logps/chosen": -296.47418212890625, + "logps/rejected": -253.0843505859375, + "loss": 0.5356, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.768179178237915, + "rewards/margins": 1.159069299697876, + "rewards/rejected": -1.9272483587265015, + "step": 1190 + }, + { + "epoch": 0.62, + "learning_rate": 4.4090648307515776e-07, + "logits/chosen": -2.862907648086548, + "logits/rejected": -2.834665298461914, + "logps/chosen": -216.3045196533203, + "logps/rejected": -189.45518493652344, + "loss": 0.4877, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.8047316670417786, + "rewards/margins": 1.0661463737487793, + "rewards/rejected": -1.8708778619766235, + "step": 1200 + }, + { + "epoch": 0.62, + "eval_logits/chosen": -2.8769757747650146, + "eval_logits/rejected": -2.833991765975952, + "eval_logps/chosen": -276.7474060058594, + "eval_logps/rejected": -247.43191528320312, + "eval_loss": 0.5092260837554932, + "eval_rewards/accuracies": 0.7639999985694885, + "eval_rewards/chosen": -0.5596281886100769, + "eval_rewards/margins": 1.3351647853851318, + "eval_rewards/rejected": -1.894792914390564, + "eval_runtime": 296.8641, + "eval_samples_per_second": 6.737, + "eval_steps_per_second": 0.421, + "step": 1200 + }, + { + "epoch": 0.62, + "learning_rate": 4.399502772996749e-07, + "logits/chosen": -2.869292736053467, + "logits/rejected": -2.847054958343506, + "logps/chosen": -274.1513671875, + "logps/rejected": -256.44647216796875, + "loss": 0.4977, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.8532268404960632, + "rewards/margins": 1.425225019454956, + "rewards/rejected": -2.278452157974243, + "step": 1210 + }, + { + "epoch": 0.63, + "learning_rate": 4.38994071524192e-07, + "logits/chosen": -2.8490569591522217, + "logits/rejected": -2.857327938079834, + "logps/chosen": -254.9918670654297, + "logps/rejected": -223.4118194580078, + "loss": 0.4921, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.870094895362854, + "rewards/margins": 1.205338478088379, + "rewards/rejected": -2.0754332542419434, + "step": 1220 + }, + { + "epoch": 0.64, + "learning_rate": 4.380378657487091e-07, + "logits/chosen": -2.90724778175354, + "logits/rejected": -2.839764356613159, + "logps/chosen": -317.3240051269531, + "logps/rejected": -304.71771240234375, + "loss": 0.4931, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.45973819494247437, + "rewards/margins": 1.6618239879608154, + "rewards/rejected": -2.1215622425079346, + "step": 1230 + }, + { + "epoch": 0.64, + "learning_rate": 4.370816599732262e-07, + "logits/chosen": -2.9106605052948, + "logits/rejected": -2.865180730819702, + "logps/chosen": -310.6402282714844, + "logps/rejected": -236.48153686523438, + "loss": 0.4856, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.191367506980896, + "rewards/margins": 1.2832874059677124, + "rewards/rejected": -2.4746549129486084, + "step": 1240 + }, + { + "epoch": 0.65, + "learning_rate": 4.3612545419774334e-07, + "logits/chosen": -2.8365566730499268, + "logits/rejected": -2.8342230319976807, + "logps/chosen": -237.7244110107422, + "logps/rejected": -268.4637451171875, + "loss": 0.5643, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1750293970108032, + "rewards/margins": 0.9123395085334778, + "rewards/rejected": -2.0873687267303467, + "step": 1250 + }, + { + "epoch": 0.65, + "learning_rate": 4.3516924842226045e-07, + "logits/chosen": -2.8220834732055664, + "logits/rejected": -2.7954087257385254, + "logps/chosen": -284.5871887207031, + "logps/rejected": -250.57449340820312, + "loss": 0.5149, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.7969790101051331, + "rewards/margins": 1.5051862001419067, + "rewards/rejected": -2.3021652698516846, + "step": 1260 + }, + { + "epoch": 0.66, + "learning_rate": 4.3421304264677757e-07, + "logits/chosen": -2.867050886154175, + "logits/rejected": -2.848658323287964, + "logps/chosen": -258.2459411621094, + "logps/rejected": -223.54580688476562, + "loss": 0.4854, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -0.8680087924003601, + "rewards/margins": 1.5067569017410278, + "rewards/rejected": -2.374765634536743, + "step": 1270 + }, + { + "epoch": 0.66, + "learning_rate": 4.332568368712947e-07, + "logits/chosen": -2.969085931777954, + "logits/rejected": -2.9144904613494873, + "logps/chosen": -273.5484313964844, + "logps/rejected": -251.82192993164062, + "loss": 0.5912, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0105066299438477, + "rewards/margins": 1.149637222290039, + "rewards/rejected": -2.1601438522338867, + "step": 1280 + }, + { + "epoch": 0.67, + "learning_rate": 4.323006310958118e-07, + "logits/chosen": -2.9112117290496826, + "logits/rejected": -2.872497320175171, + "logps/chosen": -316.62298583984375, + "logps/rejected": -268.509765625, + "loss": 0.4789, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.8553665280342102, + "rewards/margins": 1.4781516790390015, + "rewards/rejected": -2.3335182666778564, + "step": 1290 + }, + { + "epoch": 0.67, + "learning_rate": 4.313444253203289e-07, + "logits/chosen": -2.8557441234588623, + "logits/rejected": -2.847517490386963, + "logps/chosen": -256.849609375, + "logps/rejected": -240.433837890625, + "loss": 0.4922, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.8805469274520874, + "rewards/margins": 1.5468670129776, + "rewards/rejected": -2.4274144172668457, + "step": 1300 + }, + { + "epoch": 0.67, + "eval_logits/chosen": -2.8517472743988037, + "eval_logits/rejected": -2.8187196254730225, + "eval_logps/chosen": -280.490966796875, + "eval_logps/rejected": -252.22872924804688, + "eval_loss": 0.5181106925010681, + "eval_rewards/accuracies": 0.7459999918937683, + "eval_rewards/chosen": -0.9339839220046997, + "eval_rewards/margins": 1.4404925107955933, + "eval_rewards/rejected": -2.374476671218872, + "eval_runtime": 296.964, + "eval_samples_per_second": 6.735, + "eval_steps_per_second": 0.421, + "step": 1300 + }, + { + "epoch": 0.68, + "learning_rate": 4.3038821954484603e-07, + "logits/chosen": -2.860063076019287, + "logits/rejected": -2.794776439666748, + "logps/chosen": -283.9671325683594, + "logps/rejected": -255.06192016601562, + "loss": 0.464, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1593186855316162, + "rewards/margins": 1.200408697128296, + "rewards/rejected": -2.359727382659912, + "step": 1310 + }, + { + "epoch": 0.68, + "learning_rate": 4.2943201376936315e-07, + "logits/chosen": -2.7973389625549316, + "logits/rejected": -2.769406318664551, + "logps/chosen": -282.5267028808594, + "logps/rejected": -258.59527587890625, + "loss": 0.5268, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1852004528045654, + "rewards/margins": 1.5480226278305054, + "rewards/rejected": -2.7332231998443604, + "step": 1320 + }, + { + "epoch": 0.69, + "learning_rate": 4.2847580799388026e-07, + "logits/chosen": -2.8179421424865723, + "logits/rejected": -2.78226900100708, + "logps/chosen": -297.841552734375, + "logps/rejected": -285.6810302734375, + "loss": 0.5583, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.291656255722046, + "rewards/margins": 1.1764315366744995, + "rewards/rejected": -2.468087911605835, + "step": 1330 + }, + { + "epoch": 0.69, + "learning_rate": 4.275196022183974e-07, + "logits/chosen": -2.7832727432250977, + "logits/rejected": -2.721656322479248, + "logps/chosen": -301.8523254394531, + "logps/rejected": -235.78750610351562, + "loss": 0.4971, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.1573271751403809, + "rewards/margins": 1.297183632850647, + "rewards/rejected": -2.4545111656188965, + "step": 1340 + }, + { + "epoch": 0.7, + "learning_rate": 4.265633964429145e-07, + "logits/chosen": -2.8424277305603027, + "logits/rejected": -2.831519365310669, + "logps/chosen": -269.0021667480469, + "logps/rejected": -215.4640350341797, + "loss": 0.5687, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.104570746421814, + "rewards/margins": 0.9818031191825867, + "rewards/rejected": -2.086374044418335, + "step": 1350 + }, + { + "epoch": 0.7, + "learning_rate": 4.256071906674316e-07, + "logits/chosen": -2.841625690460205, + "logits/rejected": -2.8271470069885254, + "logps/chosen": -314.56719970703125, + "logps/rejected": -280.16204833984375, + "loss": 0.5947, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.9873099327087402, + "rewards/margins": 1.2393622398376465, + "rewards/rejected": -2.226672410964966, + "step": 1360 + }, + { + "epoch": 0.71, + "learning_rate": 4.246509848919487e-07, + "logits/chosen": -2.875816822052002, + "logits/rejected": -2.875920057296753, + "logps/chosen": -262.7657165527344, + "logps/rejected": -266.3171691894531, + "loss": 0.5325, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8712446093559265, + "rewards/margins": 1.4807698726654053, + "rewards/rejected": -2.3520145416259766, + "step": 1370 + }, + { + "epoch": 0.71, + "learning_rate": 4.2369477911646584e-07, + "logits/chosen": -2.901019811630249, + "logits/rejected": -2.860865592956543, + "logps/chosen": -275.1546325683594, + "logps/rejected": -237.35043334960938, + "loss": 0.5662, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.190070390701294, + "rewards/margins": 1.1778628826141357, + "rewards/rejected": -2.3679332733154297, + "step": 1380 + }, + { + "epoch": 0.72, + "learning_rate": 4.2273857334098296e-07, + "logits/chosen": -2.8525962829589844, + "logits/rejected": -2.7851364612579346, + "logps/chosen": -280.11444091796875, + "logps/rejected": -238.8421630859375, + "loss": 0.55, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.947921633720398, + "rewards/margins": 1.1795740127563477, + "rewards/rejected": -2.127495527267456, + "step": 1390 + }, + { + "epoch": 0.72, + "learning_rate": 4.2178236756550007e-07, + "logits/chosen": -2.8216989040374756, + "logits/rejected": -2.7960610389709473, + "logps/chosen": -291.2049255371094, + "logps/rejected": -238.7577362060547, + "loss": 0.5515, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.9337828755378723, + "rewards/margins": 1.1081970930099487, + "rewards/rejected": -2.041980266571045, + "step": 1400 + }, + { + "epoch": 0.72, + "eval_logits/chosen": -2.870413303375244, + "eval_logits/rejected": -2.8487894535064697, + "eval_logps/chosen": -281.02386474609375, + "eval_logps/rejected": -250.60337829589844, + "eval_loss": 0.5081000924110413, + "eval_rewards/accuracies": 0.7440000176429749, + "eval_rewards/chosen": -0.987274169921875, + "eval_rewards/margins": 1.2246668338775635, + "eval_rewards/rejected": -2.2119410037994385, + "eval_runtime": 297.404, + "eval_samples_per_second": 6.725, + "eval_steps_per_second": 0.42, + "step": 1400 + }, + { + "epoch": 0.73, + "learning_rate": 4.208261617900172e-07, + "logits/chosen": -2.8915770053863525, + "logits/rejected": -2.8828485012054443, + "logps/chosen": -284.6546325683594, + "logps/rejected": -214.5458984375, + "loss": 0.4857, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.0735814571380615, + "rewards/margins": 1.3267624378204346, + "rewards/rejected": -2.400344133377075, + "step": 1410 + }, + { + "epoch": 0.73, + "learning_rate": 4.198699560145343e-07, + "logits/chosen": -2.7232089042663574, + "logits/rejected": -2.735395908355713, + "logps/chosen": -261.86456298828125, + "logps/rejected": -242.8522186279297, + "loss": 0.5901, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.9799901247024536, + "rewards/margins": 1.3254430294036865, + "rewards/rejected": -2.3054332733154297, + "step": 1420 + }, + { + "epoch": 0.74, + "learning_rate": 4.189137502390514e-07, + "logits/chosen": -2.8386452198028564, + "logits/rejected": -2.7790274620056152, + "logps/chosen": -276.7398376464844, + "logps/rejected": -271.81732177734375, + "loss": 0.5506, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8171237707138062, + "rewards/margins": 1.1944704055786133, + "rewards/rejected": -2.01159405708313, + "step": 1430 + }, + { + "epoch": 0.74, + "learning_rate": 4.179575444635686e-07, + "logits/chosen": -2.8361704349517822, + "logits/rejected": -2.781494617462158, + "logps/chosen": -331.5342712402344, + "logps/rejected": -278.0868225097656, + "loss": 0.5754, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.726187527179718, + "rewards/margins": 1.3111015558242798, + "rewards/rejected": -2.0372891426086426, + "step": 1440 + }, + { + "epoch": 0.75, + "learning_rate": 4.170013386880857e-07, + "logits/chosen": -2.763598918914795, + "logits/rejected": -2.744124412536621, + "logps/chosen": -274.3957824707031, + "logps/rejected": -286.14544677734375, + "loss": 0.4956, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.8051531910896301, + "rewards/margins": 1.4850131273269653, + "rewards/rejected": -2.2901663780212402, + "step": 1450 + }, + { + "epoch": 0.75, + "learning_rate": 4.1604513291260277e-07, + "logits/chosen": -2.7877087593078613, + "logits/rejected": -2.7506332397460938, + "logps/chosen": -263.64764404296875, + "logps/rejected": -259.18243408203125, + "loss": 0.5687, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.7930214405059814, + "rewards/margins": 1.4395453929901123, + "rewards/rejected": -2.2325668334960938, + "step": 1460 + }, + { + "epoch": 0.76, + "learning_rate": 4.150889271371199e-07, + "logits/chosen": -2.735426425933838, + "logits/rejected": -2.7236385345458984, + "logps/chosen": -284.9507751464844, + "logps/rejected": -236.884521484375, + "loss": 0.5582, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9369648098945618, + "rewards/margins": 1.3734034299850464, + "rewards/rejected": -2.310368299484253, + "step": 1470 + }, + { + "epoch": 0.76, + "learning_rate": 4.14132721361637e-07, + "logits/chosen": -2.770085334777832, + "logits/rejected": -2.688161611557007, + "logps/chosen": -275.33111572265625, + "logps/rejected": -206.7180938720703, + "loss": 0.4587, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.0923352241516113, + "rewards/margins": 1.6801226139068604, + "rewards/rejected": -2.7724575996398926, + "step": 1480 + }, + { + "epoch": 0.77, + "learning_rate": 4.131765155861541e-07, + "logits/chosen": -2.7762527465820312, + "logits/rejected": -2.747938632965088, + "logps/chosen": -239.770751953125, + "logps/rejected": -215.8168182373047, + "loss": 0.5012, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.7464852333068848, + "rewards/margins": 1.6632707118988037, + "rewards/rejected": -2.4097559452056885, + "step": 1490 + }, + { + "epoch": 0.77, + "learning_rate": 4.1222030981067123e-07, + "logits/chosen": -2.8420047760009766, + "logits/rejected": -2.8117308616638184, + "logps/chosen": -303.0367126464844, + "logps/rejected": -269.77996826171875, + "loss": 0.4349, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.0034947395324707, + "rewards/margins": 1.6452404260635376, + "rewards/rejected": -2.6487350463867188, + "step": 1500 + }, + { + "epoch": 0.77, + "eval_logits/chosen": -2.8601479530334473, + "eval_logits/rejected": -2.840158700942993, + "eval_logps/chosen": -280.1994323730469, + "eval_logps/rejected": -252.74588012695312, + "eval_loss": 0.4996170699596405, + "eval_rewards/accuracies": 0.7580000162124634, + "eval_rewards/chosen": -0.9048290252685547, + "eval_rewards/margins": 1.5213594436645508, + "eval_rewards/rejected": -2.4261887073516846, + "eval_runtime": 297.5672, + "eval_samples_per_second": 6.721, + "eval_steps_per_second": 0.42, + "step": 1500 + }, + { + "epoch": 0.78, + "learning_rate": 4.1126410403518835e-07, + "logits/chosen": -2.8034896850585938, + "logits/rejected": -2.779625415802002, + "logps/chosen": -248.8442840576172, + "logps/rejected": -269.21832275390625, + "loss": 0.5365, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1126397848129272, + "rewards/margins": 1.5354644060134888, + "rewards/rejected": -2.648104190826416, + "step": 1510 + }, + { + "epoch": 0.78, + "learning_rate": 4.1030789825970546e-07, + "logits/chosen": -2.775416851043701, + "logits/rejected": -2.771148443222046, + "logps/chosen": -294.32366943359375, + "logps/rejected": -273.7279052734375, + "loss": 0.5477, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.8498045206069946, + "rewards/margins": 1.3190572261810303, + "rewards/rejected": -2.1688618659973145, + "step": 1520 + }, + { + "epoch": 0.79, + "learning_rate": 4.093516924842226e-07, + "logits/chosen": -2.7791056632995605, + "logits/rejected": -2.7741143703460693, + "logps/chosen": -285.6884460449219, + "logps/rejected": -259.9753723144531, + "loss": 0.4439, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6743859648704529, + "rewards/margins": 1.2439590692520142, + "rewards/rejected": -1.9183450937271118, + "step": 1530 + }, + { + "epoch": 0.8, + "learning_rate": 4.083954867087397e-07, + "logits/chosen": -2.916766405105591, + "logits/rejected": -2.8723397254943848, + "logps/chosen": -262.5208740234375, + "logps/rejected": -250.9338836669922, + "loss": 0.4902, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7652831673622131, + "rewards/margins": 1.256225347518921, + "rewards/rejected": -2.0215084552764893, + "step": 1540 + }, + { + "epoch": 0.8, + "learning_rate": 4.074392809332568e-07, + "logits/chosen": -2.8186182975769043, + "logits/rejected": -2.783643960952759, + "logps/chosen": -318.96734619140625, + "logps/rejected": -273.291259765625, + "loss": 0.5233, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.7978024482727051, + "rewards/margins": 2.1130380630493164, + "rewards/rejected": -2.9108405113220215, + "step": 1550 + }, + { + "epoch": 0.81, + "learning_rate": 4.064830751577739e-07, + "logits/chosen": -2.8384575843811035, + "logits/rejected": -2.792706251144409, + "logps/chosen": -263.0948791503906, + "logps/rejected": -248.63986206054688, + "loss": 0.474, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.6642600297927856, + "rewards/margins": 1.660636305809021, + "rewards/rejected": -2.3248965740203857, + "step": 1560 + }, + { + "epoch": 0.81, + "learning_rate": 4.0552686938229104e-07, + "logits/chosen": -2.845766305923462, + "logits/rejected": -2.8493919372558594, + "logps/chosen": -261.2109375, + "logps/rejected": -236.23056030273438, + "loss": 0.4627, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5801820755004883, + "rewards/margins": 1.506900429725647, + "rewards/rejected": -2.087082624435425, + "step": 1570 + }, + { + "epoch": 0.82, + "learning_rate": 4.045706636068082e-07, + "logits/chosen": -2.8144640922546387, + "logits/rejected": -2.752319097518921, + "logps/chosen": -277.097412109375, + "logps/rejected": -248.9197235107422, + "loss": 0.4852, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.9225671887397766, + "rewards/margins": 1.5256555080413818, + "rewards/rejected": -2.4482226371765137, + "step": 1580 + }, + { + "epoch": 0.82, + "learning_rate": 4.036144578313253e-07, + "logits/chosen": -2.780768871307373, + "logits/rejected": -2.7448782920837402, + "logps/chosen": -280.2907409667969, + "logps/rejected": -259.012451171875, + "loss": 0.58, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9570592641830444, + "rewards/margins": 1.3608639240264893, + "rewards/rejected": -2.3179233074188232, + "step": 1590 + }, + { + "epoch": 0.83, + "learning_rate": 4.0265825205584244e-07, + "logits/chosen": -2.801021099090576, + "logits/rejected": -2.766148805618286, + "logps/chosen": -294.86181640625, + "logps/rejected": -266.83282470703125, + "loss": 0.5446, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9832652807235718, + "rewards/margins": 1.3677330017089844, + "rewards/rejected": -2.3509984016418457, + "step": 1600 + }, + { + "epoch": 0.83, + "eval_logits/chosen": -2.7852587699890137, + "eval_logits/rejected": -2.7610068321228027, + "eval_logps/chosen": -279.8681335449219, + "eval_logps/rejected": -252.8737030029297, + "eval_loss": 0.4926547408103943, + "eval_rewards/accuracies": 0.765999972820282, + "eval_rewards/chosen": -0.8716984987258911, + "eval_rewards/margins": 1.5672730207443237, + "eval_rewards/rejected": -2.438971519470215, + "eval_runtime": 297.4562, + "eval_samples_per_second": 6.724, + "eval_steps_per_second": 0.42, + "step": 1600 + }, + { + "epoch": 0.83, + "learning_rate": 4.0170204628035956e-07, + "logits/chosen": -2.779371976852417, + "logits/rejected": -2.779296398162842, + "logps/chosen": -227.3587188720703, + "logps/rejected": -220.8697052001953, + "loss": 0.5078, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.7682415246963501, + "rewards/margins": 1.6325304508209229, + "rewards/rejected": -2.4007718563079834, + "step": 1610 + }, + { + "epoch": 0.84, + "learning_rate": 4.007458405048766e-07, + "logits/chosen": -2.79868745803833, + "logits/rejected": -2.7551894187927246, + "logps/chosen": -313.44866943359375, + "logps/rejected": -284.7059631347656, + "loss": 0.4781, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.0337600708007812, + "rewards/margins": 1.553781270980835, + "rewards/rejected": -2.587541103363037, + "step": 1620 + }, + { + "epoch": 0.84, + "learning_rate": 3.9978963472939373e-07, + "logits/chosen": -2.781686782836914, + "logits/rejected": -2.7553551197052, + "logps/chosen": -282.82470703125, + "logps/rejected": -247.119384765625, + "loss": 0.4834, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8043079376220703, + "rewards/margins": 1.690171480178833, + "rewards/rejected": -2.4944794178009033, + "step": 1630 + }, + { + "epoch": 0.85, + "learning_rate": 3.9883342895391085e-07, + "logits/chosen": -2.7971036434173584, + "logits/rejected": -2.7414002418518066, + "logps/chosen": -324.1792907714844, + "logps/rejected": -256.4706726074219, + "loss": 0.4813, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.9165623784065247, + "rewards/margins": 1.7748088836669922, + "rewards/rejected": -2.691371440887451, + "step": 1640 + }, + { + "epoch": 0.85, + "learning_rate": 3.9787722317842796e-07, + "logits/chosen": -2.839597225189209, + "logits/rejected": -2.805170774459839, + "logps/chosen": -296.72198486328125, + "logps/rejected": -214.109619140625, + "loss": 0.5182, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6080407500267029, + "rewards/margins": 1.669585943222046, + "rewards/rejected": -2.2776267528533936, + "step": 1650 + }, + { + "epoch": 0.86, + "learning_rate": 3.969210174029451e-07, + "logits/chosen": -2.8030974864959717, + "logits/rejected": -2.746521472930908, + "logps/chosen": -282.7402038574219, + "logps/rejected": -277.02227783203125, + "loss": 0.4973, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.145887851715088, + "rewards/margins": 1.514013648033142, + "rewards/rejected": -2.6599013805389404, + "step": 1660 + }, + { + "epoch": 0.86, + "learning_rate": 3.959648116274622e-07, + "logits/chosen": -2.822636365890503, + "logits/rejected": -2.7770602703094482, + "logps/chosen": -274.1295471191406, + "logps/rejected": -237.7220458984375, + "loss": 0.6338, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1833336353302002, + "rewards/margins": 1.1599174737930298, + "rewards/rejected": -2.3432514667510986, + "step": 1670 + }, + { + "epoch": 0.87, + "learning_rate": 3.950086058519793e-07, + "logits/chosen": -2.8634159564971924, + "logits/rejected": -2.855304479598999, + "logps/chosen": -250.9813232421875, + "logps/rejected": -238.8641357421875, + "loss": 0.5555, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.0109765529632568, + "rewards/margins": 1.2353746891021729, + "rewards/rejected": -2.2463512420654297, + "step": 1680 + }, + { + "epoch": 0.87, + "learning_rate": 3.9405240007649643e-07, + "logits/chosen": -2.9426653385162354, + "logits/rejected": -2.9147517681121826, + "logps/chosen": -265.8846130371094, + "logps/rejected": -255.80490112304688, + "loss": 0.4857, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.008003830909729, + "rewards/margins": 1.246358871459961, + "rewards/rejected": -2.2543625831604004, + "step": 1690 + }, + { + "epoch": 0.88, + "learning_rate": 3.9309619430101354e-07, + "logits/chosen": -2.8152334690093994, + "logits/rejected": -2.780714750289917, + "logps/chosen": -261.5868225097656, + "logps/rejected": -245.407470703125, + "loss": 0.5242, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.9333425760269165, + "rewards/margins": 1.554662823677063, + "rewards/rejected": -2.4880051612854004, + "step": 1700 + }, + { + "epoch": 0.88, + "eval_logits/chosen": -2.8524723052978516, + "eval_logits/rejected": -2.8269340991973877, + "eval_logps/chosen": -278.135498046875, + "eval_logps/rejected": -249.865478515625, + "eval_loss": 0.48644253611564636, + "eval_rewards/accuracies": 0.777999997138977, + "eval_rewards/chosen": -0.6984347105026245, + "eval_rewards/margins": 1.4397144317626953, + "eval_rewards/rejected": -2.1381492614746094, + "eval_runtime": 297.4083, + "eval_samples_per_second": 6.725, + "eval_steps_per_second": 0.42, + "step": 1700 + }, + { + "epoch": 0.88, + "learning_rate": 3.9213998852553066e-07, + "logits/chosen": -2.858901262283325, + "logits/rejected": -2.8314156532287598, + "logps/chosen": -326.45538330078125, + "logps/rejected": -269.11468505859375, + "loss": 0.5017, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.9739869832992554, + "rewards/margins": 1.2920843362808228, + "rewards/rejected": -2.2660715579986572, + "step": 1710 + }, + { + "epoch": 0.89, + "learning_rate": 3.9118378275004783e-07, + "logits/chosen": -2.879965305328369, + "logits/rejected": -2.8734488487243652, + "logps/chosen": -274.63604736328125, + "logps/rejected": -316.09521484375, + "loss": 0.5748, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.099215030670166, + "rewards/margins": 1.0683257579803467, + "rewards/rejected": -2.167541027069092, + "step": 1720 + }, + { + "epoch": 0.89, + "learning_rate": 3.9022757697456494e-07, + "logits/chosen": -2.7558462619781494, + "logits/rejected": -2.771763324737549, + "logps/chosen": -331.04693603515625, + "logps/rejected": -281.0125427246094, + "loss": 0.4806, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.7285557985305786, + "rewards/margins": 1.2618799209594727, + "rewards/rejected": -1.9904358386993408, + "step": 1730 + }, + { + "epoch": 0.9, + "learning_rate": 3.8927137119908206e-07, + "logits/chosen": -2.8156943321228027, + "logits/rejected": -2.782925844192505, + "logps/chosen": -299.8418884277344, + "logps/rejected": -229.73574829101562, + "loss": 0.5426, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9351360201835632, + "rewards/margins": 1.2108604907989502, + "rewards/rejected": -2.14599609375, + "step": 1740 + }, + { + "epoch": 0.9, + "learning_rate": 3.883151654235992e-07, + "logits/chosen": -2.8140454292297363, + "logits/rejected": -2.774932861328125, + "logps/chosen": -295.07867431640625, + "logps/rejected": -263.8828125, + "loss": 0.4908, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.813959002494812, + "rewards/margins": 1.7653522491455078, + "rewards/rejected": -2.579310894012451, + "step": 1750 + }, + { + "epoch": 0.91, + "learning_rate": 3.873589596481163e-07, + "logits/chosen": -2.8576390743255615, + "logits/rejected": -2.8543241024017334, + "logps/chosen": -293.5692443847656, + "logps/rejected": -266.193359375, + "loss": 0.5689, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.2586266994476318, + "rewards/margins": 1.2890112400054932, + "rewards/rejected": -2.547637939453125, + "step": 1760 + }, + { + "epoch": 0.91, + "learning_rate": 3.864027538726334e-07, + "logits/chosen": -2.7836225032806396, + "logits/rejected": -2.7786474227905273, + "logps/chosen": -277.83740234375, + "logps/rejected": -254.81494140625, + "loss": 0.5213, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.8805364370346069, + "rewards/margins": 1.7172577381134033, + "rewards/rejected": -2.5977942943573, + "step": 1770 + }, + { + "epoch": 0.92, + "learning_rate": 3.8544654809715047e-07, + "logits/chosen": -2.7854583263397217, + "logits/rejected": -2.757338523864746, + "logps/chosen": -291.8116760253906, + "logps/rejected": -260.56121826171875, + "loss": 0.531, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.1720925569534302, + "rewards/margins": 1.3734238147735596, + "rewards/rejected": -2.5455162525177, + "step": 1780 + }, + { + "epoch": 0.92, + "learning_rate": 3.844903423216676e-07, + "logits/chosen": -2.767277240753174, + "logits/rejected": -2.7258496284484863, + "logps/chosen": -269.7626647949219, + "logps/rejected": -236.54165649414062, + "loss": 0.5328, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.7634437680244446, + "rewards/margins": 1.3117390871047974, + "rewards/rejected": -2.0751829147338867, + "step": 1790 + }, + { + "epoch": 0.93, + "learning_rate": 3.835341365461847e-07, + "logits/chosen": -2.716766834259033, + "logits/rejected": -2.6774024963378906, + "logps/chosen": -272.6806640625, + "logps/rejected": -222.8508758544922, + "loss": 0.5266, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -0.6215282678604126, + "rewards/margins": 1.5075757503509521, + "rewards/rejected": -2.129103899002075, + "step": 1800 + }, + { + "epoch": 0.93, + "eval_logits/chosen": -2.7715063095092773, + "eval_logits/rejected": -2.7381045818328857, + "eval_logps/chosen": -276.5621032714844, + "eval_logps/rejected": -247.96278381347656, + "eval_loss": 0.501970112323761, + "eval_rewards/accuracies": 0.7760000228881836, + "eval_rewards/chosen": -0.5410944819450378, + "eval_rewards/margins": 1.406785011291504, + "eval_rewards/rejected": -1.9478795528411865, + "eval_runtime": 297.6711, + "eval_samples_per_second": 6.719, + "eval_steps_per_second": 0.42, + "step": 1800 + }, + { + "epoch": 0.93, + "learning_rate": 3.825779307707018e-07, + "logits/chosen": -2.7607715129852295, + "logits/rejected": -2.7069694995880127, + "logps/chosen": -208.4616241455078, + "logps/rejected": -229.7495574951172, + "loss": 0.5221, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6570498943328857, + "rewards/margins": 1.241496205329895, + "rewards/rejected": -1.8985458612442017, + "step": 1810 + }, + { + "epoch": 0.94, + "learning_rate": 3.8162172499521893e-07, + "logits/chosen": -2.776834726333618, + "logits/rejected": -2.709933042526245, + "logps/chosen": -266.1866455078125, + "logps/rejected": -218.61087036132812, + "loss": 0.5243, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.8236967921257019, + "rewards/margins": 1.372011423110962, + "rewards/rejected": -2.1957080364227295, + "step": 1820 + }, + { + "epoch": 0.94, + "learning_rate": 3.8066551921973605e-07, + "logits/chosen": -2.7101306915283203, + "logits/rejected": -2.7025110721588135, + "logps/chosen": -261.57611083984375, + "logps/rejected": -236.56466674804688, + "loss": 0.4898, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6180542707443237, + "rewards/margins": 1.3089244365692139, + "rewards/rejected": -1.9269788265228271, + "step": 1830 + }, + { + "epoch": 0.95, + "learning_rate": 3.7970931344425316e-07, + "logits/chosen": -2.691938877105713, + "logits/rejected": -2.637636661529541, + "logps/chosen": -298.92431640625, + "logps/rejected": -240.8381805419922, + "loss": 0.4783, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.7817220091819763, + "rewards/margins": 1.1453922986984253, + "rewards/rejected": -1.9271142482757568, + "step": 1840 + }, + { + "epoch": 0.96, + "learning_rate": 3.787531076687703e-07, + "logits/chosen": -2.5811028480529785, + "logits/rejected": -2.577949285507202, + "logps/chosen": -250.656494140625, + "logps/rejected": -207.6524658203125, + "loss": 0.5401, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.8317922353744507, + "rewards/margins": 1.5317630767822266, + "rewards/rejected": -2.363555669784546, + "step": 1850 + }, + { + "epoch": 0.96, + "learning_rate": 3.7779690189328745e-07, + "logits/chosen": -2.7253241539001465, + "logits/rejected": -2.7099854946136475, + "logps/chosen": -261.5533142089844, + "logps/rejected": -238.36221313476562, + "loss": 0.4759, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.91804438829422, + "rewards/margins": 1.307298183441162, + "rewards/rejected": -2.2253427505493164, + "step": 1860 + }, + { + "epoch": 0.97, + "learning_rate": 3.7684069611780456e-07, + "logits/chosen": -2.7499070167541504, + "logits/rejected": -2.6969666481018066, + "logps/chosen": -284.4208068847656, + "logps/rejected": -250.7512664794922, + "loss": 0.503, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.5610580444335938, + "rewards/margins": 1.5138256549835205, + "rewards/rejected": -2.0748836994171143, + "step": 1870 + }, + { + "epoch": 0.97, + "learning_rate": 3.758844903423217e-07, + "logits/chosen": -2.7842278480529785, + "logits/rejected": -2.731440305709839, + "logps/chosen": -256.0408630371094, + "logps/rejected": -248.4185791015625, + "loss": 0.5077, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.7275162935256958, + "rewards/margins": 1.5901552438735962, + "rewards/rejected": -2.317671060562134, + "step": 1880 + }, + { + "epoch": 0.98, + "learning_rate": 3.749282845668388e-07, + "logits/chosen": -2.7996678352355957, + "logits/rejected": -2.7667319774627686, + "logps/chosen": -303.9225769042969, + "logps/rejected": -270.538330078125, + "loss": 0.506, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.9399793744087219, + "rewards/margins": 1.2743322849273682, + "rewards/rejected": -2.2143118381500244, + "step": 1890 + }, + { + "epoch": 0.98, + "learning_rate": 3.739720787913559e-07, + "logits/chosen": -2.7126071453094482, + "logits/rejected": -2.6714975833892822, + "logps/chosen": -266.16204833984375, + "logps/rejected": -232.2299041748047, + "loss": 0.498, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9018408060073853, + "rewards/margins": 1.3833215236663818, + "rewards/rejected": -2.2851624488830566, + "step": 1900 + }, + { + "epoch": 0.98, + "eval_logits/chosen": -2.766378879547119, + "eval_logits/rejected": -2.7298452854156494, + "eval_logps/chosen": -278.0451965332031, + "eval_logps/rejected": -248.81500244140625, + "eval_loss": 0.5085920691490173, + "eval_rewards/accuracies": 0.7639999985694885, + "eval_rewards/chosen": -0.6894029378890991, + "eval_rewards/margins": 1.3436976671218872, + "eval_rewards/rejected": -2.0331006050109863, + "eval_runtime": 299.0083, + "eval_samples_per_second": 6.689, + "eval_steps_per_second": 0.418, + "step": 1900 + }, + { + "epoch": 0.99, + "learning_rate": 3.73015873015873e-07, + "logits/chosen": -2.7295749187469482, + "logits/rejected": -2.746682643890381, + "logps/chosen": -278.3253479003906, + "logps/rejected": -262.86627197265625, + "loss": 0.5109, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7267035245895386, + "rewards/margins": 0.9839091300964355, + "rewards/rejected": -1.7106126546859741, + "step": 1910 + }, + { + "epoch": 0.99, + "learning_rate": 3.7205966724039014e-07, + "logits/chosen": -2.6448540687561035, + "logits/rejected": -2.645981550216675, + "logps/chosen": -288.9250183105469, + "logps/rejected": -249.8538055419922, + "loss": 0.5003, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9050509333610535, + "rewards/margins": 1.1149990558624268, + "rewards/rejected": -2.020050048828125, + "step": 1920 + }, + { + "epoch": 1.0, + "learning_rate": 3.711034614649072e-07, + "logits/chosen": -2.7751498222351074, + "logits/rejected": -2.698579788208008, + "logps/chosen": -307.6148376464844, + "logps/rejected": -244.74441528320312, + "loss": 0.4483, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7877746224403381, + "rewards/margins": 1.6704633235931396, + "rewards/rejected": -2.458237886428833, + "step": 1930 + }, + { + "epoch": 1.0, + "learning_rate": 3.701472556894243e-07, + "logits/chosen": -2.634181261062622, + "logits/rejected": -2.642824172973633, + "logps/chosen": -251.3314971923828, + "logps/rejected": -285.38165283203125, + "loss": 0.3676, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.13440167903900146, + "rewards/margins": 2.4729697704315186, + "rewards/rejected": -2.6073713302612305, + "step": 1940 + }, + { + "epoch": 1.01, + "learning_rate": 3.6919104991394144e-07, + "logits/chosen": -2.7603354454040527, + "logits/rejected": -2.731630325317383, + "logps/chosen": -261.4866027832031, + "logps/rejected": -264.4168701171875, + "loss": 0.0836, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.2352325916290283, + "rewards/margins": 5.749847412109375, + "rewards/rejected": -4.514614105224609, + "step": 1950 + }, + { + "epoch": 1.01, + "learning_rate": 3.6823484413845855e-07, + "logits/chosen": -2.6434993743896484, + "logits/rejected": -2.6232972145080566, + "logps/chosen": -262.1065979003906, + "logps/rejected": -287.74688720703125, + "loss": 0.0745, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.580449640750885, + "rewards/margins": 5.3898138999938965, + "rewards/rejected": -4.809364318847656, + "step": 1960 + }, + { + "epoch": 1.02, + "learning_rate": 3.6727863836297567e-07, + "logits/chosen": -2.6330463886260986, + "logits/rejected": -2.614422559738159, + "logps/chosen": -244.67660522460938, + "logps/rejected": -267.0379333496094, + "loss": 0.0937, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.0386600494384766, + "rewards/margins": 6.133784294128418, + "rewards/rejected": -5.095124244689941, + "step": 1970 + }, + { + "epoch": 1.02, + "learning_rate": 3.663224325874928e-07, + "logits/chosen": -2.607079029083252, + "logits/rejected": -2.5714335441589355, + "logps/chosen": -243.2287139892578, + "logps/rejected": -281.1238708496094, + "loss": 0.1006, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.41417521238327026, + "rewards/margins": 5.176269054412842, + "rewards/rejected": -4.7620930671691895, + "step": 1980 + }, + { + "epoch": 1.03, + "learning_rate": 3.653662268120099e-07, + "logits/chosen": -2.5927295684814453, + "logits/rejected": -2.5773234367370605, + "logps/chosen": -241.07821655273438, + "logps/rejected": -316.1360778808594, + "loss": 0.0684, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.1088030338287354, + "rewards/margins": 5.955197334289551, + "rewards/rejected": -4.846394062042236, + "step": 1990 + }, + { + "epoch": 1.03, + "learning_rate": 3.6441002103652707e-07, + "logits/chosen": -2.6101760864257812, + "logits/rejected": -2.565425157546997, + "logps/chosen": -264.8179626464844, + "logps/rejected": -296.73468017578125, + "loss": 0.0664, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.5726389288902283, + "rewards/margins": 6.380110263824463, + "rewards/rejected": -5.807471752166748, + "step": 2000 + }, + { + "epoch": 1.03, + "eval_logits/chosen": -2.6604604721069336, + "eval_logits/rejected": -2.613698720932007, + "eval_logps/chosen": -282.8529968261719, + "eval_logps/rejected": -260.2071533203125, + "eval_loss": 0.513712465763092, + "eval_rewards/accuracies": 0.7620000243186951, + "eval_rewards/chosen": -1.1701849699020386, + "eval_rewards/margins": 2.0021331310272217, + "eval_rewards/rejected": -3.17231822013855, + "eval_runtime": 296.246, + "eval_samples_per_second": 6.751, + "eval_steps_per_second": 0.422, + "step": 2000 + }, + { + "epoch": 1.04, + "learning_rate": 3.634538152610442e-07, + "logits/chosen": -2.6072421073913574, + "logits/rejected": -2.6007983684539795, + "logps/chosen": -271.474853515625, + "logps/rejected": -304.2977294921875, + "loss": 0.0641, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.6836616396903992, + "rewards/margins": 6.16135311126709, + "rewards/rejected": -5.477691173553467, + "step": 2010 + }, + { + "epoch": 1.04, + "learning_rate": 3.624976094855613e-07, + "logits/chosen": -2.552222490310669, + "logits/rejected": -2.539797067642212, + "logps/chosen": -255.1763916015625, + "logps/rejected": -268.0435485839844, + "loss": 0.0654, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6575755476951599, + "rewards/margins": 5.849895477294922, + "rewards/rejected": -5.192319393157959, + "step": 2020 + }, + { + "epoch": 1.05, + "learning_rate": 3.615414037100784e-07, + "logits/chosen": -2.5665032863616943, + "logits/rejected": -2.5033249855041504, + "logps/chosen": -281.29022216796875, + "logps/rejected": -272.2781677246094, + "loss": 0.0794, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.328784704208374, + "rewards/margins": 5.696002006530762, + "rewards/rejected": -5.367217063903809, + "step": 2030 + }, + { + "epoch": 1.05, + "learning_rate": 3.6058519793459553e-07, + "logits/chosen": -2.5166163444519043, + "logits/rejected": -2.5345325469970703, + "logps/chosen": -240.7671661376953, + "logps/rejected": -250.6477508544922, + "loss": 0.0894, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.7341517210006714, + "rewards/margins": 5.550318241119385, + "rewards/rejected": -4.816165924072266, + "step": 2040 + }, + { + "epoch": 1.06, + "learning_rate": 3.5962899215911265e-07, + "logits/chosen": -2.593902587890625, + "logits/rejected": -2.5292165279388428, + "logps/chosen": -237.86239624023438, + "logps/rejected": -280.6207580566406, + "loss": 0.0518, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.4292508065700531, + "rewards/margins": 5.5217485427856445, + "rewards/rejected": -5.092497825622559, + "step": 2050 + }, + { + "epoch": 1.06, + "learning_rate": 3.5867278638362976e-07, + "logits/chosen": -2.593247175216675, + "logits/rejected": -2.5439682006835938, + "logps/chosen": -305.3078918457031, + "logps/rejected": -318.10394287109375, + "loss": 0.0645, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.7294039130210876, + "rewards/margins": 6.271230220794678, + "rewards/rejected": -5.5418267250061035, + "step": 2060 + }, + { + "epoch": 1.07, + "learning_rate": 3.577165806081469e-07, + "logits/chosen": -2.617610454559326, + "logits/rejected": -2.591120481491089, + "logps/chosen": -252.36483764648438, + "logps/rejected": -263.81060791015625, + "loss": 0.1051, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.2951541841030121, + "rewards/margins": 5.425782680511475, + "rewards/rejected": -5.13062858581543, + "step": 2070 + }, + { + "epoch": 1.07, + "learning_rate": 3.56760374832664e-07, + "logits/chosen": -2.684091567993164, + "logits/rejected": -2.570307493209839, + "logps/chosen": -280.3758544921875, + "logps/rejected": -314.53021240234375, + "loss": 0.0386, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4124327898025513, + "rewards/margins": 7.719065189361572, + "rewards/rejected": -6.306632041931152, + "step": 2080 + }, + { + "epoch": 1.08, + "learning_rate": 3.5580416905718106e-07, + "logits/chosen": -2.590378522872925, + "logits/rejected": -2.546607732772827, + "logps/chosen": -300.8092346191406, + "logps/rejected": -268.47479248046875, + "loss": 0.1004, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.4799577295780182, + "rewards/margins": 5.814119338989258, + "rewards/rejected": -5.33416223526001, + "step": 2090 + }, + { + "epoch": 1.08, + "learning_rate": 3.5484796328169817e-07, + "logits/chosen": -2.576677083969116, + "logits/rejected": -2.5376296043395996, + "logps/chosen": -259.6259765625, + "logps/rejected": -255.3178253173828, + "loss": 0.0698, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.22541138529777527, + "rewards/margins": 5.577375411987305, + "rewards/rejected": -5.351963996887207, + "step": 2100 + }, + { + "epoch": 1.08, + "eval_logits/chosen": -2.669224739074707, + "eval_logits/rejected": -2.6219115257263184, + "eval_logps/chosen": -284.7966003417969, + "eval_logps/rejected": -264.1526794433594, + "eval_loss": 0.5326563715934753, + "eval_rewards/accuracies": 0.7680000066757202, + "eval_rewards/chosen": -1.3645479679107666, + "eval_rewards/margins": 2.2023234367370605, + "eval_rewards/rejected": -3.566871166229248, + "eval_runtime": 298.4532, + "eval_samples_per_second": 6.701, + "eval_steps_per_second": 0.419, + "step": 2100 + }, + { + "epoch": 1.09, + "learning_rate": 3.538917575062153e-07, + "logits/chosen": -2.5463109016418457, + "logits/rejected": -2.5333731174468994, + "logps/chosen": -258.64874267578125, + "logps/rejected": -310.0502014160156, + "loss": 0.0481, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.3605276346206665, + "rewards/margins": 6.687548637390137, + "rewards/rejected": -6.32702112197876, + "step": 2110 + }, + { + "epoch": 1.09, + "learning_rate": 3.529355517307324e-07, + "logits/chosen": -2.5875303745269775, + "logits/rejected": -2.510730266571045, + "logps/chosen": -245.9647979736328, + "logps/rejected": -296.9500732421875, + "loss": 0.0642, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.650764524936676, + "rewards/margins": 7.069943904876709, + "rewards/rejected": -6.419180393218994, + "step": 2120 + }, + { + "epoch": 1.1, + "learning_rate": 3.519793459552495e-07, + "logits/chosen": -2.5960545539855957, + "logits/rejected": -2.567936658859253, + "logps/chosen": -264.7110900878906, + "logps/rejected": -300.5650939941406, + "loss": 0.111, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.45437049865722656, + "rewards/margins": 5.234097480773926, + "rewards/rejected": -5.688467979431152, + "step": 2130 + }, + { + "epoch": 1.1, + "learning_rate": 3.510231401797667e-07, + "logits/chosen": -2.5926661491394043, + "logits/rejected": -2.551384925842285, + "logps/chosen": -302.0068359375, + "logps/rejected": -296.42669677734375, + "loss": 0.0941, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.4076964855194092, + "rewards/margins": 5.991205215454102, + "rewards/rejected": -6.39890193939209, + "step": 2140 + }, + { + "epoch": 1.11, + "learning_rate": 3.500669344042838e-07, + "logits/chosen": -2.660684585571289, + "logits/rejected": -2.563539505004883, + "logps/chosen": -253.825439453125, + "logps/rejected": -288.27764892578125, + "loss": 0.0651, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.30672699213027954, + "rewards/margins": 5.953993797302246, + "rewards/rejected": -6.260720729827881, + "step": 2150 + }, + { + "epoch": 1.12, + "learning_rate": 3.491107286288009e-07, + "logits/chosen": -2.648838758468628, + "logits/rejected": -2.5848755836486816, + "logps/chosen": -266.71893310546875, + "logps/rejected": -304.06243896484375, + "loss": 0.0881, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.042765144258737564, + "rewards/margins": 6.342136859893799, + "rewards/rejected": -6.384902000427246, + "step": 2160 + }, + { + "epoch": 1.12, + "learning_rate": 3.4815452285331803e-07, + "logits/chosen": -2.5549755096435547, + "logits/rejected": -2.5574491024017334, + "logps/chosen": -282.77789306640625, + "logps/rejected": -277.75860595703125, + "loss": 0.0885, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.24789047241210938, + "rewards/margins": 5.591107368469238, + "rewards/rejected": -5.838997840881348, + "step": 2170 + }, + { + "epoch": 1.13, + "learning_rate": 3.4719831707783515e-07, + "logits/chosen": -2.5701591968536377, + "logits/rejected": -2.532634735107422, + "logps/chosen": -297.8733825683594, + "logps/rejected": -325.23663330078125, + "loss": 0.0784, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.12840011715888977, + "rewards/margins": 7.079614162445068, + "rewards/rejected": -7.208014488220215, + "step": 2180 + }, + { + "epoch": 1.13, + "learning_rate": 3.4624211130235227e-07, + "logits/chosen": -2.673182964324951, + "logits/rejected": -2.5937423706054688, + "logps/chosen": -249.6380615234375, + "logps/rejected": -269.7789306640625, + "loss": 0.0777, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.5297588109970093, + "rewards/margins": 6.0325798988342285, + "rewards/rejected": -6.562338352203369, + "step": 2190 + }, + { + "epoch": 1.14, + "learning_rate": 3.452859055268694e-07, + "logits/chosen": -2.686246156692505, + "logits/rejected": -2.6082072257995605, + "logps/chosen": -254.46072387695312, + "logps/rejected": -270.94268798828125, + "loss": 0.0715, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.22074970602989197, + "rewards/margins": 6.174362659454346, + "rewards/rejected": -6.39511251449585, + "step": 2200 + }, + { + "epoch": 1.14, + "eval_logits/chosen": -2.739684820175171, + "eval_logits/rejected": -2.694882392883301, + "eval_logps/chosen": -291.6701354980469, + "eval_logps/rejected": -270.4673156738281, + "eval_loss": 0.5423225164413452, + "eval_rewards/accuracies": 0.7620000243186951, + "eval_rewards/chosen": -2.0518994331359863, + "eval_rewards/margins": 2.146435022354126, + "eval_rewards/rejected": -4.198334693908691, + "eval_runtime": 299.094, + "eval_samples_per_second": 6.687, + "eval_steps_per_second": 0.418, + "step": 2200 + }, + { + "epoch": 1.14, + "learning_rate": 3.443296997513865e-07, + "logits/chosen": -2.7505545616149902, + "logits/rejected": -2.7318546772003174, + "logps/chosen": -296.1077880859375, + "logps/rejected": -329.1132507324219, + "loss": 0.0644, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.44653525948524475, + "rewards/margins": 6.901062965393066, + "rewards/rejected": -6.454527378082275, + "step": 2210 + }, + { + "epoch": 1.15, + "learning_rate": 3.433734939759036e-07, + "logits/chosen": -2.696167469024658, + "logits/rejected": -2.6370933055877686, + "logps/chosen": -286.5255432128906, + "logps/rejected": -339.99163818359375, + "loss": 0.1059, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.624750554561615, + "rewards/margins": 8.09054946899414, + "rewards/rejected": -7.465798854827881, + "step": 2220 + }, + { + "epoch": 1.15, + "learning_rate": 3.4241728820042073e-07, + "logits/chosen": -2.603076696395874, + "logits/rejected": -2.5586435794830322, + "logps/chosen": -243.6158905029297, + "logps/rejected": -307.83331298828125, + "loss": 0.083, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.5600066781044006, + "rewards/margins": 5.806141376495361, + "rewards/rejected": -6.366147518157959, + "step": 2230 + }, + { + "epoch": 1.16, + "learning_rate": 3.4146108242493784e-07, + "logits/chosen": -2.673128128051758, + "logits/rejected": -2.6862502098083496, + "logps/chosen": -236.10879516601562, + "logps/rejected": -281.21832275390625, + "loss": 0.0947, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.06583935022354126, + "rewards/margins": 5.878711700439453, + "rewards/rejected": -5.812871932983398, + "step": 2240 + }, + { + "epoch": 1.16, + "learning_rate": 3.405048766494549e-07, + "logits/chosen": -2.659985065460205, + "logits/rejected": -2.6177918910980225, + "logps/chosen": -289.13067626953125, + "logps/rejected": -277.3434143066406, + "loss": 0.1001, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.20128068327903748, + "rewards/margins": 6.173056602478027, + "rewards/rejected": -5.971776008605957, + "step": 2250 + }, + { + "epoch": 1.17, + "learning_rate": 3.39548670873972e-07, + "logits/chosen": -2.5996155738830566, + "logits/rejected": -2.5736541748046875, + "logps/chosen": -308.55535888671875, + "logps/rejected": -343.0357360839844, + "loss": 0.1096, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.6509183049201965, + "rewards/margins": 7.569863796234131, + "rewards/rejected": -6.9189453125, + "step": 2260 + }, + { + "epoch": 1.17, + "learning_rate": 3.3859246509848914e-07, + "logits/chosen": -2.6264021396636963, + "logits/rejected": -2.614025592803955, + "logps/chosen": -267.51007080078125, + "logps/rejected": -306.4515380859375, + "loss": 0.0809, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.013781326822936535, + "rewards/margins": 6.145535945892334, + "rewards/rejected": -6.1593170166015625, + "step": 2270 + }, + { + "epoch": 1.18, + "learning_rate": 3.376362593230063e-07, + "logits/chosen": -2.6009116172790527, + "logits/rejected": -2.6116909980773926, + "logps/chosen": -251.38931274414062, + "logps/rejected": -315.5372009277344, + "loss": 0.0738, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.17372338473796844, + "rewards/margins": 6.592174530029297, + "rewards/rejected": -6.765897274017334, + "step": 2280 + }, + { + "epoch": 1.18, + "learning_rate": 3.366800535475234e-07, + "logits/chosen": -2.62135648727417, + "logits/rejected": -2.5482916831970215, + "logps/chosen": -286.6933288574219, + "logps/rejected": -283.2353515625, + "loss": 0.0764, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.5743136405944824, + "rewards/margins": 7.015399932861328, + "rewards/rejected": -6.4410858154296875, + "step": 2290 + }, + { + "epoch": 1.19, + "learning_rate": 3.3572384777204054e-07, + "logits/chosen": -2.680253028869629, + "logits/rejected": -2.6442158222198486, + "logps/chosen": -292.6054992675781, + "logps/rejected": -284.4306640625, + "loss": 0.0548, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.3452996611595154, + "rewards/margins": 6.642431735992432, + "rewards/rejected": -6.2971320152282715, + "step": 2300 + }, + { + "epoch": 1.19, + "eval_logits/chosen": -2.642500877380371, + "eval_logits/rejected": -2.5995521545410156, + "eval_logps/chosen": -288.6898498535156, + "eval_logps/rejected": -269.0300598144531, + "eval_loss": 0.5458693504333496, + "eval_rewards/accuracies": 0.7699999809265137, + "eval_rewards/chosen": -1.7538715600967407, + "eval_rewards/margins": 2.3007359504699707, + "eval_rewards/rejected": -4.05460786819458, + "eval_runtime": 297.5625, + "eval_samples_per_second": 6.721, + "eval_steps_per_second": 0.42, + "step": 2300 + }, + { + "epoch": 1.19, + "learning_rate": 3.3476764199655765e-07, + "logits/chosen": -2.574187994003296, + "logits/rejected": -2.549806594848633, + "logps/chosen": -210.81497192382812, + "logps/rejected": -279.15716552734375, + "loss": 0.0779, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.1547781527042389, + "rewards/margins": 6.8966169357299805, + "rewards/rejected": -6.741837978363037, + "step": 2310 + }, + { + "epoch": 1.2, + "learning_rate": 3.3381143622107477e-07, + "logits/chosen": -2.5899195671081543, + "logits/rejected": -2.5857739448547363, + "logps/chosen": -304.3234558105469, + "logps/rejected": -318.6616516113281, + "loss": 0.0478, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32505854964256287, + "rewards/margins": 7.544722080230713, + "rewards/rejected": -7.219663143157959, + "step": 2320 + }, + { + "epoch": 1.2, + "learning_rate": 3.328552304455919e-07, + "logits/chosen": -2.6164278984069824, + "logits/rejected": -2.6033473014831543, + "logps/chosen": -260.3442687988281, + "logps/rejected": -289.3071594238281, + "loss": 0.0729, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.08277694880962372, + "rewards/margins": 6.275121688842773, + "rewards/rejected": -6.357898235321045, + "step": 2330 + }, + { + "epoch": 1.21, + "learning_rate": 3.31899024670109e-07, + "logits/chosen": -2.6337881088256836, + "logits/rejected": -2.551001787185669, + "logps/chosen": -280.34246826171875, + "logps/rejected": -293.72711181640625, + "loss": 0.0607, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.17926494777202606, + "rewards/margins": 6.4748077392578125, + "rewards/rejected": -6.6540727615356445, + "step": 2340 + }, + { + "epoch": 1.21, + "learning_rate": 3.309428188946261e-07, + "logits/chosen": -2.562415599822998, + "logits/rejected": -2.542945384979248, + "logps/chosen": -250.19735717773438, + "logps/rejected": -277.0072326660156, + "loss": 0.0894, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.4253063201904297, + "rewards/margins": 5.756030082702637, + "rewards/rejected": -6.181336879730225, + "step": 2350 + }, + { + "epoch": 1.22, + "learning_rate": 3.2998661311914323e-07, + "logits/chosen": -2.665956497192383, + "logits/rejected": -2.605408191680908, + "logps/chosen": -261.0832214355469, + "logps/rejected": -300.29302978515625, + "loss": 0.0591, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27276185154914856, + "rewards/margins": 6.997501373291016, + "rewards/rejected": -6.724739074707031, + "step": 2360 + }, + { + "epoch": 1.22, + "learning_rate": 3.2903040734366035e-07, + "logits/chosen": -2.6756982803344727, + "logits/rejected": -2.642782211303711, + "logps/chosen": -248.23556518554688, + "logps/rejected": -303.837646484375, + "loss": 0.0849, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.3109845519065857, + "rewards/margins": 6.526381015777588, + "rewards/rejected": -6.215396404266357, + "step": 2370 + }, + { + "epoch": 1.23, + "learning_rate": 3.2807420156817746e-07, + "logits/chosen": -2.573936939239502, + "logits/rejected": -2.576326847076416, + "logps/chosen": -243.3905792236328, + "logps/rejected": -271.8777160644531, + "loss": 0.0749, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.18687370419502258, + "rewards/margins": 5.986743927001953, + "rewards/rejected": -5.799870014190674, + "step": 2380 + }, + { + "epoch": 1.23, + "learning_rate": 3.271179957926946e-07, + "logits/chosen": -2.662365198135376, + "logits/rejected": -2.6264593601226807, + "logps/chosen": -281.0094909667969, + "logps/rejected": -295.4397888183594, + "loss": 0.0852, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.3190084993839264, + "rewards/margins": 6.439042568206787, + "rewards/rejected": -6.758051872253418, + "step": 2390 + }, + { + "epoch": 1.24, + "learning_rate": 3.261617900172117e-07, + "logits/chosen": -2.617631435394287, + "logits/rejected": -2.6006240844726562, + "logps/chosen": -238.6389923095703, + "logps/rejected": -291.9773864746094, + "loss": 0.0897, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.4103802740573883, + "rewards/margins": 6.404238700866699, + "rewards/rejected": -5.99385929107666, + "step": 2400 + }, + { + "epoch": 1.24, + "eval_logits/chosen": -2.6870288848876953, + "eval_logits/rejected": -2.651212692260742, + "eval_logps/chosen": -287.7002258300781, + "eval_logps/rejected": -265.7117004394531, + "eval_loss": 0.5316546559333801, + "eval_rewards/accuracies": 0.7639999985694885, + "eval_rewards/chosen": -1.6549092531204224, + "eval_rewards/margins": 2.067864179611206, + "eval_rewards/rejected": -3.722773551940918, + "eval_runtime": 299.878, + "eval_samples_per_second": 6.669, + "eval_steps_per_second": 0.417, + "step": 2400 + }, + { + "epoch": 1.24, + "learning_rate": 3.2520558424172876e-07, + "logits/chosen": -2.6913504600524902, + "logits/rejected": -2.6211981773376465, + "logps/chosen": -267.68817138671875, + "logps/rejected": -276.04193115234375, + "loss": 0.0703, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.30733993649482727, + "rewards/margins": 6.186968803405762, + "rewards/rejected": -5.879629611968994, + "step": 2410 + }, + { + "epoch": 1.25, + "learning_rate": 3.242493784662459e-07, + "logits/chosen": -2.6455018520355225, + "logits/rejected": -2.625913381576538, + "logps/chosen": -256.44488525390625, + "logps/rejected": -287.00726318359375, + "loss": 0.0744, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.23726698756217957, + "rewards/margins": 6.133591651916504, + "rewards/rejected": -6.370858192443848, + "step": 2420 + }, + { + "epoch": 1.25, + "learning_rate": 3.2329317269076304e-07, + "logits/chosen": -2.65103816986084, + "logits/rejected": -2.6203525066375732, + "logps/chosen": -281.08306884765625, + "logps/rejected": -334.32568359375, + "loss": 0.0902, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.14011399447917938, + "rewards/margins": 7.048730373382568, + "rewards/rejected": -6.908616542816162, + "step": 2430 + }, + { + "epoch": 1.26, + "learning_rate": 3.2233696691528016e-07, + "logits/chosen": -2.5833468437194824, + "logits/rejected": -2.5940117835998535, + "logps/chosen": -253.43359375, + "logps/rejected": -312.6004638671875, + "loss": 0.0887, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03585803508758545, + "rewards/margins": 6.511855125427246, + "rewards/rejected": -6.475996971130371, + "step": 2440 + }, + { + "epoch": 1.26, + "learning_rate": 3.2138076113979727e-07, + "logits/chosen": -2.586594581604004, + "logits/rejected": -2.5869569778442383, + "logps/chosen": -274.05706787109375, + "logps/rejected": -318.03045654296875, + "loss": 0.096, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.25964584946632385, + "rewards/margins": 6.5868659019470215, + "rewards/rejected": -6.8465118408203125, + "step": 2450 + }, + { + "epoch": 1.27, + "learning_rate": 3.204245553643144e-07, + "logits/chosen": -2.582857608795166, + "logits/rejected": -2.5616345405578613, + "logps/chosen": -307.5089111328125, + "logps/rejected": -302.73980712890625, + "loss": 0.0696, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.01987607404589653, + "rewards/margins": 6.0425639152526855, + "rewards/rejected": -6.022687911987305, + "step": 2460 + }, + { + "epoch": 1.28, + "learning_rate": 3.194683495888315e-07, + "logits/chosen": -2.6513664722442627, + "logits/rejected": -2.628636360168457, + "logps/chosen": -296.8031005859375, + "logps/rejected": -277.1523742675781, + "loss": 0.0925, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.0834658145904541, + "rewards/margins": 5.658120155334473, + "rewards/rejected": -5.574653625488281, + "step": 2470 + }, + { + "epoch": 1.28, + "learning_rate": 3.185121438133486e-07, + "logits/chosen": -2.6223368644714355, + "logits/rejected": -2.6064722537994385, + "logps/chosen": -280.4040222167969, + "logps/rejected": -355.2570495605469, + "loss": 0.083, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.3895830512046814, + "rewards/margins": 6.714540958404541, + "rewards/rejected": -6.324957847595215, + "step": 2480 + }, + { + "epoch": 1.29, + "learning_rate": 3.1755593803786574e-07, + "logits/chosen": -2.655653953552246, + "logits/rejected": -2.6335813999176025, + "logps/chosen": -228.7120819091797, + "logps/rejected": -280.1087646484375, + "loss": 0.0821, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3908086121082306, + "rewards/margins": 5.965733528137207, + "rewards/rejected": -6.356542110443115, + "step": 2490 + }, + { + "epoch": 1.29, + "learning_rate": 3.1659973226238285e-07, + "logits/chosen": -2.6471128463745117, + "logits/rejected": -2.619386672973633, + "logps/chosen": -270.0829772949219, + "logps/rejected": -283.39752197265625, + "loss": 0.0842, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.08627805858850479, + "rewards/margins": 6.602423191070557, + "rewards/rejected": -6.5161452293396, + "step": 2500 + }, + { + "epoch": 1.29, + "eval_logits/chosen": -2.684290885925293, + "eval_logits/rejected": -2.6529762744903564, + "eval_logps/chosen": -294.15118408203125, + "eval_logps/rejected": -273.7510681152344, + "eval_loss": 0.5710099935531616, + "eval_rewards/accuracies": 0.765999972820282, + "eval_rewards/chosen": -2.300004482269287, + "eval_rewards/margins": 2.226706027984619, + "eval_rewards/rejected": -4.526710510253906, + "eval_runtime": 300.4148, + "eval_samples_per_second": 6.657, + "eval_steps_per_second": 0.416, + "step": 2500 + }, + { + "epoch": 1.3, + "learning_rate": 3.1564352648689997e-07, + "logits/chosen": -2.7135045528411865, + "logits/rejected": -2.6824328899383545, + "logps/chosen": -302.2213439941406, + "logps/rejected": -292.8287353515625, + "loss": 0.1094, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -0.13228394091129303, + "rewards/margins": 6.033846855163574, + "rewards/rejected": -6.166131019592285, + "step": 2510 + }, + { + "epoch": 1.3, + "learning_rate": 3.146873207114171e-07, + "logits/chosen": -2.6831634044647217, + "logits/rejected": -2.643829822540283, + "logps/chosen": -239.953369140625, + "logps/rejected": -290.42901611328125, + "loss": 0.0793, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.24583473801612854, + "rewards/margins": 7.089083671569824, + "rewards/rejected": -6.843249320983887, + "step": 2520 + }, + { + "epoch": 1.31, + "learning_rate": 3.137311149359342e-07, + "logits/chosen": -2.7059950828552246, + "logits/rejected": -2.6721725463867188, + "logps/chosen": -309.16510009765625, + "logps/rejected": -334.92529296875, + "loss": 0.077, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.09306098520755768, + "rewards/margins": 6.407054901123047, + "rewards/rejected": -6.31399393081665, + "step": 2530 + }, + { + "epoch": 1.31, + "learning_rate": 3.127749091604513e-07, + "logits/chosen": -2.7140984535217285, + "logits/rejected": -2.7086362838745117, + "logps/chosen": -302.92572021484375, + "logps/rejected": -342.76318359375, + "loss": 0.091, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.17233842611312866, + "rewards/margins": 7.973818778991699, + "rewards/rejected": -7.801480293273926, + "step": 2540 + }, + { + "epoch": 1.32, + "learning_rate": 3.1181870338496843e-07, + "logits/chosen": -2.6527652740478516, + "logits/rejected": -2.6552717685699463, + "logps/chosen": -207.1505584716797, + "logps/rejected": -294.25811767578125, + "loss": 0.0893, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.10080035030841827, + "rewards/margins": 6.058371067047119, + "rewards/rejected": -6.159171104431152, + "step": 2550 + }, + { + "epoch": 1.32, + "learning_rate": 3.108624976094856e-07, + "logits/chosen": -2.6768598556518555, + "logits/rejected": -2.66867733001709, + "logps/chosen": -259.58905029296875, + "logps/rejected": -266.781982421875, + "loss": 0.081, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.8046671152114868, + "rewards/margins": 6.409237861633301, + "rewards/rejected": -5.604570388793945, + "step": 2560 + }, + { + "epoch": 1.33, + "learning_rate": 3.0990629183400266e-07, + "logits/chosen": -2.6110267639160156, + "logits/rejected": -2.597078800201416, + "logps/chosen": -286.2091369628906, + "logps/rejected": -312.1487121582031, + "loss": 0.076, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.18477150797843933, + "rewards/margins": 7.183107852935791, + "rewards/rejected": -6.998335838317871, + "step": 2570 + }, + { + "epoch": 1.33, + "learning_rate": 3.089500860585198e-07, + "logits/chosen": -2.6049301624298096, + "logits/rejected": -2.606522560119629, + "logps/chosen": -232.1073760986328, + "logps/rejected": -264.76092529296875, + "loss": 0.1077, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.7609155774116516, + "rewards/margins": 5.248475074768066, + "rewards/rejected": -6.009390830993652, + "step": 2580 + }, + { + "epoch": 1.34, + "learning_rate": 3.079938802830369e-07, + "logits/chosen": -2.691229820251465, + "logits/rejected": -2.6520042419433594, + "logps/chosen": -254.48287963867188, + "logps/rejected": -234.5712890625, + "loss": 0.067, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.14560198783874512, + "rewards/margins": 5.6187334060668945, + "rewards/rejected": -5.764335632324219, + "step": 2590 + }, + { + "epoch": 1.34, + "learning_rate": 3.07037674507554e-07, + "logits/chosen": -2.6273081302642822, + "logits/rejected": -2.630744457244873, + "logps/chosen": -296.9062805175781, + "logps/rejected": -350.70074462890625, + "loss": 0.1321, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.28318193554878235, + "rewards/margins": 7.059614658355713, + "rewards/rejected": -6.776432991027832, + "step": 2600 + }, + { + "epoch": 1.34, + "eval_logits/chosen": -2.734260082244873, + "eval_logits/rejected": -2.7094430923461914, + "eval_logps/chosen": -289.38946533203125, + "eval_logps/rejected": -267.0450439453125, + "eval_loss": 0.5334489941596985, + "eval_rewards/accuracies": 0.75, + "eval_rewards/chosen": -1.8238333463668823, + "eval_rewards/margins": 2.032271385192871, + "eval_rewards/rejected": -3.8561043739318848, + "eval_runtime": 298.2165, + "eval_samples_per_second": 6.707, + "eval_steps_per_second": 0.419, + "step": 2600 + }, + { + "epoch": 1.35, + "learning_rate": 3.060814687320711e-07, + "logits/chosen": -2.6680526733398438, + "logits/rejected": -2.6304268836975098, + "logps/chosen": -281.00341796875, + "logps/rejected": -281.87396240234375, + "loss": 0.0959, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.2273908108472824, + "rewards/margins": 6.250962257385254, + "rewards/rejected": -6.023571968078613, + "step": 2610 + }, + { + "epoch": 1.35, + "learning_rate": 3.0512526295658824e-07, + "logits/chosen": -2.6950268745422363, + "logits/rejected": -2.6460578441619873, + "logps/chosen": -286.7471923828125, + "logps/rejected": -297.45050048828125, + "loss": 0.1119, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.0691387727856636, + "rewards/margins": 6.203841209411621, + "rewards/rejected": -6.134702682495117, + "step": 2620 + }, + { + "epoch": 1.36, + "learning_rate": 3.0416905718110536e-07, + "logits/chosen": -2.748300075531006, + "logits/rejected": -2.7518250942230225, + "logps/chosen": -246.3492889404297, + "logps/rejected": -267.51470947265625, + "loss": 0.0881, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.12618187069892883, + "rewards/margins": 6.258318901062012, + "rewards/rejected": -6.3845014572143555, + "step": 2630 + }, + { + "epoch": 1.36, + "learning_rate": 3.0321285140562247e-07, + "logits/chosen": -2.769554615020752, + "logits/rejected": -2.716841459274292, + "logps/chosen": -250.5794219970703, + "logps/rejected": -274.2592468261719, + "loss": 0.1548, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0954054594039917, + "rewards/margins": 5.371593475341797, + "rewards/rejected": -6.466999053955078, + "step": 2640 + }, + { + "epoch": 1.37, + "learning_rate": 3.022566456301396e-07, + "logits/chosen": -2.8216865062713623, + "logits/rejected": -2.7998435497283936, + "logps/chosen": -262.3215026855469, + "logps/rejected": -294.09063720703125, + "loss": 0.1213, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2869683802127838, + "rewards/margins": 6.307840347290039, + "rewards/rejected": -6.594809055328369, + "step": 2650 + }, + { + "epoch": 1.37, + "learning_rate": 3.013004398546567e-07, + "logits/chosen": -2.724449396133423, + "logits/rejected": -2.644949197769165, + "logps/chosen": -270.1006774902344, + "logps/rejected": -294.309814453125, + "loss": 0.075, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.46882614493370056, + "rewards/margins": 6.1476945877075195, + "rewards/rejected": -6.616520881652832, + "step": 2660 + }, + { + "epoch": 1.38, + "learning_rate": 3.003442340791738e-07, + "logits/chosen": -2.8228209018707275, + "logits/rejected": -2.819795608520508, + "logps/chosen": -264.2869567871094, + "logps/rejected": -294.621826171875, + "loss": 0.0879, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.5818114280700684, + "rewards/margins": 7.002253532409668, + "rewards/rejected": -7.5840654373168945, + "step": 2670 + }, + { + "epoch": 1.38, + "learning_rate": 2.9938802830369093e-07, + "logits/chosen": -2.795841693878174, + "logits/rejected": -2.807483434677124, + "logps/chosen": -265.37322998046875, + "logps/rejected": -302.5888671875, + "loss": 0.1097, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.6445812582969666, + "rewards/margins": 6.2356858253479, + "rewards/rejected": -6.880267143249512, + "step": 2680 + }, + { + "epoch": 1.39, + "learning_rate": 2.9843182252820805e-07, + "logits/chosen": -2.715318441390991, + "logits/rejected": -2.7065510749816895, + "logps/chosen": -251.9051971435547, + "logps/rejected": -291.34918212890625, + "loss": 0.0851, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.3353874683380127, + "rewards/margins": 6.849355220794678, + "rewards/rejected": -7.184743404388428, + "step": 2690 + }, + { + "epoch": 1.39, + "learning_rate": 2.974756167527252e-07, + "logits/chosen": -2.794362783432007, + "logits/rejected": -2.784069061279297, + "logps/chosen": -243.68063354492188, + "logps/rejected": -289.0542297363281, + "loss": 0.0862, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5554946660995483, + "rewards/margins": 5.511413097381592, + "rewards/rejected": -6.06690788269043, + "step": 2700 + }, + { + "epoch": 1.39, + "eval_logits/chosen": -2.71694016456604, + "eval_logits/rejected": -2.69527268409729, + "eval_logps/chosen": -289.63067626953125, + "eval_logps/rejected": -267.99761962890625, + "eval_loss": 0.5442701578140259, + "eval_rewards/accuracies": 0.7519999742507935, + "eval_rewards/chosen": -1.8479559421539307, + "eval_rewards/margins": 2.103407621383667, + "eval_rewards/rejected": -3.9513633251190186, + "eval_runtime": 302.0105, + "eval_samples_per_second": 6.622, + "eval_steps_per_second": 0.414, + "step": 2700 + }, + { + "epoch": 1.4, + "learning_rate": 2.9651941097724233e-07, + "logits/chosen": -2.7475249767303467, + "logits/rejected": -2.725043296813965, + "logps/chosen": -282.15606689453125, + "logps/rejected": -271.9478759765625, + "loss": 0.1062, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.22154350578784943, + "rewards/margins": 5.806771755218506, + "rewards/rejected": -6.028315544128418, + "step": 2710 + }, + { + "epoch": 1.4, + "learning_rate": 2.9556320520175945e-07, + "logits/chosen": -2.6693906784057617, + "logits/rejected": -2.645045757293701, + "logps/chosen": -268.8005676269531, + "logps/rejected": -278.6295471191406, + "loss": 0.1059, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.5214961767196655, + "rewards/margins": 6.091368198394775, + "rewards/rejected": -6.612864017486572, + "step": 2720 + }, + { + "epoch": 1.41, + "learning_rate": 2.946069994262765e-07, + "logits/chosen": -2.7901053428649902, + "logits/rejected": -2.7124366760253906, + "logps/chosen": -295.39605712890625, + "logps/rejected": -323.22674560546875, + "loss": 0.0622, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1495104283094406, + "rewards/margins": 6.738787651062012, + "rewards/rejected": -6.888298034667969, + "step": 2730 + }, + { + "epoch": 1.41, + "learning_rate": 2.9365079365079363e-07, + "logits/chosen": -2.758653163909912, + "logits/rejected": -2.6714606285095215, + "logps/chosen": -279.1718444824219, + "logps/rejected": -265.2136535644531, + "loss": 0.1065, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.2586991488933563, + "rewards/margins": 6.57647705078125, + "rewards/rejected": -6.835176944732666, + "step": 2740 + }, + { + "epoch": 1.42, + "learning_rate": 2.9269458787531074e-07, + "logits/chosen": -2.671984910964966, + "logits/rejected": -2.6374223232269287, + "logps/chosen": -245.06051635742188, + "logps/rejected": -311.33538818359375, + "loss": 0.0834, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.7288433313369751, + "rewards/margins": 5.889183521270752, + "rewards/rejected": -6.6180267333984375, + "step": 2750 + }, + { + "epoch": 1.42, + "learning_rate": 2.9173838209982786e-07, + "logits/chosen": -2.746222972869873, + "logits/rejected": -2.675706624984741, + "logps/chosen": -267.20440673828125, + "logps/rejected": -248.35855102539062, + "loss": 0.0966, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.5821000337600708, + "rewards/margins": 5.743124961853027, + "rewards/rejected": -6.325225353240967, + "step": 2760 + }, + { + "epoch": 1.43, + "learning_rate": 2.90782176324345e-07, + "logits/chosen": -2.694669485092163, + "logits/rejected": -2.655134677886963, + "logps/chosen": -325.7140808105469, + "logps/rejected": -293.95166015625, + "loss": 0.066, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.05179126188158989, + "rewards/margins": 6.887429237365723, + "rewards/rejected": -6.83563756942749, + "step": 2770 + }, + { + "epoch": 1.44, + "learning_rate": 2.898259705488621e-07, + "logits/chosen": -2.818296432495117, + "logits/rejected": -2.813457727432251, + "logps/chosen": -266.9579162597656, + "logps/rejected": -275.5874938964844, + "loss": 0.0825, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4114197790622711, + "rewards/margins": 6.127683639526367, + "rewards/rejected": -6.539102077484131, + "step": 2780 + }, + { + "epoch": 1.44, + "learning_rate": 2.888697647733792e-07, + "logits/chosen": -2.7553584575653076, + "logits/rejected": -2.7237467765808105, + "logps/chosen": -323.9237060546875, + "logps/rejected": -323.90655517578125, + "loss": 0.0819, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.25978711247444153, + "rewards/margins": 7.298235893249512, + "rewards/rejected": -7.038449287414551, + "step": 2790 + }, + { + "epoch": 1.45, + "learning_rate": 2.879135589978963e-07, + "logits/chosen": -2.657649278640747, + "logits/rejected": -2.7081754207611084, + "logps/chosen": -239.7465362548828, + "logps/rejected": -296.0489196777344, + "loss": 0.0954, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.0186956487596035, + "rewards/margins": 5.776867866516113, + "rewards/rejected": -5.758172035217285, + "step": 2800 + }, + { + "epoch": 1.45, + "eval_logits/chosen": -2.712128162384033, + "eval_logits/rejected": -2.689998149871826, + "eval_logps/chosen": -290.46826171875, + "eval_logps/rejected": -268.4657897949219, + "eval_loss": 0.5472421646118164, + "eval_rewards/accuracies": 0.7620000243186951, + "eval_rewards/chosen": -1.9317100048065186, + "eval_rewards/margins": 2.066469669342041, + "eval_rewards/rejected": -3.9981796741485596, + "eval_runtime": 302.1561, + "eval_samples_per_second": 6.619, + "eval_steps_per_second": 0.414, + "step": 2800 + }, + { + "epoch": 1.45, + "learning_rate": 2.8695735322241344e-07, + "logits/chosen": -2.733955144882202, + "logits/rejected": -2.7273309230804443, + "logps/chosen": -277.91851806640625, + "logps/rejected": -294.29095458984375, + "loss": 0.0969, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.4033392369747162, + "rewards/margins": 6.4010772705078125, + "rewards/rejected": -5.997737884521484, + "step": 2810 + }, + { + "epoch": 1.46, + "learning_rate": 2.8600114744693055e-07, + "logits/chosen": -2.6529266834259033, + "logits/rejected": -2.619670867919922, + "logps/chosen": -246.15811157226562, + "logps/rejected": -305.12042236328125, + "loss": 0.0762, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.3458639979362488, + "rewards/margins": 7.0330352783203125, + "rewards/rejected": -6.68717098236084, + "step": 2820 + }, + { + "epoch": 1.46, + "learning_rate": 2.8504494167144767e-07, + "logits/chosen": -2.7030186653137207, + "logits/rejected": -2.6916511058807373, + "logps/chosen": -265.45953369140625, + "logps/rejected": -304.0247802734375, + "loss": 0.0815, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.11693648248910904, + "rewards/margins": 6.367062568664551, + "rewards/rejected": -6.483999729156494, + "step": 2830 + }, + { + "epoch": 1.47, + "learning_rate": 2.8408873589596484e-07, + "logits/chosen": -2.731295108795166, + "logits/rejected": -2.6814284324645996, + "logps/chosen": -263.04351806640625, + "logps/rejected": -281.86529541015625, + "loss": 0.0685, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.13037601113319397, + "rewards/margins": 6.561248779296875, + "rewards/rejected": -6.691624641418457, + "step": 2840 + }, + { + "epoch": 1.47, + "learning_rate": 2.8313253012048195e-07, + "logits/chosen": -2.591728687286377, + "logits/rejected": -2.601846218109131, + "logps/chosen": -229.76559448242188, + "logps/rejected": -327.7354431152344, + "loss": 0.0748, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.1586235761642456, + "rewards/margins": 7.073062896728516, + "rewards/rejected": -6.914440155029297, + "step": 2850 + }, + { + "epoch": 1.48, + "learning_rate": 2.8217632434499907e-07, + "logits/chosen": -2.754502296447754, + "logits/rejected": -2.707017421722412, + "logps/chosen": -289.90240478515625, + "logps/rejected": -316.56988525390625, + "loss": 0.0788, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.3420160114765167, + "rewards/margins": 6.77099084854126, + "rewards/rejected": -7.11300802230835, + "step": 2860 + }, + { + "epoch": 1.48, + "learning_rate": 2.812201185695162e-07, + "logits/chosen": -2.682600259780884, + "logits/rejected": -2.6694142818450928, + "logps/chosen": -238.49545288085938, + "logps/rejected": -300.5420837402344, + "loss": 0.0725, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.1682194173336029, + "rewards/margins": 5.978898048400879, + "rewards/rejected": -6.1471171379089355, + "step": 2870 + }, + { + "epoch": 1.49, + "learning_rate": 2.802639127940333e-07, + "logits/chosen": -2.7259023189544678, + "logits/rejected": -2.651639461517334, + "logps/chosen": -265.7987365722656, + "logps/rejected": -257.5804138183594, + "loss": 0.114, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.36955079436302185, + "rewards/margins": 6.632456302642822, + "rewards/rejected": -7.002006530761719, + "step": 2880 + }, + { + "epoch": 1.49, + "learning_rate": 2.7930770701855036e-07, + "logits/chosen": -2.7521653175354004, + "logits/rejected": -2.6949660778045654, + "logps/chosen": -270.8799133300781, + "logps/rejected": -282.875, + "loss": 0.122, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.07829879969358444, + "rewards/margins": 6.847074031829834, + "rewards/rejected": -6.76877498626709, + "step": 2890 + }, + { + "epoch": 1.5, + "learning_rate": 2.783515012430675e-07, + "logits/chosen": -2.6613528728485107, + "logits/rejected": -2.6102874279022217, + "logps/chosen": -269.69146728515625, + "logps/rejected": -276.59228515625, + "loss": 0.0979, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.2745317220687866, + "rewards/margins": 6.088465213775635, + "rewards/rejected": -6.362997531890869, + "step": 2900 + }, + { + "epoch": 1.5, + "eval_logits/chosen": -2.678755044937134, + "eval_logits/rejected": -2.646636724472046, + "eval_logps/chosen": -292.6034240722656, + "eval_logps/rejected": -270.4625549316406, + "eval_loss": 0.5471131205558777, + "eval_rewards/accuracies": 0.7540000081062317, + "eval_rewards/chosen": -2.145230531692505, + "eval_rewards/margins": 2.0526273250579834, + "eval_rewards/rejected": -4.197857856750488, + "eval_runtime": 300.0931, + "eval_samples_per_second": 6.665, + "eval_steps_per_second": 0.417, + "step": 2900 + }, + { + "epoch": 1.5, + "learning_rate": 2.773952954675846e-07, + "logits/chosen": -2.661973237991333, + "logits/rejected": -2.6546902656555176, + "logps/chosen": -267.7713317871094, + "logps/rejected": -256.48779296875, + "loss": 0.0672, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.36151501536369324, + "rewards/margins": 5.869540691375732, + "rewards/rejected": -6.231055736541748, + "step": 2910 + }, + { + "epoch": 1.51, + "learning_rate": 2.764390896921017e-07, + "logits/chosen": -2.712329626083374, + "logits/rejected": -2.6236677169799805, + "logps/chosen": -299.66705322265625, + "logps/rejected": -286.8455505371094, + "loss": 0.0836, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04764469712972641, + "rewards/margins": 6.375118255615234, + "rewards/rejected": -6.422762870788574, + "step": 2920 + }, + { + "epoch": 1.51, + "learning_rate": 2.754828839166188e-07, + "logits/chosen": -2.586707592010498, + "logits/rejected": -2.588397264480591, + "logps/chosen": -257.25665283203125, + "logps/rejected": -272.2353820800781, + "loss": 0.0692, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.04652285575866699, + "rewards/margins": 7.283473014831543, + "rewards/rejected": -7.329996585845947, + "step": 2930 + }, + { + "epoch": 1.52, + "learning_rate": 2.7452667814113594e-07, + "logits/chosen": -2.7274789810180664, + "logits/rejected": -2.6998512744903564, + "logps/chosen": -285.88995361328125, + "logps/rejected": -302.61798095703125, + "loss": 0.0848, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.24309101700782776, + "rewards/margins": 6.804354667663574, + "rewards/rejected": -6.561263084411621, + "step": 2940 + }, + { + "epoch": 1.52, + "learning_rate": 2.7357047236565306e-07, + "logits/chosen": -2.642345666885376, + "logits/rejected": -2.656949281692505, + "logps/chosen": -255.52157592773438, + "logps/rejected": -290.87457275390625, + "loss": 0.0661, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.14165842533111572, + "rewards/margins": 6.226951599121094, + "rewards/rejected": -6.085293292999268, + "step": 2950 + }, + { + "epoch": 1.53, + "learning_rate": 2.7261426659017017e-07, + "logits/chosen": -2.6985981464385986, + "logits/rejected": -2.6935291290283203, + "logps/chosen": -259.4523010253906, + "logps/rejected": -285.291748046875, + "loss": 0.0821, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.24156935513019562, + "rewards/margins": 5.883909225463867, + "rewards/rejected": -6.125478744506836, + "step": 2960 + }, + { + "epoch": 1.53, + "learning_rate": 2.716580608146873e-07, + "logits/chosen": -2.7121007442474365, + "logits/rejected": -2.680630922317505, + "logps/chosen": -323.2484436035156, + "logps/rejected": -312.9270324707031, + "loss": 0.0678, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.317016065120697, + "rewards/margins": 7.44516134262085, + "rewards/rejected": -7.12814474105835, + "step": 2970 + }, + { + "epoch": 1.54, + "learning_rate": 2.7070185503920446e-07, + "logits/chosen": -2.5762784481048584, + "logits/rejected": -2.556363344192505, + "logps/chosen": -247.0843963623047, + "logps/rejected": -302.10186767578125, + "loss": 0.0652, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.16331888735294342, + "rewards/margins": 6.702650547027588, + "rewards/rejected": -6.865969657897949, + "step": 2980 + }, + { + "epoch": 1.54, + "learning_rate": 2.6974564926372157e-07, + "logits/chosen": -2.768145799636841, + "logits/rejected": -2.686316967010498, + "logps/chosen": -287.52178955078125, + "logps/rejected": -308.16510009765625, + "loss": 0.0648, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07840989530086517, + "rewards/margins": 7.003350257873535, + "rewards/rejected": -6.924940586090088, + "step": 2990 + }, + { + "epoch": 1.55, + "learning_rate": 2.687894434882387e-07, + "logits/chosen": -2.7189621925354004, + "logits/rejected": -2.6321442127227783, + "logps/chosen": -240.61349487304688, + "logps/rejected": -254.2037811279297, + "loss": 0.0732, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.35617923736572266, + "rewards/margins": 5.727667331695557, + "rewards/rejected": -6.083846092224121, + "step": 3000 + }, + { + "epoch": 1.55, + "eval_logits/chosen": -2.698098659515381, + "eval_logits/rejected": -2.671609401702881, + "eval_logps/chosen": -291.4029235839844, + "eval_logps/rejected": -270.5026550292969, + "eval_loss": 0.551217794418335, + "eval_rewards/accuracies": 0.75, + "eval_rewards/chosen": -2.0251784324645996, + "eval_rewards/margins": 2.176687717437744, + "eval_rewards/rejected": -4.201866149902344, + "eval_runtime": 300.5162, + "eval_samples_per_second": 6.655, + "eval_steps_per_second": 0.416, + "step": 3000 + }, + { + "epoch": 1.55, + "learning_rate": 2.678332377127558e-07, + "logits/chosen": -2.71705961227417, + "logits/rejected": -2.6564738750457764, + "logps/chosen": -300.3543701171875, + "logps/rejected": -306.4097595214844, + "loss": 0.0827, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.2911555767059326, + "rewards/margins": 7.625485897064209, + "rewards/rejected": -7.334330081939697, + "step": 3010 + }, + { + "epoch": 1.56, + "learning_rate": 2.668770319372729e-07, + "logits/chosen": -2.7231059074401855, + "logits/rejected": -2.6993746757507324, + "logps/chosen": -267.49078369140625, + "logps/rejected": -254.30569458007812, + "loss": 0.0776, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.07040025293827057, + "rewards/margins": 5.528401851654053, + "rewards/rejected": -5.458001613616943, + "step": 3020 + }, + { + "epoch": 1.56, + "learning_rate": 2.6592082616179004e-07, + "logits/chosen": -2.5468788146972656, + "logits/rejected": -2.5402424335479736, + "logps/chosen": -196.69125366210938, + "logps/rejected": -264.7854919433594, + "loss": 0.0868, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.013843962922692299, + "rewards/margins": 5.93640661239624, + "rewards/rejected": -5.950250625610352, + "step": 3030 + }, + { + "epoch": 1.57, + "learning_rate": 2.649646203863071e-07, + "logits/chosen": -2.5948565006256104, + "logits/rejected": -2.6171257495880127, + "logps/chosen": -252.2794647216797, + "logps/rejected": -301.02874755859375, + "loss": 0.0587, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.2185756266117096, + "rewards/margins": 6.193685531616211, + "rewards/rejected": -5.975109577178955, + "step": 3040 + }, + { + "epoch": 1.57, + "learning_rate": 2.640084146108242e-07, + "logits/chosen": -2.654193878173828, + "logits/rejected": -2.633950710296631, + "logps/chosen": -303.9206237792969, + "logps/rejected": -296.4111022949219, + "loss": 0.0733, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.1474265307188034, + "rewards/margins": 6.6915788650512695, + "rewards/rejected": -6.839005470275879, + "step": 3050 + }, + { + "epoch": 1.58, + "learning_rate": 2.6305220883534133e-07, + "logits/chosen": -2.668823719024658, + "logits/rejected": -2.5736072063446045, + "logps/chosen": -236.24972534179688, + "logps/rejected": -276.8191833496094, + "loss": 0.0712, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.06152166798710823, + "rewards/margins": 6.0490899085998535, + "rewards/rejected": -6.1106109619140625, + "step": 3060 + }, + { + "epoch": 1.58, + "learning_rate": 2.6209600305985845e-07, + "logits/chosen": -2.6920838356018066, + "logits/rejected": -2.642549991607666, + "logps/chosen": -274.13934326171875, + "logps/rejected": -306.42523193359375, + "loss": 0.1021, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.5211185812950134, + "rewards/margins": 7.7590436935424805, + "rewards/rejected": -7.2379255294799805, + "step": 3070 + }, + { + "epoch": 1.59, + "learning_rate": 2.6113979728437556e-07, + "logits/chosen": -2.7744197845458984, + "logits/rejected": -2.754517078399658, + "logps/chosen": -300.2927551269531, + "logps/rejected": -293.2771911621094, + "loss": 0.1094, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.10171057283878326, + "rewards/margins": 6.362521171569824, + "rewards/rejected": -6.260810375213623, + "step": 3080 + }, + { + "epoch": 1.6, + "learning_rate": 2.601835915088927e-07, + "logits/chosen": -2.571173906326294, + "logits/rejected": -2.569072961807251, + "logps/chosen": -233.9067840576172, + "logps/rejected": -281.82879638671875, + "loss": 0.1037, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.19151082634925842, + "rewards/margins": 6.079700946807861, + "rewards/rejected": -5.888190269470215, + "step": 3090 + }, + { + "epoch": 1.6, + "learning_rate": 2.592273857334098e-07, + "logits/chosen": -2.63468074798584, + "logits/rejected": -2.6281745433807373, + "logps/chosen": -259.4231262207031, + "logps/rejected": -289.1959228515625, + "loss": 0.0799, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.1904747486114502, + "rewards/margins": 6.4702324867248535, + "rewards/rejected": -6.279757022857666, + "step": 3100 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -2.7142674922943115, + "eval_logits/rejected": -2.6703028678894043, + "eval_logps/chosen": -290.0393371582031, + "eval_logps/rejected": -267.2228698730469, + "eval_loss": 0.5415002107620239, + "eval_rewards/accuracies": 0.75, + "eval_rewards/chosen": -1.888822078704834, + "eval_rewards/margins": 1.9850670099258423, + "eval_rewards/rejected": -3.873889207839966, + "eval_runtime": 299.388, + "eval_samples_per_second": 6.68, + "eval_steps_per_second": 0.418, + "step": 3100 + }, + { + "epoch": 1.61, + "learning_rate": 2.582711799579269e-07, + "logits/chosen": -2.7312028408050537, + "logits/rejected": -2.7376887798309326, + "logps/chosen": -292.3309326171875, + "logps/rejected": -306.7789611816406, + "loss": 0.0849, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.12501433491706848, + "rewards/margins": 6.4690046310424805, + "rewards/rejected": -6.594018459320068, + "step": 3110 + }, + { + "epoch": 1.61, + "learning_rate": 2.573149741824441e-07, + "logits/chosen": -2.7475574016571045, + "logits/rejected": -2.6412527561187744, + "logps/chosen": -301.6943664550781, + "logps/rejected": -288.72088623046875, + "loss": 0.0913, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.25444021821022034, + "rewards/margins": 6.481881618499756, + "rewards/rejected": -6.227441310882568, + "step": 3120 + }, + { + "epoch": 1.62, + "learning_rate": 2.563587684069612e-07, + "logits/chosen": -2.6992063522338867, + "logits/rejected": -2.66088604927063, + "logps/chosen": -297.042236328125, + "logps/rejected": -300.57843017578125, + "loss": 0.0612, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.2802169919013977, + "rewards/margins": 6.343818664550781, + "rewards/rejected": -6.624035835266113, + "step": 3130 + }, + { + "epoch": 1.62, + "learning_rate": 2.554025626314783e-07, + "logits/chosen": -2.7833545207977295, + "logits/rejected": -2.6407017707824707, + "logps/chosen": -273.5995788574219, + "logps/rejected": -312.2384948730469, + "loss": 0.096, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5391936898231506, + "rewards/margins": 6.512132167816162, + "rewards/rejected": -7.051326751708984, + "step": 3140 + }, + { + "epoch": 1.63, + "learning_rate": 2.544463568559954e-07, + "logits/chosen": -2.8172030448913574, + "logits/rejected": -2.6880176067352295, + "logps/chosen": -309.4474792480469, + "logps/rejected": -321.9583435058594, + "loss": 0.0707, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.013124823570251465, + "rewards/margins": 6.90747594833374, + "rewards/rejected": -6.894351005554199, + "step": 3150 + }, + { + "epoch": 1.63, + "learning_rate": 2.5349015108051254e-07, + "logits/chosen": -2.7259583473205566, + "logits/rejected": -2.6957221031188965, + "logps/chosen": -251.1798858642578, + "logps/rejected": -275.9666748046875, + "loss": 0.0833, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.09325708448886871, + "rewards/margins": 6.963329315185547, + "rewards/rejected": -6.8700714111328125, + "step": 3160 + }, + { + "epoch": 1.64, + "learning_rate": 2.5253394530502966e-07, + "logits/chosen": -2.6244192123413086, + "logits/rejected": -2.589078426361084, + "logps/chosen": -324.7796936035156, + "logps/rejected": -322.0079650878906, + "loss": 0.0591, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.2864592969417572, + "rewards/margins": 7.4498138427734375, + "rewards/rejected": -7.16335391998291, + "step": 3170 + }, + { + "epoch": 1.64, + "learning_rate": 2.5157773952954677e-07, + "logits/chosen": -2.804133892059326, + "logits/rejected": -2.7516913414001465, + "logps/chosen": -306.5554504394531, + "logps/rejected": -300.6597900390625, + "loss": 0.0944, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.08758392184972763, + "rewards/margins": 6.123711109161377, + "rewards/rejected": -6.211295127868652, + "step": 3180 + }, + { + "epoch": 1.65, + "learning_rate": 2.506215337540639e-07, + "logits/chosen": -2.680217742919922, + "logits/rejected": -2.6095592975616455, + "logps/chosen": -257.2310485839844, + "logps/rejected": -300.31072998046875, + "loss": 0.082, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.01602376066148281, + "rewards/margins": 6.316248416900635, + "rewards/rejected": -6.332272052764893, + "step": 3190 + }, + { + "epoch": 1.65, + "learning_rate": 2.4966532797858095e-07, + "logits/chosen": -2.6483314037323, + "logits/rejected": -2.5556366443634033, + "logps/chosen": -279.1159362792969, + "logps/rejected": -281.1722106933594, + "loss": 0.07, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.2574574947357178, + "rewards/margins": 7.106630802154541, + "rewards/rejected": -6.849173069000244, + "step": 3200 + }, + { + "epoch": 1.65, + "eval_logits/chosen": -2.700206995010376, + "eval_logits/rejected": -2.6565675735473633, + "eval_logps/chosen": -289.6077575683594, + "eval_logps/rejected": -268.78326416015625, + "eval_loss": 0.5399491190910339, + "eval_rewards/accuracies": 0.7639999985694885, + "eval_rewards/chosen": -1.8456586599349976, + "eval_rewards/margins": 2.184269666671753, + "eval_rewards/rejected": -4.029928207397461, + "eval_runtime": 300.5168, + "eval_samples_per_second": 6.655, + "eval_steps_per_second": 0.416, + "step": 3200 + }, + { + "epoch": 1.66, + "learning_rate": 2.4870912220309807e-07, + "logits/chosen": -2.6509642601013184, + "logits/rejected": -2.6520817279815674, + "logps/chosen": -273.5619201660156, + "logps/rejected": -275.95147705078125, + "loss": 0.0892, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.26873114705085754, + "rewards/margins": 5.601580619812012, + "rewards/rejected": -5.870312690734863, + "step": 3210 + }, + { + "epoch": 1.66, + "learning_rate": 2.477529164276152e-07, + "logits/chosen": -2.8350400924682617, + "logits/rejected": -2.743255138397217, + "logps/chosen": -274.95733642578125, + "logps/rejected": -308.00567626953125, + "loss": 0.0899, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.03380532190203667, + "rewards/margins": 6.9070143699646, + "rewards/rejected": -6.873208522796631, + "step": 3220 + }, + { + "epoch": 1.67, + "learning_rate": 2.4679671065213235e-07, + "logits/chosen": -2.786475658416748, + "logits/rejected": -2.7416672706604004, + "logps/chosen": -283.60577392578125, + "logps/rejected": -320.0465393066406, + "loss": 0.0866, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.037688374519348145, + "rewards/margins": 6.618558406829834, + "rewards/rejected": -6.580870151519775, + "step": 3230 + }, + { + "epoch": 1.67, + "learning_rate": 2.4584050487664947e-07, + "logits/chosen": -2.762442111968994, + "logits/rejected": -2.637181282043457, + "logps/chosen": -284.50079345703125, + "logps/rejected": -291.54669189453125, + "loss": 0.0466, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18414416909217834, + "rewards/margins": 6.982729434967041, + "rewards/rejected": -6.798585414886475, + "step": 3240 + }, + { + "epoch": 1.68, + "learning_rate": 2.448842991011666e-07, + "logits/chosen": -2.7496209144592285, + "logits/rejected": -2.670368194580078, + "logps/chosen": -291.591796875, + "logps/rejected": -293.0029602050781, + "loss": 0.0933, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.37867432832717896, + "rewards/margins": 6.212046146392822, + "rewards/rejected": -6.590720176696777, + "step": 3250 + }, + { + "epoch": 1.68, + "learning_rate": 2.439280933256837e-07, + "logits/chosen": -2.6777236461639404, + "logits/rejected": -2.6595215797424316, + "logps/chosen": -238.927978515625, + "logps/rejected": -308.22271728515625, + "loss": 0.0572, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11050692945718765, + "rewards/margins": 7.144793510437012, + "rewards/rejected": -7.0342864990234375, + "step": 3260 + }, + { + "epoch": 1.69, + "learning_rate": 2.429718875502008e-07, + "logits/chosen": -2.5337674617767334, + "logits/rejected": -2.5987701416015625, + "logps/chosen": -258.04766845703125, + "logps/rejected": -331.2449035644531, + "loss": 0.0687, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.22830796241760254, + "rewards/margins": 7.673661708831787, + "rewards/rejected": -7.4453535079956055, + "step": 3270 + }, + { + "epoch": 1.69, + "learning_rate": 2.420156817747179e-07, + "logits/chosen": -2.686455726623535, + "logits/rejected": -2.5261008739471436, + "logps/chosen": -270.0923767089844, + "logps/rejected": -264.47076416015625, + "loss": 0.0651, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2098160982131958, + "rewards/margins": 6.175947189331055, + "rewards/rejected": -6.385763168334961, + "step": 3280 + }, + { + "epoch": 1.7, + "learning_rate": 2.41059475999235e-07, + "logits/chosen": -2.6996235847473145, + "logits/rejected": -2.5689122676849365, + "logps/chosen": -245.3484649658203, + "logps/rejected": -278.2618408203125, + "loss": 0.0797, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.643414318561554, + "rewards/margins": 5.8331074714660645, + "rewards/rejected": -6.4765214920043945, + "step": 3290 + }, + { + "epoch": 1.7, + "learning_rate": 2.4010327022375216e-07, + "logits/chosen": -2.7380359172821045, + "logits/rejected": -2.7304420471191406, + "logps/chosen": -302.05413818359375, + "logps/rejected": -321.96063232421875, + "loss": 0.0808, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.16380396485328674, + "rewards/margins": 7.5016984939575195, + "rewards/rejected": -7.337894439697266, + "step": 3300 + }, + { + "epoch": 1.7, + "eval_logits/chosen": -2.7340078353881836, + "eval_logits/rejected": -2.68426513671875, + "eval_logps/chosen": -293.4576416015625, + "eval_logps/rejected": -274.8385314941406, + "eval_loss": 0.5593692660331726, + "eval_rewards/accuracies": 0.7639999985694885, + "eval_rewards/chosen": -2.23065185546875, + "eval_rewards/margins": 2.4048030376434326, + "eval_rewards/rejected": -4.6354546546936035, + "eval_runtime": 302.5894, + "eval_samples_per_second": 6.61, + "eval_steps_per_second": 0.413, + "step": 3300 + }, + { + "epoch": 1.71, + "learning_rate": 2.391470644482693e-07, + "logits/chosen": -2.7412731647491455, + "logits/rejected": -2.7463791370391846, + "logps/chosen": -284.0135192871094, + "logps/rejected": -313.3152770996094, + "loss": 0.0728, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.21982452273368835, + "rewards/margins": 6.8465423583984375, + "rewards/rejected": -6.626717567443848, + "step": 3310 + }, + { + "epoch": 1.71, + "learning_rate": 2.3819085867278636e-07, + "logits/chosen": -2.659803867340088, + "logits/rejected": -2.6273205280303955, + "logps/chosen": -213.12435913085938, + "logps/rejected": -251.2199249267578, + "loss": 0.0828, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.0200145244598389, + "rewards/margins": 5.845543384552002, + "rewards/rejected": -6.865557670593262, + "step": 3320 + }, + { + "epoch": 1.72, + "learning_rate": 2.3723465289730348e-07, + "logits/chosen": -2.778775453567505, + "logits/rejected": -2.728415012359619, + "logps/chosen": -302.9705505371094, + "logps/rejected": -256.00201416015625, + "loss": 0.0958, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.004472860600799322, + "rewards/margins": 5.540419578552246, + "rewards/rejected": -5.54489278793335, + "step": 3330 + }, + { + "epoch": 1.72, + "learning_rate": 2.362784471218206e-07, + "logits/chosen": -2.744642734527588, + "logits/rejected": -2.6768505573272705, + "logps/chosen": -259.7644348144531, + "logps/rejected": -312.2467956542969, + "loss": 0.0621, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.14360444247722626, + "rewards/margins": 7.058108329772949, + "rewards/rejected": -6.914504051208496, + "step": 3340 + }, + { + "epoch": 1.73, + "learning_rate": 2.353222413463377e-07, + "logits/chosen": -2.674807071685791, + "logits/rejected": -2.620729684829712, + "logps/chosen": -267.46630859375, + "logps/rejected": -311.1983642578125, + "loss": 0.0585, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.08036639541387558, + "rewards/margins": 7.000423431396484, + "rewards/rejected": -7.080790042877197, + "step": 3350 + }, + { + "epoch": 1.73, + "learning_rate": 2.3436603557085483e-07, + "logits/chosen": -2.680379629135132, + "logits/rejected": -2.690441131591797, + "logps/chosen": -278.5157775878906, + "logps/rejected": -315.74798583984375, + "loss": 0.0741, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.24022746086120605, + "rewards/margins": 6.1009416580200195, + "rewards/rejected": -6.341168403625488, + "step": 3360 + }, + { + "epoch": 1.74, + "learning_rate": 2.3340982979537197e-07, + "logits/chosen": -2.704501152038574, + "logits/rejected": -2.6352505683898926, + "logps/chosen": -342.2712097167969, + "logps/rejected": -336.1389465332031, + "loss": 0.0836, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.4207138121128082, + "rewards/margins": 6.910533905029297, + "rewards/rejected": -6.489820957183838, + "step": 3370 + }, + { + "epoch": 1.74, + "learning_rate": 2.3245362401988909e-07, + "logits/chosen": -2.704493761062622, + "logits/rejected": -2.6764588356018066, + "logps/chosen": -286.54571533203125, + "logps/rejected": -298.95361328125, + "loss": 0.0822, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.39437445998191833, + "rewards/margins": 7.7266998291015625, + "rewards/rejected": -7.3323259353637695, + "step": 3380 + }, + { + "epoch": 1.75, + "learning_rate": 2.314974182444062e-07, + "logits/chosen": -2.733954906463623, + "logits/rejected": -2.712998867034912, + "logps/chosen": -265.21197509765625, + "logps/rejected": -303.2349853515625, + "loss": 0.0623, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.1423777341842651, + "rewards/margins": 6.146125793457031, + "rewards/rejected": -7.288504123687744, + "step": 3390 + }, + { + "epoch": 1.76, + "learning_rate": 2.305412124689233e-07, + "logits/chosen": -2.6285359859466553, + "logits/rejected": -2.5890676975250244, + "logps/chosen": -263.24896240234375, + "logps/rejected": -293.53497314453125, + "loss": 0.0501, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.26912063360214233, + "rewards/margins": 7.229107856750488, + "rewards/rejected": -7.498227596282959, + "step": 3400 + }, + { + "epoch": 1.76, + "eval_logits/chosen": -2.6943509578704834, + "eval_logits/rejected": -2.642679214477539, + "eval_logps/chosen": -296.305908203125, + "eval_logps/rejected": -278.03448486328125, + "eval_loss": 0.5704072117805481, + "eval_rewards/accuracies": 0.765999972820282, + "eval_rewards/chosen": -2.515477180480957, + "eval_rewards/margins": 2.4395759105682373, + "eval_rewards/rejected": -4.955053806304932, + "eval_runtime": 301.3731, + "eval_samples_per_second": 6.636, + "eval_steps_per_second": 0.415, + "step": 3400 + }, + { + "epoch": 1.76, + "learning_rate": 2.295850066934404e-07, + "logits/chosen": -2.695094347000122, + "logits/rejected": -2.6034932136535645, + "logps/chosen": -278.7093200683594, + "logps/rejected": -310.04681396484375, + "loss": 0.0762, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.8295940160751343, + "rewards/margins": 6.8828444480896, + "rewards/rejected": -7.712438106536865, + "step": 3410 + }, + { + "epoch": 1.77, + "learning_rate": 2.2862880091795752e-07, + "logits/chosen": -2.7348952293395996, + "logits/rejected": -2.684424877166748, + "logps/chosen": -269.9104309082031, + "logps/rejected": -328.19610595703125, + "loss": 0.0801, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.4503478407859802, + "rewards/margins": 7.062024116516113, + "rewards/rejected": -7.512372016906738, + "step": 3420 + }, + { + "epoch": 1.77, + "learning_rate": 2.2767259514247464e-07, + "logits/chosen": -2.749939441680908, + "logits/rejected": -2.7340288162231445, + "logps/chosen": -270.1997985839844, + "logps/rejected": -326.0268249511719, + "loss": 0.0772, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.09085409343242645, + "rewards/margins": 6.569288730621338, + "rewards/rejected": -6.6601433753967285, + "step": 3430 + }, + { + "epoch": 1.78, + "learning_rate": 2.2671638936699178e-07, + "logits/chosen": -2.6335549354553223, + "logits/rejected": -2.623380184173584, + "logps/chosen": -264.9308776855469, + "logps/rejected": -305.0411682128906, + "loss": 0.07, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.14353379607200623, + "rewards/margins": 6.764189720153809, + "rewards/rejected": -6.620657444000244, + "step": 3440 + }, + { + "epoch": 1.78, + "learning_rate": 2.257601835915089e-07, + "logits/chosen": -2.742511034011841, + "logits/rejected": -2.719316005706787, + "logps/chosen": -324.2193603515625, + "logps/rejected": -318.42327880859375, + "loss": 0.0674, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.2821727395057678, + "rewards/margins": 7.147420406341553, + "rewards/rejected": -6.8652472496032715, + "step": 3450 + }, + { + "epoch": 1.79, + "learning_rate": 2.24803977816026e-07, + "logits/chosen": -2.6763834953308105, + "logits/rejected": -2.627119779586792, + "logps/chosen": -280.9945068359375, + "logps/rejected": -283.1276550292969, + "loss": 0.1153, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.19678916037082672, + "rewards/margins": 6.571404933929443, + "rewards/rejected": -6.76819372177124, + "step": 3460 + }, + { + "epoch": 1.79, + "learning_rate": 2.2384777204054313e-07, + "logits/chosen": -2.783391237258911, + "logits/rejected": -2.6957125663757324, + "logps/chosen": -286.83990478515625, + "logps/rejected": -304.48968505859375, + "loss": 0.0711, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.5841490626335144, + "rewards/margins": 7.363424777984619, + "rewards/rejected": -6.779275417327881, + "step": 3470 + }, + { + "epoch": 1.8, + "learning_rate": 2.2289156626506022e-07, + "logits/chosen": -2.6243674755096436, + "logits/rejected": -2.5819458961486816, + "logps/chosen": -290.9930419921875, + "logps/rejected": -303.67974853515625, + "loss": 0.072, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.38888758420944214, + "rewards/margins": 7.5612592697143555, + "rewards/rejected": -7.9501471519470215, + "step": 3480 + }, + { + "epoch": 1.8, + "learning_rate": 2.2193536048957733e-07, + "logits/chosen": -2.8293557167053223, + "logits/rejected": -2.742828130722046, + "logps/chosen": -313.91278076171875, + "logps/rejected": -317.4000549316406, + "loss": 0.096, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.0450088270008564, + "rewards/margins": 6.599936485290527, + "rewards/rejected": -6.554927825927734, + "step": 3490 + }, + { + "epoch": 1.81, + "learning_rate": 2.2097915471409445e-07, + "logits/chosen": -2.7180981636047363, + "logits/rejected": -2.7207627296447754, + "logps/chosen": -282.8636779785156, + "logps/rejected": -304.4784240722656, + "loss": 0.061, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.5040356516838074, + "rewards/margins": 6.437845706939697, + "rewards/rejected": -6.941880702972412, + "step": 3500 + }, + { + "epoch": 1.81, + "eval_logits/chosen": -2.7404065132141113, + "eval_logits/rejected": -2.708589792251587, + "eval_logps/chosen": -293.3233947753906, + "eval_logps/rejected": -273.42083740234375, + "eval_loss": 0.5561814904212952, + "eval_rewards/accuracies": 0.7599999904632568, + "eval_rewards/chosen": -2.2172250747680664, + "eval_rewards/margins": 2.2764604091644287, + "eval_rewards/rejected": -4.493685722351074, + "eval_runtime": 302.9106, + "eval_samples_per_second": 6.603, + "eval_steps_per_second": 0.413, + "step": 3500 + }, + { + "epoch": 1.81, + "learning_rate": 2.200229489386116e-07, + "logits/chosen": -2.741328716278076, + "logits/rejected": -2.7085936069488525, + "logps/chosen": -282.50091552734375, + "logps/rejected": -330.7566223144531, + "loss": 0.0508, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.4392349123954773, + "rewards/margins": 7.699382781982422, + "rewards/rejected": -8.138618469238281, + "step": 3510 + }, + { + "epoch": 1.82, + "learning_rate": 2.190667431631287e-07, + "logits/chosen": -2.688417434692383, + "logits/rejected": -2.695262908935547, + "logps/chosen": -270.17486572265625, + "logps/rejected": -363.36553955078125, + "loss": 0.1207, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.14541302621364594, + "rewards/margins": 7.5206708908081055, + "rewards/rejected": -7.666083335876465, + "step": 3520 + }, + { + "epoch": 1.82, + "learning_rate": 2.1811053738764582e-07, + "logits/chosen": -2.640284776687622, + "logits/rejected": -2.6383633613586426, + "logps/chosen": -243.1065673828125, + "logps/rejected": -314.18792724609375, + "loss": 0.0634, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.18995524942874908, + "rewards/margins": 6.731640815734863, + "rewards/rejected": -6.921595573425293, + "step": 3530 + }, + { + "epoch": 1.83, + "learning_rate": 2.1715433161216294e-07, + "logits/chosen": -2.6342291831970215, + "logits/rejected": -2.6640098094940186, + "logps/chosen": -270.2723388671875, + "logps/rejected": -315.5150451660156, + "loss": 0.0751, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.2791522741317749, + "rewards/margins": 6.453176975250244, + "rewards/rejected": -6.73232889175415, + "step": 3540 + }, + { + "epoch": 1.83, + "learning_rate": 2.1619812583668005e-07, + "logits/chosen": -2.678250789642334, + "logits/rejected": -2.6710121631622314, + "logps/chosen": -256.47808837890625, + "logps/rejected": -284.9803771972656, + "loss": 0.0761, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.6811217069625854, + "rewards/margins": 6.993284702301025, + "rewards/rejected": -7.6744065284729, + "step": 3550 + }, + { + "epoch": 1.84, + "learning_rate": 2.1524192006119714e-07, + "logits/chosen": -2.679922342300415, + "logits/rejected": -2.6698668003082275, + "logps/chosen": -256.57073974609375, + "logps/rejected": -293.0118408203125, + "loss": 0.0698, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2304511070251465, + "rewards/margins": 6.001145362854004, + "rewards/rejected": -7.23159646987915, + "step": 3560 + }, + { + "epoch": 1.84, + "learning_rate": 2.1428571428571426e-07, + "logits/chosen": -2.687896490097046, + "logits/rejected": -2.6100172996520996, + "logps/chosen": -251.7646026611328, + "logps/rejected": -273.5567932128906, + "loss": 0.0454, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.9797040820121765, + "rewards/margins": 6.288348197937012, + "rewards/rejected": -7.268052577972412, + "step": 3570 + }, + { + "epoch": 1.85, + "learning_rate": 2.133295085102314e-07, + "logits/chosen": -2.590709924697876, + "logits/rejected": -2.507107973098755, + "logps/chosen": -256.82061767578125, + "logps/rejected": -288.29132080078125, + "loss": 0.0449, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.220476508140564, + "rewards/margins": 7.095966339111328, + "rewards/rejected": -8.31644344329834, + "step": 3580 + }, + { + "epoch": 1.85, + "learning_rate": 2.1237330273474851e-07, + "logits/chosen": -2.6958794593811035, + "logits/rejected": -2.6530776023864746, + "logps/chosen": -309.91583251953125, + "logps/rejected": -307.0137634277344, + "loss": 0.0964, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.5907996892929077, + "rewards/margins": 7.160977363586426, + "rewards/rejected": -7.751777648925781, + "step": 3590 + }, + { + "epoch": 1.86, + "learning_rate": 2.1141709695926563e-07, + "logits/chosen": -2.672590970993042, + "logits/rejected": -2.6199073791503906, + "logps/chosen": -275.89251708984375, + "logps/rejected": -277.8507080078125, + "loss": 0.0979, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -0.5147138833999634, + "rewards/margins": 7.132517337799072, + "rewards/rejected": -7.647230625152588, + "step": 3600 + }, + { + "epoch": 1.86, + "eval_logits/chosen": -2.676504373550415, + "eval_logits/rejected": -2.63808536529541, + "eval_logps/chosen": -297.6461181640625, + "eval_logps/rejected": -278.80682373046875, + "eval_loss": 0.5656457543373108, + "eval_rewards/accuracies": 0.7519999742507935, + "eval_rewards/chosen": -2.6494967937469482, + "eval_rewards/margins": 2.3827850818634033, + "eval_rewards/rejected": -5.032281875610352, + "eval_runtime": 302.5456, + "eval_samples_per_second": 6.611, + "eval_steps_per_second": 0.413, + "step": 3600 + }, + { + "epoch": 1.86, + "learning_rate": 2.1046089118378275e-07, + "logits/chosen": -2.69031023979187, + "logits/rejected": -2.6448917388916016, + "logps/chosen": -263.35003662109375, + "logps/rejected": -321.37274169921875, + "loss": 0.0665, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.4072280824184418, + "rewards/margins": 7.940283298492432, + "rewards/rejected": -8.347511291503906, + "step": 3610 + }, + { + "epoch": 1.87, + "learning_rate": 2.0950468540829986e-07, + "logits/chosen": -2.6567223072052, + "logits/rejected": -2.6022956371307373, + "logps/chosen": -280.3554992675781, + "logps/rejected": -290.8537292480469, + "loss": 0.0922, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.44895225763320923, + "rewards/margins": 7.379188537597656, + "rewards/rejected": -7.828141212463379, + "step": 3620 + }, + { + "epoch": 1.87, + "learning_rate": 2.0854847963281698e-07, + "logits/chosen": -2.6773931980133057, + "logits/rejected": -2.565727710723877, + "logps/chosen": -306.9902648925781, + "logps/rejected": -301.0198669433594, + "loss": 0.0857, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.3986026644706726, + "rewards/margins": 7.065288543701172, + "rewards/rejected": -7.463891506195068, + "step": 3630 + }, + { + "epoch": 1.88, + "learning_rate": 2.0759227385733407e-07, + "logits/chosen": -2.4982149600982666, + "logits/rejected": -2.458949327468872, + "logps/chosen": -307.76922607421875, + "logps/rejected": -307.8131408691406, + "loss": 0.0704, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.26402151584625244, + "rewards/margins": 7.018546104431152, + "rewards/rejected": -7.282568454742432, + "step": 3640 + }, + { + "epoch": 1.88, + "learning_rate": 2.066360680818512e-07, + "logits/chosen": -2.6682722568511963, + "logits/rejected": -2.592215061187744, + "logps/chosen": -331.39129638671875, + "logps/rejected": -314.46356201171875, + "loss": 0.0729, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.12419804185628891, + "rewards/margins": 7.393265724182129, + "rewards/rejected": -7.269067287445068, + "step": 3650 + }, + { + "epoch": 1.89, + "learning_rate": 2.0567986230636832e-07, + "logits/chosen": -2.648822784423828, + "logits/rejected": -2.575870990753174, + "logps/chosen": -257.9449157714844, + "logps/rejected": -305.73406982421875, + "loss": 0.0953, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.2408134937286377, + "rewards/margins": 7.346316337585449, + "rewards/rejected": -7.58712911605835, + "step": 3660 + }, + { + "epoch": 1.89, + "learning_rate": 2.0472365653088544e-07, + "logits/chosen": -2.6813411712646484, + "logits/rejected": -2.612908363342285, + "logps/chosen": -304.7392272949219, + "logps/rejected": -299.3567810058594, + "loss": 0.0756, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.8448777198791504, + "rewards/margins": 6.271629810333252, + "rewards/rejected": -7.116507053375244, + "step": 3670 + }, + { + "epoch": 1.9, + "learning_rate": 2.0376745075540256e-07, + "logits/chosen": -2.702895164489746, + "logits/rejected": -2.6631016731262207, + "logps/chosen": -324.67694091796875, + "logps/rejected": -321.83392333984375, + "loss": 0.0663, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.580236554145813, + "rewards/margins": 6.977442741394043, + "rewards/rejected": -7.55767822265625, + "step": 3680 + }, + { + "epoch": 1.91, + "learning_rate": 2.0281124497991967e-07, + "logits/chosen": -2.6680264472961426, + "logits/rejected": -2.6557843685150146, + "logps/chosen": -284.6235656738281, + "logps/rejected": -324.8849182128906, + "loss": 0.1062, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.3347598910331726, + "rewards/margins": 6.98086404800415, + "rewards/rejected": -7.3156232833862305, + "step": 3690 + }, + { + "epoch": 1.91, + "learning_rate": 2.018550392044368e-07, + "logits/chosen": -2.6927990913391113, + "logits/rejected": -2.640979290008545, + "logps/chosen": -279.6319580078125, + "logps/rejected": -350.6274719238281, + "loss": 0.0631, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.232979416847229, + "rewards/margins": 7.658511161804199, + "rewards/rejected": -7.891491889953613, + "step": 3700 + }, + { + "epoch": 1.91, + "eval_logits/chosen": -2.6818108558654785, + "eval_logits/rejected": -2.6407127380371094, + "eval_logps/chosen": -296.2057189941406, + "eval_logps/rejected": -276.4331359863281, + "eval_loss": 0.5667564272880554, + "eval_rewards/accuracies": 0.7559999823570251, + "eval_rewards/chosen": -2.5054566860198975, + "eval_rewards/margins": 2.2894585132598877, + "eval_rewards/rejected": -4.794915676116943, + "eval_runtime": 301.041, + "eval_samples_per_second": 6.644, + "eval_steps_per_second": 0.415, + "step": 3700 + }, + { + "epoch": 1.92, + "learning_rate": 2.0089883342895388e-07, + "logits/chosen": -2.7003157138824463, + "logits/rejected": -2.6551907062530518, + "logps/chosen": -314.6983337402344, + "logps/rejected": -279.4690246582031, + "loss": 0.0751, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.11149871349334717, + "rewards/margins": 7.081504821777344, + "rewards/rejected": -7.1930036544799805, + "step": 3710 + }, + { + "epoch": 1.92, + "learning_rate": 1.9994262765347102e-07, + "logits/chosen": -2.723034381866455, + "logits/rejected": -2.627500534057617, + "logps/chosen": -287.4389343261719, + "logps/rejected": -261.7805480957031, + "loss": 0.0768, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5197538733482361, + "rewards/margins": 6.516218662261963, + "rewards/rejected": -7.035972595214844, + "step": 3720 + }, + { + "epoch": 1.93, + "learning_rate": 1.9898642187798813e-07, + "logits/chosen": -2.5865015983581543, + "logits/rejected": -2.56524920463562, + "logps/chosen": -290.79486083984375, + "logps/rejected": -328.8044738769531, + "loss": 0.0591, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.0011775374878197908, + "rewards/margins": 8.083017349243164, + "rewards/rejected": -8.081838607788086, + "step": 3730 + }, + { + "epoch": 1.93, + "learning_rate": 1.9803021610250525e-07, + "logits/chosen": -2.7334277629852295, + "logits/rejected": -2.7040934562683105, + "logps/chosen": -237.2183074951172, + "logps/rejected": -288.6593322753906, + "loss": 0.0791, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.4630967080593109, + "rewards/margins": 6.583277702331543, + "rewards/rejected": -7.046375274658203, + "step": 3740 + }, + { + "epoch": 1.94, + "learning_rate": 1.9707401032702237e-07, + "logits/chosen": -2.5976366996765137, + "logits/rejected": -2.5525100231170654, + "logps/chosen": -261.64508056640625, + "logps/rejected": -296.0137939453125, + "loss": 0.0514, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5384448170661926, + "rewards/margins": 6.905440330505371, + "rewards/rejected": -7.443885803222656, + "step": 3750 + }, + { + "epoch": 1.94, + "learning_rate": 1.9611780455153948e-07, + "logits/chosen": -2.697441577911377, + "logits/rejected": -2.7309083938598633, + "logps/chosen": -240.364013671875, + "logps/rejected": -301.3322448730469, + "loss": 0.0921, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.21534188091754913, + "rewards/margins": 6.707180976867676, + "rewards/rejected": -6.922522068023682, + "step": 3760 + }, + { + "epoch": 1.95, + "learning_rate": 1.951615987760566e-07, + "logits/chosen": -2.6353142261505127, + "logits/rejected": -2.5878772735595703, + "logps/chosen": -280.3433532714844, + "logps/rejected": -305.8084411621094, + "loss": 0.0782, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.7940788269042969, + "rewards/margins": 6.9906744956970215, + "rewards/rejected": -7.78475284576416, + "step": 3770 + }, + { + "epoch": 1.95, + "learning_rate": 1.942053930005737e-07, + "logits/chosen": -2.793308734893799, + "logits/rejected": -2.7050063610076904, + "logps/chosen": -291.4981689453125, + "logps/rejected": -308.5917053222656, + "loss": 0.0587, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.6085634827613831, + "rewards/margins": 7.077228546142578, + "rewards/rejected": -7.685791969299316, + "step": 3780 + }, + { + "epoch": 1.96, + "learning_rate": 1.9324918722509086e-07, + "logits/chosen": -2.645498514175415, + "logits/rejected": -2.691744089126587, + "logps/chosen": -265.4901428222656, + "logps/rejected": -334.2118835449219, + "loss": 0.0877, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.1245644092559814, + "rewards/margins": 7.277432441711426, + "rewards/rejected": -8.401995658874512, + "step": 3790 + }, + { + "epoch": 1.96, + "learning_rate": 1.9229298144960794e-07, + "logits/chosen": -2.718837261199951, + "logits/rejected": -2.6985907554626465, + "logps/chosen": -283.3094177246094, + "logps/rejected": -304.67669677734375, + "loss": 0.1202, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.5752776861190796, + "rewards/margins": 7.11301326751709, + "rewards/rejected": -7.688291072845459, + "step": 3800 + }, + { + "epoch": 1.96, + "eval_logits/chosen": -2.7124974727630615, + "eval_logits/rejected": -2.671638011932373, + "eval_logps/chosen": -297.732177734375, + "eval_logps/rejected": -275.73297119140625, + "eval_loss": 0.5678086280822754, + "eval_rewards/accuracies": 0.7580000162124634, + "eval_rewards/chosen": -2.658102035522461, + "eval_rewards/margins": 2.0667974948883057, + "eval_rewards/rejected": -4.7248992919921875, + "eval_runtime": 302.3235, + "eval_samples_per_second": 6.615, + "eval_steps_per_second": 0.413, + "step": 3800 + }, + { + "epoch": 1.97, + "learning_rate": 1.9133677567412506e-07, + "logits/chosen": -2.7177176475524902, + "logits/rejected": -2.6701908111572266, + "logps/chosen": -288.8520202636719, + "logps/rejected": -305.2877197265625, + "loss": 0.0885, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.9760677218437195, + "rewards/margins": 6.295001029968262, + "rewards/rejected": -7.271068572998047, + "step": 3810 + }, + { + "epoch": 1.97, + "learning_rate": 1.9038056989864218e-07, + "logits/chosen": -2.6947813034057617, + "logits/rejected": -2.6857385635375977, + "logps/chosen": -250.64999389648438, + "logps/rejected": -273.1249694824219, + "loss": 0.1056, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.41516774892807007, + "rewards/margins": 5.893454551696777, + "rewards/rejected": -6.308621883392334, + "step": 3820 + }, + { + "epoch": 1.98, + "learning_rate": 1.894243641231593e-07, + "logits/chosen": -2.49280047416687, + "logits/rejected": -2.485044002532959, + "logps/chosen": -268.3055725097656, + "logps/rejected": -257.1871337890625, + "loss": 0.0927, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.7658621072769165, + "rewards/margins": 6.147838592529297, + "rewards/rejected": -6.913701057434082, + "step": 3830 + }, + { + "epoch": 1.98, + "learning_rate": 1.884681583476764e-07, + "logits/chosen": -2.7352261543273926, + "logits/rejected": -2.720942497253418, + "logps/chosen": -289.215087890625, + "logps/rejected": -305.9485778808594, + "loss": 0.0578, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.321073055267334, + "rewards/margins": 5.713677406311035, + "rewards/rejected": -7.034750938415527, + "step": 3840 + }, + { + "epoch": 1.99, + "learning_rate": 1.8751195257219352e-07, + "logits/chosen": -2.7396533489227295, + "logits/rejected": -2.7132935523986816, + "logps/chosen": -256.33660888671875, + "logps/rejected": -292.3953552246094, + "loss": 0.0822, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.813510537147522, + "rewards/margins": 6.204690456390381, + "rewards/rejected": -7.018200874328613, + "step": 3850 + }, + { + "epoch": 1.99, + "learning_rate": 1.8655574679671067e-07, + "logits/chosen": -2.714446544647217, + "logits/rejected": -2.670003890991211, + "logps/chosen": -292.04962158203125, + "logps/rejected": -316.65447998046875, + "loss": 0.1048, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.8342695236206055, + "rewards/margins": 6.873841285705566, + "rewards/rejected": -7.7081098556518555, + "step": 3860 + }, + { + "epoch": 2.0, + "learning_rate": 1.8559954102122778e-07, + "logits/chosen": -2.723661184310913, + "logits/rejected": -2.6586456298828125, + "logps/chosen": -279.65679931640625, + "logps/rejected": -303.198486328125, + "loss": 0.093, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.4174138903617859, + "rewards/margins": 6.7883734703063965, + "rewards/rejected": -7.205787658691406, + "step": 3870 + }, + { + "epoch": 2.0, + "learning_rate": 1.8464333524574487e-07, + "logits/chosen": -2.7239067554473877, + "logits/rejected": -2.6996660232543945, + "logps/chosen": -256.20709228515625, + "logps/rejected": -281.5430908203125, + "loss": 0.0327, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3029792308807373, + "rewards/margins": 7.203469276428223, + "rewards/rejected": -7.506447792053223, + "step": 3880 + }, + { + "epoch": 2.01, + "learning_rate": 1.8368712947026199e-07, + "logits/chosen": -2.7349143028259277, + "logits/rejected": -2.69462251663208, + "logps/chosen": -285.8705749511719, + "logps/rejected": -302.57537841796875, + "loss": 0.014, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.05275885388255119, + "rewards/margins": 7.5370941162109375, + "rewards/rejected": -7.589852809906006, + "step": 3890 + }, + { + "epoch": 2.01, + "learning_rate": 1.827309236947791e-07, + "logits/chosen": -2.6346869468688965, + "logits/rejected": -2.6188418865203857, + "logps/chosen": -245.5050506591797, + "logps/rejected": -325.5300598144531, + "loss": 0.022, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.2961425185203552, + "rewards/margins": 8.76488971710205, + "rewards/rejected": -9.061031341552734, + "step": 3900 + }, + { + "epoch": 2.01, + "eval_logits/chosen": -2.7125446796417236, + "eval_logits/rejected": -2.6679580211639404, + "eval_logps/chosen": -298.0444030761719, + "eval_logps/rejected": -280.1555480957031, + "eval_loss": 0.5656534433364868, + "eval_rewards/accuracies": 0.7720000147819519, + "eval_rewards/chosen": -2.6893272399902344, + "eval_rewards/margins": 2.4778311252593994, + "eval_rewards/rejected": -5.167158603668213, + "eval_runtime": 300.3774, + "eval_samples_per_second": 6.658, + "eval_steps_per_second": 0.416, + "step": 3900 + }, + { + "epoch": 2.02, + "learning_rate": 1.8177471791929622e-07, + "logits/chosen": -2.6352615356445312, + "logits/rejected": -2.6043148040771484, + "logps/chosen": -264.2796325683594, + "logps/rejected": -364.203369140625, + "loss": 0.014, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25604504346847534, + "rewards/margins": 8.365068435668945, + "rewards/rejected": -8.621111869812012, + "step": 3910 + }, + { + "epoch": 2.02, + "learning_rate": 1.8081851214381333e-07, + "logits/chosen": -2.5759568214416504, + "logits/rejected": -2.56921648979187, + "logps/chosen": -275.2734069824219, + "logps/rejected": -348.78875732421875, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5250979661941528, + "rewards/margins": 8.301558494567871, + "rewards/rejected": -8.826656341552734, + "step": 3920 + }, + { + "epoch": 2.03, + "learning_rate": 1.7986230636833047e-07, + "logits/chosen": -2.6585121154785156, + "logits/rejected": -2.5779411792755127, + "logps/chosen": -225.691650390625, + "logps/rejected": -295.9264831542969, + "loss": 0.0179, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.5348653197288513, + "rewards/margins": 7.498388767242432, + "rewards/rejected": -8.033254623413086, + "step": 3930 + }, + { + "epoch": 2.03, + "learning_rate": 1.789061005928476e-07, + "logits/chosen": -2.7394919395446777, + "logits/rejected": -2.6301112174987793, + "logps/chosen": -310.79754638671875, + "logps/rejected": -293.79046630859375, + "loss": 0.0221, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.4678827226161957, + "rewards/margins": 7.93142032623291, + "rewards/rejected": -8.39930248260498, + "step": 3940 + }, + { + "epoch": 2.04, + "learning_rate": 1.7794989481736468e-07, + "logits/chosen": -2.7052078247070312, + "logits/rejected": -2.637937307357788, + "logps/chosen": -296.1398010253906, + "logps/rejected": -321.91900634765625, + "loss": 0.0094, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4104957580566406, + "rewards/margins": 8.665520668029785, + "rewards/rejected": -9.076016426086426, + "step": 3950 + }, + { + "epoch": 2.04, + "learning_rate": 1.769936890418818e-07, + "logits/chosen": -2.66807222366333, + "logits/rejected": -2.6150898933410645, + "logps/chosen": -270.62188720703125, + "logps/rejected": -317.2550048828125, + "loss": 0.0136, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22330737113952637, + "rewards/margins": 8.861315727233887, + "rewards/rejected": -8.638009071350098, + "step": 3960 + }, + { + "epoch": 2.05, + "learning_rate": 1.760374832663989e-07, + "logits/chosen": -2.611741542816162, + "logits/rejected": -2.5388216972351074, + "logps/chosen": -275.30755615234375, + "logps/rejected": -330.19219970703125, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39820218086242676, + "rewards/margins": 8.278533935546875, + "rewards/rejected": -8.676736831665039, + "step": 3970 + }, + { + "epoch": 2.05, + "learning_rate": 1.7508127749091603e-07, + "logits/chosen": -2.68839955329895, + "logits/rejected": -2.650808811187744, + "logps/chosen": -281.78289794921875, + "logps/rejected": -326.82745361328125, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6049606204032898, + "rewards/margins": 8.425558090209961, + "rewards/rejected": -9.030518531799316, + "step": 3980 + }, + { + "epoch": 2.06, + "learning_rate": 1.7412507171543314e-07, + "logits/chosen": -2.673745632171631, + "logits/rejected": -2.614142894744873, + "logps/chosen": -274.6407775878906, + "logps/rejected": -293.92449951171875, + "loss": 0.0128, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0188729763031006, + "rewards/margins": 8.362761497497559, + "rewards/rejected": -9.381634712219238, + "step": 3990 + }, + { + "epoch": 2.07, + "learning_rate": 1.7316886593995028e-07, + "logits/chosen": -2.684051513671875, + "logits/rejected": -2.6487350463867188, + "logps/chosen": -255.32919311523438, + "logps/rejected": -315.4376220703125, + "loss": 0.0177, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.5388384461402893, + "rewards/margins": 9.196451187133789, + "rewards/rejected": -9.73529052734375, + "step": 4000 + }, + { + "epoch": 2.07, + "eval_logits/chosen": -2.691579580307007, + "eval_logits/rejected": -2.64309024810791, + "eval_logps/chosen": -304.6116943359375, + "eval_logps/rejected": -291.3919372558594, + "eval_loss": 0.6171462535858154, + "eval_rewards/accuracies": 0.7680000066757202, + "eval_rewards/chosen": -3.346055746078491, + "eval_rewards/margins": 2.9447388648986816, + "eval_rewards/rejected": -6.290794372558594, + "eval_runtime": 300.442, + "eval_samples_per_second": 6.657, + "eval_steps_per_second": 0.416, + "step": 4000 + }, + { + "epoch": 2.07, + "learning_rate": 1.722126601644674e-07, + "logits/chosen": -2.6704154014587402, + "logits/rejected": -2.551290273666382, + "logps/chosen": -283.76666259765625, + "logps/rejected": -310.22857666015625, + "loss": 0.0218, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.4321608543395996, + "rewards/margins": 8.552752494812012, + "rewards/rejected": -8.984914779663086, + "step": 4010 + }, + { + "epoch": 2.08, + "learning_rate": 1.7125645438898452e-07, + "logits/chosen": -2.6245718002319336, + "logits/rejected": -2.6031408309936523, + "logps/chosen": -276.89520263671875, + "logps/rejected": -321.8138122558594, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7024073004722595, + "rewards/margins": 9.13112735748291, + "rewards/rejected": -9.833534240722656, + "step": 4020 + }, + { + "epoch": 2.08, + "learning_rate": 1.703002486135016e-07, + "logits/chosen": -2.6619951725006104, + "logits/rejected": -2.646958112716675, + "logps/chosen": -293.88006591796875, + "logps/rejected": -332.16082763671875, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06791017204523087, + "rewards/margins": 9.907838821411133, + "rewards/rejected": -9.975748062133789, + "step": 4030 + }, + { + "epoch": 2.09, + "learning_rate": 1.6934404283801872e-07, + "logits/chosen": -2.6504015922546387, + "logits/rejected": -2.5901706218719482, + "logps/chosen": -283.5688171386719, + "logps/rejected": -338.7544860839844, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1157902255654335, + "rewards/margins": 9.678095817565918, + "rewards/rejected": -9.793886184692383, + "step": 4040 + }, + { + "epoch": 2.09, + "learning_rate": 1.6838783706253584e-07, + "logits/chosen": -2.6505672931671143, + "logits/rejected": -2.600067615509033, + "logps/chosen": -248.70849609375, + "logps/rejected": -300.7835388183594, + "loss": 0.0163, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.5731819272041321, + "rewards/margins": 8.532538414001465, + "rewards/rejected": -9.105721473693848, + "step": 4050 + }, + { + "epoch": 2.1, + "learning_rate": 1.6743163128705295e-07, + "logits/chosen": -2.6042139530181885, + "logits/rejected": -2.545107364654541, + "logps/chosen": -260.8901672363281, + "logps/rejected": -328.5799255371094, + "loss": 0.0193, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.0383368730545044, + "rewards/margins": 9.177366256713867, + "rewards/rejected": -10.215703964233398, + "step": 4060 + }, + { + "epoch": 2.1, + "learning_rate": 1.664754255115701e-07, + "logits/chosen": -2.617219924926758, + "logits/rejected": -2.5846307277679443, + "logps/chosen": -256.7451477050781, + "logps/rejected": -329.6984558105469, + "loss": 0.0151, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6538764834403992, + "rewards/margins": 9.410051345825195, + "rewards/rejected": -10.063928604125977, + "step": 4070 + }, + { + "epoch": 2.11, + "learning_rate": 1.655192197360872e-07, + "logits/chosen": -2.779808521270752, + "logits/rejected": -2.682321548461914, + "logps/chosen": -315.32232666015625, + "logps/rejected": -357.68408203125, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6023126840591431, + "rewards/margins": 9.032594680786133, + "rewards/rejected": -9.634907722473145, + "step": 4080 + }, + { + "epoch": 2.11, + "learning_rate": 1.6456301396060433e-07, + "logits/chosen": -2.686439037322998, + "logits/rejected": -2.628729820251465, + "logps/chosen": -306.7290344238281, + "logps/rejected": -362.8133239746094, + "loss": 0.0097, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5802473425865173, + "rewards/margins": 8.59655475616455, + "rewards/rejected": -9.176801681518555, + "step": 4090 + }, + { + "epoch": 2.12, + "learning_rate": 1.6360680818512144e-07, + "logits/chosen": -2.633235216140747, + "logits/rejected": -2.62958025932312, + "logps/chosen": -264.7658996582031, + "logps/rejected": -323.9909362792969, + "loss": 0.0108, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38502559065818787, + "rewards/margins": 8.613489151000977, + "rewards/rejected": -8.998516082763672, + "step": 4100 + }, + { + "epoch": 2.12, + "eval_logits/chosen": -2.670144557952881, + "eval_logits/rejected": -2.622467041015625, + "eval_logps/chosen": -304.59942626953125, + "eval_logps/rejected": -292.2874450683594, + "eval_loss": 0.6389336585998535, + "eval_rewards/accuracies": 0.765999972820282, + "eval_rewards/chosen": -3.344829559326172, + "eval_rewards/margins": 3.0355160236358643, + "eval_rewards/rejected": -6.380346298217773, + "eval_runtime": 301.6754, + "eval_samples_per_second": 6.63, + "eval_steps_per_second": 0.414, + "step": 4100 + }, + { + "epoch": 2.12, + "learning_rate": 1.6265060240963853e-07, + "logits/chosen": -2.7134909629821777, + "logits/rejected": -2.6375508308410645, + "logps/chosen": -307.7324523925781, + "logps/rejected": -317.06622314453125, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44676604866981506, + "rewards/margins": 8.252876281738281, + "rewards/rejected": -8.6996431350708, + "step": 4110 + }, + { + "epoch": 2.13, + "learning_rate": 1.6169439663415565e-07, + "logits/chosen": -2.729085922241211, + "logits/rejected": -2.639957904815674, + "logps/chosen": -287.22015380859375, + "logps/rejected": -338.83001708984375, + "loss": 0.009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5336463451385498, + "rewards/margins": 10.47472858428955, + "rewards/rejected": -11.008374214172363, + "step": 4120 + }, + { + "epoch": 2.13, + "learning_rate": 1.6073819085867276e-07, + "logits/chosen": -2.7341055870056152, + "logits/rejected": -2.757556676864624, + "logps/chosen": -270.1176452636719, + "logps/rejected": -388.94757080078125, + "loss": 0.013, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.2091827690601349, + "rewards/margins": 9.677942276000977, + "rewards/rejected": -9.887125968933105, + "step": 4130 + }, + { + "epoch": 2.14, + "learning_rate": 1.597819850831899e-07, + "logits/chosen": -2.6629955768585205, + "logits/rejected": -2.578319549560547, + "logps/chosen": -250.00076293945312, + "logps/rejected": -334.855712890625, + "loss": 0.0095, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4083684980869293, + "rewards/margins": 9.928139686584473, + "rewards/rejected": -10.336507797241211, + "step": 4140 + }, + { + "epoch": 2.14, + "learning_rate": 1.5882577930770702e-07, + "logits/chosen": -2.7194714546203613, + "logits/rejected": -2.620279312133789, + "logps/chosen": -283.8021545410156, + "logps/rejected": -331.04412841796875, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6026127934455872, + "rewards/margins": 9.564842224121094, + "rewards/rejected": -10.167454719543457, + "step": 4150 + }, + { + "epoch": 2.15, + "learning_rate": 1.5786957353222414e-07, + "logits/chosen": -2.687295436859131, + "logits/rejected": -2.585244655609131, + "logps/chosen": -325.5843200683594, + "logps/rejected": -326.9093017578125, + "loss": 0.0169, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.12477810680866241, + "rewards/margins": 9.427237510681152, + "rewards/rejected": -9.55201530456543, + "step": 4160 + }, + { + "epoch": 2.15, + "learning_rate": 1.5691336775674125e-07, + "logits/chosen": -2.6061997413635254, + "logits/rejected": -2.583631992340088, + "logps/chosen": -257.86517333984375, + "logps/rejected": -322.37945556640625, + "loss": 0.0183, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44339266419410706, + "rewards/margins": 10.549286842346191, + "rewards/rejected": -10.99267864227295, + "step": 4170 + }, + { + "epoch": 2.16, + "learning_rate": 1.5595716198125837e-07, + "logits/chosen": -2.646427631378174, + "logits/rejected": -2.572059392929077, + "logps/chosen": -298.1983947753906, + "logps/rejected": -348.5059814453125, + "loss": 0.0299, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2386928796768188, + "rewards/margins": 9.014298439025879, + "rewards/rejected": -10.252991676330566, + "step": 4180 + }, + { + "epoch": 2.16, + "learning_rate": 1.5500095620577546e-07, + "logits/chosen": -2.745795488357544, + "logits/rejected": -2.687631130218506, + "logps/chosen": -265.0744934082031, + "logps/rejected": -290.81414794921875, + "loss": 0.0126, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.1059540510177612, + "rewards/margins": 8.500246047973633, + "rewards/rejected": -9.606199264526367, + "step": 4190 + }, + { + "epoch": 2.17, + "learning_rate": 1.5404475043029257e-07, + "logits/chosen": -2.689143657684326, + "logits/rejected": -2.657597303390503, + "logps/chosen": -265.9485778808594, + "logps/rejected": -319.83935546875, + "loss": 0.0108, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9064074754714966, + "rewards/margins": 9.307806015014648, + "rewards/rejected": -10.214213371276855, + "step": 4200 + }, + { + "epoch": 2.17, + "eval_logits/chosen": -2.679666042327881, + "eval_logits/rejected": -2.63228702545166, + "eval_logps/chosen": -306.5373229980469, + "eval_logps/rejected": -294.5120849609375, + "eval_loss": 0.6562466621398926, + "eval_rewards/accuracies": 0.7620000243186951, + "eval_rewards/chosen": -3.5386173725128174, + "eval_rewards/margins": 3.064192295074463, + "eval_rewards/rejected": -6.602809906005859, + "eval_runtime": 299.9162, + "eval_samples_per_second": 6.669, + "eval_steps_per_second": 0.417, + "step": 4200 + }, + { + "epoch": 2.17, + "learning_rate": 1.5308854465480971e-07, + "logits/chosen": -2.7112841606140137, + "logits/rejected": -2.6639649868011475, + "logps/chosen": -277.16107177734375, + "logps/rejected": -296.5805358886719, + "loss": 0.0116, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.49038490653038025, + "rewards/margins": 8.583968162536621, + "rewards/rejected": -9.07435417175293, + "step": 4210 + }, + { + "epoch": 2.18, + "learning_rate": 1.5213233887932683e-07, + "logits/chosen": -2.64880633354187, + "logits/rejected": -2.590536594390869, + "logps/chosen": -312.7170104980469, + "logps/rejected": -328.259521484375, + "loss": 0.0065, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.9708864092826843, + "rewards/margins": 8.74626350402832, + "rewards/rejected": -9.71714973449707, + "step": 4220 + }, + { + "epoch": 2.18, + "learning_rate": 1.5117613310384395e-07, + "logits/chosen": -2.682318925857544, + "logits/rejected": -2.6444616317749023, + "logps/chosen": -293.54766845703125, + "logps/rejected": -340.8854064941406, + "loss": 0.0091, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.087975263595581, + "rewards/margins": 10.280842781066895, + "rewards/rejected": -11.368818283081055, + "step": 4230 + }, + { + "epoch": 2.19, + "learning_rate": 1.5021992732836106e-07, + "logits/chosen": -2.6920106410980225, + "logits/rejected": -2.6487114429473877, + "logps/chosen": -290.64080810546875, + "logps/rejected": -374.7381896972656, + "loss": 0.0144, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0875754356384277, + "rewards/margins": 10.12022590637207, + "rewards/rejected": -11.207801818847656, + "step": 4240 + }, + { + "epoch": 2.19, + "learning_rate": 1.4926372155287818e-07, + "logits/chosen": -2.6744678020477295, + "logits/rejected": -2.684854030609131, + "logps/chosen": -235.4513397216797, + "logps/rejected": -328.29443359375, + "loss": 0.0169, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1996930837631226, + "rewards/margins": 9.253260612487793, + "rewards/rejected": -10.452953338623047, + "step": 4250 + }, + { + "epoch": 2.2, + "learning_rate": 1.483075157773953e-07, + "logits/chosen": -2.7144248485565186, + "logits/rejected": -2.6114330291748047, + "logps/chosen": -287.950927734375, + "logps/rejected": -306.99176025390625, + "loss": 0.0094, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.7344351410865784, + "rewards/margins": 8.17457389831543, + "rewards/rejected": -8.909008979797363, + "step": 4260 + }, + { + "epoch": 2.2, + "learning_rate": 1.4735131000191238e-07, + "logits/chosen": -2.6291093826293945, + "logits/rejected": -2.6125636100769043, + "logps/chosen": -247.571533203125, + "logps/rejected": -277.3638610839844, + "loss": 0.0163, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6242862939834595, + "rewards/margins": 8.818742752075195, + "rewards/rejected": -9.443029403686523, + "step": 4270 + }, + { + "epoch": 2.21, + "learning_rate": 1.4639510422642952e-07, + "logits/chosen": -2.675595283508301, + "logits/rejected": -2.637773275375366, + "logps/chosen": -250.6396942138672, + "logps/rejected": -305.4952697753906, + "loss": 0.0099, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2030136585235596, + "rewards/margins": 8.31075668334961, + "rewards/rejected": -9.513771057128906, + "step": 4280 + }, + { + "epoch": 2.21, + "learning_rate": 1.4543889845094664e-07, + "logits/chosen": -2.618609666824341, + "logits/rejected": -2.6568849086761475, + "logps/chosen": -308.46539306640625, + "logps/rejected": -380.2574157714844, + "loss": 0.0109, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23853091895580292, + "rewards/margins": 10.959972381591797, + "rewards/rejected": -11.198503494262695, + "step": 4290 + }, + { + "epoch": 2.22, + "learning_rate": 1.4448269267546376e-07, + "logits/chosen": -2.652076244354248, + "logits/rejected": -2.6042404174804688, + "logps/chosen": -331.9740295410156, + "logps/rejected": -353.1656494140625, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22926807403564453, + "rewards/margins": 10.5140962600708, + "rewards/rejected": -10.284828186035156, + "step": 4300 + }, + { + "epoch": 2.22, + "eval_logits/chosen": -2.6678168773651123, + "eval_logits/rejected": -2.619150400161743, + "eval_logps/chosen": -308.19952392578125, + "eval_logps/rejected": -297.47637939453125, + "eval_loss": 0.6742202639579773, + "eval_rewards/accuracies": 0.7559999823570251, + "eval_rewards/chosen": -3.704840898513794, + "eval_rewards/margins": 3.194397449493408, + "eval_rewards/rejected": -6.899238586425781, + "eval_runtime": 300.3339, + "eval_samples_per_second": 6.659, + "eval_steps_per_second": 0.416, + "step": 4300 + }, + { + "epoch": 2.23, + "learning_rate": 1.4352648689998087e-07, + "logits/chosen": -2.7097830772399902, + "logits/rejected": -2.593984365463257, + "logps/chosen": -259.73486328125, + "logps/rejected": -311.3151550292969, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.753538966178894, + "rewards/margins": 9.248102188110352, + "rewards/rejected": -10.001642227172852, + "step": 4310 + }, + { + "epoch": 2.23, + "learning_rate": 1.42570281124498e-07, + "logits/chosen": -2.680701494216919, + "logits/rejected": -2.6096949577331543, + "logps/chosen": -305.2640075683594, + "logps/rejected": -359.37835693359375, + "loss": 0.0159, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4520825743675232, + "rewards/margins": 9.839849472045898, + "rewards/rejected": -10.2919340133667, + "step": 4320 + }, + { + "epoch": 2.24, + "learning_rate": 1.416140753490151e-07, + "logits/chosen": -2.6360256671905518, + "logits/rejected": -2.669020414352417, + "logps/chosen": -304.3402099609375, + "logps/rejected": -366.10369873046875, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6026782989501953, + "rewards/margins": 10.772412300109863, + "rewards/rejected": -11.375089645385742, + "step": 4330 + }, + { + "epoch": 2.24, + "learning_rate": 1.4065786957353222e-07, + "logits/chosen": -2.6518890857696533, + "logits/rejected": -2.6714396476745605, + "logps/chosen": -288.86322021484375, + "logps/rejected": -343.5704040527344, + "loss": 0.0143, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.0478026866912842, + "rewards/margins": 9.44815731048584, + "rewards/rejected": -10.495959281921387, + "step": 4340 + }, + { + "epoch": 2.25, + "learning_rate": 1.3970166379804933e-07, + "logits/chosen": -2.6951656341552734, + "logits/rejected": -2.616739273071289, + "logps/chosen": -326.5705261230469, + "logps/rejected": -336.5369873046875, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3368417024612427, + "rewards/margins": 9.160501480102539, + "rewards/rejected": -10.497343063354492, + "step": 4350 + }, + { + "epoch": 2.25, + "learning_rate": 1.3874545802256645e-07, + "logits/chosen": -2.711392641067505, + "logits/rejected": -2.6601951122283936, + "logps/chosen": -275.40130615234375, + "logps/rejected": -315.70989990234375, + "loss": 0.0104, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0467510223388672, + "rewards/margins": 9.312707901000977, + "rewards/rejected": -10.359457969665527, + "step": 4360 + }, + { + "epoch": 2.26, + "learning_rate": 1.3778925224708357e-07, + "logits/chosen": -2.5673935413360596, + "logits/rejected": -2.5685951709747314, + "logps/chosen": -266.0102233886719, + "logps/rejected": -328.2872009277344, + "loss": 0.0068, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.188098669052124, + "rewards/margins": 9.597440719604492, + "rewards/rejected": -10.785538673400879, + "step": 4370 + }, + { + "epoch": 2.26, + "learning_rate": 1.3683304647160068e-07, + "logits/chosen": -2.599808931350708, + "logits/rejected": -2.571666717529297, + "logps/chosen": -257.6767272949219, + "logps/rejected": -352.75115966796875, + "loss": 0.0106, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1264151334762573, + "rewards/margins": 10.437751770019531, + "rewards/rejected": -11.564168930053711, + "step": 4380 + }, + { + "epoch": 2.27, + "learning_rate": 1.358768406961178e-07, + "logits/chosen": -2.595768928527832, + "logits/rejected": -2.6167728900909424, + "logps/chosen": -254.9929656982422, + "logps/rejected": -339.8115234375, + "loss": 0.0085, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3337209224700928, + "rewards/margins": 10.193733215332031, + "rewards/rejected": -11.527453422546387, + "step": 4390 + }, + { + "epoch": 2.27, + "learning_rate": 1.349206349206349e-07, + "logits/chosen": -2.634752035140991, + "logits/rejected": -2.6203811168670654, + "logps/chosen": -250.7345733642578, + "logps/rejected": -308.99835205078125, + "loss": 0.018, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.247837781906128, + "rewards/margins": 8.695775985717773, + "rewards/rejected": -10.943613052368164, + "step": 4400 + }, + { + "epoch": 2.27, + "eval_logits/chosen": -2.645358085632324, + "eval_logits/rejected": -2.5974912643432617, + "eval_logps/chosen": -312.79296875, + "eval_logps/rejected": -303.3212585449219, + "eval_loss": 0.6981696486473083, + "eval_rewards/accuracies": 0.7680000066757202, + "eval_rewards/chosen": -4.164183616638184, + "eval_rewards/margins": 3.319542169570923, + "eval_rewards/rejected": -7.483725070953369, + "eval_runtime": 300.4701, + "eval_samples_per_second": 6.656, + "eval_steps_per_second": 0.416, + "step": 4400 + }, + { + "epoch": 2.28, + "learning_rate": 1.3396442914515203e-07, + "logits/chosen": -2.5926036834716797, + "logits/rejected": -2.5172619819641113, + "logps/chosen": -292.2915344238281, + "logps/rejected": -357.282958984375, + "loss": 0.0094, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.6816883087158203, + "rewards/margins": 10.120698928833008, + "rewards/rejected": -11.802387237548828, + "step": 4410 + }, + { + "epoch": 2.28, + "learning_rate": 1.3300822336966917e-07, + "logits/chosen": -2.655643939971924, + "logits/rejected": -2.5098280906677246, + "logps/chosen": -315.7123107910156, + "logps/rejected": -347.8288879394531, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6822350025177002, + "rewards/margins": 11.133058547973633, + "rewards/rejected": -11.815293312072754, + "step": 4420 + }, + { + "epoch": 2.29, + "learning_rate": 1.3205201759418626e-07, + "logits/chosen": -2.5708107948303223, + "logits/rejected": -2.4846396446228027, + "logps/chosen": -311.37249755859375, + "logps/rejected": -330.8601989746094, + "loss": 0.0094, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6858748197555542, + "rewards/margins": 9.757244110107422, + "rewards/rejected": -10.44311809539795, + "step": 4430 + }, + { + "epoch": 2.29, + "learning_rate": 1.3109581181870338e-07, + "logits/chosen": -2.7022705078125, + "logits/rejected": -2.6619653701782227, + "logps/chosen": -325.13134765625, + "logps/rejected": -335.73382568359375, + "loss": 0.0094, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.3868844509124756, + "rewards/margins": 10.037075996398926, + "rewards/rejected": -10.423959732055664, + "step": 4440 + }, + { + "epoch": 2.3, + "learning_rate": 1.301396060432205e-07, + "logits/chosen": -2.6541895866394043, + "logits/rejected": -2.559623956680298, + "logps/chosen": -318.50872802734375, + "logps/rejected": -341.30523681640625, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.30984628200531, + "rewards/margins": 9.713074684143066, + "rewards/rejected": -11.022921562194824, + "step": 4450 + }, + { + "epoch": 2.3, + "learning_rate": 1.291834002677376e-07, + "logits/chosen": -2.5706753730773926, + "logits/rejected": -2.5507564544677734, + "logps/chosen": -281.9523620605469, + "logps/rejected": -352.04986572265625, + "loss": 0.0129, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5520942211151123, + "rewards/margins": 9.727930068969727, + "rewards/rejected": -11.280024528503418, + "step": 4460 + }, + { + "epoch": 2.31, + "learning_rate": 1.2822719449225472e-07, + "logits/chosen": -2.5962116718292236, + "logits/rejected": -2.560507297515869, + "logps/chosen": -223.4073944091797, + "logps/rejected": -311.93670654296875, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.894232153892517, + "rewards/margins": 9.102167129516602, + "rewards/rejected": -10.996397972106934, + "step": 4470 + }, + { + "epoch": 2.31, + "learning_rate": 1.2727098871677184e-07, + "logits/chosen": -2.677396774291992, + "logits/rejected": -2.623908519744873, + "logps/chosen": -348.23760986328125, + "logps/rejected": -390.755859375, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8249503374099731, + "rewards/margins": 10.461699485778809, + "rewards/rejected": -11.286649703979492, + "step": 4480 + }, + { + "epoch": 2.32, + "learning_rate": 1.2631478294128898e-07, + "logits/chosen": -2.7189507484436035, + "logits/rejected": -2.6741905212402344, + "logps/chosen": -275.53314208984375, + "logps/rejected": -339.3215026855469, + "loss": 0.0241, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.085033655166626, + "rewards/margins": 9.075556755065918, + "rewards/rejected": -10.160591125488281, + "step": 4490 + }, + { + "epoch": 2.32, + "learning_rate": 1.253585771658061e-07, + "logits/chosen": -2.619215965270996, + "logits/rejected": -2.6242549419403076, + "logps/chosen": -251.1781005859375, + "logps/rejected": -334.31689453125, + "loss": 0.0173, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.1404932737350464, + "rewards/margins": 8.950601577758789, + "rewards/rejected": -10.091094017028809, + "step": 4500 + }, + { + "epoch": 2.32, + "eval_logits/chosen": -2.6393561363220215, + "eval_logits/rejected": -2.596702814102173, + "eval_logps/chosen": -310.2903747558594, + "eval_logps/rejected": -297.9649963378906, + "eval_loss": 0.6661145091056824, + "eval_rewards/accuracies": 0.765999972820282, + "eval_rewards/chosen": -3.9139232635498047, + "eval_rewards/margins": 3.0341811180114746, + "eval_rewards/rejected": -6.948104381561279, + "eval_runtime": 301.1722, + "eval_samples_per_second": 6.641, + "eval_steps_per_second": 0.415, + "step": 4500 + }, + { + "epoch": 2.33, + "learning_rate": 1.2440237139032319e-07, + "logits/chosen": -2.7100770473480225, + "logits/rejected": -2.663496494293213, + "logps/chosen": -296.59197998046875, + "logps/rejected": -317.30499267578125, + "loss": 0.0107, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1978495121002197, + "rewards/margins": 9.186239242553711, + "rewards/rejected": -10.384088516235352, + "step": 4510 + }, + { + "epoch": 2.33, + "learning_rate": 1.234461656148403e-07, + "logits/chosen": -2.6586012840270996, + "logits/rejected": -2.6368064880371094, + "logps/chosen": -323.82513427734375, + "logps/rejected": -360.9985046386719, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9345844984054565, + "rewards/margins": 10.220632553100586, + "rewards/rejected": -11.155217170715332, + "step": 4520 + }, + { + "epoch": 2.34, + "learning_rate": 1.2248995983935742e-07, + "logits/chosen": -2.680159091949463, + "logits/rejected": -2.663702964782715, + "logps/chosen": -302.0219421386719, + "logps/rejected": -340.0579528808594, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9645177721977234, + "rewards/margins": 9.70927619934082, + "rewards/rejected": -10.673794746398926, + "step": 4530 + }, + { + "epoch": 2.34, + "learning_rate": 1.2153375406387456e-07, + "logits/chosen": -2.6001861095428467, + "logits/rejected": -2.597421169281006, + "logps/chosen": -296.09771728515625, + "logps/rejected": -371.7466735839844, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9812561273574829, + "rewards/margins": 10.29119873046875, + "rewards/rejected": -11.272455215454102, + "step": 4540 + }, + { + "epoch": 2.35, + "learning_rate": 1.2057754828839165e-07, + "logits/chosen": -2.639939546585083, + "logits/rejected": -2.6484570503234863, + "logps/chosen": -273.16802978515625, + "logps/rejected": -337.97332763671875, + "loss": 0.0118, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.728509247303009, + "rewards/margins": 9.519789695739746, + "rewards/rejected": -10.248300552368164, + "step": 4550 + }, + { + "epoch": 2.35, + "learning_rate": 1.1962134251290876e-07, + "logits/chosen": -2.6198954582214355, + "logits/rejected": -2.5811574459075928, + "logps/chosen": -298.90081787109375, + "logps/rejected": -304.1781311035156, + "loss": 0.0194, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.5390739440917969, + "rewards/margins": 8.423462867736816, + "rewards/rejected": -9.962536811828613, + "step": 4560 + }, + { + "epoch": 2.36, + "learning_rate": 1.1866513673742588e-07, + "logits/chosen": -2.658489465713501, + "logits/rejected": -2.6300225257873535, + "logps/chosen": -298.3923034667969, + "logps/rejected": -292.21429443359375, + "loss": 0.0205, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.46941104531288147, + "rewards/margins": 8.323554992675781, + "rewards/rejected": -8.79296588897705, + "step": 4570 + }, + { + "epoch": 2.36, + "learning_rate": 1.1770893096194301e-07, + "logits/chosen": -2.557232141494751, + "logits/rejected": -2.510739326477051, + "logps/chosen": -286.90557861328125, + "logps/rejected": -323.8365173339844, + "loss": 0.0091, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8109794855117798, + "rewards/margins": 9.279717445373535, + "rewards/rejected": -10.090696334838867, + "step": 4580 + }, + { + "epoch": 2.37, + "learning_rate": 1.1675272518646012e-07, + "logits/chosen": -2.633720636367798, + "logits/rejected": -2.6340861320495605, + "logps/chosen": -241.53549194335938, + "logps/rejected": -319.45574951171875, + "loss": 0.0127, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45409899950027466, + "rewards/margins": 8.88463020324707, + "rewards/rejected": -9.338728904724121, + "step": 4590 + }, + { + "epoch": 2.37, + "learning_rate": 1.1579651941097724e-07, + "logits/chosen": -2.6740808486938477, + "logits/rejected": -2.641120672225952, + "logps/chosen": -316.9101257324219, + "logps/rejected": -327.01593017578125, + "loss": 0.011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7234556674957275, + "rewards/margins": 9.027227401733398, + "rewards/rejected": -9.75068187713623, + "step": 4600 + }, + { + "epoch": 2.37, + "eval_logits/chosen": -2.606816291809082, + "eval_logits/rejected": -2.5628256797790527, + "eval_logps/chosen": -308.2720642089844, + "eval_logps/rejected": -296.76300048828125, + "eval_loss": 0.6605738997459412, + "eval_rewards/accuracies": 0.7639999985694885, + "eval_rewards/chosen": -3.712094306945801, + "eval_rewards/margins": 3.1158080101013184, + "eval_rewards/rejected": -6.827902317047119, + "eval_runtime": 300.7538, + "eval_samples_per_second": 6.65, + "eval_steps_per_second": 0.416, + "step": 4600 + }, + { + "epoch": 2.38, + "learning_rate": 1.1484031363549436e-07, + "logits/chosen": -2.508571147918701, + "logits/rejected": -2.490762710571289, + "logps/chosen": -304.9242248535156, + "logps/rejected": -335.79913330078125, + "loss": 0.0131, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -1.0555249452590942, + "rewards/margins": 9.724864959716797, + "rewards/rejected": -10.780390739440918, + "step": 4610 + }, + { + "epoch": 2.39, + "learning_rate": 1.1388410786001147e-07, + "logits/chosen": -2.606189012527466, + "logits/rejected": -2.6098804473876953, + "logps/chosen": -260.5215759277344, + "logps/rejected": -345.4276123046875, + "loss": 0.0135, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.055306315422058, + "rewards/margins": 8.989636421203613, + "rewards/rejected": -10.044942855834961, + "step": 4620 + }, + { + "epoch": 2.39, + "learning_rate": 1.1292790208452859e-07, + "logits/chosen": -2.602384090423584, + "logits/rejected": -2.5431814193725586, + "logps/chosen": -231.57400512695312, + "logps/rejected": -317.2869567871094, + "loss": 0.013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7374894618988037, + "rewards/margins": 9.130033493041992, + "rewards/rejected": -9.867525100708008, + "step": 4630 + }, + { + "epoch": 2.4, + "learning_rate": 1.119716963090457e-07, + "logits/chosen": -2.5372633934020996, + "logits/rejected": -2.529026985168457, + "logps/chosen": -253.7849884033203, + "logps/rejected": -338.38812255859375, + "loss": 0.0085, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3635234832763672, + "rewards/margins": 9.181906700134277, + "rewards/rejected": -10.545430183410645, + "step": 4640 + }, + { + "epoch": 2.4, + "learning_rate": 1.1101549053356282e-07, + "logits/chosen": -2.589287757873535, + "logits/rejected": -2.546631336212158, + "logps/chosen": -292.82049560546875, + "logps/rejected": -336.3973083496094, + "loss": 0.0138, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0994226932525635, + "rewards/margins": 9.20947265625, + "rewards/rejected": -10.308894157409668, + "step": 4650 + }, + { + "epoch": 2.41, + "learning_rate": 1.1005928475807993e-07, + "logits/chosen": -2.5391857624053955, + "logits/rejected": -2.514249324798584, + "logps/chosen": -240.534912109375, + "logps/rejected": -316.34271240234375, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.430264949798584, + "rewards/margins": 8.712211608886719, + "rewards/rejected": -10.142476081848145, + "step": 4660 + }, + { + "epoch": 2.41, + "learning_rate": 1.0910307898259705e-07, + "logits/chosen": -2.475175380706787, + "logits/rejected": -2.4967644214630127, + "logps/chosen": -259.62890625, + "logps/rejected": -365.22540283203125, + "loss": 0.016, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9143625497817993, + "rewards/margins": 9.281826972961426, + "rewards/rejected": -10.196188926696777, + "step": 4670 + }, + { + "epoch": 2.42, + "learning_rate": 1.0814687320711418e-07, + "logits/chosen": -2.4412999153137207, + "logits/rejected": -2.4557924270629883, + "logps/chosen": -230.18643188476562, + "logps/rejected": -358.99578857421875, + "loss": 0.01, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.9413540959358215, + "rewards/margins": 10.510141372680664, + "rewards/rejected": -11.451495170593262, + "step": 4680 + }, + { + "epoch": 2.42, + "learning_rate": 1.0719066743163128e-07, + "logits/chosen": -2.5392403602600098, + "logits/rejected": -2.531977415084839, + "logps/chosen": -321.71453857421875, + "logps/rejected": -349.1875305175781, + "loss": 0.0168, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.1652792692184448, + "rewards/margins": 9.923773765563965, + "rewards/rejected": -11.089054107666016, + "step": 4690 + }, + { + "epoch": 2.43, + "learning_rate": 1.062344616561484e-07, + "logits/chosen": -2.553591251373291, + "logits/rejected": -2.520071268081665, + "logps/chosen": -287.4483642578125, + "logps/rejected": -355.64996337890625, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7835440039634705, + "rewards/margins": 10.432835578918457, + "rewards/rejected": -11.216379165649414, + "step": 4700 + }, + { + "epoch": 2.43, + "eval_logits/chosen": -2.561335563659668, + "eval_logits/rejected": -2.5127322673797607, + "eval_logps/chosen": -310.2392883300781, + "eval_logps/rejected": -300.09649658203125, + "eval_loss": 0.6704944968223572, + "eval_rewards/accuracies": 0.7680000066757202, + "eval_rewards/chosen": -3.908813238143921, + "eval_rewards/margins": 3.2524404525756836, + "eval_rewards/rejected": -7.161253929138184, + "eval_runtime": 300.5289, + "eval_samples_per_second": 6.655, + "eval_steps_per_second": 0.416, + "step": 4700 + }, + { + "epoch": 2.43, + "learning_rate": 1.0527825588066551e-07, + "logits/chosen": -2.5191619396209717, + "logits/rejected": -2.5092837810516357, + "logps/chosen": -290.0946350097656, + "logps/rejected": -342.290771484375, + "loss": 0.0103, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.5244763493537903, + "rewards/margins": 11.548242568969727, + "rewards/rejected": -12.07271957397461, + "step": 4710 + }, + { + "epoch": 2.44, + "learning_rate": 1.0432205010518264e-07, + "logits/chosen": -2.528475522994995, + "logits/rejected": -2.5448966026306152, + "logps/chosen": -250.07003784179688, + "logps/rejected": -357.0660400390625, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9180407524108887, + "rewards/margins": 10.637187004089355, + "rewards/rejected": -11.555229187011719, + "step": 4720 + }, + { + "epoch": 2.44, + "learning_rate": 1.0336584432969974e-07, + "logits/chosen": -2.4682259559631348, + "logits/rejected": -2.4316627979278564, + "logps/chosen": -244.00253295898438, + "logps/rejected": -322.43280029296875, + "loss": 0.0116, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.7326822280883789, + "rewards/margins": 9.885066032409668, + "rewards/rejected": -10.617748260498047, + "step": 4730 + }, + { + "epoch": 2.45, + "learning_rate": 1.0240963855421686e-07, + "logits/chosen": -2.4702906608581543, + "logits/rejected": -2.4408233165740967, + "logps/chosen": -328.27734375, + "logps/rejected": -347.71484375, + "loss": 0.0091, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8403548002243042, + "rewards/margins": 9.786865234375, + "rewards/rejected": -10.62722110748291, + "step": 4740 + }, + { + "epoch": 2.45, + "learning_rate": 1.0145343277873399e-07, + "logits/chosen": -2.5148329734802246, + "logits/rejected": -2.5080223083496094, + "logps/chosen": -317.0351867675781, + "logps/rejected": -354.51104736328125, + "loss": 0.0106, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.0553228855133057, + "rewards/margins": 9.899486541748047, + "rewards/rejected": -10.95481014251709, + "step": 4750 + }, + { + "epoch": 2.46, + "learning_rate": 1.004972270032511e-07, + "logits/chosen": -2.392381191253662, + "logits/rejected": -2.311124086380005, + "logps/chosen": -252.53857421875, + "logps/rejected": -287.5732727050781, + "loss": 0.012, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.5313899517059326, + "rewards/margins": 9.025603294372559, + "rewards/rejected": -10.55699348449707, + "step": 4760 + }, + { + "epoch": 2.46, + "learning_rate": 9.95410212277682e-08, + "logits/chosen": -2.542147397994995, + "logits/rejected": -2.5188517570495605, + "logps/chosen": -286.36468505859375, + "logps/rejected": -352.60162353515625, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2542285919189453, + "rewards/margins": 10.117773056030273, + "rewards/rejected": -11.372002601623535, + "step": 4770 + }, + { + "epoch": 2.47, + "learning_rate": 9.858481545228532e-08, + "logits/chosen": -2.5597620010375977, + "logits/rejected": -2.46887469291687, + "logps/chosen": -299.46929931640625, + "logps/rejected": -323.3019104003906, + "loss": 0.0087, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.2001690864562988, + "rewards/margins": 9.905837059020996, + "rewards/rejected": -11.10600757598877, + "step": 4780 + }, + { + "epoch": 2.47, + "learning_rate": 9.762860967680245e-08, + "logits/chosen": -2.473726749420166, + "logits/rejected": -2.386307954788208, + "logps/chosen": -299.240478515625, + "logps/rejected": -367.3982849121094, + "loss": 0.0081, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.1765098571777344, + "rewards/margins": 10.347665786743164, + "rewards/rejected": -12.524175643920898, + "step": 4790 + }, + { + "epoch": 2.48, + "learning_rate": 9.667240390131957e-08, + "logits/chosen": -2.590724229812622, + "logits/rejected": -2.485705614089966, + "logps/chosen": -312.03411865234375, + "logps/rejected": -355.8626403808594, + "loss": 0.0099, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.032738447189331, + "rewards/margins": 10.802017211914062, + "rewards/rejected": -11.834755897521973, + "step": 4800 + }, + { + "epoch": 2.48, + "eval_logits/chosen": -2.5658154487609863, + "eval_logits/rejected": -2.516890048980713, + "eval_logps/chosen": -310.987548828125, + "eval_logps/rejected": -301.03643798828125, + "eval_loss": 0.6825354099273682, + "eval_rewards/accuracies": 0.7720000147819519, + "eval_rewards/chosen": -3.9836413860321045, + "eval_rewards/margins": 3.27160382270813, + "eval_rewards/rejected": -7.255245208740234, + "eval_runtime": 301.6322, + "eval_samples_per_second": 6.631, + "eval_steps_per_second": 0.414, + "step": 4800 + }, + { + "epoch": 2.48, + "learning_rate": 9.571619812583667e-08, + "logits/chosen": -2.4577534198760986, + "logits/rejected": -2.455472946166992, + "logps/chosen": -291.69390869140625, + "logps/rejected": -345.99505615234375, + "loss": 0.0141, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.8659383654594421, + "rewards/margins": 9.381586074829102, + "rewards/rejected": -10.24752426147461, + "step": 4810 + }, + { + "epoch": 2.49, + "learning_rate": 9.47599923503538e-08, + "logits/chosen": -2.563683032989502, + "logits/rejected": -2.500837802886963, + "logps/chosen": -276.25091552734375, + "logps/rejected": -345.19488525390625, + "loss": 0.0231, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.059242606163025, + "rewards/margins": 9.535378456115723, + "rewards/rejected": -10.594621658325195, + "step": 4820 + }, + { + "epoch": 2.49, + "learning_rate": 9.380378657487091e-08, + "logits/chosen": -2.5754013061523438, + "logits/rejected": -2.565793037414551, + "logps/chosen": -277.8792419433594, + "logps/rejected": -331.2759094238281, + "loss": 0.0139, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.173643946647644, + "rewards/margins": 9.848978042602539, + "rewards/rejected": -11.022623062133789, + "step": 4830 + }, + { + "epoch": 2.5, + "learning_rate": 9.284758079938803e-08, + "logits/chosen": -2.5554699897766113, + "logits/rejected": -2.4842638969421387, + "logps/chosen": -283.8736572265625, + "logps/rejected": -321.87579345703125, + "loss": 0.0086, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.062213659286499, + "rewards/margins": 9.168981552124023, + "rewards/rejected": -10.231194496154785, + "step": 4840 + }, + { + "epoch": 2.5, + "learning_rate": 9.189137502390513e-08, + "logits/chosen": -2.5778510570526123, + "logits/rejected": -2.5440096855163574, + "logps/chosen": -286.9813537597656, + "logps/rejected": -396.63922119140625, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4980794191360474, + "rewards/margins": 9.38342571258545, + "rewards/rejected": -10.88150405883789, + "step": 4850 + }, + { + "epoch": 2.51, + "learning_rate": 9.093516924842226e-08, + "logits/chosen": -2.5628631114959717, + "logits/rejected": -2.538412570953369, + "logps/chosen": -267.45501708984375, + "logps/rejected": -355.0173034667969, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.669926643371582, + "rewards/margins": 10.511675834655762, + "rewards/rejected": -11.181602478027344, + "step": 4860 + }, + { + "epoch": 2.51, + "learning_rate": 8.997896347293938e-08, + "logits/chosen": -2.601468563079834, + "logits/rejected": -2.582242488861084, + "logps/chosen": -285.9410705566406, + "logps/rejected": -378.1112976074219, + "loss": 0.0155, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.9583877325057983, + "rewards/margins": 9.609073638916016, + "rewards/rejected": -10.567461013793945, + "step": 4870 + }, + { + "epoch": 2.52, + "learning_rate": 8.902275769745648e-08, + "logits/chosen": -2.5055062770843506, + "logits/rejected": -2.5199389457702637, + "logps/chosen": -252.2093505859375, + "logps/rejected": -352.6201477050781, + "loss": 0.0115, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.0273876190185547, + "rewards/margins": 10.693878173828125, + "rewards/rejected": -11.72126579284668, + "step": 4880 + }, + { + "epoch": 2.52, + "learning_rate": 8.806655192197361e-08, + "logits/chosen": -2.4830613136291504, + "logits/rejected": -2.5039305686950684, + "logps/chosen": -244.4785919189453, + "logps/rejected": -365.1302795410156, + "loss": 0.0077, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.3728035092353821, + "rewards/margins": 11.747212409973145, + "rewards/rejected": -12.120016098022461, + "step": 4890 + }, + { + "epoch": 2.53, + "learning_rate": 8.711034614649072e-08, + "logits/chosen": -2.563028573989868, + "logits/rejected": -2.5216782093048096, + "logps/chosen": -270.1450500488281, + "logps/rejected": -354.80352783203125, + "loss": 0.0106, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.764126181602478, + "rewards/margins": 10.060375213623047, + "rewards/rejected": -11.824501991271973, + "step": 4900 + }, + { + "epoch": 2.53, + "eval_logits/chosen": -2.5843636989593506, + "eval_logits/rejected": -2.5330135822296143, + "eval_logps/chosen": -313.6849365234375, + "eval_logps/rejected": -306.07098388671875, + "eval_loss": 0.6937812566757202, + "eval_rewards/accuracies": 0.765999972820282, + "eval_rewards/chosen": -4.253377914428711, + "eval_rewards/margins": 3.5053179264068604, + "eval_rewards/rejected": -7.758696556091309, + "eval_runtime": 300.2355, + "eval_samples_per_second": 6.661, + "eval_steps_per_second": 0.416, + "step": 4900 + }, + { + "epoch": 2.53, + "learning_rate": 8.615414037100784e-08, + "logits/chosen": -2.5201663970947266, + "logits/rejected": -2.455021381378174, + "logps/chosen": -345.52008056640625, + "logps/rejected": -342.25579833984375, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6654549241065979, + "rewards/margins": 10.094911575317383, + "rewards/rejected": -10.76036548614502, + "step": 4910 + }, + { + "epoch": 2.54, + "learning_rate": 8.519793459552494e-08, + "logits/chosen": -2.5423104763031006, + "logits/rejected": -2.4734034538269043, + "logps/chosen": -264.29180908203125, + "logps/rejected": -338.65118408203125, + "loss": 0.0268, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.032165288925171, + "rewards/margins": 10.210509300231934, + "rewards/rejected": -11.242673873901367, + "step": 4920 + }, + { + "epoch": 2.55, + "learning_rate": 8.424172882004207e-08, + "logits/chosen": -2.5510902404785156, + "logits/rejected": -2.5576791763305664, + "logps/chosen": -282.1768798828125, + "logps/rejected": -373.37408447265625, + "loss": 0.0322, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5426809787750244, + "rewards/margins": 10.031122207641602, + "rewards/rejected": -10.57380485534668, + "step": 4930 + }, + { + "epoch": 2.55, + "learning_rate": 8.328552304455919e-08, + "logits/chosen": -2.5924887657165527, + "logits/rejected": -2.5332224369049072, + "logps/chosen": -251.1088409423828, + "logps/rejected": -318.834228515625, + "loss": 0.0142, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.5188874006271362, + "rewards/margins": 9.845662117004395, + "rewards/rejected": -10.36454963684082, + "step": 4940 + }, + { + "epoch": 2.56, + "learning_rate": 8.23293172690763e-08, + "logits/chosen": -2.5838980674743652, + "logits/rejected": -2.5227537155151367, + "logps/chosen": -282.2577819824219, + "logps/rejected": -345.3276672363281, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00644490122795105, + "rewards/margins": 11.34768295288086, + "rewards/rejected": -11.354127883911133, + "step": 4950 + }, + { + "epoch": 2.56, + "learning_rate": 8.137311149359343e-08, + "logits/chosen": -2.6914446353912354, + "logits/rejected": -2.566591739654541, + "logps/chosen": -332.66265869140625, + "logps/rejected": -341.6344299316406, + "loss": 0.0042, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.37903502583503723, + "rewards/margins": 11.008275985717773, + "rewards/rejected": -11.387311935424805, + "step": 4960 + }, + { + "epoch": 2.57, + "learning_rate": 8.041690571811053e-08, + "logits/chosen": -2.585280418395996, + "logits/rejected": -2.60071063041687, + "logps/chosen": -288.63519287109375, + "logps/rejected": -351.8276672363281, + "loss": 0.0299, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.1368919610977173, + "rewards/margins": 11.00975227355957, + "rewards/rejected": -12.146644592285156, + "step": 4970 + }, + { + "epoch": 2.57, + "learning_rate": 7.946069994262765e-08, + "logits/chosen": -2.5482683181762695, + "logits/rejected": -2.5048727989196777, + "logps/chosen": -263.3780517578125, + "logps/rejected": -324.5472717285156, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8058077096939087, + "rewards/margins": 10.41413688659668, + "rewards/rejected": -11.219945907592773, + "step": 4980 + }, + { + "epoch": 2.58, + "learning_rate": 7.850449416714476e-08, + "logits/chosen": -2.600778102874756, + "logits/rejected": -2.5648064613342285, + "logps/chosen": -305.68170166015625, + "logps/rejected": -346.5186462402344, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.30596125125885, + "rewards/margins": 9.398767471313477, + "rewards/rejected": -10.704729080200195, + "step": 4990 + }, + { + "epoch": 2.58, + "learning_rate": 7.754828839166188e-08, + "logits/chosen": -2.571073055267334, + "logits/rejected": -2.5455102920532227, + "logps/chosen": -278.08050537109375, + "logps/rejected": -354.3592224121094, + "loss": 0.0106, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2086985111236572, + "rewards/margins": 9.833106994628906, + "rewards/rejected": -11.041807174682617, + "step": 5000 + }, + { + "epoch": 2.58, + "eval_logits/chosen": -2.582568645477295, + "eval_logits/rejected": -2.5329596996307373, + "eval_logps/chosen": -314.1288146972656, + "eval_logps/rejected": -306.4033508300781, + "eval_loss": 0.6948726773262024, + "eval_rewards/accuracies": 0.765999972820282, + "eval_rewards/chosen": -4.29776668548584, + "eval_rewards/margins": 3.4941699504852295, + "eval_rewards/rejected": -7.79193639755249, + "eval_runtime": 298.8189, + "eval_samples_per_second": 6.693, + "eval_steps_per_second": 0.418, + "step": 5000 + }, + { + "epoch": 2.59, + "learning_rate": 7.6592082616179e-08, + "logits/chosen": -2.5867271423339844, + "logits/rejected": -2.5527195930480957, + "logps/chosen": -324.69915771484375, + "logps/rejected": -377.70257568359375, + "loss": 0.0118, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.7334973812103271, + "rewards/margins": 10.30841064453125, + "rewards/rejected": -12.041908264160156, + "step": 5010 + }, + { + "epoch": 2.59, + "learning_rate": 7.563587684069611e-08, + "logits/chosen": -2.5030980110168457, + "logits/rejected": -2.520484447479248, + "logps/chosen": -287.6493225097656, + "logps/rejected": -358.873291015625, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5187357068061829, + "rewards/margins": 10.844244003295898, + "rewards/rejected": -11.362979888916016, + "step": 5020 + }, + { + "epoch": 2.6, + "learning_rate": 7.467967106521324e-08, + "logits/chosen": -2.5578770637512207, + "logits/rejected": -2.527339458465576, + "logps/chosen": -257.82684326171875, + "logps/rejected": -265.83624267578125, + "loss": 0.009, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0698001384735107, + "rewards/margins": 8.80046272277832, + "rewards/rejected": -9.870262145996094, + "step": 5030 + }, + { + "epoch": 2.6, + "learning_rate": 7.372346528973034e-08, + "logits/chosen": -2.610701322555542, + "logits/rejected": -2.526312828063965, + "logps/chosen": -296.7576904296875, + "logps/rejected": -331.4477233886719, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9005954265594482, + "rewards/margins": 9.4951171875, + "rewards/rejected": -11.395713806152344, + "step": 5040 + }, + { + "epoch": 2.61, + "learning_rate": 7.276725951424746e-08, + "logits/chosen": -2.4974420070648193, + "logits/rejected": -2.4610750675201416, + "logps/chosen": -270.6094970703125, + "logps/rejected": -348.76385498046875, + "loss": 0.0153, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.7320085763931274, + "rewards/margins": 10.474514961242676, + "rewards/rejected": -12.206523895263672, + "step": 5050 + }, + { + "epoch": 2.61, + "learning_rate": 7.181105373876457e-08, + "logits/chosen": -2.512381076812744, + "logits/rejected": -2.4702210426330566, + "logps/chosen": -271.5727233886719, + "logps/rejected": -311.0246276855469, + "loss": 0.0117, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9624974131584167, + "rewards/margins": 9.830079078674316, + "rewards/rejected": -10.792574882507324, + "step": 5060 + }, + { + "epoch": 2.62, + "learning_rate": 7.08548479632817e-08, + "logits/chosen": -2.4948513507843018, + "logits/rejected": -2.4570116996765137, + "logps/chosen": -316.3506774902344, + "logps/rejected": -339.68853759765625, + "loss": 0.0133, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7037729620933533, + "rewards/margins": 10.431275367736816, + "rewards/rejected": -11.135048866271973, + "step": 5070 + }, + { + "epoch": 2.62, + "learning_rate": 6.98986421877988e-08, + "logits/chosen": -2.579561233520508, + "logits/rejected": -2.521808385848999, + "logps/chosen": -298.4128723144531, + "logps/rejected": -363.44580078125, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.250028133392334, + "rewards/margins": 9.64060115814209, + "rewards/rejected": -10.890630722045898, + "step": 5080 + }, + { + "epoch": 2.63, + "learning_rate": 6.894243641231592e-08, + "logits/chosen": -2.4587960243225098, + "logits/rejected": -2.4281344413757324, + "logps/chosen": -270.9654846191406, + "logps/rejected": -342.9233093261719, + "loss": 0.0099, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1718332767486572, + "rewards/margins": 10.706388473510742, + "rewards/rejected": -11.87822151184082, + "step": 5090 + }, + { + "epoch": 2.63, + "learning_rate": 6.798623063683305e-08, + "logits/chosen": -2.4603219032287598, + "logits/rejected": -2.3965721130371094, + "logps/chosen": -280.44354248046875, + "logps/rejected": -350.62969970703125, + "loss": 0.0099, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.22660231590271, + "rewards/margins": 11.59477424621582, + "rewards/rejected": -12.821374893188477, + "step": 5100 + }, + { + "epoch": 2.63, + "eval_logits/chosen": -2.5619547367095947, + "eval_logits/rejected": -2.509472131729126, + "eval_logps/chosen": -314.65869140625, + "eval_logps/rejected": -308.5892333984375, + "eval_loss": 0.7238790392875671, + "eval_rewards/accuracies": 0.7639999985694885, + "eval_rewards/chosen": -4.350755214691162, + "eval_rewards/margins": 3.659768581390381, + "eval_rewards/rejected": -8.010523796081543, + "eval_runtime": 300.7886, + "eval_samples_per_second": 6.649, + "eval_steps_per_second": 0.416, + "step": 5100 + }, + { + "epoch": 2.64, + "learning_rate": 6.703002486135017e-08, + "logits/chosen": -2.472276210784912, + "logits/rejected": -2.442483901977539, + "logps/chosen": -250.888671875, + "logps/rejected": -368.9104309082031, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1285686492919922, + "rewards/margins": 12.090982437133789, + "rewards/rejected": -13.219549179077148, + "step": 5110 + }, + { + "epoch": 2.64, + "learning_rate": 6.607381908586727e-08, + "logits/chosen": -2.526215076446533, + "logits/rejected": -2.428818941116333, + "logps/chosen": -290.0104675292969, + "logps/rejected": -391.9622497558594, + "loss": 0.0189, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.7901774644851685, + "rewards/margins": 11.513010025024414, + "rewards/rejected": -12.303187370300293, + "step": 5120 + }, + { + "epoch": 2.65, + "learning_rate": 6.511761331038438e-08, + "logits/chosen": -2.552222490310669, + "logits/rejected": -2.5270204544067383, + "logps/chosen": -241.63101196289062, + "logps/rejected": -325.4315490722656, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.719938039779663, + "rewards/margins": 9.429548263549805, + "rewards/rejected": -11.149487495422363, + "step": 5130 + }, + { + "epoch": 2.65, + "learning_rate": 6.416140753490151e-08, + "logits/chosen": -2.5769755840301514, + "logits/rejected": -2.5583062171936035, + "logps/chosen": -326.62939453125, + "logps/rejected": -382.25396728515625, + "loss": 0.0126, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.0803858041763306, + "rewards/margins": 10.431987762451172, + "rewards/rejected": -11.512374877929688, + "step": 5140 + }, + { + "epoch": 2.66, + "learning_rate": 6.320520175941863e-08, + "logits/chosen": -2.4850387573242188, + "logits/rejected": -2.431705951690674, + "logps/chosen": -268.9663391113281, + "logps/rejected": -328.4075622558594, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5687530040740967, + "rewards/margins": 10.12572956085205, + "rewards/rejected": -11.69448184967041, + "step": 5150 + }, + { + "epoch": 2.66, + "learning_rate": 6.224899598393573e-08, + "logits/chosen": -2.6069729328155518, + "logits/rejected": -2.563819408416748, + "logps/chosen": -326.76837158203125, + "logps/rejected": -387.1705017089844, + "loss": 0.0177, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.9824767112731934, + "rewards/margins": 10.304605484008789, + "rewards/rejected": -11.287081718444824, + "step": 5160 + }, + { + "epoch": 2.67, + "learning_rate": 6.129279020845286e-08, + "logits/chosen": -2.618408441543579, + "logits/rejected": -2.4645755290985107, + "logps/chosen": -276.08221435546875, + "logps/rejected": -314.79351806640625, + "loss": 0.0076, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.246117353439331, + "rewards/margins": 10.326366424560547, + "rewards/rejected": -11.572484016418457, + "step": 5170 + }, + { + "epoch": 2.67, + "learning_rate": 6.033658443296998e-08, + "logits/chosen": -2.59645938873291, + "logits/rejected": -2.5549280643463135, + "logps/chosen": -305.74566650390625, + "logps/rejected": -360.2120361328125, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6957218647003174, + "rewards/margins": 10.49669075012207, + "rewards/rejected": -12.192411422729492, + "step": 5180 + }, + { + "epoch": 2.68, + "learning_rate": 5.9380378657487085e-08, + "logits/chosen": -2.56986927986145, + "logits/rejected": -2.562441349029541, + "logps/chosen": -304.5476989746094, + "logps/rejected": -355.4156494140625, + "loss": 0.0057, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.3462127447128296, + "rewards/margins": 10.37061595916748, + "rewards/rejected": -11.716829299926758, + "step": 5190 + }, + { + "epoch": 2.68, + "learning_rate": 5.842417288200421e-08, + "logits/chosen": -2.591763973236084, + "logits/rejected": -2.607100486755371, + "logps/chosen": -318.5129699707031, + "logps/rejected": -368.0654602050781, + "loss": 0.0074, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.1077558994293213, + "rewards/margins": 10.874872207641602, + "rewards/rejected": -11.982629776000977, + "step": 5200 + }, + { + "epoch": 2.68, + "eval_logits/chosen": -2.5890767574310303, + "eval_logits/rejected": -2.5378119945526123, + "eval_logps/chosen": -318.5146789550781, + "eval_logps/rejected": -313.303466796875, + "eval_loss": 0.739378035068512, + "eval_rewards/accuracies": 0.765999972820282, + "eval_rewards/chosen": -4.736356258392334, + "eval_rewards/margins": 3.745591878890991, + "eval_rewards/rejected": -8.481947898864746, + "eval_runtime": 299.196, + "eval_samples_per_second": 6.685, + "eval_steps_per_second": 0.418, + "step": 5200 + }, + { + "epoch": 2.69, + "learning_rate": 5.7467967106521317e-08, + "logits/chosen": -2.5239500999450684, + "logits/rejected": -2.5438942909240723, + "logps/chosen": -223.29531860351562, + "logps/rejected": -347.07537841796875, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5479010343551636, + "rewards/margins": 10.697844505310059, + "rewards/rejected": -12.245744705200195, + "step": 5210 + }, + { + "epoch": 2.69, + "learning_rate": 5.651176133103844e-08, + "logits/chosen": -2.6476407051086426, + "logits/rejected": -2.590280771255493, + "logps/chosen": -288.53741455078125, + "logps/rejected": -378.91717529296875, + "loss": 0.0125, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.879062294960022, + "rewards/margins": 11.550715446472168, + "rewards/rejected": -12.429778099060059, + "step": 5220 + }, + { + "epoch": 2.7, + "learning_rate": 5.555555555555555e-08, + "logits/chosen": -2.51558256149292, + "logits/rejected": -2.4594240188598633, + "logps/chosen": -275.41180419921875, + "logps/rejected": -302.51751708984375, + "loss": 0.0091, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.2985538244247437, + "rewards/margins": 10.125242233276367, + "rewards/rejected": -11.423796653747559, + "step": 5230 + }, + { + "epoch": 2.71, + "learning_rate": 5.459934978007267e-08, + "logits/chosen": -2.5778565406799316, + "logits/rejected": -2.54152250289917, + "logps/chosen": -298.53887939453125, + "logps/rejected": -359.42413330078125, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5041391849517822, + "rewards/margins": 11.008560180664062, + "rewards/rejected": -12.512699127197266, + "step": 5240 + }, + { + "epoch": 2.71, + "learning_rate": 5.3643144004589786e-08, + "logits/chosen": -2.5127241611480713, + "logits/rejected": -2.4316296577453613, + "logps/chosen": -334.34368896484375, + "logps/rejected": -361.0766906738281, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4334515631198883, + "rewards/margins": 11.821561813354492, + "rewards/rejected": -12.255014419555664, + "step": 5250 + }, + { + "epoch": 2.72, + "learning_rate": 5.26869382291069e-08, + "logits/chosen": -2.5240254402160645, + "logits/rejected": -2.513014316558838, + "logps/chosen": -255.37158203125, + "logps/rejected": -314.8083190917969, + "loss": 0.0093, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.473276138305664, + "rewards/margins": 10.024567604064941, + "rewards/rejected": -11.497844696044922, + "step": 5260 + }, + { + "epoch": 2.72, + "learning_rate": 5.173073245362402e-08, + "logits/chosen": -2.5280280113220215, + "logits/rejected": -2.4819343090057373, + "logps/chosen": -300.80059814453125, + "logps/rejected": -369.16351318359375, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.161496877670288, + "rewards/margins": 9.86224365234375, + "rewards/rejected": -12.0237398147583, + "step": 5270 + }, + { + "epoch": 2.73, + "learning_rate": 5.077452667814113e-08, + "logits/chosen": -2.5500526428222656, + "logits/rejected": -2.4608266353607178, + "logps/chosen": -255.2032012939453, + "logps/rejected": -345.13336181640625, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8557249307632446, + "rewards/margins": 11.390253067016602, + "rewards/rejected": -13.245976448059082, + "step": 5280 + }, + { + "epoch": 2.73, + "learning_rate": 4.981832090265825e-08, + "logits/chosen": -2.5720067024230957, + "logits/rejected": -2.509190082550049, + "logps/chosen": -265.0337829589844, + "logps/rejected": -347.77569580078125, + "loss": 0.0043, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.3671785593032837, + "rewards/margins": 11.596514701843262, + "rewards/rejected": -12.96369457244873, + "step": 5290 + }, + { + "epoch": 2.74, + "learning_rate": 4.8862115127175364e-08, + "logits/chosen": -2.655913829803467, + "logits/rejected": -2.5226495265960693, + "logps/chosen": -314.6326599121094, + "logps/rejected": -358.582275390625, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.139702081680298, + "rewards/margins": 10.213295936584473, + "rewards/rejected": -12.352998733520508, + "step": 5300 + }, + { + "epoch": 2.74, + "eval_logits/chosen": -2.605203151702881, + "eval_logits/rejected": -2.553877353668213, + "eval_logps/chosen": -317.5019226074219, + "eval_logps/rejected": -312.4739990234375, + "eval_loss": 0.7335207462310791, + "eval_rewards/accuracies": 0.7720000147819519, + "eval_rewards/chosen": -4.635079860687256, + "eval_rewards/margins": 3.7639193534851074, + "eval_rewards/rejected": -8.39900016784668, + "eval_runtime": 297.7404, + "eval_samples_per_second": 6.717, + "eval_steps_per_second": 0.42, + "step": 5300 + }, + { + "epoch": 2.74, + "learning_rate": 4.790590935169248e-08, + "logits/chosen": -2.5588815212249756, + "logits/rejected": -2.5604608058929443, + "logps/chosen": -266.3474426269531, + "logps/rejected": -430.2942810058594, + "loss": 0.0132, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.032094717025757, + "rewards/margins": 10.764850616455078, + "rewards/rejected": -12.796945571899414, + "step": 5310 + }, + { + "epoch": 2.75, + "learning_rate": 4.69497035762096e-08, + "logits/chosen": -2.53786301612854, + "logits/rejected": -2.4553189277648926, + "logps/chosen": -265.3934326171875, + "logps/rejected": -339.6708068847656, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.617504119873047, + "rewards/margins": 10.523672103881836, + "rewards/rejected": -13.1411771774292, + "step": 5320 + }, + { + "epoch": 2.75, + "learning_rate": 4.599349780072671e-08, + "logits/chosen": -2.5566892623901367, + "logits/rejected": -2.4526925086975098, + "logps/chosen": -304.0606994628906, + "logps/rejected": -329.9059753417969, + "loss": 0.0096, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.4853826761245728, + "rewards/margins": 10.810678482055664, + "rewards/rejected": -12.296060562133789, + "step": 5330 + }, + { + "epoch": 2.76, + "learning_rate": 4.5037292025243834e-08, + "logits/chosen": -2.5405123233795166, + "logits/rejected": -2.550758123397827, + "logps/chosen": -308.7420349121094, + "logps/rejected": -389.48101806640625, + "loss": 0.0118, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.569156289100647, + "rewards/margins": 10.718561172485352, + "rewards/rejected": -12.287717819213867, + "step": 5340 + }, + { + "epoch": 2.76, + "learning_rate": 4.408108624976094e-08, + "logits/chosen": -2.648345470428467, + "logits/rejected": -2.5815377235412598, + "logps/chosen": -283.7870788574219, + "logps/rejected": -385.6964416503906, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0726970434188843, + "rewards/margins": 12.462373733520508, + "rewards/rejected": -13.535069465637207, + "step": 5350 + }, + { + "epoch": 2.77, + "learning_rate": 4.3124880474278065e-08, + "logits/chosen": -2.61507248878479, + "logits/rejected": -2.591545581817627, + "logps/chosen": -261.71783447265625, + "logps/rejected": -331.97210693359375, + "loss": 0.0211, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.4280383586883545, + "rewards/margins": 10.27877426147461, + "rewards/rejected": -11.706812858581543, + "step": 5360 + }, + { + "epoch": 2.77, + "learning_rate": 4.2168674698795174e-08, + "logits/chosen": -2.5542023181915283, + "logits/rejected": -2.4944653511047363, + "logps/chosen": -257.595947265625, + "logps/rejected": -365.1188659667969, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7011358737945557, + "rewards/margins": 10.8641357421875, + "rewards/rejected": -12.565271377563477, + "step": 5370 + }, + { + "epoch": 2.78, + "learning_rate": 4.1212468923312296e-08, + "logits/chosen": -2.4464447498321533, + "logits/rejected": -2.421036720275879, + "logps/chosen": -290.8499755859375, + "logps/rejected": -364.34796142578125, + "loss": 0.0101, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.6131134033203125, + "rewards/margins": 10.417032241821289, + "rewards/rejected": -12.030145645141602, + "step": 5380 + }, + { + "epoch": 2.78, + "learning_rate": 4.025626314782941e-08, + "logits/chosen": -2.420766592025757, + "logits/rejected": -2.430091381072998, + "logps/chosen": -284.9582824707031, + "logps/rejected": -337.31640625, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.669730544090271, + "rewards/margins": 10.3810453414917, + "rewards/rejected": -12.050777435302734, + "step": 5390 + }, + { + "epoch": 2.79, + "learning_rate": 3.930005737234653e-08, + "logits/chosen": -2.5880367755889893, + "logits/rejected": -2.573629856109619, + "logps/chosen": -262.8695373535156, + "logps/rejected": -303.2467956542969, + "loss": 0.0163, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.6934680938720703, + "rewards/margins": 9.696393966674805, + "rewards/rejected": -11.389862060546875, + "step": 5400 + }, + { + "epoch": 2.79, + "eval_logits/chosen": -2.599304437637329, + "eval_logits/rejected": -2.548959970474243, + "eval_logps/chosen": -317.89239501953125, + "eval_logps/rejected": -312.4419860839844, + "eval_loss": 0.7316961288452148, + "eval_rewards/accuracies": 0.7699999809265137, + "eval_rewards/chosen": -4.674126148223877, + "eval_rewards/margins": 3.721679925918579, + "eval_rewards/rejected": -8.395805358886719, + "eval_runtime": 300.0204, + "eval_samples_per_second": 6.666, + "eval_steps_per_second": 0.417, + "step": 5400 + }, + { + "epoch": 2.79, + "learning_rate": 3.8343851596863644e-08, + "logits/chosen": -2.597282886505127, + "logits/rejected": -2.50775146484375, + "logps/chosen": -255.15243530273438, + "logps/rejected": -311.01910400390625, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9441089630126953, + "rewards/margins": 10.98766803741455, + "rewards/rejected": -11.931777000427246, + "step": 5410 + }, + { + "epoch": 2.8, + "learning_rate": 3.738764582138076e-08, + "logits/chosen": -2.560591220855713, + "logits/rejected": -2.4639830589294434, + "logps/chosen": -329.83734130859375, + "logps/rejected": -376.46160888671875, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5904229879379272, + "rewards/margins": 11.07619857788086, + "rewards/rejected": -12.666620254516602, + "step": 5420 + }, + { + "epoch": 2.8, + "learning_rate": 3.6431440045897875e-08, + "logits/chosen": -2.515839099884033, + "logits/rejected": -2.483527421951294, + "logps/chosen": -282.5474853515625, + "logps/rejected": -376.92926025390625, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3852026462554932, + "rewards/margins": 10.924342155456543, + "rewards/rejected": -12.309544563293457, + "step": 5430 + }, + { + "epoch": 2.81, + "learning_rate": 3.547523427041499e-08, + "logits/chosen": -2.64237117767334, + "logits/rejected": -2.6202080249786377, + "logps/chosen": -293.0863952636719, + "logps/rejected": -397.6541748046875, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9685888290405273, + "rewards/margins": 12.01826286315918, + "rewards/rejected": -12.986851692199707, + "step": 5440 + }, + { + "epoch": 2.81, + "learning_rate": 3.4519028494932106e-08, + "logits/chosen": -2.4823646545410156, + "logits/rejected": -2.4257140159606934, + "logps/chosen": -298.60693359375, + "logps/rejected": -372.49212646484375, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.65126633644104, + "rewards/margins": 11.615009307861328, + "rewards/rejected": -13.266276359558105, + "step": 5450 + }, + { + "epoch": 2.82, + "learning_rate": 3.356282271944923e-08, + "logits/chosen": -2.5652999877929688, + "logits/rejected": -2.575331449508667, + "logps/chosen": -265.50970458984375, + "logps/rejected": -378.0349426269531, + "loss": 0.0133, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.2861878871917725, + "rewards/margins": 11.414156913757324, + "rewards/rejected": -12.70034408569336, + "step": 5460 + }, + { + "epoch": 2.82, + "learning_rate": 3.260661694396634e-08, + "logits/chosen": -2.6633830070495605, + "logits/rejected": -2.553030490875244, + "logps/chosen": -338.38525390625, + "logps/rejected": -344.780517578125, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.167046308517456, + "rewards/margins": 11.513765335083008, + "rewards/rejected": -12.68081283569336, + "step": 5470 + }, + { + "epoch": 2.83, + "learning_rate": 3.165041116848346e-08, + "logits/chosen": -2.5658626556396484, + "logits/rejected": -2.5562121868133545, + "logps/chosen": -277.5936584472656, + "logps/rejected": -432.04608154296875, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6860519647598267, + "rewards/margins": 11.724603652954102, + "rewards/rejected": -13.41065502166748, + "step": 5480 + }, + { + "epoch": 2.83, + "learning_rate": 3.0694205393000576e-08, + "logits/chosen": -2.554914712905884, + "logits/rejected": -2.485353469848633, + "logps/chosen": -263.68682861328125, + "logps/rejected": -348.3326110839844, + "loss": 0.0106, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5707225799560547, + "rewards/margins": 11.460761070251465, + "rewards/rejected": -12.031484603881836, + "step": 5490 + }, + { + "epoch": 2.84, + "learning_rate": 2.9737999617517688e-08, + "logits/chosen": -2.5359983444213867, + "logits/rejected": -2.471630811691284, + "logps/chosen": -306.6964416503906, + "logps/rejected": -340.04913330078125, + "loss": 0.0081, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.7643492221832275, + "rewards/margins": 10.403191566467285, + "rewards/rejected": -12.167540550231934, + "step": 5500 + }, + { + "epoch": 2.84, + "eval_logits/chosen": -2.5815768241882324, + "eval_logits/rejected": -2.5306899547576904, + "eval_logps/chosen": -320.3167419433594, + "eval_logps/rejected": -315.4290771484375, + "eval_loss": 0.7419750094413757, + "eval_rewards/accuracies": 0.7739999890327454, + "eval_rewards/chosen": -4.916560649871826, + "eval_rewards/margins": 3.777946710586548, + "eval_rewards/rejected": -8.694506645202637, + "eval_runtime": 297.0642, + "eval_samples_per_second": 6.733, + "eval_steps_per_second": 0.421, + "step": 5500 + }, + { + "epoch": 2.84, + "learning_rate": 2.8781793842034804e-08, + "logits/chosen": -2.4522650241851807, + "logits/rejected": -2.330350875854492, + "logps/chosen": -261.5328674316406, + "logps/rejected": -335.68560791015625, + "loss": 0.0164, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.278339147567749, + "rewards/margins": 10.198699951171875, + "rewards/rejected": -12.477038383483887, + "step": 5510 + }, + { + "epoch": 2.85, + "learning_rate": 2.782558806655192e-08, + "logits/chosen": -2.4790380001068115, + "logits/rejected": -2.49639630317688, + "logps/chosen": -275.767333984375, + "logps/rejected": -372.3228759765625, + "loss": 0.014, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.5172851085662842, + "rewards/margins": 10.855243682861328, + "rewards/rejected": -12.372529029846191, + "step": 5520 + }, + { + "epoch": 2.85, + "learning_rate": 2.6869382291069035e-08, + "logits/chosen": -2.5798544883728027, + "logits/rejected": -2.5143635272979736, + "logps/chosen": -305.1454772949219, + "logps/rejected": -370.1262512207031, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6373401880264282, + "rewards/margins": 10.685625076293945, + "rewards/rejected": -12.322965621948242, + "step": 5530 + }, + { + "epoch": 2.86, + "learning_rate": 2.591317651558615e-08, + "logits/chosen": -2.508338451385498, + "logits/rejected": -2.518322467803955, + "logps/chosen": -269.467529296875, + "logps/rejected": -350.46337890625, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.773882508277893, + "rewards/margins": 10.512941360473633, + "rewards/rejected": -12.286825180053711, + "step": 5540 + }, + { + "epoch": 2.87, + "learning_rate": 2.4956970740103267e-08, + "logits/chosen": -2.555595636367798, + "logits/rejected": -2.4913864135742188, + "logps/chosen": -284.4319763183594, + "logps/rejected": -388.2416076660156, + "loss": 0.0155, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.8780571222305298, + "rewards/margins": 11.021939277648926, + "rewards/rejected": -12.899995803833008, + "step": 5550 + }, + { + "epoch": 2.87, + "learning_rate": 2.4000764964620386e-08, + "logits/chosen": -2.5701186656951904, + "logits/rejected": -2.5412697792053223, + "logps/chosen": -359.2135314941406, + "logps/rejected": -392.4355773925781, + "loss": 0.0154, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.2382028102874756, + "rewards/margins": 10.054285049438477, + "rewards/rejected": -12.292488098144531, + "step": 5560 + }, + { + "epoch": 2.88, + "learning_rate": 2.30445591891375e-08, + "logits/chosen": -2.623213768005371, + "logits/rejected": -2.5315768718719482, + "logps/chosen": -332.5467834472656, + "logps/rejected": -376.62481689453125, + "loss": 0.0077, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.2165770530700684, + "rewards/margins": 11.590463638305664, + "rewards/rejected": -12.807042121887207, + "step": 5570 + }, + { + "epoch": 2.88, + "learning_rate": 2.2088353413654617e-08, + "logits/chosen": -2.49461030960083, + "logits/rejected": -2.4208273887634277, + "logps/chosen": -245.84646606445312, + "logps/rejected": -314.0872497558594, + "loss": 0.0132, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8297621011734009, + "rewards/margins": 11.114880561828613, + "rewards/rejected": -12.9446439743042, + "step": 5580 + }, + { + "epoch": 2.89, + "learning_rate": 2.1132147638171733e-08, + "logits/chosen": -2.5482990741729736, + "logits/rejected": -2.506981611251831, + "logps/chosen": -323.66943359375, + "logps/rejected": -346.53009033203125, + "loss": 0.0155, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.7804180383682251, + "rewards/margins": 11.199524879455566, + "rewards/rejected": -11.97994327545166, + "step": 5590 + }, + { + "epoch": 2.89, + "learning_rate": 2.0175941862688848e-08, + "logits/chosen": -2.5294761657714844, + "logits/rejected": -2.535964250564575, + "logps/chosen": -261.10882568359375, + "logps/rejected": -339.1511535644531, + "loss": 0.0067, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.790201187133789, + "rewards/margins": 11.019615173339844, + "rewards/rejected": -12.80981731414795, + "step": 5600 + }, + { + "epoch": 2.89, + "eval_logits/chosen": -2.594068765640259, + "eval_logits/rejected": -2.5437283515930176, + "eval_logps/chosen": -320.7321472167969, + "eval_logps/rejected": -315.70770263671875, + "eval_loss": 0.7368908524513245, + "eval_rewards/accuracies": 0.7680000066757202, + "eval_rewards/chosen": -4.958104610443115, + "eval_rewards/margins": 3.7642662525177, + "eval_rewards/rejected": -8.722371101379395, + "eval_runtime": 299.4877, + "eval_samples_per_second": 6.678, + "eval_steps_per_second": 0.417, + "step": 5600 + }, + { + "epoch": 2.9, + "learning_rate": 1.9219736087205964e-08, + "logits/chosen": -2.54457950592041, + "logits/rejected": -2.4718551635742188, + "logps/chosen": -263.2037048339844, + "logps/rejected": -339.2086486816406, + "loss": 0.016, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.8428022861480713, + "rewards/margins": 10.111788749694824, + "rewards/rejected": -11.954591751098633, + "step": 5610 + }, + { + "epoch": 2.9, + "learning_rate": 1.826353031172308e-08, + "logits/chosen": -2.52895188331604, + "logits/rejected": -2.4624314308166504, + "logps/chosen": -319.7272644042969, + "logps/rejected": -443.2732849121094, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3951648473739624, + "rewards/margins": 11.68992805480957, + "rewards/rejected": -13.085092544555664, + "step": 5620 + }, + { + "epoch": 2.91, + "learning_rate": 1.73073245362402e-08, + "logits/chosen": -2.5705406665802, + "logits/rejected": -2.5043656826019287, + "logps/chosen": -300.62847900390625, + "logps/rejected": -335.4035949707031, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.186084747314453, + "rewards/margins": 10.557881355285645, + "rewards/rejected": -12.743965148925781, + "step": 5630 + }, + { + "epoch": 2.91, + "learning_rate": 1.6351118760757314e-08, + "logits/chosen": -2.544924259185791, + "logits/rejected": -2.5001089572906494, + "logps/chosen": -283.0209045410156, + "logps/rejected": -336.7970886230469, + "loss": 0.0132, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.8115791082382202, + "rewards/margins": 9.958032608032227, + "rewards/rejected": -11.769611358642578, + "step": 5640 + }, + { + "epoch": 2.92, + "learning_rate": 1.539491298527443e-08, + "logits/chosen": -2.55588436126709, + "logits/rejected": -2.5477097034454346, + "logps/chosen": -249.8664093017578, + "logps/rejected": -357.674560546875, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4484872817993164, + "rewards/margins": 10.401117324829102, + "rewards/rejected": -12.849603652954102, + "step": 5650 + }, + { + "epoch": 2.92, + "learning_rate": 1.4438707209791546e-08, + "logits/chosen": -2.547852039337158, + "logits/rejected": -2.4804680347442627, + "logps/chosen": -302.8385314941406, + "logps/rejected": -329.81573486328125, + "loss": 0.0149, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.3558497428894043, + "rewards/margins": 10.600205421447754, + "rewards/rejected": -11.956053733825684, + "step": 5660 + }, + { + "epoch": 2.93, + "learning_rate": 1.3482501434308661e-08, + "logits/chosen": -2.547434091567993, + "logits/rejected": -2.484344005584717, + "logps/chosen": -294.35064697265625, + "logps/rejected": -320.4677429199219, + "loss": 0.0124, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5904505252838135, + "rewards/margins": 10.197576522827148, + "rewards/rejected": -11.788025856018066, + "step": 5670 + }, + { + "epoch": 2.93, + "learning_rate": 1.2526295658825777e-08, + "logits/chosen": -2.650617837905884, + "logits/rejected": -2.6219732761383057, + "logps/chosen": -305.6776428222656, + "logps/rejected": -382.5854187011719, + "loss": 0.0152, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.8251022100448608, + "rewards/margins": 11.217008590698242, + "rewards/rejected": -13.042112350463867, + "step": 5680 + }, + { + "epoch": 2.94, + "learning_rate": 1.1570089883342895e-08, + "logits/chosen": -2.489759922027588, + "logits/rejected": -2.4366366863250732, + "logps/chosen": -311.3402404785156, + "logps/rejected": -412.3067932128906, + "loss": 0.0121, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8317899703979492, + "rewards/margins": 11.631093978881836, + "rewards/rejected": -13.462884902954102, + "step": 5690 + }, + { + "epoch": 2.94, + "learning_rate": 1.061388410786001e-08, + "logits/chosen": -2.534816026687622, + "logits/rejected": -2.5374817848205566, + "logps/chosen": -282.93951416015625, + "logps/rejected": -327.12738037109375, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5196959972381592, + "rewards/margins": 10.49896240234375, + "rewards/rejected": -12.018656730651855, + "step": 5700 + }, + { + "epoch": 2.94, + "eval_logits/chosen": -2.59464430809021, + "eval_logits/rejected": -2.5442099571228027, + "eval_logps/chosen": -320.8699951171875, + "eval_logps/rejected": -315.9825744628906, + "eval_loss": 0.7345340251922607, + "eval_rewards/accuracies": 0.7720000147819519, + "eval_rewards/chosen": -4.971884727478027, + "eval_rewards/margins": 3.777974843978882, + "eval_rewards/rejected": -8.749860763549805, + "eval_runtime": 297.7462, + "eval_samples_per_second": 6.717, + "eval_steps_per_second": 0.42, + "step": 5700 + }, + { + "epoch": 2.95, + "learning_rate": 9.657678332377126e-09, + "logits/chosen": -2.497479200363159, + "logits/rejected": -2.4477274417877197, + "logps/chosen": -284.3431091308594, + "logps/rejected": -325.3197326660156, + "loss": 0.0102, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.3208651542663574, + "rewards/margins": 10.637207984924316, + "rewards/rejected": -11.958072662353516, + "step": 5710 + }, + { + "epoch": 2.95, + "learning_rate": 8.701472556894243e-09, + "logits/chosen": -2.5374457836151123, + "logits/rejected": -2.405616521835327, + "logps/chosen": -269.211181640625, + "logps/rejected": -373.4117736816406, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.543179988861084, + "rewards/margins": 10.95097541809082, + "rewards/rejected": -13.49415397644043, + "step": 5720 + }, + { + "epoch": 2.96, + "learning_rate": 7.745266781411359e-09, + "logits/chosen": -2.5577011108398438, + "logits/rejected": -2.5819551944732666, + "logps/chosen": -314.5643615722656, + "logps/rejected": -386.89361572265625, + "loss": 0.0164, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3582957983016968, + "rewards/margins": 10.754124641418457, + "rewards/rejected": -12.112419128417969, + "step": 5730 + }, + { + "epoch": 2.96, + "learning_rate": 6.7890610059284754e-09, + "logits/chosen": -2.55308198928833, + "logits/rejected": -2.474553346633911, + "logps/chosen": -246.30685424804688, + "logps/rejected": -292.82403564453125, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7753121852874756, + "rewards/margins": 9.459589004516602, + "rewards/rejected": -11.234902381896973, + "step": 5740 + }, + { + "epoch": 2.97, + "learning_rate": 5.832855230445592e-09, + "logits/chosen": -2.4907386302948, + "logits/rejected": -2.467416286468506, + "logps/chosen": -288.83050537109375, + "logps/rejected": -346.51788330078125, + "loss": 0.0149, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.4427422285079956, + "rewards/margins": 11.456911087036133, + "rewards/rejected": -12.899653434753418, + "step": 5750 + }, + { + "epoch": 2.97, + "learning_rate": 4.8766494549627085e-09, + "logits/chosen": -2.5566134452819824, + "logits/rejected": -2.4680893421173096, + "logps/chosen": -283.3470458984375, + "logps/rejected": -355.33673095703125, + "loss": 0.0127, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5444302558898926, + "rewards/margins": 11.971147537231445, + "rewards/rejected": -12.515579223632812, + "step": 5760 + }, + { + "epoch": 2.98, + "learning_rate": 3.920443679479824e-09, + "logits/chosen": -2.6467669010162354, + "logits/rejected": -2.580441951751709, + "logps/chosen": -326.4743347167969, + "logps/rejected": -346.74090576171875, + "loss": 0.0151, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8142646551132202, + "rewards/margins": 10.477445602416992, + "rewards/rejected": -12.291709899902344, + "step": 5770 + }, + { + "epoch": 2.98, + "learning_rate": 2.96423790399694e-09, + "logits/chosen": -2.60914945602417, + "logits/rejected": -2.5680363178253174, + "logps/chosen": -299.6268005371094, + "logps/rejected": -368.2593688964844, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.949828863143921, + "rewards/margins": 9.881691932678223, + "rewards/rejected": -11.831521987915039, + "step": 5780 + }, + { + "epoch": 2.99, + "learning_rate": 2.008032128514056e-09, + "logits/chosen": -2.5586049556732178, + "logits/rejected": -2.5496246814727783, + "logps/chosen": -296.46295166015625, + "logps/rejected": -365.4637451171875, + "loss": 0.0104, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8471219539642334, + "rewards/margins": 10.26219367980957, + "rewards/rejected": -12.1093168258667, + "step": 5790 + }, + { + "epoch": 2.99, + "learning_rate": 1.0518263530311723e-09, + "logits/chosen": -2.589639663696289, + "logits/rejected": -2.5423808097839355, + "logps/chosen": -245.2353973388672, + "logps/rejected": -353.00701904296875, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8704856634140015, + "rewards/margins": 10.102082252502441, + "rewards/rejected": -11.972566604614258, + "step": 5800 + }, + { + "epoch": 2.99, + "eval_logits/chosen": -2.5956168174743652, + "eval_logits/rejected": -2.5452351570129395, + "eval_logps/chosen": -320.2925109863281, + "eval_logps/rejected": -315.3341064453125, + "eval_loss": 0.7337509989738464, + "eval_rewards/accuracies": 0.7699999809265137, + "eval_rewards/chosen": -4.9141364097595215, + "eval_rewards/margins": 3.7708754539489746, + "eval_rewards/rejected": -8.685011863708496, + "eval_runtime": 297.2004, + "eval_samples_per_second": 6.729, + "eval_steps_per_second": 0.421, + "step": 5800 + }, + { + "epoch": 3.0, + "learning_rate": 9.562057754828839e-11, + "logits/chosen": -2.5458855628967285, + "logits/rejected": -2.548276901245117, + "logps/chosen": -248.52377319335938, + "logps/rejected": -351.306640625, + "loss": 0.0125, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.725280523300171, + "rewards/margins": 9.489947319030762, + "rewards/rejected": -11.215229988098145, + "step": 5810 + }, + { + "epoch": 3.0, + "step": 5811, + "total_flos": 0.0, + "train_loss": 0.2059708398652644, + "train_runtime": 84831.0718, + "train_samples_per_second": 2.191, + "train_steps_per_second": 0.069 + } + ], + "logging_steps": 10, + "max_steps": 5811, + "num_train_epochs": 3, + "save_steps": 500, + "total_flos": 0.0, + "trial_name": null, + "trial_params": null +}