diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,13798 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 100, + "global_step": 8826, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.662514156285391e-10, + "logits/chosen": -1.5211243629455566, + "logits/rejected": -0.9348576664924622, + "logps/chosen": -412.05706787109375, + "logps/rejected": -913.2714233398438, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 5.66251415628539e-09, + "logits/chosen": -1.4790747165679932, + "logits/rejected": -1.2363442182540894, + "logps/chosen": -681.765625, + "logps/rejected": -639.8472290039062, + "loss": 0.8073, + "rewards/accuracies": 0.4166666567325592, + "rewards/chosen": -0.03606545552611351, + "rewards/margins": 0.13089190423488617, + "rewards/rejected": -0.16695736348628998, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 1.132502831257078e-08, + "logits/chosen": -1.4936949014663696, + "logits/rejected": -1.2116796970367432, + "logps/chosen": -392.9115905761719, + "logps/rejected": -550.4598388671875, + "loss": 0.8388, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.08892551809549332, + "rewards/margins": 0.03743192180991173, + "rewards/rejected": 0.05149358510971069, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 1.698754246885617e-08, + "logits/chosen": -1.4575920104980469, + "logits/rejected": -1.2192105054855347, + "logps/chosen": -550.06103515625, + "logps/rejected": -525.7174682617188, + "loss": 0.7824, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.02771606482565403, + "rewards/margins": 0.20261940360069275, + "rewards/rejected": -0.17490334808826447, + "step": 30 + }, + { + "epoch": 0.01, + "learning_rate": 2.265005662514156e-08, + "logits/chosen": -1.4789021015167236, + "logits/rejected": -1.220725655555725, + "logps/chosen": -411.79425048828125, + "logps/rejected": -576.1282348632812, + "loss": 0.8512, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.1698690801858902, + "rewards/margins": -0.12088494002819061, + "rewards/rejected": -0.04898417368531227, + "step": 40 + }, + { + "epoch": 0.02, + "learning_rate": 2.8312570781426952e-08, + "logits/chosen": -1.516235113143921, + "logits/rejected": -1.1653461456298828, + "logps/chosen": -361.09307861328125, + "logps/rejected": -673.5965576171875, + "loss": 0.8676, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.13890251517295837, + "rewards/margins": 0.11448585987091064, + "rewards/rejected": 0.024416672065854073, + "step": 50 + }, + { + "epoch": 0.02, + "learning_rate": 3.397508493771234e-08, + "logits/chosen": -1.5026288032531738, + "logits/rejected": -1.2371063232421875, + "logps/chosen": -460.19171142578125, + "logps/rejected": -448.1392517089844, + "loss": 0.8151, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.20903334021568298, + "rewards/margins": 0.2622700035572052, + "rewards/rejected": -0.05323667451739311, + "step": 60 + }, + { + "epoch": 0.02, + "learning_rate": 3.9637599093997736e-08, + "logits/chosen": -1.4962165355682373, + "logits/rejected": -1.216679334640503, + "logps/chosen": -425.6822814941406, + "logps/rejected": -605.38330078125, + "loss": 0.7992, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.07853921502828598, + "rewards/margins": 0.1831229031085968, + "rewards/rejected": -0.10458371788263321, + "step": 70 + }, + { + "epoch": 0.03, + "learning_rate": 4.530011325028312e-08, + "logits/chosen": -1.4759505987167358, + "logits/rejected": -1.1879253387451172, + "logps/chosen": -589.6921997070312, + "logps/rejected": -481.1573791503906, + "loss": 0.8174, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.06853078305721283, + "rewards/margins": 0.07774414867162704, + "rewards/rejected": -0.00921335257589817, + "step": 80 + }, + { + "epoch": 0.03, + "learning_rate": 5.096262740656852e-08, + "logits/chosen": -1.5050536394119263, + "logits/rejected": -1.2695952653884888, + "logps/chosen": -376.61431884765625, + "logps/rejected": -367.48974609375, + "loss": 0.7832, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.011920909397304058, + "rewards/margins": 0.0361761674284935, + "rewards/rejected": -0.02425524592399597, + "step": 90 + }, + { + "epoch": 0.03, + "learning_rate": 5.6625141562853904e-08, + "logits/chosen": -1.5045042037963867, + "logits/rejected": -1.1648657321929932, + "logps/chosen": -352.58160400390625, + "logps/rejected": -513.9573364257812, + "loss": 0.873, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.022640805691480637, + "rewards/margins": -0.03715241327881813, + "rewards/rejected": 0.01451160293072462, + "step": 100 + }, + { + "epoch": 0.03, + "eval_logits/chosen": -1.4928536415100098, + "eval_logits/rejected": -1.1954454183578491, + "eval_logps/chosen": -417.6762390136719, + "eval_logps/rejected": -559.7548217773438, + "eval_loss": 0.7546952962875366, + "eval_rewards/accuracies": 0.5917508602142334, + "eval_rewards/chosen": 0.22465135157108307, + "eval_rewards/margins": 0.24849803745746613, + "eval_rewards/rejected": -0.02384665422141552, + "eval_runtime": 564.5637, + "eval_samples_per_second": 16.827, + "eval_steps_per_second": 0.526, + "step": 100 + }, + { + "epoch": 0.04, + "learning_rate": 6.22876557191393e-08, + "logits/chosen": -1.4991860389709473, + "logits/rejected": -1.2015098333358765, + "logps/chosen": -366.98663330078125, + "logps/rejected": -800.510498046875, + "loss": 0.7635, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.15777522325515747, + "rewards/margins": 0.17782074213027954, + "rewards/rejected": -0.020045507699251175, + "step": 110 + }, + { + "epoch": 0.04, + "learning_rate": 6.795016987542468e-08, + "logits/chosen": -1.5294487476348877, + "logits/rejected": -1.2778466939926147, + "logps/chosen": -389.4922790527344, + "logps/rejected": -546.4539794921875, + "loss": 0.7741, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.14168071746826172, + "rewards/margins": 0.15838415920734406, + "rewards/rejected": -0.016703438013792038, + "step": 120 + }, + { + "epoch": 0.04, + "learning_rate": 7.361268403171007e-08, + "logits/chosen": -1.4900524616241455, + "logits/rejected": -1.2108103036880493, + "logps/chosen": -440.649658203125, + "logps/rejected": -581.1105346679688, + "loss": 0.7329, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.25806570053100586, + "rewards/margins": 0.29932084679603577, + "rewards/rejected": -0.04125513508915901, + "step": 130 + }, + { + "epoch": 0.05, + "learning_rate": 7.927519818799547e-08, + "logits/chosen": -1.4997873306274414, + "logits/rejected": -1.2810332775115967, + "logps/chosen": -324.90716552734375, + "logps/rejected": -612.2393798828125, + "loss": 0.74, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.25267493724823, + "rewards/margins": 0.13709421455860138, + "rewards/rejected": 0.1155807226896286, + "step": 140 + }, + { + "epoch": 0.05, + "learning_rate": 8.493771234428086e-08, + "logits/chosen": -1.5006136894226074, + "logits/rejected": -1.231439471244812, + "logps/chosen": -370.1236267089844, + "logps/rejected": -552.7628173828125, + "loss": 0.7135, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.21485865116119385, + "rewards/margins": 0.2915591597557068, + "rewards/rejected": -0.07670050114393234, + "step": 150 + }, + { + "epoch": 0.05, + "learning_rate": 9.060022650056625e-08, + "logits/chosen": -1.499180793762207, + "logits/rejected": -1.2177469730377197, + "logps/chosen": -349.7593994140625, + "logps/rejected": -512.0928955078125, + "loss": 0.7132, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.3074072003364563, + "rewards/margins": 0.505518913269043, + "rewards/rejected": -0.19811174273490906, + "step": 160 + }, + { + "epoch": 0.06, + "learning_rate": 9.626274065685163e-08, + "logits/chosen": -1.488895058631897, + "logits/rejected": -1.2142362594604492, + "logps/chosen": -456.4414978027344, + "logps/rejected": -481.563232421875, + "loss": 0.6602, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.45000943541526794, + "rewards/margins": 0.46701937913894653, + "rewards/rejected": -0.017009949311614037, + "step": 170 + }, + { + "epoch": 0.06, + "learning_rate": 1.0192525481313703e-07, + "logits/chosen": -1.4814162254333496, + "logits/rejected": -1.2387354373931885, + "logps/chosen": -507.9383850097656, + "logps/rejected": -626.4871215820312, + "loss": 0.6356, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.42191895842552185, + "rewards/margins": 0.4351392388343811, + "rewards/rejected": -0.013220282271504402, + "step": 180 + }, + { + "epoch": 0.06, + "learning_rate": 1.0758776896942241e-07, + "logits/chosen": -1.4989941120147705, + "logits/rejected": -1.222532868385315, + "logps/chosen": -372.0489807128906, + "logps/rejected": -567.7716674804688, + "loss": 0.5777, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.5717140436172485, + "rewards/margins": 0.6350502967834473, + "rewards/rejected": -0.06333623826503754, + "step": 190 + }, + { + "epoch": 0.07, + "learning_rate": 1.1325028312570781e-07, + "logits/chosen": -1.501927137374878, + "logits/rejected": -1.1784226894378662, + "logps/chosen": -430.58428955078125, + "logps/rejected": -435.82989501953125, + "loss": 0.6069, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.5867192149162292, + "rewards/margins": 0.44763174653053284, + "rewards/rejected": 0.1390875279903412, + "step": 200 + }, + { + "epoch": 0.07, + "eval_logits/chosen": -1.4956140518188477, + "eval_logits/rejected": -1.193248987197876, + "eval_logps/chosen": -412.6498718261719, + "eval_logps/rejected": -559.9497680664062, + "eval_loss": 0.5675464868545532, + "eval_rewards/accuracies": 0.7407407164573669, + "eval_rewards/chosen": 0.7272892594337463, + "eval_rewards/margins": 0.7706289291381836, + "eval_rewards/rejected": -0.04333961755037308, + "eval_runtime": 555.542, + "eval_samples_per_second": 17.1, + "eval_steps_per_second": 0.535, + "step": 200 + }, + { + "epoch": 0.07, + "learning_rate": 1.189127972819932e-07, + "logits/chosen": -1.50724196434021, + "logits/rejected": -1.28584885597229, + "logps/chosen": -359.71942138671875, + "logps/rejected": -729.51171875, + "loss": 0.5645, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.7700144052505493, + "rewards/margins": 0.9522495269775391, + "rewards/rejected": -0.18223515152931213, + "step": 210 + }, + { + "epoch": 0.07, + "learning_rate": 1.245753114382786e-07, + "logits/chosen": -1.4929430484771729, + "logits/rejected": -1.2647950649261475, + "logps/chosen": -488.6161193847656, + "logps/rejected": -480.91192626953125, + "loss": 0.5156, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.8714560270309448, + "rewards/margins": 0.9610016942024231, + "rewards/rejected": -0.08954562246799469, + "step": 220 + }, + { + "epoch": 0.08, + "learning_rate": 1.3023782559456398e-07, + "logits/chosen": -1.4973535537719727, + "logits/rejected": -1.2505285739898682, + "logps/chosen": -435.5409240722656, + "logps/rejected": -597.1155395507812, + "loss": 0.5301, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.7545233368873596, + "rewards/margins": 0.7451528906822205, + "rewards/rejected": 0.00937043409794569, + "step": 230 + }, + { + "epoch": 0.08, + "learning_rate": 1.3590033975084937e-07, + "logits/chosen": -1.499418020248413, + "logits/rejected": -1.183406114578247, + "logps/chosen": -415.9150390625, + "logps/rejected": -700.021484375, + "loss": 0.5488, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.8038797378540039, + "rewards/margins": 0.9167583584785461, + "rewards/rejected": -0.1128787249326706, + "step": 240 + }, + { + "epoch": 0.08, + "learning_rate": 1.4156285390713476e-07, + "logits/chosen": -1.4998773336410522, + "logits/rejected": -1.1643227338790894, + "logps/chosen": -358.8067321777344, + "logps/rejected": -530.6439208984375, + "loss": 0.4943, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 1.0926392078399658, + "rewards/margins": 1.1960188150405884, + "rewards/rejected": -0.10337953269481659, + "step": 250 + }, + { + "epoch": 0.09, + "learning_rate": 1.4722536806342014e-07, + "logits/chosen": -1.4876563549041748, + "logits/rejected": -1.3128819465637207, + "logps/chosen": -428.2396545410156, + "logps/rejected": -588.4712524414062, + "loss": 0.4949, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.9048154950141907, + "rewards/margins": 1.0305545330047607, + "rewards/rejected": -0.12573912739753723, + "step": 260 + }, + { + "epoch": 0.09, + "learning_rate": 1.5288788221970556e-07, + "logits/chosen": -1.5035580396652222, + "logits/rejected": -1.177336573600769, + "logps/chosen": -381.9916076660156, + "logps/rejected": -404.8735656738281, + "loss": 0.4518, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.9627830386161804, + "rewards/margins": 1.0808956623077393, + "rewards/rejected": -0.11811268329620361, + "step": 270 + }, + { + "epoch": 0.1, + "learning_rate": 1.5855039637599094e-07, + "logits/chosen": -1.4745981693267822, + "logits/rejected": -1.1251944303512573, + "logps/chosen": -355.103515625, + "logps/rejected": -386.7832946777344, + "loss": 0.4726, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 1.1413819789886475, + "rewards/margins": 1.351929783821106, + "rewards/rejected": -0.21054776012897491, + "step": 280 + }, + { + "epoch": 0.1, + "learning_rate": 1.642129105322763e-07, + "logits/chosen": -1.5018924474716187, + "logits/rejected": -1.1681270599365234, + "logps/chosen": -324.175048828125, + "logps/rejected": -560.7718505859375, + "loss": 0.4552, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 1.1393592357635498, + "rewards/margins": 1.2083717584609985, + "rewards/rejected": -0.0690125972032547, + "step": 290 + }, + { + "epoch": 0.1, + "learning_rate": 1.6987542468856172e-07, + "logits/chosen": -1.5084751844406128, + "logits/rejected": -1.1097322702407837, + "logps/chosen": -354.4365234375, + "logps/rejected": -629.7656860351562, + "loss": 0.3668, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 1.4591721296310425, + "rewards/margins": 1.9503206014633179, + "rewards/rejected": -0.4911483824253082, + "step": 300 + }, + { + "epoch": 0.1, + "eval_logits/chosen": -1.4862111806869507, + "eval_logits/rejected": -1.1768033504486084, + "eval_logps/chosen": -406.06756591796875, + "eval_logps/rejected": -561.8173217773438, + "eval_loss": 0.3913293778896332, + "eval_rewards/accuracies": 0.8552188277244568, + "eval_rewards/chosen": 1.3855180740356445, + "eval_rewards/margins": 1.6156105995178223, + "eval_rewards/rejected": -0.23009245097637177, + "eval_runtime": 557.8232, + "eval_samples_per_second": 17.03, + "eval_steps_per_second": 0.532, + "step": 300 + }, + { + "epoch": 0.11, + "learning_rate": 1.755379388448471e-07, + "logits/chosen": -1.5006253719329834, + "logits/rejected": -1.229182481765747, + "logps/chosen": -367.0664978027344, + "logps/rejected": -544.0938720703125, + "loss": 0.3707, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.5107367038726807, + "rewards/margins": 1.6374858617782593, + "rewards/rejected": -0.1267491579055786, + "step": 310 + }, + { + "epoch": 0.11, + "learning_rate": 1.812004530011325e-07, + "logits/chosen": -1.4654045104980469, + "logits/rejected": -1.2092571258544922, + "logps/chosen": -563.8804931640625, + "logps/rejected": -446.86865234375, + "loss": 0.3515, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.4659141302108765, + "rewards/margins": 1.6658271551132202, + "rewards/rejected": -0.1999128758907318, + "step": 320 + }, + { + "epoch": 0.11, + "learning_rate": 1.868629671574179e-07, + "logits/chosen": -1.4784510135650635, + "logits/rejected": -1.1505080461502075, + "logps/chosen": -423.0321350097656, + "logps/rejected": -624.2034912109375, + "loss": 0.3567, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.8489134311676025, + "rewards/margins": 2.2615528106689453, + "rewards/rejected": -0.412639319896698, + "step": 330 + }, + { + "epoch": 0.12, + "learning_rate": 1.9252548131370327e-07, + "logits/chosen": -1.4860397577285767, + "logits/rejected": -1.2079493999481201, + "logps/chosen": -334.85870361328125, + "logps/rejected": -556.5076293945312, + "loss": 0.3593, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.720995545387268, + "rewards/margins": 2.0472042560577393, + "rewards/rejected": -0.3262086510658264, + "step": 340 + }, + { + "epoch": 0.12, + "learning_rate": 1.9818799546998865e-07, + "logits/chosen": -1.4854962825775146, + "logits/rejected": -1.2439110279083252, + "logps/chosen": -398.25738525390625, + "logps/rejected": -615.8267211914062, + "loss": 0.3672, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 1.6038004159927368, + "rewards/margins": 1.7042919397354126, + "rewards/rejected": -0.10049135982990265, + "step": 350 + }, + { + "epoch": 0.12, + "learning_rate": 2.0385050962627407e-07, + "logits/chosen": -1.4964433908462524, + "logits/rejected": -1.2187970876693726, + "logps/chosen": -328.8074645996094, + "logps/rejected": -432.735595703125, + "loss": 0.3063, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 1.825738549232483, + "rewards/margins": 2.113569974899292, + "rewards/rejected": -0.28783148527145386, + "step": 360 + }, + { + "epoch": 0.13, + "learning_rate": 2.0951302378255946e-07, + "logits/chosen": -1.4886192083358765, + "logits/rejected": -1.21079421043396, + "logps/chosen": -336.3409118652344, + "logps/rejected": -823.2545776367188, + "loss": 0.2989, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 1.7591960430145264, + "rewards/margins": 2.0594146251678467, + "rewards/rejected": -0.30021852254867554, + "step": 370 + }, + { + "epoch": 0.13, + "learning_rate": 2.1517553793884482e-07, + "logits/chosen": -1.508397102355957, + "logits/rejected": -1.180199384689331, + "logps/chosen": -374.81207275390625, + "logps/rejected": -697.0372924804688, + "loss": 0.3436, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 2.0694453716278076, + "rewards/margins": 2.4799132347106934, + "rewards/rejected": -0.4104679524898529, + "step": 380 + }, + { + "epoch": 0.13, + "learning_rate": 2.2083805209513023e-07, + "logits/chosen": -1.4677913188934326, + "logits/rejected": -1.2046908140182495, + "logps/chosen": -406.46978759765625, + "logps/rejected": -440.15625, + "loss": 0.2969, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.9875986576080322, + "rewards/margins": 2.27653431892395, + "rewards/rejected": -0.2889358401298523, + "step": 390 + }, + { + "epoch": 0.14, + "learning_rate": 2.2650056625141562e-07, + "logits/chosen": -1.501773476600647, + "logits/rejected": -1.1108996868133545, + "logps/chosen": -356.6120910644531, + "logps/rejected": -667.3951416015625, + "loss": 0.2547, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 2.3899526596069336, + "rewards/margins": 2.9119009971618652, + "rewards/rejected": -0.5219482183456421, + "step": 400 + }, + { + "epoch": 0.14, + "eval_logits/chosen": -1.4768403768539429, + "eval_logits/rejected": -1.1602541208267212, + "eval_logps/chosen": -399.50067138671875, + "eval_logps/rejected": -562.875, + "eval_loss": 0.2941679358482361, + "eval_rewards/accuracies": 0.8897306323051453, + "eval_rewards/chosen": 2.0422072410583496, + "eval_rewards/margins": 2.3780689239501953, + "eval_rewards/rejected": -0.33586132526397705, + "eval_runtime": 557.7678, + "eval_samples_per_second": 17.032, + "eval_steps_per_second": 0.532, + "step": 400 + }, + { + "epoch": 0.14, + "learning_rate": 2.32163080407701e-07, + "logits/chosen": -1.4988408088684082, + "logits/rejected": -1.2369036674499512, + "logps/chosen": -330.7264404296875, + "logps/rejected": -554.8790283203125, + "loss": 0.3047, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 2.2827889919281006, + "rewards/margins": 2.5526533126831055, + "rewards/rejected": -0.2698643207550049, + "step": 410 + }, + { + "epoch": 0.14, + "learning_rate": 2.378255945639864e-07, + "logits/chosen": -1.4763238430023193, + "logits/rejected": -1.2129998207092285, + "logps/chosen": -340.05377197265625, + "logps/rejected": -737.7872924804688, + "loss": 0.2858, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 2.1625332832336426, + "rewards/margins": 2.517582416534424, + "rewards/rejected": -0.3550493121147156, + "step": 420 + }, + { + "epoch": 0.15, + "learning_rate": 2.434881087202718e-07, + "logits/chosen": -1.4784632921218872, + "logits/rejected": -1.1962693929672241, + "logps/chosen": -323.239013671875, + "logps/rejected": -658.5704345703125, + "loss": 0.2601, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 2.277160406112671, + "rewards/margins": 2.741499185562134, + "rewards/rejected": -0.46433886885643005, + "step": 430 + }, + { + "epoch": 0.15, + "learning_rate": 2.491506228765572e-07, + "logits/chosen": -1.4870140552520752, + "logits/rejected": -1.1514933109283447, + "logps/chosen": -341.9891662597656, + "logps/rejected": -488.63958740234375, + "loss": 0.2809, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 2.0882136821746826, + "rewards/margins": 2.6241490840911865, + "rewards/rejected": -0.5359354019165039, + "step": 440 + }, + { + "epoch": 0.15, + "learning_rate": 2.548131370328426e-07, + "logits/chosen": -1.4753116369247437, + "logits/rejected": -1.1047940254211426, + "logps/chosen": -377.7895812988281, + "logps/rejected": -408.9909973144531, + "loss": 0.301, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 2.0638253688812256, + "rewards/margins": 2.6936306953430176, + "rewards/rejected": -0.6298057436943054, + "step": 450 + }, + { + "epoch": 0.16, + "learning_rate": 2.6047565118912797e-07, + "logits/chosen": -1.4589248895645142, + "logits/rejected": -1.1196017265319824, + "logps/chosen": -420.7325134277344, + "logps/rejected": -457.3374938964844, + "loss": 0.2485, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 2.4863243103027344, + "rewards/margins": 2.819601535797119, + "rewards/rejected": -0.33327731490135193, + "step": 460 + }, + { + "epoch": 0.16, + "learning_rate": 2.6613816534541335e-07, + "logits/chosen": -1.4597301483154297, + "logits/rejected": -1.1874730587005615, + "logps/chosen": -511.9232482910156, + "logps/rejected": -309.4697570800781, + "loss": 0.2489, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 2.6990227699279785, + "rewards/margins": 3.127842903137207, + "rewards/rejected": -0.4288204312324524, + "step": 470 + }, + { + "epoch": 0.16, + "learning_rate": 2.7180067950169874e-07, + "logits/chosen": -1.473819613456726, + "logits/rejected": -1.2549312114715576, + "logps/chosen": -331.3185119628906, + "logps/rejected": -640.925048828125, + "loss": 0.2113, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 2.59657883644104, + "rewards/margins": 2.959690809249878, + "rewards/rejected": -0.3631117343902588, + "step": 480 + }, + { + "epoch": 0.17, + "learning_rate": 2.7746319365798413e-07, + "logits/chosen": -1.499083399772644, + "logits/rejected": -1.1673336029052734, + "logps/chosen": -359.90985107421875, + "logps/rejected": -538.0321044921875, + "loss": 0.2013, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.1838698387145996, + "rewards/margins": 3.737438201904297, + "rewards/rejected": -0.5535683631896973, + "step": 490 + }, + { + "epoch": 0.17, + "learning_rate": 2.831257078142695e-07, + "logits/chosen": -1.4566560983657837, + "logits/rejected": -1.182016134262085, + "logps/chosen": -492.3095703125, + "logps/rejected": -347.855224609375, + "loss": 0.2496, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 2.7738585472106934, + "rewards/margins": 3.232032060623169, + "rewards/rejected": -0.45817336440086365, + "step": 500 + }, + { + "epoch": 0.17, + "eval_logits/chosen": -1.4660594463348389, + "eval_logits/rejected": -1.1393872499465942, + "eval_logps/chosen": -394.1635437011719, + "eval_logps/rejected": -565.11376953125, + "eval_loss": 0.23232005536556244, + "eval_rewards/accuracies": 0.9183501601219177, + "eval_rewards/chosen": 2.5759212970733643, + "eval_rewards/margins": 3.135658025741577, + "eval_rewards/rejected": -0.5597367286682129, + "eval_runtime": 558.4472, + "eval_samples_per_second": 17.011, + "eval_steps_per_second": 0.532, + "step": 500 + }, + { + "epoch": 0.17, + "learning_rate": 2.887882219705549e-07, + "logits/chosen": -1.466706395149231, + "logits/rejected": -1.1716924905776978, + "logps/chosen": -445.7529296875, + "logps/rejected": -697.1843872070312, + "loss": 0.223, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 2.6778664588928223, + "rewards/margins": 3.3280444145202637, + "rewards/rejected": -0.6501787304878235, + "step": 510 + }, + { + "epoch": 0.18, + "learning_rate": 2.944507361268403e-07, + "logits/chosen": -1.4755175113677979, + "logits/rejected": -1.1727478504180908, + "logps/chosen": -348.1329650878906, + "logps/rejected": -588.0647583007812, + "loss": 0.2265, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 2.5383663177490234, + "rewards/margins": 3.085991382598877, + "rewards/rejected": -0.5476250648498535, + "step": 520 + }, + { + "epoch": 0.18, + "learning_rate": 3.001132502831257e-07, + "logits/chosen": -1.4678590297698975, + "logits/rejected": -1.1564358472824097, + "logps/chosen": -341.45440673828125, + "logps/rejected": -643.4864501953125, + "loss": 0.2223, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 2.856698989868164, + "rewards/margins": 3.310093641281128, + "rewards/rejected": -0.45339447259902954, + "step": 530 + }, + { + "epoch": 0.18, + "learning_rate": 3.057757644394111e-07, + "logits/chosen": -1.4738667011260986, + "logits/rejected": -1.1423356533050537, + "logps/chosen": -325.92303466796875, + "logps/rejected": -451.5601501464844, + "loss": 0.2186, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.91322660446167, + "rewards/margins": 3.7133572101593018, + "rewards/rejected": -0.8001307249069214, + "step": 540 + }, + { + "epoch": 0.19, + "learning_rate": 3.114382785956965e-07, + "logits/chosen": -1.4692233800888062, + "logits/rejected": -1.180604338645935, + "logps/chosen": -339.2276306152344, + "logps/rejected": -508.8544006347656, + "loss": 0.2336, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 2.8062305450439453, + "rewards/margins": 3.3374485969543457, + "rewards/rejected": -0.5312176942825317, + "step": 550 + }, + { + "epoch": 0.19, + "learning_rate": 3.171007927519819e-07, + "logits/chosen": -1.4367458820343018, + "logits/rejected": -1.1548749208450317, + "logps/chosen": -466.42010498046875, + "logps/rejected": -468.79071044921875, + "loss": 0.2064, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 2.7937171459198, + "rewards/margins": 3.523512601852417, + "rewards/rejected": -0.7297953367233276, + "step": 560 + }, + { + "epoch": 0.19, + "learning_rate": 3.227633069082673e-07, + "logits/chosen": -1.4456270933151245, + "logits/rejected": -1.2024356126785278, + "logps/chosen": -506.5753479003906, + "logps/rejected": -358.99530029296875, + "loss": 0.1766, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.0914306640625, + "rewards/margins": 3.6810905933380127, + "rewards/rejected": -0.5896599888801575, + "step": 570 + }, + { + "epoch": 0.2, + "learning_rate": 3.284258210645526e-07, + "logits/chosen": -1.4482395648956299, + "logits/rejected": -1.1533546447753906, + "logps/chosen": -470.9947204589844, + "logps/rejected": -537.549560546875, + "loss": 0.2383, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.135359287261963, + "rewards/margins": 3.7812187671661377, + "rewards/rejected": -0.6458595991134644, + "step": 580 + }, + { + "epoch": 0.2, + "learning_rate": 3.34088335220838e-07, + "logits/chosen": -1.4674651622772217, + "logits/rejected": -1.1439255475997925, + "logps/chosen": -344.49658203125, + "logps/rejected": -418.3880920410156, + "loss": 0.195, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.0652554035186768, + "rewards/margins": 3.763399839401245, + "rewards/rejected": -0.6981445550918579, + "step": 590 + }, + { + "epoch": 0.2, + "learning_rate": 3.3975084937712344e-07, + "logits/chosen": -1.4366505146026611, + "logits/rejected": -1.1366952657699585, + "logps/chosen": -459.80157470703125, + "logps/rejected": -712.8412475585938, + "loss": 0.2099, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 2.773369789123535, + "rewards/margins": 3.337761402130127, + "rewards/rejected": -0.5643914341926575, + "step": 600 + }, + { + "epoch": 0.2, + "eval_logits/chosen": -1.4513214826583862, + "eval_logits/rejected": -1.1137259006500244, + "eval_logps/chosen": -389.5693664550781, + "eval_logps/rejected": -566.9301147460938, + "eval_loss": 0.19786527752876282, + "eval_rewards/accuracies": 0.9242424368858337, + "eval_rewards/chosen": 3.03533673286438, + "eval_rewards/margins": 3.7767136096954346, + "eval_rewards/rejected": -0.7413766384124756, + "eval_runtime": 557.6968, + "eval_samples_per_second": 17.034, + "eval_steps_per_second": 0.533, + "step": 600 + }, + { + "epoch": 0.21, + "learning_rate": 3.454133635334088e-07, + "logits/chosen": -1.4552167654037476, + "logits/rejected": -1.1644409894943237, + "logps/chosen": -375.9332580566406, + "logps/rejected": -491.3536071777344, + "loss": 0.2025, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 2.59043550491333, + "rewards/margins": 3.5148017406463623, + "rewards/rejected": -0.9243658781051636, + "step": 610 + }, + { + "epoch": 0.21, + "learning_rate": 3.510758776896942e-07, + "logits/chosen": -1.4473903179168701, + "logits/rejected": -1.1493202447891235, + "logps/chosen": -475.545166015625, + "logps/rejected": -621.3789672851562, + "loss": 0.178, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.2095494270324707, + "rewards/margins": 3.685187578201294, + "rewards/rejected": -0.4756384491920471, + "step": 620 + }, + { + "epoch": 0.21, + "learning_rate": 3.567383918459796e-07, + "logits/chosen": -1.4265742301940918, + "logits/rejected": -1.151318073272705, + "logps/chosen": -429.902587890625, + "logps/rejected": -325.6732482910156, + "loss": 0.1846, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 2.842986822128296, + "rewards/margins": 3.7076828479766846, + "rewards/rejected": -0.8646961450576782, + "step": 630 + }, + { + "epoch": 0.22, + "learning_rate": 3.62400906002265e-07, + "logits/chosen": -1.4646637439727783, + "logits/rejected": -1.0810739994049072, + "logps/chosen": -405.18206787109375, + "logps/rejected": -612.15234375, + "loss": 0.1364, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.224799394607544, + "rewards/margins": 4.224340915679932, + "rewards/rejected": -0.9995414614677429, + "step": 640 + }, + { + "epoch": 0.22, + "learning_rate": 3.6806342015855037e-07, + "logits/chosen": -1.4459707736968994, + "logits/rejected": -1.116072654724121, + "logps/chosen": -379.79400634765625, + "logps/rejected": -399.90972900390625, + "loss": 0.1666, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.2206974029541016, + "rewards/margins": 4.059481143951416, + "rewards/rejected": -0.8387835621833801, + "step": 650 + }, + { + "epoch": 0.22, + "learning_rate": 3.737259343148358e-07, + "logits/chosen": -1.4792070388793945, + "logits/rejected": -1.111628770828247, + "logps/chosen": -328.38763427734375, + "logps/rejected": -561.1533813476562, + "loss": 0.1527, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 3.2477493286132812, + "rewards/margins": 4.300591945648193, + "rewards/rejected": -1.052842617034912, + "step": 660 + }, + { + "epoch": 0.23, + "learning_rate": 3.7938844847112115e-07, + "logits/chosen": -1.4564247131347656, + "logits/rejected": -1.0802199840545654, + "logps/chosen": -385.3539123535156, + "logps/rejected": -661.917724609375, + "loss": 0.1937, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 2.859179735183716, + "rewards/margins": 3.7947134971618652, + "rewards/rejected": -0.9355341196060181, + "step": 670 + }, + { + "epoch": 0.23, + "learning_rate": 3.8505096262740653e-07, + "logits/chosen": -1.4478049278259277, + "logits/rejected": -1.0582859516143799, + "logps/chosen": -353.5517578125, + "logps/rejected": -597.3490600585938, + "loss": 0.1473, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.531507968902588, + "rewards/margins": 4.614091396331787, + "rewards/rejected": -1.0825836658477783, + "step": 680 + }, + { + "epoch": 0.23, + "learning_rate": 3.907134767836919e-07, + "logits/chosen": -1.4576222896575928, + "logits/rejected": -1.0983006954193115, + "logps/chosen": -433.95928955078125, + "logps/rejected": -602.529296875, + "loss": 0.1297, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.8816752433776855, + "rewards/margins": 4.943896293640137, + "rewards/rejected": -1.062220573425293, + "step": 690 + }, + { + "epoch": 0.24, + "learning_rate": 3.963759909399773e-07, + "logits/chosen": -1.465179443359375, + "logits/rejected": -1.128832459449768, + "logps/chosen": -382.7997131347656, + "logps/rejected": -630.0484619140625, + "loss": 0.123, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.064541339874268, + "rewards/margins": 5.324443340301514, + "rewards/rejected": -1.2599024772644043, + "step": 700 + }, + { + "epoch": 0.24, + "eval_logits/chosen": -1.4531193971633911, + "eval_logits/rejected": -1.1146677732467651, + "eval_logps/chosen": -385.52484130859375, + "eval_logps/rejected": -570.780029296875, + "eval_loss": 0.16242532432079315, + "eval_rewards/accuracies": 0.9436026811599731, + "eval_rewards/chosen": 3.439793825149536, + "eval_rewards/margins": 4.566161632537842, + "eval_rewards/rejected": -1.1263678073883057, + "eval_runtime": 557.5109, + "eval_samples_per_second": 17.04, + "eval_steps_per_second": 0.533, + "step": 700 + }, + { + "epoch": 0.24, + "learning_rate": 4.0203850509626275e-07, + "logits/chosen": -1.4666264057159424, + "logits/rejected": -1.15610671043396, + "logps/chosen": -378.5975036621094, + "logps/rejected": -438.9236755371094, + "loss": 0.161, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.497119188308716, + "rewards/margins": 4.482955455780029, + "rewards/rejected": -0.9858363270759583, + "step": 710 + }, + { + "epoch": 0.24, + "learning_rate": 4.0770101925254814e-07, + "logits/chosen": -1.4681179523468018, + "logits/rejected": -1.192880392074585, + "logps/chosen": -355.46856689453125, + "logps/rejected": -485.2081604003906, + "loss": 0.1794, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.3350415229797363, + "rewards/margins": 4.372036933898926, + "rewards/rejected": -1.0369951725006104, + "step": 720 + }, + { + "epoch": 0.25, + "learning_rate": 4.133635334088335e-07, + "logits/chosen": -1.4489364624023438, + "logits/rejected": -1.1306109428405762, + "logps/chosen": -425.83642578125, + "logps/rejected": -534.4620361328125, + "loss": 0.1436, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.5910911560058594, + "rewards/margins": 4.702983379364014, + "rewards/rejected": -1.1118929386138916, + "step": 730 + }, + { + "epoch": 0.25, + "learning_rate": 4.190260475651189e-07, + "logits/chosen": -1.4708889722824097, + "logits/rejected": -1.1714198589324951, + "logps/chosen": -409.99609375, + "logps/rejected": -479.97039794921875, + "loss": 0.1676, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.325287342071533, + "rewards/margins": 4.273362159729004, + "rewards/rejected": -0.9480754733085632, + "step": 740 + }, + { + "epoch": 0.25, + "learning_rate": 4.2468856172140424e-07, + "logits/chosen": -1.4582195281982422, + "logits/rejected": -1.095037817955017, + "logps/chosen": -320.46221923828125, + "logps/rejected": -575.8782958984375, + "loss": 0.1077, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.9203059673309326, + "rewards/margins": 5.040288925170898, + "rewards/rejected": -1.1199829578399658, + "step": 750 + }, + { + "epoch": 0.26, + "learning_rate": 4.3035107587768963e-07, + "logits/chosen": -1.452093243598938, + "logits/rejected": -1.1727025508880615, + "logps/chosen": -321.6336975097656, + "logps/rejected": -486.25909423828125, + "loss": 0.1564, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.6308226585388184, + "rewards/margins": 4.718327522277832, + "rewards/rejected": -1.0875051021575928, + "step": 760 + }, + { + "epoch": 0.26, + "learning_rate": 4.3601359003397507e-07, + "logits/chosen": -1.464379906654358, + "logits/rejected": -1.1972758769989014, + "logps/chosen": -453.8150329589844, + "logps/rejected": -565.5223388671875, + "loss": 0.1334, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.648606061935425, + "rewards/margins": 4.990372657775879, + "rewards/rejected": -1.3417659997940063, + "step": 770 + }, + { + "epoch": 0.27, + "learning_rate": 4.4167610419026046e-07, + "logits/chosen": -1.4624255895614624, + "logits/rejected": -1.1212713718414307, + "logps/chosen": -462.9256286621094, + "logps/rejected": -670.3197021484375, + "loss": 0.134, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.7583529949188232, + "rewards/margins": 5.105889797210693, + "rewards/rejected": -1.3475372791290283, + "step": 780 + }, + { + "epoch": 0.27, + "learning_rate": 4.4733861834654585e-07, + "logits/chosen": -1.4728128910064697, + "logits/rejected": -1.1267871856689453, + "logps/chosen": -372.94757080078125, + "logps/rejected": -378.6418762207031, + "loss": 0.1337, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.946627378463745, + "rewards/margins": 5.0403361320495605, + "rewards/rejected": -1.093708872795105, + "step": 790 + }, + { + "epoch": 0.27, + "learning_rate": 4.5300113250283123e-07, + "logits/chosen": -1.4499714374542236, + "logits/rejected": -1.183134913444519, + "logps/chosen": -390.2574462890625, + "logps/rejected": -518.8786010742188, + "loss": 0.1211, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.772278308868408, + "rewards/margins": 4.817863941192627, + "rewards/rejected": -1.0455856323242188, + "step": 800 + }, + { + "epoch": 0.27, + "eval_logits/chosen": -1.4555169343948364, + "eval_logits/rejected": -1.1125926971435547, + "eval_logps/chosen": -382.0456237792969, + "eval_logps/rejected": -573.342529296875, + "eval_loss": 0.14035290479660034, + "eval_rewards/accuracies": 0.945286214351654, + "eval_rewards/chosen": 3.7877144813537598, + "eval_rewards/margins": 5.170328617095947, + "eval_rewards/rejected": -1.3826148509979248, + "eval_runtime": 559.1401, + "eval_samples_per_second": 16.99, + "eval_steps_per_second": 0.531, + "step": 800 + }, + { + "epoch": 0.28, + "learning_rate": 4.586636466591166e-07, + "logits/chosen": -1.461599588394165, + "logits/rejected": -1.1533044576644897, + "logps/chosen": -393.0902099609375, + "logps/rejected": -644.8455200195312, + "loss": 0.142, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.7143197059631348, + "rewards/margins": 5.217325687408447, + "rewards/rejected": -1.503006100654602, + "step": 810 + }, + { + "epoch": 0.28, + "learning_rate": 4.64326160815402e-07, + "logits/chosen": -1.429278016090393, + "logits/rejected": -1.1481190919876099, + "logps/chosen": -478.57330322265625, + "logps/rejected": -437.3929748535156, + "loss": 0.1406, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.5481009483337402, + "rewards/margins": 4.7597270011901855, + "rewards/rejected": -1.2116261720657349, + "step": 820 + }, + { + "epoch": 0.28, + "learning_rate": 4.6998867497168745e-07, + "logits/chosen": -1.4626922607421875, + "logits/rejected": -1.1649787425994873, + "logps/chosen": -306.91302490234375, + "logps/rejected": -604.1826171875, + "loss": 0.1585, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.9734911918640137, + "rewards/margins": 5.191909313201904, + "rewards/rejected": -1.218418002128601, + "step": 830 + }, + { + "epoch": 0.29, + "learning_rate": 4.756511891279728e-07, + "logits/chosen": -1.44761323928833, + "logits/rejected": -1.1732923984527588, + "logps/chosen": -399.44720458984375, + "logps/rejected": -514.2167358398438, + "loss": 0.1367, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.7745490074157715, + "rewards/margins": 6.333249092102051, + "rewards/rejected": -1.5587003231048584, + "step": 840 + }, + { + "epoch": 0.29, + "learning_rate": 4.813137032842582e-07, + "logits/chosen": -1.4558974504470825, + "logits/rejected": -1.1677448749542236, + "logps/chosen": -407.19183349609375, + "logps/rejected": -783.762451171875, + "loss": 0.1312, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 4.162660598754883, + "rewards/margins": 5.449455738067627, + "rewards/rejected": -1.2867952585220337, + "step": 850 + }, + { + "epoch": 0.29, + "learning_rate": 4.869762174405436e-07, + "logits/chosen": -1.4543912410736084, + "logits/rejected": -1.1719005107879639, + "logps/chosen": -375.68780517578125, + "logps/rejected": -581.8094482421875, + "loss": 0.1222, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 4.192741394042969, + "rewards/margins": 5.466836452484131, + "rewards/rejected": -1.274095058441162, + "step": 860 + }, + { + "epoch": 0.3, + "learning_rate": 4.92638731596829e-07, + "logits/chosen": -1.4511828422546387, + "logits/rejected": -1.2218043804168701, + "logps/chosen": -481.80963134765625, + "logps/rejected": -643.4028930664062, + "loss": 0.1238, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 5.114841938018799, + "rewards/margins": 6.360867977142334, + "rewards/rejected": -1.2460262775421143, + "step": 870 + }, + { + "epoch": 0.3, + "learning_rate": 4.983012457531144e-07, + "logits/chosen": -1.4728432893753052, + "logits/rejected": -1.1584317684173584, + "logps/chosen": -296.8708801269531, + "logps/rejected": -503.31103515625, + "loss": 0.1181, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.537439346313477, + "rewards/margins": 6.069893836975098, + "rewards/rejected": -1.5324543714523315, + "step": 880 + }, + { + "epoch": 0.3, + "learning_rate": 4.995593604431575e-07, + "logits/chosen": -1.457878828048706, + "logits/rejected": -1.1777491569519043, + "logps/chosen": -368.54327392578125, + "logps/rejected": -394.3490295410156, + "loss": 0.123, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.9159674644470215, + "rewards/margins": 5.224673271179199, + "rewards/rejected": -1.3087060451507568, + "step": 890 + }, + { + "epoch": 0.31, + "learning_rate": 4.989298753619539e-07, + "logits/chosen": -1.4818239212036133, + "logits/rejected": -1.0495814085006714, + "logps/chosen": -337.33685302734375, + "logps/rejected": -470.76434326171875, + "loss": 0.1398, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 4.484321117401123, + "rewards/margins": 6.072963237762451, + "rewards/rejected": -1.58864164352417, + "step": 900 + }, + { + "epoch": 0.31, + "eval_logits/chosen": -1.4567501544952393, + "eval_logits/rejected": -1.1145392656326294, + "eval_logps/chosen": -378.73443603515625, + "eval_logps/rejected": -575.2359008789062, + "eval_loss": 0.13047146797180176, + "eval_rewards/accuracies": 0.9545454382896423, + "eval_rewards/chosen": 4.118832111358643, + "eval_rewards/margins": 5.690784454345703, + "eval_rewards/rejected": -1.57195246219635, + "eval_runtime": 562.4059, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 0.528, + "step": 900 + }, + { + "epoch": 0.31, + "learning_rate": 4.983003902807503e-07, + "logits/chosen": -1.4585669040679932, + "logits/rejected": -1.1607335805892944, + "logps/chosen": -449.2415466308594, + "logps/rejected": -393.2831726074219, + "loss": 0.1413, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.7599587440490723, + "rewards/margins": 5.023233413696289, + "rewards/rejected": -1.263274908065796, + "step": 910 + }, + { + "epoch": 0.31, + "learning_rate": 4.976709051995467e-07, + "logits/chosen": -1.464983344078064, + "logits/rejected": -1.1285371780395508, + "logps/chosen": -325.03167724609375, + "logps/rejected": -499.2435607910156, + "loss": 0.1069, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.230907440185547, + "rewards/margins": 5.925202369689941, + "rewards/rejected": -1.6942945718765259, + "step": 920 + }, + { + "epoch": 0.32, + "learning_rate": 4.970414201183432e-07, + "logits/chosen": -1.4669829607009888, + "logits/rejected": -1.1600773334503174, + "logps/chosen": -360.4229431152344, + "logps/rejected": -613.53125, + "loss": 0.128, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.795597076416016, + "rewards/margins": 6.477330684661865, + "rewards/rejected": -1.6817344427108765, + "step": 930 + }, + { + "epoch": 0.32, + "learning_rate": 4.964119350371396e-07, + "logits/chosen": -1.4740451574325562, + "logits/rejected": -1.1996599435806274, + "logps/chosen": -315.9418640136719, + "logps/rejected": -561.3172607421875, + "loss": 0.0997, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 4.4924516677856445, + "rewards/margins": 5.849211692810059, + "rewards/rejected": -1.3567595481872559, + "step": 940 + }, + { + "epoch": 0.32, + "learning_rate": 4.95782449955936e-07, + "logits/chosen": -1.478124976158142, + "logits/rejected": -1.218483567237854, + "logps/chosen": -381.94207763671875, + "logps/rejected": -511.00640869140625, + "loss": 0.115, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.513301372528076, + "rewards/margins": 5.9051666259765625, + "rewards/rejected": -1.3918651342391968, + "step": 950 + }, + { + "epoch": 0.33, + "learning_rate": 4.951529648747325e-07, + "logits/chosen": -1.4566136598587036, + "logits/rejected": -1.170207142829895, + "logps/chosen": -449.7284240722656, + "logps/rejected": -546.5609130859375, + "loss": 0.107, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.879666328430176, + "rewards/margins": 6.199456691741943, + "rewards/rejected": -1.3197907209396362, + "step": 960 + }, + { + "epoch": 0.33, + "learning_rate": 4.945234797935289e-07, + "logits/chosen": -1.4734047651290894, + "logits/rejected": -1.1685858964920044, + "logps/chosen": -304.51953125, + "logps/rejected": -717.8421630859375, + "loss": 0.1137, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.157977104187012, + "rewards/margins": 5.652981758117676, + "rewards/rejected": -1.495004653930664, + "step": 970 + }, + { + "epoch": 0.33, + "learning_rate": 4.938939947123252e-07, + "logits/chosen": -1.4529250860214233, + "logits/rejected": -1.1147228479385376, + "logps/chosen": -449.6807556152344, + "logps/rejected": -659.5738525390625, + "loss": 0.1455, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 4.358736991882324, + "rewards/margins": 6.285690784454346, + "rewards/rejected": -1.926953673362732, + "step": 980 + }, + { + "epoch": 0.34, + "learning_rate": 4.932645096311217e-07, + "logits/chosen": -1.4718146324157715, + "logits/rejected": -1.1147180795669556, + "logps/chosen": -303.16143798828125, + "logps/rejected": -646.6487426757812, + "loss": 0.0892, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.832289218902588, + "rewards/margins": 6.990142822265625, + "rewards/rejected": -2.1578540802001953, + "step": 990 + }, + { + "epoch": 0.34, + "learning_rate": 4.926350245499181e-07, + "logits/chosen": -1.4667094945907593, + "logits/rejected": -1.1550071239471436, + "logps/chosen": -403.26776123046875, + "logps/rejected": -484.25164794921875, + "loss": 0.1161, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.706210136413574, + "rewards/margins": 6.499111175537109, + "rewards/rejected": -1.7929000854492188, + "step": 1000 + }, + { + "epoch": 0.34, + "eval_logits/chosen": -1.4604599475860596, + "eval_logits/rejected": -1.121699571609497, + "eval_logps/chosen": -376.8678283691406, + "eval_logps/rejected": -576.9345092773438, + "eval_loss": 0.10661116242408752, + "eval_rewards/accuracies": 0.9646464586257935, + "eval_rewards/chosen": 4.305492401123047, + "eval_rewards/margins": 6.047306537628174, + "eval_rewards/rejected": -1.741814136505127, + "eval_runtime": 559.5024, + "eval_samples_per_second": 16.979, + "eval_steps_per_second": 0.531, + "step": 1000 + }, + { + "epoch": 0.34, + "learning_rate": 4.920055394687146e-07, + "logits/chosen": -1.4534475803375244, + "logits/rejected": -1.1997301578521729, + "logps/chosen": -409.09454345703125, + "logps/rejected": -466.78021240234375, + "loss": 0.141, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.639789581298828, + "rewards/margins": 6.110454559326172, + "rewards/rejected": -1.4706652164459229, + "step": 1010 + }, + { + "epoch": 0.35, + "learning_rate": 4.91376054387511e-07, + "logits/chosen": -1.4943947792053223, + "logits/rejected": -1.1799076795578003, + "logps/chosen": -290.9139709472656, + "logps/rejected": -521.3275146484375, + "loss": 0.1161, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 4.345009803771973, + "rewards/margins": 5.780714988708496, + "rewards/rejected": -1.4357054233551025, + "step": 1020 + }, + { + "epoch": 0.35, + "learning_rate": 4.907465693063074e-07, + "logits/chosen": -1.4816995859146118, + "logits/rejected": -1.2097212076187134, + "logps/chosen": -303.4338684082031, + "logps/rejected": -505.17425537109375, + "loss": 0.1256, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 4.848861217498779, + "rewards/margins": 6.538815498352051, + "rewards/rejected": -1.6899540424346924, + "step": 1030 + }, + { + "epoch": 0.35, + "learning_rate": 4.901170842251039e-07, + "logits/chosen": -1.465075135231018, + "logits/rejected": -1.221210241317749, + "logps/chosen": -499.671630859375, + "logps/rejected": -694.0237426757812, + "loss": 0.1036, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.632418394088745, + "rewards/margins": 5.9121294021606445, + "rewards/rejected": -2.2797110080718994, + "step": 1040 + }, + { + "epoch": 0.36, + "learning_rate": 4.894875991439003e-07, + "logits/chosen": -1.4962574243545532, + "logits/rejected": -1.1775150299072266, + "logps/chosen": -482.98779296875, + "logps/rejected": -501.41357421875, + "loss": 0.1278, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 4.165386199951172, + "rewards/margins": 6.322039604187012, + "rewards/rejected": -2.156653642654419, + "step": 1050 + }, + { + "epoch": 0.36, + "learning_rate": 4.888581140626966e-07, + "logits/chosen": -1.4862196445465088, + "logits/rejected": -1.123844027519226, + "logps/chosen": -382.0890197753906, + "logps/rejected": -545.5562744140625, + "loss": 0.1166, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.130069255828857, + "rewards/margins": 5.989940166473389, + "rewards/rejected": -1.8598709106445312, + "step": 1060 + }, + { + "epoch": 0.36, + "learning_rate": 4.882286289814931e-07, + "logits/chosen": -1.4778748750686646, + "logits/rejected": -1.208962082862854, + "logps/chosen": -436.08245849609375, + "logps/rejected": -453.83349609375, + "loss": 0.1006, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.168934345245361, + "rewards/margins": 6.147931098937988, + "rewards/rejected": -1.9789960384368896, + "step": 1070 + }, + { + "epoch": 0.37, + "learning_rate": 4.875991439002896e-07, + "logits/chosen": -1.4882242679595947, + "logits/rejected": -1.1991513967514038, + "logps/chosen": -378.70538330078125, + "logps/rejected": -457.91229248046875, + "loss": 0.0775, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.272151470184326, + "rewards/margins": 6.493917942047119, + "rewards/rejected": -2.2217655181884766, + "step": 1080 + }, + { + "epoch": 0.37, + "learning_rate": 4.869696588190859e-07, + "logits/chosen": -1.474613070487976, + "logits/rejected": -1.1653835773468018, + "logps/chosen": -379.74224853515625, + "logps/rejected": -336.6827392578125, + "loss": 0.0883, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.0387654304504395, + "rewards/margins": 5.831040382385254, + "rewards/rejected": -1.7922754287719727, + "step": 1090 + }, + { + "epoch": 0.37, + "learning_rate": 4.863401737378824e-07, + "logits/chosen": -1.4771358966827393, + "logits/rejected": -1.1721785068511963, + "logps/chosen": -383.4172668457031, + "logps/rejected": -587.6443481445312, + "loss": 0.1109, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.375439167022705, + "rewards/margins": 6.930808067321777, + "rewards/rejected": -2.555368661880493, + "step": 1100 + }, + { + "epoch": 0.37, + "eval_logits/chosen": -1.4682918787002563, + "eval_logits/rejected": -1.1333562135696411, + "eval_logps/chosen": -375.6896667480469, + "eval_logps/rejected": -579.5653076171875, + "eval_loss": 0.10061251372098923, + "eval_rewards/accuracies": 0.9621211886405945, + "eval_rewards/chosen": 4.423308372497559, + "eval_rewards/margins": 6.4282026290893555, + "eval_rewards/rejected": -2.0048940181732178, + "eval_runtime": 558.7102, + "eval_samples_per_second": 17.003, + "eval_steps_per_second": 0.532, + "step": 1100 + }, + { + "epoch": 0.38, + "learning_rate": 4.857106886566788e-07, + "logits/chosen": -1.499935507774353, + "logits/rejected": -1.0978552103042603, + "logps/chosen": -276.2793884277344, + "logps/rejected": -644.7932739257812, + "loss": 0.1157, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.046536922454834, + "rewards/margins": 6.15374231338501, + "rewards/rejected": -2.1072051525115967, + "step": 1110 + }, + { + "epoch": 0.38, + "learning_rate": 4.850812035754753e-07, + "logits/chosen": -1.504651665687561, + "logits/rejected": -1.163279414176941, + "logps/chosen": -331.82476806640625, + "logps/rejected": -574.56396484375, + "loss": 0.0758, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.1561055183410645, + "rewards/margins": 6.304588317871094, + "rewards/rejected": -2.148482322692871, + "step": 1120 + }, + { + "epoch": 0.38, + "learning_rate": 4.844517184942716e-07, + "logits/chosen": -1.4773398637771606, + "logits/rejected": -1.1397154331207275, + "logps/chosen": -407.5681457519531, + "logps/rejected": -392.0281677246094, + "loss": 0.0751, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.043157577514648, + "rewards/margins": 6.012135028839111, + "rewards/rejected": -1.9689773321151733, + "step": 1130 + }, + { + "epoch": 0.39, + "learning_rate": 4.838222334130681e-07, + "logits/chosen": -1.4563764333724976, + "logits/rejected": -1.1378897428512573, + "logps/chosen": -381.33721923828125, + "logps/rejected": -499.71453857421875, + "loss": 0.0858, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 4.372216701507568, + "rewards/margins": 6.784847259521484, + "rewards/rejected": -2.412630558013916, + "step": 1140 + }, + { + "epoch": 0.39, + "learning_rate": 4.831927483318645e-07, + "logits/chosen": -1.459613561630249, + "logits/rejected": -1.1508173942565918, + "logps/chosen": -418.51934814453125, + "logps/rejected": -362.7253112792969, + "loss": 0.0712, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.949090003967285, + "rewards/margins": 7.040833950042725, + "rewards/rejected": -2.091745138168335, + "step": 1150 + }, + { + "epoch": 0.39, + "learning_rate": 4.82563263250661e-07, + "logits/chosen": -1.4770419597625732, + "logits/rejected": -1.137213110923767, + "logps/chosen": -462.734619140625, + "logps/rejected": -488.783203125, + "loss": 0.0925, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.036229133605957, + "rewards/margins": 6.372972011566162, + "rewards/rejected": -2.3367438316345215, + "step": 1160 + }, + { + "epoch": 0.4, + "learning_rate": 4.819337781694573e-07, + "logits/chosen": -1.4739525318145752, + "logits/rejected": -1.1760450601577759, + "logps/chosen": -301.63739013671875, + "logps/rejected": -570.0889892578125, + "loss": 0.0873, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 4.054856300354004, + "rewards/margins": 6.374632358551025, + "rewards/rejected": -2.3197762966156006, + "step": 1170 + }, + { + "epoch": 0.4, + "learning_rate": 4.813042930882538e-07, + "logits/chosen": -1.4727171659469604, + "logits/rejected": -1.1396564245224, + "logps/chosen": -357.81060791015625, + "logps/rejected": -721.6929931640625, + "loss": 0.0862, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.238607883453369, + "rewards/margins": 6.158675670623779, + "rewards/rejected": -1.9200681447982788, + "step": 1180 + }, + { + "epoch": 0.4, + "learning_rate": 4.806748080070503e-07, + "logits/chosen": -1.4509950876235962, + "logits/rejected": -1.1030436754226685, + "logps/chosen": -397.36956787109375, + "logps/rejected": -412.7225036621094, + "loss": 0.1312, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.9541778564453125, + "rewards/margins": 6.331896781921387, + "rewards/rejected": -2.3777191638946533, + "step": 1190 + }, + { + "epoch": 0.41, + "learning_rate": 4.800453229258466e-07, + "logits/chosen": -1.4880512952804565, + "logits/rejected": -1.1717326641082764, + "logps/chosen": -466.0701599121094, + "logps/rejected": -512.2325439453125, + "loss": 0.0983, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 3.805614471435547, + "rewards/margins": 5.888615608215332, + "rewards/rejected": -2.0830016136169434, + "step": 1200 + }, + { + "epoch": 0.41, + "eval_logits/chosen": -1.4812698364257812, + "eval_logits/rejected": -1.1544265747070312, + "eval_logps/chosen": -376.84259033203125, + "eval_logps/rejected": -585.1442260742188, + "eval_loss": 0.08805635571479797, + "eval_rewards/accuracies": 0.9638047218322754, + "eval_rewards/chosen": 4.30801248550415, + "eval_rewards/margins": 6.870803356170654, + "eval_rewards/rejected": -2.562790632247925, + "eval_runtime": 558.5146, + "eval_samples_per_second": 17.009, + "eval_steps_per_second": 0.532, + "step": 1200 + }, + { + "epoch": 0.41, + "learning_rate": 4.79415837844643e-07, + "logits/chosen": -1.5082863569259644, + "logits/rejected": -1.0951197147369385, + "logps/chosen": -383.1314392089844, + "logps/rejected": -330.7431335449219, + "loss": 0.0899, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.398260116577148, + "rewards/margins": 6.688971042633057, + "rewards/rejected": -2.2907111644744873, + "step": 1210 + }, + { + "epoch": 0.41, + "learning_rate": 4.787863527634395e-07, + "logits/chosen": -1.4863263368606567, + "logits/rejected": -1.1680128574371338, + "logps/chosen": -367.881591796875, + "logps/rejected": -639.2601928710938, + "loss": 0.0715, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.4459023475646973, + "rewards/margins": 5.875584602355957, + "rewards/rejected": -2.4296820163726807, + "step": 1220 + }, + { + "epoch": 0.42, + "learning_rate": 4.781568676822359e-07, + "logits/chosen": -1.5003607273101807, + "logits/rejected": -1.2009575366973877, + "logps/chosen": -303.6417541503906, + "logps/rejected": -833.9129638671875, + "loss": 0.0652, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.535092353820801, + "rewards/margins": 7.268994331359863, + "rewards/rejected": -2.7339024543762207, + "step": 1230 + }, + { + "epoch": 0.42, + "learning_rate": 4.775273826010323e-07, + "logits/chosen": -1.487640142440796, + "logits/rejected": -1.1861073970794678, + "logps/chosen": -397.42230224609375, + "logps/rejected": -612.1988525390625, + "loss": 0.0755, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.0881805419921875, + "rewards/margins": 7.059517860412598, + "rewards/rejected": -2.97133731842041, + "step": 1240 + }, + { + "epoch": 0.42, + "learning_rate": 4.768978975198288e-07, + "logits/chosen": -1.511664867401123, + "logits/rejected": -1.2150501012802124, + "logps/chosen": -388.0561218261719, + "logps/rejected": -792.2547607421875, + "loss": 0.0844, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.064549446105957, + "rewards/margins": 6.036563396453857, + "rewards/rejected": -1.9720141887664795, + "step": 1250 + }, + { + "epoch": 0.43, + "learning_rate": 4.762684124386252e-07, + "logits/chosen": -1.4869279861450195, + "logits/rejected": -1.187788724899292, + "logps/chosen": -395.345947265625, + "logps/rejected": -720.666259765625, + "loss": 0.0797, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.516745090484619, + "rewards/margins": 6.927421569824219, + "rewards/rejected": -2.4106764793395996, + "step": 1260 + }, + { + "epoch": 0.43, + "learning_rate": 4.756389273574216e-07, + "logits/chosen": -1.4869451522827148, + "logits/rejected": -1.1384398937225342, + "logps/chosen": -434.06707763671875, + "logps/rejected": -621.151123046875, + "loss": 0.1099, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.157801151275635, + "rewards/margins": 6.983107566833496, + "rewards/rejected": -2.8253061771392822, + "step": 1270 + }, + { + "epoch": 0.44, + "learning_rate": 4.7500944227621803e-07, + "logits/chosen": -1.5181313753128052, + "logits/rejected": -1.1548988819122314, + "logps/chosen": -341.97991943359375, + "logps/rejected": -437.0553283691406, + "loss": 0.07, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.964907646179199, + "rewards/margins": 7.61227560043335, + "rewards/rejected": -2.6473681926727295, + "step": 1280 + }, + { + "epoch": 0.44, + "learning_rate": 4.7437995719501445e-07, + "logits/chosen": -1.5122239589691162, + "logits/rejected": -1.1255097389221191, + "logps/chosen": -312.6338806152344, + "logps/rejected": -590.165771484375, + "loss": 0.092, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.116483211517334, + "rewards/margins": 7.0477190017700195, + "rewards/rejected": -2.9312353134155273, + "step": 1290 + }, + { + "epoch": 0.44, + "learning_rate": 4.737504721138109e-07, + "logits/chosen": -1.4762308597564697, + "logits/rejected": -1.280274510383606, + "logps/chosen": -376.7564697265625, + "logps/rejected": -612.2212524414062, + "loss": 0.0965, + "rewards/accuracies": 0.9375, + "rewards/chosen": 3.7296531200408936, + "rewards/margins": 6.230870723724365, + "rewards/rejected": -2.5012173652648926, + "step": 1300 + }, + { + "epoch": 0.44, + "eval_logits/chosen": -1.4955438375473022, + "eval_logits/rejected": -1.1650733947753906, + "eval_logps/chosen": -376.46563720703125, + "eval_logps/rejected": -586.2010498046875, + "eval_loss": 0.07776026427745819, + "eval_rewards/accuracies": 0.9621211886405945, + "eval_rewards/chosen": 4.345710277557373, + "eval_rewards/margins": 7.014177322387695, + "eval_rewards/rejected": -2.6684677600860596, + "eval_runtime": 557.2631, + "eval_samples_per_second": 17.048, + "eval_steps_per_second": 0.533, + "step": 1300 + }, + { + "epoch": 0.45, + "learning_rate": 4.7312098703260735e-07, + "logits/chosen": -1.5184252262115479, + "logits/rejected": -1.1957508325576782, + "logps/chosen": -311.2591857910156, + "logps/rejected": -480.68292236328125, + "loss": 0.0953, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 4.280787467956543, + "rewards/margins": 6.255151748657227, + "rewards/rejected": -1.974363923072815, + "step": 1310 + }, + { + "epoch": 0.45, + "learning_rate": 4.724915019514038e-07, + "logits/chosen": -1.510637640953064, + "logits/rejected": -1.2856345176696777, + "logps/chosen": -368.8239440917969, + "logps/rejected": -697.6632690429688, + "loss": 0.0716, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.30528450012207, + "rewards/margins": 6.924792289733887, + "rewards/rejected": -2.6195082664489746, + "step": 1320 + }, + { + "epoch": 0.45, + "learning_rate": 4.7186201687020014e-07, + "logits/chosen": -1.4979273080825806, + "logits/rejected": -1.1541168689727783, + "logps/chosen": -365.5442810058594, + "logps/rejected": -421.36944580078125, + "loss": 0.0514, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.126973628997803, + "rewards/margins": 6.720233917236328, + "rewards/rejected": -2.5932602882385254, + "step": 1330 + }, + { + "epoch": 0.46, + "learning_rate": 4.7123253178899657e-07, + "logits/chosen": -1.528595209121704, + "logits/rejected": -1.1713165044784546, + "logps/chosen": -297.8626403808594, + "logps/rejected": -514.033203125, + "loss": 0.0598, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.073615074157715, + "rewards/margins": 7.79195499420166, + "rewards/rejected": -2.718339443206787, + "step": 1340 + }, + { + "epoch": 0.46, + "learning_rate": 4.70603046707793e-07, + "logits/chosen": -1.51167893409729, + "logits/rejected": -1.1580116748809814, + "logps/chosen": -312.6409606933594, + "logps/rejected": -650.9946899414062, + "loss": 0.0438, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.685142517089844, + "rewards/margins": 7.504499912261963, + "rewards/rejected": -2.8193564414978027, + "step": 1350 + }, + { + "epoch": 0.46, + "learning_rate": 4.699735616265894e-07, + "logits/chosen": -1.5130786895751953, + "logits/rejected": -1.2474558353424072, + "logps/chosen": -377.84332275390625, + "logps/rejected": -523.3129272460938, + "loss": 0.0939, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 5.0046186447143555, + "rewards/margins": 7.879345893859863, + "rewards/rejected": -2.874727725982666, + "step": 1360 + }, + { + "epoch": 0.47, + "learning_rate": 4.693440765453859e-07, + "logits/chosen": -1.490384817123413, + "logits/rejected": -1.2038155794143677, + "logps/chosen": -328.25286865234375, + "logps/rejected": -761.7498779296875, + "loss": 0.0683, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.487544298171997, + "rewards/margins": 6.234673500061035, + "rewards/rejected": -2.74712872505188, + "step": 1370 + }, + { + "epoch": 0.47, + "learning_rate": 4.687145914641823e-07, + "logits/chosen": -1.5184184312820435, + "logits/rejected": -1.1741076707839966, + "logps/chosen": -353.9080810546875, + "logps/rejected": -635.247802734375, + "loss": 0.0929, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.5855584144592285, + "rewards/margins": 7.686548709869385, + "rewards/rejected": -3.1009905338287354, + "step": 1380 + }, + { + "epoch": 0.47, + "learning_rate": 4.6808510638297873e-07, + "logits/chosen": -1.522088646888733, + "logits/rejected": -1.2158727645874023, + "logps/chosen": -351.5359191894531, + "logps/rejected": -644.1468505859375, + "loss": 0.0492, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.511094570159912, + "rewards/margins": 7.028720855712891, + "rewards/rejected": -2.5176265239715576, + "step": 1390 + }, + { + "epoch": 0.48, + "learning_rate": 4.674556213017751e-07, + "logits/chosen": -1.4998633861541748, + "logits/rejected": -1.1622101068496704, + "logps/chosen": -290.06842041015625, + "logps/rejected": -539.2385864257812, + "loss": 0.0542, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.643207550048828, + "rewards/margins": 8.093907356262207, + "rewards/rejected": -3.4506993293762207, + "step": 1400 + }, + { + "epoch": 0.48, + "eval_logits/chosen": -1.4971849918365479, + "eval_logits/rejected": -1.1767184734344482, + "eval_logps/chosen": -376.1544494628906, + "eval_logps/rejected": -591.0454711914062, + "eval_loss": 0.07047037780284882, + "eval_rewards/accuracies": 0.9739057421684265, + "eval_rewards/chosen": 4.37682580947876, + "eval_rewards/margins": 7.529730796813965, + "eval_rewards/rejected": -3.152905225753784, + "eval_runtime": 560.5241, + "eval_samples_per_second": 16.948, + "eval_steps_per_second": 0.53, + "step": 1400 + }, + { + "epoch": 0.48, + "learning_rate": 4.668261362205715e-07, + "logits/chosen": -1.5113210678100586, + "logits/rejected": -1.1424446105957031, + "logps/chosen": -411.2066345214844, + "logps/rejected": -590.9227294921875, + "loss": 0.0406, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.477583885192871, + "rewards/margins": 7.580605983734131, + "rewards/rejected": -3.1030232906341553, + "step": 1410 + }, + { + "epoch": 0.48, + "learning_rate": 4.6619665113936795e-07, + "logits/chosen": -1.4880897998809814, + "logits/rejected": -1.2573105096817017, + "logps/chosen": -389.78216552734375, + "logps/rejected": -645.0698852539062, + "loss": 0.0541, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.234439849853516, + "rewards/margins": 7.179551601409912, + "rewards/rejected": -2.9451115131378174, + "step": 1420 + }, + { + "epoch": 0.49, + "learning_rate": 4.6556716605816437e-07, + "logits/chosen": -1.4822556972503662, + "logits/rejected": -1.2449822425842285, + "logps/chosen": -369.6302185058594, + "logps/rejected": -580.4920654296875, + "loss": 0.1068, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.7416486740112305, + "rewards/margins": 7.650518894195557, + "rewards/rejected": -2.9088706970214844, + "step": 1430 + }, + { + "epoch": 0.49, + "learning_rate": 4.6493768097696085e-07, + "logits/chosen": -1.484257698059082, + "logits/rejected": -1.2153236865997314, + "logps/chosen": -476.56768798828125, + "logps/rejected": -481.64215087890625, + "loss": 0.0769, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.0144147872924805, + "rewards/margins": 6.710434913635254, + "rewards/rejected": -2.6960203647613525, + "step": 1440 + }, + { + "epoch": 0.49, + "learning_rate": 4.6430819589575727e-07, + "logits/chosen": -1.5058079957962036, + "logits/rejected": -1.1947168111801147, + "logps/chosen": -321.9506530761719, + "logps/rejected": -408.3023376464844, + "loss": 0.0574, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.223228931427002, + "rewards/margins": 7.1983208656311035, + "rewards/rejected": -2.9750924110412598, + "step": 1450 + }, + { + "epoch": 0.5, + "learning_rate": 4.636787108145537e-07, + "logits/chosen": -1.4803054332733154, + "logits/rejected": -1.2647490501403809, + "logps/chosen": -452.32525634765625, + "logps/rejected": -638.7464599609375, + "loss": 0.0685, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.184966087341309, + "rewards/margins": 6.936063289642334, + "rewards/rejected": -2.7510969638824463, + "step": 1460 + }, + { + "epoch": 0.5, + "learning_rate": 4.630492257333501e-07, + "logits/chosen": -1.535620093345642, + "logits/rejected": -1.1835294961929321, + "logps/chosen": -303.8070983886719, + "logps/rejected": -456.5039978027344, + "loss": 0.052, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.9574809074401855, + "rewards/margins": 6.753536224365234, + "rewards/rejected": -2.796055316925049, + "step": 1470 + }, + { + "epoch": 0.5, + "learning_rate": 4.624197406521465e-07, + "logits/chosen": -1.479691982269287, + "logits/rejected": -1.1124011278152466, + "logps/chosen": -291.1435852050781, + "logps/rejected": -527.3496704101562, + "loss": 0.0763, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.824152708053589, + "rewards/margins": 7.007646083831787, + "rewards/rejected": -3.1834936141967773, + "step": 1480 + }, + { + "epoch": 0.51, + "learning_rate": 4.617902555709429e-07, + "logits/chosen": -1.507291555404663, + "logits/rejected": -1.237839937210083, + "logps/chosen": -350.85577392578125, + "logps/rejected": -355.04254150390625, + "loss": 0.0685, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.336248874664307, + "rewards/margins": 7.563298225402832, + "rewards/rejected": -3.2270493507385254, + "step": 1490 + }, + { + "epoch": 0.51, + "learning_rate": 4.611607704897394e-07, + "logits/chosen": -1.5084125995635986, + "logits/rejected": -1.2210139036178589, + "logps/chosen": -451.03997802734375, + "logps/rejected": -565.1669921875, + "loss": 0.053, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.211254119873047, + "rewards/margins": 7.1787285804748535, + "rewards/rejected": -2.9674737453460693, + "step": 1500 + }, + { + "epoch": 0.51, + "eval_logits/chosen": -1.5056781768798828, + "eval_logits/rejected": -1.1796625852584839, + "eval_logps/chosen": -375.913330078125, + "eval_logps/rejected": -591.7845458984375, + "eval_loss": 0.06593501567840576, + "eval_rewards/accuracies": 0.9781144857406616, + "eval_rewards/chosen": 4.40094518661499, + "eval_rewards/margins": 7.62776517868042, + "eval_rewards/rejected": -3.2268192768096924, + "eval_runtime": 562.7492, + "eval_samples_per_second": 16.881, + "eval_steps_per_second": 0.528, + "step": 1500 + }, + { + "epoch": 0.51, + "learning_rate": 4.605312854085358e-07, + "logits/chosen": -1.5074743032455444, + "logits/rejected": -1.1961781978607178, + "logps/chosen": -428.0045471191406, + "logps/rejected": -502.8687438964844, + "loss": 0.0858, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 4.250360488891602, + "rewards/margins": 7.344718933105469, + "rewards/rejected": -3.0943586826324463, + "step": 1510 + }, + { + "epoch": 0.52, + "learning_rate": 4.5990180032733223e-07, + "logits/chosen": -1.4917250871658325, + "logits/rejected": -1.2147313356399536, + "logps/chosen": -383.5940856933594, + "logps/rejected": -645.8688354492188, + "loss": 0.0869, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.184120178222656, + "rewards/margins": 7.126355171203613, + "rewards/rejected": -2.9422342777252197, + "step": 1520 + }, + { + "epoch": 0.52, + "learning_rate": 4.5927231524612865e-07, + "logits/chosen": -1.507495641708374, + "logits/rejected": -1.1616270542144775, + "logps/chosen": -358.37762451171875, + "logps/rejected": -422.3489685058594, + "loss": 0.0442, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.923793315887451, + "rewards/margins": 7.375317573547363, + "rewards/rejected": -3.451524019241333, + "step": 1530 + }, + { + "epoch": 0.52, + "learning_rate": 4.586428301649251e-07, + "logits/chosen": -1.5201808214187622, + "logits/rejected": -1.2455103397369385, + "logps/chosen": -319.1759033203125, + "logps/rejected": -706.5762939453125, + "loss": 0.0708, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.504089832305908, + "rewards/margins": 7.234611511230469, + "rewards/rejected": -2.7305221557617188, + "step": 1540 + }, + { + "epoch": 0.53, + "learning_rate": 4.5801334508372145e-07, + "logits/chosen": -1.490705132484436, + "logits/rejected": -1.1946533918380737, + "logps/chosen": -400.42462158203125, + "logps/rejected": -477.0884704589844, + "loss": 0.0689, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 3.6824710369110107, + "rewards/margins": 6.845651149749756, + "rewards/rejected": -3.163179874420166, + "step": 1550 + }, + { + "epoch": 0.53, + "learning_rate": 4.573838600025179e-07, + "logits/chosen": -1.4747754335403442, + "logits/rejected": -1.193642258644104, + "logps/chosen": -477.27069091796875, + "logps/rejected": -653.9908447265625, + "loss": 0.061, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.9809257984161377, + "rewards/margins": 7.1034979820251465, + "rewards/rejected": -3.122572660446167, + "step": 1560 + }, + { + "epoch": 0.53, + "learning_rate": 4.5675437492131434e-07, + "logits/chosen": -1.5094478130340576, + "logits/rejected": -1.230588436126709, + "logps/chosen": -350.9673156738281, + "logps/rejected": -631.0221557617188, + "loss": 0.0514, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.611149311065674, + "rewards/margins": 8.26158618927002, + "rewards/rejected": -3.650437116622925, + "step": 1570 + }, + { + "epoch": 0.54, + "learning_rate": 4.5612488984011077e-07, + "logits/chosen": -1.5176488161087036, + "logits/rejected": -1.1694844961166382, + "logps/chosen": -378.80938720703125, + "logps/rejected": -530.5906372070312, + "loss": 0.071, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.952810287475586, + "rewards/margins": 7.061129093170166, + "rewards/rejected": -3.108318567276001, + "step": 1580 + }, + { + "epoch": 0.54, + "learning_rate": 4.554954047589072e-07, + "logits/chosen": -1.5134356021881104, + "logits/rejected": -1.1116071939468384, + "logps/chosen": -377.2322692871094, + "logps/rejected": -465.6065368652344, + "loss": 0.0643, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.143061637878418, + "rewards/margins": 7.735688209533691, + "rewards/rejected": -3.5926265716552734, + "step": 1590 + }, + { + "epoch": 0.54, + "learning_rate": 4.548659196777036e-07, + "logits/chosen": -1.4901964664459229, + "logits/rejected": -1.188247561454773, + "logps/chosen": -528.5428466796875, + "logps/rejected": -491.759033203125, + "loss": 0.0653, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.822246074676514, + "rewards/margins": 7.757513999938965, + "rewards/rejected": -2.9352688789367676, + "step": 1600 + }, + { + "epoch": 0.54, + "eval_logits/chosen": -1.4980485439300537, + "eval_logits/rejected": -1.168175220489502, + "eval_logps/chosen": -376.3569641113281, + "eval_logps/rejected": -591.5099487304688, + "eval_loss": 0.06801147758960724, + "eval_rewards/accuracies": 0.9781144857406616, + "eval_rewards/chosen": 4.356579303741455, + "eval_rewards/margins": 7.555936813354492, + "eval_rewards/rejected": -3.199357748031616, + "eval_runtime": 563.0826, + "eval_samples_per_second": 16.871, + "eval_steps_per_second": 0.527, + "step": 1600 + }, + { + "epoch": 0.55, + "learning_rate": 4.5423643459650003e-07, + "logits/chosen": -1.506359338760376, + "logits/rejected": -1.2093185186386108, + "logps/chosen": -386.937744140625, + "logps/rejected": -680.9929809570312, + "loss": 0.0693, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.096471309661865, + "rewards/margins": 7.564708709716797, + "rewards/rejected": -3.4682374000549316, + "step": 1610 + }, + { + "epoch": 0.55, + "learning_rate": 4.536069495152965e-07, + "logits/chosen": -1.5148202180862427, + "logits/rejected": -1.2351607084274292, + "logps/chosen": -367.4833068847656, + "logps/rejected": -660.5146484375, + "loss": 0.0791, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.265519142150879, + "rewards/margins": 7.527351379394531, + "rewards/rejected": -3.261831283569336, + "step": 1620 + }, + { + "epoch": 0.55, + "learning_rate": 4.529774644340929e-07, + "logits/chosen": -1.5120306015014648, + "logits/rejected": -1.1738466024398804, + "logps/chosen": -397.86138916015625, + "logps/rejected": -495.5794982910156, + "loss": 0.0603, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.230652809143066, + "rewards/margins": 7.56689453125, + "rewards/rejected": -3.3362419605255127, + "step": 1630 + }, + { + "epoch": 0.56, + "learning_rate": 4.523479793528893e-07, + "logits/chosen": -1.5158121585845947, + "logits/rejected": -1.1896756887435913, + "logps/chosen": -356.7864685058594, + "logps/rejected": -508.6224060058594, + "loss": 0.0602, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.039921760559082, + "rewards/margins": 7.325720310211182, + "rewards/rejected": -3.2857985496520996, + "step": 1640 + }, + { + "epoch": 0.56, + "learning_rate": 4.517184942716857e-07, + "logits/chosen": -1.50119149684906, + "logits/rejected": -1.2252064943313599, + "logps/chosen": -446.0723571777344, + "logps/rejected": -457.798828125, + "loss": 0.0559, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.727872371673584, + "rewards/margins": 6.772833347320557, + "rewards/rejected": -3.0449607372283936, + "step": 1650 + }, + { + "epoch": 0.56, + "learning_rate": 4.5108900919048215e-07, + "logits/chosen": -1.518447756767273, + "logits/rejected": -1.1939891576766968, + "logps/chosen": -397.7601013183594, + "logps/rejected": -521.1400146484375, + "loss": 0.0398, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.9350357055664062, + "rewards/margins": 7.125546455383301, + "rewards/rejected": -3.1905105113983154, + "step": 1660 + }, + { + "epoch": 0.57, + "learning_rate": 4.5045952410927857e-07, + "logits/chosen": -1.5098764896392822, + "logits/rejected": -1.2581671476364136, + "logps/chosen": -367.23773193359375, + "logps/rejected": -548.2299194335938, + "loss": 0.0485, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.583366394042969, + "rewards/margins": 7.744868278503418, + "rewards/rejected": -3.1615025997161865, + "step": 1670 + }, + { + "epoch": 0.57, + "learning_rate": 4.4983003902807505e-07, + "logits/chosen": -1.4997934103012085, + "logits/rejected": -1.1904213428497314, + "logps/chosen": -481.642578125, + "logps/rejected": -772.6529541015625, + "loss": 0.0426, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.6279144287109375, + "rewards/margins": 8.57080078125, + "rewards/rejected": -3.9428858757019043, + "step": 1680 + }, + { + "epoch": 0.57, + "learning_rate": 4.4920055394687147e-07, + "logits/chosen": -1.535498857498169, + "logits/rejected": -1.2115448713302612, + "logps/chosen": -322.3951721191406, + "logps/rejected": -504.7547912597656, + "loss": 0.0343, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.012532711029053, + "rewards/margins": 7.89080810546875, + "rewards/rejected": -3.8782737255096436, + "step": 1690 + }, + { + "epoch": 0.58, + "learning_rate": 4.485710688656679e-07, + "logits/chosen": -1.5308136940002441, + "logits/rejected": -1.2494858503341675, + "logps/chosen": -346.1595764160156, + "logps/rejected": -533.1759033203125, + "loss": 0.0634, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.455540180206299, + "rewards/margins": 8.111928939819336, + "rewards/rejected": -3.6563892364501953, + "step": 1700 + }, + { + "epoch": 0.58, + "eval_logits/chosen": -1.5175681114196777, + "eval_logits/rejected": -1.1987800598144531, + "eval_logps/chosen": -377.4786376953125, + "eval_logps/rejected": -597.483154296875, + "eval_loss": 0.055251117795705795, + "eval_rewards/accuracies": 0.9764309525489807, + "eval_rewards/chosen": 4.244411468505859, + "eval_rewards/margins": 8.041091918945312, + "eval_rewards/rejected": -3.796680212020874, + "eval_runtime": 562.4385, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 0.528, + "step": 1700 + }, + { + "epoch": 0.58, + "learning_rate": 4.4794158378446426e-07, + "logits/chosen": -1.5052268505096436, + "logits/rejected": -1.237554669380188, + "logps/chosen": -442.6924743652344, + "logps/rejected": -772.1906127929688, + "loss": 0.062, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.252256870269775, + "rewards/margins": 7.803114414215088, + "rewards/rejected": -3.5508575439453125, + "step": 1710 + }, + { + "epoch": 0.58, + "learning_rate": 4.473120987032607e-07, + "logits/chosen": -1.5154906511306763, + "logits/rejected": -1.2507343292236328, + "logps/chosen": -369.5589294433594, + "logps/rejected": -969.9295654296875, + "loss": 0.0775, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.055435657501221, + "rewards/margins": 8.542444229125977, + "rewards/rejected": -4.487008094787598, + "step": 1720 + }, + { + "epoch": 0.59, + "learning_rate": 4.466826136220571e-07, + "logits/chosen": -1.5469777584075928, + "logits/rejected": -1.157042145729065, + "logps/chosen": -334.96044921875, + "logps/rejected": -469.9227600097656, + "loss": 0.0544, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.539370059967041, + "rewards/margins": 8.756429672241211, + "rewards/rejected": -4.217058181762695, + "step": 1730 + }, + { + "epoch": 0.59, + "learning_rate": 4.460531285408536e-07, + "logits/chosen": -1.5168081521987915, + "logits/rejected": -1.2331032752990723, + "logps/chosen": -519.6591796875, + "logps/rejected": -489.31280517578125, + "loss": 0.0557, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8884081840515137, + "rewards/margins": 7.911124229431152, + "rewards/rejected": -4.022716045379639, + "step": 1740 + }, + { + "epoch": 0.59, + "learning_rate": 4.4542364345965e-07, + "logits/chosen": -1.5378777980804443, + "logits/rejected": -1.2380226850509644, + "logps/chosen": -445.75811767578125, + "logps/rejected": -485.1026306152344, + "loss": 0.0572, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.6285247802734375, + "rewards/margins": 9.083527565002441, + "rewards/rejected": -4.455002784729004, + "step": 1750 + }, + { + "epoch": 0.6, + "learning_rate": 4.4479415837844643e-07, + "logits/chosen": -1.53352952003479, + "logits/rejected": -1.2841970920562744, + "logps/chosen": -433.17144775390625, + "logps/rejected": -543.0953369140625, + "loss": 0.0374, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.4324493408203125, + "rewards/margins": 8.223987579345703, + "rewards/rejected": -3.7915382385253906, + "step": 1760 + }, + { + "epoch": 0.6, + "learning_rate": 4.4416467329724285e-07, + "logits/chosen": -1.5385587215423584, + "logits/rejected": -1.2588846683502197, + "logps/chosen": -317.13092041015625, + "logps/rejected": -734.0889282226562, + "loss": 0.0451, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.6969404220581055, + "rewards/margins": 8.934842109680176, + "rewards/rejected": -4.23790168762207, + "step": 1770 + }, + { + "epoch": 0.61, + "learning_rate": 4.435351882160392e-07, + "logits/chosen": -1.5031440258026123, + "logits/rejected": -1.2368115186691284, + "logps/chosen": -402.53765869140625, + "logps/rejected": -517.0589599609375, + "loss": 0.0702, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.7892088890075684, + "rewards/margins": 8.129257202148438, + "rewards/rejected": -4.340047836303711, + "step": 1780 + }, + { + "epoch": 0.61, + "learning_rate": 4.4290570313483564e-07, + "logits/chosen": -1.5281856060028076, + "logits/rejected": -1.2300007343292236, + "logps/chosen": -522.0947875976562, + "logps/rejected": -497.9402770996094, + "loss": 0.0602, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.384037971496582, + "rewards/margins": 8.281453132629395, + "rewards/rejected": -3.897414445877075, + "step": 1790 + }, + { + "epoch": 0.61, + "learning_rate": 4.422762180536321e-07, + "logits/chosen": -1.5527245998382568, + "logits/rejected": -1.2166951894760132, + "logps/chosen": -329.9271240234375, + "logps/rejected": -664.5421142578125, + "loss": 0.0574, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.23252010345459, + "rewards/margins": 7.895871162414551, + "rewards/rejected": -3.6633505821228027, + "step": 1800 + }, + { + "epoch": 0.61, + "eval_logits/chosen": -1.5299410820007324, + "eval_logits/rejected": -1.2222399711608887, + "eval_logps/chosen": -376.84649658203125, + "eval_logps/rejected": -598.8565673828125, + "eval_loss": 0.048959773033857346, + "eval_rewards/accuracies": 0.9789562225341797, + "eval_rewards/chosen": 4.307624816894531, + "eval_rewards/margins": 8.241641998291016, + "eval_rewards/rejected": -3.9340178966522217, + "eval_runtime": 562.7699, + "eval_samples_per_second": 16.881, + "eval_steps_per_second": 0.528, + "step": 1800 + }, + { + "epoch": 0.62, + "learning_rate": 4.4164673297242854e-07, + "logits/chosen": -1.544440507888794, + "logits/rejected": -1.1920955181121826, + "logps/chosen": -452.1075134277344, + "logps/rejected": -630.123046875, + "loss": 0.0405, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.471999645233154, + "rewards/margins": 8.641316413879395, + "rewards/rejected": -4.169317245483398, + "step": 1810 + }, + { + "epoch": 0.62, + "learning_rate": 4.4101724789122497e-07, + "logits/chosen": -1.5326340198516846, + "logits/rejected": -1.2949206829071045, + "logps/chosen": -387.7938537597656, + "logps/rejected": -852.89990234375, + "loss": 0.0399, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.200954437255859, + "rewards/margins": 8.185534477233887, + "rewards/rejected": -3.9845802783966064, + "step": 1820 + }, + { + "epoch": 0.62, + "learning_rate": 4.403877628100214e-07, + "logits/chosen": -1.5321006774902344, + "logits/rejected": -1.227288007736206, + "logps/chosen": -391.6388854980469, + "logps/rejected": -552.68115234375, + "loss": 0.0591, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.8821747303009033, + "rewards/margins": 7.14884090423584, + "rewards/rejected": -3.2666656970977783, + "step": 1830 + }, + { + "epoch": 0.63, + "learning_rate": 4.397582777288178e-07, + "logits/chosen": -1.518857717514038, + "logits/rejected": -1.1626359224319458, + "logps/chosen": -464.2646484375, + "logps/rejected": -687.5324096679688, + "loss": 0.0428, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.447764873504639, + "rewards/margins": 8.781205177307129, + "rewards/rejected": -4.333439826965332, + "step": 1840 + }, + { + "epoch": 0.63, + "learning_rate": 4.3912879264761423e-07, + "logits/chosen": -1.536007285118103, + "logits/rejected": -1.2505146265029907, + "logps/chosen": -397.3358154296875, + "logps/rejected": -569.1234741210938, + "loss": 0.0484, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.30867862701416, + "rewards/margins": 8.0530366897583, + "rewards/rejected": -3.744358539581299, + "step": 1850 + }, + { + "epoch": 0.63, + "learning_rate": 4.3849930756641066e-07, + "logits/chosen": -1.519709825515747, + "logits/rejected": -1.2987468242645264, + "logps/chosen": -365.6405029296875, + "logps/rejected": -682.091796875, + "loss": 0.0535, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.439739227294922, + "rewards/margins": 8.027719497680664, + "rewards/rejected": -3.587979793548584, + "step": 1860 + }, + { + "epoch": 0.64, + "learning_rate": 4.378698224852071e-07, + "logits/chosen": -1.5471036434173584, + "logits/rejected": -1.2738673686981201, + "logps/chosen": -380.0992431640625, + "logps/rejected": -520.5606689453125, + "loss": 0.0508, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8066792488098145, + "rewards/margins": 7.732104301452637, + "rewards/rejected": -3.9254257678985596, + "step": 1870 + }, + { + "epoch": 0.64, + "learning_rate": 4.372403374040035e-07, + "logits/chosen": -1.568764567375183, + "logits/rejected": -1.2342442274093628, + "logps/chosen": -319.6278076171875, + "logps/rejected": -544.8094482421875, + "loss": 0.0321, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.085818290710449, + "rewards/margins": 7.950621604919434, + "rewards/rejected": -3.8648037910461426, + "step": 1880 + }, + { + "epoch": 0.64, + "learning_rate": 4.366108523227999e-07, + "logits/chosen": -1.55269193649292, + "logits/rejected": -1.2731910943984985, + "logps/chosen": -417.42449951171875, + "logps/rejected": -563.5051879882812, + "loss": 0.0586, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.185043811798096, + "rewards/margins": 7.943039894104004, + "rewards/rejected": -3.7579948902130127, + "step": 1890 + }, + { + "epoch": 0.65, + "learning_rate": 4.3598136724159635e-07, + "logits/chosen": -1.539876103401184, + "logits/rejected": -1.2617334127426147, + "logps/chosen": -467.43023681640625, + "logps/rejected": -588.6837158203125, + "loss": 0.0518, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.698001384735107, + "rewards/margins": 8.413629531860352, + "rewards/rejected": -3.7156288623809814, + "step": 1900 + }, + { + "epoch": 0.65, + "eval_logits/chosen": -1.5390018224716187, + "eval_logits/rejected": -1.2285270690917969, + "eval_logps/chosen": -377.1029968261719, + "eval_logps/rejected": -599.8411865234375, + "eval_loss": 0.042402442544698715, + "eval_rewards/accuracies": 0.9856902360916138, + "eval_rewards/chosen": 4.2819743156433105, + "eval_rewards/margins": 8.314457893371582, + "eval_rewards/rejected": -4.032483100891113, + "eval_runtime": 561.2779, + "eval_samples_per_second": 16.926, + "eval_steps_per_second": 0.529, + "step": 1900 + }, + { + "epoch": 0.65, + "learning_rate": 4.3535188216039277e-07, + "logits/chosen": -1.5424803495407104, + "logits/rejected": -1.249219298362732, + "logps/chosen": -353.60699462890625, + "logps/rejected": -571.567626953125, + "loss": 0.0343, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.207779884338379, + "rewards/margins": 8.062320709228516, + "rewards/rejected": -3.8545405864715576, + "step": 1910 + }, + { + "epoch": 0.65, + "learning_rate": 4.3472239707918925e-07, + "logits/chosen": -1.5563422441482544, + "logits/rejected": -1.2192165851593018, + "logps/chosen": -304.295654296875, + "logps/rejected": -424.29730224609375, + "loss": 0.0296, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.354735374450684, + "rewards/margins": 8.167760848999023, + "rewards/rejected": -3.8130269050598145, + "step": 1920 + }, + { + "epoch": 0.66, + "learning_rate": 4.3409291199798567e-07, + "logits/chosen": -1.5476926565170288, + "logits/rejected": -1.2900664806365967, + "logps/chosen": -332.105712890625, + "logps/rejected": -742.2824096679688, + "loss": 0.0492, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.450702667236328, + "rewards/margins": 8.134770393371582, + "rewards/rejected": -3.684067487716675, + "step": 1930 + }, + { + "epoch": 0.66, + "learning_rate": 4.3346342691678204e-07, + "logits/chosen": -1.5577868223190308, + "logits/rejected": -1.2450841665267944, + "logps/chosen": -329.3459777832031, + "logps/rejected": -775.57763671875, + "loss": 0.0468, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.9985337257385254, + "rewards/margins": 8.838262557983398, + "rewards/rejected": -4.839729309082031, + "step": 1940 + }, + { + "epoch": 0.66, + "learning_rate": 4.3283394183557846e-07, + "logits/chosen": -1.515390157699585, + "logits/rejected": -1.246093511581421, + "logps/chosen": -554.6671752929688, + "logps/rejected": -527.1033935546875, + "loss": 0.0436, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.5228190422058105, + "rewards/margins": 8.128091812133789, + "rewards/rejected": -3.605271577835083, + "step": 1950 + }, + { + "epoch": 0.67, + "learning_rate": 4.322044567543749e-07, + "logits/chosen": -1.5471566915512085, + "logits/rejected": -1.2447845935821533, + "logps/chosen": -390.18218994140625, + "logps/rejected": -510.04913330078125, + "loss": 0.0546, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.299077033996582, + "rewards/margins": 8.56313419342041, + "rewards/rejected": -4.264057159423828, + "step": 1960 + }, + { + "epoch": 0.67, + "learning_rate": 4.315749716731713e-07, + "logits/chosen": -1.5380796194076538, + "logits/rejected": -1.2724196910858154, + "logps/chosen": -404.30548095703125, + "logps/rejected": -755.0365600585938, + "loss": 0.049, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.9293715953826904, + "rewards/margins": 7.587133884429932, + "rewards/rejected": -3.657761335372925, + "step": 1970 + }, + { + "epoch": 0.67, + "learning_rate": 4.309454865919678e-07, + "logits/chosen": -1.5430433750152588, + "logits/rejected": -1.3027762174606323, + "logps/chosen": -406.6456604003906, + "logps/rejected": -550.5321044921875, + "loss": 0.0447, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.133244037628174, + "rewards/margins": 8.012167930603027, + "rewards/rejected": -3.878924608230591, + "step": 1980 + }, + { + "epoch": 0.68, + "learning_rate": 4.303160015107642e-07, + "logits/chosen": -1.4892184734344482, + "logits/rejected": -1.2781140804290771, + "logps/chosen": -435.0169372558594, + "logps/rejected": -417.202880859375, + "loss": 0.0518, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.610887050628662, + "rewards/margins": 7.699536323547363, + "rewards/rejected": -4.088650226593018, + "step": 1990 + }, + { + "epoch": 0.68, + "learning_rate": 4.2968651642956063e-07, + "logits/chosen": -1.5247738361358643, + "logits/rejected": -1.2678837776184082, + "logps/chosen": -378.3091125488281, + "logps/rejected": -489.31170654296875, + "loss": 0.0376, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.8665459156036377, + "rewards/margins": 7.611338138580322, + "rewards/rejected": -3.7447915077209473, + "step": 2000 + }, + { + "epoch": 0.68, + "eval_logits/chosen": -1.5310970544815063, + "eval_logits/rejected": -1.227981448173523, + "eval_logps/chosen": -376.9974670410156, + "eval_logps/rejected": -600.6128540039062, + "eval_loss": 0.04229723662137985, + "eval_rewards/accuracies": 0.9840067625045776, + "eval_rewards/chosen": 4.292530536651611, + "eval_rewards/margins": 8.402181625366211, + "eval_rewards/rejected": -4.1096510887146, + "eval_runtime": 562.496, + "eval_samples_per_second": 16.889, + "eval_steps_per_second": 0.528, + "step": 2000 + }, + { + "epoch": 0.68, + "learning_rate": 4.29057031348357e-07, + "logits/chosen": -1.5398401021957397, + "logits/rejected": -1.2270891666412354, + "logps/chosen": -320.632080078125, + "logps/rejected": -594.7230834960938, + "loss": 0.0374, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.212071418762207, + "rewards/margins": 8.138837814331055, + "rewards/rejected": -3.926767349243164, + "step": 2010 + }, + { + "epoch": 0.69, + "learning_rate": 4.284275462671534e-07, + "logits/chosen": -1.5300395488739014, + "logits/rejected": -1.2081743478775024, + "logps/chosen": -326.40313720703125, + "logps/rejected": -681.0775146484375, + "loss": 0.05, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.38870096206665, + "rewards/margins": 8.826338768005371, + "rewards/rejected": -4.437638282775879, + "step": 2020 + }, + { + "epoch": 0.69, + "learning_rate": 4.2779806118594984e-07, + "logits/chosen": -1.5051060914993286, + "logits/rejected": -1.2562944889068604, + "logps/chosen": -431.694091796875, + "logps/rejected": -482.49932861328125, + "loss": 0.0467, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.9686760902404785, + "rewards/margins": 7.745867729187012, + "rewards/rejected": -3.7771923542022705, + "step": 2030 + }, + { + "epoch": 0.69, + "learning_rate": 4.271685761047463e-07, + "logits/chosen": -1.5369694232940674, + "logits/rejected": -1.2425193786621094, + "logps/chosen": -314.2693176269531, + "logps/rejected": -543.4124755859375, + "loss": 0.0359, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.711358547210693, + "rewards/margins": 9.045827865600586, + "rewards/rejected": -4.334470272064209, + "step": 2040 + }, + { + "epoch": 0.7, + "learning_rate": 4.2653909102354274e-07, + "logits/chosen": -1.5255584716796875, + "logits/rejected": -1.2049405574798584, + "logps/chosen": -394.3075256347656, + "logps/rejected": -499.96588134765625, + "loss": 0.0367, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.6083781719207764, + "rewards/margins": 7.717264652252197, + "rewards/rejected": -4.108885765075684, + "step": 2050 + }, + { + "epoch": 0.7, + "learning_rate": 4.2590960594233917e-07, + "logits/chosen": -1.5256173610687256, + "logits/rejected": -1.2074105739593506, + "logps/chosen": -405.6077880859375, + "logps/rejected": -514.4899291992188, + "loss": 0.0414, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.283182621002197, + "rewards/margins": 8.416224479675293, + "rewards/rejected": -4.1330413818359375, + "step": 2060 + }, + { + "epoch": 0.7, + "learning_rate": 4.252801208611356e-07, + "logits/chosen": -1.5494182109832764, + "logits/rejected": -1.2858482599258423, + "logps/chosen": -375.99591064453125, + "logps/rejected": -550.1553955078125, + "loss": 0.0299, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.863327503204346, + "rewards/margins": 9.237825393676758, + "rewards/rejected": -4.374497413635254, + "step": 2070 + }, + { + "epoch": 0.71, + "learning_rate": 4.24650635779932e-07, + "logits/chosen": -1.539885401725769, + "logits/rejected": -1.1814885139465332, + "logps/chosen": -297.65142822265625, + "logps/rejected": -425.31365966796875, + "loss": 0.0327, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.669113636016846, + "rewards/margins": 8.758540153503418, + "rewards/rejected": -4.089427947998047, + "step": 2080 + }, + { + "epoch": 0.71, + "learning_rate": 4.240211506987284e-07, + "logits/chosen": -1.5367573499679565, + "logits/rejected": -1.2334643602371216, + "logps/chosen": -405.69537353515625, + "logps/rejected": -570.9161376953125, + "loss": 0.0421, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.756657600402832, + "rewards/margins": 9.043152809143066, + "rewards/rejected": -4.28649377822876, + "step": 2090 + }, + { + "epoch": 0.71, + "learning_rate": 4.233916656175248e-07, + "logits/chosen": -1.5229039192199707, + "logits/rejected": -1.2804169654846191, + "logps/chosen": -428.501220703125, + "logps/rejected": -829.7421875, + "loss": 0.0339, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.044557571411133, + "rewards/margins": 8.1494722366333, + "rewards/rejected": -4.104913711547852, + "step": 2100 + }, + { + "epoch": 0.71, + "eval_logits/chosen": -1.5422166585922241, + "eval_logits/rejected": -1.2371177673339844, + "eval_logps/chosen": -377.4996032714844, + "eval_logps/rejected": -603.4857788085938, + "eval_loss": 0.04239306226372719, + "eval_rewards/accuracies": 0.9882155060768127, + "eval_rewards/chosen": 4.242310523986816, + "eval_rewards/margins": 8.639253616333008, + "eval_rewards/rejected": -4.39694356918335, + "eval_runtime": 560.2447, + "eval_samples_per_second": 16.957, + "eval_steps_per_second": 0.53, + "step": 2100 + }, + { + "epoch": 0.72, + "learning_rate": 4.227621805363213e-07, + "logits/chosen": -1.532913327217102, + "logits/rejected": -1.2680933475494385, + "logps/chosen": -349.3253479003906, + "logps/rejected": -506.9595642089844, + "loss": 0.0643, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.009087085723877, + "rewards/margins": 8.217607498168945, + "rewards/rejected": -4.208521366119385, + "step": 2110 + }, + { + "epoch": 0.72, + "learning_rate": 4.221326954551177e-07, + "logits/chosen": -1.5485155582427979, + "logits/rejected": -1.2827317714691162, + "logps/chosen": -341.2211608886719, + "logps/rejected": -703.082763671875, + "loss": 0.0357, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.5526018142700195, + "rewards/margins": 8.804117202758789, + "rewards/rejected": -4.251516342163086, + "step": 2120 + }, + { + "epoch": 0.72, + "learning_rate": 4.215032103739141e-07, + "logits/chosen": -1.5233689546585083, + "logits/rejected": -1.2221336364746094, + "logps/chosen": -386.87762451171875, + "logps/rejected": -646.2404174804688, + "loss": 0.0322, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.079219818115234, + "rewards/margins": 8.469517707824707, + "rewards/rejected": -4.3902974128723145, + "step": 2130 + }, + { + "epoch": 0.73, + "learning_rate": 4.2087372529271055e-07, + "logits/chosen": -1.533658742904663, + "logits/rejected": -1.218871831893921, + "logps/chosen": -350.62725830078125, + "logps/rejected": -464.7601623535156, + "loss": 0.0405, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.357470512390137, + "rewards/margins": 8.038476943969727, + "rewards/rejected": -3.681006908416748, + "step": 2140 + }, + { + "epoch": 0.73, + "learning_rate": 4.2024424021150697e-07, + "logits/chosen": -1.554192066192627, + "logits/rejected": -1.2222068309783936, + "logps/chosen": -328.8388977050781, + "logps/rejected": -640.3116455078125, + "loss": 0.047, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.5149736404418945, + "rewards/margins": 9.069557189941406, + "rewards/rejected": -4.554583549499512, + "step": 2150 + }, + { + "epoch": 0.73, + "learning_rate": 4.1961475513030334e-07, + "logits/chosen": -1.5296955108642578, + "logits/rejected": -1.2369314432144165, + "logps/chosen": -476.97161865234375, + "logps/rejected": -346.4566650390625, + "loss": 0.0451, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.924145221710205, + "rewards/margins": 8.177742958068848, + "rewards/rejected": -4.253598213195801, + "step": 2160 + }, + { + "epoch": 0.74, + "learning_rate": 4.189852700490998e-07, + "logits/chosen": -1.5315951108932495, + "logits/rejected": -1.2657560110092163, + "logps/chosen": -378.7848205566406, + "logps/rejected": -477.8885192871094, + "loss": 0.0428, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.24775505065918, + "rewards/margins": 8.36426067352295, + "rewards/rejected": -4.1165056228637695, + "step": 2170 + }, + { + "epoch": 0.74, + "learning_rate": 4.1835578496789624e-07, + "logits/chosen": -1.5282453298568726, + "logits/rejected": -1.2338428497314453, + "logps/chosen": -409.56146240234375, + "logps/rejected": -564.6990966796875, + "loss": 0.0319, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.238018035888672, + "rewards/margins": 8.548566818237305, + "rewards/rejected": -4.310548305511475, + "step": 2180 + }, + { + "epoch": 0.74, + "learning_rate": 4.1772629988669266e-07, + "logits/chosen": -1.516488790512085, + "logits/rejected": -1.2576848268508911, + "logps/chosen": -393.1282653808594, + "logps/rejected": -557.4349365234375, + "loss": 0.0267, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.546344757080078, + "rewards/margins": 9.40922737121582, + "rewards/rejected": -4.862882614135742, + "step": 2190 + }, + { + "epoch": 0.75, + "learning_rate": 4.170968148054891e-07, + "logits/chosen": -1.4799468517303467, + "logits/rejected": -1.2085634469985962, + "logps/chosen": -650.8047485351562, + "logps/rejected": -517.8765869140625, + "loss": 0.0323, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.5135245323181152, + "rewards/margins": 7.809338569641113, + "rewards/rejected": -4.29581356048584, + "step": 2200 + }, + { + "epoch": 0.75, + "eval_logits/chosen": -1.5286318063735962, + "eval_logits/rejected": -1.2197929620742798, + "eval_logps/chosen": -376.90679931640625, + "eval_logps/rejected": -603.0662841796875, + "eval_loss": 0.041807182133197784, + "eval_rewards/accuracies": 0.9831649661064148, + "eval_rewards/chosen": 4.301596641540527, + "eval_rewards/margins": 8.656588554382324, + "eval_rewards/rejected": -4.354991912841797, + "eval_runtime": 563.6418, + "eval_samples_per_second": 16.855, + "eval_steps_per_second": 0.527, + "step": 2200 + }, + { + "epoch": 0.75, + "learning_rate": 4.164673297242855e-07, + "logits/chosen": -1.5049222707748413, + "logits/rejected": -1.313751459121704, + "logps/chosen": -555.3997802734375, + "logps/rejected": -628.4067993164062, + "loss": 0.0463, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.7256641387939453, + "rewards/margins": 7.7227606773376465, + "rewards/rejected": -3.997096300125122, + "step": 2210 + }, + { + "epoch": 0.75, + "learning_rate": 4.1583784464308193e-07, + "logits/chosen": -1.5462090969085693, + "logits/rejected": -1.3063080310821533, + "logps/chosen": -286.6033630371094, + "logps/rejected": -531.8728637695312, + "loss": 0.0434, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.269690036773682, + "rewards/margins": 8.543286323547363, + "rewards/rejected": -4.273596286773682, + "step": 2220 + }, + { + "epoch": 0.76, + "learning_rate": 4.152083595618784e-07, + "logits/chosen": -1.5421247482299805, + "logits/rejected": -1.2891871929168701, + "logps/chosen": -301.48406982421875, + "logps/rejected": -478.1954040527344, + "loss": 0.0388, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.02353572845459, + "rewards/margins": 8.50233268737793, + "rewards/rejected": -4.478796482086182, + "step": 2230 + }, + { + "epoch": 0.76, + "learning_rate": 4.145788744806748e-07, + "logits/chosen": -1.5282901525497437, + "logits/rejected": -1.2586205005645752, + "logps/chosen": -317.33905029296875, + "logps/rejected": -551.4359130859375, + "loss": 0.029, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.127307891845703, + "rewards/margins": 8.632368087768555, + "rewards/rejected": -4.505061149597168, + "step": 2240 + }, + { + "epoch": 0.76, + "learning_rate": 4.139493893994712e-07, + "logits/chosen": -1.5504400730133057, + "logits/rejected": -1.2088788747787476, + "logps/chosen": -352.94439697265625, + "logps/rejected": -413.91632080078125, + "loss": 0.0244, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.164759635925293, + "rewards/margins": 8.671494483947754, + "rewards/rejected": -4.506735324859619, + "step": 2250 + }, + { + "epoch": 0.77, + "learning_rate": 4.133199043182676e-07, + "logits/chosen": -1.5537081956863403, + "logits/rejected": -1.188960313796997, + "logps/chosen": -323.99847412109375, + "logps/rejected": -584.423583984375, + "loss": 0.0433, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.01249885559082, + "rewards/margins": 8.70199203491211, + "rewards/rejected": -4.689493179321289, + "step": 2260 + }, + { + "epoch": 0.77, + "learning_rate": 4.1269041923706404e-07, + "logits/chosen": -1.5271246433258057, + "logits/rejected": -1.273632526397705, + "logps/chosen": -432.05615234375, + "logps/rejected": -546.5620727539062, + "loss": 0.0406, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.356633186340332, + "rewards/margins": 9.435579299926758, + "rewards/rejected": -5.078945159912109, + "step": 2270 + }, + { + "epoch": 0.77, + "learning_rate": 4.1206093415586047e-07, + "logits/chosen": -1.5644853115081787, + "logits/rejected": -1.3468990325927734, + "logps/chosen": -337.88018798828125, + "logps/rejected": -564.326904296875, + "loss": 0.0459, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.625871658325195, + "rewards/margins": 8.88005256652832, + "rewards/rejected": -4.254181861877441, + "step": 2280 + }, + { + "epoch": 0.78, + "learning_rate": 4.1143144907465694e-07, + "logits/chosen": -1.547375202178955, + "logits/rejected": -1.2993566989898682, + "logps/chosen": -333.759521484375, + "logps/rejected": -608.6412353515625, + "loss": 0.0556, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.400376319885254, + "rewards/margins": 8.578264236450195, + "rewards/rejected": -4.177888870239258, + "step": 2290 + }, + { + "epoch": 0.78, + "learning_rate": 4.1080196399345336e-07, + "logits/chosen": -1.542283296585083, + "logits/rejected": -1.3141086101531982, + "logps/chosen": -364.62347412109375, + "logps/rejected": -623.4647216796875, + "loss": 0.0267, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9940249919891357, + "rewards/margins": 8.561315536499023, + "rewards/rejected": -4.56728982925415, + "step": 2300 + }, + { + "epoch": 0.78, + "eval_logits/chosen": -1.5229995250701904, + "eval_logits/rejected": -1.2157522439956665, + "eval_logps/chosen": -378.2882080078125, + "eval_logps/rejected": -606.1791381835938, + "eval_loss": 0.038565024733543396, + "eval_rewards/accuracies": 0.9882155060768127, + "eval_rewards/chosen": 4.163454055786133, + "eval_rewards/margins": 8.829736709594727, + "eval_rewards/rejected": -4.666282653808594, + "eval_runtime": 562.3055, + "eval_samples_per_second": 16.895, + "eval_steps_per_second": 0.528, + "step": 2300 + }, + { + "epoch": 0.79, + "learning_rate": 4.101724789122498e-07, + "logits/chosen": -1.5153863430023193, + "logits/rejected": -1.2825404405593872, + "logps/chosen": -469.64190673828125, + "logps/rejected": -612.6008911132812, + "loss": 0.0435, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.869655132293701, + "rewards/margins": 8.503313064575195, + "rewards/rejected": -4.633657455444336, + "step": 2310 + }, + { + "epoch": 0.79, + "learning_rate": 4.0954299383104616e-07, + "logits/chosen": -1.5398194789886475, + "logits/rejected": -1.3200174570083618, + "logps/chosen": -369.828125, + "logps/rejected": -524.4706420898438, + "loss": 0.0205, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.964080333709717, + "rewards/margins": 7.756634712219238, + "rewards/rejected": -3.792553663253784, + "step": 2320 + }, + { + "epoch": 0.79, + "learning_rate": 4.089135087498426e-07, + "logits/chosen": -1.5082850456237793, + "logits/rejected": -1.180332064628601, + "logps/chosen": -585.714111328125, + "logps/rejected": -525.0670776367188, + "loss": 0.0519, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.500771999359131, + "rewards/margins": 8.17918586730957, + "rewards/rejected": -4.678414344787598, + "step": 2330 + }, + { + "epoch": 0.8, + "learning_rate": 4.08284023668639e-07, + "logits/chosen": -1.5206671953201294, + "logits/rejected": -1.20794677734375, + "logps/chosen": -374.95330810546875, + "logps/rejected": -915.6882934570312, + "loss": 0.026, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.186873435974121, + "rewards/margins": 9.288768768310547, + "rewards/rejected": -5.101895809173584, + "step": 2340 + }, + { + "epoch": 0.8, + "learning_rate": 4.076545385874355e-07, + "logits/chosen": -1.5187537670135498, + "logits/rejected": -1.2163974046707153, + "logps/chosen": -351.074951171875, + "logps/rejected": -441.74163818359375, + "loss": 0.025, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.453653812408447, + "rewards/margins": 9.313101768493652, + "rewards/rejected": -4.859448432922363, + "step": 2350 + }, + { + "epoch": 0.8, + "learning_rate": 4.070250535062319e-07, + "logits/chosen": -1.539375901222229, + "logits/rejected": -1.1889965534210205, + "logps/chosen": -307.39520263671875, + "logps/rejected": -513.2401123046875, + "loss": 0.0447, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.3630781173706055, + "rewards/margins": 9.987470626831055, + "rewards/rejected": -5.624392509460449, + "step": 2360 + }, + { + "epoch": 0.81, + "learning_rate": 4.063955684250283e-07, + "logits/chosen": -1.505784273147583, + "logits/rejected": -1.2568755149841309, + "logps/chosen": -363.8207702636719, + "logps/rejected": -698.6258544921875, + "loss": 0.0388, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 4.040036201477051, + "rewards/margins": 8.35677719116211, + "rewards/rejected": -4.3167405128479, + "step": 2370 + }, + { + "epoch": 0.81, + "learning_rate": 4.0576608334382475e-07, + "logits/chosen": -1.508847713470459, + "logits/rejected": -1.2050701379776, + "logps/chosen": -392.56903076171875, + "logps/rejected": -483.5404357910156, + "loss": 0.0205, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.085127830505371, + "rewards/margins": 9.434377670288086, + "rewards/rejected": -5.349248886108398, + "step": 2380 + }, + { + "epoch": 0.81, + "learning_rate": 4.051365982626211e-07, + "logits/chosen": -1.5226056575775146, + "logits/rejected": -1.240067720413208, + "logps/chosen": -388.24945068359375, + "logps/rejected": -711.7886962890625, + "loss": 0.0329, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.306074619293213, + "rewards/margins": 9.975082397460938, + "rewards/rejected": -5.669007301330566, + "step": 2390 + }, + { + "epoch": 0.82, + "learning_rate": 4.0450711318141754e-07, + "logits/chosen": -1.55873441696167, + "logits/rejected": -1.2647191286087036, + "logps/chosen": -345.291259765625, + "logps/rejected": -588.80029296875, + "loss": 0.0296, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.113094329833984, + "rewards/margins": 9.082869529724121, + "rewards/rejected": -4.969775199890137, + "step": 2400 + }, + { + "epoch": 0.82, + "eval_logits/chosen": -1.5267930030822754, + "eval_logits/rejected": -1.2346751689910889, + "eval_logps/chosen": -379.9329833984375, + "eval_logps/rejected": -613.5353393554688, + "eval_loss": 0.03161880746483803, + "eval_rewards/accuracies": 0.9907407164573669, + "eval_rewards/chosen": 3.998974561691284, + "eval_rewards/margins": 9.400873184204102, + "eval_rewards/rejected": -5.401898384094238, + "eval_runtime": 563.6413, + "eval_samples_per_second": 16.855, + "eval_steps_per_second": 0.527, + "step": 2400 + }, + { + "epoch": 0.82, + "learning_rate": 4.03877628100214e-07, + "logits/chosen": -1.5341020822525024, + "logits/rejected": -1.222886085510254, + "logps/chosen": -346.28973388671875, + "logps/rejected": -496.89703369140625, + "loss": 0.0248, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.076117515563965, + "rewards/margins": 8.80964469909668, + "rewards/rejected": -4.733527183532715, + "step": 2410 + }, + { + "epoch": 0.82, + "learning_rate": 4.0324814301901044e-07, + "logits/chosen": -1.5130847692489624, + "logits/rejected": -1.3096258640289307, + "logps/chosen": -426.77056884765625, + "logps/rejected": -514.4892578125, + "loss": 0.038, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.953486204147339, + "rewards/margins": 8.722945213317871, + "rewards/rejected": -4.769458770751953, + "step": 2420 + }, + { + "epoch": 0.83, + "learning_rate": 4.0261865793780686e-07, + "logits/chosen": -1.5152466297149658, + "logits/rejected": -1.3115499019622803, + "logps/chosen": -383.27496337890625, + "logps/rejected": -638.0162353515625, + "loss": 0.0357, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.2650957107543945, + "rewards/margins": 9.19054889678955, + "rewards/rejected": -4.92545223236084, + "step": 2430 + }, + { + "epoch": 0.83, + "learning_rate": 4.019891728566033e-07, + "logits/chosen": -1.5054044723510742, + "logits/rejected": -1.275802731513977, + "logps/chosen": -400.35601806640625, + "logps/rejected": -730.1416625976562, + "loss": 0.0291, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.373312473297119, + "rewards/margins": 8.42037582397461, + "rewards/rejected": -5.047062397003174, + "step": 2440 + }, + { + "epoch": 0.83, + "learning_rate": 4.013596877753997e-07, + "logits/chosen": -1.5292766094207764, + "logits/rejected": -1.2346397638320923, + "logps/chosen": -332.4093322753906, + "logps/rejected": -647.0613403320312, + "loss": 0.0246, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.051926136016846, + "rewards/margins": 8.744019508361816, + "rewards/rejected": -4.692093372344971, + "step": 2450 + }, + { + "epoch": 0.84, + "learning_rate": 4.0073020269419613e-07, + "logits/chosen": -1.5082180500030518, + "logits/rejected": -1.288271188735962, + "logps/chosen": -306.493896484375, + "logps/rejected": -610.8732299804688, + "loss": 0.0353, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.245797157287598, + "rewards/margins": 9.181161880493164, + "rewards/rejected": -4.93536376953125, + "step": 2460 + }, + { + "epoch": 0.84, + "learning_rate": 4.0010071761299255e-07, + "logits/chosen": -1.5287882089614868, + "logits/rejected": -1.2420308589935303, + "logps/chosen": -393.2254333496094, + "logps/rejected": -566.6124877929688, + "loss": 0.0389, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.39141321182251, + "rewards/margins": 9.52265453338623, + "rewards/rejected": -5.131240367889404, + "step": 2470 + }, + { + "epoch": 0.84, + "learning_rate": 3.99471232531789e-07, + "logits/chosen": -1.4780056476593018, + "logits/rejected": -1.2894506454467773, + "logps/chosen": -393.1434020996094, + "logps/rejected": -559.2583618164062, + "loss": 0.0243, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.251838207244873, + "rewards/margins": 9.137258529663086, + "rewards/rejected": -4.88541841506958, + "step": 2480 + }, + { + "epoch": 0.85, + "learning_rate": 3.988417474505854e-07, + "logits/chosen": -1.532470464706421, + "logits/rejected": -1.3118858337402344, + "logps/chosen": -381.4765625, + "logps/rejected": -462.48150634765625, + "loss": 0.036, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9704525470733643, + "rewards/margins": 9.043941497802734, + "rewards/rejected": -5.073489189147949, + "step": 2490 + }, + { + "epoch": 0.85, + "learning_rate": 3.982122623693818e-07, + "logits/chosen": -1.4979053735733032, + "logits/rejected": -1.1929200887680054, + "logps/chosen": -455.6328125, + "logps/rejected": -459.0594177246094, + "loss": 0.0289, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.064755916595459, + "rewards/margins": 9.953048706054688, + "rewards/rejected": -5.888293266296387, + "step": 2500 + }, + { + "epoch": 0.85, + "eval_logits/chosen": -1.510881781578064, + "eval_logits/rejected": -1.2152314186096191, + "eval_logps/chosen": -378.8585510253906, + "eval_logps/rejected": -611.6158447265625, + "eval_loss": 0.0314827486872673, + "eval_rewards/accuracies": 0.9907407164573669, + "eval_rewards/chosen": 4.106417655944824, + "eval_rewards/margins": 9.316365242004395, + "eval_rewards/rejected": -5.2099480628967285, + "eval_runtime": 564.0304, + "eval_samples_per_second": 16.843, + "eval_steps_per_second": 0.527, + "step": 2500 + }, + { + "epoch": 0.85, + "learning_rate": 3.9758277728817824e-07, + "logits/chosen": -1.5118482112884521, + "logits/rejected": -1.2419885396957397, + "logps/chosen": -401.3212890625, + "logps/rejected": -574.5198974609375, + "loss": 0.0597, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.206341743469238, + "rewards/margins": 8.69351863861084, + "rewards/rejected": -4.487177848815918, + "step": 2510 + }, + { + "epoch": 0.86, + "learning_rate": 3.9695329220697467e-07, + "logits/chosen": -1.5447056293487549, + "logits/rejected": -1.3146144151687622, + "logps/chosen": -318.0782165527344, + "logps/rejected": -829.5208129882812, + "loss": 0.0361, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.538026809692383, + "rewards/margins": 9.752525329589844, + "rewards/rejected": -5.214499473571777, + "step": 2520 + }, + { + "epoch": 0.86, + "learning_rate": 3.9632380712577114e-07, + "logits/chosen": -1.5258814096450806, + "logits/rejected": -1.3022167682647705, + "logps/chosen": -436.61614990234375, + "logps/rejected": -608.7833251953125, + "loss": 0.0387, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.6255292892456055, + "rewards/margins": 9.243282318115234, + "rewards/rejected": -4.617753982543945, + "step": 2530 + }, + { + "epoch": 0.86, + "learning_rate": 3.9569432204456756e-07, + "logits/chosen": -1.5103905200958252, + "logits/rejected": -1.2482256889343262, + "logps/chosen": -315.0715026855469, + "logps/rejected": -474.76531982421875, + "loss": 0.021, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.300446510314941, + "rewards/margins": 8.985579490661621, + "rewards/rejected": -4.685133457183838, + "step": 2540 + }, + { + "epoch": 0.87, + "learning_rate": 3.9506483696336393e-07, + "logits/chosen": -1.5255568027496338, + "logits/rejected": -1.2736884355545044, + "logps/chosen": -374.7379455566406, + "logps/rejected": -599.9737548828125, + "loss": 0.0349, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.317574501037598, + "rewards/margins": 9.692848205566406, + "rewards/rejected": -5.375273704528809, + "step": 2550 + }, + { + "epoch": 0.87, + "learning_rate": 3.9443535188216036e-07, + "logits/chosen": -1.5424623489379883, + "logits/rejected": -1.2154197692871094, + "logps/chosen": -319.04510498046875, + "logps/rejected": -670.06298828125, + "loss": 0.0236, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.232719421386719, + "rewards/margins": 9.533174514770508, + "rewards/rejected": -5.300455570220947, + "step": 2560 + }, + { + "epoch": 0.87, + "learning_rate": 3.938058668009568e-07, + "logits/chosen": -1.5329948663711548, + "logits/rejected": -1.2300420999526978, + "logps/chosen": -399.0639953613281, + "logps/rejected": -636.7238159179688, + "loss": 0.0424, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.576413631439209, + "rewards/margins": 8.986705780029297, + "rewards/rejected": -5.410292148590088, + "step": 2570 + }, + { + "epoch": 0.88, + "learning_rate": 3.931763817197532e-07, + "logits/chosen": -1.5329632759094238, + "logits/rejected": -1.2864882946014404, + "logps/chosen": -381.23663330078125, + "logps/rejected": -402.7793884277344, + "loss": 0.0256, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.674665927886963, + "rewards/margins": 8.285665512084961, + "rewards/rejected": -4.610999584197998, + "step": 2580 + }, + { + "epoch": 0.88, + "learning_rate": 3.925468966385497e-07, + "logits/chosen": -1.5401965379714966, + "logits/rejected": -1.232896089553833, + "logps/chosen": -447.3214416503906, + "logps/rejected": -378.88092041015625, + "loss": 0.0243, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.291096210479736, + "rewards/margins": 9.32802677154541, + "rewards/rejected": -5.036929607391357, + "step": 2590 + }, + { + "epoch": 0.88, + "learning_rate": 3.919174115573461e-07, + "logits/chosen": -1.5437438488006592, + "logits/rejected": -1.1860885620117188, + "logps/chosen": -329.0899658203125, + "logps/rejected": -600.9002685546875, + "loss": 0.0326, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.928496837615967, + "rewards/margins": 9.814313888549805, + "rewards/rejected": -5.885817050933838, + "step": 2600 + }, + { + "epoch": 0.88, + "eval_logits/chosen": -1.5353665351867676, + "eval_logits/rejected": -1.243375301361084, + "eval_logps/chosen": -379.0233459472656, + "eval_logps/rejected": -613.5463256835938, + "eval_loss": 0.02802525833249092, + "eval_rewards/accuracies": 0.9907407164573669, + "eval_rewards/chosen": 4.089937686920166, + "eval_rewards/margins": 9.492928504943848, + "eval_rewards/rejected": -5.40299129486084, + "eval_runtime": 562.2813, + "eval_samples_per_second": 16.895, + "eval_steps_per_second": 0.528, + "step": 2600 + }, + { + "epoch": 0.89, + "learning_rate": 3.912879264761425e-07, + "logits/chosen": -1.5287754535675049, + "logits/rejected": -1.3370130062103271, + "logps/chosen": -390.73492431640625, + "logps/rejected": -673.8240356445312, + "loss": 0.0248, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.9431278705596924, + "rewards/margins": 8.70613956451416, + "rewards/rejected": -4.7630109786987305, + "step": 2610 + }, + { + "epoch": 0.89, + "learning_rate": 3.906584413949389e-07, + "logits/chosen": -1.5354580879211426, + "logits/rejected": -1.3661589622497559, + "logps/chosen": -477.1390075683594, + "logps/rejected": -534.7681884765625, + "loss": 0.031, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.629973888397217, + "rewards/margins": 8.619714736938477, + "rewards/rejected": -4.989741325378418, + "step": 2620 + }, + { + "epoch": 0.89, + "learning_rate": 3.900289563137353e-07, + "logits/chosen": -1.5522435903549194, + "logits/rejected": -1.2485020160675049, + "logps/chosen": -346.8073425292969, + "logps/rejected": -616.5460205078125, + "loss": 0.0234, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.9878170490264893, + "rewards/margins": 9.264127731323242, + "rewards/rejected": -5.276310920715332, + "step": 2630 + }, + { + "epoch": 0.9, + "learning_rate": 3.8939947123253174e-07, + "logits/chosen": -1.5199863910675049, + "logits/rejected": -1.2665718793869019, + "logps/chosen": -358.22784423828125, + "logps/rejected": -507.31884765625, + "loss": 0.0276, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.361278533935547, + "rewards/margins": 8.100163459777832, + "rewards/rejected": -4.738885402679443, + "step": 2640 + }, + { + "epoch": 0.9, + "learning_rate": 3.887699861513282e-07, + "logits/chosen": -1.5518028736114502, + "logits/rejected": -1.2743380069732666, + "logps/chosen": -291.95867919921875, + "logps/rejected": -467.4893493652344, + "loss": 0.0176, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.034001350402832, + "rewards/margins": 9.228925704956055, + "rewards/rejected": -5.194925308227539, + "step": 2650 + }, + { + "epoch": 0.9, + "learning_rate": 3.8814050107012464e-07, + "logits/chosen": -1.5411412715911865, + "logits/rejected": -1.2300548553466797, + "logps/chosen": -396.4468078613281, + "logps/rejected": -430.2041015625, + "loss": 0.0262, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.364605903625488, + "rewards/margins": 9.552851676940918, + "rewards/rejected": -5.18824577331543, + "step": 2660 + }, + { + "epoch": 0.91, + "learning_rate": 3.8751101598892106e-07, + "logits/chosen": -1.5622729063034058, + "logits/rejected": -1.3079333305358887, + "logps/chosen": -338.65924072265625, + "logps/rejected": -719.083984375, + "loss": 0.0199, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.658270835876465, + "rewards/margins": 9.85779094696045, + "rewards/rejected": -5.199519157409668, + "step": 2670 + }, + { + "epoch": 0.91, + "learning_rate": 3.868815309077175e-07, + "logits/chosen": -1.5321825742721558, + "logits/rejected": -1.2312917709350586, + "logps/chosen": -394.9886779785156, + "logps/rejected": -772.0485229492188, + "loss": 0.0147, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8315110206604004, + "rewards/margins": 9.679927825927734, + "rewards/rejected": -5.848416328430176, + "step": 2680 + }, + { + "epoch": 0.91, + "learning_rate": 3.862520458265139e-07, + "logits/chosen": -1.5324981212615967, + "logits/rejected": -1.1967226266860962, + "logps/chosen": -429.6515197753906, + "logps/rejected": -791.8549194335938, + "loss": 0.0239, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7250256538391113, + "rewards/margins": 9.896958351135254, + "rewards/rejected": -6.171932697296143, + "step": 2690 + }, + { + "epoch": 0.92, + "learning_rate": 3.856225607453103e-07, + "logits/chosen": -1.546281337738037, + "logits/rejected": -1.282551884651184, + "logps/chosen": -263.76092529296875, + "logps/rejected": -506.40283203125, + "loss": 0.025, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.27431058883667, + "rewards/margins": 9.976223945617676, + "rewards/rejected": -5.701913356781006, + "step": 2700 + }, + { + "epoch": 0.92, + "eval_logits/chosen": -1.5268328189849854, + "eval_logits/rejected": -1.2282987833023071, + "eval_logps/chosen": -379.46002197265625, + "eval_logps/rejected": -611.9110107421875, + "eval_loss": 0.03331367298960686, + "eval_rewards/accuracies": 0.9924242496490479, + "eval_rewards/chosen": 4.046272277832031, + "eval_rewards/margins": 9.285730361938477, + "eval_rewards/rejected": -5.239457607269287, + "eval_runtime": 562.9532, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 0.528, + "step": 2700 + }, + { + "epoch": 0.92, + "learning_rate": 3.8499307566410675e-07, + "logits/chosen": -1.5378925800323486, + "logits/rejected": -1.2586190700531006, + "logps/chosen": -326.33538818359375, + "logps/rejected": -565.2320556640625, + "loss": 0.0277, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9691033363342285, + "rewards/margins": 9.820357322692871, + "rewards/rejected": -5.851254940032959, + "step": 2710 + }, + { + "epoch": 0.92, + "learning_rate": 3.843635905829032e-07, + "logits/chosen": -1.507794737815857, + "logits/rejected": -1.2618348598480225, + "logps/chosen": -481.65924072265625, + "logps/rejected": -708.0718994140625, + "loss": 0.0358, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.332957744598389, + "rewards/margins": 9.97797966003418, + "rewards/rejected": -5.645020961761475, + "step": 2720 + }, + { + "epoch": 0.93, + "learning_rate": 3.837341055016996e-07, + "logits/chosen": -1.5221909284591675, + "logits/rejected": -1.2824532985687256, + "logps/chosen": -321.3945007324219, + "logps/rejected": -687.73779296875, + "loss": 0.0197, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9533684253692627, + "rewards/margins": 8.84711742401123, + "rewards/rejected": -4.893749237060547, + "step": 2730 + }, + { + "epoch": 0.93, + "learning_rate": 3.83104620420496e-07, + "logits/chosen": -1.4993531703948975, + "logits/rejected": -1.299523115158081, + "logps/chosen": -396.795166015625, + "logps/rejected": -920.8742065429688, + "loss": 0.0129, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4250988960266113, + "rewards/margins": 9.259854316711426, + "rewards/rejected": -5.834755897521973, + "step": 2740 + }, + { + "epoch": 0.93, + "learning_rate": 3.8247513533929244e-07, + "logits/chosen": -1.5348014831542969, + "logits/rejected": -1.2512391805648804, + "logps/chosen": -322.89642333984375, + "logps/rejected": -759.0184326171875, + "loss": 0.0175, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8769328594207764, + "rewards/margins": 9.700733184814453, + "rewards/rejected": -5.823800086975098, + "step": 2750 + }, + { + "epoch": 0.94, + "learning_rate": 3.8184565025808887e-07, + "logits/chosen": -1.527889609336853, + "logits/rejected": -1.250536322593689, + "logps/chosen": -431.0650329589844, + "logps/rejected": -663.8050537109375, + "loss": 0.0213, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.874131679534912, + "rewards/margins": 9.852746963500977, + "rewards/rejected": -5.978613376617432, + "step": 2760 + }, + { + "epoch": 0.94, + "learning_rate": 3.8121616517688534e-07, + "logits/chosen": -1.531960129737854, + "logits/rejected": -1.2742459774017334, + "logps/chosen": -310.0575866699219, + "logps/rejected": -606.11572265625, + "loss": 0.0235, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.596333980560303, + "rewards/margins": 10.050146102905273, + "rewards/rejected": -5.453811168670654, + "step": 2770 + }, + { + "epoch": 0.94, + "learning_rate": 3.805866800956817e-07, + "logits/chosen": -1.5259926319122314, + "logits/rejected": -1.1937414407730103, + "logps/chosen": -317.17852783203125, + "logps/rejected": -362.4829406738281, + "loss": 0.0298, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.656906843185425, + "rewards/margins": 9.111510276794434, + "rewards/rejected": -5.454604148864746, + "step": 2780 + }, + { + "epoch": 0.95, + "learning_rate": 3.7995719501447813e-07, + "logits/chosen": -1.5253241062164307, + "logits/rejected": -1.2063783407211304, + "logps/chosen": -328.53070068359375, + "logps/rejected": -502.744873046875, + "loss": 0.0175, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.340774059295654, + "rewards/margins": 9.440818786621094, + "rewards/rejected": -5.100043773651123, + "step": 2790 + }, + { + "epoch": 0.95, + "learning_rate": 3.7932770993327456e-07, + "logits/chosen": -1.5284957885742188, + "logits/rejected": -1.1917234659194946, + "logps/chosen": -336.49969482421875, + "logps/rejected": -846.9683837890625, + "loss": 0.0273, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.0211567878723145, + "rewards/margins": 10.17439079284668, + "rewards/rejected": -6.153233528137207, + "step": 2800 + }, + { + "epoch": 0.95, + "eval_logits/chosen": -1.5270861387252808, + "eval_logits/rejected": -1.2253377437591553, + "eval_logps/chosen": -378.8770446777344, + "eval_logps/rejected": -614.4639282226562, + "eval_loss": 0.02587779611349106, + "eval_rewards/accuracies": 0.997474730014801, + "eval_rewards/chosen": 4.104568004608154, + "eval_rewards/margins": 9.599316596984863, + "eval_rewards/rejected": -5.494749069213867, + "eval_runtime": 563.8045, + "eval_samples_per_second": 16.85, + "eval_steps_per_second": 0.527, + "step": 2800 + }, + { + "epoch": 0.96, + "learning_rate": 3.78698224852071e-07, + "logits/chosen": -1.5471569299697876, + "logits/rejected": -1.2516077756881714, + "logps/chosen": -305.4718322753906, + "logps/rejected": -416.7831115722656, + "loss": 0.0373, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.316678047180176, + "rewards/margins": 9.637255668640137, + "rewards/rejected": -5.320578575134277, + "step": 2810 + }, + { + "epoch": 0.96, + "learning_rate": 3.780687397708674e-07, + "logits/chosen": -1.5574287176132202, + "logits/rejected": -1.2676112651824951, + "logps/chosen": -336.58599853515625, + "logps/rejected": -439.8106384277344, + "loss": 0.0214, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.731220245361328, + "rewards/margins": 8.744790077209473, + "rewards/rejected": -5.013570785522461, + "step": 2820 + }, + { + "epoch": 0.96, + "learning_rate": 3.774392546896638e-07, + "logits/chosen": -1.5384368896484375, + "logits/rejected": -1.2853554487228394, + "logps/chosen": -421.58978271484375, + "logps/rejected": -723.2725830078125, + "loss": 0.0239, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9282431602478027, + "rewards/margins": 9.140935897827148, + "rewards/rejected": -5.212693691253662, + "step": 2830 + }, + { + "epoch": 0.97, + "learning_rate": 3.768097696084603e-07, + "logits/chosen": -1.5289303064346313, + "logits/rejected": -1.2238986492156982, + "logps/chosen": -370.78082275390625, + "logps/rejected": -599.9653930664062, + "loss": 0.0229, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.519654273986816, + "rewards/margins": 9.799467086791992, + "rewards/rejected": -5.279813289642334, + "step": 2840 + }, + { + "epoch": 0.97, + "learning_rate": 3.761802845272567e-07, + "logits/chosen": -1.5358774662017822, + "logits/rejected": -1.2638745307922363, + "logps/chosen": -379.4000244140625, + "logps/rejected": -508.39984130859375, + "loss": 0.0286, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.119468688964844, + "rewards/margins": 9.560442924499512, + "rewards/rejected": -5.440975666046143, + "step": 2850 + }, + { + "epoch": 0.97, + "learning_rate": 3.755507994460531e-07, + "logits/chosen": -1.5533860921859741, + "logits/rejected": -1.2589747905731201, + "logps/chosen": -344.5013732910156, + "logps/rejected": -486.7314453125, + "loss": 0.0179, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.358794212341309, + "rewards/margins": 9.376945495605469, + "rewards/rejected": -5.018151760101318, + "step": 2860 + }, + { + "epoch": 0.98, + "learning_rate": 3.749213143648495e-07, + "logits/chosen": -1.5261322259902954, + "logits/rejected": -1.2655422687530518, + "logps/chosen": -451.4296875, + "logps/rejected": -684.3489990234375, + "loss": 0.0394, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.8236517906188965, + "rewards/margins": 8.851622581481934, + "rewards/rejected": -5.027971267700195, + "step": 2870 + }, + { + "epoch": 0.98, + "learning_rate": 3.7429182928364594e-07, + "logits/chosen": -1.5526050329208374, + "logits/rejected": -1.310326337814331, + "logps/chosen": -395.00360107421875, + "logps/rejected": -890.1891479492188, + "loss": 0.0323, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.6357905864715576, + "rewards/margins": 9.361474990844727, + "rewards/rejected": -5.725684642791748, + "step": 2880 + }, + { + "epoch": 0.98, + "learning_rate": 3.7366234420244236e-07, + "logits/chosen": -1.5236667394638062, + "logits/rejected": -1.2612967491149902, + "logps/chosen": -533.245849609375, + "logps/rejected": -674.8106079101562, + "loss": 0.018, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.7383265495300293, + "rewards/margins": 9.209358215332031, + "rewards/rejected": -5.47103214263916, + "step": 2890 + }, + { + "epoch": 0.99, + "learning_rate": 3.7303285912123884e-07, + "logits/chosen": -1.5468064546585083, + "logits/rejected": -1.281954050064087, + "logps/chosen": -342.9344177246094, + "logps/rejected": -551.2838134765625, + "loss": 0.0197, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.083136081695557, + "rewards/margins": 9.778923988342285, + "rewards/rejected": -5.695788383483887, + "step": 2900 + }, + { + "epoch": 0.99, + "eval_logits/chosen": -1.5321004390716553, + "eval_logits/rejected": -1.232041358947754, + "eval_logps/chosen": -378.2807922363281, + "eval_logps/rejected": -612.9524536132812, + "eval_loss": 0.036046989262104034, + "eval_rewards/accuracies": 0.9907407164573669, + "eval_rewards/chosen": 4.164193153381348, + "eval_rewards/margins": 9.507803916931152, + "eval_rewards/rejected": -5.343610763549805, + "eval_runtime": 563.7807, + "eval_samples_per_second": 16.851, + "eval_steps_per_second": 0.527, + "step": 2900 + }, + { + "epoch": 0.99, + "learning_rate": 3.7240337404003526e-07, + "logits/chosen": -1.5427501201629639, + "logits/rejected": -1.3076882362365723, + "logps/chosen": -328.9602355957031, + "logps/rejected": -725.5062866210938, + "loss": 0.0172, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.543259859085083, + "rewards/margins": 8.809348106384277, + "rewards/rejected": -5.266088485717773, + "step": 2910 + }, + { + "epoch": 0.99, + "learning_rate": 3.717738889588317e-07, + "logits/chosen": -1.527445673942566, + "logits/rejected": -1.2516653537750244, + "logps/chosen": -339.581298828125, + "logps/rejected": -767.8448486328125, + "loss": 0.0197, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8266119956970215, + "rewards/margins": 9.275941848754883, + "rewards/rejected": -5.449328899383545, + "step": 2920 + }, + { + "epoch": 1.0, + "learning_rate": 3.7114440387762805e-07, + "logits/chosen": -1.5251435041427612, + "logits/rejected": -1.282825231552124, + "logps/chosen": -319.2281799316406, + "logps/rejected": -426.3926696777344, + "loss": 0.0222, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8006958961486816, + "rewards/margins": 8.93952751159668, + "rewards/rejected": -5.138833045959473, + "step": 2930 + }, + { + "epoch": 1.0, + "learning_rate": 3.705149187964245e-07, + "logits/chosen": -1.5176670551300049, + "logits/rejected": -1.2432739734649658, + "logps/chosen": -399.40960693359375, + "logps/rejected": -545.31884765625, + "loss": 0.0326, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.027955055236816, + "rewards/margins": 8.990588188171387, + "rewards/rejected": -4.96263313293457, + "step": 2940 + }, + { + "epoch": 1.0, + "learning_rate": 3.698854337152209e-07, + "logits/chosen": -1.5423166751861572, + "logits/rejected": -1.2030185461044312, + "logps/chosen": -318.4764709472656, + "logps/rejected": -504.6060485839844, + "loss": 0.0144, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.160604000091553, + "rewards/margins": 9.719947814941406, + "rewards/rejected": -5.559342384338379, + "step": 2950 + }, + { + "epoch": 1.01, + "learning_rate": 3.692559486340174e-07, + "logits/chosen": -1.5226755142211914, + "logits/rejected": -1.2700029611587524, + "logps/chosen": -363.5705871582031, + "logps/rejected": -967.1926879882812, + "loss": 0.0203, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.36875057220459, + "rewards/margins": 10.397601127624512, + "rewards/rejected": -6.028850555419922, + "step": 2960 + }, + { + "epoch": 1.01, + "learning_rate": 3.686264635528138e-07, + "logits/chosen": -1.5728189945220947, + "logits/rejected": -1.29799485206604, + "logps/chosen": -304.05059814453125, + "logps/rejected": -298.78167724609375, + "loss": 0.0294, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.294428825378418, + "rewards/margins": 9.008630752563477, + "rewards/rejected": -4.714202404022217, + "step": 2970 + }, + { + "epoch": 1.01, + "learning_rate": 3.679969784716102e-07, + "logits/chosen": -1.5687190294265747, + "logits/rejected": -1.2704347372055054, + "logps/chosen": -287.04669189453125, + "logps/rejected": -531.3434448242188, + "loss": 0.0221, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.327779293060303, + "rewards/margins": 9.918437004089355, + "rewards/rejected": -5.590658187866211, + "step": 2980 + }, + { + "epoch": 1.02, + "learning_rate": 3.6736749339040664e-07, + "logits/chosen": -1.54791259765625, + "logits/rejected": -1.3313263654708862, + "logps/chosen": -392.12286376953125, + "logps/rejected": -647.05615234375, + "loss": 0.017, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.109222412109375, + "rewards/margins": 9.626128196716309, + "rewards/rejected": -5.516905784606934, + "step": 2990 + }, + { + "epoch": 1.02, + "learning_rate": 3.6673800830920307e-07, + "logits/chosen": -1.5233992338180542, + "logits/rejected": -1.2798643112182617, + "logps/chosen": -369.7725524902344, + "logps/rejected": -553.8580322265625, + "loss": 0.0196, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.1038079261779785, + "rewards/margins": 8.883817672729492, + "rewards/rejected": -4.780010223388672, + "step": 3000 + }, + { + "epoch": 1.02, + "eval_logits/chosen": -1.5308232307434082, + "eval_logits/rejected": -1.2357827425003052, + "eval_logps/chosen": -381.1749267578125, + "eval_logps/rejected": -619.3846435546875, + "eval_loss": 0.026668569073081017, + "eval_rewards/accuracies": 0.9949495196342468, + "eval_rewards/chosen": 3.874782085418701, + "eval_rewards/margins": 9.861611366271973, + "eval_rewards/rejected": -5.98682975769043, + "eval_runtime": 562.9424, + "eval_samples_per_second": 16.876, + "eval_steps_per_second": 0.528, + "step": 3000 + }, + { + "epoch": 1.02, + "learning_rate": 3.6610852322799943e-07, + "logits/chosen": -1.5399169921875, + "logits/rejected": -1.306549310684204, + "logps/chosen": -449.6426696777344, + "logps/rejected": -627.6109619140625, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.017568111419678, + "rewards/margins": 9.882436752319336, + "rewards/rejected": -5.8648681640625, + "step": 3010 + }, + { + "epoch": 1.03, + "learning_rate": 3.654790381467959e-07, + "logits/chosen": -1.5538936853408813, + "logits/rejected": -1.2739226818084717, + "logps/chosen": -341.322265625, + "logps/rejected": -423.27392578125, + "loss": 0.0203, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7740719318389893, + "rewards/margins": 9.619070053100586, + "rewards/rejected": -5.844997882843018, + "step": 3020 + }, + { + "epoch": 1.03, + "learning_rate": 3.6484955306559233e-07, + "logits/chosen": -1.535865068435669, + "logits/rejected": -1.325559377670288, + "logps/chosen": -470.14288330078125, + "logps/rejected": -489.502685546875, + "loss": 0.0143, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.2607011795043945, + "rewards/margins": 9.805902481079102, + "rewards/rejected": -5.545201778411865, + "step": 3030 + }, + { + "epoch": 1.03, + "learning_rate": 3.6422006798438876e-07, + "logits/chosen": -1.5578217506408691, + "logits/rejected": -1.2919658422470093, + "logps/chosen": -354.1922302246094, + "logps/rejected": -561.917236328125, + "loss": 0.0296, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.6218857765197754, + "rewards/margins": 9.295665740966797, + "rewards/rejected": -5.673779487609863, + "step": 3040 + }, + { + "epoch": 1.04, + "learning_rate": 3.635905829031852e-07, + "logits/chosen": -1.524993658065796, + "logits/rejected": -1.2915582656860352, + "logps/chosen": -453.56231689453125, + "logps/rejected": -486.10882568359375, + "loss": 0.0108, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5384349822998047, + "rewards/margins": 9.471923828125, + "rewards/rejected": -5.933487892150879, + "step": 3050 + }, + { + "epoch": 1.04, + "learning_rate": 3.629610978219816e-07, + "logits/chosen": -1.5351612567901611, + "logits/rejected": -1.298592209815979, + "logps/chosen": -367.17657470703125, + "logps/rejected": -793.2417602539062, + "loss": 0.016, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.294528007507324, + "rewards/margins": 10.177715301513672, + "rewards/rejected": -5.883187294006348, + "step": 3060 + }, + { + "epoch": 1.04, + "learning_rate": 3.62331612740778e-07, + "logits/chosen": -1.5254700183868408, + "logits/rejected": -1.290189504623413, + "logps/chosen": -445.72198486328125, + "logps/rejected": -591.864990234375, + "loss": 0.0161, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.142092704772949, + "rewards/margins": 9.956101417541504, + "rewards/rejected": -5.8140082359313965, + "step": 3070 + }, + { + "epoch": 1.05, + "learning_rate": 3.617021276595745e-07, + "logits/chosen": -1.5166041851043701, + "logits/rejected": -1.3582340478897095, + "logps/chosen": -583.5050048828125, + "logps/rejected": -504.0105895996094, + "loss": 0.0164, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8232510089874268, + "rewards/margins": 9.612903594970703, + "rewards/rejected": -5.789652347564697, + "step": 3080 + }, + { + "epoch": 1.05, + "learning_rate": 3.6107264257837087e-07, + "logits/chosen": -1.5523070096969604, + "logits/rejected": -1.2708463668823242, + "logps/chosen": -379.41131591796875, + "logps/rejected": -700.8568115234375, + "loss": 0.0116, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.978430986404419, + "rewards/margins": 10.113374710083008, + "rewards/rejected": -6.134943962097168, + "step": 3090 + }, + { + "epoch": 1.05, + "learning_rate": 3.604431574971673e-07, + "logits/chosen": -1.5368545055389404, + "logits/rejected": -1.2980880737304688, + "logps/chosen": -362.3525695800781, + "logps/rejected": -583.7112426757812, + "loss": 0.0188, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.259115695953369, + "rewards/margins": 8.917471885681152, + "rewards/rejected": -5.658356666564941, + "step": 3100 + }, + { + "epoch": 1.05, + "eval_logits/chosen": -1.5361324548721313, + "eval_logits/rejected": -1.2365063428878784, + "eval_logps/chosen": -381.4704895019531, + "eval_logps/rejected": -620.4247436523438, + "eval_loss": 0.026834391057491302, + "eval_rewards/accuracies": 0.9949495196342468, + "eval_rewards/chosen": 3.8452234268188477, + "eval_rewards/margins": 9.936064720153809, + "eval_rewards/rejected": -6.090841770172119, + "eval_runtime": 562.4866, + "eval_samples_per_second": 16.889, + "eval_steps_per_second": 0.528, + "step": 3100 + }, + { + "epoch": 1.06, + "learning_rate": 3.598136724159637e-07, + "logits/chosen": -1.5441906452178955, + "logits/rejected": -1.300480604171753, + "logps/chosen": -448.13916015625, + "logps/rejected": -441.6670837402344, + "loss": 0.0159, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8185150623321533, + "rewards/margins": 9.702341079711914, + "rewards/rejected": -5.88382625579834, + "step": 3110 + }, + { + "epoch": 1.06, + "learning_rate": 3.5918418733476014e-07, + "logits/chosen": -1.5524084568023682, + "logits/rejected": -1.3157343864440918, + "logps/chosen": -465.8070373535156, + "logps/rejected": -569.8115844726562, + "loss": 0.0142, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9874141216278076, + "rewards/margins": 10.586256980895996, + "rewards/rejected": -6.598842620849609, + "step": 3120 + }, + { + "epoch": 1.06, + "learning_rate": 3.5855470225355656e-07, + "logits/chosen": -1.5547677278518677, + "logits/rejected": -1.3013343811035156, + "logps/chosen": -387.306884765625, + "logps/rejected": -624.9599609375, + "loss": 0.0108, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9266676902770996, + "rewards/margins": 9.803277015686035, + "rewards/rejected": -5.8766093254089355, + "step": 3130 + }, + { + "epoch": 1.07, + "learning_rate": 3.5792521717235304e-07, + "logits/chosen": -1.555638313293457, + "logits/rejected": -1.293177843093872, + "logps/chosen": -399.3710632324219, + "logps/rejected": -489.9842224121094, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8865151405334473, + "rewards/margins": 9.556222915649414, + "rewards/rejected": -5.669708728790283, + "step": 3140 + }, + { + "epoch": 1.07, + "learning_rate": 3.5729573209114946e-07, + "logits/chosen": -1.5274536609649658, + "logits/rejected": -1.3201470375061035, + "logps/chosen": -323.3263854980469, + "logps/rejected": -593.4132690429688, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.108608245849609, + "rewards/margins": 10.004108428955078, + "rewards/rejected": -5.895500659942627, + "step": 3150 + }, + { + "epoch": 1.07, + "learning_rate": 3.5666624700994583e-07, + "logits/chosen": -1.5636541843414307, + "logits/rejected": -1.2370617389678955, + "logps/chosen": -326.51104736328125, + "logps/rejected": -946.2828369140625, + "loss": 0.0153, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.9125022888183594, + "rewards/margins": 10.39599323272705, + "rewards/rejected": -6.483490943908691, + "step": 3160 + }, + { + "epoch": 1.08, + "learning_rate": 3.5603676192874225e-07, + "logits/chosen": -1.5561641454696655, + "logits/rejected": -1.310093641281128, + "logps/chosen": -312.96258544921875, + "logps/rejected": -478.3831481933594, + "loss": 0.0212, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.199625015258789, + "rewards/margins": 9.61187744140625, + "rewards/rejected": -5.412252426147461, + "step": 3170 + }, + { + "epoch": 1.08, + "learning_rate": 3.554072768475387e-07, + "logits/chosen": -1.547011137008667, + "logits/rejected": -1.2715133428573608, + "logps/chosen": -470.8583984375, + "logps/rejected": -560.4715576171875, + "loss": 0.0163, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.9313292503356934, + "rewards/margins": 9.736650466918945, + "rewards/rejected": -5.805321216583252, + "step": 3180 + }, + { + "epoch": 1.08, + "learning_rate": 3.547777917663351e-07, + "logits/chosen": -1.5640223026275635, + "logits/rejected": -1.2130903005599976, + "logps/chosen": -361.28753662109375, + "logps/rejected": -677.9293823242188, + "loss": 0.0138, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.703754425048828, + "rewards/margins": 9.726037979125977, + "rewards/rejected": -6.0222859382629395, + "step": 3190 + }, + { + "epoch": 1.09, + "learning_rate": 3.5414830668513157e-07, + "logits/chosen": -1.568625807762146, + "logits/rejected": -1.2628543376922607, + "logps/chosen": -301.23089599609375, + "logps/rejected": -482.57928466796875, + "loss": 0.0172, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.0514445304870605, + "rewards/margins": 10.71653938293457, + "rewards/rejected": -6.665093898773193, + "step": 3200 + }, + { + "epoch": 1.09, + "eval_logits/chosen": -1.556128740310669, + "eval_logits/rejected": -1.2627047300338745, + "eval_logps/chosen": -382.1876525878906, + "eval_logps/rejected": -623.1463012695312, + "eval_loss": 0.023126764222979546, + "eval_rewards/accuracies": 0.9907407164573669, + "eval_rewards/chosen": 3.7735111713409424, + "eval_rewards/margins": 10.136504173278809, + "eval_rewards/rejected": -6.3629937171936035, + "eval_runtime": 560.1412, + "eval_samples_per_second": 16.96, + "eval_steps_per_second": 0.53, + "step": 3200 + }, + { + "epoch": 1.09, + "learning_rate": 3.53518821603928e-07, + "logits/chosen": -1.5664528608322144, + "logits/rejected": -1.2894829511642456, + "logps/chosen": -319.388427734375, + "logps/rejected": -487.09698486328125, + "loss": 0.0127, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2873737812042236, + "rewards/margins": 9.995000839233398, + "rewards/rejected": -6.707627296447754, + "step": 3210 + }, + { + "epoch": 1.09, + "learning_rate": 3.528893365227244e-07, + "logits/chosen": -1.5418627262115479, + "logits/rejected": -1.2390079498291016, + "logps/chosen": -480.0328674316406, + "logps/rejected": -533.3216552734375, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.072500467300415, + "rewards/margins": 9.110407829284668, + "rewards/rejected": -6.037907600402832, + "step": 3220 + }, + { + "epoch": 1.1, + "learning_rate": 3.5225985144152084e-07, + "logits/chosen": -1.5690703392028809, + "logits/rejected": -1.2530103921890259, + "logps/chosen": -344.64813232421875, + "logps/rejected": -483.99090576171875, + "loss": 0.0137, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9700818061828613, + "rewards/margins": 10.728215217590332, + "rewards/rejected": -6.758131980895996, + "step": 3230 + }, + { + "epoch": 1.1, + "learning_rate": 3.516303663603172e-07, + "logits/chosen": -1.565423607826233, + "logits/rejected": -1.3082515001296997, + "logps/chosen": -392.6990051269531, + "logps/rejected": -559.715087890625, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.313260316848755, + "rewards/margins": 9.54464340209961, + "rewards/rejected": -6.231383323669434, + "step": 3240 + }, + { + "epoch": 1.1, + "learning_rate": 3.5100088127911363e-07, + "logits/chosen": -1.5372222661972046, + "logits/rejected": -1.2375788688659668, + "logps/chosen": -421.9637145996094, + "logps/rejected": -440.1932678222656, + "loss": 0.0095, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9397454261779785, + "rewards/margins": 10.521345138549805, + "rewards/rejected": -6.581600189208984, + "step": 3250 + }, + { + "epoch": 1.11, + "learning_rate": 3.503713961979101e-07, + "logits/chosen": -1.5304957628250122, + "logits/rejected": -1.2879369258880615, + "logps/chosen": -372.69427490234375, + "logps/rejected": -608.7009887695312, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.13718581199646, + "rewards/margins": 9.672079086303711, + "rewards/rejected": -6.534893035888672, + "step": 3260 + }, + { + "epoch": 1.11, + "learning_rate": 3.4974191111670653e-07, + "logits/chosen": -1.5726665258407593, + "logits/rejected": -1.335750699043274, + "logps/chosen": -414.6236877441406, + "logps/rejected": -816.2572021484375, + "loss": 0.0112, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6817421913146973, + "rewards/margins": 10.407289505004883, + "rewards/rejected": -6.725546360015869, + "step": 3270 + }, + { + "epoch": 1.11, + "learning_rate": 3.4911242603550296e-07, + "logits/chosen": -1.5278464555740356, + "logits/rejected": -1.3466991186141968, + "logps/chosen": -345.5179748535156, + "logps/rejected": -527.89599609375, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.018115520477295, + "rewards/margins": 8.973227500915527, + "rewards/rejected": -5.955111980438232, + "step": 3280 + }, + { + "epoch": 1.12, + "learning_rate": 3.484829409542994e-07, + "logits/chosen": -1.5523159503936768, + "logits/rejected": -1.2810847759246826, + "logps/chosen": -442.94793701171875, + "logps/rejected": -576.2291870117188, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.925013303756714, + "rewards/margins": 10.412287712097168, + "rewards/rejected": -6.487275123596191, + "step": 3290 + }, + { + "epoch": 1.12, + "learning_rate": 3.478534558730958e-07, + "logits/chosen": -1.5540058612823486, + "logits/rejected": -1.2268660068511963, + "logps/chosen": -381.5820617675781, + "logps/rejected": -528.8215942382812, + "loss": 0.0099, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6574196815490723, + "rewards/margins": 9.489648818969727, + "rewards/rejected": -5.8322296142578125, + "step": 3300 + }, + { + "epoch": 1.12, + "eval_logits/chosen": -1.531604528427124, + "eval_logits/rejected": -1.2410249710083008, + "eval_logps/chosen": -382.43218994140625, + "eval_logps/rejected": -625.3326416015625, + "eval_loss": 0.021844467148184776, + "eval_rewards/accuracies": 0.9957912564277649, + "eval_rewards/chosen": 3.749055862426758, + "eval_rewards/margins": 10.330683708190918, + "eval_rewards/rejected": -6.581628799438477, + "eval_runtime": 562.5979, + "eval_samples_per_second": 16.886, + "eval_steps_per_second": 0.528, + "step": 3300 + }, + { + "epoch": 1.13, + "learning_rate": 3.4722397079189217e-07, + "logits/chosen": -1.5403664112091064, + "logits/rejected": -1.2344160079956055, + "logps/chosen": -328.0329895019531, + "logps/rejected": -639.121337890625, + "loss": 0.0151, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.4270644187927246, + "rewards/margins": 9.514490127563477, + "rewards/rejected": -6.087424278259277, + "step": 3310 + }, + { + "epoch": 1.13, + "learning_rate": 3.4659448571068865e-07, + "logits/chosen": -1.5222011804580688, + "logits/rejected": -1.258724331855774, + "logps/chosen": -499.6629943847656, + "logps/rejected": -593.1663208007812, + "loss": 0.0161, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.6279196739196777, + "rewards/margins": 10.360054016113281, + "rewards/rejected": -6.7321343421936035, + "step": 3320 + }, + { + "epoch": 1.13, + "learning_rate": 3.4596500062948507e-07, + "logits/chosen": -1.5412685871124268, + "logits/rejected": -1.2143152952194214, + "logps/chosen": -415.35638427734375, + "logps/rejected": -619.3327026367188, + "loss": 0.0147, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.177037715911865, + "rewards/margins": 12.480097770690918, + "rewards/rejected": -8.303060531616211, + "step": 3330 + }, + { + "epoch": 1.14, + "learning_rate": 3.453355155482815e-07, + "logits/chosen": -1.5484035015106201, + "logits/rejected": -1.24642014503479, + "logps/chosen": -378.8363037109375, + "logps/rejected": -947.1253662109375, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.706882953643799, + "rewards/margins": 11.088679313659668, + "rewards/rejected": -7.381796360015869, + "step": 3340 + }, + { + "epoch": 1.14, + "learning_rate": 3.447060304670779e-07, + "logits/chosen": -1.5696405172348022, + "logits/rejected": -1.2043745517730713, + "logps/chosen": -337.546630859375, + "logps/rejected": -559.9576416015625, + "loss": 0.0224, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.41611385345459, + "rewards/margins": 10.978442192077637, + "rewards/rejected": -6.562326908111572, + "step": 3350 + }, + { + "epoch": 1.14, + "learning_rate": 3.4407654538587434e-07, + "logits/chosen": -1.5310184955596924, + "logits/rejected": -1.3733822107315063, + "logps/chosen": -391.3695983886719, + "logps/rejected": -724.014892578125, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.912929058074951, + "rewards/margins": 10.426431655883789, + "rewards/rejected": -6.513503074645996, + "step": 3360 + }, + { + "epoch": 1.15, + "learning_rate": 3.4344706030467076e-07, + "logits/chosen": -1.524610996246338, + "logits/rejected": -1.2261892557144165, + "logps/chosen": -327.8486328125, + "logps/rejected": -795.5487060546875, + "loss": 0.0168, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.4892947673797607, + "rewards/margins": 10.68218994140625, + "rewards/rejected": -7.192893981933594, + "step": 3370 + }, + { + "epoch": 1.15, + "learning_rate": 3.4281757522346724e-07, + "logits/chosen": -1.5339491367340088, + "logits/rejected": -1.3159081935882568, + "logps/chosen": -450.0570373535156, + "logps/rejected": -524.4630737304688, + "loss": 0.0165, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.960024356842041, + "rewards/margins": 10.203170776367188, + "rewards/rejected": -6.243147373199463, + "step": 3380 + }, + { + "epoch": 1.15, + "learning_rate": 3.421880901422636e-07, + "logits/chosen": -1.5401431322097778, + "logits/rejected": -1.308737874031067, + "logps/chosen": -378.6482238769531, + "logps/rejected": -783.6815185546875, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9178872108459473, + "rewards/margins": 11.063507080078125, + "rewards/rejected": -7.145620822906494, + "step": 3390 + }, + { + "epoch": 1.16, + "learning_rate": 3.4155860506106003e-07, + "logits/chosen": -1.537680983543396, + "logits/rejected": -1.3012498617172241, + "logps/chosen": -406.70648193359375, + "logps/rejected": -645.6878662109375, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8645145893096924, + "rewards/margins": 9.825982093811035, + "rewards/rejected": -5.9614667892456055, + "step": 3400 + }, + { + "epoch": 1.16, + "eval_logits/chosen": -1.5386831760406494, + "eval_logits/rejected": -1.2519314289093018, + "eval_logps/chosen": -382.81329345703125, + "eval_logps/rejected": -626.4235229492188, + "eval_loss": 0.018864383921027184, + "eval_rewards/accuracies": 0.9957912564277649, + "eval_rewards/chosen": 3.7109427452087402, + "eval_rewards/margins": 10.401658058166504, + "eval_rewards/rejected": -6.690715312957764, + "eval_runtime": 562.5159, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 0.528, + "step": 3400 + }, + { + "epoch": 1.16, + "learning_rate": 3.4092911997985645e-07, + "logits/chosen": -1.5627996921539307, + "logits/rejected": -1.3064839839935303, + "logps/chosen": -368.0377197265625, + "logps/rejected": -625.5384521484375, + "loss": 0.0218, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.132103681564331, + "rewards/margins": 9.798089981079102, + "rewards/rejected": -6.665986061096191, + "step": 3410 + }, + { + "epoch": 1.16, + "learning_rate": 3.402996348986529e-07, + "logits/chosen": -1.5163140296936035, + "logits/rejected": -1.2684190273284912, + "logps/chosen": -460.0826110839844, + "logps/rejected": -698.2129516601562, + "loss": 0.0135, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.357525587081909, + "rewards/margins": 10.22006607055664, + "rewards/rejected": -6.862539768218994, + "step": 3420 + }, + { + "epoch": 1.17, + "learning_rate": 3.396701498174493e-07, + "logits/chosen": -1.5437471866607666, + "logits/rejected": -1.3229434490203857, + "logps/chosen": -419.491455078125, + "logps/rejected": -571.1338500976562, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7035796642303467, + "rewards/margins": 10.30061149597168, + "rewards/rejected": -6.597031593322754, + "step": 3430 + }, + { + "epoch": 1.17, + "learning_rate": 3.3904066473624577e-07, + "logits/chosen": -1.5641080141067505, + "logits/rejected": -1.2949795722961426, + "logps/chosen": -303.3148193359375, + "logps/rejected": -664.458984375, + "loss": 0.014, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.4792182445526123, + "rewards/margins": 10.327569007873535, + "rewards/rejected": -6.848350524902344, + "step": 3440 + }, + { + "epoch": 1.17, + "learning_rate": 3.384111796550422e-07, + "logits/chosen": -1.5624980926513672, + "logits/rejected": -1.354408621788025, + "logps/chosen": -325.1142883300781, + "logps/rejected": -730.6846923828125, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.188638687133789, + "rewards/margins": 10.407112121582031, + "rewards/rejected": -7.2184739112854, + "step": 3450 + }, + { + "epoch": 1.18, + "learning_rate": 3.377816945738386e-07, + "logits/chosen": -1.5600968599319458, + "logits/rejected": -1.3628467321395874, + "logps/chosen": -288.1431579589844, + "logps/rejected": -544.7928466796875, + "loss": 0.0213, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.626802444458008, + "rewards/margins": 10.158662796020508, + "rewards/rejected": -6.531861305236816, + "step": 3460 + }, + { + "epoch": 1.18, + "learning_rate": 3.37152209492635e-07, + "logits/chosen": -1.537868618965149, + "logits/rejected": -1.323835015296936, + "logps/chosen": -367.98504638671875, + "logps/rejected": -684.2203979492188, + "loss": 0.0095, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.524312973022461, + "rewards/margins": 10.402295112609863, + "rewards/rejected": -6.877980709075928, + "step": 3470 + }, + { + "epoch": 1.18, + "learning_rate": 3.365227244114314e-07, + "logits/chosen": -1.5589523315429688, + "logits/rejected": -1.388816237449646, + "logps/chosen": -310.3872985839844, + "logps/rejected": -544.1265869140625, + "loss": 0.0114, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.119651794433594, + "rewards/margins": 10.41513729095459, + "rewards/rejected": -6.295487403869629, + "step": 3480 + }, + { + "epoch": 1.19, + "learning_rate": 3.3589323933022783e-07, + "logits/chosen": -1.549437165260315, + "logits/rejected": -1.2430723905563354, + "logps/chosen": -424.7110290527344, + "logps/rejected": -632.2919311523438, + "loss": 0.0101, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.24579119682312, + "rewards/margins": 10.87434196472168, + "rewards/rejected": -7.6285505294799805, + "step": 3490 + }, + { + "epoch": 1.19, + "learning_rate": 3.3526375424902426e-07, + "logits/chosen": -1.5652432441711426, + "logits/rejected": -1.3288078308105469, + "logps/chosen": -373.62237548828125, + "logps/rejected": -578.0263671875, + "loss": 0.0146, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.193802833557129, + "rewards/margins": 10.306367874145508, + "rewards/rejected": -7.1125640869140625, + "step": 3500 + }, + { + "epoch": 1.19, + "eval_logits/chosen": -1.5462299585342407, + "eval_logits/rejected": -1.2701737880706787, + "eval_logps/chosen": -383.78515625, + "eval_logps/rejected": -630.64453125, + "eval_loss": 0.019120950251817703, + "eval_rewards/accuracies": 0.994107723236084, + "eval_rewards/chosen": 3.613757610321045, + "eval_rewards/margins": 10.726574897766113, + "eval_rewards/rejected": -7.112815856933594, + "eval_runtime": 563.4834, + "eval_samples_per_second": 16.859, + "eval_steps_per_second": 0.527, + "step": 3500 + }, + { + "epoch": 1.19, + "learning_rate": 3.3463426916782073e-07, + "logits/chosen": -1.5445168018341064, + "logits/rejected": -1.2720247507095337, + "logps/chosen": -362.05322265625, + "logps/rejected": -486.078857421875, + "loss": 0.0103, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 3.5915985107421875, + "rewards/margins": 9.760786056518555, + "rewards/rejected": -6.169186592102051, + "step": 3510 + }, + { + "epoch": 1.2, + "learning_rate": 3.3400478408661716e-07, + "logits/chosen": -1.5472060441970825, + "logits/rejected": -1.358382225036621, + "logps/chosen": -344.2837219238281, + "logps/rejected": -428.633056640625, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.326205253601074, + "rewards/margins": 11.489296913146973, + "rewards/rejected": -7.16309118270874, + "step": 3520 + }, + { + "epoch": 1.2, + "learning_rate": 3.333752990054136e-07, + "logits/chosen": -1.5623257160186768, + "logits/rejected": -1.311713457107544, + "logps/chosen": -378.5647277832031, + "logps/rejected": -713.7362060546875, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.105370998382568, + "rewards/margins": 10.871156692504883, + "rewards/rejected": -6.765786170959473, + "step": 3530 + }, + { + "epoch": 1.2, + "learning_rate": 3.3274581392420995e-07, + "logits/chosen": -1.5748411417007446, + "logits/rejected": -1.30417799949646, + "logps/chosen": -299.34942626953125, + "logps/rejected": -452.14886474609375, + "loss": 0.0153, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.776268482208252, + "rewards/margins": 10.08252239227295, + "rewards/rejected": -6.3062543869018555, + "step": 3540 + }, + { + "epoch": 1.21, + "learning_rate": 3.3211632884300637e-07, + "logits/chosen": -1.5301434993743896, + "logits/rejected": -1.2431023120880127, + "logps/chosen": -434.5494689941406, + "logps/rejected": -603.0121459960938, + "loss": 0.0112, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.5567264556884766, + "rewards/margins": 10.754892349243164, + "rewards/rejected": -7.198164463043213, + "step": 3550 + }, + { + "epoch": 1.21, + "learning_rate": 3.314868437618028e-07, + "logits/chosen": -1.536667823791504, + "logits/rejected": -1.302473783493042, + "logps/chosen": -461.10687255859375, + "logps/rejected": -602.4193115234375, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6489691734313965, + "rewards/margins": 10.982636451721191, + "rewards/rejected": -7.333667755126953, + "step": 3560 + }, + { + "epoch": 1.21, + "learning_rate": 3.3085735868059927e-07, + "logits/chosen": -1.5425341129302979, + "logits/rejected": -1.3245007991790771, + "logps/chosen": -519.387451171875, + "logps/rejected": -477.8439025878906, + "loss": 0.0108, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.236023902893066, + "rewards/margins": 11.063962936401367, + "rewards/rejected": -6.827939510345459, + "step": 3570 + }, + { + "epoch": 1.22, + "learning_rate": 3.302278735993957e-07, + "logits/chosen": -1.558509349822998, + "logits/rejected": -1.2751100063323975, + "logps/chosen": -475.23858642578125, + "logps/rejected": -801.991943359375, + "loss": 0.0132, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.120568037033081, + "rewards/margins": 10.369867324829102, + "rewards/rejected": -7.249299049377441, + "step": 3580 + }, + { + "epoch": 1.22, + "learning_rate": 3.295983885181921e-07, + "logits/chosen": -1.562759518623352, + "logits/rejected": -1.3440775871276855, + "logps/chosen": -558.598388671875, + "logps/rejected": -782.6259765625, + "loss": 0.0094, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9504032135009766, + "rewards/margins": 10.753255844116211, + "rewards/rejected": -6.802853584289551, + "step": 3590 + }, + { + "epoch": 1.22, + "learning_rate": 3.2896890343698854e-07, + "logits/chosen": -1.5683956146240234, + "logits/rejected": -1.292709231376648, + "logps/chosen": -306.02789306640625, + "logps/rejected": -579.3465576171875, + "loss": 0.0108, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.182302951812744, + "rewards/margins": 10.139721870422363, + "rewards/rejected": -6.957418918609619, + "step": 3600 + }, + { + "epoch": 1.22, + "eval_logits/chosen": -1.548082709312439, + "eval_logits/rejected": -1.2642260789871216, + "eval_logps/chosen": -383.9829406738281, + "eval_logps/rejected": -632.6978149414062, + "eval_loss": 0.017524201422929764, + "eval_rewards/accuracies": 0.9949495196342468, + "eval_rewards/chosen": 3.5939829349517822, + "eval_rewards/margins": 10.91212272644043, + "eval_rewards/rejected": -7.318140029907227, + "eval_runtime": 561.3316, + "eval_samples_per_second": 16.924, + "eval_steps_per_second": 0.529, + "step": 3600 + }, + { + "epoch": 1.23, + "learning_rate": 3.2833941835578496e-07, + "logits/chosen": -1.5476725101470947, + "logits/rejected": -1.2592203617095947, + "logps/chosen": -336.96209716796875, + "logps/rejected": -688.2083129882812, + "loss": 0.0107, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4398536682128906, + "rewards/margins": 11.275859832763672, + "rewards/rejected": -7.836006164550781, + "step": 3610 + }, + { + "epoch": 1.23, + "learning_rate": 3.2770993327458133e-07, + "logits/chosen": -1.5690919160842896, + "logits/rejected": -1.2492401599884033, + "logps/chosen": -391.0542297363281, + "logps/rejected": -459.27374267578125, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7982184886932373, + "rewards/margins": 11.101940155029297, + "rewards/rejected": -7.303722381591797, + "step": 3620 + }, + { + "epoch": 1.23, + "learning_rate": 3.270804481933778e-07, + "logits/chosen": -1.5592089891433716, + "logits/rejected": -1.2544500827789307, + "logps/chosen": -328.0624084472656, + "logps/rejected": -647.0172119140625, + "loss": 0.0131, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.291409730911255, + "rewards/margins": 10.406785011291504, + "rewards/rejected": -7.115375518798828, + "step": 3630 + }, + { + "epoch": 1.24, + "learning_rate": 3.2645096311217423e-07, + "logits/chosen": -1.5468913316726685, + "logits/rejected": -1.312497854232788, + "logps/chosen": -408.9129638671875, + "logps/rejected": -408.50360107421875, + "loss": 0.0103, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.325671434402466, + "rewards/margins": 9.630389213562012, + "rewards/rejected": -6.304718017578125, + "step": 3640 + }, + { + "epoch": 1.24, + "learning_rate": 3.2582147803097065e-07, + "logits/chosen": -1.5437870025634766, + "logits/rejected": -1.2960712909698486, + "logps/chosen": -350.2049560546875, + "logps/rejected": -615.3699340820312, + "loss": 0.0091, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5708584785461426, + "rewards/margins": 10.423314094543457, + "rewards/rejected": -6.85245418548584, + "step": 3650 + }, + { + "epoch": 1.24, + "learning_rate": 3.251919929497671e-07, + "logits/chosen": -1.5394858121871948, + "logits/rejected": -1.2867348194122314, + "logps/chosen": -422.38433837890625, + "logps/rejected": -501.934814453125, + "loss": 0.0128, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.5384135246276855, + "rewards/margins": 11.0324125289917, + "rewards/rejected": -7.493998050689697, + "step": 3660 + }, + { + "epoch": 1.25, + "learning_rate": 3.245625078685635e-07, + "logits/chosen": -1.5278021097183228, + "logits/rejected": -1.2714591026306152, + "logps/chosen": -506.66961669921875, + "logps/rejected": -788.076904296875, + "loss": 0.0097, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.359327793121338, + "rewards/margins": 11.180188179016113, + "rewards/rejected": -7.820860862731934, + "step": 3670 + }, + { + "epoch": 1.25, + "learning_rate": 3.239330227873599e-07, + "logits/chosen": -1.5510963201522827, + "logits/rejected": -1.2875866889953613, + "logps/chosen": -351.9230041503906, + "logps/rejected": -531.8778076171875, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9361369609832764, + "rewards/margins": 11.744085311889648, + "rewards/rejected": -7.807949066162109, + "step": 3680 + }, + { + "epoch": 1.25, + "learning_rate": 3.233035377061564e-07, + "logits/chosen": -1.5468543767929077, + "logits/rejected": -1.3294947147369385, + "logps/chosen": -374.6852111816406, + "logps/rejected": -555.9889526367188, + "loss": 0.0108, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.9911885261535645, + "rewards/margins": 10.140085220336914, + "rewards/rejected": -7.14889669418335, + "step": 3690 + }, + { + "epoch": 1.26, + "learning_rate": 3.2267405262495277e-07, + "logits/chosen": -1.5682952404022217, + "logits/rejected": -1.3557989597320557, + "logps/chosen": -390.01153564453125, + "logps/rejected": -586.96533203125, + "loss": 0.0175, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8475356101989746, + "rewards/margins": 10.769521713256836, + "rewards/rejected": -7.921985626220703, + "step": 3700 + }, + { + "epoch": 1.26, + "eval_logits/chosen": -1.5503476858139038, + "eval_logits/rejected": -1.290449857711792, + "eval_logps/chosen": -385.1369934082031, + "eval_logps/rejected": -637.77001953125, + "eval_loss": 0.018260376527905464, + "eval_rewards/accuracies": 0.9949495196342468, + "eval_rewards/chosen": 3.478574514389038, + "eval_rewards/margins": 11.303938865661621, + "eval_rewards/rejected": -7.825364589691162, + "eval_runtime": 562.5698, + "eval_samples_per_second": 16.887, + "eval_steps_per_second": 0.528, + "step": 3700 + }, + { + "epoch": 1.26, + "learning_rate": 3.220445675437492e-07, + "logits/chosen": -1.5756982564926147, + "logits/rejected": -1.302247166633606, + "logps/chosen": -356.7356872558594, + "logps/rejected": -800.9180297851562, + "loss": 0.0124, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0516490936279297, + "rewards/margins": 12.311636924743652, + "rewards/rejected": -9.259988784790039, + "step": 3710 + }, + { + "epoch": 1.26, + "learning_rate": 3.214150824625456e-07, + "logits/chosen": -1.561956763267517, + "logits/rejected": -1.2722795009613037, + "logps/chosen": -323.0436096191406, + "logps/rejected": -676.3961791992188, + "loss": 0.0086, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8152756690979004, + "rewards/margins": 10.528733253479004, + "rewards/rejected": -7.713456630706787, + "step": 3720 + }, + { + "epoch": 1.27, + "learning_rate": 3.2078559738134203e-07, + "logits/chosen": -1.5560318231582642, + "logits/rejected": -1.3369327783584595, + "logps/chosen": -435.696044921875, + "logps/rejected": -703.270751953125, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6595470905303955, + "rewards/margins": 10.275975227355957, + "rewards/rejected": -7.616427421569824, + "step": 3730 + }, + { + "epoch": 1.27, + "learning_rate": 3.2015611230013846e-07, + "logits/chosen": -1.5670607089996338, + "logits/rejected": -1.3175945281982422, + "logps/chosen": -346.5909729003906, + "logps/rejected": -778.6409912109375, + "loss": 0.0121, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.066230773925781, + "rewards/margins": 11.33479118347168, + "rewards/rejected": -7.26855993270874, + "step": 3740 + }, + { + "epoch": 1.27, + "learning_rate": 3.1952662721893493e-07, + "logits/chosen": -1.5476568937301636, + "logits/rejected": -1.3414742946624756, + "logps/chosen": -302.6426086425781, + "logps/rejected": -650.2315673828125, + "loss": 0.0152, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.3412926197052, + "rewards/margins": 10.654970169067383, + "rewards/rejected": -7.313677787780762, + "step": 3750 + }, + { + "epoch": 1.28, + "learning_rate": 3.1889714213773135e-07, + "logits/chosen": -1.5716911554336548, + "logits/rejected": -1.3497941493988037, + "logps/chosen": -394.611328125, + "logps/rejected": -635.4236450195312, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5342471599578857, + "rewards/margins": 11.284036636352539, + "rewards/rejected": -7.749789237976074, + "step": 3760 + }, + { + "epoch": 1.28, + "learning_rate": 3.182676570565277e-07, + "logits/chosen": -1.5562270879745483, + "logits/rejected": -1.3234447240829468, + "logps/chosen": -326.9047546386719, + "logps/rejected": -616.5462646484375, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5032589435577393, + "rewards/margins": 11.69961166381836, + "rewards/rejected": -8.196352005004883, + "step": 3770 + }, + { + "epoch": 1.28, + "learning_rate": 3.1763817197532415e-07, + "logits/chosen": -1.552811861038208, + "logits/rejected": -1.3049066066741943, + "logps/chosen": -379.5268859863281, + "logps/rejected": -554.747314453125, + "loss": 0.0077, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.412034273147583, + "rewards/margins": 11.240354537963867, + "rewards/rejected": -7.828319549560547, + "step": 3780 + }, + { + "epoch": 1.29, + "learning_rate": 3.1700868689412057e-07, + "logits/chosen": -1.5628570318222046, + "logits/rejected": -1.2982027530670166, + "logps/chosen": -491.29559326171875, + "logps/rejected": -721.3206787109375, + "loss": 0.0106, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.6234030723571777, + "rewards/margins": 11.745047569274902, + "rewards/rejected": -8.121644020080566, + "step": 3790 + }, + { + "epoch": 1.29, + "learning_rate": 3.16379201812917e-07, + "logits/chosen": -1.5734034776687622, + "logits/rejected": -1.371080994606018, + "logps/chosen": -297.39996337890625, + "logps/rejected": -783.114990234375, + "loss": 0.0147, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.0067734718322754, + "rewards/margins": 10.57593059539795, + "rewards/rejected": -7.569157600402832, + "step": 3800 + }, + { + "epoch": 1.29, + "eval_logits/chosen": -1.5667117834091187, + "eval_logits/rejected": -1.3081718683242798, + "eval_logps/chosen": -387.1887512207031, + "eval_logps/rejected": -641.2315673828125, + "eval_loss": 0.015309293754398823, + "eval_rewards/accuracies": 0.996632993221283, + "eval_rewards/chosen": 3.2733964920043945, + "eval_rewards/margins": 11.44491958618164, + "eval_rewards/rejected": -8.17152214050293, + "eval_runtime": 564.402, + "eval_samples_per_second": 16.832, + "eval_steps_per_second": 0.526, + "step": 3800 + }, + { + "epoch": 1.3, + "learning_rate": 3.1574971673171347e-07, + "logits/chosen": -1.5623356103897095, + "logits/rejected": -1.2604972124099731, + "logps/chosen": -450.8146057128906, + "logps/rejected": -472.3253479003906, + "loss": 0.016, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0199196338653564, + "rewards/margins": 10.700310707092285, + "rewards/rejected": -7.680390357971191, + "step": 3810 + }, + { + "epoch": 1.3, + "learning_rate": 3.151202316505099e-07, + "logits/chosen": -1.5827510356903076, + "logits/rejected": -1.3609280586242676, + "logps/chosen": -345.0986022949219, + "logps/rejected": -752.5287475585938, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.385680675506592, + "rewards/margins": 13.005363464355469, + "rewards/rejected": -8.619684219360352, + "step": 3820 + }, + { + "epoch": 1.3, + "learning_rate": 3.144907465693063e-07, + "logits/chosen": -1.557237148284912, + "logits/rejected": -1.3344987630844116, + "logps/chosen": -367.04693603515625, + "logps/rejected": -615.0887451171875, + "loss": 0.0164, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1507620811462402, + "rewards/margins": 10.534086227416992, + "rewards/rejected": -7.383325099945068, + "step": 3830 + }, + { + "epoch": 1.31, + "learning_rate": 3.1386126148810274e-07, + "logits/chosen": -1.55043363571167, + "logits/rejected": -1.2795573472976685, + "logps/chosen": -515.2775268554688, + "logps/rejected": -562.2213134765625, + "loss": 0.0097, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1226460933685303, + "rewards/margins": 11.414012908935547, + "rewards/rejected": -8.291367530822754, + "step": 3840 + }, + { + "epoch": 1.31, + "learning_rate": 3.132317764068991e-07, + "logits/chosen": -1.539562463760376, + "logits/rejected": -1.32407808303833, + "logps/chosen": -345.9480285644531, + "logps/rejected": -761.6564331054688, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.158485174179077, + "rewards/margins": 10.913348197937012, + "rewards/rejected": -7.754862308502197, + "step": 3850 + }, + { + "epoch": 1.31, + "learning_rate": 3.1260229132569553e-07, + "logits/chosen": -1.540583848953247, + "logits/rejected": -1.3491706848144531, + "logps/chosen": -442.97705078125, + "logps/rejected": -484.7159118652344, + "loss": 0.0191, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1008834838867188, + "rewards/margins": 11.370752334594727, + "rewards/rejected": -8.269866943359375, + "step": 3860 + }, + { + "epoch": 1.32, + "learning_rate": 3.11972806244492e-07, + "logits/chosen": -1.5526082515716553, + "logits/rejected": -1.3367919921875, + "logps/chosen": -391.95721435546875, + "logps/rejected": -625.2447509765625, + "loss": 0.0261, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8465263843536377, + "rewards/margins": 10.367132186889648, + "rewards/rejected": -7.520604133605957, + "step": 3870 + }, + { + "epoch": 1.32, + "learning_rate": 3.1134332116328843e-07, + "logits/chosen": -1.572690725326538, + "logits/rejected": -1.3410431146621704, + "logps/chosen": -389.5829162597656, + "logps/rejected": -752.0416259765625, + "loss": 0.0091, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.387737989425659, + "rewards/margins": 11.799455642700195, + "rewards/rejected": -8.411718368530273, + "step": 3880 + }, + { + "epoch": 1.32, + "learning_rate": 3.1071383608208485e-07, + "logits/chosen": -1.5610352754592896, + "logits/rejected": -1.351049542427063, + "logps/chosen": -428.21453857421875, + "logps/rejected": -427.4214782714844, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6971182823181152, + "rewards/margins": 11.824326515197754, + "rewards/rejected": -8.127208709716797, + "step": 3890 + }, + { + "epoch": 1.33, + "learning_rate": 3.1008435100088127e-07, + "logits/chosen": -1.5544888973236084, + "logits/rejected": -1.2803246974945068, + "logps/chosen": -347.70916748046875, + "logps/rejected": -850.9161987304688, + "loss": 0.0113, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.6056485176086426, + "rewards/margins": 12.393299102783203, + "rewards/rejected": -8.787651062011719, + "step": 3900 + }, + { + "epoch": 1.33, + "eval_logits/chosen": -1.552535057067871, + "eval_logits/rejected": -1.2906668186187744, + "eval_logps/chosen": -386.889892578125, + "eval_logps/rejected": -643.0200805664062, + "eval_loss": 0.015344955027103424, + "eval_rewards/accuracies": 0.996632993221283, + "eval_rewards/chosen": 3.3032867908477783, + "eval_rewards/margins": 11.653657913208008, + "eval_rewards/rejected": -8.350372314453125, + "eval_runtime": 561.7541, + "eval_samples_per_second": 16.911, + "eval_steps_per_second": 0.529, + "step": 3900 + }, + { + "epoch": 1.33, + "learning_rate": 3.094548659196777e-07, + "logits/chosen": -1.5446327924728394, + "logits/rejected": -1.3399782180786133, + "logps/chosen": -565.9669799804688, + "logps/rejected": -817.57177734375, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2617568969726562, + "rewards/margins": 11.561229705810547, + "rewards/rejected": -8.29947280883789, + "step": 3910 + }, + { + "epoch": 1.33, + "learning_rate": 3.0882538083847407e-07, + "logits/chosen": -1.5832773447036743, + "logits/rejected": -1.2803641557693481, + "logps/chosen": -426.3008728027344, + "logps/rejected": -529.3174438476562, + "loss": 0.0144, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.748228073120117, + "rewards/margins": 10.918815612792969, + "rewards/rejected": -8.170587539672852, + "step": 3920 + }, + { + "epoch": 1.34, + "learning_rate": 3.0819589575727054e-07, + "logits/chosen": -1.5334964990615845, + "logits/rejected": -1.3193509578704834, + "logps/chosen": -574.7423095703125, + "logps/rejected": -429.0559997558594, + "loss": 0.0294, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.156707763671875, + "rewards/margins": 10.570030212402344, + "rewards/rejected": -7.413322448730469, + "step": 3930 + }, + { + "epoch": 1.34, + "learning_rate": 3.0756641067606696e-07, + "logits/chosen": -1.557379961013794, + "logits/rejected": -1.352913737297058, + "logps/chosen": -352.03057861328125, + "logps/rejected": -578.6124267578125, + "loss": 0.0089, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.9821767807006836, + "rewards/margins": 10.810458183288574, + "rewards/rejected": -7.828280448913574, + "step": 3940 + }, + { + "epoch": 1.34, + "learning_rate": 3.069369255948634e-07, + "logits/chosen": -1.573169469833374, + "logits/rejected": -1.2757112979888916, + "logps/chosen": -340.1747131347656, + "logps/rejected": -664.8591918945312, + "loss": 0.0147, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.806180238723755, + "rewards/margins": 11.857370376586914, + "rewards/rejected": -8.051191329956055, + "step": 3950 + }, + { + "epoch": 1.35, + "learning_rate": 3.063074405136598e-07, + "logits/chosen": -1.5420644283294678, + "logits/rejected": -1.3730390071868896, + "logps/chosen": -333.8004150390625, + "logps/rejected": -607.1297607421875, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6073250770568848, + "rewards/margins": 12.011253356933594, + "rewards/rejected": -8.40392780303955, + "step": 3960 + }, + { + "epoch": 1.35, + "learning_rate": 3.0567795543245623e-07, + "logits/chosen": -1.5615346431732178, + "logits/rejected": -1.356567621231079, + "logps/chosen": -396.7604675292969, + "logps/rejected": -717.3262939453125, + "loss": 0.011, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.9268836975097656, + "rewards/margins": 12.396979331970215, + "rewards/rejected": -8.470096588134766, + "step": 3970 + }, + { + "epoch": 1.35, + "learning_rate": 3.0504847035125266e-07, + "logits/chosen": -1.5500177145004272, + "logits/rejected": -1.2806719541549683, + "logps/chosen": -426.8389587402344, + "logps/rejected": -606.6141967773438, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.582719087600708, + "rewards/margins": 11.660516738891602, + "rewards/rejected": -8.07779598236084, + "step": 3980 + }, + { + "epoch": 1.36, + "learning_rate": 3.0441898527004913e-07, + "logits/chosen": -1.5717202425003052, + "logits/rejected": -1.3078752756118774, + "logps/chosen": -315.86077880859375, + "logps/rejected": -582.4734497070312, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7915472984313965, + "rewards/margins": 11.904988288879395, + "rewards/rejected": -8.113441467285156, + "step": 3990 + }, + { + "epoch": 1.36, + "learning_rate": 3.037895001888455e-07, + "logits/chosen": -1.558194875717163, + "logits/rejected": -1.3561882972717285, + "logps/chosen": -398.2179260253906, + "logps/rejected": -621.0252685546875, + "loss": 0.0284, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8816046714782715, + "rewards/margins": 11.797433853149414, + "rewards/rejected": -7.915828704833984, + "step": 4000 + }, + { + "epoch": 1.36, + "eval_logits/chosen": -1.5473618507385254, + "eval_logits/rejected": -1.2917457818984985, + "eval_logps/chosen": -384.68170166015625, + "eval_logps/rejected": -641.0870971679688, + "eval_loss": 0.026986392214894295, + "eval_rewards/accuracies": 0.9924242496490479, + "eval_rewards/chosen": 3.524106740951538, + "eval_rewards/margins": 11.681173324584961, + "eval_rewards/rejected": -8.157066345214844, + "eval_runtime": 568.1029, + "eval_samples_per_second": 16.722, + "eval_steps_per_second": 0.523, + "step": 4000 + }, + { + "epoch": 1.36, + "learning_rate": 3.031600151076419e-07, + "logits/chosen": -1.5626230239868164, + "logits/rejected": -1.328392744064331, + "logps/chosen": -337.9571838378906, + "logps/rejected": -660.5670166015625, + "loss": 0.0112, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.0910449028015137, + "rewards/margins": 11.443092346191406, + "rewards/rejected": -8.352046966552734, + "step": 4010 + }, + { + "epoch": 1.37, + "learning_rate": 3.0253053002643835e-07, + "logits/chosen": -1.5600558519363403, + "logits/rejected": -1.329992651939392, + "logps/chosen": -339.1719665527344, + "logps/rejected": -634.294921875, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.49611234664917, + "rewards/margins": 12.32038688659668, + "rewards/rejected": -8.824273109436035, + "step": 4020 + }, + { + "epoch": 1.37, + "learning_rate": 3.0190104494523477e-07, + "logits/chosen": -1.5497252941131592, + "logits/rejected": -1.3417781591415405, + "logps/chosen": -332.14532470703125, + "logps/rejected": -812.8832397460938, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.682182788848877, + "rewards/margins": 10.57850456237793, + "rewards/rejected": -7.896321773529053, + "step": 4030 + }, + { + "epoch": 1.37, + "learning_rate": 3.012715598640312e-07, + "logits/chosen": -1.5086257457733154, + "logits/rejected": -1.314073085784912, + "logps/chosen": -476.35955810546875, + "logps/rejected": -638.862548828125, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8616223335266113, + "rewards/margins": 12.898541450500488, + "rewards/rejected": -9.036918640136719, + "step": 4040 + }, + { + "epoch": 1.38, + "learning_rate": 3.0064207478282767e-07, + "logits/chosen": -1.5316005945205688, + "logits/rejected": -1.3184077739715576, + "logps/chosen": -476.8362731933594, + "logps/rejected": -637.3565063476562, + "loss": 0.0094, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.863726854324341, + "rewards/margins": 11.051305770874023, + "rewards/rejected": -8.187579154968262, + "step": 4050 + }, + { + "epoch": 1.38, + "learning_rate": 3.000125897016241e-07, + "logits/chosen": -1.5376583337783813, + "logits/rejected": -1.3154890537261963, + "logps/chosen": -357.4083557128906, + "logps/rejected": -616.3048095703125, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9005610942840576, + "rewards/margins": 11.261184692382812, + "rewards/rejected": -8.360623359680176, + "step": 4060 + }, + { + "epoch": 1.38, + "learning_rate": 2.993831046204205e-07, + "logits/chosen": -1.5178908109664917, + "logits/rejected": -1.31767737865448, + "logps/chosen": -503.52276611328125, + "logps/rejected": -525.5484619140625, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6984305381774902, + "rewards/margins": 12.142548561096191, + "rewards/rejected": -8.444117546081543, + "step": 4070 + }, + { + "epoch": 1.39, + "learning_rate": 2.987536195392169e-07, + "logits/chosen": -1.5185540914535522, + "logits/rejected": -1.2753911018371582, + "logps/chosen": -435.0626525878906, + "logps/rejected": -551.1524047851562, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.467758893966675, + "rewards/margins": 12.487218856811523, + "rewards/rejected": -9.019460678100586, + "step": 4080 + }, + { + "epoch": 1.39, + "learning_rate": 2.981241344580133e-07, + "logits/chosen": -1.5369386672973633, + "logits/rejected": -1.339849829673767, + "logps/chosen": -408.22528076171875, + "logps/rejected": -597.9600219726562, + "loss": 0.0109, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.543799638748169, + "rewards/margins": 11.706110000610352, + "rewards/rejected": -8.162309646606445, + "step": 4090 + }, + { + "epoch": 1.39, + "learning_rate": 2.9749464937680973e-07, + "logits/chosen": -1.5481826066970825, + "logits/rejected": -1.3568317890167236, + "logps/chosen": -354.1960754394531, + "logps/rejected": -382.54290771484375, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.675137758255005, + "rewards/margins": 11.775232315063477, + "rewards/rejected": -8.100095748901367, + "step": 4100 + }, + { + "epoch": 1.39, + "eval_logits/chosen": -1.5401920080184937, + "eval_logits/rejected": -1.3039100170135498, + "eval_logps/chosen": -386.7808532714844, + "eval_logps/rejected": -648.9590454101562, + "eval_loss": 0.013829583302140236, + "eval_rewards/accuracies": 0.994107723236084, + "eval_rewards/chosen": 3.3141894340515137, + "eval_rewards/margins": 12.258452415466309, + "eval_rewards/rejected": -8.944263458251953, + "eval_runtime": 573.7018, + "eval_samples_per_second": 16.559, + "eval_steps_per_second": 0.518, + "step": 4100 + }, + { + "epoch": 1.4, + "learning_rate": 2.968651642956062e-07, + "logits/chosen": -1.5369822978973389, + "logits/rejected": -1.3080308437347412, + "logps/chosen": -315.96112060546875, + "logps/rejected": -624.6578369140625, + "loss": 0.0149, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0886013507843018, + "rewards/margins": 11.427984237670898, + "rewards/rejected": -8.339384078979492, + "step": 4110 + }, + { + "epoch": 1.4, + "learning_rate": 2.9623567921440263e-07, + "logits/chosen": -1.5768437385559082, + "logits/rejected": -1.346503496170044, + "logps/chosen": -326.0722351074219, + "logps/rejected": -502.36920166015625, + "loss": 0.0168, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.354001522064209, + "rewards/margins": 12.075699806213379, + "rewards/rejected": -8.721698760986328, + "step": 4120 + }, + { + "epoch": 1.4, + "learning_rate": 2.9560619413319905e-07, + "logits/chosen": -1.5425691604614258, + "logits/rejected": -1.3352969884872437, + "logps/chosen": -592.2025756835938, + "logps/rejected": -815.1539306640625, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.650028705596924, + "rewards/margins": 12.661463737487793, + "rewards/rejected": -10.011434555053711, + "step": 4130 + }, + { + "epoch": 1.41, + "learning_rate": 2.9497670905199547e-07, + "logits/chosen": -1.5177825689315796, + "logits/rejected": -1.3187179565429688, + "logps/chosen": -560.4607543945312, + "logps/rejected": -677.9915771484375, + "loss": 0.0158, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.5132954120635986, + "rewards/margins": 12.43108081817627, + "rewards/rejected": -8.917786598205566, + "step": 4140 + }, + { + "epoch": 1.41, + "learning_rate": 2.9434722397079184e-07, + "logits/chosen": -1.5356464385986328, + "logits/rejected": -1.313726544380188, + "logps/chosen": -443.2185974121094, + "logps/rejected": -837.4552001953125, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.26578950881958, + "rewards/margins": 13.2615966796875, + "rewards/rejected": -9.995807647705078, + "step": 4150 + }, + { + "epoch": 1.41, + "learning_rate": 2.9371773888958827e-07, + "logits/chosen": -1.5704758167266846, + "logits/rejected": -1.2890733480453491, + "logps/chosen": -361.41448974609375, + "logps/rejected": -598.3782348632812, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6268386840820312, + "rewards/margins": 12.407869338989258, + "rewards/rejected": -8.781030654907227, + "step": 4160 + }, + { + "epoch": 1.42, + "learning_rate": 2.9308825380838474e-07, + "logits/chosen": -1.556640386581421, + "logits/rejected": -1.2798312902450562, + "logps/chosen": -431.86932373046875, + "logps/rejected": -528.3174438476562, + "loss": 0.0117, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5157973766326904, + "rewards/margins": 12.050169944763184, + "rewards/rejected": -8.534372329711914, + "step": 4170 + }, + { + "epoch": 1.42, + "learning_rate": 2.9245876872718116e-07, + "logits/chosen": -1.5409786701202393, + "logits/rejected": -1.3333543539047241, + "logps/chosen": -381.4745178222656, + "logps/rejected": -602.3903198242188, + "loss": 0.0098, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.5858802795410156, + "rewards/margins": 11.83027172088623, + "rewards/rejected": -8.244391441345215, + "step": 4180 + }, + { + "epoch": 1.42, + "learning_rate": 2.918292836459776e-07, + "logits/chosen": -1.5540903806686401, + "logits/rejected": -1.3745924234390259, + "logps/chosen": -354.9630432128906, + "logps/rejected": -884.9613037109375, + "loss": 0.0164, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3870034217834473, + "rewards/margins": 11.55251407623291, + "rewards/rejected": -8.165510177612305, + "step": 4190 + }, + { + "epoch": 1.43, + "learning_rate": 2.91199798564774e-07, + "logits/chosen": -1.5731165409088135, + "logits/rejected": -1.3726691007614136, + "logps/chosen": -303.6591491699219, + "logps/rejected": -612.04345703125, + "loss": 0.0093, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4851551055908203, + "rewards/margins": 12.123380661010742, + "rewards/rejected": -8.638226509094238, + "step": 4200 + }, + { + "epoch": 1.43, + "eval_logits/chosen": -1.5543475151062012, + "eval_logits/rejected": -1.3067430257797241, + "eval_logps/chosen": -386.3898620605469, + "eval_logps/rejected": -650.0152587890625, + "eval_loss": 0.01588916778564453, + "eval_rewards/accuracies": 0.996632993221283, + "eval_rewards/chosen": 3.353287696838379, + "eval_rewards/margins": 12.403179168701172, + "eval_rewards/rejected": -9.04989242553711, + "eval_runtime": 572.9448, + "eval_samples_per_second": 16.581, + "eval_steps_per_second": 0.518, + "step": 4200 + }, + { + "epoch": 1.43, + "learning_rate": 2.9057031348357043e-07, + "logits/chosen": -1.575650930404663, + "logits/rejected": -1.3764150142669678, + "logps/chosen": -334.74420166015625, + "logps/rejected": -792.2727661132812, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.741102695465088, + "rewards/margins": 12.627447128295898, + "rewards/rejected": -8.886344909667969, + "step": 4210 + }, + { + "epoch": 1.43, + "learning_rate": 2.8994082840236686e-07, + "logits/chosen": -1.5574661493301392, + "logits/rejected": -1.3739725351333618, + "logps/chosen": -447.37310791015625, + "logps/rejected": -716.6776123046875, + "loss": 0.007, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.2827892303466797, + "rewards/margins": 12.042317390441895, + "rewards/rejected": -8.759527206420898, + "step": 4220 + }, + { + "epoch": 1.44, + "learning_rate": 2.893113433211632e-07, + "logits/chosen": -1.554689645767212, + "logits/rejected": -1.356540322303772, + "logps/chosen": -347.460693359375, + "logps/rejected": -553.4609375, + "loss": 0.0142, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2481014728546143, + "rewards/margins": 12.289466857910156, + "rewards/rejected": -9.041364669799805, + "step": 4230 + }, + { + "epoch": 1.44, + "learning_rate": 2.886818582399597e-07, + "logits/chosen": -1.558514952659607, + "logits/rejected": -1.3339954614639282, + "logps/chosen": -386.17596435546875, + "logps/rejected": -509.777099609375, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1594197750091553, + "rewards/margins": 11.619569778442383, + "rewards/rejected": -9.460148811340332, + "step": 4240 + }, + { + "epoch": 1.44, + "learning_rate": 2.880523731587561e-07, + "logits/chosen": -1.5142104625701904, + "logits/rejected": -1.2935596704483032, + "logps/chosen": -516.2032470703125, + "logps/rejected": -1030.715087890625, + "loss": 0.0076, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.366779327392578, + "rewards/margins": 12.155426025390625, + "rewards/rejected": -8.788646697998047, + "step": 4250 + }, + { + "epoch": 1.45, + "learning_rate": 2.8742288807755255e-07, + "logits/chosen": -1.5530768632888794, + "logits/rejected": -1.3639857769012451, + "logps/chosen": -378.64398193359375, + "logps/rejected": -778.8458862304688, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5309619903564453, + "rewards/margins": 11.59952449798584, + "rewards/rejected": -8.068562507629395, + "step": 4260 + }, + { + "epoch": 1.45, + "learning_rate": 2.8679340299634897e-07, + "logits/chosen": -1.5568735599517822, + "logits/rejected": -1.309661865234375, + "logps/chosen": -317.3431701660156, + "logps/rejected": -963.3165283203125, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6510112285614014, + "rewards/margins": 12.318338394165039, + "rewards/rejected": -8.667327880859375, + "step": 4270 + }, + { + "epoch": 1.45, + "learning_rate": 2.861639179151454e-07, + "logits/chosen": -1.5590856075286865, + "logits/rejected": -1.3114995956420898, + "logps/chosen": -348.91192626953125, + "logps/rejected": -671.611572265625, + "loss": 0.01, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.514787197113037, + "rewards/margins": 12.77906608581543, + "rewards/rejected": -9.264278411865234, + "step": 4280 + }, + { + "epoch": 1.46, + "learning_rate": 2.855344328339418e-07, + "logits/chosen": -1.5728092193603516, + "logits/rejected": -1.2642021179199219, + "logps/chosen": -332.35223388671875, + "logps/rejected": -674.48486328125, + "loss": 0.0075, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.2403461933135986, + "rewards/margins": 12.125340461730957, + "rewards/rejected": -8.884993553161621, + "step": 4290 + }, + { + "epoch": 1.46, + "learning_rate": 2.849049477527383e-07, + "logits/chosen": -1.5678465366363525, + "logits/rejected": -1.3430898189544678, + "logps/chosen": -339.64495849609375, + "logps/rejected": -452.1986389160156, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.996410846710205, + "rewards/margins": 13.312530517578125, + "rewards/rejected": -9.316119194030762, + "step": 4300 + }, + { + "epoch": 1.46, + "eval_logits/chosen": -1.5557563304901123, + "eval_logits/rejected": -1.3104143142700195, + "eval_logps/chosen": -385.7142028808594, + "eval_logps/rejected": -647.8128051757812, + "eval_loss": 0.014931157231330872, + "eval_rewards/accuracies": 0.9957912564277649, + "eval_rewards/chosen": 3.4208548069000244, + "eval_rewards/margins": 12.250496864318848, + "eval_rewards/rejected": -8.829641342163086, + "eval_runtime": 574.3792, + "eval_samples_per_second": 16.54, + "eval_steps_per_second": 0.517, + "step": 4300 + }, + { + "epoch": 1.46, + "learning_rate": 2.8427546267153466e-07, + "logits/chosen": -1.5578222274780273, + "logits/rejected": -1.352829933166504, + "logps/chosen": -341.92498779296875, + "logps/rejected": -764.1944580078125, + "loss": 0.0147, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.258507251739502, + "rewards/margins": 12.833139419555664, + "rewards/rejected": -9.574630737304688, + "step": 4310 + }, + { + "epoch": 1.47, + "learning_rate": 2.836459775903311e-07, + "logits/chosen": -1.5515110492706299, + "logits/rejected": -1.3754364252090454, + "logps/chosen": -450.63787841796875, + "logps/rejected": -548.5369262695312, + "loss": 0.0162, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1725552082061768, + "rewards/margins": 11.982792854309082, + "rewards/rejected": -8.8102388381958, + "step": 4320 + }, + { + "epoch": 1.47, + "learning_rate": 2.830164925091275e-07, + "logits/chosen": -1.5565662384033203, + "logits/rejected": -1.2802814245224, + "logps/chosen": -334.83917236328125, + "logps/rejected": -580.50732421875, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.101797580718994, + "rewards/margins": 12.574882507324219, + "rewards/rejected": -9.473085403442383, + "step": 4330 + }, + { + "epoch": 1.48, + "learning_rate": 2.8238700742792393e-07, + "logits/chosen": -1.5656096935272217, + "logits/rejected": -1.354305386543274, + "logps/chosen": -388.5872802734375, + "logps/rejected": -447.0284729003906, + "loss": 0.0106, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.723498821258545, + "rewards/margins": 12.825960159301758, + "rewards/rejected": -9.102461814880371, + "step": 4340 + }, + { + "epoch": 1.48, + "learning_rate": 2.8175752234672035e-07, + "logits/chosen": -1.561553955078125, + "logits/rejected": -1.3496907949447632, + "logps/chosen": -459.55462646484375, + "logps/rejected": -582.40771484375, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.287027359008789, + "rewards/margins": 12.110834121704102, + "rewards/rejected": -8.823808670043945, + "step": 4350 + }, + { + "epoch": 1.48, + "learning_rate": 2.8112803726551683e-07, + "logits/chosen": -1.5527913570404053, + "logits/rejected": -1.386364221572876, + "logps/chosen": -491.20074462890625, + "logps/rejected": -434.0328674316406, + "loss": 0.0106, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.880095958709717, + "rewards/margins": 10.701440811157227, + "rewards/rejected": -7.82134485244751, + "step": 4360 + }, + { + "epoch": 1.49, + "learning_rate": 2.8049855218431325e-07, + "logits/chosen": -1.568893313407898, + "logits/rejected": -1.381589651107788, + "logps/chosen": -349.3913269042969, + "logps/rejected": -547.5789794921875, + "loss": 0.01, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.500054121017456, + "rewards/margins": 11.511063575744629, + "rewards/rejected": -8.011009216308594, + "step": 4370 + }, + { + "epoch": 1.49, + "learning_rate": 2.7986906710310967e-07, + "logits/chosen": -1.5507256984710693, + "logits/rejected": -1.2853158712387085, + "logps/chosen": -417.3768615722656, + "logps/rejected": -930.9948120117188, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4849910736083984, + "rewards/margins": 12.736702919006348, + "rewards/rejected": -9.251710891723633, + "step": 4380 + }, + { + "epoch": 1.49, + "learning_rate": 2.7923958202190604e-07, + "logits/chosen": -1.5627254247665405, + "logits/rejected": -1.2921102046966553, + "logps/chosen": -305.49774169921875, + "logps/rejected": -910.4715576171875, + "loss": 0.0066, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.8230881690979004, + "rewards/margins": 12.196969985961914, + "rewards/rejected": -9.373881340026855, + "step": 4390 + }, + { + "epoch": 1.5, + "learning_rate": 2.7861009694070247e-07, + "logits/chosen": -1.5330791473388672, + "logits/rejected": -1.3629655838012695, + "logps/chosen": -428.8568420410156, + "logps/rejected": -481.49407958984375, + "loss": 0.0068, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.3116793632507324, + "rewards/margins": 11.905986785888672, + "rewards/rejected": -8.594307899475098, + "step": 4400 + }, + { + "epoch": 1.5, + "eval_logits/chosen": -1.567962884902954, + "eval_logits/rejected": -1.3257269859313965, + "eval_logps/chosen": -387.222900390625, + "eval_logps/rejected": -652.5496215820312, + "eval_loss": 0.012299657799303532, + "eval_rewards/accuracies": 0.997474730014801, + "eval_rewards/chosen": 3.2699849605560303, + "eval_rewards/margins": 12.573309898376465, + "eval_rewards/rejected": -9.303322792053223, + "eval_runtime": 574.5088, + "eval_samples_per_second": 16.536, + "eval_steps_per_second": 0.517, + "step": 4400 + }, + { + "epoch": 1.5, + "learning_rate": 2.779806118594989e-07, + "logits/chosen": -1.5669740438461304, + "logits/rejected": -1.3345576524734497, + "logps/chosen": -416.5772399902344, + "logps/rejected": -746.0413208007812, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8947136402130127, + "rewards/margins": 13.320287704467773, + "rewards/rejected": -9.425573348999023, + "step": 4410 + }, + { + "epoch": 1.5, + "learning_rate": 2.7735112677829536e-07, + "logits/chosen": -1.5641319751739502, + "logits/rejected": -1.3472621440887451, + "logps/chosen": -318.9018859863281, + "logps/rejected": -606.7911376953125, + "loss": 0.0276, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2370028495788574, + "rewards/margins": 12.15302562713623, + "rewards/rejected": -8.916023254394531, + "step": 4420 + }, + { + "epoch": 1.51, + "learning_rate": 2.767216416970918e-07, + "logits/chosen": -1.5774097442626953, + "logits/rejected": -1.3461287021636963, + "logps/chosen": -321.50054931640625, + "logps/rejected": -483.26312255859375, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4035484790802, + "rewards/margins": 11.756426811218262, + "rewards/rejected": -8.352877616882324, + "step": 4430 + }, + { + "epoch": 1.51, + "learning_rate": 2.760921566158882e-07, + "logits/chosen": -1.5704342126846313, + "logits/rejected": -1.3461997509002686, + "logps/chosen": -376.5470275878906, + "logps/rejected": -648.4378051757812, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0833146572113037, + "rewards/margins": 11.515409469604492, + "rewards/rejected": -8.432095527648926, + "step": 4440 + }, + { + "epoch": 1.51, + "learning_rate": 2.7546267153468463e-07, + "logits/chosen": -1.5196638107299805, + "logits/rejected": -1.3435132503509521, + "logps/chosen": -577.20556640625, + "logps/rejected": -587.968017578125, + "loss": 0.0091, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3741836547851562, + "rewards/margins": 11.995950698852539, + "rewards/rejected": -8.621767044067383, + "step": 4450 + }, + { + "epoch": 1.52, + "learning_rate": 2.74833186453481e-07, + "logits/chosen": -1.5547802448272705, + "logits/rejected": -1.3611857891082764, + "logps/chosen": -469.7144470214844, + "logps/rejected": -670.6278076171875, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.372948408126831, + "rewards/margins": 12.447505950927734, + "rewards/rejected": -9.074556350708008, + "step": 4460 + }, + { + "epoch": 1.52, + "learning_rate": 2.742037013722774e-07, + "logits/chosen": -1.5676251649856567, + "logits/rejected": -1.3201380968093872, + "logps/chosen": -406.49737548828125, + "logps/rejected": -639.8590087890625, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.265721559524536, + "rewards/margins": 12.159043312072754, + "rewards/rejected": -8.89332103729248, + "step": 4470 + }, + { + "epoch": 1.52, + "learning_rate": 2.735742162910739e-07, + "logits/chosen": -1.5791658163070679, + "logits/rejected": -1.3252313137054443, + "logps/chosen": -449.76654052734375, + "logps/rejected": -478.0682067871094, + "loss": 0.0523, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.331035614013672, + "rewards/margins": 12.001869201660156, + "rewards/rejected": -8.6708345413208, + "step": 4480 + }, + { + "epoch": 1.53, + "learning_rate": 2.729447312098703e-07, + "logits/chosen": -1.5534188747406006, + "logits/rejected": -1.3447082042694092, + "logps/chosen": -516.5260620117188, + "logps/rejected": -737.7864990234375, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4898250102996826, + "rewards/margins": 12.464486122131348, + "rewards/rejected": -8.974660873413086, + "step": 4490 + }, + { + "epoch": 1.53, + "learning_rate": 2.7231524612866675e-07, + "logits/chosen": -1.5807605981826782, + "logits/rejected": -1.3340727090835571, + "logps/chosen": -333.02337646484375, + "logps/rejected": -710.5103759765625, + "loss": 0.0093, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3552627563476562, + "rewards/margins": 12.304976463317871, + "rewards/rejected": -8.949713706970215, + "step": 4500 + }, + { + "epoch": 1.53, + "eval_logits/chosen": -1.5700557231903076, + "eval_logits/rejected": -1.321670651435852, + "eval_logps/chosen": -384.0287780761719, + "eval_logps/rejected": -647.8700561523438, + "eval_loss": 0.012181604281067848, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 3.5893986225128174, + "eval_rewards/margins": 12.42477035522461, + "eval_rewards/rejected": -8.835371017456055, + "eval_runtime": 574.9971, + "eval_samples_per_second": 16.522, + "eval_steps_per_second": 0.517, + "step": 4500 + }, + { + "epoch": 1.53, + "learning_rate": 2.7168576104746317e-07, + "logits/chosen": -1.5824540853500366, + "logits/rejected": -1.2995485067367554, + "logps/chosen": -348.568115234375, + "logps/rejected": -717.29736328125, + "loss": 0.0178, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.124833583831787, + "rewards/margins": 13.313066482543945, + "rewards/rejected": -9.188231468200684, + "step": 4510 + }, + { + "epoch": 1.54, + "learning_rate": 2.710562759662596e-07, + "logits/chosen": -1.5651118755340576, + "logits/rejected": -1.3384430408477783, + "logps/chosen": -405.93450927734375, + "logps/rejected": -661.6476440429688, + "loss": 0.007, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.918644428253174, + "rewards/margins": 12.093953132629395, + "rewards/rejected": -8.175308227539062, + "step": 4520 + }, + { + "epoch": 1.54, + "learning_rate": 2.70426790885056e-07, + "logits/chosen": -1.5880855321884155, + "logits/rejected": -1.3448718786239624, + "logps/chosen": -374.66119384765625, + "logps/rejected": -812.947021484375, + "loss": 0.007, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.635627269744873, + "rewards/margins": 12.905779838562012, + "rewards/rejected": -9.270153045654297, + "step": 4530 + }, + { + "epoch": 1.54, + "learning_rate": 2.6979730580385244e-07, + "logits/chosen": -1.558767318725586, + "logits/rejected": -1.3361907005310059, + "logps/chosen": -471.9656677246094, + "logps/rejected": -705.7621459960938, + "loss": 0.0082, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.24249267578125, + "rewards/margins": 11.446606636047363, + "rewards/rejected": -8.204113006591797, + "step": 4540 + }, + { + "epoch": 1.55, + "learning_rate": 2.6916782072264886e-07, + "logits/chosen": -1.5765936374664307, + "logits/rejected": -1.3543466329574585, + "logps/chosen": -389.0654296875, + "logps/rejected": -518.3501586914062, + "loss": 0.0165, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.960689067840576, + "rewards/margins": 11.882513046264648, + "rewards/rejected": -7.9218244552612305, + "step": 4550 + }, + { + "epoch": 1.55, + "learning_rate": 2.685383356414453e-07, + "logits/chosen": -1.5953829288482666, + "logits/rejected": -1.3639295101165771, + "logps/chosen": -407.7821350097656, + "logps/rejected": -544.7317504882812, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.82462215423584, + "rewards/margins": 12.388737678527832, + "rewards/rejected": -9.564115524291992, + "step": 4560 + }, + { + "epoch": 1.55, + "learning_rate": 2.679088505602417e-07, + "logits/chosen": -1.5624397993087769, + "logits/rejected": -1.366820216178894, + "logps/chosen": -446.39532470703125, + "logps/rejected": -735.2801513671875, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4764761924743652, + "rewards/margins": 13.024548530578613, + "rewards/rejected": -9.548072814941406, + "step": 4570 + }, + { + "epoch": 1.56, + "learning_rate": 2.6727936547903813e-07, + "logits/chosen": -1.576667308807373, + "logits/rejected": -1.447074055671692, + "logps/chosen": -323.8355407714844, + "logps/rejected": -676.883056640625, + "loss": 0.0097, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.611523389816284, + "rewards/margins": 11.435809135437012, + "rewards/rejected": -7.824287414550781, + "step": 4580 + }, + { + "epoch": 1.56, + "learning_rate": 2.6664988039783455e-07, + "logits/chosen": -1.5838779211044312, + "logits/rejected": -1.3739904165267944, + "logps/chosen": -305.7718505859375, + "logps/rejected": -476.9453125, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.988311529159546, + "rewards/margins": 11.243171691894531, + "rewards/rejected": -8.254859924316406, + "step": 4590 + }, + { + "epoch": 1.56, + "learning_rate": 2.66020395316631e-07, + "logits/chosen": -1.5930497646331787, + "logits/rejected": -1.3427181243896484, + "logps/chosen": -327.7312927246094, + "logps/rejected": -506.9082946777344, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7603626251220703, + "rewards/margins": 11.619297981262207, + "rewards/rejected": -8.858935356140137, + "step": 4600 + }, + { + "epoch": 1.56, + "eval_logits/chosen": -1.5837931632995605, + "eval_logits/rejected": -1.338111162185669, + "eval_logps/chosen": -385.4079895019531, + "eval_logps/rejected": -647.3306274414062, + "eval_loss": 0.011675420217216015, + "eval_rewards/accuracies": 0.997474730014801, + "eval_rewards/chosen": 3.4514763355255127, + "eval_rewards/margins": 12.232900619506836, + "eval_rewards/rejected": -8.781423568725586, + "eval_runtime": 573.303, + "eval_samples_per_second": 16.571, + "eval_steps_per_second": 0.518, + "step": 4600 + }, + { + "epoch": 1.57, + "learning_rate": 2.6539091023542745e-07, + "logits/chosen": -1.5802656412124634, + "logits/rejected": -1.3715194463729858, + "logps/chosen": -332.55633544921875, + "logps/rejected": -749.2489013671875, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.012269020080566, + "rewards/margins": 13.14435863494873, + "rewards/rejected": -9.132089614868164, + "step": 4610 + }, + { + "epoch": 1.57, + "learning_rate": 2.647614251542238e-07, + "logits/chosen": -1.5664923191070557, + "logits/rejected": -1.3520386219024658, + "logps/chosen": -364.9347839355469, + "logps/rejected": -773.1204833984375, + "loss": 0.0167, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.884031295776367, + "rewards/margins": 12.19784164428711, + "rewards/rejected": -9.313810348510742, + "step": 4620 + }, + { + "epoch": 1.57, + "learning_rate": 2.6413194007302024e-07, + "logits/chosen": -1.573128342628479, + "logits/rejected": -1.3606058359146118, + "logps/chosen": -345.389892578125, + "logps/rejected": -676.2288208007812, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8202741146087646, + "rewards/margins": 11.298421859741211, + "rewards/rejected": -8.478147506713867, + "step": 4630 + }, + { + "epoch": 1.58, + "learning_rate": 2.6350245499181666e-07, + "logits/chosen": -1.5692265033721924, + "logits/rejected": -1.368632435798645, + "logps/chosen": -416.19366455078125, + "logps/rejected": -639.7122192382812, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.592820644378662, + "rewards/margins": 12.517929077148438, + "rewards/rejected": -8.925107955932617, + "step": 4640 + }, + { + "epoch": 1.58, + "learning_rate": 2.628729699106131e-07, + "logits/chosen": -1.5738487243652344, + "logits/rejected": -1.2761101722717285, + "logps/chosen": -331.5666198730469, + "logps/rejected": -600.9445190429688, + "loss": 0.0177, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.793644428253174, + "rewards/margins": 12.527485847473145, + "rewards/rejected": -8.733840942382812, + "step": 4650 + }, + { + "epoch": 1.58, + "learning_rate": 2.6224348482940956e-07, + "logits/chosen": -1.5478246212005615, + "logits/rejected": -1.3203939199447632, + "logps/chosen": -492.5116271972656, + "logps/rejected": -774.1646118164062, + "loss": 0.0205, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.3782639503479004, + "rewards/margins": 12.984552383422852, + "rewards/rejected": -9.606287002563477, + "step": 4660 + }, + { + "epoch": 1.59, + "learning_rate": 2.61613999748206e-07, + "logits/chosen": -1.5718917846679688, + "logits/rejected": -1.3344552516937256, + "logps/chosen": -425.4710998535156, + "logps/rejected": -450.6754455566406, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2286791801452637, + "rewards/margins": 11.520196914672852, + "rewards/rejected": -8.291518211364746, + "step": 4670 + }, + { + "epoch": 1.59, + "learning_rate": 2.609845146670024e-07, + "logits/chosen": -1.5951523780822754, + "logits/rejected": -1.3611605167388916, + "logps/chosen": -303.38299560546875, + "logps/rejected": -464.9132385253906, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6332221031188965, + "rewards/margins": 12.191313743591309, + "rewards/rejected": -8.55809211730957, + "step": 4680 + }, + { + "epoch": 1.59, + "learning_rate": 2.603550295857988e-07, + "logits/chosen": -1.5934396982192993, + "logits/rejected": -1.3806281089782715, + "logps/chosen": -344.48809814453125, + "logps/rejected": -675.7086181640625, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2177586555480957, + "rewards/margins": 10.960406303405762, + "rewards/rejected": -7.742647647857666, + "step": 4690 + }, + { + "epoch": 1.6, + "learning_rate": 2.597255445045952e-07, + "logits/chosen": -1.576796293258667, + "logits/rejected": -1.331266164779663, + "logps/chosen": -448.0177307128906, + "logps/rejected": -496.26910400390625, + "loss": 0.0132, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.870629072189331, + "rewards/margins": 11.30385684967041, + "rewards/rejected": -8.4332275390625, + "step": 4700 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -1.586198091506958, + "eval_logits/rejected": -1.335240364074707, + "eval_logps/chosen": -385.3825378417969, + "eval_logps/rejected": -644.0344848632812, + "eval_loss": 0.011897599324584007, + "eval_rewards/accuracies": 0.997474730014801, + "eval_rewards/chosen": 3.454022169113159, + "eval_rewards/margins": 11.905839920043945, + "eval_rewards/rejected": -8.451817512512207, + "eval_runtime": 572.0947, + "eval_samples_per_second": 16.606, + "eval_steps_per_second": 0.519, + "step": 4700 + }, + { + "epoch": 1.6, + "learning_rate": 2.590960594233916e-07, + "logits/chosen": -1.5953645706176758, + "logits/rejected": -1.298682689666748, + "logps/chosen": -352.5274963378906, + "logps/rejected": -655.4091796875, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.301424503326416, + "rewards/margins": 11.375890731811523, + "rewards/rejected": -8.074466705322266, + "step": 4710 + }, + { + "epoch": 1.6, + "learning_rate": 2.584665743421881e-07, + "logits/chosen": -1.5720751285552979, + "logits/rejected": -1.3486759662628174, + "logps/chosen": -307.88128662109375, + "logps/rejected": -452.779296875, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3723862171173096, + "rewards/margins": 11.433685302734375, + "rewards/rejected": -8.061299324035645, + "step": 4720 + }, + { + "epoch": 1.61, + "learning_rate": 2.578370892609845e-07, + "logits/chosen": -1.5656077861785889, + "logits/rejected": -1.3505555391311646, + "logps/chosen": -374.82293701171875, + "logps/rejected": -515.9763793945312, + "loss": 0.0059, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.244389295578003, + "rewards/margins": 11.142977714538574, + "rewards/rejected": -7.89858865737915, + "step": 4730 + }, + { + "epoch": 1.61, + "learning_rate": 2.5720760417978095e-07, + "logits/chosen": -1.5412850379943848, + "logits/rejected": -1.3561943769454956, + "logps/chosen": -446.4327087402344, + "logps/rejected": -583.0899047851562, + "loss": 0.0105, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.014249324798584, + "rewards/margins": 11.794703483581543, + "rewards/rejected": -8.7804536819458, + "step": 4740 + }, + { + "epoch": 1.61, + "learning_rate": 2.5657811909857737e-07, + "logits/chosen": -1.59699285030365, + "logits/rejected": -1.3868151903152466, + "logps/chosen": -421.68206787109375, + "logps/rejected": -560.4231567382812, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2001800537109375, + "rewards/margins": 11.05296516418457, + "rewards/rejected": -7.852785587310791, + "step": 4750 + }, + { + "epoch": 1.62, + "learning_rate": 2.559486340173738e-07, + "logits/chosen": -1.5786495208740234, + "logits/rejected": -1.3772896528244019, + "logps/chosen": -334.3032531738281, + "logps/rejected": -524.63818359375, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.590693712234497, + "rewards/margins": 11.941690444946289, + "rewards/rejected": -8.350996971130371, + "step": 4760 + }, + { + "epoch": 1.62, + "learning_rate": 2.5531914893617016e-07, + "logits/chosen": -1.6007106304168701, + "logits/rejected": -1.2973322868347168, + "logps/chosen": -335.73834228515625, + "logps/rejected": -871.6506958007812, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8262393474578857, + "rewards/margins": 11.769452095031738, + "rewards/rejected": -8.943212509155273, + "step": 4770 + }, + { + "epoch": 1.62, + "learning_rate": 2.5468966385496664e-07, + "logits/chosen": -1.5615354776382446, + "logits/rejected": -1.342139482498169, + "logps/chosen": -325.63604736328125, + "logps/rejected": -436.28411865234375, + "loss": 0.0084, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.16536283493042, + "rewards/margins": 11.67732048034668, + "rewards/rejected": -8.511957168579102, + "step": 4780 + }, + { + "epoch": 1.63, + "learning_rate": 2.5406017877376306e-07, + "logits/chosen": -1.595267653465271, + "logits/rejected": -1.3399150371551514, + "logps/chosen": -383.64532470703125, + "logps/rejected": -566.6316528320312, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0984179973602295, + "rewards/margins": 11.563322067260742, + "rewards/rejected": -8.464902877807617, + "step": 4790 + }, + { + "epoch": 1.63, + "learning_rate": 2.534306936925595e-07, + "logits/chosen": -1.5738465785980225, + "logits/rejected": -1.3037891387939453, + "logps/chosen": -399.59112548828125, + "logps/rejected": -598.68505859375, + "loss": 0.0085, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.171052932739258, + "rewards/margins": 13.007044792175293, + "rewards/rejected": -8.835992813110352, + "step": 4800 + }, + { + "epoch": 1.63, + "eval_logits/chosen": -1.5765715837478638, + "eval_logits/rejected": -1.3331305980682373, + "eval_logps/chosen": -385.95257568359375, + "eval_logps/rejected": -646.8692016601562, + "eval_loss": 0.01125615369528532, + "eval_rewards/accuracies": 0.996632993221283, + "eval_rewards/chosen": 3.3970179557800293, + "eval_rewards/margins": 12.132306098937988, + "eval_rewards/rejected": -8.735286712646484, + "eval_runtime": 571.8986, + "eval_samples_per_second": 16.611, + "eval_steps_per_second": 0.519, + "step": 4800 + }, + { + "epoch": 1.63, + "learning_rate": 2.528012086113559e-07, + "logits/chosen": -1.5844953060150146, + "logits/rejected": -1.3389002084732056, + "logps/chosen": -338.5472717285156, + "logps/rejected": -802.2797241210938, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4826133251190186, + "rewards/margins": 12.257604598999023, + "rewards/rejected": -8.77499008178711, + "step": 4810 + }, + { + "epoch": 1.64, + "learning_rate": 2.5217172353015233e-07, + "logits/chosen": -1.5755666494369507, + "logits/rejected": -1.3739709854125977, + "logps/chosen": -393.8878479003906, + "logps/rejected": -717.4280395507812, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.946418285369873, + "rewards/margins": 12.013291358947754, + "rewards/rejected": -8.066873550415039, + "step": 4820 + }, + { + "epoch": 1.64, + "learning_rate": 2.5154223844894875e-07, + "logits/chosen": -1.602664589881897, + "logits/rejected": -1.3410438299179077, + "logps/chosen": -433.88848876953125, + "logps/rejected": -491.8291931152344, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.19930362701416, + "rewards/margins": 11.402435302734375, + "rewards/rejected": -8.203130722045898, + "step": 4830 + }, + { + "epoch": 1.65, + "learning_rate": 2.509127533677452e-07, + "logits/chosen": -1.5869371891021729, + "logits/rejected": -1.3926212787628174, + "logps/chosen": -404.59979248046875, + "logps/rejected": -423.90008544921875, + "loss": 0.0107, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6544318199157715, + "rewards/margins": 11.81891918182373, + "rewards/rejected": -8.164487838745117, + "step": 4840 + }, + { + "epoch": 1.65, + "learning_rate": 2.502832682865416e-07, + "logits/chosen": -1.593727469444275, + "logits/rejected": -1.3780322074890137, + "logps/chosen": -405.2511901855469, + "logps/rejected": -622.6893310546875, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7419540882110596, + "rewards/margins": 11.418304443359375, + "rewards/rejected": -7.6763505935668945, + "step": 4850 + }, + { + "epoch": 1.65, + "learning_rate": 2.49653783205338e-07, + "logits/chosen": -1.5823546648025513, + "logits/rejected": -1.3324047327041626, + "logps/chosen": -339.1897888183594, + "logps/rejected": -586.0838623046875, + "loss": 0.0064, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.9650630950927734, + "rewards/margins": 11.485040664672852, + "rewards/rejected": -8.519976615905762, + "step": 4860 + }, + { + "epoch": 1.66, + "learning_rate": 2.4902429812413444e-07, + "logits/chosen": -1.5852962732315063, + "logits/rejected": -1.3198401927947998, + "logps/chosen": -328.0611267089844, + "logps/rejected": -617.8011474609375, + "loss": 0.0089, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1902358531951904, + "rewards/margins": 11.56981086730957, + "rewards/rejected": -8.379575729370117, + "step": 4870 + }, + { + "epoch": 1.66, + "learning_rate": 2.4839481304293086e-07, + "logits/chosen": -1.574779987335205, + "logits/rejected": -1.4085489511489868, + "logps/chosen": -371.4349670410156, + "logps/rejected": -720.2868041992188, + "loss": 0.0089, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0854008197784424, + "rewards/margins": 11.729382514953613, + "rewards/rejected": -8.643980979919434, + "step": 4880 + }, + { + "epoch": 1.66, + "learning_rate": 2.477653279617273e-07, + "logits/chosen": -1.568144679069519, + "logits/rejected": -1.3370033502578735, + "logps/chosen": -450.6881408691406, + "logps/rejected": -918.5833129882812, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.428896427154541, + "rewards/margins": 13.131878852844238, + "rewards/rejected": -9.702981948852539, + "step": 4890 + }, + { + "epoch": 1.67, + "learning_rate": 2.471358428805237e-07, + "logits/chosen": -1.566572904586792, + "logits/rejected": -1.3753902912139893, + "logps/chosen": -418.70648193359375, + "logps/rejected": -672.7622680664062, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1943225860595703, + "rewards/margins": 12.216558456420898, + "rewards/rejected": -9.022233963012695, + "step": 4900 + }, + { + "epoch": 1.67, + "eval_logits/chosen": -1.5969394445419312, + "eval_logits/rejected": -1.3552192449569702, + "eval_logps/chosen": -387.19427490234375, + "eval_logps/rejected": -650.2294921875, + "eval_loss": 0.012126320973038673, + "eval_rewards/accuracies": 0.996632993221283, + "eval_rewards/chosen": 3.2728476524353027, + "eval_rewards/margins": 12.344151496887207, + "eval_rewards/rejected": -9.071304321289062, + "eval_runtime": 572.0522, + "eval_samples_per_second": 16.607, + "eval_steps_per_second": 0.519, + "step": 4900 + }, + { + "epoch": 1.67, + "learning_rate": 2.4650635779932013e-07, + "logits/chosen": -1.5913022756576538, + "logits/rejected": -1.331397294998169, + "logps/chosen": -351.8318176269531, + "logps/rejected": -507.9164123535156, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.842801570892334, + "rewards/margins": 11.720033645629883, + "rewards/rejected": -8.877232551574707, + "step": 4910 + }, + { + "epoch": 1.67, + "learning_rate": 2.4587687271811656e-07, + "logits/chosen": -1.5994876623153687, + "logits/rejected": -1.3168003559112549, + "logps/chosen": -349.27252197265625, + "logps/rejected": -790.8400268554688, + "loss": 0.0086, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.222647190093994, + "rewards/margins": 12.813901901245117, + "rewards/rejected": -9.591253280639648, + "step": 4920 + }, + { + "epoch": 1.68, + "learning_rate": 2.45247387636913e-07, + "logits/chosen": -1.5485001802444458, + "logits/rejected": -1.4097952842712402, + "logps/chosen": -367.8609924316406, + "logps/rejected": -608.3257446289062, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4942245483398438, + "rewards/margins": 11.769346237182617, + "rewards/rejected": -8.27512264251709, + "step": 4930 + }, + { + "epoch": 1.68, + "learning_rate": 2.446179025557094e-07, + "logits/chosen": -1.581343412399292, + "logits/rejected": -1.3481428623199463, + "logps/chosen": -521.8402709960938, + "logps/rejected": -741.0032958984375, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4162158966064453, + "rewards/margins": 12.157910346984863, + "rewards/rejected": -8.741695404052734, + "step": 4940 + }, + { + "epoch": 1.68, + "learning_rate": 2.439884174745059e-07, + "logits/chosen": -1.5672409534454346, + "logits/rejected": -1.291656255722046, + "logps/chosen": -397.7919616699219, + "logps/rejected": -526.3001708984375, + "loss": 0.0165, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.695279359817505, + "rewards/margins": 11.808991432189941, + "rewards/rejected": -9.113714218139648, + "step": 4950 + }, + { + "epoch": 1.69, + "learning_rate": 2.4335893239330225e-07, + "logits/chosen": -1.559485912322998, + "logits/rejected": -1.4066263437271118, + "logps/chosen": -435.94281005859375, + "logps/rejected": -631.257568359375, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.933976650238037, + "rewards/margins": 12.108378410339355, + "rewards/rejected": -9.174402236938477, + "step": 4960 + }, + { + "epoch": 1.69, + "learning_rate": 2.4272944731209867e-07, + "logits/chosen": -1.5883634090423584, + "logits/rejected": -1.3917853832244873, + "logps/chosen": -415.3692321777344, + "logps/rejected": -601.6611328125, + "loss": 0.0111, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2792468070983887, + "rewards/margins": 13.196688652038574, + "rewards/rejected": -9.917442321777344, + "step": 4970 + }, + { + "epoch": 1.69, + "learning_rate": 2.4209996223089514e-07, + "logits/chosen": -1.5781867504119873, + "logits/rejected": -1.4130737781524658, + "logps/chosen": -415.3194274902344, + "logps/rejected": -631.3121337890625, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.908569097518921, + "rewards/margins": 11.958176612854004, + "rewards/rejected": -9.04960823059082, + "step": 4980 + }, + { + "epoch": 1.7, + "learning_rate": 2.4147047714969157e-07, + "logits/chosen": -1.5828626155853271, + "logits/rejected": -1.3957014083862305, + "logps/chosen": -345.7574768066406, + "logps/rejected": -684.1732177734375, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.811290740966797, + "rewards/margins": 12.056185722351074, + "rewards/rejected": -8.244895935058594, + "step": 4990 + }, + { + "epoch": 1.7, + "learning_rate": 2.4084099206848794e-07, + "logits/chosen": -1.5823535919189453, + "logits/rejected": -1.3998852968215942, + "logps/chosen": -325.7845153808594, + "logps/rejected": -554.8994750976562, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4590084552764893, + "rewards/margins": 11.742083549499512, + "rewards/rejected": -8.283075332641602, + "step": 5000 + }, + { + "epoch": 1.7, + "eval_logits/chosen": -1.5739115476608276, + "eval_logits/rejected": -1.3307162523269653, + "eval_logps/chosen": -388.2237243652344, + "eval_logps/rejected": -653.7092895507812, + "eval_loss": 0.01058993674814701, + "eval_rewards/accuracies": 0.997474730014801, + "eval_rewards/chosen": 3.169904947280884, + "eval_rewards/margins": 12.589190483093262, + "eval_rewards/rejected": -9.41928482055664, + "eval_runtime": 571.7488, + "eval_samples_per_second": 16.616, + "eval_steps_per_second": 0.519, + "step": 5000 + }, + { + "epoch": 1.7, + "learning_rate": 2.402115069872844e-07, + "logits/chosen": -1.6081085205078125, + "logits/rejected": -1.3910783529281616, + "logps/chosen": -365.353271484375, + "logps/rejected": -488.6876525878906, + "loss": 0.0099, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9899582862854004, + "rewards/margins": 12.198873519897461, + "rewards/rejected": -9.208916664123535, + "step": 5010 + }, + { + "epoch": 1.71, + "learning_rate": 2.3958202190608084e-07, + "logits/chosen": -1.5861672163009644, + "logits/rejected": -1.3388426303863525, + "logps/chosen": -413.3779296875, + "logps/rejected": -553.15380859375, + "loss": 0.0073, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.3428969383239746, + "rewards/margins": 12.404275894165039, + "rewards/rejected": -9.061378479003906, + "step": 5020 + }, + { + "epoch": 1.71, + "learning_rate": 2.3895253682487726e-07, + "logits/chosen": -1.581631064414978, + "logits/rejected": -1.3349254131317139, + "logps/chosen": -344.7869567871094, + "logps/rejected": -654.842529296875, + "loss": 0.0271, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.2765603065490723, + "rewards/margins": 11.532754898071289, + "rewards/rejected": -8.256193161010742, + "step": 5030 + }, + { + "epoch": 1.71, + "learning_rate": 2.3832305174367368e-07, + "logits/chosen": -1.574760913848877, + "logits/rejected": -1.3747285604476929, + "logps/chosen": -406.5971374511719, + "logps/rejected": -581.491943359375, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0485939979553223, + "rewards/margins": 11.67126178741455, + "rewards/rejected": -8.62266731262207, + "step": 5040 + }, + { + "epoch": 1.72, + "learning_rate": 2.3769356666247008e-07, + "logits/chosen": -1.577365517616272, + "logits/rejected": -1.3778914213180542, + "logps/chosen": -390.0703430175781, + "logps/rejected": -632.7647705078125, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.893449306488037, + "rewards/margins": 11.523534774780273, + "rewards/rejected": -8.630085945129395, + "step": 5050 + }, + { + "epoch": 1.72, + "learning_rate": 2.370640815812665e-07, + "logits/chosen": -1.5791559219360352, + "logits/rejected": -1.3136100769042969, + "logps/chosen": -413.769775390625, + "logps/rejected": -628.4302978515625, + "loss": 0.0067, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.3355441093444824, + "rewards/margins": 12.887995719909668, + "rewards/rejected": -9.552452087402344, + "step": 5060 + }, + { + "epoch": 1.72, + "learning_rate": 2.3643459650006295e-07, + "logits/chosen": -1.590283989906311, + "logits/rejected": -1.4215797185897827, + "logps/chosen": -407.3586730957031, + "logps/rejected": -470.0646057128906, + "loss": 0.0296, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.442373752593994, + "rewards/margins": 11.997186660766602, + "rewards/rejected": -8.554813385009766, + "step": 5070 + }, + { + "epoch": 1.73, + "learning_rate": 2.3580511141885937e-07, + "logits/chosen": -1.5737007856369019, + "logits/rejected": -1.3570020198822021, + "logps/chosen": -393.2453308105469, + "logps/rejected": -500.4339904785156, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.366410732269287, + "rewards/margins": 11.313410758972168, + "rewards/rejected": -7.947000980377197, + "step": 5080 + }, + { + "epoch": 1.73, + "learning_rate": 2.3517562633765577e-07, + "logits/chosen": -1.5474077463150024, + "logits/rejected": -1.3519481420516968, + "logps/chosen": -303.19964599609375, + "logps/rejected": -487.8985900878906, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6122899055480957, + "rewards/margins": 11.95722770690918, + "rewards/rejected": -8.344938278198242, + "step": 5090 + }, + { + "epoch": 1.73, + "learning_rate": 2.3454614125645222e-07, + "logits/chosen": -1.5498723983764648, + "logits/rejected": -1.3690489530563354, + "logps/chosen": -354.73126220703125, + "logps/rejected": -696.5047607421875, + "loss": 0.0116, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.7654943466186523, + "rewards/margins": 11.734155654907227, + "rewards/rejected": -8.968660354614258, + "step": 5100 + }, + { + "epoch": 1.73, + "eval_logits/chosen": -1.574830174446106, + "eval_logits/rejected": -1.3274495601654053, + "eval_logps/chosen": -387.2066955566406, + "eval_logps/rejected": -649.8084716796875, + "eval_loss": 0.009646850638091564, + "eval_rewards/accuracies": 0.9957912564277649, + "eval_rewards/chosen": 3.2716057300567627, + "eval_rewards/margins": 12.300819396972656, + "eval_rewards/rejected": -9.029214859008789, + "eval_runtime": 572.22, + "eval_samples_per_second": 16.602, + "eval_steps_per_second": 0.519, + "step": 5100 + }, + { + "epoch": 1.74, + "learning_rate": 2.3391665617524864e-07, + "logits/chosen": -1.5520095825195312, + "logits/rejected": -1.3251075744628906, + "logps/chosen": -458.45849609375, + "logps/rejected": -884.36279296875, + "loss": 0.0086, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8091976642608643, + "rewards/margins": 13.736592292785645, + "rewards/rejected": -9.92739486694336, + "step": 5110 + }, + { + "epoch": 1.74, + "learning_rate": 2.3328717109404506e-07, + "logits/chosen": -1.5521091222763062, + "logits/rejected": -1.3074095249176025, + "logps/chosen": -339.93756103515625, + "logps/rejected": -531.7042846679688, + "loss": 0.0114, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.290306806564331, + "rewards/margins": 12.762462615966797, + "rewards/rejected": -9.472156524658203, + "step": 5120 + }, + { + "epoch": 1.74, + "learning_rate": 2.3265768601284149e-07, + "logits/chosen": -1.5432167053222656, + "logits/rejected": -1.3495628833770752, + "logps/chosen": -493.0538635253906, + "logps/rejected": -682.4107666015625, + "loss": 0.0042, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.591378688812256, + "rewards/margins": 12.024812698364258, + "rewards/rejected": -8.433433532714844, + "step": 5130 + }, + { + "epoch": 1.75, + "learning_rate": 2.320282009316379e-07, + "logits/chosen": -1.543609619140625, + "logits/rejected": -1.339796543121338, + "logps/chosen": -481.4986877441406, + "logps/rejected": -809.5143432617188, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7638771533966064, + "rewards/margins": 13.186566352844238, + "rewards/rejected": -9.422689437866211, + "step": 5140 + }, + { + "epoch": 1.75, + "learning_rate": 2.3139871585043433e-07, + "logits/chosen": -1.5696828365325928, + "logits/rejected": -1.3172285556793213, + "logps/chosen": -402.095947265625, + "logps/rejected": -515.7801513671875, + "loss": 0.0076, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.004603862762451, + "rewards/margins": 11.735708236694336, + "rewards/rejected": -8.731104850769043, + "step": 5150 + }, + { + "epoch": 1.75, + "learning_rate": 2.3076923076923078e-07, + "logits/chosen": -1.5352977514266968, + "logits/rejected": -1.3456909656524658, + "logps/chosen": -503.0603942871094, + "logps/rejected": -577.761962890625, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4081103801727295, + "rewards/margins": 12.933027267456055, + "rewards/rejected": -9.524917602539062, + "step": 5160 + }, + { + "epoch": 1.76, + "learning_rate": 2.3013974568802718e-07, + "logits/chosen": -1.553674340248108, + "logits/rejected": -1.3577382564544678, + "logps/chosen": -390.30682373046875, + "logps/rejected": -529.7505493164062, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7188827991485596, + "rewards/margins": 12.600153923034668, + "rewards/rejected": -8.881269454956055, + "step": 5170 + }, + { + "epoch": 1.76, + "learning_rate": 2.295102606068236e-07, + "logits/chosen": -1.566882610321045, + "logits/rejected": -1.329391360282898, + "logps/chosen": -342.52801513671875, + "logps/rejected": -859.9151611328125, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3750109672546387, + "rewards/margins": 12.931638717651367, + "rewards/rejected": -9.556629180908203, + "step": 5180 + }, + { + "epoch": 1.76, + "learning_rate": 2.2888077552562005e-07, + "logits/chosen": -1.5974833965301514, + "logits/rejected": -1.3018697500228882, + "logps/chosen": -345.0240783691406, + "logps/rejected": -687.2056884765625, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.810941219329834, + "rewards/margins": 13.485822677612305, + "rewards/rejected": -9.674881935119629, + "step": 5190 + }, + { + "epoch": 1.77, + "learning_rate": 2.2825129044441647e-07, + "logits/chosen": -1.5678842067718506, + "logits/rejected": -1.3167750835418701, + "logps/chosen": -322.955078125, + "logps/rejected": -579.8414306640625, + "loss": 0.0093, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.3619937896728516, + "rewards/margins": 13.055994033813477, + "rewards/rejected": -9.694000244140625, + "step": 5200 + }, + { + "epoch": 1.77, + "eval_logits/chosen": -1.549462080001831, + "eval_logits/rejected": -1.3152766227722168, + "eval_logps/chosen": -387.6946105957031, + "eval_logps/rejected": -652.9937744140625, + "eval_loss": 0.010288468562066555, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 3.222813367843628, + "eval_rewards/margins": 12.570555686950684, + "eval_rewards/rejected": -9.347742080688477, + "eval_runtime": 572.7069, + "eval_samples_per_second": 16.588, + "eval_steps_per_second": 0.519, + "step": 5200 + }, + { + "epoch": 1.77, + "learning_rate": 2.2762180536321287e-07, + "logits/chosen": -1.5619487762451172, + "logits/rejected": -1.4257888793945312, + "logps/chosen": -332.32818603515625, + "logps/rejected": -690.8129272460938, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3254685401916504, + "rewards/margins": 11.27719783782959, + "rewards/rejected": -7.951729774475098, + "step": 5210 + }, + { + "epoch": 1.77, + "learning_rate": 2.2699232028200932e-07, + "logits/chosen": -1.5653175115585327, + "logits/rejected": -1.3985856771469116, + "logps/chosen": -410.2430114746094, + "logps/rejected": -784.0066528320312, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2796382904052734, + "rewards/margins": 12.211853981018066, + "rewards/rejected": -8.932214736938477, + "step": 5220 + }, + { + "epoch": 1.78, + "learning_rate": 2.2636283520080574e-07, + "logits/chosen": -1.5587904453277588, + "logits/rejected": -1.3683871030807495, + "logps/chosen": -454.5293884277344, + "logps/rejected": -597.5782470703125, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8911080360412598, + "rewards/margins": 12.435003280639648, + "rewards/rejected": -9.543895721435547, + "step": 5230 + }, + { + "epoch": 1.78, + "learning_rate": 2.2573335011960216e-07, + "logits/chosen": -1.564541220664978, + "logits/rejected": -1.3123071193695068, + "logps/chosen": -332.67742919921875, + "logps/rejected": -794.0281982421875, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.515615940093994, + "rewards/margins": 12.901514053344727, + "rewards/rejected": -9.385897636413574, + "step": 5240 + }, + { + "epoch": 1.78, + "learning_rate": 2.2510386503839856e-07, + "logits/chosen": -1.5765674114227295, + "logits/rejected": -1.343785047531128, + "logps/chosen": -342.3314208984375, + "logps/rejected": -668.9845581054688, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.506909132003784, + "rewards/margins": 12.336097717285156, + "rewards/rejected": -8.829188346862793, + "step": 5250 + }, + { + "epoch": 1.79, + "learning_rate": 2.24474379957195e-07, + "logits/chosen": -1.5395399332046509, + "logits/rejected": -1.3350563049316406, + "logps/chosen": -493.20556640625, + "logps/rejected": -793.2821044921875, + "loss": 0.0119, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.601318359375, + "rewards/margins": 13.143940925598145, + "rewards/rejected": -9.542622566223145, + "step": 5260 + }, + { + "epoch": 1.79, + "learning_rate": 2.2384489487599143e-07, + "logits/chosen": -1.5338081121444702, + "logits/rejected": -1.3050000667572021, + "logps/chosen": -488.8599548339844, + "logps/rejected": -803.2353515625, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.386167526245117, + "rewards/margins": 13.006983757019043, + "rewards/rejected": -9.620818138122559, + "step": 5270 + }, + { + "epoch": 1.79, + "learning_rate": 2.2321540979478783e-07, + "logits/chosen": -1.5569932460784912, + "logits/rejected": -1.3241132497787476, + "logps/chosen": -393.7997131347656, + "logps/rejected": -698.0803833007812, + "loss": 0.0089, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.974869966506958, + "rewards/margins": 13.234077453613281, + "rewards/rejected": -10.259209632873535, + "step": 5280 + }, + { + "epoch": 1.8, + "learning_rate": 2.2258592471358428e-07, + "logits/chosen": -1.5720268487930298, + "logits/rejected": -1.365965485572815, + "logps/chosen": -328.6231384277344, + "logps/rejected": -720.9183349609375, + "loss": 0.0116, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.796217441558838, + "rewards/margins": 12.159344673156738, + "rewards/rejected": -9.363128662109375, + "step": 5290 + }, + { + "epoch": 1.8, + "learning_rate": 2.219564396323807e-07, + "logits/chosen": -1.5677629709243774, + "logits/rejected": -1.281874418258667, + "logps/chosen": -335.3643493652344, + "logps/rejected": -780.1639404296875, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6878421306610107, + "rewards/margins": 12.665910720825195, + "rewards/rejected": -9.978068351745605, + "step": 5300 + }, + { + "epoch": 1.8, + "eval_logits/chosen": -1.5594065189361572, + "eval_logits/rejected": -1.3273098468780518, + "eval_logps/chosen": -388.6713562011719, + "eval_logps/rejected": -655.5680541992188, + "eval_loss": 0.010250851511955261, + "eval_rewards/accuracies": 0.996632993221283, + "eval_rewards/chosen": 3.1251397132873535, + "eval_rewards/margins": 12.730304718017578, + "eval_rewards/rejected": -9.60516357421875, + "eval_runtime": 574.1567, + "eval_samples_per_second": 16.546, + "eval_steps_per_second": 0.517, + "step": 5300 + }, + { + "epoch": 1.8, + "learning_rate": 2.2132695455117712e-07, + "logits/chosen": -1.5727773904800415, + "logits/rejected": -1.4024388790130615, + "logps/chosen": -351.7112731933594, + "logps/rejected": -502.00244140625, + "loss": 0.0053, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.4972426891326904, + "rewards/margins": 12.715316772460938, + "rewards/rejected": -9.218074798583984, + "step": 5310 + }, + { + "epoch": 1.81, + "learning_rate": 2.2069746946997355e-07, + "logits/chosen": -1.5766611099243164, + "logits/rejected": -1.3245556354522705, + "logps/chosen": -287.96563720703125, + "logps/rejected": -430.05279541015625, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.615394115447998, + "rewards/margins": 11.580501556396484, + "rewards/rejected": -8.965107917785645, + "step": 5320 + }, + { + "epoch": 1.81, + "learning_rate": 2.2006798438876997e-07, + "logits/chosen": -1.5362478494644165, + "logits/rejected": -1.3397371768951416, + "logps/chosen": -475.42840576171875, + "logps/rejected": -760.2046508789062, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.601423740386963, + "rewards/margins": 11.93852710723877, + "rewards/rejected": -9.337103843688965, + "step": 5330 + }, + { + "epoch": 1.82, + "learning_rate": 2.194384993075664e-07, + "logits/chosen": -1.5621169805526733, + "logits/rejected": -1.3535120487213135, + "logps/chosen": -304.7786560058594, + "logps/rejected": -661.0731811523438, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.161728858947754, + "rewards/margins": 12.517748832702637, + "rewards/rejected": -9.3560209274292, + "step": 5340 + }, + { + "epoch": 1.82, + "learning_rate": 2.1880901422636284e-07, + "logits/chosen": -1.5866388082504272, + "logits/rejected": -1.3659638166427612, + "logps/chosen": -332.0908203125, + "logps/rejected": -582.4724731445312, + "loss": 0.0095, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.690329074859619, + "rewards/margins": 13.068005561828613, + "rewards/rejected": -9.377676963806152, + "step": 5350 + }, + { + "epoch": 1.82, + "learning_rate": 2.1817952914515924e-07, + "logits/chosen": -1.5866968631744385, + "logits/rejected": -1.3916261196136475, + "logps/chosen": -328.92510986328125, + "logps/rejected": -631.3219604492188, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.355168581008911, + "rewards/margins": 12.187517166137695, + "rewards/rejected": -8.832348823547363, + "step": 5360 + }, + { + "epoch": 1.83, + "learning_rate": 2.1755004406395566e-07, + "logits/chosen": -1.5792675018310547, + "logits/rejected": -1.3011935949325562, + "logps/chosen": -422.63409423828125, + "logps/rejected": -486.2444763183594, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.016300201416016, + "rewards/margins": 13.841157913208008, + "rewards/rejected": -9.824856758117676, + "step": 5370 + }, + { + "epoch": 1.83, + "learning_rate": 2.169205589827521e-07, + "logits/chosen": -1.582124948501587, + "logits/rejected": -1.2963042259216309, + "logps/chosen": -347.41094970703125, + "logps/rejected": -574.789306640625, + "loss": 0.0114, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0046887397766113, + "rewards/margins": 13.203763008117676, + "rewards/rejected": -10.199073791503906, + "step": 5380 + }, + { + "epoch": 1.83, + "learning_rate": 2.1629107390154853e-07, + "logits/chosen": -1.5856962203979492, + "logits/rejected": -1.2681124210357666, + "logps/chosen": -330.63116455078125, + "logps/rejected": -684.1741333007812, + "loss": 0.0092, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0809762477874756, + "rewards/margins": 12.916654586791992, + "rewards/rejected": -9.835679054260254, + "step": 5390 + }, + { + "epoch": 1.84, + "learning_rate": 2.1566158882034493e-07, + "logits/chosen": -1.5775415897369385, + "logits/rejected": -1.366699457168579, + "logps/chosen": -319.5450439453125, + "logps/rejected": -491.41046142578125, + "loss": 0.0066, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.945000171661377, + "rewards/margins": 12.414464950561523, + "rewards/rejected": -8.469464302062988, + "step": 5400 + }, + { + "epoch": 1.84, + "eval_logits/chosen": -1.5720725059509277, + "eval_logits/rejected": -1.3330131769180298, + "eval_logps/chosen": -384.7553405761719, + "eval_logps/rejected": -650.0753784179688, + "eval_loss": 0.009380945935845375, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 3.516740560531616, + "eval_rewards/margins": 12.57264232635498, + "eval_rewards/rejected": -9.055903434753418, + "eval_runtime": 572.9488, + "eval_samples_per_second": 16.581, + "eval_steps_per_second": 0.518, + "step": 5400 + }, + { + "epoch": 1.84, + "learning_rate": 2.1503210373914138e-07, + "logits/chosen": -1.5731507539749146, + "logits/rejected": -1.3907480239868164, + "logps/chosen": -399.7240295410156, + "logps/rejected": -778.9546508789062, + "loss": 0.0048, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.9262969493865967, + "rewards/margins": 12.531413078308105, + "rewards/rejected": -8.605116844177246, + "step": 5410 + }, + { + "epoch": 1.84, + "learning_rate": 2.144026186579378e-07, + "logits/chosen": -1.574392318725586, + "logits/rejected": -1.3216757774353027, + "logps/chosen": -304.32183837890625, + "logps/rejected": -762.1922607421875, + "loss": 0.0147, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.8099148273468018, + "rewards/margins": 13.454829216003418, + "rewards/rejected": -9.644914627075195, + "step": 5420 + }, + { + "epoch": 1.85, + "learning_rate": 2.1377313357673422e-07, + "logits/chosen": -1.5652107000350952, + "logits/rejected": -1.3855630159378052, + "logps/chosen": -388.4117736816406, + "logps/rejected": -682.9684448242188, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0735394954681396, + "rewards/margins": 11.171327590942383, + "rewards/rejected": -8.097785949707031, + "step": 5430 + }, + { + "epoch": 1.85, + "learning_rate": 2.1314364849553065e-07, + "logits/chosen": -1.5598199367523193, + "logits/rejected": -1.3214751482009888, + "logps/chosen": -374.487060546875, + "logps/rejected": -556.2450561523438, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.024935245513916, + "rewards/margins": 11.697943687438965, + "rewards/rejected": -8.673008918762207, + "step": 5440 + }, + { + "epoch": 1.85, + "learning_rate": 2.1251416341432707e-07, + "logits/chosen": -1.5670108795166016, + "logits/rejected": -1.310826301574707, + "logps/chosen": -415.1046447753906, + "logps/rejected": -736.4661865234375, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.079799652099609, + "rewards/margins": 13.034085273742676, + "rewards/rejected": -8.95428466796875, + "step": 5450 + }, + { + "epoch": 1.86, + "learning_rate": 2.118846783331235e-07, + "logits/chosen": -1.5449684858322144, + "logits/rejected": -1.2761390209197998, + "logps/chosen": -435.33673095703125, + "logps/rejected": -633.0001220703125, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.350632429122925, + "rewards/margins": 12.148347854614258, + "rewards/rejected": -8.79771614074707, + "step": 5460 + }, + { + "epoch": 1.86, + "learning_rate": 2.1125519325191994e-07, + "logits/chosen": -1.5703989267349243, + "logits/rejected": -1.2627012729644775, + "logps/chosen": -346.9437561035156, + "logps/rejected": -744.6874389648438, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.421677827835083, + "rewards/margins": 13.0451021194458, + "rewards/rejected": -9.62342357635498, + "step": 5470 + }, + { + "epoch": 1.86, + "learning_rate": 2.1062570817071634e-07, + "logits/chosen": -1.5710852146148682, + "logits/rejected": -1.3873956203460693, + "logps/chosen": -451.33660888671875, + "logps/rejected": -671.2227783203125, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.976327896118164, + "rewards/margins": 12.036826133728027, + "rewards/rejected": -8.06049919128418, + "step": 5480 + }, + { + "epoch": 1.87, + "learning_rate": 2.0999622308951276e-07, + "logits/chosen": -1.5577223300933838, + "logits/rejected": -1.2908861637115479, + "logps/chosen": -394.15142822265625, + "logps/rejected": -720.3331298828125, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.853203296661377, + "rewards/margins": 13.717082023620605, + "rewards/rejected": -9.863879203796387, + "step": 5490 + }, + { + "epoch": 1.87, + "learning_rate": 2.093667380083092e-07, + "logits/chosen": -1.5653389692306519, + "logits/rejected": -1.3116123676300049, + "logps/chosen": -339.7210388183594, + "logps/rejected": -565.1358642578125, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1849656105041504, + "rewards/margins": 11.719869613647461, + "rewards/rejected": -8.534902572631836, + "step": 5500 + }, + { + "epoch": 1.87, + "eval_logits/chosen": -1.5598562955856323, + "eval_logits/rejected": -1.317143201828003, + "eval_logps/chosen": -384.03857421875, + "eval_logps/rejected": -649.7782592773438, + "eval_loss": 0.00933061819523573, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 3.588416814804077, + "eval_rewards/margins": 12.614607810974121, + "eval_rewards/rejected": -9.026191711425781, + "eval_runtime": 570.0431, + "eval_samples_per_second": 16.665, + "eval_steps_per_second": 0.521, + "step": 5500 + }, + { + "epoch": 1.87, + "learning_rate": 2.087372529271056e-07, + "logits/chosen": -1.5455589294433594, + "logits/rejected": -1.286950945854187, + "logps/chosen": -477.8189392089844, + "logps/rejected": -736.4945068359375, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.651362180709839, + "rewards/margins": 13.368435859680176, + "rewards/rejected": -9.717074394226074, + "step": 5510 + }, + { + "epoch": 1.88, + "learning_rate": 2.0810776784590203e-07, + "logits/chosen": -1.5545756816864014, + "logits/rejected": -1.283362627029419, + "logps/chosen": -335.6719055175781, + "logps/rejected": -628.423095703125, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.07934308052063, + "rewards/margins": 13.087526321411133, + "rewards/rejected": -10.008182525634766, + "step": 5520 + }, + { + "epoch": 1.88, + "learning_rate": 2.0747828276469848e-07, + "logits/chosen": -1.5584474802017212, + "logits/rejected": -1.3904309272766113, + "logps/chosen": -455.1902770996094, + "logps/rejected": -796.02734375, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4554061889648438, + "rewards/margins": 11.802480697631836, + "rewards/rejected": -8.347075462341309, + "step": 5530 + }, + { + "epoch": 1.88, + "learning_rate": 2.068487976834949e-07, + "logits/chosen": -1.539808750152588, + "logits/rejected": -1.3378984928131104, + "logps/chosen": -356.41925048828125, + "logps/rejected": -625.2384643554688, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1959996223449707, + "rewards/margins": 12.564947128295898, + "rewards/rejected": -9.368947982788086, + "step": 5540 + }, + { + "epoch": 1.89, + "learning_rate": 2.062193126022913e-07, + "logits/chosen": -1.5628660917282104, + "logits/rejected": -1.318337321281433, + "logps/chosen": -315.418701171875, + "logps/rejected": -740.911865234375, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.270336151123047, + "rewards/margins": 12.624650955200195, + "rewards/rejected": -9.354316711425781, + "step": 5550 + }, + { + "epoch": 1.89, + "learning_rate": 2.0558982752108775e-07, + "logits/chosen": -1.588404893875122, + "logits/rejected": -1.2994272708892822, + "logps/chosen": -368.47747802734375, + "logps/rejected": -743.4070434570312, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8771255016326904, + "rewards/margins": 13.238743782043457, + "rewards/rejected": -10.36161994934082, + "step": 5560 + }, + { + "epoch": 1.89, + "learning_rate": 2.0496034243988417e-07, + "logits/chosen": -1.5635043382644653, + "logits/rejected": -1.334977388381958, + "logps/chosen": -403.81280517578125, + "logps/rejected": -740.9823608398438, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4309468269348145, + "rewards/margins": 11.773470878601074, + "rewards/rejected": -9.342524528503418, + "step": 5570 + }, + { + "epoch": 1.9, + "learning_rate": 2.043308573586806e-07, + "logits/chosen": -1.5360522270202637, + "logits/rejected": -1.3296664953231812, + "logps/chosen": -461.235595703125, + "logps/rejected": -799.4022216796875, + "loss": 0.0181, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.090125560760498, + "rewards/margins": 12.993644714355469, + "rewards/rejected": -9.903517723083496, + "step": 5580 + }, + { + "epoch": 1.9, + "learning_rate": 2.0370137227747701e-07, + "logits/chosen": -1.5685545206069946, + "logits/rejected": -1.3774278163909912, + "logps/chosen": -422.351806640625, + "logps/rejected": -739.2772827148438, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7383501529693604, + "rewards/margins": 12.71796989440918, + "rewards/rejected": -9.979619979858398, + "step": 5590 + }, + { + "epoch": 1.9, + "learning_rate": 2.0307188719627344e-07, + "logits/chosen": -1.5561625957489014, + "logits/rejected": -1.317685604095459, + "logps/chosen": -484.38177490234375, + "logps/rejected": -562.920166015625, + "loss": 0.0134, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9664769172668457, + "rewards/margins": 11.963323593139648, + "rewards/rejected": -8.996845245361328, + "step": 5600 + }, + { + "epoch": 1.9, + "eval_logits/chosen": -1.5644868612289429, + "eval_logits/rejected": -1.3368220329284668, + "eval_logps/chosen": -389.048828125, + "eval_logps/rejected": -657.5431518554688, + "eval_loss": 0.009319731034338474, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 3.087395429611206, + "eval_rewards/margins": 12.890071868896484, + "eval_rewards/rejected": -9.802677154541016, + "eval_runtime": 588.8103, + "eval_samples_per_second": 16.134, + "eval_steps_per_second": 0.504, + "step": 5600 + }, + { + "epoch": 1.91, + "learning_rate": 2.0244240211506986e-07, + "logits/chosen": -1.5812385082244873, + "logits/rejected": -1.367365837097168, + "logps/chosen": -372.86920166015625, + "logps/rejected": -473.0984802246094, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.745804786682129, + "rewards/margins": 12.244952201843262, + "rewards/rejected": -9.499147415161133, + "step": 5610 + }, + { + "epoch": 1.91, + "learning_rate": 2.018129170338663e-07, + "logits/chosen": -1.5326608419418335, + "logits/rejected": -1.3504233360290527, + "logps/chosen": -586.9166870117188, + "logps/rejected": -584.9916381835938, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5567238330841064, + "rewards/margins": 13.971818923950195, + "rewards/rejected": -10.415095329284668, + "step": 5620 + }, + { + "epoch": 1.91, + "learning_rate": 2.011834319526627e-07, + "logits/chosen": -1.5651975870132446, + "logits/rejected": -1.3767033815383911, + "logps/chosen": -405.72247314453125, + "logps/rejected": -605.7102661132812, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.916785478591919, + "rewards/margins": 13.681718826293945, + "rewards/rejected": -9.764933586120605, + "step": 5630 + }, + { + "epoch": 1.92, + "learning_rate": 2.0055394687145913e-07, + "logits/chosen": -1.5448428392410278, + "logits/rejected": -1.3316487073898315, + "logps/chosen": -386.0478515625, + "logps/rejected": -901.8233642578125, + "loss": 0.0256, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4366188049316406, + "rewards/margins": 12.76618480682373, + "rewards/rejected": -9.32956600189209, + "step": 5640 + }, + { + "epoch": 1.92, + "learning_rate": 1.9992446179025558e-07, + "logits/chosen": -1.5700167417526245, + "logits/rejected": -1.4121501445770264, + "logps/chosen": -329.1085205078125, + "logps/rejected": -710.774658203125, + "loss": 0.0107, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0071444511413574, + "rewards/margins": 11.8978910446167, + "rewards/rejected": -8.890746116638184, + "step": 5650 + }, + { + "epoch": 1.92, + "learning_rate": 1.99294976709052e-07, + "logits/chosen": -1.561606764793396, + "logits/rejected": -1.3756434917449951, + "logps/chosen": -343.8679504394531, + "logps/rejected": -637.36083984375, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9006857872009277, + "rewards/margins": 12.96288776397705, + "rewards/rejected": -10.062201499938965, + "step": 5660 + }, + { + "epoch": 1.93, + "learning_rate": 1.986654916278484e-07, + "logits/chosen": -1.5785058736801147, + "logits/rejected": -1.3644511699676514, + "logps/chosen": -336.99578857421875, + "logps/rejected": -672.2367553710938, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.789386510848999, + "rewards/margins": 13.537968635559082, + "rewards/rejected": -10.748581886291504, + "step": 5670 + }, + { + "epoch": 1.93, + "learning_rate": 1.9803600654664484e-07, + "logits/chosen": -1.575010061264038, + "logits/rejected": -1.3783913850784302, + "logps/chosen": -338.83282470703125, + "logps/rejected": -920.6682739257812, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9358177185058594, + "rewards/margins": 13.603116035461426, + "rewards/rejected": -9.66729736328125, + "step": 5680 + }, + { + "epoch": 1.93, + "learning_rate": 1.9740652146544127e-07, + "logits/chosen": -1.580043077468872, + "logits/rejected": -1.3623154163360596, + "logps/chosen": -422.20599365234375, + "logps/rejected": -648.5010375976562, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4759020805358887, + "rewards/margins": 13.383193969726562, + "rewards/rejected": -9.907292366027832, + "step": 5690 + }, + { + "epoch": 1.94, + "learning_rate": 1.9677703638423766e-07, + "logits/chosen": -1.5734128952026367, + "logits/rejected": -1.3318603038787842, + "logps/chosen": -345.6842956542969, + "logps/rejected": -559.3414306640625, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.252077579498291, + "rewards/margins": 13.170082092285156, + "rewards/rejected": -9.918004989624023, + "step": 5700 + }, + { + "epoch": 1.94, + "eval_logits/chosen": -1.5715759992599487, + "eval_logits/rejected": -1.3526341915130615, + "eval_logps/chosen": -385.5294494628906, + "eval_logps/rejected": -656.620361328125, + "eval_loss": 0.009785475209355354, + "eval_rewards/accuracies": 0.997474730014801, + "eval_rewards/chosen": 3.439331531524658, + "eval_rewards/margins": 13.14973258972168, + "eval_rewards/rejected": -9.710402488708496, + "eval_runtime": 569.6293, + "eval_samples_per_second": 16.678, + "eval_steps_per_second": 0.521, + "step": 5700 + }, + { + "epoch": 1.94, + "learning_rate": 1.961475513030341e-07, + "logits/chosen": -1.5523710250854492, + "logits/rejected": -1.3565914630889893, + "logps/chosen": -464.20147705078125, + "logps/rejected": -684.78076171875, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3501803874969482, + "rewards/margins": 12.506669998168945, + "rewards/rejected": -9.156488418579102, + "step": 5710 + }, + { + "epoch": 1.94, + "learning_rate": 1.9551806622183054e-07, + "logits/chosen": -1.570746898651123, + "logits/rejected": -1.3322603702545166, + "logps/chosen": -320.79595947265625, + "logps/rejected": -702.6842651367188, + "loss": 0.0093, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.4401965141296387, + "rewards/margins": 12.810795783996582, + "rewards/rejected": -9.370598793029785, + "step": 5720 + }, + { + "epoch": 1.95, + "learning_rate": 1.9488858114062696e-07, + "logits/chosen": -1.574364423751831, + "logits/rejected": -1.3548295497894287, + "logps/chosen": -340.6933288574219, + "logps/rejected": -506.8951721191406, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7329070568084717, + "rewards/margins": 13.038932800292969, + "rewards/rejected": -9.306025505065918, + "step": 5730 + }, + { + "epoch": 1.95, + "learning_rate": 1.9425909605942338e-07, + "logits/chosen": -1.5704716444015503, + "logits/rejected": -1.3661311864852905, + "logps/chosen": -552.2789306640625, + "logps/rejected": -650.8052978515625, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.780675172805786, + "rewards/margins": 12.080301284790039, + "rewards/rejected": -9.299625396728516, + "step": 5740 + }, + { + "epoch": 1.95, + "learning_rate": 1.936296109782198e-07, + "logits/chosen": -1.5602500438690186, + "logits/rejected": -1.3438739776611328, + "logps/chosen": -365.57867431640625, + "logps/rejected": -877.02392578125, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4795734882354736, + "rewards/margins": 13.832864761352539, + "rewards/rejected": -10.353291511535645, + "step": 5750 + }, + { + "epoch": 1.96, + "learning_rate": 1.9300012589701623e-07, + "logits/chosen": -1.5906877517700195, + "logits/rejected": -1.4425641298294067, + "logps/chosen": -439.29638671875, + "logps/rejected": -801.90234375, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.250617504119873, + "rewards/margins": 12.746950149536133, + "rewards/rejected": -9.496332168579102, + "step": 5760 + }, + { + "epoch": 1.96, + "learning_rate": 1.9237064081581268e-07, + "logits/chosen": -1.5896638631820679, + "logits/rejected": -1.3576405048370361, + "logps/chosen": -349.1166687011719, + "logps/rejected": -749.1558837890625, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.29948353767395, + "rewards/margins": 14.574786186218262, + "rewards/rejected": -11.275301933288574, + "step": 5770 + }, + { + "epoch": 1.96, + "learning_rate": 1.9174115573460907e-07, + "logits/chosen": -1.6059318780899048, + "logits/rejected": -1.300520896911621, + "logps/chosen": -421.22540283203125, + "logps/rejected": -677.3818969726562, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.342651844024658, + "rewards/margins": 12.222294807434082, + "rewards/rejected": -8.879643440246582, + "step": 5780 + }, + { + "epoch": 1.97, + "learning_rate": 1.911116706534055e-07, + "logits/chosen": -1.579067349433899, + "logits/rejected": -1.3455263376235962, + "logps/chosen": -497.47064208984375, + "logps/rejected": -650.6846313476562, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8139636516571045, + "rewards/margins": 12.30526065826416, + "rewards/rejected": -9.491296768188477, + "step": 5790 + }, + { + "epoch": 1.97, + "learning_rate": 1.9048218557220194e-07, + "logits/chosen": -1.5863184928894043, + "logits/rejected": -1.3400444984436035, + "logps/chosen": -396.5912780761719, + "logps/rejected": -489.7623596191406, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.680152177810669, + "rewards/margins": 13.147547721862793, + "rewards/rejected": -9.46739387512207, + "step": 5800 + }, + { + "epoch": 1.97, + "eval_logits/chosen": -1.5879576206207275, + "eval_logits/rejected": -1.359340786933899, + "eval_logps/chosen": -384.03070068359375, + "eval_logps/rejected": -653.519775390625, + "eval_loss": 0.008032087236642838, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 3.589207172393799, + "eval_rewards/margins": 12.98955249786377, + "eval_rewards/rejected": -9.400345802307129, + "eval_runtime": 569.0142, + "eval_samples_per_second": 16.696, + "eval_steps_per_second": 0.522, + "step": 5800 + }, + { + "epoch": 1.97, + "learning_rate": 1.8985270049099837e-07, + "logits/chosen": -1.5842969417572021, + "logits/rejected": -1.3865296840667725, + "logps/chosen": -323.2148132324219, + "logps/rejected": -747.6116943359375, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1280736923217773, + "rewards/margins": 12.603017807006836, + "rewards/rejected": -9.474943161010742, + "step": 5810 + }, + { + "epoch": 1.98, + "learning_rate": 1.8922321540979476e-07, + "logits/chosen": -1.5732569694519043, + "logits/rejected": -1.3697015047073364, + "logps/chosen": -496.08746337890625, + "logps/rejected": -474.392333984375, + "loss": 0.0062, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.2842488288879395, + "rewards/margins": 12.359346389770508, + "rewards/rejected": -9.07509708404541, + "step": 5820 + }, + { + "epoch": 1.98, + "learning_rate": 1.885937303285912e-07, + "logits/chosen": -1.5913281440734863, + "logits/rejected": -1.367375135421753, + "logps/chosen": -309.66705322265625, + "logps/rejected": -695.351806640625, + "loss": 0.0158, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6442904472351074, + "rewards/margins": 13.128097534179688, + "rewards/rejected": -9.483807563781738, + "step": 5830 + }, + { + "epoch": 1.99, + "learning_rate": 1.8796424524738764e-07, + "logits/chosen": -1.5833710432052612, + "logits/rejected": -1.406314730644226, + "logps/chosen": -416.65252685546875, + "logps/rejected": -589.8971557617188, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8386924266815186, + "rewards/margins": 12.964340209960938, + "rewards/rejected": -9.125648498535156, + "step": 5840 + }, + { + "epoch": 1.99, + "learning_rate": 1.8733476016618406e-07, + "logits/chosen": -1.5824650526046753, + "logits/rejected": -1.4084742069244385, + "logps/chosen": -380.238037109375, + "logps/rejected": -637.6284790039062, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.611265182495117, + "rewards/margins": 13.597894668579102, + "rewards/rejected": -9.986627578735352, + "step": 5850 + }, + { + "epoch": 1.99, + "learning_rate": 1.8670527508498048e-07, + "logits/chosen": -1.6007578372955322, + "logits/rejected": -1.3486738204956055, + "logps/chosen": -325.0093994140625, + "logps/rejected": -860.0470581054688, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.413071870803833, + "rewards/margins": 13.735666275024414, + "rewards/rejected": -10.322593688964844, + "step": 5860 + }, + { + "epoch": 2.0, + "learning_rate": 1.860757900037769e-07, + "logits/chosen": -1.5835248231887817, + "logits/rejected": -1.3723149299621582, + "logps/chosen": -321.95965576171875, + "logps/rejected": -544.609619140625, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.16746711730957, + "rewards/margins": 13.992466926574707, + "rewards/rejected": -9.82499885559082, + "step": 5870 + }, + { + "epoch": 2.0, + "learning_rate": 1.8544630492257333e-07, + "logits/chosen": -1.5852301120758057, + "logits/rejected": -1.3901424407958984, + "logps/chosen": -307.2806091308594, + "logps/rejected": -701.1611328125, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4009246826171875, + "rewards/margins": 13.229278564453125, + "rewards/rejected": -9.828351974487305, + "step": 5880 + }, + { + "epoch": 2.0, + "learning_rate": 1.8481681984136978e-07, + "logits/chosen": -1.6187397241592407, + "logits/rejected": -1.4273978471755981, + "logps/chosen": -347.74237060546875, + "logps/rejected": -514.708984375, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.79756760597229, + "rewards/margins": 11.97067642211914, + "rewards/rejected": -9.17310905456543, + "step": 5890 + }, + { + "epoch": 2.01, + "learning_rate": 1.8418733476016617e-07, + "logits/chosen": -1.563674807548523, + "logits/rejected": -1.3802025318145752, + "logps/chosen": -391.52642822265625, + "logps/rejected": -576.9651489257812, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7611618041992188, + "rewards/margins": 12.104849815368652, + "rewards/rejected": -8.343687057495117, + "step": 5900 + }, + { + "epoch": 2.01, + "eval_logits/chosen": -1.5836540460586548, + "eval_logits/rejected": -1.3551989793777466, + "eval_logps/chosen": -385.6568603515625, + "eval_logps/rejected": -658.06689453125, + "eval_loss": 0.010191100649535656, + "eval_rewards/accuracies": 0.996632993221283, + "eval_rewards/chosen": 3.4265856742858887, + "eval_rewards/margins": 13.28164005279541, + "eval_rewards/rejected": -9.85505485534668, + "eval_runtime": 571.0697, + "eval_samples_per_second": 16.635, + "eval_steps_per_second": 0.52, + "step": 5900 + }, + { + "epoch": 2.01, + "learning_rate": 1.835578496789626e-07, + "logits/chosen": -1.5853064060211182, + "logits/rejected": -1.414734125137329, + "logps/chosen": -405.837890625, + "logps/rejected": -696.7243041992188, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.537665605545044, + "rewards/margins": 12.571477890014648, + "rewards/rejected": -9.033811569213867, + "step": 5910 + }, + { + "epoch": 2.01, + "learning_rate": 1.8292836459775904e-07, + "logits/chosen": -1.5765013694763184, + "logits/rejected": -1.35932195186615, + "logps/chosen": -464.1136779785156, + "logps/rejected": -828.7986450195312, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.393418550491333, + "rewards/margins": 13.604681015014648, + "rewards/rejected": -10.211263656616211, + "step": 5920 + }, + { + "epoch": 2.02, + "learning_rate": 1.8229887951655544e-07, + "logits/chosen": -1.5896810293197632, + "logits/rejected": -1.372353196144104, + "logps/chosen": -313.2590026855469, + "logps/rejected": -668.3497924804688, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2981677055358887, + "rewards/margins": 13.175135612487793, + "rewards/rejected": -9.87696647644043, + "step": 5930 + }, + { + "epoch": 2.02, + "learning_rate": 1.8166939443535186e-07, + "logits/chosen": -1.6018621921539307, + "logits/rejected": -1.4273078441619873, + "logps/chosen": -426.26739501953125, + "logps/rejected": -456.65863037109375, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.01948356628418, + "rewards/margins": 13.477208137512207, + "rewards/rejected": -9.457724571228027, + "step": 5940 + }, + { + "epoch": 2.02, + "learning_rate": 1.8103990935414829e-07, + "logits/chosen": -1.5911426544189453, + "logits/rejected": -1.3666123151779175, + "logps/chosen": -333.13311767578125, + "logps/rejected": -647.44970703125, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.403470277786255, + "rewards/margins": 12.167041778564453, + "rewards/rejected": -8.763570785522461, + "step": 5950 + }, + { + "epoch": 2.03, + "learning_rate": 1.8041042427294474e-07, + "logits/chosen": -1.5918200016021729, + "logits/rejected": -1.425859808921814, + "logps/chosen": -402.16131591796875, + "logps/rejected": -402.430419921875, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.761643886566162, + "rewards/margins": 13.374422073364258, + "rewards/rejected": -9.612775802612305, + "step": 5960 + }, + { + "epoch": 2.03, + "learning_rate": 1.7978093919174113e-07, + "logits/chosen": -1.577141523361206, + "logits/rejected": -1.3713648319244385, + "logps/chosen": -362.5179748535156, + "logps/rejected": -798.7357177734375, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0766875743865967, + "rewards/margins": 12.924699783325195, + "rewards/rejected": -9.84801197052002, + "step": 5970 + }, + { + "epoch": 2.03, + "learning_rate": 1.7915145411053755e-07, + "logits/chosen": -1.575863242149353, + "logits/rejected": -1.3683074712753296, + "logps/chosen": -382.6520080566406, + "logps/rejected": -746.1673583984375, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9939167499542236, + "rewards/margins": 12.296357154846191, + "rewards/rejected": -9.302441596984863, + "step": 5980 + }, + { + "epoch": 2.04, + "learning_rate": 1.78521969029334e-07, + "logits/chosen": -1.5800620317459106, + "logits/rejected": -1.3653762340545654, + "logps/chosen": -385.9838562011719, + "logps/rejected": -827.6422119140625, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.391411304473877, + "rewards/margins": 14.520398139953613, + "rewards/rejected": -11.128988265991211, + "step": 5990 + }, + { + "epoch": 2.04, + "learning_rate": 1.7789248394813043e-07, + "logits/chosen": -1.5842511653900146, + "logits/rejected": -1.3848586082458496, + "logps/chosen": -457.4696350097656, + "logps/rejected": -504.9671936035156, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.196730613708496, + "rewards/margins": 14.184361457824707, + "rewards/rejected": -9.987629890441895, + "step": 6000 + }, + { + "epoch": 2.04, + "eval_logits/chosen": -1.5734307765960693, + "eval_logits/rejected": -1.3417832851409912, + "eval_logps/chosen": -384.8310546875, + "eval_logps/rejected": -658.9734497070312, + "eval_loss": 0.01052306592464447, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 3.5091729164123535, + "eval_rewards/margins": 13.454878807067871, + "eval_rewards/rejected": -9.945707321166992, + "eval_runtime": 571.3029, + "eval_samples_per_second": 16.629, + "eval_steps_per_second": 0.52, + "step": 6000 + }, + { + "epoch": 2.04, + "learning_rate": 1.7726299886692682e-07, + "logits/chosen": -1.5890525579452515, + "logits/rejected": -1.3782134056091309, + "logps/chosen": -346.62750244140625, + "logps/rejected": -574.5610961914062, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.202110767364502, + "rewards/margins": 12.107061386108398, + "rewards/rejected": -8.904951095581055, + "step": 6010 + }, + { + "epoch": 2.05, + "learning_rate": 1.7663351378572327e-07, + "logits/chosen": -1.5794428586959839, + "logits/rejected": -1.3990501165390015, + "logps/chosen": -328.50225830078125, + "logps/rejected": -872.6994018554688, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.609851360321045, + "rewards/margins": 13.973623275756836, + "rewards/rejected": -10.363773345947266, + "step": 6020 + }, + { + "epoch": 2.05, + "learning_rate": 1.760040287045197e-07, + "logits/chosen": -1.544471025466919, + "logits/rejected": -1.395743727684021, + "logps/chosen": -530.2237548828125, + "logps/rejected": -609.5791015625, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6359665393829346, + "rewards/margins": 12.672989845275879, + "rewards/rejected": -9.037022590637207, + "step": 6030 + }, + { + "epoch": 2.05, + "learning_rate": 1.7537454362331612e-07, + "logits/chosen": -1.5333201885223389, + "logits/rejected": -1.3785232305526733, + "logps/chosen": -378.5882873535156, + "logps/rejected": -582.8477783203125, + "loss": 0.008, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3894290924072266, + "rewards/margins": 12.997915267944336, + "rewards/rejected": -9.60848617553711, + "step": 6040 + }, + { + "epoch": 2.06, + "learning_rate": 1.7474505854211254e-07, + "logits/chosen": -1.556309461593628, + "logits/rejected": -1.356785535812378, + "logps/chosen": -536.20654296875, + "logps/rejected": -604.7540283203125, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4235618114471436, + "rewards/margins": 13.518298149108887, + "rewards/rejected": -10.09473705291748, + "step": 6050 + }, + { + "epoch": 2.06, + "learning_rate": 1.7411557346090896e-07, + "logits/chosen": -1.5435625314712524, + "logits/rejected": -1.354384422302246, + "logps/chosen": -368.25897216796875, + "logps/rejected": -518.4398803710938, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.393087387084961, + "rewards/margins": 12.706704139709473, + "rewards/rejected": -9.313615798950195, + "step": 6060 + }, + { + "epoch": 2.06, + "learning_rate": 1.7348608837970539e-07, + "logits/chosen": -1.5750101804733276, + "logits/rejected": -1.3663175106048584, + "logps/chosen": -384.9168395996094, + "logps/rejected": -727.3328857421875, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.956982374191284, + "rewards/margins": 13.832025527954102, + "rewards/rejected": -9.875041007995605, + "step": 6070 + }, + { + "epoch": 2.07, + "learning_rate": 1.7285660329850184e-07, + "logits/chosen": -1.5687141418457031, + "logits/rejected": -1.3270981311798096, + "logps/chosen": -313.4903869628906, + "logps/rejected": -624.5948486328125, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8407986164093018, + "rewards/margins": 14.303858757019043, + "rewards/rejected": -10.463059425354004, + "step": 6080 + }, + { + "epoch": 2.07, + "learning_rate": 1.7222711821729823e-07, + "logits/chosen": -1.5713229179382324, + "logits/rejected": -1.3685797452926636, + "logps/chosen": -307.17608642578125, + "logps/rejected": -782.0498046875, + "loss": 0.0048, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.2845778465270996, + "rewards/margins": 12.913658142089844, + "rewards/rejected": -9.629079818725586, + "step": 6090 + }, + { + "epoch": 2.07, + "learning_rate": 1.7159763313609465e-07, + "logits/chosen": -1.5606857538223267, + "logits/rejected": -1.3849254846572876, + "logps/chosen": -408.3519592285156, + "logps/rejected": -566.0878295898438, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6911189556121826, + "rewards/margins": 13.635490417480469, + "rewards/rejected": -9.94437026977539, + "step": 6100 + }, + { + "epoch": 2.07, + "eval_logits/chosen": -1.5633213520050049, + "eval_logits/rejected": -1.3269240856170654, + "eval_logps/chosen": -385.05035400390625, + "eval_logps/rejected": -660.5548706054688, + "eval_loss": 0.008254719898104668, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 3.487239360809326, + "eval_rewards/margins": 13.591096878051758, + "eval_rewards/rejected": -10.103857040405273, + "eval_runtime": 570.9561, + "eval_samples_per_second": 16.639, + "eval_steps_per_second": 0.52, + "step": 6100 + }, + { + "epoch": 2.08, + "learning_rate": 1.709681480548911e-07, + "logits/chosen": -1.5641956329345703, + "logits/rejected": -1.376461148262024, + "logps/chosen": -309.01123046875, + "logps/rejected": -535.5664672851562, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4919581413269043, + "rewards/margins": 12.847412109375, + "rewards/rejected": -9.355454444885254, + "step": 6110 + }, + { + "epoch": 2.08, + "learning_rate": 1.7033866297368753e-07, + "logits/chosen": -1.5530128479003906, + "logits/rejected": -1.3875248432159424, + "logps/chosen": -410.8304138183594, + "logps/rejected": -685.3343505859375, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4738612174987793, + "rewards/margins": 13.94920539855957, + "rewards/rejected": -10.47534465789795, + "step": 6120 + }, + { + "epoch": 2.08, + "learning_rate": 1.6970917789248392e-07, + "logits/chosen": -1.600319266319275, + "logits/rejected": -1.3862138986587524, + "logps/chosen": -382.09063720703125, + "logps/rejected": -812.2560424804688, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.948361873626709, + "rewards/margins": 13.856928825378418, + "rewards/rejected": -10.908567428588867, + "step": 6130 + }, + { + "epoch": 2.09, + "learning_rate": 1.6907969281128037e-07, + "logits/chosen": -1.5730650424957275, + "logits/rejected": -1.3630366325378418, + "logps/chosen": -487.9115295410156, + "logps/rejected": -606.2828369140625, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7007343769073486, + "rewards/margins": 13.465612411499023, + "rewards/rejected": -10.76487922668457, + "step": 6140 + }, + { + "epoch": 2.09, + "learning_rate": 1.684502077300768e-07, + "logits/chosen": -1.5798609256744385, + "logits/rejected": -1.3997907638549805, + "logps/chosen": -424.8887634277344, + "logps/rejected": -640.5821533203125, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.282831907272339, + "rewards/margins": 13.371957778930664, + "rewards/rejected": -10.08912467956543, + "step": 6150 + }, + { + "epoch": 2.09, + "learning_rate": 1.678207226488732e-07, + "logits/chosen": -1.5721240043640137, + "logits/rejected": -1.36776602268219, + "logps/chosen": -380.6419982910156, + "logps/rejected": -552.1428833007812, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.643103837966919, + "rewards/margins": 13.19780158996582, + "rewards/rejected": -9.554697036743164, + "step": 6160 + }, + { + "epoch": 2.1, + "learning_rate": 1.6719123756766964e-07, + "logits/chosen": -1.5856187343597412, + "logits/rejected": -1.3289967775344849, + "logps/chosen": -421.395263671875, + "logps/rejected": -783.7921752929688, + "loss": 0.0041, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.849787950515747, + "rewards/margins": 14.713786125183105, + "rewards/rejected": -10.863998413085938, + "step": 6170 + }, + { + "epoch": 2.1, + "learning_rate": 1.6656175248646606e-07, + "logits/chosen": -1.5449007749557495, + "logits/rejected": -1.3362114429473877, + "logps/chosen": -350.7699890136719, + "logps/rejected": -623.5848388671875, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1750528812408447, + "rewards/margins": 14.337316513061523, + "rewards/rejected": -11.162263870239258, + "step": 6180 + }, + { + "epoch": 2.1, + "learning_rate": 1.6593226740526249e-07, + "logits/chosen": -1.5679972171783447, + "logits/rejected": -1.3455889225006104, + "logps/chosen": -294.62939453125, + "logps/rejected": -481.19842529296875, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.551668643951416, + "rewards/margins": 13.721986770629883, + "rewards/rejected": -10.170318603515625, + "step": 6190 + }, + { + "epoch": 2.11, + "learning_rate": 1.653027823240589e-07, + "logits/chosen": -1.577105164527893, + "logits/rejected": -1.3542635440826416, + "logps/chosen": -322.06622314453125, + "logps/rejected": -705.23876953125, + "loss": 0.0056, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.04902982711792, + "rewards/margins": 13.115121841430664, + "rewards/rejected": -10.066092491149902, + "step": 6200 + }, + { + "epoch": 2.11, + "eval_logits/chosen": -1.5700169801712036, + "eval_logits/rejected": -1.343705654144287, + "eval_logps/chosen": -386.0007629394531, + "eval_logps/rejected": -663.229736328125, + "eval_loss": 0.0089480672031641, + "eval_rewards/accuracies": 0.997474730014801, + "eval_rewards/chosen": 3.3921990394592285, + "eval_rewards/margins": 13.76353645324707, + "eval_rewards/rejected": -10.371336936950684, + "eval_runtime": 569.7908, + "eval_samples_per_second": 16.673, + "eval_steps_per_second": 0.521, + "step": 6200 + }, + { + "epoch": 2.11, + "learning_rate": 1.6467329724285533e-07, + "logits/chosen": -1.6055638790130615, + "logits/rejected": -1.3567235469818115, + "logps/chosen": -333.28326416015625, + "logps/rejected": -781.2586059570312, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1567676067352295, + "rewards/margins": 13.559796333312988, + "rewards/rejected": -10.403029441833496, + "step": 6210 + }, + { + "epoch": 2.11, + "learning_rate": 1.6404381216165175e-07, + "logits/chosen": -1.583471655845642, + "logits/rejected": -1.3695013523101807, + "logps/chosen": -331.52777099609375, + "logps/rejected": -588.3743896484375, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.343158006668091, + "rewards/margins": 13.703679084777832, + "rewards/rejected": -10.36052131652832, + "step": 6220 + }, + { + "epoch": 2.12, + "learning_rate": 1.634143270804482e-07, + "logits/chosen": -1.5970001220703125, + "logits/rejected": -1.4041357040405273, + "logps/chosen": -365.20855712890625, + "logps/rejected": -587.7662353515625, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3545563220977783, + "rewards/margins": 12.629743576049805, + "rewards/rejected": -9.275187492370605, + "step": 6230 + }, + { + "epoch": 2.12, + "learning_rate": 1.627848419992446e-07, + "logits/chosen": -1.5541200637817383, + "logits/rejected": -1.3522471189498901, + "logps/chosen": -458.91290283203125, + "logps/rejected": -838.9461669921875, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.499737501144409, + "rewards/margins": 14.423726081848145, + "rewards/rejected": -10.923989295959473, + "step": 6240 + }, + { + "epoch": 2.12, + "learning_rate": 1.6215535691804102e-07, + "logits/chosen": -1.571530818939209, + "logits/rejected": -1.3561214208602905, + "logps/chosen": -394.83917236328125, + "logps/rejected": -543.1696166992188, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.35219144821167, + "rewards/margins": 12.686806678771973, + "rewards/rejected": -9.334614753723145, + "step": 6250 + }, + { + "epoch": 2.13, + "learning_rate": 1.6152587183683747e-07, + "logits/chosen": -1.5718861818313599, + "logits/rejected": -1.341476559638977, + "logps/chosen": -321.237060546875, + "logps/rejected": -730.6900024414062, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1130313873291016, + "rewards/margins": 13.657200813293457, + "rewards/rejected": -10.544168472290039, + "step": 6260 + }, + { + "epoch": 2.13, + "learning_rate": 1.608963867556339e-07, + "logits/chosen": -1.5870338678359985, + "logits/rejected": -1.3626445531845093, + "logps/chosen": -321.41473388671875, + "logps/rejected": -530.8362426757812, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4880223274230957, + "rewards/margins": 12.710477828979492, + "rewards/rejected": -9.222455978393555, + "step": 6270 + }, + { + "epoch": 2.13, + "learning_rate": 1.602669016744303e-07, + "logits/chosen": -1.5680911540985107, + "logits/rejected": -1.3386658430099487, + "logps/chosen": -482.3976135253906, + "logps/rejected": -952.5712890625, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6433074474334717, + "rewards/margins": 13.233453750610352, + "rewards/rejected": -9.590144157409668, + "step": 6280 + }, + { + "epoch": 2.14, + "learning_rate": 1.5963741659322674e-07, + "logits/chosen": -1.5943859815597534, + "logits/rejected": -1.3496134281158447, + "logps/chosen": -358.327880859375, + "logps/rejected": -727.8673095703125, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.567591428756714, + "rewards/margins": 13.70576000213623, + "rewards/rejected": -10.138166427612305, + "step": 6290 + }, + { + "epoch": 2.14, + "learning_rate": 1.5900793151202316e-07, + "logits/chosen": -1.583922028541565, + "logits/rejected": -1.3518226146697998, + "logps/chosen": -321.2569274902344, + "logps/rejected": -714.7805786132812, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.26947283744812, + "rewards/margins": 13.061630249023438, + "rewards/rejected": -9.792158126831055, + "step": 6300 + }, + { + "epoch": 2.14, + "eval_logits/chosen": -1.5831453800201416, + "eval_logits/rejected": -1.3527216911315918, + "eval_logps/chosen": -384.21820068359375, + "eval_logps/rejected": -660.8606567382812, + "eval_loss": 0.007751443888992071, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 3.5704545974731445, + "eval_rewards/margins": 13.704879760742188, + "eval_rewards/rejected": -10.134425163269043, + "eval_runtime": 570.1835, + "eval_samples_per_second": 16.661, + "eval_steps_per_second": 0.521, + "step": 6300 + }, + { + "epoch": 2.14, + "learning_rate": 1.5837844643081959e-07, + "logits/chosen": -1.5702444314956665, + "logits/rejected": -1.3582322597503662, + "logps/chosen": -299.5747375488281, + "logps/rejected": -828.6398315429688, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4351024627685547, + "rewards/margins": 13.68336009979248, + "rewards/rejected": -10.248257637023926, + "step": 6310 + }, + { + "epoch": 2.15, + "learning_rate": 1.57748961349616e-07, + "logits/chosen": -1.5622743368148804, + "logits/rejected": -1.3565266132354736, + "logps/chosen": -404.68756103515625, + "logps/rejected": -888.7033081054688, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4816806316375732, + "rewards/margins": 14.000155448913574, + "rewards/rejected": -10.518476486206055, + "step": 6320 + }, + { + "epoch": 2.15, + "learning_rate": 1.5711947626841243e-07, + "logits/chosen": -1.539731740951538, + "logits/rejected": -1.372911810874939, + "logps/chosen": -500.98577880859375, + "logps/rejected": -518.1825561523438, + "loss": 0.0047, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.4630017280578613, + "rewards/margins": 13.42473030090332, + "rewards/rejected": -9.961729049682617, + "step": 6330 + }, + { + "epoch": 2.15, + "learning_rate": 1.5648999118720885e-07, + "logits/chosen": -1.5548157691955566, + "logits/rejected": -1.3993146419525146, + "logps/chosen": -417.75830078125, + "logps/rejected": -542.7899169921875, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.326592445373535, + "rewards/margins": 14.735855102539062, + "rewards/rejected": -10.409261703491211, + "step": 6340 + }, + { + "epoch": 2.16, + "learning_rate": 1.558605061060053e-07, + "logits/chosen": -1.5931575298309326, + "logits/rejected": -1.3328150510787964, + "logps/chosen": -349.7419738769531, + "logps/rejected": -665.8507080078125, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.716078281402588, + "rewards/margins": 14.710800170898438, + "rewards/rejected": -10.994720458984375, + "step": 6350 + }, + { + "epoch": 2.16, + "learning_rate": 1.552310210248017e-07, + "logits/chosen": -1.5400502681732178, + "logits/rejected": -1.3369262218475342, + "logps/chosen": -470.3833923339844, + "logps/rejected": -814.0325927734375, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.523441791534424, + "rewards/margins": 14.534136772155762, + "rewards/rejected": -11.010696411132812, + "step": 6360 + }, + { + "epoch": 2.17, + "learning_rate": 1.5460153594359812e-07, + "logits/chosen": -1.5648212432861328, + "logits/rejected": -1.3341492414474487, + "logps/chosen": -473.6314392089844, + "logps/rejected": -670.8265380859375, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4539389610290527, + "rewards/margins": 13.69250202178955, + "rewards/rejected": -10.23856258392334, + "step": 6370 + }, + { + "epoch": 2.17, + "learning_rate": 1.5397205086239457e-07, + "logits/chosen": -1.5823898315429688, + "logits/rejected": -1.3830448389053345, + "logps/chosen": -394.908935546875, + "logps/rejected": -758.3516845703125, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4654624462127686, + "rewards/margins": 14.750338554382324, + "rewards/rejected": -11.284875869750977, + "step": 6380 + }, + { + "epoch": 2.17, + "learning_rate": 1.5334256578119097e-07, + "logits/chosen": -1.5933994054794312, + "logits/rejected": -1.3969393968582153, + "logps/chosen": -418.40252685546875, + "logps/rejected": -663.5984497070312, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.937318801879883, + "rewards/margins": 14.6946439743042, + "rewards/rejected": -10.75732421875, + "step": 6390 + }, + { + "epoch": 2.18, + "learning_rate": 1.527130806999874e-07, + "logits/chosen": -1.549824833869934, + "logits/rejected": -1.385512351989746, + "logps/chosen": -486.784423828125, + "logps/rejected": -526.1112060546875, + "loss": 0.0039, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.1367604732513428, + "rewards/margins": 13.472702026367188, + "rewards/rejected": -10.335943222045898, + "step": 6400 + }, + { + "epoch": 2.18, + "eval_logits/chosen": -1.584315299987793, + "eval_logits/rejected": -1.3748033046722412, + "eval_logps/chosen": -386.1251525878906, + "eval_logps/rejected": -667.5103149414062, + "eval_loss": 0.009169588796794415, + "eval_rewards/accuracies": 0.997474730014801, + "eval_rewards/chosen": 3.379760980606079, + "eval_rewards/margins": 14.179154396057129, + "eval_rewards/rejected": -10.799394607543945, + "eval_runtime": 571.3626, + "eval_samples_per_second": 16.627, + "eval_steps_per_second": 0.52, + "step": 6400 + }, + { + "epoch": 2.18, + "learning_rate": 1.5208359561878384e-07, + "logits/chosen": -1.591533899307251, + "logits/rejected": -1.3876944780349731, + "logps/chosen": -390.2744140625, + "logps/rejected": -589.9276123046875, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2361819744110107, + "rewards/margins": 13.965123176574707, + "rewards/rejected": -10.728940963745117, + "step": 6410 + }, + { + "epoch": 2.18, + "learning_rate": 1.5145411053758026e-07, + "logits/chosen": -1.5609999895095825, + "logits/rejected": -1.3654464483261108, + "logps/chosen": -389.9656066894531, + "logps/rejected": -637.1482543945312, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.197247266769409, + "rewards/margins": 15.399210929870605, + "rewards/rejected": -12.201963424682617, + "step": 6420 + }, + { + "epoch": 2.19, + "learning_rate": 1.5082462545637666e-07, + "logits/chosen": -1.600942611694336, + "logits/rejected": -1.3456239700317383, + "logps/chosen": -321.9989929199219, + "logps/rejected": -474.2069396972656, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2121589183807373, + "rewards/margins": 14.064419746398926, + "rewards/rejected": -10.85226058959961, + "step": 6430 + }, + { + "epoch": 2.19, + "learning_rate": 1.501951403751731e-07, + "logits/chosen": -1.5909321308135986, + "logits/rejected": -1.3883854150772095, + "logps/chosen": -316.77093505859375, + "logps/rejected": -648.8579711914062, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.393423557281494, + "rewards/margins": 14.023935317993164, + "rewards/rejected": -10.630510330200195, + "step": 6440 + }, + { + "epoch": 2.19, + "learning_rate": 1.4956565529396953e-07, + "logits/chosen": -1.5898196697235107, + "logits/rejected": -1.3238946199417114, + "logps/chosen": -363.89239501953125, + "logps/rejected": -623.1990966796875, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.178192377090454, + "rewards/margins": 15.123025894165039, + "rewards/rejected": -11.944832801818848, + "step": 6450 + }, + { + "epoch": 2.2, + "learning_rate": 1.4893617021276595e-07, + "logits/chosen": -1.564621925354004, + "logits/rejected": -1.3675488233566284, + "logps/chosen": -339.17852783203125, + "logps/rejected": -733.6370239257812, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.584500551223755, + "rewards/margins": 15.411958694458008, + "rewards/rejected": -11.827457427978516, + "step": 6460 + }, + { + "epoch": 2.2, + "learning_rate": 1.4830668513156238e-07, + "logits/chosen": -1.5763976573944092, + "logits/rejected": -1.3346980810165405, + "logps/chosen": -291.3924255371094, + "logps/rejected": -416.7007751464844, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.118260383605957, + "rewards/margins": 14.34459114074707, + "rewards/rejected": -11.22633171081543, + "step": 6470 + }, + { + "epoch": 2.2, + "learning_rate": 1.476772000503588e-07, + "logits/chosen": -1.5817840099334717, + "logits/rejected": -1.3806138038635254, + "logps/chosen": -328.1126403808594, + "logps/rejected": -548.0718994140625, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.615523099899292, + "rewards/margins": 14.706913948059082, + "rewards/rejected": -11.091389656066895, + "step": 6480 + }, + { + "epoch": 2.21, + "learning_rate": 1.4704771496915522e-07, + "logits/chosen": -1.5833837985992432, + "logits/rejected": -1.3564140796661377, + "logps/chosen": -352.1372985839844, + "logps/rejected": -552.1243896484375, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.284930467605591, + "rewards/margins": 14.432092666625977, + "rewards/rejected": -11.147162437438965, + "step": 6490 + }, + { + "epoch": 2.21, + "learning_rate": 1.4641822988795167e-07, + "logits/chosen": -1.5765619277954102, + "logits/rejected": -1.391261100769043, + "logps/chosen": -325.61993408203125, + "logps/rejected": -666.7384643554688, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1809964179992676, + "rewards/margins": 13.457502365112305, + "rewards/rejected": -10.276507377624512, + "step": 6500 + }, + { + "epoch": 2.21, + "eval_logits/chosen": -1.5743842124938965, + "eval_logits/rejected": -1.3582652807235718, + "eval_logps/chosen": -384.0976867675781, + "eval_logps/rejected": -664.8441162109375, + "eval_loss": 0.007639728020876646, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 3.582503080368042, + "eval_rewards/margins": 14.115275382995605, + "eval_rewards/rejected": -10.532772064208984, + "eval_runtime": 572.1252, + "eval_samples_per_second": 16.605, + "eval_steps_per_second": 0.519, + "step": 6500 + }, + { + "epoch": 2.21, + "learning_rate": 1.4578874480674807e-07, + "logits/chosen": -1.575344443321228, + "logits/rejected": -1.3377422094345093, + "logps/chosen": -436.69732666015625, + "logps/rejected": -614.0791625976562, + "loss": 0.0048, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.3779194355010986, + "rewards/margins": 14.324641227722168, + "rewards/rejected": -10.946722030639648, + "step": 6510 + }, + { + "epoch": 2.22, + "learning_rate": 1.451592597255445e-07, + "logits/chosen": -1.5191529989242554, + "logits/rejected": -1.3680410385131836, + "logps/chosen": -422.9986267089844, + "logps/rejected": -610.8345947265625, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2538719177246094, + "rewards/margins": 13.690958023071289, + "rewards/rejected": -10.437085151672363, + "step": 6520 + }, + { + "epoch": 2.22, + "learning_rate": 1.4452977464434094e-07, + "logits/chosen": -1.5816547870635986, + "logits/rejected": -1.4170851707458496, + "logps/chosen": -371.47015380859375, + "logps/rejected": -624.63134765625, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.505117893218994, + "rewards/margins": 13.807650566101074, + "rewards/rejected": -10.302534103393555, + "step": 6530 + }, + { + "epoch": 2.22, + "learning_rate": 1.4390028956313736e-07, + "logits/chosen": -1.5831215381622314, + "logits/rejected": -1.3600349426269531, + "logps/chosen": -333.8761291503906, + "logps/rejected": -656.4736938476562, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7498302459716797, + "rewards/margins": 15.26855754852295, + "rewards/rejected": -11.518728256225586, + "step": 6540 + }, + { + "epoch": 2.23, + "learning_rate": 1.4327080448193376e-07, + "logits/chosen": -1.552154302597046, + "logits/rejected": -1.3444443941116333, + "logps/chosen": -516.4844360351562, + "logps/rejected": -561.1699829101562, + "loss": 0.0044, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.515700578689575, + "rewards/margins": 14.293835639953613, + "rewards/rejected": -10.778135299682617, + "step": 6550 + }, + { + "epoch": 2.23, + "learning_rate": 1.426413194007302e-07, + "logits/chosen": -1.6008199453353882, + "logits/rejected": -1.4133853912353516, + "logps/chosen": -342.3550720214844, + "logps/rejected": -629.8597412109375, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.1179094314575195, + "rewards/margins": 14.55487060546875, + "rewards/rejected": -10.436960220336914, + "step": 6560 + }, + { + "epoch": 2.23, + "learning_rate": 1.4201183431952663e-07, + "logits/chosen": -1.5721176862716675, + "logits/rejected": -1.3438633680343628, + "logps/chosen": -396.29815673828125, + "logps/rejected": -872.9259643554688, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7063846588134766, + "rewards/margins": 14.292887687683105, + "rewards/rejected": -10.586503982543945, + "step": 6570 + }, + { + "epoch": 2.24, + "learning_rate": 1.4138234923832303e-07, + "logits/chosen": -1.5794665813446045, + "logits/rejected": -1.4185601472854614, + "logps/chosen": -400.0427551269531, + "logps/rejected": -698.7635498046875, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.174726963043213, + "rewards/margins": 14.291448593139648, + "rewards/rejected": -10.116722106933594, + "step": 6580 + }, + { + "epoch": 2.24, + "learning_rate": 1.4075286415711948e-07, + "logits/chosen": -1.6006819009780884, + "logits/rejected": -1.3750007152557373, + "logps/chosen": -411.69903564453125, + "logps/rejected": -537.7091674804688, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.134584665298462, + "rewards/margins": 13.69175910949707, + "rewards/rejected": -10.557173728942871, + "step": 6590 + }, + { + "epoch": 2.24, + "learning_rate": 1.401233790759159e-07, + "logits/chosen": -1.5483369827270508, + "logits/rejected": -1.4290311336517334, + "logps/chosen": -427.8651428222656, + "logps/rejected": -637.8646240234375, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1674082279205322, + "rewards/margins": 13.184328079223633, + "rewards/rejected": -10.016919136047363, + "step": 6600 + }, + { + "epoch": 2.24, + "eval_logits/chosen": -1.584792137145996, + "eval_logits/rejected": -1.3603935241699219, + "eval_logps/chosen": -384.3702392578125, + "eval_logps/rejected": -662.9480590820312, + "eval_loss": 0.007546957582235336, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 3.5552539825439453, + "eval_rewards/margins": 13.898427963256836, + "eval_rewards/rejected": -10.343173027038574, + "eval_runtime": 569.3007, + "eval_samples_per_second": 16.687, + "eval_steps_per_second": 0.522, + "step": 6600 + }, + { + "epoch": 2.25, + "learning_rate": 1.3949389399471232e-07, + "logits/chosen": -1.5965279340744019, + "logits/rejected": -1.3698196411132812, + "logps/chosen": -317.65069580078125, + "logps/rejected": -662.3995971679688, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8021888732910156, + "rewards/margins": 14.649023056030273, + "rewards/rejected": -10.846835136413574, + "step": 6610 + }, + { + "epoch": 2.25, + "learning_rate": 1.3886440891350874e-07, + "logits/chosen": -1.5680992603302002, + "logits/rejected": -1.35530686378479, + "logps/chosen": -408.56927490234375, + "logps/rejected": -486.865234375, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5884852409362793, + "rewards/margins": 14.509864807128906, + "rewards/rejected": -10.921378135681152, + "step": 6620 + }, + { + "epoch": 2.25, + "learning_rate": 1.3823492383230517e-07, + "logits/chosen": -1.5966899394989014, + "logits/rejected": -1.325230598449707, + "logps/chosen": -335.7962951660156, + "logps/rejected": -728.0191650390625, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.631432056427002, + "rewards/margins": 14.483572006225586, + "rewards/rejected": -10.852140426635742, + "step": 6630 + }, + { + "epoch": 2.26, + "learning_rate": 1.376054387511016e-07, + "logits/chosen": -1.5955592393875122, + "logits/rejected": -1.4142637252807617, + "logps/chosen": -356.5505065917969, + "logps/rejected": -655.1959838867188, + "loss": 0.002, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.0483269691467285, + "rewards/margins": 13.817525863647461, + "rewards/rejected": -10.769198417663574, + "step": 6640 + }, + { + "epoch": 2.26, + "learning_rate": 1.36975953669898e-07, + "logits/chosen": -1.579647183418274, + "logits/rejected": -1.3854573965072632, + "logps/chosen": -379.48553466796875, + "logps/rejected": -516.62109375, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2711269855499268, + "rewards/margins": 13.301862716674805, + "rewards/rejected": -10.030735969543457, + "step": 6650 + }, + { + "epoch": 2.26, + "learning_rate": 1.3634646858869444e-07, + "logits/chosen": -1.5828149318695068, + "logits/rejected": -1.3179619312286377, + "logps/chosen": -345.9654235839844, + "logps/rejected": -493.1145935058594, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1242008209228516, + "rewards/margins": 13.35713005065918, + "rewards/rejected": -10.232929229736328, + "step": 6660 + }, + { + "epoch": 2.27, + "learning_rate": 1.3571698350749086e-07, + "logits/chosen": -1.5601648092269897, + "logits/rejected": -1.378584384918213, + "logps/chosen": -384.14739990234375, + "logps/rejected": -636.5850830078125, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.830223798751831, + "rewards/margins": 14.211817741394043, + "rewards/rejected": -10.381593704223633, + "step": 6670 + }, + { + "epoch": 2.27, + "learning_rate": 1.3508749842628728e-07, + "logits/chosen": -1.5668874979019165, + "logits/rejected": -1.3902943134307861, + "logps/chosen": -375.37701416015625, + "logps/rejected": -653.6080322265625, + "loss": 0.0054, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 4.2413153648376465, + "rewards/margins": 13.837162971496582, + "rewards/rejected": -9.595846176147461, + "step": 6680 + }, + { + "epoch": 2.27, + "learning_rate": 1.3445801334508373e-07, + "logits/chosen": -1.556911587715149, + "logits/rejected": -1.3844763040542603, + "logps/chosen": -438.6883850097656, + "logps/rejected": -398.62237548828125, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5050048828125, + "rewards/margins": 13.407320976257324, + "rewards/rejected": -9.90231704711914, + "step": 6690 + }, + { + "epoch": 2.28, + "learning_rate": 1.3382852826388013e-07, + "logits/chosen": -1.5897338390350342, + "logits/rejected": -1.3608797788619995, + "logps/chosen": -340.8258056640625, + "logps/rejected": -532.7095947265625, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2200019359588623, + "rewards/margins": 13.246417045593262, + "rewards/rejected": -10.02641487121582, + "step": 6700 + }, + { + "epoch": 2.28, + "eval_logits/chosen": -1.5692567825317383, + "eval_logits/rejected": -1.3437275886535645, + "eval_logps/chosen": -382.6127014160156, + "eval_logps/rejected": -662.8403930664062, + "eval_loss": 0.008176215924322605, + "eval_rewards/accuracies": 0.9991582632064819, + "eval_rewards/chosen": 3.731004238128662, + "eval_rewards/margins": 14.063405990600586, + "eval_rewards/rejected": -10.332402229309082, + "eval_runtime": 570.1677, + "eval_samples_per_second": 16.662, + "eval_steps_per_second": 0.521, + "step": 6700 + }, + { + "epoch": 2.28, + "learning_rate": 1.3319904318267655e-07, + "logits/chosen": -1.566868782043457, + "logits/rejected": -1.399821162223816, + "logps/chosen": -384.14630126953125, + "logps/rejected": -699.1255493164062, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.569903612136841, + "rewards/margins": 14.030588150024414, + "rewards/rejected": -10.460684776306152, + "step": 6710 + }, + { + "epoch": 2.28, + "learning_rate": 1.32569558101473e-07, + "logits/chosen": -1.5901718139648438, + "logits/rejected": -1.3749638795852661, + "logps/chosen": -354.3837585449219, + "logps/rejected": -655.8762817382812, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7713146209716797, + "rewards/margins": 14.032336235046387, + "rewards/rejected": -10.26102066040039, + "step": 6720 + }, + { + "epoch": 2.29, + "learning_rate": 1.3194007302026942e-07, + "logits/chosen": -1.5647567510604858, + "logits/rejected": -1.345435380935669, + "logps/chosen": -363.5932922363281, + "logps/rejected": -533.5272827148438, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.929785966873169, + "rewards/margins": 14.380762100219727, + "rewards/rejected": -10.450976371765137, + "step": 6730 + }, + { + "epoch": 2.29, + "learning_rate": 1.3131058793906582e-07, + "logits/chosen": -1.5715420246124268, + "logits/rejected": -1.3576515913009644, + "logps/chosen": -452.7503967285156, + "logps/rejected": -590.1964111328125, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.610787868499756, + "rewards/margins": 13.278940200805664, + "rewards/rejected": -9.668152809143066, + "step": 6740 + }, + { + "epoch": 2.29, + "learning_rate": 1.3068110285786227e-07, + "logits/chosen": -1.5701452493667603, + "logits/rejected": -1.4307655096054077, + "logps/chosen": -390.6100769042969, + "logps/rejected": -555.83740234375, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7854411602020264, + "rewards/margins": 13.554372787475586, + "rewards/rejected": -9.76893138885498, + "step": 6750 + }, + { + "epoch": 2.3, + "learning_rate": 1.300516177766587e-07, + "logits/chosen": -1.566888689994812, + "logits/rejected": -1.4025644063949585, + "logps/chosen": -388.9593811035156, + "logps/rejected": -590.3077392578125, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5975394248962402, + "rewards/margins": 13.576024055480957, + "rewards/rejected": -9.978484153747559, + "step": 6760 + }, + { + "epoch": 2.3, + "learning_rate": 1.294221326954551e-07, + "logits/chosen": -1.5657069683074951, + "logits/rejected": -1.3441110849380493, + "logps/chosen": -398.51104736328125, + "logps/rejected": -825.87744140625, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5940654277801514, + "rewards/margins": 14.752792358398438, + "rewards/rejected": -11.158726692199707, + "step": 6770 + }, + { + "epoch": 2.3, + "learning_rate": 1.2879264761425154e-07, + "logits/chosen": -1.5740673542022705, + "logits/rejected": -1.3703259229660034, + "logps/chosen": -354.5306091308594, + "logps/rejected": -504.9317932128906, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7628026008605957, + "rewards/margins": 14.886259078979492, + "rewards/rejected": -11.123456954956055, + "step": 6780 + }, + { + "epoch": 2.31, + "learning_rate": 1.2816316253304796e-07, + "logits/chosen": -1.586774468421936, + "logits/rejected": -1.3477851152420044, + "logps/chosen": -284.14825439453125, + "logps/rejected": -580.8948974609375, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.604658842086792, + "rewards/margins": 14.037424087524414, + "rewards/rejected": -10.432764053344727, + "step": 6790 + }, + { + "epoch": 2.31, + "learning_rate": 1.2753367745184438e-07, + "logits/chosen": -1.5886600017547607, + "logits/rejected": -1.3811349868774414, + "logps/chosen": -393.3787841796875, + "logps/rejected": -467.98187255859375, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8067924976348877, + "rewards/margins": 14.1246337890625, + "rewards/rejected": -10.317841529846191, + "step": 6800 + }, + { + "epoch": 2.31, + "eval_logits/chosen": -1.565779685974121, + "eval_logits/rejected": -1.3487393856048584, + "eval_logps/chosen": -384.3409423828125, + "eval_logps/rejected": -666.226318359375, + "eval_loss": 0.007372667081654072, + "eval_rewards/accuracies": 0.997474730014801, + "eval_rewards/chosen": 3.558180332183838, + "eval_rewards/margins": 14.229174613952637, + "eval_rewards/rejected": -10.670992851257324, + "eval_runtime": 571.7064, + "eval_samples_per_second": 16.617, + "eval_steps_per_second": 0.519, + "step": 6800 + }, + { + "epoch": 2.31, + "learning_rate": 1.2690419237064083e-07, + "logits/chosen": -1.5575084686279297, + "logits/rejected": -1.3819316625595093, + "logps/chosen": -510.38140869140625, + "logps/rejected": -679.424072265625, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.165252685546875, + "rewards/margins": 13.866109848022461, + "rewards/rejected": -9.700857162475586, + "step": 6810 + }, + { + "epoch": 2.32, + "learning_rate": 1.2627470728943723e-07, + "logits/chosen": -1.555760145187378, + "logits/rejected": -1.364272117614746, + "logps/chosen": -501.7169494628906, + "logps/rejected": -739.920166015625, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.260267734527588, + "rewards/margins": 14.104917526245117, + "rewards/rejected": -10.844651222229004, + "step": 6820 + }, + { + "epoch": 2.32, + "learning_rate": 1.2564522220823365e-07, + "logits/chosen": -1.568599820137024, + "logits/rejected": -1.370758056640625, + "logps/chosen": -398.878173828125, + "logps/rejected": -704.766845703125, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.0346360206604, + "rewards/margins": 14.279245376586914, + "rewards/rejected": -10.244607925415039, + "step": 6830 + }, + { + "epoch": 2.32, + "learning_rate": 1.250157371270301e-07, + "logits/chosen": -1.5861151218414307, + "logits/rejected": -1.3264439105987549, + "logps/chosen": -405.00445556640625, + "logps/rejected": -539.4598388671875, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7309226989746094, + "rewards/margins": 14.613065719604492, + "rewards/rejected": -10.882143020629883, + "step": 6840 + }, + { + "epoch": 2.33, + "learning_rate": 1.243862520458265e-07, + "logits/chosen": -1.5699310302734375, + "logits/rejected": -1.4282658100128174, + "logps/chosen": -385.1383361816406, + "logps/rejected": -507.529296875, + "loss": 0.0213, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2965614795684814, + "rewards/margins": 13.809582710266113, + "rewards/rejected": -10.513020515441895, + "step": 6850 + }, + { + "epoch": 2.33, + "learning_rate": 1.2375676696462294e-07, + "logits/chosen": -1.5978082418441772, + "logits/rejected": -1.4050474166870117, + "logps/chosen": -419.123291015625, + "logps/rejected": -808.9733276367188, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8502037525177, + "rewards/margins": 13.376398086547852, + "rewards/rejected": -10.526193618774414, + "step": 6860 + }, + { + "epoch": 2.34, + "learning_rate": 1.2312728188341934e-07, + "logits/chosen": -1.5935125350952148, + "logits/rejected": -1.3760064840316772, + "logps/chosen": -356.4635314941406, + "logps/rejected": -690.5131225585938, + "loss": 0.0099, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9484024047851562, + "rewards/margins": 15.973760604858398, + "rewards/rejected": -12.025357246398926, + "step": 6870 + }, + { + "epoch": 2.34, + "learning_rate": 1.224977968022158e-07, + "logits/chosen": -1.5837767124176025, + "logits/rejected": -1.3753459453582764, + "logps/chosen": -461.54058837890625, + "logps/rejected": -610.938720703125, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4065334796905518, + "rewards/margins": 13.856122016906738, + "rewards/rejected": -10.44958782196045, + "step": 6880 + }, + { + "epoch": 2.34, + "learning_rate": 1.218683117210122e-07, + "logits/chosen": -1.585831880569458, + "logits/rejected": -1.3505988121032715, + "logps/chosen": -330.95819091796875, + "logps/rejected": -752.7947387695312, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.245179176330566, + "rewards/margins": 15.41187858581543, + "rewards/rejected": -11.166702270507812, + "step": 6890 + }, + { + "epoch": 2.35, + "learning_rate": 1.2123882663980863e-07, + "logits/chosen": -1.5682971477508545, + "logits/rejected": -1.355499267578125, + "logps/chosen": -429.307861328125, + "logps/rejected": -800.7735595703125, + "loss": 0.0112, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.566223621368408, + "rewards/margins": 13.6681489944458, + "rewards/rejected": -10.101926803588867, + "step": 6900 + }, + { + "epoch": 2.35, + "eval_logits/chosen": -1.5687878131866455, + "eval_logits/rejected": -1.3470209836959839, + "eval_logps/chosen": -384.0080871582031, + "eval_logps/rejected": -667.3019409179688, + "eval_loss": 0.007594715338200331, + "eval_rewards/accuracies": 0.996632993221283, + "eval_rewards/chosen": 3.5914647579193115, + "eval_rewards/margins": 14.370023727416992, + "eval_rewards/rejected": -10.778556823730469, + "eval_runtime": 571.7969, + "eval_samples_per_second": 16.614, + "eval_steps_per_second": 0.519, + "step": 6900 + }, + { + "epoch": 2.35, + "learning_rate": 1.2060934155860506e-07, + "logits/chosen": -1.5775777101516724, + "logits/rejected": -1.3184734582901, + "logps/chosen": -360.031494140625, + "logps/rejected": -538.359130859375, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4221057891845703, + "rewards/margins": 13.670002937316895, + "rewards/rejected": -10.247896194458008, + "step": 6910 + }, + { + "epoch": 2.35, + "learning_rate": 1.1997985647740148e-07, + "logits/chosen": -1.5940953493118286, + "logits/rejected": -1.3757946491241455, + "logps/chosen": -349.8443908691406, + "logps/rejected": -645.4672241210938, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.440652370452881, + "rewards/margins": 14.937997817993164, + "rewards/rejected": -10.497344970703125, + "step": 6920 + }, + { + "epoch": 2.36, + "learning_rate": 1.193503713961979e-07, + "logits/chosen": -1.5421262979507446, + "logits/rejected": -1.3159245252609253, + "logps/chosen": -524.6812133789062, + "logps/rejected": -917.3536987304688, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.625113010406494, + "rewards/margins": 14.569310188293457, + "rewards/rejected": -10.944195747375488, + "step": 6930 + }, + { + "epoch": 2.36, + "learning_rate": 1.1872088631499433e-07, + "logits/chosen": -1.5573019981384277, + "logits/rejected": -1.3715643882751465, + "logps/chosen": -347.124267578125, + "logps/rejected": -724.9014892578125, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3002593517303467, + "rewards/margins": 14.475519180297852, + "rewards/rejected": -11.175260543823242, + "step": 6940 + }, + { + "epoch": 2.36, + "learning_rate": 1.1809140123379076e-07, + "logits/chosen": -1.5602883100509644, + "logits/rejected": -1.4126348495483398, + "logps/chosen": -365.7541809082031, + "logps/rejected": -728.0130615234375, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.736027479171753, + "rewards/margins": 14.09008502960205, + "rewards/rejected": -10.354058265686035, + "step": 6950 + }, + { + "epoch": 2.37, + "learning_rate": 1.1746191615258717e-07, + "logits/chosen": -1.5862915515899658, + "logits/rejected": -1.350115180015564, + "logps/chosen": -345.8599548339844, + "logps/rejected": -700.153076171875, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8681564331054688, + "rewards/margins": 15.308801651000977, + "rewards/rejected": -11.440645217895508, + "step": 6960 + }, + { + "epoch": 2.37, + "learning_rate": 1.1683243107138361e-07, + "logits/chosen": -1.5725862979888916, + "logits/rejected": -1.3288795948028564, + "logps/chosen": -416.9593811035156, + "logps/rejected": -572.9369506835938, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.048195838928223, + "rewards/margins": 14.201942443847656, + "rewards/rejected": -10.153745651245117, + "step": 6970 + }, + { + "epoch": 2.37, + "learning_rate": 1.1620294599018003e-07, + "logits/chosen": -1.5789787769317627, + "logits/rejected": -1.3985240459442139, + "logps/chosen": -335.38128662109375, + "logps/rejected": -549.26171875, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4752185344696045, + "rewards/margins": 13.310501098632812, + "rewards/rejected": -9.835283279418945, + "step": 6980 + }, + { + "epoch": 2.38, + "learning_rate": 1.1557346090897645e-07, + "logits/chosen": -1.5853245258331299, + "logits/rejected": -1.4135210514068604, + "logps/chosen": -323.85943603515625, + "logps/rejected": -841.52587890625, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.927032470703125, + "rewards/margins": 14.641139030456543, + "rewards/rejected": -10.714105606079102, + "step": 6990 + }, + { + "epoch": 2.38, + "learning_rate": 1.1494397582777288e-07, + "logits/chosen": -1.5870996713638306, + "logits/rejected": -1.3939707279205322, + "logps/chosen": -304.43341064453125, + "logps/rejected": -709.2398681640625, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.461134672164917, + "rewards/margins": 13.570245742797852, + "rewards/rejected": -10.109109878540039, + "step": 7000 + }, + { + "epoch": 2.38, + "eval_logits/chosen": -1.577390193939209, + "eval_logits/rejected": -1.3535740375518799, + "eval_logps/chosen": -383.8625183105469, + "eval_logps/rejected": -665.5233764648438, + "eval_loss": 0.008002311922609806, + "eval_rewards/accuracies": 0.997474730014801, + "eval_rewards/chosen": 3.6060264110565186, + "eval_rewards/margins": 14.206722259521484, + "eval_rewards/rejected": -10.600696563720703, + "eval_runtime": 571.8975, + "eval_samples_per_second": 16.611, + "eval_steps_per_second": 0.519, + "step": 7000 + }, + { + "epoch": 2.38, + "learning_rate": 1.1431449074656931e-07, + "logits/chosen": -1.5623493194580078, + "logits/rejected": -1.4044008255004883, + "logps/chosen": -544.9088745117188, + "logps/rejected": -524.11474609375, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.415673017501831, + "rewards/margins": 14.24224853515625, + "rewards/rejected": -10.826576232910156, + "step": 7010 + }, + { + "epoch": 2.39, + "learning_rate": 1.1368500566536572e-07, + "logits/chosen": -1.5702780485153198, + "logits/rejected": -1.399794101715088, + "logps/chosen": -480.2239685058594, + "logps/rejected": -741.3621215820312, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7297675609588623, + "rewards/margins": 13.888583183288574, + "rewards/rejected": -10.158815383911133, + "step": 7020 + }, + { + "epoch": 2.39, + "learning_rate": 1.1305552058416214e-07, + "logits/chosen": -1.5777041912078857, + "logits/rejected": -1.3729133605957031, + "logps/chosen": -353.8095703125, + "logps/rejected": -664.9339599609375, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5037841796875, + "rewards/margins": 14.553430557250977, + "rewards/rejected": -11.049646377563477, + "step": 7030 + }, + { + "epoch": 2.39, + "learning_rate": 1.1242603550295858e-07, + "logits/chosen": -1.5780526399612427, + "logits/rejected": -1.3785450458526611, + "logps/chosen": -315.7511901855469, + "logps/rejected": -740.5752563476562, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.938439130783081, + "rewards/margins": 15.353727340698242, + "rewards/rejected": -11.415287971496582, + "step": 7040 + }, + { + "epoch": 2.4, + "learning_rate": 1.1179655042175499e-07, + "logits/chosen": -1.573671579360962, + "logits/rejected": -1.3027465343475342, + "logps/chosen": -350.18634033203125, + "logps/rejected": -501.58135986328125, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0180866718292236, + "rewards/margins": 13.756540298461914, + "rewards/rejected": -10.738452911376953, + "step": 7050 + }, + { + "epoch": 2.4, + "learning_rate": 1.1116706534055143e-07, + "logits/chosen": -1.5909698009490967, + "logits/rejected": -1.284363031387329, + "logps/chosen": -399.37054443359375, + "logps/rejected": -595.177978515625, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.924706220626831, + "rewards/margins": 15.391641616821289, + "rewards/rejected": -11.466936111450195, + "step": 7060 + }, + { + "epoch": 2.4, + "learning_rate": 1.1053758025934785e-07, + "logits/chosen": -1.5602728128433228, + "logits/rejected": -1.3765531778335571, + "logps/chosen": -327.498046875, + "logps/rejected": -850.45654296875, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.34861421585083, + "rewards/margins": 14.247636795043945, + "rewards/rejected": -10.89902400970459, + "step": 7070 + }, + { + "epoch": 2.41, + "learning_rate": 1.0990809517814427e-07, + "logits/chosen": -1.6001904010772705, + "logits/rejected": -1.3743841648101807, + "logps/chosen": -400.81640625, + "logps/rejected": -525.2322387695312, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5727505683898926, + "rewards/margins": 13.725286483764648, + "rewards/rejected": -10.15253734588623, + "step": 7080 + }, + { + "epoch": 2.41, + "learning_rate": 1.092786100969407e-07, + "logits/chosen": -1.5767635107040405, + "logits/rejected": -1.4243793487548828, + "logps/chosen": -342.7900390625, + "logps/rejected": -660.6629638671875, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.0674614906311035, + "rewards/margins": 14.147119522094727, + "rewards/rejected": -10.079660415649414, + "step": 7090 + }, + { + "epoch": 2.41, + "learning_rate": 1.0864912501573713e-07, + "logits/chosen": -1.5857694149017334, + "logits/rejected": -1.4307571649551392, + "logps/chosen": -318.28363037109375, + "logps/rejected": -575.15478515625, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9067416191101074, + "rewards/margins": 11.60275936126709, + "rewards/rejected": -8.69601821899414, + "step": 7100 + }, + { + "epoch": 2.41, + "eval_logits/chosen": -1.5681462287902832, + "eval_logits/rejected": -1.344546914100647, + "eval_logps/chosen": -384.29534912109375, + "eval_logps/rejected": -668.2890625, + "eval_loss": 0.006281446199864149, + "eval_rewards/accuracies": 0.997474730014801, + "eval_rewards/chosen": 3.5627384185791016, + "eval_rewards/margins": 14.440010070800781, + "eval_rewards/rejected": -10.87727165222168, + "eval_runtime": 568.8846, + "eval_samples_per_second": 16.699, + "eval_steps_per_second": 0.522, + "step": 7100 + }, + { + "epoch": 2.42, + "learning_rate": 1.0801963993453354e-07, + "logits/chosen": -1.609238624572754, + "logits/rejected": -1.4078028202056885, + "logps/chosen": -339.06048583984375, + "logps/rejected": -479.1642150878906, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.761439800262451, + "rewards/margins": 14.099250793457031, + "rewards/rejected": -10.337812423706055, + "step": 7110 + }, + { + "epoch": 2.42, + "learning_rate": 1.0739015485332998e-07, + "logits/chosen": -1.5482730865478516, + "logits/rejected": -1.3797681331634521, + "logps/chosen": -432.4139709472656, + "logps/rejected": -724.8422241210938, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.864971876144409, + "rewards/margins": 14.196083068847656, + "rewards/rejected": -10.331110954284668, + "step": 7120 + }, + { + "epoch": 2.42, + "learning_rate": 1.067606697721264e-07, + "logits/chosen": -1.573305368423462, + "logits/rejected": -1.4095571041107178, + "logps/chosen": -405.9944763183594, + "logps/rejected": -581.6749267578125, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.036935329437256, + "rewards/margins": 13.76244068145752, + "rewards/rejected": -10.725503921508789, + "step": 7130 + }, + { + "epoch": 2.43, + "learning_rate": 1.0613118469092282e-07, + "logits/chosen": -1.5819218158721924, + "logits/rejected": -1.3558950424194336, + "logps/chosen": -397.9393005371094, + "logps/rejected": -1036.35107421875, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.331040620803833, + "rewards/margins": 14.022743225097656, + "rewards/rejected": -10.691704750061035, + "step": 7140 + }, + { + "epoch": 2.43, + "learning_rate": 1.0550169960971924e-07, + "logits/chosen": -1.5679513216018677, + "logits/rejected": -1.3929964303970337, + "logps/chosen": -449.7079162597656, + "logps/rejected": -813.3853149414062, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3755314350128174, + "rewards/margins": 14.53040599822998, + "rewards/rejected": -11.154873847961426, + "step": 7150 + }, + { + "epoch": 2.43, + "learning_rate": 1.0487221452851568e-07, + "logits/chosen": -1.5713636875152588, + "logits/rejected": -1.4483355283737183, + "logps/chosen": -373.6534423828125, + "logps/rejected": -832.5340576171875, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2033112049102783, + "rewards/margins": 13.972986221313477, + "rewards/rejected": -10.769674301147461, + "step": 7160 + }, + { + "epoch": 2.44, + "learning_rate": 1.0424272944731209e-07, + "logits/chosen": -1.5848790407180786, + "logits/rejected": -1.3746501207351685, + "logps/chosen": -421.29278564453125, + "logps/rejected": -789.7203979492188, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5114974975585938, + "rewards/margins": 15.449055671691895, + "rewards/rejected": -11.937556266784668, + "step": 7170 + }, + { + "epoch": 2.44, + "learning_rate": 1.0361324436610853e-07, + "logits/chosen": -1.5968456268310547, + "logits/rejected": -1.315861701965332, + "logps/chosen": -410.9493713378906, + "logps/rejected": -579.80224609375, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8760547637939453, + "rewards/margins": 16.206562042236328, + "rewards/rejected": -12.330510139465332, + "step": 7180 + }, + { + "epoch": 2.44, + "learning_rate": 1.0298375928490494e-07, + "logits/chosen": -1.5887360572814941, + "logits/rejected": -1.283867359161377, + "logps/chosen": -318.6828918457031, + "logps/rejected": -685.0931396484375, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3481574058532715, + "rewards/margins": 14.906071662902832, + "rewards/rejected": -11.557912826538086, + "step": 7190 + }, + { + "epoch": 2.45, + "learning_rate": 1.0235427420370137e-07, + "logits/chosen": -1.577286958694458, + "logits/rejected": -1.3959635496139526, + "logps/chosen": -352.7952575683594, + "logps/rejected": -629.1926879882812, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5100455284118652, + "rewards/margins": 14.476755142211914, + "rewards/rejected": -10.96670913696289, + "step": 7200 + }, + { + "epoch": 2.45, + "eval_logits/chosen": -1.5818926095962524, + "eval_logits/rejected": -1.3655781745910645, + "eval_logps/chosen": -385.6861877441406, + "eval_logps/rejected": -670.2083129882812, + "eval_loss": 0.007005918771028519, + "eval_rewards/accuracies": 0.997474730014801, + "eval_rewards/chosen": 3.4236550331115723, + "eval_rewards/margins": 14.492846488952637, + "eval_rewards/rejected": -11.06919002532959, + "eval_runtime": 570.3635, + "eval_samples_per_second": 16.656, + "eval_steps_per_second": 0.521, + "step": 7200 + }, + { + "epoch": 2.45, + "learning_rate": 1.017247891224978e-07, + "logits/chosen": -1.5919464826583862, + "logits/rejected": -1.4004160165786743, + "logps/chosen": -330.8450012207031, + "logps/rejected": -439.6155700683594, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.686676025390625, + "rewards/margins": 13.914591789245605, + "rewards/rejected": -10.22791576385498, + "step": 7210 + }, + { + "epoch": 2.45, + "learning_rate": 1.0109530404129422e-07, + "logits/chosen": -1.5849114656448364, + "logits/rejected": -1.3717721700668335, + "logps/chosen": -412.67120361328125, + "logps/rejected": -538.4139404296875, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.431063413619995, + "rewards/margins": 13.679788589477539, + "rewards/rejected": -10.248724937438965, + "step": 7220 + }, + { + "epoch": 2.46, + "learning_rate": 1.0046581896009064e-07, + "logits/chosen": -1.582661747932434, + "logits/rejected": -1.330621361732483, + "logps/chosen": -378.28912353515625, + "logps/rejected": -607.7154541015625, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4779610633850098, + "rewards/margins": 14.362543106079102, + "rewards/rejected": -10.884581565856934, + "step": 7230 + }, + { + "epoch": 2.46, + "learning_rate": 9.983633387888708e-08, + "logits/chosen": -1.5704317092895508, + "logits/rejected": -1.3496906757354736, + "logps/chosen": -384.03448486328125, + "logps/rejected": -866.3507690429688, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.432457685470581, + "rewards/margins": 14.304654121398926, + "rewards/rejected": -10.872194290161133, + "step": 7240 + }, + { + "epoch": 2.46, + "learning_rate": 9.920684879768348e-08, + "logits/chosen": -1.6106436252593994, + "logits/rejected": -1.376107931137085, + "logps/chosen": -477.6227111816406, + "logps/rejected": -708.9939575195312, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4307682514190674, + "rewards/margins": 13.428256034851074, + "rewards/rejected": -9.997488975524902, + "step": 7250 + }, + { + "epoch": 2.47, + "learning_rate": 9.857736371647991e-08, + "logits/chosen": -1.5716253519058228, + "logits/rejected": -1.3926492929458618, + "logps/chosen": -486.75592041015625, + "logps/rejected": -749.6036987304688, + "loss": 0.0049, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.7937064170837402, + "rewards/margins": 14.991548538208008, + "rewards/rejected": -11.19784164428711, + "step": 7260 + }, + { + "epoch": 2.47, + "learning_rate": 9.794787863527634e-08, + "logits/chosen": -1.5732413530349731, + "logits/rejected": -1.4033808708190918, + "logps/chosen": -485.3672790527344, + "logps/rejected": -441.64678955078125, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9441161155700684, + "rewards/margins": 16.51407814025879, + "rewards/rejected": -12.569963455200195, + "step": 7270 + }, + { + "epoch": 2.47, + "learning_rate": 9.731839355407275e-08, + "logits/chosen": -1.5941002368927002, + "logits/rejected": -1.42324960231781, + "logps/chosen": -381.661376953125, + "logps/rejected": -487.35894775390625, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.004810333251953, + "rewards/margins": 13.481122016906738, + "rewards/rejected": -9.476312637329102, + "step": 7280 + }, + { + "epoch": 2.48, + "learning_rate": 9.668890847286919e-08, + "logits/chosen": -1.5920240879058838, + "logits/rejected": -1.4084751605987549, + "logps/chosen": -466.22662353515625, + "logps/rejected": -502.583251953125, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.823432445526123, + "rewards/margins": 14.07629680633545, + "rewards/rejected": -10.2528657913208, + "step": 7290 + }, + { + "epoch": 2.48, + "learning_rate": 9.605942339166561e-08, + "logits/chosen": -1.5820099115371704, + "logits/rejected": -1.3936901092529297, + "logps/chosen": -376.5813903808594, + "logps/rejected": -918.03515625, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.296030044555664, + "rewards/margins": 14.593231201171875, + "rewards/rejected": -10.297201156616211, + "step": 7300 + }, + { + "epoch": 2.48, + "eval_logits/chosen": -1.5872900485992432, + "eval_logits/rejected": -1.3538511991500854, + "eval_logps/chosen": -382.83135986328125, + "eval_logps/rejected": -662.9935913085938, + "eval_loss": 0.007855456322431564, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 3.7091400623321533, + "eval_rewards/margins": 14.056862831115723, + "eval_rewards/rejected": -10.347722053527832, + "eval_runtime": 569.0869, + "eval_samples_per_second": 16.693, + "eval_steps_per_second": 0.522, + "step": 7300 + }, + { + "epoch": 2.48, + "learning_rate": 9.542993831046203e-08, + "logits/chosen": -1.589583396911621, + "logits/rejected": -1.4094817638397217, + "logps/chosen": -392.8844909667969, + "logps/rejected": -559.5259399414062, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.81827974319458, + "rewards/margins": 13.039645195007324, + "rewards/rejected": -9.221365928649902, + "step": 7310 + }, + { + "epoch": 2.49, + "learning_rate": 9.480045322925846e-08, + "logits/chosen": -1.583153486251831, + "logits/rejected": -1.3818209171295166, + "logps/chosen": -384.2645568847656, + "logps/rejected": -678.3912353515625, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.487889051437378, + "rewards/margins": 14.138727188110352, + "rewards/rejected": -10.650838851928711, + "step": 7320 + }, + { + "epoch": 2.49, + "learning_rate": 9.41709681480549e-08, + "logits/chosen": -1.5922229290008545, + "logits/rejected": -1.343609094619751, + "logps/chosen": -327.4759216308594, + "logps/rejected": -571.35791015625, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.0135321617126465, + "rewards/margins": 14.138875961303711, + "rewards/rejected": -10.125345230102539, + "step": 7330 + }, + { + "epoch": 2.49, + "learning_rate": 9.35414830668513e-08, + "logits/chosen": -1.5811469554901123, + "logits/rejected": -1.4120653867721558, + "logps/chosen": -281.00360107421875, + "logps/rejected": -676.4948120117188, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.36657977104187, + "rewards/margins": 13.984593391418457, + "rewards/rejected": -10.618013381958008, + "step": 7340 + }, + { + "epoch": 2.5, + "learning_rate": 9.291199798564774e-08, + "logits/chosen": -1.6029351949691772, + "logits/rejected": -1.4120194911956787, + "logps/chosen": -357.8847961425781, + "logps/rejected": -672.7982788085938, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8664844036102295, + "rewards/margins": 14.355639457702637, + "rewards/rejected": -10.489152908325195, + "step": 7350 + }, + { + "epoch": 2.5, + "learning_rate": 9.228251290444416e-08, + "logits/chosen": -1.5803842544555664, + "logits/rejected": -1.4585782289505005, + "logps/chosen": -390.85479736328125, + "logps/rejected": -679.8489379882812, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.082976818084717, + "rewards/margins": 13.562578201293945, + "rewards/rejected": -9.47960090637207, + "step": 7360 + }, + { + "epoch": 2.51, + "learning_rate": 9.165302782324058e-08, + "logits/chosen": -1.6014230251312256, + "logits/rejected": -1.3388015031814575, + "logps/chosen": -343.8720703125, + "logps/rejected": -637.8966674804688, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.627603530883789, + "rewards/margins": 13.793660163879395, + "rewards/rejected": -10.166055679321289, + "step": 7370 + }, + { + "epoch": 2.51, + "learning_rate": 9.102354274203701e-08, + "logits/chosen": -1.5745279788970947, + "logits/rejected": -1.3818104267120361, + "logps/chosen": -375.74896240234375, + "logps/rejected": -674.0797119140625, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9007415771484375, + "rewards/margins": 14.159538269042969, + "rewards/rejected": -10.258795738220215, + "step": 7380 + }, + { + "epoch": 2.51, + "learning_rate": 9.039405766083344e-08, + "logits/chosen": -1.6090705394744873, + "logits/rejected": -1.3810365200042725, + "logps/chosen": -330.9891662597656, + "logps/rejected": -805.0186767578125, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2524688243865967, + "rewards/margins": 13.301783561706543, + "rewards/rejected": -10.049315452575684, + "step": 7390 + }, + { + "epoch": 2.52, + "learning_rate": 8.976457257962985e-08, + "logits/chosen": -1.598346471786499, + "logits/rejected": -1.411240816116333, + "logps/chosen": -343.16680908203125, + "logps/rejected": -705.9971923828125, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4078705310821533, + "rewards/margins": 13.131240844726562, + "rewards/rejected": -9.723368644714355, + "step": 7400 + }, + { + "epoch": 2.52, + "eval_logits/chosen": -1.5829030275344849, + "eval_logits/rejected": -1.350967288017273, + "eval_logps/chosen": -384.2423095703125, + "eval_logps/rejected": -664.3638916015625, + "eval_loss": 0.006370027083903551, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 3.568047046661377, + "eval_rewards/margins": 14.052802085876465, + "eval_rewards/rejected": -10.484756469726562, + "eval_runtime": 571.268, + "eval_samples_per_second": 16.63, + "eval_steps_per_second": 0.52, + "step": 7400 + }, + { + "epoch": 2.52, + "learning_rate": 8.913508749842629e-08, + "logits/chosen": -1.5901390314102173, + "logits/rejected": -1.3591899871826172, + "logps/chosen": -332.70770263671875, + "logps/rejected": -630.0189819335938, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.421738862991333, + "rewards/margins": 13.327035903930664, + "rewards/rejected": -9.905294418334961, + "step": 7410 + }, + { + "epoch": 2.52, + "learning_rate": 8.850560241722271e-08, + "logits/chosen": -1.5820858478546143, + "logits/rejected": -1.3561170101165771, + "logps/chosen": -406.7608947753906, + "logps/rejected": -651.2835693359375, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.002291440963745, + "rewards/margins": 12.931818962097168, + "rewards/rejected": -9.929529190063477, + "step": 7420 + }, + { + "epoch": 2.53, + "learning_rate": 8.787611733601913e-08, + "logits/chosen": -1.5665420293807983, + "logits/rejected": -1.3573030233383179, + "logps/chosen": -521.3502807617188, + "logps/rejected": -437.029296875, + "loss": 0.0028, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.305558681488037, + "rewards/margins": 13.643495559692383, + "rewards/rejected": -10.337937355041504, + "step": 7430 + }, + { + "epoch": 2.53, + "learning_rate": 8.724663225481556e-08, + "logits/chosen": -1.6046295166015625, + "logits/rejected": -1.3191004991531372, + "logps/chosen": -393.41693115234375, + "logps/rejected": -776.4949951171875, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.056914806365967, + "rewards/margins": 15.406539916992188, + "rewards/rejected": -11.349626541137695, + "step": 7440 + }, + { + "epoch": 2.53, + "learning_rate": 8.6617147173612e-08, + "logits/chosen": -1.6083438396453857, + "logits/rejected": -1.3144936561584473, + "logps/chosen": -347.88043212890625, + "logps/rejected": -803.9027709960938, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1882686614990234, + "rewards/margins": 15.012051582336426, + "rewards/rejected": -11.82378101348877, + "step": 7450 + }, + { + "epoch": 2.54, + "learning_rate": 8.59876620924084e-08, + "logits/chosen": -1.56670343875885, + "logits/rejected": -1.3724141120910645, + "logps/chosen": -540.942626953125, + "logps/rejected": -705.7154541015625, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.112201690673828, + "rewards/margins": 14.807909965515137, + "rewards/rejected": -10.695707321166992, + "step": 7460 + }, + { + "epoch": 2.54, + "learning_rate": 8.535817701120483e-08, + "logits/chosen": -1.5813065767288208, + "logits/rejected": -1.3714473247528076, + "logps/chosen": -347.3874206542969, + "logps/rejected": -854.4906005859375, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5635986328125, + "rewards/margins": 13.728411674499512, + "rewards/rejected": -10.164813041687012, + "step": 7470 + }, + { + "epoch": 2.54, + "learning_rate": 8.472869193000126e-08, + "logits/chosen": -1.5560318231582642, + "logits/rejected": -1.3608484268188477, + "logps/chosen": -547.0894775390625, + "logps/rejected": -689.1031494140625, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5903820991516113, + "rewards/margins": 14.714937210083008, + "rewards/rejected": -11.124554634094238, + "step": 7480 + }, + { + "epoch": 2.55, + "learning_rate": 8.409920684879767e-08, + "logits/chosen": -1.5825515985488892, + "logits/rejected": -1.3870153427124023, + "logps/chosen": -398.7313232421875, + "logps/rejected": -632.9392700195312, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.546818494796753, + "rewards/margins": 13.79389762878418, + "rewards/rejected": -10.247079849243164, + "step": 7490 + }, + { + "epoch": 2.55, + "learning_rate": 8.346972176759411e-08, + "logits/chosen": -1.5720359086990356, + "logits/rejected": -1.3702938556671143, + "logps/chosen": -511.16351318359375, + "logps/rejected": -699.3697509765625, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.283144474029541, + "rewards/margins": 14.129135131835938, + "rewards/rejected": -10.845991134643555, + "step": 7500 + }, + { + "epoch": 2.55, + "eval_logits/chosen": -1.5946578979492188, + "eval_logits/rejected": -1.366605281829834, + "eval_logps/chosen": -384.7931823730469, + "eval_logps/rejected": -666.1279907226562, + "eval_loss": 0.006850528996437788, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 3.5129547119140625, + "eval_rewards/margins": 14.174114227294922, + "eval_rewards/rejected": -10.66115951538086, + "eval_runtime": 572.1089, + "eval_samples_per_second": 16.605, + "eval_steps_per_second": 0.519, + "step": 7500 + }, + { + "epoch": 2.55, + "learning_rate": 8.284023668639053e-08, + "logits/chosen": -1.552826166152954, + "logits/rejected": -1.4504058361053467, + "logps/chosen": -528.3265380859375, + "logps/rejected": -570.2879028320312, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.130772113800049, + "rewards/margins": 12.209383010864258, + "rewards/rejected": -9.07861042022705, + "step": 7510 + }, + { + "epoch": 2.56, + "learning_rate": 8.221075160518695e-08, + "logits/chosen": -1.5661380290985107, + "logits/rejected": -1.466923713684082, + "logps/chosen": -426.16229248046875, + "logps/rejected": -649.5921630859375, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1720786094665527, + "rewards/margins": 12.898447036743164, + "rewards/rejected": -9.726366996765137, + "step": 7520 + }, + { + "epoch": 2.56, + "learning_rate": 8.158126652398338e-08, + "logits/chosen": -1.5844460725784302, + "logits/rejected": -1.3616052865982056, + "logps/chosen": -462.893798828125, + "logps/rejected": -514.5243530273438, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.736729621887207, + "rewards/margins": 15.536142349243164, + "rewards/rejected": -10.799410820007324, + "step": 7530 + }, + { + "epoch": 2.56, + "learning_rate": 8.09517814427798e-08, + "logits/chosen": -1.596928358078003, + "logits/rejected": -1.3848183155059814, + "logps/chosen": -312.83966064453125, + "logps/rejected": -630.4840087890625, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6991429328918457, + "rewards/margins": 15.021478652954102, + "rewards/rejected": -11.322335243225098, + "step": 7540 + }, + { + "epoch": 2.57, + "learning_rate": 8.032229636157622e-08, + "logits/chosen": -1.5858895778656006, + "logits/rejected": -1.397066354751587, + "logps/chosen": -304.4268493652344, + "logps/rejected": -842.9368286132812, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.297738552093506, + "rewards/margins": 13.705889701843262, + "rewards/rejected": -10.408151626586914, + "step": 7550 + }, + { + "epoch": 2.57, + "learning_rate": 7.969281128037266e-08, + "logits/chosen": -1.5978413820266724, + "logits/rejected": -1.3775299787521362, + "logps/chosen": -340.87518310546875, + "logps/rejected": -601.3428955078125, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7253220081329346, + "rewards/margins": 14.181994438171387, + "rewards/rejected": -10.456671714782715, + "step": 7560 + }, + { + "epoch": 2.57, + "learning_rate": 7.906332619916907e-08, + "logits/chosen": -1.5734174251556396, + "logits/rejected": -1.4049029350280762, + "logps/chosen": -407.07403564453125, + "logps/rejected": -682.3347778320312, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.013782501220703, + "rewards/margins": 13.829617500305176, + "rewards/rejected": -10.815835952758789, + "step": 7570 + }, + { + "epoch": 2.58, + "learning_rate": 7.84338411179655e-08, + "logits/chosen": -1.5779165029525757, + "logits/rejected": -1.3067697286605835, + "logps/chosen": -446.7791442871094, + "logps/rejected": -634.8629760742188, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4104971885681152, + "rewards/margins": 13.894719123840332, + "rewards/rejected": -10.484220504760742, + "step": 7580 + }, + { + "epoch": 2.58, + "learning_rate": 7.780435603676193e-08, + "logits/chosen": -1.6122270822525024, + "logits/rejected": -1.3840420246124268, + "logps/chosen": -314.5458679199219, + "logps/rejected": -645.9930419921875, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2400405406951904, + "rewards/margins": 14.697580337524414, + "rewards/rejected": -11.457540512084961, + "step": 7590 + }, + { + "epoch": 2.58, + "learning_rate": 7.717487095555835e-08, + "logits/chosen": -1.5950958728790283, + "logits/rejected": -1.3703113794326782, + "logps/chosen": -342.82794189453125, + "logps/rejected": -728.8080444335938, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.265272617340088, + "rewards/margins": 14.3114595413208, + "rewards/rejected": -11.046186447143555, + "step": 7600 + }, + { + "epoch": 2.58, + "eval_logits/chosen": -1.5871638059616089, + "eval_logits/rejected": -1.3599934577941895, + "eval_logps/chosen": -384.4612121582031, + "eval_logps/rejected": -667.1109008789062, + "eval_loss": 0.006630922667682171, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 3.546149492263794, + "eval_rewards/margins": 14.3056001663208, + "eval_rewards/rejected": -10.759450912475586, + "eval_runtime": 571.0819, + "eval_samples_per_second": 16.635, + "eval_steps_per_second": 0.52, + "step": 7600 + }, + { + "epoch": 2.59, + "learning_rate": 7.654538587435477e-08, + "logits/chosen": -1.611829161643982, + "logits/rejected": -1.3992689847946167, + "logps/chosen": -351.6083679199219, + "logps/rejected": -622.8372802734375, + "loss": 0.0031, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.6356570720672607, + "rewards/margins": 14.646787643432617, + "rewards/rejected": -11.011129379272461, + "step": 7610 + }, + { + "epoch": 2.59, + "learning_rate": 7.591590079315121e-08, + "logits/chosen": -1.5716960430145264, + "logits/rejected": -1.4380439519882202, + "logps/chosen": -381.25677490234375, + "logps/rejected": -664.9888916015625, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2809977531433105, + "rewards/margins": 14.19939136505127, + "rewards/rejected": -10.918392181396484, + "step": 7620 + }, + { + "epoch": 2.59, + "learning_rate": 7.528641571194762e-08, + "logits/chosen": -1.5856646299362183, + "logits/rejected": -1.4311058521270752, + "logps/chosen": -309.49310302734375, + "logps/rejected": -812.0164794921875, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9913628101348877, + "rewards/margins": 14.624282836914062, + "rewards/rejected": -10.632919311523438, + "step": 7630 + }, + { + "epoch": 2.6, + "learning_rate": 7.465693063074405e-08, + "logits/chosen": -1.5708390474319458, + "logits/rejected": -1.353971004486084, + "logps/chosen": -411.2115173339844, + "logps/rejected": -654.625732421875, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0149474143981934, + "rewards/margins": 14.89503002166748, + "rewards/rejected": -11.880084037780762, + "step": 7640 + }, + { + "epoch": 2.6, + "learning_rate": 7.402744554954048e-08, + "logits/chosen": -1.5937126874923706, + "logits/rejected": -1.3695682287216187, + "logps/chosen": -409.56378173828125, + "logps/rejected": -557.94140625, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8853092193603516, + "rewards/margins": 14.108288764953613, + "rewards/rejected": -10.222978591918945, + "step": 7650 + }, + { + "epoch": 2.6, + "learning_rate": 7.33979604683369e-08, + "logits/chosen": -1.5842314958572388, + "logits/rejected": -1.4110573530197144, + "logps/chosen": -423.503173828125, + "logps/rejected": -583.1806030273438, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.729583740234375, + "rewards/margins": 14.047520637512207, + "rewards/rejected": -10.317937850952148, + "step": 7660 + }, + { + "epoch": 2.61, + "learning_rate": 7.276847538713332e-08, + "logits/chosen": -1.5859633684158325, + "logits/rejected": -1.3568613529205322, + "logps/chosen": -431.7061462402344, + "logps/rejected": -576.2310180664062, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.187563896179199, + "rewards/margins": 14.1210298538208, + "rewards/rejected": -9.933465957641602, + "step": 7670 + }, + { + "epoch": 2.61, + "learning_rate": 7.213899030592976e-08, + "logits/chosen": -1.589237093925476, + "logits/rejected": -1.4081981182098389, + "logps/chosen": -422.6863708496094, + "logps/rejected": -547.9866943359375, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8875298500061035, + "rewards/margins": 13.395736694335938, + "rewards/rejected": -9.508207321166992, + "step": 7680 + }, + { + "epoch": 2.61, + "learning_rate": 7.150950522472617e-08, + "logits/chosen": -1.5805962085723877, + "logits/rejected": -1.3518579006195068, + "logps/chosen": -402.2761535644531, + "logps/rejected": -550.6287841796875, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.606547832489014, + "rewards/margins": 14.900644302368164, + "rewards/rejected": -10.294095993041992, + "step": 7690 + }, + { + "epoch": 2.62, + "learning_rate": 7.088002014352259e-08, + "logits/chosen": -1.5822559595108032, + "logits/rejected": -1.3676798343658447, + "logps/chosen": -489.9197692871094, + "logps/rejected": -609.206298828125, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.705000638961792, + "rewards/margins": 14.201299667358398, + "rewards/rejected": -10.496298789978027, + "step": 7700 + }, + { + "epoch": 2.62, + "eval_logits/chosen": -1.5750356912612915, + "eval_logits/rejected": -1.3485898971557617, + "eval_logps/chosen": -384.2902526855469, + "eval_logps/rejected": -667.2067260742188, + "eval_loss": 0.007591096684336662, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 3.563251256942749, + "eval_rewards/margins": 14.332283973693848, + "eval_rewards/rejected": -10.76903247833252, + "eval_runtime": 571.5732, + "eval_samples_per_second": 16.621, + "eval_steps_per_second": 0.52, + "step": 7700 + }, + { + "epoch": 2.62, + "learning_rate": 7.025053506231903e-08, + "logits/chosen": -1.5702096223831177, + "logits/rejected": -1.2900179624557495, + "logps/chosen": -362.52508544921875, + "logps/rejected": -425.1124572753906, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9060986042022705, + "rewards/margins": 13.619707107543945, + "rewards/rejected": -9.713608741760254, + "step": 7710 + }, + { + "epoch": 2.62, + "learning_rate": 6.962104998111543e-08, + "logits/chosen": -1.5926183462142944, + "logits/rejected": -1.343196153640747, + "logps/chosen": -357.8170471191406, + "logps/rejected": -597.354248046875, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7952880859375, + "rewards/margins": 14.029565811157227, + "rewards/rejected": -10.23427677154541, + "step": 7720 + }, + { + "epoch": 2.63, + "learning_rate": 6.899156489991187e-08, + "logits/chosen": -1.5680058002471924, + "logits/rejected": -1.3565375804901123, + "logps/chosen": -374.9936218261719, + "logps/rejected": -576.9132080078125, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2953975200653076, + "rewards/margins": 14.247251510620117, + "rewards/rejected": -10.951855659484863, + "step": 7730 + }, + { + "epoch": 2.63, + "learning_rate": 6.83620798187083e-08, + "logits/chosen": -1.5791174173355103, + "logits/rejected": -1.380943775177002, + "logps/chosen": -463.700439453125, + "logps/rejected": -554.7304077148438, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.44099497795105, + "rewards/margins": 14.814692497253418, + "rewards/rejected": -11.373697280883789, + "step": 7740 + }, + { + "epoch": 2.63, + "learning_rate": 6.773259473750472e-08, + "logits/chosen": -1.5898202657699585, + "logits/rejected": -1.4140681028366089, + "logps/chosen": -313.2778015136719, + "logps/rejected": -614.6798706054688, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.1342573165893555, + "rewards/margins": 14.9911527633667, + "rewards/rejected": -10.856893539428711, + "step": 7750 + }, + { + "epoch": 2.64, + "learning_rate": 6.710310965630114e-08, + "logits/chosen": -1.5512304306030273, + "logits/rejected": -1.3856016397476196, + "logps/chosen": -591.60888671875, + "logps/rejected": -697.7617797851562, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6739540100097656, + "rewards/margins": 14.407045364379883, + "rewards/rejected": -10.733091354370117, + "step": 7760 + }, + { + "epoch": 2.64, + "learning_rate": 6.647362457509758e-08, + "logits/chosen": -1.5883727073669434, + "logits/rejected": -1.3925888538360596, + "logps/chosen": -371.9454345703125, + "logps/rejected": -514.0771484375, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8032193183898926, + "rewards/margins": 13.631205558776855, + "rewards/rejected": -9.827985763549805, + "step": 7770 + }, + { + "epoch": 2.64, + "learning_rate": 6.584413949389398e-08, + "logits/chosen": -1.5685484409332275, + "logits/rejected": -1.4317271709442139, + "logps/chosen": -451.58544921875, + "logps/rejected": -733.6173706054688, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.603257656097412, + "rewards/margins": 13.950202941894531, + "rewards/rejected": -10.346945762634277, + "step": 7780 + }, + { + "epoch": 2.65, + "learning_rate": 6.521465441269042e-08, + "logits/chosen": -1.5642088651657104, + "logits/rejected": -1.404473066329956, + "logps/chosen": -450.37554931640625, + "logps/rejected": -754.6324462890625, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.472395420074463, + "rewards/margins": 14.46251106262207, + "rewards/rejected": -10.990114212036133, + "step": 7790 + }, + { + "epoch": 2.65, + "learning_rate": 6.458516933148684e-08, + "logits/chosen": -1.5736857652664185, + "logits/rejected": -1.369564414024353, + "logps/chosen": -370.5788879394531, + "logps/rejected": -770.5895385742188, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8563263416290283, + "rewards/margins": 15.076814651489258, + "rewards/rejected": -11.220487594604492, + "step": 7800 + }, + { + "epoch": 2.65, + "eval_logits/chosen": -1.5892072916030884, + "eval_logits/rejected": -1.360355257987976, + "eval_logps/chosen": -383.2607116699219, + "eval_logps/rejected": -667.1862182617188, + "eval_loss": 0.006557795684784651, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 3.666203498840332, + "eval_rewards/margins": 14.433189392089844, + "eval_rewards/rejected": -10.766985893249512, + "eval_runtime": 572.443, + "eval_samples_per_second": 16.596, + "eval_steps_per_second": 0.519, + "step": 7800 + }, + { + "epoch": 2.65, + "learning_rate": 6.395568425028327e-08, + "logits/chosen": -1.574926495552063, + "logits/rejected": -1.3776017427444458, + "logps/chosen": -305.8157043457031, + "logps/rejected": -844.5144653320312, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.732442855834961, + "rewards/margins": 15.174756050109863, + "rewards/rejected": -11.44231128692627, + "step": 7810 + }, + { + "epoch": 2.66, + "learning_rate": 6.332619916907969e-08, + "logits/chosen": -1.595088005065918, + "logits/rejected": -1.3792155981063843, + "logps/chosen": -319.9638366699219, + "logps/rejected": -690.8758544921875, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6837856769561768, + "rewards/margins": 15.366823196411133, + "rewards/rejected": -11.683036804199219, + "step": 7820 + }, + { + "epoch": 2.66, + "learning_rate": 6.269671408787612e-08, + "logits/chosen": -1.606609582901001, + "logits/rejected": -1.3512972593307495, + "logps/chosen": -351.0549011230469, + "logps/rejected": -794.4775390625, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.761204957962036, + "rewards/margins": 14.8416166305542, + "rewards/rejected": -11.080410957336426, + "step": 7830 + }, + { + "epoch": 2.66, + "learning_rate": 6.206722900667253e-08, + "logits/chosen": -1.5637352466583252, + "logits/rejected": -1.3899656534194946, + "logps/chosen": -398.4555358886719, + "logps/rejected": -888.8825073242188, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2782394886016846, + "rewards/margins": 14.381002426147461, + "rewards/rejected": -11.102763175964355, + "step": 7840 + }, + { + "epoch": 2.67, + "learning_rate": 6.143774392546897e-08, + "logits/chosen": -1.57192862033844, + "logits/rejected": -1.3875234127044678, + "logps/chosen": -495.4486389160156, + "logps/rejected": -528.0491943359375, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.432842254638672, + "rewards/margins": 14.237629890441895, + "rewards/rejected": -9.804787635803223, + "step": 7850 + }, + { + "epoch": 2.67, + "learning_rate": 6.080825884426539e-08, + "logits/chosen": -1.5970394611358643, + "logits/rejected": -1.280611515045166, + "logps/chosen": -397.8489990234375, + "logps/rejected": -573.056884765625, + "loss": 0.0031, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.8107166290283203, + "rewards/margins": 14.091639518737793, + "rewards/rejected": -11.280923843383789, + "step": 7860 + }, + { + "epoch": 2.68, + "learning_rate": 6.017877376306182e-08, + "logits/chosen": -1.5684585571289062, + "logits/rejected": -1.3257622718811035, + "logps/chosen": -399.6987609863281, + "logps/rejected": -576.7581176757812, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.739924192428589, + "rewards/margins": 15.564404487609863, + "rewards/rejected": -11.824480056762695, + "step": 7870 + }, + { + "epoch": 2.68, + "learning_rate": 5.954928868185824e-08, + "logits/chosen": -1.5764528512954712, + "logits/rejected": -1.3443315029144287, + "logps/chosen": -478.32781982421875, + "logps/rejected": -899.9153442382812, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.780478000640869, + "rewards/margins": 14.009150505065918, + "rewards/rejected": -10.228672981262207, + "step": 7880 + }, + { + "epoch": 2.68, + "learning_rate": 5.891980360065466e-08, + "logits/chosen": -1.5657343864440918, + "logits/rejected": -1.3282606601715088, + "logps/chosen": -443.96014404296875, + "logps/rejected": -448.8551330566406, + "loss": 0.0045, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.976241111755371, + "rewards/margins": 13.407419204711914, + "rewards/rejected": -10.431177139282227, + "step": 7890 + }, + { + "epoch": 2.69, + "learning_rate": 5.8290318519451084e-08, + "logits/chosen": -1.5937620401382446, + "logits/rejected": -1.373793601989746, + "logps/chosen": -314.4598693847656, + "logps/rejected": -553.9489135742188, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.609654188156128, + "rewards/margins": 13.212257385253906, + "rewards/rejected": -9.602604866027832, + "step": 7900 + }, + { + "epoch": 2.69, + "eval_logits/chosen": -1.5830224752426147, + "eval_logits/rejected": -1.3540189266204834, + "eval_logps/chosen": -382.00738525390625, + "eval_logps/rejected": -664.3722534179688, + "eval_loss": 0.006677311845123768, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 3.7915351390838623, + "eval_rewards/margins": 14.277125358581543, + "eval_rewards/rejected": -10.485589027404785, + "eval_runtime": 570.3973, + "eval_samples_per_second": 16.655, + "eval_steps_per_second": 0.521, + "step": 7900 + }, + { + "epoch": 2.69, + "learning_rate": 5.7660833438247514e-08, + "logits/chosen": -1.5998241901397705, + "logits/rejected": -1.3606961965560913, + "logps/chosen": -407.9090270996094, + "logps/rejected": -616.8807373046875, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.739180088043213, + "rewards/margins": 14.191816329956055, + "rewards/rejected": -10.45263671875, + "step": 7910 + }, + { + "epoch": 2.69, + "learning_rate": 5.7031348357043937e-08, + "logits/chosen": -1.5920798778533936, + "logits/rejected": -1.3565165996551514, + "logps/chosen": -344.7170715332031, + "logps/rejected": -869.0130615234375, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.756824493408203, + "rewards/margins": 14.19177532196045, + "rewards/rejected": -10.434951782226562, + "step": 7920 + }, + { + "epoch": 2.7, + "learning_rate": 5.640186327584036e-08, + "logits/chosen": -1.581946611404419, + "logits/rejected": -1.3423621654510498, + "logps/chosen": -361.2926025390625, + "logps/rejected": -539.7999267578125, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.458395481109619, + "rewards/margins": 14.517300605773926, + "rewards/rejected": -11.058903694152832, + "step": 7930 + }, + { + "epoch": 2.7, + "learning_rate": 5.577237819463679e-08, + "logits/chosen": -1.58747398853302, + "logits/rejected": -1.3739595413208008, + "logps/chosen": -393.8480224609375, + "logps/rejected": -445.3944396972656, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8729991912841797, + "rewards/margins": 15.524072647094727, + "rewards/rejected": -11.65107250213623, + "step": 7940 + }, + { + "epoch": 2.7, + "learning_rate": 5.514289311343321e-08, + "logits/chosen": -1.5707464218139648, + "logits/rejected": -1.4097000360488892, + "logps/chosen": -453.5381774902344, + "logps/rejected": -726.0867919921875, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.95831036567688, + "rewards/margins": 15.412431716918945, + "rewards/rejected": -11.454119682312012, + "step": 7950 + }, + { + "epoch": 2.71, + "learning_rate": 5.4513408032229634e-08, + "logits/chosen": -1.561715841293335, + "logits/rejected": -1.4068595170974731, + "logps/chosen": -457.23150634765625, + "logps/rejected": -829.3739013671875, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.482323408126831, + "rewards/margins": 14.275054931640625, + "rewards/rejected": -10.792731285095215, + "step": 7960 + }, + { + "epoch": 2.71, + "learning_rate": 5.388392295102606e-08, + "logits/chosen": -1.5843822956085205, + "logits/rejected": -1.4259929656982422, + "logps/chosen": -387.32159423828125, + "logps/rejected": -472.2605895996094, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.0503716468811035, + "rewards/margins": 13.791621208190918, + "rewards/rejected": -9.741250038146973, + "step": 7970 + }, + { + "epoch": 2.71, + "learning_rate": 5.3254437869822486e-08, + "logits/chosen": -1.6101129055023193, + "logits/rejected": -1.3590729236602783, + "logps/chosen": -329.87420654296875, + "logps/rejected": -554.1741943359375, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.082028388977051, + "rewards/margins": 14.453269958496094, + "rewards/rejected": -10.37124252319336, + "step": 7980 + }, + { + "epoch": 2.72, + "learning_rate": 5.262495278861891e-08, + "logits/chosen": -1.5898675918579102, + "logits/rejected": -1.4039897918701172, + "logps/chosen": -302.30255126953125, + "logps/rejected": -445.60467529296875, + "loss": 0.003, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.23103666305542, + "rewards/margins": 12.236723899841309, + "rewards/rejected": -9.005687713623047, + "step": 7990 + }, + { + "epoch": 2.72, + "learning_rate": 5.199546770741533e-08, + "logits/chosen": -1.5916635990142822, + "logits/rejected": -1.359731912612915, + "logps/chosen": -321.1141662597656, + "logps/rejected": -635.2688598632812, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.746471881866455, + "rewards/margins": 14.233724594116211, + "rewards/rejected": -10.487253189086914, + "step": 8000 + }, + { + "epoch": 2.72, + "eval_logits/chosen": -1.5812485218048096, + "eval_logits/rejected": -1.351022720336914, + "eval_logps/chosen": -381.6640930175781, + "eval_logps/rejected": -664.8873291015625, + "eval_loss": 0.006568717770278454, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 3.825864791870117, + "eval_rewards/margins": 14.36296272277832, + "eval_rewards/rejected": -10.537096977233887, + "eval_runtime": 570.6964, + "eval_samples_per_second": 16.646, + "eval_steps_per_second": 0.52, + "step": 8000 + }, + { + "epoch": 2.72, + "learning_rate": 5.136598262621176e-08, + "logits/chosen": -1.5812301635742188, + "logits/rejected": -1.3925411701202393, + "logps/chosen": -301.6327209472656, + "logps/rejected": -522.1574096679688, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.969151020050049, + "rewards/margins": 14.294095039367676, + "rewards/rejected": -10.324941635131836, + "step": 8010 + }, + { + "epoch": 2.73, + "learning_rate": 5.073649754500818e-08, + "logits/chosen": -1.61221182346344, + "logits/rejected": -1.406883716583252, + "logps/chosen": -335.6075439453125, + "logps/rejected": -499.5021057128906, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.301943302154541, + "rewards/margins": 14.78752326965332, + "rewards/rejected": -10.485579490661621, + "step": 8020 + }, + { + "epoch": 2.73, + "learning_rate": 5.01070124638046e-08, + "logits/chosen": -1.5815832614898682, + "logits/rejected": -1.3213074207305908, + "logps/chosen": -433.0674743652344, + "logps/rejected": -651.0155639648438, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.802009105682373, + "rewards/margins": 15.129300117492676, + "rewards/rejected": -11.327291488647461, + "step": 8030 + }, + { + "epoch": 2.73, + "learning_rate": 4.947752738260103e-08, + "logits/chosen": -1.5869495868682861, + "logits/rejected": -1.3655319213867188, + "logps/chosen": -430.05169677734375, + "logps/rejected": -746.5196533203125, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6719703674316406, + "rewards/margins": 14.196032524108887, + "rewards/rejected": -10.524062156677246, + "step": 8040 + }, + { + "epoch": 2.74, + "learning_rate": 4.884804230139745e-08, + "logits/chosen": -1.5900530815124512, + "logits/rejected": -1.3719992637634277, + "logps/chosen": -318.2651062011719, + "logps/rejected": -922.4944458007812, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.923218250274658, + "rewards/margins": 15.004796028137207, + "rewards/rejected": -11.081579208374023, + "step": 8050 + }, + { + "epoch": 2.74, + "learning_rate": 4.8218557220193875e-08, + "logits/chosen": -1.5289605855941772, + "logits/rejected": -1.3707047700881958, + "logps/chosen": -624.6290893554688, + "logps/rejected": -717.632080078125, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2964844703674316, + "rewards/margins": 13.906949043273926, + "rewards/rejected": -10.610464096069336, + "step": 8060 + }, + { + "epoch": 2.74, + "learning_rate": 4.7589072138990305e-08, + "logits/chosen": -1.5721800327301025, + "logits/rejected": -1.331579566001892, + "logps/chosen": -373.02716064453125, + "logps/rejected": -528.2210693359375, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.968733549118042, + "rewards/margins": 14.147048950195312, + "rewards/rejected": -10.178316116333008, + "step": 8070 + }, + { + "epoch": 2.75, + "learning_rate": 4.695958705778673e-08, + "logits/chosen": -1.577202558517456, + "logits/rejected": -1.3963220119476318, + "logps/chosen": -521.974365234375, + "logps/rejected": -389.03265380859375, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.123630046844482, + "rewards/margins": 13.158258438110352, + "rewards/rejected": -9.034627914428711, + "step": 8080 + }, + { + "epoch": 2.75, + "learning_rate": 4.633010197658315e-08, + "logits/chosen": -1.5879758596420288, + "logits/rejected": -1.4590990543365479, + "logps/chosen": -402.6632385253906, + "logps/rejected": -552.7661743164062, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8068649768829346, + "rewards/margins": 12.73681640625, + "rewards/rejected": -8.929951667785645, + "step": 8090 + }, + { + "epoch": 2.75, + "learning_rate": 4.570061689537958e-08, + "logits/chosen": -1.5723588466644287, + "logits/rejected": -1.4018672704696655, + "logps/chosen": -563.3448486328125, + "logps/rejected": -632.9932861328125, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.124373435974121, + "rewards/margins": 14.704602241516113, + "rewards/rejected": -10.580228805541992, + "step": 8100 + }, + { + "epoch": 2.75, + "eval_logits/chosen": -1.5788776874542236, + "eval_logits/rejected": -1.3469969034194946, + "eval_logps/chosen": -382.6946105957031, + "eval_logps/rejected": -666.2989501953125, + "eval_loss": 0.007073494140058756, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 3.722816228866577, + "eval_rewards/margins": 14.40107250213623, + "eval_rewards/rejected": -10.678256034851074, + "eval_runtime": 571.3072, + "eval_samples_per_second": 16.629, + "eval_steps_per_second": 0.52, + "step": 8100 + }, + { + "epoch": 2.76, + "learning_rate": 4.5071131814176e-08, + "logits/chosen": -1.5811126232147217, + "logits/rejected": -1.4175249338150024, + "logps/chosen": -462.9391174316406, + "logps/rejected": -668.3675537109375, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.022510051727295, + "rewards/margins": 13.767558097839355, + "rewards/rejected": -9.745049476623535, + "step": 8110 + }, + { + "epoch": 2.76, + "learning_rate": 4.4441646732972425e-08, + "logits/chosen": -1.5727370977401733, + "logits/rejected": -1.3278236389160156, + "logps/chosen": -332.7187194824219, + "logps/rejected": -897.8184814453125, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.005051612854004, + "rewards/margins": 15.594218254089355, + "rewards/rejected": -11.589165687561035, + "step": 8120 + }, + { + "epoch": 2.76, + "learning_rate": 4.3812161651768855e-08, + "logits/chosen": -1.5994136333465576, + "logits/rejected": -1.3103677034378052, + "logps/chosen": -339.26715087890625, + "logps/rejected": -608.7657470703125, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.065439701080322, + "rewards/margins": 14.395718574523926, + "rewards/rejected": -10.330279350280762, + "step": 8130 + }, + { + "epoch": 2.77, + "learning_rate": 4.318267657056528e-08, + "logits/chosen": -1.5932546854019165, + "logits/rejected": -1.413646936416626, + "logps/chosen": -423.60736083984375, + "logps/rejected": -544.1804809570312, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6994776725769043, + "rewards/margins": 12.383493423461914, + "rewards/rejected": -8.684015274047852, + "step": 8140 + }, + { + "epoch": 2.77, + "learning_rate": 4.25531914893617e-08, + "logits/chosen": -1.5990790128707886, + "logits/rejected": -1.3871958255767822, + "logps/chosen": -347.128662109375, + "logps/rejected": -721.7862548828125, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.942992687225342, + "rewards/margins": 14.607824325561523, + "rewards/rejected": -10.664833068847656, + "step": 8150 + }, + { + "epoch": 2.77, + "learning_rate": 4.192370640815812e-08, + "logits/chosen": -1.5772348642349243, + "logits/rejected": -1.327580213546753, + "logps/chosen": -365.26556396484375, + "logps/rejected": -551.3877563476562, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.529675006866455, + "rewards/margins": 14.26812744140625, + "rewards/rejected": -10.738451957702637, + "step": 8160 + }, + { + "epoch": 2.78, + "learning_rate": 4.129422132695455e-08, + "logits/chosen": -1.567935824394226, + "logits/rejected": -1.40500009059906, + "logps/chosen": -382.6530456542969, + "logps/rejected": -560.8984985351562, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7513511180877686, + "rewards/margins": 14.16174602508545, + "rewards/rejected": -10.410395622253418, + "step": 8170 + }, + { + "epoch": 2.78, + "learning_rate": 4.0664736245750975e-08, + "logits/chosen": -1.5888985395431519, + "logits/rejected": -1.3442599773406982, + "logps/chosen": -329.550048828125, + "logps/rejected": -517.7901611328125, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8502590656280518, + "rewards/margins": 14.013814926147461, + "rewards/rejected": -10.163557052612305, + "step": 8180 + }, + { + "epoch": 2.78, + "learning_rate": 4.00352511645474e-08, + "logits/chosen": -1.5963094234466553, + "logits/rejected": -1.3955920934677124, + "logps/chosen": -425.23907470703125, + "logps/rejected": -632.748779296875, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2856781482696533, + "rewards/margins": 13.697344779968262, + "rewards/rejected": -10.411666870117188, + "step": 8190 + }, + { + "epoch": 2.79, + "learning_rate": 3.940576608334383e-08, + "logits/chosen": -1.5618524551391602, + "logits/rejected": -1.4135830402374268, + "logps/chosen": -409.8563537597656, + "logps/rejected": -637.9149169921875, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.3201799392700195, + "rewards/margins": 14.587376594543457, + "rewards/rejected": -10.267195701599121, + "step": 8200 + }, + { + "epoch": 2.79, + "eval_logits/chosen": -1.5790894031524658, + "eval_logits/rejected": -1.3500999212265015, + "eval_logps/chosen": -382.890869140625, + "eval_logps/rejected": -667.2009887695312, + "eval_loss": 0.006452932022511959, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 3.703184127807617, + "eval_rewards/margins": 14.471650123596191, + "eval_rewards/rejected": -10.768465995788574, + "eval_runtime": 575.5534, + "eval_samples_per_second": 16.506, + "eval_steps_per_second": 0.516, + "step": 8200 + }, + { + "epoch": 2.79, + "learning_rate": 3.877628100214025e-08, + "logits/chosen": -1.5769083499908447, + "logits/rejected": -1.398919939994812, + "logps/chosen": -385.2383728027344, + "logps/rejected": -670.9034423828125, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.1332807540893555, + "rewards/margins": 14.730560302734375, + "rewards/rejected": -10.59727954864502, + "step": 8210 + }, + { + "epoch": 2.79, + "learning_rate": 3.814679592093667e-08, + "logits/chosen": -1.5672122240066528, + "logits/rejected": -1.3397407531738281, + "logps/chosen": -445.0633850097656, + "logps/rejected": -871.2277221679688, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.133738994598389, + "rewards/margins": 16.045263290405273, + "rewards/rejected": -11.911523818969727, + "step": 8220 + }, + { + "epoch": 2.8, + "learning_rate": 3.75173108397331e-08, + "logits/chosen": -1.587575912475586, + "logits/rejected": -1.310447096824646, + "logps/chosen": -373.71148681640625, + "logps/rejected": -749.4341430664062, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5330822467803955, + "rewards/margins": 14.49261474609375, + "rewards/rejected": -10.959531784057617, + "step": 8230 + }, + { + "epoch": 2.8, + "learning_rate": 3.688782575852952e-08, + "logits/chosen": -1.5781171321868896, + "logits/rejected": -1.437328577041626, + "logps/chosen": -298.32537841796875, + "logps/rejected": -486.23297119140625, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.116750717163086, + "rewards/margins": 13.457989692687988, + "rewards/rejected": -9.341238975524902, + "step": 8240 + }, + { + "epoch": 2.8, + "learning_rate": 3.625834067732594e-08, + "logits/chosen": -1.571605920791626, + "logits/rejected": -1.3770138025283813, + "logps/chosen": -451.38494873046875, + "logps/rejected": -649.7266845703125, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.037511825561523, + "rewards/margins": 14.274869918823242, + "rewards/rejected": -10.237359046936035, + "step": 8250 + }, + { + "epoch": 2.81, + "learning_rate": 3.562885559612237e-08, + "logits/chosen": -1.5555534362792969, + "logits/rejected": -1.393279790878296, + "logps/chosen": -420.8077697753906, + "logps/rejected": -733.6368408203125, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.789829969406128, + "rewards/margins": 14.2489595413208, + "rewards/rejected": -10.459129333496094, + "step": 8260 + }, + { + "epoch": 2.81, + "learning_rate": 3.499937051491879e-08, + "logits/chosen": -1.5919803380966187, + "logits/rejected": -1.3264809846878052, + "logps/chosen": -321.86798095703125, + "logps/rejected": -571.9153442382812, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.310166120529175, + "rewards/margins": 14.663949966430664, + "rewards/rejected": -11.353784561157227, + "step": 8270 + }, + { + "epoch": 2.81, + "learning_rate": 3.4369885433715216e-08, + "logits/chosen": -1.6076500415802002, + "logits/rejected": -1.4048197269439697, + "logps/chosen": -291.5757141113281, + "logps/rejected": -500.05816650390625, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.5002970695495605, + "rewards/margins": 14.942835807800293, + "rewards/rejected": -10.442538261413574, + "step": 8280 + }, + { + "epoch": 2.82, + "learning_rate": 3.3740400352511645e-08, + "logits/chosen": -1.5926601886749268, + "logits/rejected": -1.3878093957901, + "logps/chosen": -300.0306701660156, + "logps/rejected": -939.9244995117188, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6893222332000732, + "rewards/margins": 15.406352043151855, + "rewards/rejected": -11.717028617858887, + "step": 8290 + }, + { + "epoch": 2.82, + "learning_rate": 3.311091527130807e-08, + "logits/chosen": -1.6151847839355469, + "logits/rejected": -1.4056875705718994, + "logps/chosen": -353.1057434082031, + "logps/rejected": -475.6534729003906, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.917816162109375, + "rewards/margins": 14.426996231079102, + "rewards/rejected": -10.509180068969727, + "step": 8300 + }, + { + "epoch": 2.82, + "eval_logits/chosen": -1.5863962173461914, + "eval_logits/rejected": -1.3573564291000366, + "eval_logps/chosen": -382.7499084472656, + "eval_logps/rejected": -667.2633666992188, + "eval_loss": 0.007200776599347591, + "eval_rewards/accuracies": 0.997474730014801, + "eval_rewards/chosen": 3.71728515625, + "eval_rewards/margins": 14.491976737976074, + "eval_rewards/rejected": -10.774690628051758, + "eval_runtime": 575.4547, + "eval_samples_per_second": 16.509, + "eval_steps_per_second": 0.516, + "step": 8300 + }, + { + "epoch": 2.82, + "learning_rate": 3.248143019010449e-08, + "logits/chosen": -1.5838531255722046, + "logits/rejected": -1.3761316537857056, + "logps/chosen": -299.0707092285156, + "logps/rejected": -723.9361572265625, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8292579650878906, + "rewards/margins": 13.472803115844727, + "rewards/rejected": -9.64354419708252, + "step": 8310 + }, + { + "epoch": 2.83, + "learning_rate": 3.1851945108900914e-08, + "logits/chosen": -1.5779653787612915, + "logits/rejected": -1.393566370010376, + "logps/chosen": -393.44708251953125, + "logps/rejected": -733.7208251953125, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.779395580291748, + "rewards/margins": 14.074894905090332, + "rewards/rejected": -10.295498847961426, + "step": 8320 + }, + { + "epoch": 2.83, + "learning_rate": 3.122246002769734e-08, + "logits/chosen": -1.5992467403411865, + "logits/rejected": -1.3724254369735718, + "logps/chosen": -303.39910888671875, + "logps/rejected": -566.540771484375, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.905411958694458, + "rewards/margins": 15.12633991241455, + "rewards/rejected": -11.220927238464355, + "step": 8330 + }, + { + "epoch": 2.83, + "learning_rate": 3.0592974946493766e-08, + "logits/chosen": -1.5981175899505615, + "logits/rejected": -1.3781298398971558, + "logps/chosen": -376.58856201171875, + "logps/rejected": -528.969970703125, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.1143598556518555, + "rewards/margins": 13.978137016296387, + "rewards/rejected": -9.863778114318848, + "step": 8340 + }, + { + "epoch": 2.84, + "learning_rate": 2.996348986529019e-08, + "logits/chosen": -1.5841161012649536, + "logits/rejected": -1.4147449731826782, + "logps/chosen": -480.7455139160156, + "logps/rejected": -535.1409912109375, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9188015460968018, + "rewards/margins": 13.855165481567383, + "rewards/rejected": -9.936361312866211, + "step": 8350 + }, + { + "epoch": 2.84, + "learning_rate": 2.9334004784086618e-08, + "logits/chosen": -1.5880306959152222, + "logits/rejected": -1.3770748376846313, + "logps/chosen": -351.6282958984375, + "logps/rejected": -683.6505126953125, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6251416206359863, + "rewards/margins": 14.068796157836914, + "rewards/rejected": -10.443655014038086, + "step": 8360 + }, + { + "epoch": 2.85, + "learning_rate": 2.870451970288304e-08, + "logits/chosen": -1.580641508102417, + "logits/rejected": -1.373658537864685, + "logps/chosen": -464.54925537109375, + "logps/rejected": -760.2039794921875, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5572311878204346, + "rewards/margins": 14.061103820800781, + "rewards/rejected": -10.503872871398926, + "step": 8370 + }, + { + "epoch": 2.85, + "learning_rate": 2.8075034621679467e-08, + "logits/chosen": -1.5665006637573242, + "logits/rejected": -1.3866631984710693, + "logps/chosen": -529.6736450195312, + "logps/rejected": -407.0146789550781, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9439423084259033, + "rewards/margins": 12.712591171264648, + "rewards/rejected": -8.768649101257324, + "step": 8380 + }, + { + "epoch": 2.85, + "learning_rate": 2.744554954047589e-08, + "logits/chosen": -1.6041733026504517, + "logits/rejected": -1.3978776931762695, + "logps/chosen": -311.43487548828125, + "logps/rejected": -636.9126586914062, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.349041700363159, + "rewards/margins": 14.620532035827637, + "rewards/rejected": -11.271491050720215, + "step": 8390 + }, + { + "epoch": 2.86, + "learning_rate": 2.6816064459272312e-08, + "logits/chosen": -1.5718210935592651, + "logits/rejected": -1.411438226699829, + "logps/chosen": -536.2229614257812, + "logps/rejected": -693.5496826171875, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.682270050048828, + "rewards/margins": 14.324712753295898, + "rewards/rejected": -10.642443656921387, + "step": 8400 + }, + { + "epoch": 2.86, + "eval_logits/chosen": -1.589887261390686, + "eval_logits/rejected": -1.3626463413238525, + "eval_logps/chosen": -382.655029296875, + "eval_logps/rejected": -666.6168823242188, + "eval_loss": 0.006446606479585171, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 3.7267708778381348, + "eval_rewards/margins": 14.436819076538086, + "eval_rewards/rejected": -10.71004867553711, + "eval_runtime": 577.3113, + "eval_samples_per_second": 16.456, + "eval_steps_per_second": 0.514, + "step": 8400 + }, + { + "epoch": 2.86, + "learning_rate": 2.618657937806874e-08, + "logits/chosen": -1.5737426280975342, + "logits/rejected": -1.3891984224319458, + "logps/chosen": -386.510009765625, + "logps/rejected": -616.197265625, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8604679107666016, + "rewards/margins": 14.738340377807617, + "rewards/rejected": -10.877873420715332, + "step": 8410 + }, + { + "epoch": 2.86, + "learning_rate": 2.555709429686516e-08, + "logits/chosen": -1.5804517269134521, + "logits/rejected": -1.4171744585037231, + "logps/chosen": -486.77691650390625, + "logps/rejected": -664.5090942382812, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.3698625564575195, + "rewards/margins": 14.903956413269043, + "rewards/rejected": -10.534092903137207, + "step": 8420 + }, + { + "epoch": 2.87, + "learning_rate": 2.4927609215661587e-08, + "logits/chosen": -1.6028152704238892, + "logits/rejected": -1.3772051334381104, + "logps/chosen": -315.5608215332031, + "logps/rejected": -446.89324951171875, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.740208148956299, + "rewards/margins": 13.875042915344238, + "rewards/rejected": -10.134832382202148, + "step": 8430 + }, + { + "epoch": 2.87, + "learning_rate": 2.4298124134458013e-08, + "logits/chosen": -1.5859858989715576, + "logits/rejected": -1.4134756326675415, + "logps/chosen": -324.21051025390625, + "logps/rejected": -713.0202026367188, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.333634853363037, + "rewards/margins": 14.107881546020508, + "rewards/rejected": -10.774248123168945, + "step": 8440 + }, + { + "epoch": 2.87, + "learning_rate": 2.3668639053254436e-08, + "logits/chosen": -1.5692503452301025, + "logits/rejected": -1.3909469842910767, + "logps/chosen": -323.3507080078125, + "logps/rejected": -507.1849060058594, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1547141075134277, + "rewards/margins": 13.821222305297852, + "rewards/rejected": -10.666507720947266, + "step": 8450 + }, + { + "epoch": 2.88, + "learning_rate": 2.3039153972050862e-08, + "logits/chosen": -1.5687628984451294, + "logits/rejected": -1.3588732481002808, + "logps/chosen": -400.7327575683594, + "logps/rejected": -878.7252197265625, + "loss": 0.0045, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.6340134143829346, + "rewards/margins": 15.365193367004395, + "rewards/rejected": -11.731180191040039, + "step": 8460 + }, + { + "epoch": 2.88, + "learning_rate": 2.2409668890847285e-08, + "logits/chosen": -1.606419563293457, + "logits/rejected": -1.413710594177246, + "logps/chosen": -331.51397705078125, + "logps/rejected": -655.89697265625, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8855056762695312, + "rewards/margins": 14.470726013183594, + "rewards/rejected": -10.585221290588379, + "step": 8470 + }, + { + "epoch": 2.88, + "learning_rate": 2.178018380964371e-08, + "logits/chosen": -1.5816526412963867, + "logits/rejected": -1.3538811206817627, + "logps/chosen": -405.48455810546875, + "logps/rejected": -654.637939453125, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.486841917037964, + "rewards/margins": 14.658251762390137, + "rewards/rejected": -11.171409606933594, + "step": 8480 + }, + { + "epoch": 2.89, + "learning_rate": 2.1150698728440137e-08, + "logits/chosen": -1.5789690017700195, + "logits/rejected": -1.4102327823638916, + "logps/chosen": -369.9436950683594, + "logps/rejected": -584.9236450195312, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.257991313934326, + "rewards/margins": 14.483480453491211, + "rewards/rejected": -10.22548770904541, + "step": 8490 + }, + { + "epoch": 2.89, + "learning_rate": 2.052121364723656e-08, + "logits/chosen": -1.580875039100647, + "logits/rejected": -1.3349984884262085, + "logps/chosen": -322.26318359375, + "logps/rejected": -617.8092041015625, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.581845760345459, + "rewards/margins": 14.022043228149414, + "rewards/rejected": -10.440197944641113, + "step": 8500 + }, + { + "epoch": 2.89, + "eval_logits/chosen": -1.5791901350021362, + "eval_logits/rejected": -1.353068232536316, + "eval_logps/chosen": -383.7477111816406, + "eval_logps/rejected": -668.754150390625, + "eval_loss": 0.00621420843526721, + "eval_rewards/accuracies": 0.9991582632064819, + "eval_rewards/chosen": 3.6175010204315186, + "eval_rewards/margins": 14.54128360748291, + "eval_rewards/rejected": -10.923782348632812, + "eval_runtime": 576.5112, + "eval_samples_per_second": 16.478, + "eval_steps_per_second": 0.515, + "step": 8500 + }, + { + "epoch": 2.89, + "learning_rate": 1.9891728566032983e-08, + "logits/chosen": -1.5964787006378174, + "logits/rejected": -1.2840310335159302, + "logps/chosen": -400.33380126953125, + "logps/rejected": -446.5572814941406, + "loss": 0.0047, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.4554107189178467, + "rewards/margins": 13.462667465209961, + "rewards/rejected": -10.007257461547852, + "step": 8510 + }, + { + "epoch": 2.9, + "learning_rate": 1.926224348482941e-08, + "logits/chosen": -1.5614221096038818, + "logits/rejected": -1.3612565994262695, + "logps/chosen": -451.8758850097656, + "logps/rejected": -578.5814208984375, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.243788719177246, + "rewards/margins": 14.758915901184082, + "rewards/rejected": -10.51512622833252, + "step": 8520 + }, + { + "epoch": 2.9, + "learning_rate": 1.863275840362583e-08, + "logits/chosen": -1.5639197826385498, + "logits/rejected": -1.3159427642822266, + "logps/chosen": -318.609130859375, + "logps/rejected": -774.41455078125, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0678935050964355, + "rewards/margins": 14.0814208984375, + "rewards/rejected": -11.013525009155273, + "step": 8530 + }, + { + "epoch": 2.9, + "learning_rate": 1.8003273322422258e-08, + "logits/chosen": -1.5878156423568726, + "logits/rejected": -1.374361276626587, + "logps/chosen": -356.32794189453125, + "logps/rejected": -662.537841796875, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2693569660186768, + "rewards/margins": 13.764470100402832, + "rewards/rejected": -10.495115280151367, + "step": 8540 + }, + { + "epoch": 2.91, + "learning_rate": 1.737378824121868e-08, + "logits/chosen": -1.5880918502807617, + "logits/rejected": -1.3318572044372559, + "logps/chosen": -310.6067810058594, + "logps/rejected": -639.80029296875, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4643654823303223, + "rewards/margins": 13.975817680358887, + "rewards/rejected": -10.511453628540039, + "step": 8550 + }, + { + "epoch": 2.91, + "learning_rate": 1.6744303160015107e-08, + "logits/chosen": -1.5657209157943726, + "logits/rejected": -1.3724708557128906, + "logps/chosen": -390.4461364746094, + "logps/rejected": -629.1236572265625, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.46832013130188, + "rewards/margins": 13.555915832519531, + "rewards/rejected": -10.08759593963623, + "step": 8560 + }, + { + "epoch": 2.91, + "learning_rate": 1.6114818078811533e-08, + "logits/chosen": -1.5931777954101562, + "logits/rejected": -1.3727209568023682, + "logps/chosen": -409.6351013183594, + "logps/rejected": -745.1212768554688, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.715482234954834, + "rewards/margins": 13.272628784179688, + "rewards/rejected": -9.557146072387695, + "step": 8570 + }, + { + "epoch": 2.92, + "learning_rate": 1.5485332997607955e-08, + "logits/chosen": -1.5811976194381714, + "logits/rejected": -1.3594437837600708, + "logps/chosen": -309.9898681640625, + "logps/rejected": -618.1383666992188, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.676459550857544, + "rewards/margins": 14.93535327911377, + "rewards/rejected": -11.258892059326172, + "step": 8580 + }, + { + "epoch": 2.92, + "learning_rate": 1.485584791640438e-08, + "logits/chosen": -1.5847084522247314, + "logits/rejected": -1.3902032375335693, + "logps/chosen": -370.1962890625, + "logps/rejected": -662.829833984375, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.495457172393799, + "rewards/margins": 14.065027236938477, + "rewards/rejected": -10.56956958770752, + "step": 8590 + }, + { + "epoch": 2.92, + "learning_rate": 1.4226362835200804e-08, + "logits/chosen": -1.6019541025161743, + "logits/rejected": -1.426632046699524, + "logps/chosen": -321.93701171875, + "logps/rejected": -699.9473876953125, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7738540172576904, + "rewards/margins": 13.496922492980957, + "rewards/rejected": -9.723068237304688, + "step": 8600 + }, + { + "epoch": 2.92, + "eval_logits/chosen": -1.586925745010376, + "eval_logits/rejected": -1.3575776815414429, + "eval_logps/chosen": -382.7484130859375, + "eval_logps/rejected": -666.8983154296875, + "eval_loss": 0.005927423480898142, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 3.7174360752105713, + "eval_rewards/margins": 14.45562744140625, + "eval_rewards/rejected": -10.738192558288574, + "eval_runtime": 577.4524, + "eval_samples_per_second": 16.452, + "eval_steps_per_second": 0.514, + "step": 8600 + }, + { + "epoch": 2.93, + "learning_rate": 1.3596877753997229e-08, + "logits/chosen": -1.5804331302642822, + "logits/rejected": -1.366788387298584, + "logps/chosen": -461.79937744140625, + "logps/rejected": -489.282958984375, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.735045909881592, + "rewards/margins": 15.089797973632812, + "rewards/rejected": -11.354753494262695, + "step": 8610 + }, + { + "epoch": 2.93, + "learning_rate": 1.2967392672793655e-08, + "logits/chosen": -1.5933865308761597, + "logits/rejected": -1.3984348773956299, + "logps/chosen": -345.1706237792969, + "logps/rejected": -965.3956298828125, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6401455402374268, + "rewards/margins": 14.948602676391602, + "rewards/rejected": -11.308455467224121, + "step": 8620 + }, + { + "epoch": 2.93, + "learning_rate": 1.233790759159008e-08, + "logits/chosen": -1.5928618907928467, + "logits/rejected": -1.3979308605194092, + "logps/chosen": -374.5369873046875, + "logps/rejected": -616.7839965820312, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5822746753692627, + "rewards/margins": 13.60655689239502, + "rewards/rejected": -10.024280548095703, + "step": 8630 + }, + { + "epoch": 2.94, + "learning_rate": 1.1708422510386504e-08, + "logits/chosen": -1.566396951675415, + "logits/rejected": -1.3205740451812744, + "logps/chosen": -442.74609375, + "logps/rejected": -614.5992431640625, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.032160758972168, + "rewards/margins": 15.41914176940918, + "rewards/rejected": -11.386981964111328, + "step": 8640 + }, + { + "epoch": 2.94, + "learning_rate": 1.1078937429182926e-08, + "logits/chosen": -1.5697476863861084, + "logits/rejected": -1.4339548349380493, + "logps/chosen": -511.27105712890625, + "logps/rejected": -769.0615844726562, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9845359325408936, + "rewards/margins": 13.379335403442383, + "rewards/rejected": -10.394798278808594, + "step": 8650 + }, + { + "epoch": 2.94, + "learning_rate": 1.0449452347979353e-08, + "logits/chosen": -1.5964996814727783, + "logits/rejected": -1.3791311979293823, + "logps/chosen": -455.9476623535156, + "logps/rejected": -664.6773071289062, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9346728324890137, + "rewards/margins": 15.141133308410645, + "rewards/rejected": -11.206460952758789, + "step": 8660 + }, + { + "epoch": 2.95, + "learning_rate": 9.819967266775777e-09, + "logits/chosen": -1.572621464729309, + "logits/rejected": -1.336653232574463, + "logps/chosen": -317.2608337402344, + "logps/rejected": -676.523193359375, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.347161054611206, + "rewards/margins": 13.526951789855957, + "rewards/rejected": -10.179792404174805, + "step": 8670 + }, + { + "epoch": 2.95, + "learning_rate": 9.190482185572201e-09, + "logits/chosen": -1.5671789646148682, + "logits/rejected": -1.3559914827346802, + "logps/chosen": -435.9064025878906, + "logps/rejected": -818.0877685546875, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.03999662399292, + "rewards/margins": 14.680827140808105, + "rewards/rejected": -10.640829086303711, + "step": 8680 + }, + { + "epoch": 2.95, + "learning_rate": 8.560997104368626e-09, + "logits/chosen": -1.5859724283218384, + "logits/rejected": -1.3796188831329346, + "logps/chosen": -388.61767578125, + "logps/rejected": -536.8897705078125, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.998549222946167, + "rewards/margins": 14.71049690246582, + "rewards/rejected": -10.711947441101074, + "step": 8690 + }, + { + "epoch": 2.96, + "learning_rate": 7.931512023165052e-09, + "logits/chosen": -1.5859348773956299, + "logits/rejected": -1.418962001800537, + "logps/chosen": -404.0248107910156, + "logps/rejected": -607.8995361328125, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4491419792175293, + "rewards/margins": 12.928059577941895, + "rewards/rejected": -9.478917121887207, + "step": 8700 + }, + { + "epoch": 2.96, + "eval_logits/chosen": -1.572788953781128, + "eval_logits/rejected": -1.3458889722824097, + "eval_logps/chosen": -384.1739196777344, + "eval_logps/rejected": -670.3754272460938, + "eval_loss": 0.0062035322189331055, + "eval_rewards/accuracies": 0.997474730014801, + "eval_rewards/chosen": 3.5748820304870605, + "eval_rewards/margins": 14.660787582397461, + "eval_rewards/rejected": -11.085906028747559, + "eval_runtime": 575.7775, + "eval_samples_per_second": 16.499, + "eval_steps_per_second": 0.516, + "step": 8700 + }, + { + "epoch": 2.96, + "learning_rate": 7.3020269419614755e-09, + "logits/chosen": -1.5888078212738037, + "logits/rejected": -1.441348910331726, + "logps/chosen": -334.7347717285156, + "logps/rejected": -880.3033447265625, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.533327579498291, + "rewards/margins": 14.541035652160645, + "rewards/rejected": -11.007707595825195, + "step": 8710 + }, + { + "epoch": 2.96, + "learning_rate": 6.6725418607579e-09, + "logits/chosen": -1.580986738204956, + "logits/rejected": -1.3180453777313232, + "logps/chosen": -424.1851501464844, + "logps/rejected": -736.6097412109375, + "loss": 0.0029, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 3.925544261932373, + "rewards/margins": 15.604103088378906, + "rewards/rejected": -11.678559303283691, + "step": 8720 + }, + { + "epoch": 2.97, + "learning_rate": 6.043056779554324e-09, + "logits/chosen": -1.5984703302383423, + "logits/rejected": -1.3540745973587036, + "logps/chosen": -334.62152099609375, + "logps/rejected": -833.6624145507812, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.724179744720459, + "rewards/margins": 13.787577629089355, + "rewards/rejected": -10.063395500183105, + "step": 8730 + }, + { + "epoch": 2.97, + "learning_rate": 5.41357169835075e-09, + "logits/chosen": -1.5832895040512085, + "logits/rejected": -1.2794277667999268, + "logps/chosen": -418.2052307128906, + "logps/rejected": -589.4303588867188, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3660645484924316, + "rewards/margins": 13.847193717956543, + "rewards/rejected": -10.481128692626953, + "step": 8740 + }, + { + "epoch": 2.97, + "learning_rate": 4.784086617147173e-09, + "logits/chosen": -1.5725127458572388, + "logits/rejected": -1.4708614349365234, + "logps/chosen": -534.7617797851562, + "logps/rejected": -558.42236328125, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8431174755096436, + "rewards/margins": 13.795602798461914, + "rewards/rejected": -9.952485084533691, + "step": 8750 + }, + { + "epoch": 2.98, + "learning_rate": 4.1546015359435984e-09, + "logits/chosen": -1.5622774362564087, + "logits/rejected": -1.3851722478866577, + "logps/chosen": -463.82940673828125, + "logps/rejected": -644.4447631835938, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8551177978515625, + "rewards/margins": 14.625436782836914, + "rewards/rejected": -10.770318031311035, + "step": 8760 + }, + { + "epoch": 2.98, + "learning_rate": 3.5251164547400225e-09, + "logits/chosen": -1.556908369064331, + "logits/rejected": -1.377995252609253, + "logps/chosen": -446.1564025878906, + "logps/rejected": -673.19140625, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3479065895080566, + "rewards/margins": 13.489291191101074, + "rewards/rejected": -10.14138412475586, + "step": 8770 + }, + { + "epoch": 2.98, + "learning_rate": 2.895631373536447e-09, + "logits/chosen": -1.5834182500839233, + "logits/rejected": -1.3828694820404053, + "logps/chosen": -394.6122131347656, + "logps/rejected": -465.70849609375, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.183238983154297, + "rewards/margins": 13.352790832519531, + "rewards/rejected": -10.169550895690918, + "step": 8780 + }, + { + "epoch": 2.99, + "learning_rate": 2.2661462923328713e-09, + "logits/chosen": -1.5709158182144165, + "logits/rejected": -1.334829330444336, + "logps/chosen": -386.2044677734375, + "logps/rejected": -675.251953125, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4015731811523438, + "rewards/margins": 13.834693908691406, + "rewards/rejected": -10.433121681213379, + "step": 8790 + }, + { + "epoch": 2.99, + "learning_rate": 1.6366612111292962e-09, + "logits/chosen": -1.5731828212738037, + "logits/rejected": -1.3847582340240479, + "logps/chosen": -333.2795715332031, + "logps/rejected": -523.1157836914062, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.49733304977417, + "rewards/margins": 14.319364547729492, + "rewards/rejected": -10.82203197479248, + "step": 8800 + }, + { + "epoch": 2.99, + "eval_logits/chosen": -1.5891327857971191, + "eval_logits/rejected": -1.3617818355560303, + "eval_logps/chosen": -383.02899169921875, + "eval_logps/rejected": -667.5586547851562, + "eval_loss": 0.0062055825255811214, + "eval_rewards/accuracies": 0.9983165264129639, + "eval_rewards/chosen": 3.689375400543213, + "eval_rewards/margins": 14.49360466003418, + "eval_rewards/rejected": -10.804230690002441, + "eval_runtime": 576.2256, + "eval_samples_per_second": 16.487, + "eval_steps_per_second": 0.515, + "step": 8800 + }, + { + "epoch": 2.99, + "learning_rate": 1.0071761299257208e-09, + "logits/chosen": -1.5978032350540161, + "logits/rejected": -1.4079787731170654, + "logps/chosen": -362.62518310546875, + "logps/rejected": -684.5773315429688, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9742512702941895, + "rewards/margins": 15.327960014343262, + "rewards/rejected": -11.353708267211914, + "step": 8810 + }, + { + "epoch": 3.0, + "learning_rate": 3.7769104872214527e-10, + "logits/chosen": -1.5846927165985107, + "logits/rejected": -1.3839848041534424, + "logps/chosen": -335.94635009765625, + "logps/rejected": -530.9546508789062, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.628373622894287, + "rewards/margins": 14.358508110046387, + "rewards/rejected": -10.730133056640625, + "step": 8820 + }, + { + "epoch": 3.0, + "step": 8826, + "total_flos": 0.0, + "train_loss": 0.053706495013128505, + "train_runtime": 110737.0914, + "train_samples_per_second": 5.101, + "train_steps_per_second": 0.08 + } + ], + "logging_steps": 10, + "max_steps": 8826, + "num_train_epochs": 3, + "save_steps": 500, + "total_flos": 0.0, + "trial_name": null, + "trial_params": null +}