{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.995661605206074, "eval_steps": 500, "global_step": 4146, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014461315979754157, "grad_norm": 17.68067024457695, "learning_rate": 8e-08, "logits/chosen": 0.8356107473373413, "logits/rejected": 0.7603495717048645, "logps/chosen": -0.8946685791015625, "logps/rejected": -0.9055352807044983, "loss": 1.0095, "odds_ratio_loss": 0.7516021132469177, "rewards/accuracies": 0.5, "rewards/chosen": -0.0894668698310852, "rewards/margins": 0.0010866625234484673, "rewards/rejected": -0.09055352210998535, "sft_loss": 0.8946685791015625, "step": 1 }, { "epoch": 0.0028922631959508315, "grad_norm": 22.2584821655261, "learning_rate": 1.6e-07, "logits/chosen": 0.9862427115440369, "logits/rejected": 0.8068673014640808, "logps/chosen": -0.906627357006073, "logps/rejected": -1.0544397830963135, "loss": 1.0587, "odds_ratio_loss": 0.66656893491745, "rewards/accuracies": 0.5, "rewards/chosen": -0.09066274017095566, "rewards/margins": 0.01478125061839819, "rewards/rejected": -0.10544399917125702, "sft_loss": 0.906627357006073, "step": 2 }, { "epoch": 0.004338394793926247, "grad_norm": 27.462300830950515, "learning_rate": 2.4e-07, "logits/chosen": 0.745313823223114, "logits/rejected": 0.5240325927734375, "logps/chosen": -1.166395664215088, "logps/rejected": -1.1368509531021118, "loss": 1.0855, "odds_ratio_loss": 0.8221395015716553, "rewards/accuracies": 0.4375, "rewards/chosen": -0.11663956940174103, "rewards/margins": -0.0029544695280492306, "rewards/rejected": -0.11368509382009506, "sft_loss": 1.166395664215088, "step": 3 }, { "epoch": 0.005784526391901663, "grad_norm": 20.425495140968387, "learning_rate": 3.2e-07, "logits/chosen": 0.9010774493217468, "logits/rejected": 0.7436051368713379, "logps/chosen": -0.9683417081832886, "logps/rejected": -1.0990208387374878, "loss": 1.0477, "odds_ratio_loss": 0.6798087358474731, "rewards/accuracies": 0.6875, "rewards/chosen": -0.09683416783809662, "rewards/margins": 0.013067921623587608, "rewards/rejected": -0.10990208387374878, "sft_loss": 0.9683417081832886, "step": 4 }, { "epoch": 0.0072306579898770785, "grad_norm": 9.072577711772448, "learning_rate": 4e-07, "logits/chosen": 1.0971115827560425, "logits/rejected": 0.9646638631820679, "logps/chosen": -0.6877315044403076, "logps/rejected": -1.0325212478637695, "loss": 0.9536, "odds_ratio_loss": 0.6098527908325195, "rewards/accuracies": 0.75, "rewards/chosen": -0.06877315789461136, "rewards/margins": 0.03447896987199783, "rewards/rejected": -0.10325212776660919, "sft_loss": 0.6877315044403076, "step": 5 }, { "epoch": 0.008676789587852495, "grad_norm": 31.366473181101718, "learning_rate": 4.8e-07, "logits/chosen": 1.1323935985565186, "logits/rejected": 0.6428372263908386, "logps/chosen": -0.9170743227005005, "logps/rejected": -1.1397260427474976, "loss": 1.0515, "odds_ratio_loss": 0.6087831258773804, "rewards/accuracies": 0.6875, "rewards/chosen": -0.09170743823051453, "rewards/margins": 0.022265177220106125, "rewards/rejected": -0.11397261917591095, "sft_loss": 0.9170743227005005, "step": 6 }, { "epoch": 0.01012292118582791, "grad_norm": 23.362133533438314, "learning_rate": 5.6e-07, "logits/chosen": 1.0590741634368896, "logits/rejected": 0.8795340657234192, "logps/chosen": -0.849834144115448, "logps/rejected": -1.0384660959243774, "loss": 0.9954, "odds_ratio_loss": 0.6854482889175415, "rewards/accuracies": 0.625, "rewards/chosen": -0.08498341590166092, "rewards/margins": 0.01886320300400257, "rewards/rejected": -0.10384660959243774, "sft_loss": 0.849834144115448, "step": 7 }, { "epoch": 0.011569052783803326, "grad_norm": 13.59399093179875, "learning_rate": 6.4e-07, "logits/chosen": 0.9279472827911377, "logits/rejected": 0.7937031388282776, "logps/chosen": -0.8301913142204285, "logps/rejected": -0.9943248629570007, "loss": 0.9836, "odds_ratio_loss": 0.6135038137435913, "rewards/accuracies": 0.75, "rewards/chosen": -0.08301912993192673, "rewards/margins": 0.016413355246186256, "rewards/rejected": -0.09943248331546783, "sft_loss": 0.8301913142204285, "step": 8 }, { "epoch": 0.013015184381778741, "grad_norm": 20.87366368622792, "learning_rate": 7.2e-07, "logits/chosen": 0.6361145973205566, "logits/rejected": 0.5400397181510925, "logps/chosen": -1.2082147598266602, "logps/rejected": -1.314789056777954, "loss": 1.0284, "odds_ratio_loss": 0.7909836173057556, "rewards/accuracies": 0.4375, "rewards/chosen": -0.12082147598266602, "rewards/margins": 0.010657444596290588, "rewards/rejected": -0.1314789056777954, "sft_loss": 1.2082147598266602, "step": 9 }, { "epoch": 0.014461315979754157, "grad_norm": 35.6477296168724, "learning_rate": 8e-07, "logits/chosen": 0.7234739065170288, "logits/rejected": 0.7677637338638306, "logps/chosen": -0.9336320161819458, "logps/rejected": -1.1697273254394531, "loss": 0.9868, "odds_ratio_loss": 0.6399700045585632, "rewards/accuracies": 0.625, "rewards/chosen": -0.0933631956577301, "rewards/margins": 0.02360953576862812, "rewards/rejected": -0.11697272211313248, "sft_loss": 0.9336320161819458, "step": 10 }, { "epoch": 0.015907447577729574, "grad_norm": 17.36589859929308, "learning_rate": 8.799999999999999e-07, "logits/chosen": 0.8094204664230347, "logits/rejected": 0.6163336038589478, "logps/chosen": -0.8817548751831055, "logps/rejected": -0.9194456934928894, "loss": 1.0479, "odds_ratio_loss": 0.7149617075920105, "rewards/accuracies": 0.4375, "rewards/chosen": -0.08817549049854279, "rewards/margins": 0.003769081551581621, "rewards/rejected": -0.09194456785917282, "sft_loss": 0.8817548751831055, "step": 11 }, { "epoch": 0.01735357917570499, "grad_norm": 7.785721880156769, "learning_rate": 9.6e-07, "logits/chosen": 0.6122461557388306, "logits/rejected": 0.7143501043319702, "logps/chosen": -0.9489305019378662, "logps/rejected": -1.1337459087371826, "loss": 0.951, "odds_ratio_loss": 0.5687827467918396, "rewards/accuracies": 0.75, "rewards/chosen": -0.09489305317401886, "rewards/margins": 0.018481535837054253, "rewards/rejected": -0.11337459087371826, "sft_loss": 0.9489305019378662, "step": 12 }, { "epoch": 0.018799710773680405, "grad_norm": 11.720887718671964, "learning_rate": 1.04e-06, "logits/chosen": 0.7059281468391418, "logits/rejected": 0.6184044480323792, "logps/chosen": -0.8445357084274292, "logps/rejected": -0.9474976062774658, "loss": 0.8701, "odds_ratio_loss": 0.7172947525978088, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08445357531309128, "rewards/margins": 0.010296182706952095, "rewards/rejected": -0.09474976360797882, "sft_loss": 0.8445357084274292, "step": 13 }, { "epoch": 0.02024584237165582, "grad_norm": 10.973659424574418, "learning_rate": 1.12e-06, "logits/chosen": 0.855961799621582, "logits/rejected": 0.6677908897399902, "logps/chosen": -1.0772851705551147, "logps/rejected": -1.1868722438812256, "loss": 1.004, "odds_ratio_loss": 0.6617684364318848, "rewards/accuracies": 0.5625, "rewards/chosen": -0.107728511095047, "rewards/margins": 0.010958710685372353, "rewards/rejected": -0.1186872273683548, "sft_loss": 1.0772851705551147, "step": 14 }, { "epoch": 0.021691973969631236, "grad_norm": 10.00136448443639, "learning_rate": 1.2e-06, "logits/chosen": 0.8666166663169861, "logits/rejected": 0.7374793887138367, "logps/chosen": -0.6898748278617859, "logps/rejected": -0.9876577854156494, "loss": 0.954, "odds_ratio_loss": 0.5177364349365234, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06898748129606247, "rewards/margins": 0.02977829799056053, "rewards/rejected": -0.0987657755613327, "sft_loss": 0.6898748278617859, "step": 15 }, { "epoch": 0.023138105567606652, "grad_norm": 7.339064707053332, "learning_rate": 1.28e-06, "logits/chosen": 1.0984623432159424, "logits/rejected": 0.9263596534729004, "logps/chosen": -0.9031826853752136, "logps/rejected": -0.9198977947235107, "loss": 1.0056, "odds_ratio_loss": 0.7259780168533325, "rewards/accuracies": 0.5625, "rewards/chosen": -0.09031827002763748, "rewards/margins": 0.001671510748565197, "rewards/rejected": -0.09198978543281555, "sft_loss": 0.9031826853752136, "step": 16 }, { "epoch": 0.024584237165582067, "grad_norm": 8.349121794398082, "learning_rate": 1.3600000000000001e-06, "logits/chosen": 0.958114743232727, "logits/rejected": 0.7234222888946533, "logps/chosen": -0.7752377390861511, "logps/rejected": -1.0817121267318726, "loss": 0.8369, "odds_ratio_loss": 0.5174487233161926, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07752377539873123, "rewards/margins": 0.030647438019514084, "rewards/rejected": -0.10817121714353561, "sft_loss": 0.7752377390861511, "step": 17 }, { "epoch": 0.026030368763557483, "grad_norm": 16.74840406974954, "learning_rate": 1.44e-06, "logits/chosen": 1.036068081855774, "logits/rejected": 0.7755584716796875, "logps/chosen": -0.7796313166618347, "logps/rejected": -0.8467473387718201, "loss": 0.9468, "odds_ratio_loss": 0.7774651050567627, "rewards/accuracies": 0.5, "rewards/chosen": -0.07796312868595123, "rewards/margins": 0.006711603607982397, "rewards/rejected": -0.08467473834753036, "sft_loss": 0.7796313166618347, "step": 18 }, { "epoch": 0.0274765003615329, "grad_norm": 19.71080839311607, "learning_rate": 1.5199999999999998e-06, "logits/chosen": 0.9443542957305908, "logits/rejected": 0.7273315191268921, "logps/chosen": -0.8325945138931274, "logps/rejected": -0.9272810220718384, "loss": 0.8364, "odds_ratio_loss": 0.6661559343338013, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0832594484090805, "rewards/margins": 0.009468646720051765, "rewards/rejected": -0.09272809326648712, "sft_loss": 0.8325945138931274, "step": 19 }, { "epoch": 0.028922631959508314, "grad_norm": 17.932913869480345, "learning_rate": 1.6e-06, "logits/chosen": 0.9203794598579407, "logits/rejected": 0.7660186290740967, "logps/chosen": -0.8127734065055847, "logps/rejected": -0.9844791889190674, "loss": 0.8528, "odds_ratio_loss": 0.6955982446670532, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08127734810113907, "rewards/margins": 0.017170574516057968, "rewards/rejected": -0.09844791889190674, "sft_loss": 0.8127734065055847, "step": 20 }, { "epoch": 0.03036876355748373, "grad_norm": 4.676737103695072, "learning_rate": 1.6799999999999998e-06, "logits/chosen": 1.0346933603286743, "logits/rejected": 0.7640005350112915, "logps/chosen": -0.7656794786453247, "logps/rejected": -0.9085444211959839, "loss": 0.8817, "odds_ratio_loss": 0.5907818675041199, "rewards/accuracies": 0.625, "rewards/chosen": -0.07656795531511307, "rewards/margins": 0.014286492951214314, "rewards/rejected": -0.09085444360971451, "sft_loss": 0.7656794786453247, "step": 21 }, { "epoch": 0.03181489515545915, "grad_norm": 21.219968023645468, "learning_rate": 1.7599999999999999e-06, "logits/chosen": 0.7389025688171387, "logits/rejected": 0.768907904624939, "logps/chosen": -0.740533709526062, "logps/rejected": -0.9216527938842773, "loss": 0.9192, "odds_ratio_loss": 0.7230217456817627, "rewards/accuracies": 0.5, "rewards/chosen": -0.07405337691307068, "rewards/margins": 0.018111903220415115, "rewards/rejected": -0.0921652764081955, "sft_loss": 0.740533709526062, "step": 22 }, { "epoch": 0.033261026753434564, "grad_norm": 5.1186934556905905, "learning_rate": 1.84e-06, "logits/chosen": 1.104678988456726, "logits/rejected": 0.933571457862854, "logps/chosen": -0.8342806696891785, "logps/rejected": -1.0144786834716797, "loss": 0.9273, "odds_ratio_loss": 0.5798680186271667, "rewards/accuracies": 0.75, "rewards/chosen": -0.08342806994915009, "rewards/margins": 0.018019797280430794, "rewards/rejected": -0.10144786536693573, "sft_loss": 0.8342806696891785, "step": 23 }, { "epoch": 0.03470715835140998, "grad_norm": 5.996648991214411, "learning_rate": 1.92e-06, "logits/chosen": 0.6956485509872437, "logits/rejected": 0.5334842801094055, "logps/chosen": -0.8051484823226929, "logps/rejected": -1.0607928037643433, "loss": 0.9043, "odds_ratio_loss": 0.5199005603790283, "rewards/accuracies": 0.875, "rewards/chosen": -0.08051484823226929, "rewards/margins": 0.02556443400681019, "rewards/rejected": -0.10607928782701492, "sft_loss": 0.8051484823226929, "step": 24 }, { "epoch": 0.036153289949385395, "grad_norm": 15.463181593194717, "learning_rate": 2e-06, "logits/chosen": 1.0711913108825684, "logits/rejected": 0.6716903448104858, "logps/chosen": -0.780259370803833, "logps/rejected": -0.8929332494735718, "loss": 0.8245, "odds_ratio_loss": 0.6280741095542908, "rewards/accuracies": 0.625, "rewards/chosen": -0.0780259370803833, "rewards/margins": 0.01126739289611578, "rewards/rejected": -0.08929332345724106, "sft_loss": 0.780259370803833, "step": 25 }, { "epoch": 0.03759942154736081, "grad_norm": 6.4509394430823725, "learning_rate": 2.08e-06, "logits/chosen": 0.9420080184936523, "logits/rejected": 0.8971505165100098, "logps/chosen": -0.6603622436523438, "logps/rejected": -0.9678102135658264, "loss": 0.8143, "odds_ratio_loss": 0.5521212816238403, "rewards/accuracies": 0.75, "rewards/chosen": -0.06603622436523438, "rewards/margins": 0.030744802206754684, "rewards/rejected": -0.09678103029727936, "sft_loss": 0.6603622436523438, "step": 26 }, { "epoch": 0.039045553145336226, "grad_norm": 5.904770765139368, "learning_rate": 2.16e-06, "logits/chosen": 1.0589203834533691, "logits/rejected": 1.0087999105453491, "logps/chosen": -0.9654819965362549, "logps/rejected": -1.0529497861862183, "loss": 0.9155, "odds_ratio_loss": 0.7037606239318848, "rewards/accuracies": 0.5625, "rewards/chosen": -0.09654819965362549, "rewards/margins": 0.008746780455112457, "rewards/rejected": -0.10529498755931854, "sft_loss": 0.9654819965362549, "step": 27 }, { "epoch": 0.04049168474331164, "grad_norm": 6.804983128940474, "learning_rate": 2.24e-06, "logits/chosen": 1.0854326486587524, "logits/rejected": 0.7822256684303284, "logps/chosen": -0.8072190284729004, "logps/rejected": -1.1046161651611328, "loss": 0.8532, "odds_ratio_loss": 0.5510303378105164, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0807219073176384, "rewards/margins": 0.029739707708358765, "rewards/rejected": -0.11046161502599716, "sft_loss": 0.8072190284729004, "step": 28 }, { "epoch": 0.04193781634128706, "grad_norm": 11.155540190844036, "learning_rate": 2.32e-06, "logits/chosen": 0.9284826517105103, "logits/rejected": 0.7190419435501099, "logps/chosen": -0.84559166431427, "logps/rejected": -0.9231205582618713, "loss": 0.9285, "odds_ratio_loss": 0.6862574815750122, "rewards/accuracies": 0.5, "rewards/chosen": -0.08455916494131088, "rewards/margins": 0.007752885576337576, "rewards/rejected": -0.0923120528459549, "sft_loss": 0.84559166431427, "step": 29 }, { "epoch": 0.04338394793926247, "grad_norm": 7.130648176543995, "learning_rate": 2.4e-06, "logits/chosen": 0.9986206293106079, "logits/rejected": 0.7722131013870239, "logps/chosen": -0.6998481154441833, "logps/rejected": -1.2671633958816528, "loss": 0.8881, "odds_ratio_loss": 0.4308162033557892, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0699848085641861, "rewards/margins": 0.05673152208328247, "rewards/rejected": -0.12671633064746857, "sft_loss": 0.6998481154441833, "step": 30 }, { "epoch": 0.04483007953723789, "grad_norm": 7.964095468445476, "learning_rate": 2.48e-06, "logits/chosen": 1.0034689903259277, "logits/rejected": 0.8177412152290344, "logps/chosen": -0.7045868635177612, "logps/rejected": -1.0203757286071777, "loss": 0.8301, "odds_ratio_loss": 0.4992942214012146, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07045868784189224, "rewards/margins": 0.031578898429870605, "rewards/rejected": -0.10203758627176285, "sft_loss": 0.7045868635177612, "step": 31 }, { "epoch": 0.046276211135213303, "grad_norm": 6.206570705290645, "learning_rate": 2.56e-06, "logits/chosen": 0.8828554749488831, "logits/rejected": 0.8245723247528076, "logps/chosen": -0.7559677362442017, "logps/rejected": -0.7688103318214417, "loss": 0.9242, "odds_ratio_loss": 0.6954567432403564, "rewards/accuracies": 0.5, "rewards/chosen": -0.07559677958488464, "rewards/margins": 0.001284262165427208, "rewards/rejected": -0.0768810361623764, "sft_loss": 0.7559677362442017, "step": 32 }, { "epoch": 0.04772234273318872, "grad_norm": 3.819558444843904, "learning_rate": 2.64e-06, "logits/chosen": 0.9170816540718079, "logits/rejected": 0.6471152305603027, "logps/chosen": -0.7481220960617065, "logps/rejected": -1.044447422027588, "loss": 0.8164, "odds_ratio_loss": 0.5162819623947144, "rewards/accuracies": 0.75, "rewards/chosen": -0.07481221854686737, "rewards/margins": 0.029632527381181717, "rewards/rejected": -0.10444474220275879, "sft_loss": 0.7481220960617065, "step": 33 }, { "epoch": 0.049168474331164135, "grad_norm": 17.468727974038213, "learning_rate": 2.7200000000000002e-06, "logits/chosen": 0.9004616737365723, "logits/rejected": 0.9549974203109741, "logps/chosen": -0.7914824485778809, "logps/rejected": -0.9024128317832947, "loss": 0.7975, "odds_ratio_loss": 0.6575660705566406, "rewards/accuracies": 0.5, "rewards/chosen": -0.07914824038743973, "rewards/margins": 0.011093037202954292, "rewards/rejected": -0.09024128317832947, "sft_loss": 0.7914824485778809, "step": 34 }, { "epoch": 0.05061460592913955, "grad_norm": 4.6049278375914415, "learning_rate": 2.8e-06, "logits/chosen": 1.0355346202850342, "logits/rejected": 0.8618839979171753, "logps/chosen": -0.6818061470985413, "logps/rejected": -1.0076775550842285, "loss": 0.8934, "odds_ratio_loss": 0.48088353872299194, "rewards/accuracies": 0.8125, "rewards/chosen": -0.068180613219738, "rewards/margins": 0.03258715197443962, "rewards/rejected": -0.10076776891946793, "sft_loss": 0.6818061470985413, "step": 35 }, { "epoch": 0.052060737527114966, "grad_norm": 10.126587564107808, "learning_rate": 2.88e-06, "logits/chosen": 1.0104275941848755, "logits/rejected": 0.8126014471054077, "logps/chosen": -1.0583109855651855, "logps/rejected": -1.188212513923645, "loss": 0.9078, "odds_ratio_loss": 0.6410647630691528, "rewards/accuracies": 0.625, "rewards/chosen": -0.1058310940861702, "rewards/margins": 0.012990152463316917, "rewards/rejected": -0.11882124096155167, "sft_loss": 1.0583109855651855, "step": 36 }, { "epoch": 0.05350686912509038, "grad_norm": 5.195790645822001, "learning_rate": 2.96e-06, "logits/chosen": 0.8879537582397461, "logits/rejected": 0.8749307990074158, "logps/chosen": -0.9341248273849487, "logps/rejected": -1.0359203815460205, "loss": 0.8659, "odds_ratio_loss": 0.6952499151229858, "rewards/accuracies": 0.5, "rewards/chosen": -0.09341248869895935, "rewards/margins": 0.0101795494556427, "rewards/rejected": -0.10359203815460205, "sft_loss": 0.9341248273849487, "step": 37 }, { "epoch": 0.0549530007230658, "grad_norm": 7.028715936791847, "learning_rate": 3.0399999999999997e-06, "logits/chosen": 0.8880399465560913, "logits/rejected": 0.7450892329216003, "logps/chosen": -0.7892625331878662, "logps/rejected": -0.7776023745536804, "loss": 0.8412, "odds_ratio_loss": 0.7238245010375977, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07892625778913498, "rewards/margins": -0.0011660188902169466, "rewards/rejected": -0.0777602344751358, "sft_loss": 0.7892625331878662, "step": 38 }, { "epoch": 0.05639913232104121, "grad_norm": 5.64693417602307, "learning_rate": 3.1199999999999998e-06, "logits/chosen": 0.9538003206253052, "logits/rejected": 0.8920505046844482, "logps/chosen": -0.6508951187133789, "logps/rejected": -1.040766954421997, "loss": 0.8128, "odds_ratio_loss": 0.4561525881290436, "rewards/accuracies": 0.875, "rewards/chosen": -0.06508950889110565, "rewards/margins": 0.03898719325661659, "rewards/rejected": -0.10407670587301254, "sft_loss": 0.6508951187133789, "step": 39 }, { "epoch": 0.05784526391901663, "grad_norm": 4.834895359786458, "learning_rate": 3.2e-06, "logits/chosen": 1.149649977684021, "logits/rejected": 0.9461228251457214, "logps/chosen": -0.9168272614479065, "logps/rejected": -1.1571921110153198, "loss": 0.8322, "odds_ratio_loss": 0.6043152809143066, "rewards/accuracies": 0.75, "rewards/chosen": -0.09168273210525513, "rewards/margins": 0.02403649315237999, "rewards/rejected": -0.11571922153234482, "sft_loss": 0.9168272614479065, "step": 40 }, { "epoch": 0.05929139551699204, "grad_norm": 4.061085611051016, "learning_rate": 3.2799999999999995e-06, "logits/chosen": 1.1228294372558594, "logits/rejected": 0.8763277530670166, "logps/chosen": -0.7228974103927612, "logps/rejected": -0.8757832050323486, "loss": 0.8719, "odds_ratio_loss": 0.6970602869987488, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07228974252939224, "rewards/margins": 0.015288583002984524, "rewards/rejected": -0.08757832646369934, "sft_loss": 0.7228974103927612, "step": 41 }, { "epoch": 0.06073752711496746, "grad_norm": 4.580481948452303, "learning_rate": 3.3599999999999996e-06, "logits/chosen": 1.060896873474121, "logits/rejected": 0.8334064483642578, "logps/chosen": -0.6850025653839111, "logps/rejected": -1.0060864686965942, "loss": 0.7318, "odds_ratio_loss": 0.5280728340148926, "rewards/accuracies": 0.75, "rewards/chosen": -0.06850025802850723, "rewards/margins": 0.03210839629173279, "rewards/rejected": -0.10060865432024002, "sft_loss": 0.6850025653839111, "step": 42 }, { "epoch": 0.06218365871294288, "grad_norm": 5.169320832414527, "learning_rate": 3.4399999999999997e-06, "logits/chosen": 1.1155328750610352, "logits/rejected": 0.7692084908485413, "logps/chosen": -0.6570387482643127, "logps/rejected": -0.9917082786560059, "loss": 0.7906, "odds_ratio_loss": 0.5790050625801086, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0657038763165474, "rewards/margins": 0.03346695005893707, "rewards/rejected": -0.09917082637548447, "sft_loss": 0.6570387482643127, "step": 43 }, { "epoch": 0.0636297903109183, "grad_norm": 3.998113789272701, "learning_rate": 3.5199999999999998e-06, "logits/chosen": 0.9994779229164124, "logits/rejected": 0.7162806391716003, "logps/chosen": -0.7262680530548096, "logps/rejected": -0.9695625305175781, "loss": 0.8926, "odds_ratio_loss": 0.5879533886909485, "rewards/accuracies": 0.75, "rewards/chosen": -0.07262679934501648, "rewards/margins": 0.024329453706741333, "rewards/rejected": -0.09695626050233841, "sft_loss": 0.7262680530548096, "step": 44 }, { "epoch": 0.0650759219088937, "grad_norm": 3.4506987115134757, "learning_rate": 3.6e-06, "logits/chosen": 0.8653795123100281, "logits/rejected": 0.7805371284484863, "logps/chosen": -0.7925112247467041, "logps/rejected": -1.1502509117126465, "loss": 0.7794, "odds_ratio_loss": 0.5484931468963623, "rewards/accuracies": 0.5, "rewards/chosen": -0.07925112545490265, "rewards/margins": 0.035773973912000656, "rewards/rejected": -0.11502508819103241, "sft_loss": 0.7925112247467041, "step": 45 }, { "epoch": 0.06652205350686913, "grad_norm": 2.9338860329360443, "learning_rate": 3.68e-06, "logits/chosen": 0.9942594766616821, "logits/rejected": 0.6031086444854736, "logps/chosen": -0.7195640802383423, "logps/rejected": -0.9865086674690247, "loss": 0.8429, "odds_ratio_loss": 0.5378471612930298, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07195641100406647, "rewards/margins": 0.026694457978010178, "rewards/rejected": -0.09865086525678635, "sft_loss": 0.7195640802383423, "step": 46 }, { "epoch": 0.06796818510484454, "grad_norm": 18.444891203967817, "learning_rate": 3.7599999999999996e-06, "logits/chosen": 1.0228843688964844, "logits/rejected": 0.6981637477874756, "logps/chosen": -0.8705933094024658, "logps/rejected": -1.2415589094161987, "loss": 0.9371, "odds_ratio_loss": 0.8004380464553833, "rewards/accuracies": 0.625, "rewards/chosen": -0.08705933392047882, "rewards/margins": 0.037096552550792694, "rewards/rejected": -0.12415588647127151, "sft_loss": 0.8705933094024658, "step": 47 }, { "epoch": 0.06941431670281996, "grad_norm": 4.737185826037039, "learning_rate": 3.84e-06, "logits/chosen": 1.0492089986801147, "logits/rejected": 0.6930069327354431, "logps/chosen": -0.8137787580490112, "logps/rejected": -1.0236539840698242, "loss": 0.8259, "odds_ratio_loss": 0.6219763159751892, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08137787878513336, "rewards/margins": 0.020987525582313538, "rewards/rejected": -0.1023654043674469, "sft_loss": 0.8137787580490112, "step": 48 }, { "epoch": 0.07086044830079537, "grad_norm": 20.118458343186862, "learning_rate": 3.92e-06, "logits/chosen": 0.8911822438240051, "logits/rejected": 0.6944270730018616, "logps/chosen": -1.052188515663147, "logps/rejected": -1.200345754623413, "loss": 1.0211, "odds_ratio_loss": 0.6875659823417664, "rewards/accuracies": 0.4375, "rewards/chosen": -0.10521887242794037, "rewards/margins": 0.01481570117175579, "rewards/rejected": -0.12003456056118011, "sft_loss": 1.052188515663147, "step": 49 }, { "epoch": 0.07230657989877079, "grad_norm": 5.7579792293688845, "learning_rate": 4e-06, "logits/chosen": 0.7346884608268738, "logits/rejected": 0.660639226436615, "logps/chosen": -0.9779757261276245, "logps/rejected": -1.0913139581680298, "loss": 0.9103, "odds_ratio_loss": 0.6801553964614868, "rewards/accuracies": 0.625, "rewards/chosen": -0.09779756516218185, "rewards/margins": 0.0113338278606534, "rewards/rejected": -0.10913139581680298, "sft_loss": 0.9779757261276245, "step": 50 }, { "epoch": 0.0737527114967462, "grad_norm": 3.985226415418088, "learning_rate": 4.08e-06, "logits/chosen": 1.0270240306854248, "logits/rejected": 0.8079010248184204, "logps/chosen": -0.6934676766395569, "logps/rejected": -1.0339841842651367, "loss": 0.8186, "odds_ratio_loss": 0.4910920560359955, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06934677064418793, "rewards/margins": 0.034051645547151566, "rewards/rejected": -0.1033984124660492, "sft_loss": 0.6934676766395569, "step": 51 }, { "epoch": 0.07519884309472162, "grad_norm": 7.042436013353386, "learning_rate": 4.16e-06, "logits/chosen": 0.9413070678710938, "logits/rejected": 0.7744332551956177, "logps/chosen": -0.7994893193244934, "logps/rejected": -0.9427841901779175, "loss": 0.9243, "odds_ratio_loss": 0.6500593423843384, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07994893193244934, "rewards/margins": 0.014329486526548862, "rewards/rejected": -0.09427841752767563, "sft_loss": 0.7994893193244934, "step": 52 }, { "epoch": 0.07664497469269703, "grad_norm": 7.37511851741436, "learning_rate": 4.24e-06, "logits/chosen": 1.0415973663330078, "logits/rejected": 0.8271567821502686, "logps/chosen": -0.7861368656158447, "logps/rejected": -0.9846646189689636, "loss": 0.798, "odds_ratio_loss": 0.613193690776825, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07861369103193283, "rewards/margins": 0.01985277608036995, "rewards/rejected": -0.09846646338701248, "sft_loss": 0.7861368656158447, "step": 53 }, { "epoch": 0.07809110629067245, "grad_norm": 5.530049183138087, "learning_rate": 4.32e-06, "logits/chosen": 1.035746455192566, "logits/rejected": 0.6262372732162476, "logps/chosen": -0.7784286737442017, "logps/rejected": -1.2029107809066772, "loss": 0.8983, "odds_ratio_loss": 0.49279114603996277, "rewards/accuracies": 0.75, "rewards/chosen": -0.07784287631511688, "rewards/margins": 0.042448196560144424, "rewards/rejected": -0.1202910766005516, "sft_loss": 0.7784286737442017, "step": 54 }, { "epoch": 0.07953723788864786, "grad_norm": 6.675216545716233, "learning_rate": 4.4e-06, "logits/chosen": 0.8669562935829163, "logits/rejected": 0.6869562268257141, "logps/chosen": -0.9464516639709473, "logps/rejected": -1.1197850704193115, "loss": 0.8484, "odds_ratio_loss": 0.608762800693512, "rewards/accuracies": 0.6875, "rewards/chosen": -0.09464516490697861, "rewards/margins": 0.017333339899778366, "rewards/rejected": -0.11197850108146667, "sft_loss": 0.9464516639709473, "step": 55 }, { "epoch": 0.08098336948662328, "grad_norm": 9.900030241867466, "learning_rate": 4.48e-06, "logits/chosen": 0.9601394534111023, "logits/rejected": 0.7478969097137451, "logps/chosen": -0.6744643449783325, "logps/rejected": -1.155207872390747, "loss": 0.8861, "odds_ratio_loss": 0.47762393951416016, "rewards/accuracies": 0.875, "rewards/chosen": -0.06744643300771713, "rewards/margins": 0.04807435721158981, "rewards/rejected": -0.11552079021930695, "sft_loss": 0.6744643449783325, "step": 56 }, { "epoch": 0.0824295010845987, "grad_norm": 5.686458237142553, "learning_rate": 4.5599999999999995e-06, "logits/chosen": 1.0790021419525146, "logits/rejected": 1.103777289390564, "logps/chosen": -0.7211272716522217, "logps/rejected": -0.8694078922271729, "loss": 0.8806, "odds_ratio_loss": 0.6249120235443115, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07211272418498993, "rewards/margins": 0.014828063547611237, "rewards/rejected": -0.08694078773260117, "sft_loss": 0.7211272716522217, "step": 57 }, { "epoch": 0.08387563268257411, "grad_norm": 8.777827739224744, "learning_rate": 4.64e-06, "logits/chosen": 1.056907296180725, "logits/rejected": 0.8694510459899902, "logps/chosen": -0.7749738097190857, "logps/rejected": -1.1565377712249756, "loss": 0.8714, "odds_ratio_loss": 0.5057281255722046, "rewards/accuracies": 0.75, "rewards/chosen": -0.07749737799167633, "rewards/margins": 0.0381564125418663, "rewards/rejected": -0.11565379798412323, "sft_loss": 0.7749738097190857, "step": 58 }, { "epoch": 0.08532176428054954, "grad_norm": 3.0283569241613524, "learning_rate": 4.72e-06, "logits/chosen": 0.9298826456069946, "logits/rejected": 0.6310389041900635, "logps/chosen": -0.6535084843635559, "logps/rejected": -1.1013919115066528, "loss": 0.7376, "odds_ratio_loss": 0.42935678362846375, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06535085290670395, "rewards/margins": 0.04478834196925163, "rewards/rejected": -0.11013919860124588, "sft_loss": 0.6535084843635559, "step": 59 }, { "epoch": 0.08676789587852494, "grad_norm": 4.29846372956217, "learning_rate": 4.8e-06, "logits/chosen": 1.125542163848877, "logits/rejected": 0.9014356732368469, "logps/chosen": -0.6315572261810303, "logps/rejected": -1.0562748908996582, "loss": 0.7112, "odds_ratio_loss": 0.4755699634552002, "rewards/accuracies": 0.75, "rewards/chosen": -0.06315572559833527, "rewards/margins": 0.04247176647186279, "rewards/rejected": -0.10562749207019806, "sft_loss": 0.6315572261810303, "step": 60 }, { "epoch": 0.08821402747650037, "grad_norm": 3.541070990089084, "learning_rate": 4.88e-06, "logits/chosen": 1.202345848083496, "logits/rejected": 1.018904209136963, "logps/chosen": -0.5439103841781616, "logps/rejected": -0.8144343495368958, "loss": 0.7895, "odds_ratio_loss": 0.5063339471817017, "rewards/accuracies": 0.75, "rewards/chosen": -0.054391033947467804, "rewards/margins": 0.02705240435898304, "rewards/rejected": -0.0814434364438057, "sft_loss": 0.5439103841781616, "step": 61 }, { "epoch": 0.08966015907447578, "grad_norm": 8.368866284697733, "learning_rate": 4.96e-06, "logits/chosen": 1.0831687450408936, "logits/rejected": 1.0173500776290894, "logps/chosen": -0.7749509811401367, "logps/rejected": -0.9368792176246643, "loss": 0.9018, "odds_ratio_loss": 0.6482864618301392, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07749509811401367, "rewards/margins": 0.01619281992316246, "rewards/rejected": -0.09368792176246643, "sft_loss": 0.7749509811401367, "step": 62 }, { "epoch": 0.0911062906724512, "grad_norm": 18.13366140105839, "learning_rate": 5.04e-06, "logits/chosen": 0.8514754176139832, "logits/rejected": 0.6560556888580322, "logps/chosen": -1.131075382232666, "logps/rejected": -1.4477269649505615, "loss": 1.0699, "odds_ratio_loss": 0.6726313233375549, "rewards/accuracies": 0.5, "rewards/chosen": -0.11310753226280212, "rewards/margins": 0.03166516125202179, "rewards/rejected": -0.1447726935148239, "sft_loss": 1.131075382232666, "step": 63 }, { "epoch": 0.09255242227042661, "grad_norm": 5.535853982596553, "learning_rate": 5.12e-06, "logits/chosen": 1.1037675142288208, "logits/rejected": 0.8205037713050842, "logps/chosen": -0.712691068649292, "logps/rejected": -0.9677368998527527, "loss": 0.8002, "odds_ratio_loss": 0.5520382523536682, "rewards/accuracies": 0.625, "rewards/chosen": -0.07126910239458084, "rewards/margins": 0.025504592806100845, "rewards/rejected": -0.09677369147539139, "sft_loss": 0.712691068649292, "step": 64 }, { "epoch": 0.09399855386840203, "grad_norm": 3.238469408471894, "learning_rate": 5.2e-06, "logits/chosen": 0.8979614973068237, "logits/rejected": 0.8939811587333679, "logps/chosen": -0.9045255184173584, "logps/rejected": -1.1835026741027832, "loss": 0.8535, "odds_ratio_loss": 0.5290065407752991, "rewards/accuracies": 0.875, "rewards/chosen": -0.09045255184173584, "rewards/margins": 0.027897723019123077, "rewards/rejected": -0.11835027486085892, "sft_loss": 0.9045255184173584, "step": 65 }, { "epoch": 0.09544468546637744, "grad_norm": 3.9235480660049364, "learning_rate": 5.28e-06, "logits/chosen": 1.0252755880355835, "logits/rejected": 0.805016040802002, "logps/chosen": -0.6900225877761841, "logps/rejected": -1.1014788150787354, "loss": 0.8608, "odds_ratio_loss": 0.4513680636882782, "rewards/accuracies": 0.75, "rewards/chosen": -0.06900225579738617, "rewards/margins": 0.041145630180835724, "rewards/rejected": -0.11014789342880249, "sft_loss": 0.6900225877761841, "step": 66 }, { "epoch": 0.09689081706435286, "grad_norm": 4.096161339643125, "learning_rate": 5.36e-06, "logits/chosen": 0.8171386122703552, "logits/rejected": 0.8907574415206909, "logps/chosen": -0.7569859027862549, "logps/rejected": -0.9295750856399536, "loss": 0.8071, "odds_ratio_loss": 0.5909979343414307, "rewards/accuracies": 0.75, "rewards/chosen": -0.07569858431816101, "rewards/margins": 0.01725892536342144, "rewards/rejected": -0.0929575189948082, "sft_loss": 0.7569859027862549, "step": 67 }, { "epoch": 0.09833694866232827, "grad_norm": 3.4880666252048034, "learning_rate": 5.4400000000000004e-06, "logits/chosen": 1.0990238189697266, "logits/rejected": 0.8180453777313232, "logps/chosen": -0.5353269577026367, "logps/rejected": -1.0225234031677246, "loss": 0.7102, "odds_ratio_loss": 0.44182831048965454, "rewards/accuracies": 0.8125, "rewards/chosen": -0.05353269353508949, "rewards/margins": 0.04871963709592819, "rewards/rejected": -0.10225233435630798, "sft_loss": 0.5353269577026367, "step": 68 }, { "epoch": 0.09978308026030369, "grad_norm": 4.957151765389429, "learning_rate": 5.52e-06, "logits/chosen": 1.0731085538864136, "logits/rejected": 1.0340485572814941, "logps/chosen": -0.779766321182251, "logps/rejected": -0.9066184163093567, "loss": 0.7684, "odds_ratio_loss": 0.6572247743606567, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07797663658857346, "rewards/margins": 0.012685209512710571, "rewards/rejected": -0.09066184610128403, "sft_loss": 0.779766321182251, "step": 69 }, { "epoch": 0.1012292118582791, "grad_norm": 3.1824426288429173, "learning_rate": 5.6e-06, "logits/chosen": 1.1628271341323853, "logits/rejected": 0.8257616758346558, "logps/chosen": -0.6837524771690369, "logps/rejected": -1.061213731765747, "loss": 0.8337, "odds_ratio_loss": 0.46075576543807983, "rewards/accuracies": 0.875, "rewards/chosen": -0.06837525963783264, "rewards/margins": 0.03774610906839371, "rewards/rejected": -0.10612136870622635, "sft_loss": 0.6837524771690369, "step": 70 }, { "epoch": 0.10267534345625452, "grad_norm": 2.9877543497021897, "learning_rate": 5.68e-06, "logits/chosen": 1.220341682434082, "logits/rejected": 0.8999754190444946, "logps/chosen": -0.7730412483215332, "logps/rejected": -0.9635111093521118, "loss": 0.811, "odds_ratio_loss": 0.6032207012176514, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07730412483215332, "rewards/margins": 0.01904698833823204, "rewards/rejected": -0.09635111689567566, "sft_loss": 0.7730412483215332, "step": 71 }, { "epoch": 0.10412147505422993, "grad_norm": 3.68575891866247, "learning_rate": 5.76e-06, "logits/chosen": 1.1874583959579468, "logits/rejected": 1.072020411491394, "logps/chosen": -0.9427322149276733, "logps/rejected": -1.102614402770996, "loss": 0.8952, "odds_ratio_loss": 0.6729242205619812, "rewards/accuracies": 0.5, "rewards/chosen": -0.09427322447299957, "rewards/margins": 0.015988217666745186, "rewards/rejected": -0.11026144027709961, "sft_loss": 0.9427322149276733, "step": 72 }, { "epoch": 0.10556760665220535, "grad_norm": 2.8520049474926448, "learning_rate": 5.84e-06, "logits/chosen": 0.9496423006057739, "logits/rejected": 0.68682461977005, "logps/chosen": -0.84450763463974, "logps/rejected": -1.4252830743789673, "loss": 0.824, "odds_ratio_loss": 0.4710613489151001, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08445076644420624, "rewards/margins": 0.058077551424503326, "rewards/rejected": -0.14252832531929016, "sft_loss": 0.84450763463974, "step": 73 }, { "epoch": 0.10701373825018076, "grad_norm": 4.909446839208547, "learning_rate": 5.92e-06, "logits/chosen": 1.0577809810638428, "logits/rejected": 1.0075559616088867, "logps/chosen": -0.7429163455963135, "logps/rejected": -1.2081857919692993, "loss": 0.8528, "odds_ratio_loss": 0.47998422384262085, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07429163157939911, "rewards/margins": 0.046526938676834106, "rewards/rejected": -0.12081858515739441, "sft_loss": 0.7429163455963135, "step": 74 }, { "epoch": 0.10845986984815618, "grad_norm": 8.719192985116054, "learning_rate": 6e-06, "logits/chosen": 1.0399186611175537, "logits/rejected": 0.7236427068710327, "logps/chosen": -0.8451824188232422, "logps/rejected": -1.1076613664627075, "loss": 0.9515, "odds_ratio_loss": 0.5856386423110962, "rewards/accuracies": 0.75, "rewards/chosen": -0.08451823890209198, "rewards/margins": 0.026247896254062653, "rewards/rejected": -0.11076614260673523, "sft_loss": 0.8451824188232422, "step": 75 }, { "epoch": 0.1099060014461316, "grad_norm": 3.242774804776767, "learning_rate": 6.079999999999999e-06, "logits/chosen": 1.170623540878296, "logits/rejected": 0.9636479616165161, "logps/chosen": -0.7744165658950806, "logps/rejected": -0.9548482298851013, "loss": 0.8192, "odds_ratio_loss": 0.6630096435546875, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07744166254997253, "rewards/margins": 0.018043160438537598, "rewards/rejected": -0.09548482298851013, "sft_loss": 0.7744165658950806, "step": 76 }, { "epoch": 0.11135213304410702, "grad_norm": 5.606115194740814, "learning_rate": 6.1599999999999995e-06, "logits/chosen": 1.2055904865264893, "logits/rejected": 0.8868299126625061, "logps/chosen": -1.0346038341522217, "logps/rejected": -0.9845407009124756, "loss": 0.9325, "odds_ratio_loss": 0.8492572903633118, "rewards/accuracies": 0.375, "rewards/chosen": -0.1034604012966156, "rewards/margins": -0.005006318911910057, "rewards/rejected": -0.0984540730714798, "sft_loss": 1.0346038341522217, "step": 77 }, { "epoch": 0.11279826464208242, "grad_norm": 11.819359350343193, "learning_rate": 6.2399999999999995e-06, "logits/chosen": 1.2517834901809692, "logits/rejected": 0.7903429269790649, "logps/chosen": -0.7485107779502869, "logps/rejected": -1.4334845542907715, "loss": 0.8361, "odds_ratio_loss": 0.37815889716148376, "rewards/accuracies": 0.875, "rewards/chosen": -0.07485108077526093, "rewards/margins": 0.06849737465381622, "rewards/rejected": -0.14334845542907715, "sft_loss": 0.7485107779502869, "step": 78 }, { "epoch": 0.11424439624005785, "grad_norm": 3.8517494454673367, "learning_rate": 6.32e-06, "logits/chosen": 1.035347580909729, "logits/rejected": 0.7966654896736145, "logps/chosen": -0.7934662103652954, "logps/rejected": -1.0752906799316406, "loss": 0.784, "odds_ratio_loss": 0.5487212538719177, "rewards/accuracies": 0.75, "rewards/chosen": -0.07934662699699402, "rewards/margins": 0.028182443231344223, "rewards/rejected": -0.10752906650304794, "sft_loss": 0.7934662103652954, "step": 79 }, { "epoch": 0.11569052783803326, "grad_norm": 5.067148096067779, "learning_rate": 6.4e-06, "logits/chosen": 1.0562264919281006, "logits/rejected": 0.9451851844787598, "logps/chosen": -0.8409098386764526, "logps/rejected": -1.0445072650909424, "loss": 0.8479, "odds_ratio_loss": 0.5876671671867371, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08409098535776138, "rewards/margins": 0.020359739661216736, "rewards/rejected": -0.10445072501897812, "sft_loss": 0.8409098386764526, "step": 80 }, { "epoch": 0.11713665943600868, "grad_norm": 3.52750504575538, "learning_rate": 6.48e-06, "logits/chosen": 0.8893054127693176, "logits/rejected": 0.852543830871582, "logps/chosen": -0.799626350402832, "logps/rejected": -1.193359613418579, "loss": 0.819, "odds_ratio_loss": 0.514946460723877, "rewards/accuracies": 0.75, "rewards/chosen": -0.07996264100074768, "rewards/margins": 0.03937332704663277, "rewards/rejected": -0.11933596432209015, "sft_loss": 0.799626350402832, "step": 81 }, { "epoch": 0.11858279103398409, "grad_norm": 7.884839298424228, "learning_rate": 6.559999999999999e-06, "logits/chosen": 0.9608660936355591, "logits/rejected": 0.7938075065612793, "logps/chosen": -0.905261218547821, "logps/rejected": -1.2442588806152344, "loss": 0.9495, "odds_ratio_loss": 0.7865235209465027, "rewards/accuracies": 0.8125, "rewards/chosen": -0.09052613377571106, "rewards/margins": 0.03389975428581238, "rewards/rejected": -0.12442588806152344, "sft_loss": 0.905261218547821, "step": 82 }, { "epoch": 0.12002892263195951, "grad_norm": 3.7300532504299255, "learning_rate": 6.639999999999999e-06, "logits/chosen": 0.8842788934707642, "logits/rejected": 0.8357954025268555, "logps/chosen": -0.8731819987297058, "logps/rejected": -0.9422979950904846, "loss": 0.8644, "odds_ratio_loss": 0.6957225203514099, "rewards/accuracies": 0.5, "rewards/chosen": -0.08731820434331894, "rewards/margins": 0.0069116028025746346, "rewards/rejected": -0.0942298099398613, "sft_loss": 0.8731819987297058, "step": 83 }, { "epoch": 0.12147505422993492, "grad_norm": 4.523959440898583, "learning_rate": 6.719999999999999e-06, "logits/chosen": 0.8383230566978455, "logits/rejected": 0.9067773222923279, "logps/chosen": -0.9250085353851318, "logps/rejected": -1.1533596515655518, "loss": 0.9015, "odds_ratio_loss": 0.6070276498794556, "rewards/accuracies": 0.625, "rewards/chosen": -0.09250085055828094, "rewards/margins": 0.022835111245512962, "rewards/rejected": -0.11533597111701965, "sft_loss": 0.9250085353851318, "step": 84 }, { "epoch": 0.12292118582791034, "grad_norm": 4.009564684524501, "learning_rate": 6.799999999999999e-06, "logits/chosen": 1.1185507774353027, "logits/rejected": 0.9861254096031189, "logps/chosen": -0.7019460201263428, "logps/rejected": -1.3582534790039062, "loss": 0.7674, "odds_ratio_loss": 0.38255906105041504, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07019460946321487, "rewards/margins": 0.06563074886798859, "rewards/rejected": -0.13582536578178406, "sft_loss": 0.7019460201263428, "step": 85 }, { "epoch": 0.12436731742588576, "grad_norm": 4.978681123457067, "learning_rate": 6.879999999999999e-06, "logits/chosen": 1.1274616718292236, "logits/rejected": 1.0533956289291382, "logps/chosen": -0.8610905408859253, "logps/rejected": -1.079627275466919, "loss": 1.054, "odds_ratio_loss": 0.6273439526557922, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08610905706882477, "rewards/margins": 0.021853668615221977, "rewards/rejected": -0.1079627275466919, "sft_loss": 0.8610905408859253, "step": 86 }, { "epoch": 0.12581344902386118, "grad_norm": 3.3881381629270027, "learning_rate": 6.9599999999999994e-06, "logits/chosen": 0.7648379802703857, "logits/rejected": 0.7650954723358154, "logps/chosen": -0.9075650572776794, "logps/rejected": -0.9574316740036011, "loss": 0.832, "odds_ratio_loss": 0.740204930305481, "rewards/accuracies": 0.5625, "rewards/chosen": -0.09075652062892914, "rewards/margins": 0.0049866605550050735, "rewards/rejected": -0.09574317187070847, "sft_loss": 0.9075650572776794, "step": 87 }, { "epoch": 0.1272595806218366, "grad_norm": 4.3362701314683685, "learning_rate": 7.0399999999999995e-06, "logits/chosen": 1.11726975440979, "logits/rejected": 0.7689473032951355, "logps/chosen": -0.806981086730957, "logps/rejected": -1.3281915187835693, "loss": 0.7672, "odds_ratio_loss": 0.45331400632858276, "rewards/accuracies": 0.75, "rewards/chosen": -0.08069811016321182, "rewards/margins": 0.05212104693055153, "rewards/rejected": -0.13281914591789246, "sft_loss": 0.806981086730957, "step": 88 }, { "epoch": 0.128705712219812, "grad_norm": 5.657518637155685, "learning_rate": 7.12e-06, "logits/chosen": 0.9810636043548584, "logits/rejected": 0.6966180205345154, "logps/chosen": -0.844375491142273, "logps/rejected": -1.1458146572113037, "loss": 0.8386, "odds_ratio_loss": 0.5362038612365723, "rewards/accuracies": 0.75, "rewards/chosen": -0.08443755656480789, "rewards/margins": 0.03014390543103218, "rewards/rejected": -0.11458146572113037, "sft_loss": 0.844375491142273, "step": 89 }, { "epoch": 0.1301518438177874, "grad_norm": 4.112138737639026, "learning_rate": 7.2e-06, "logits/chosen": 1.0501034259796143, "logits/rejected": 0.9047867059707642, "logps/chosen": -0.7257125973701477, "logps/rejected": -0.9122753739356995, "loss": 0.894, "odds_ratio_loss": 0.5964730381965637, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07257126271724701, "rewards/margins": 0.018656279891729355, "rewards/rejected": -0.09122754633426666, "sft_loss": 0.7257125973701477, "step": 90 }, { "epoch": 0.13159797541576285, "grad_norm": 3.329297429235451, "learning_rate": 7.28e-06, "logits/chosen": 0.9696775078773499, "logits/rejected": 0.7746231555938721, "logps/chosen": -0.8063006401062012, "logps/rejected": -1.0086421966552734, "loss": 0.812, "odds_ratio_loss": 0.5819357633590698, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08063006401062012, "rewards/margins": 0.02023415081202984, "rewards/rejected": -0.1008642166852951, "sft_loss": 0.8063006401062012, "step": 91 }, { "epoch": 0.13304410701373826, "grad_norm": 3.335099806630708, "learning_rate": 7.36e-06, "logits/chosen": 0.8014536499977112, "logits/rejected": 0.7237277626991272, "logps/chosen": -0.9466049671173096, "logps/rejected": -1.1349971294403076, "loss": 0.8838, "odds_ratio_loss": 0.6838469505310059, "rewards/accuracies": 0.4375, "rewards/chosen": -0.09466049075126648, "rewards/margins": 0.018839217722415924, "rewards/rejected": -0.113499715924263, "sft_loss": 0.9466049671173096, "step": 92 }, { "epoch": 0.13449023861171366, "grad_norm": 3.6165391263690747, "learning_rate": 7.44e-06, "logits/chosen": 0.9842814803123474, "logits/rejected": 0.7658122181892395, "logps/chosen": -0.8842167258262634, "logps/rejected": -1.298095703125, "loss": 0.9687, "odds_ratio_loss": 0.47397667169570923, "rewards/accuracies": 0.875, "rewards/chosen": -0.08842167258262634, "rewards/margins": 0.0413878932595253, "rewards/rejected": -0.12980955839157104, "sft_loss": 0.8842167258262634, "step": 93 }, { "epoch": 0.13593637020968907, "grad_norm": 3.258394073264351, "learning_rate": 7.519999999999999e-06, "logits/chosen": 1.2068235874176025, "logits/rejected": 0.7209275960922241, "logps/chosen": -0.7621604204177856, "logps/rejected": -1.1626864671707153, "loss": 0.8619, "odds_ratio_loss": 0.5013617277145386, "rewards/accuracies": 0.75, "rewards/chosen": -0.07621604204177856, "rewards/margins": 0.04005260765552521, "rewards/rejected": -0.11626865714788437, "sft_loss": 0.7621604204177856, "step": 94 }, { "epoch": 0.1373825018076645, "grad_norm": 2.8180662547268556, "learning_rate": 7.599999999999999e-06, "logits/chosen": 0.8703403472900391, "logits/rejected": 0.8108032941818237, "logps/chosen": -0.7662641406059265, "logps/rejected": -1.0566703081130981, "loss": 0.797, "odds_ratio_loss": 0.5328947305679321, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07662642002105713, "rewards/margins": 0.029040617868304253, "rewards/rejected": -0.10566703975200653, "sft_loss": 0.7662641406059265, "step": 95 }, { "epoch": 0.13882863340563992, "grad_norm": 3.6926906103204753, "learning_rate": 7.68e-06, "logits/chosen": 1.23716139793396, "logits/rejected": 0.7248931527137756, "logps/chosen": -0.8130195736885071, "logps/rejected": -1.3673142194747925, "loss": 0.8516, "odds_ratio_loss": 0.5090689659118652, "rewards/accuracies": 0.75, "rewards/chosen": -0.0813019648194313, "rewards/margins": 0.055429454892873764, "rewards/rejected": -0.13673141598701477, "sft_loss": 0.8130195736885071, "step": 96 }, { "epoch": 0.14027476500361533, "grad_norm": 3.004623867002305, "learning_rate": 7.76e-06, "logits/chosen": 1.098660945892334, "logits/rejected": 0.9489208459854126, "logps/chosen": -0.8788610100746155, "logps/rejected": -1.3326481580734253, "loss": 0.88, "odds_ratio_loss": 0.5792851448059082, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08788609504699707, "rewards/margins": 0.04537871479988098, "rewards/rejected": -0.13326480984687805, "sft_loss": 0.8788610100746155, "step": 97 }, { "epoch": 0.14172089660159073, "grad_norm": 4.588818222593908, "learning_rate": 7.84e-06, "logits/chosen": 0.9910817742347717, "logits/rejected": 0.7520518898963928, "logps/chosen": -0.9339778423309326, "logps/rejected": -1.2882939577102661, "loss": 0.8916, "odds_ratio_loss": 0.5939985513687134, "rewards/accuracies": 0.5625, "rewards/chosen": -0.09339778870344162, "rewards/margins": 0.03543160483241081, "rewards/rejected": -0.12882938981056213, "sft_loss": 0.9339778423309326, "step": 98 }, { "epoch": 0.14316702819956617, "grad_norm": 4.138791016089956, "learning_rate": 7.92e-06, "logits/chosen": 1.126113772392273, "logits/rejected": 1.0925661325454712, "logps/chosen": -0.6955535411834717, "logps/rejected": -1.1417237520217896, "loss": 0.8094, "odds_ratio_loss": 0.46277695894241333, "rewards/accuracies": 0.875, "rewards/chosen": -0.06955534964799881, "rewards/margins": 0.04461703449487686, "rewards/rejected": -0.11417238414287567, "sft_loss": 0.6955535411834717, "step": 99 }, { "epoch": 0.14461315979754158, "grad_norm": 5.006479057556612, "learning_rate": 8e-06, "logits/chosen": 0.9892964363098145, "logits/rejected": 0.9845679998397827, "logps/chosen": -0.7721818089485168, "logps/rejected": -1.2382351160049438, "loss": 0.9171, "odds_ratio_loss": 0.5086143016815186, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0772181823849678, "rewards/margins": 0.04660532623529434, "rewards/rejected": -0.12382350862026215, "sft_loss": 0.7721818089485168, "step": 100 }, { "epoch": 0.146059291395517, "grad_norm": 4.221494833102751, "learning_rate": 7.999998794192551e-06, "logits/chosen": 1.0973759889602661, "logits/rejected": 0.825110912322998, "logps/chosen": -0.7033730745315552, "logps/rejected": -0.8850679397583008, "loss": 0.9284, "odds_ratio_loss": 0.8899041414260864, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07033731043338776, "rewards/margins": 0.01816948503255844, "rewards/rejected": -0.0885067954659462, "sft_loss": 0.7033730745315552, "step": 101 }, { "epoch": 0.1475054229934924, "grad_norm": 4.217626175448476, "learning_rate": 7.999995176770932e-06, "logits/chosen": 1.0476391315460205, "logits/rejected": 1.20012366771698, "logps/chosen": -0.7490733861923218, "logps/rejected": -1.357642650604248, "loss": 1.0154, "odds_ratio_loss": 0.416873574256897, "rewards/accuracies": 0.875, "rewards/chosen": -0.0749073401093483, "rewards/margins": 0.060856934636831284, "rewards/rejected": -0.13576427102088928, "sft_loss": 0.7490733861923218, "step": 102 }, { "epoch": 0.14895155459146783, "grad_norm": 3.870844944694995, "learning_rate": 7.999989147737321e-06, "logits/chosen": 1.3996965885162354, "logits/rejected": 1.0624024868011475, "logps/chosen": -0.7202589511871338, "logps/rejected": -1.4208965301513672, "loss": 0.8005, "odds_ratio_loss": 0.5439056754112244, "rewards/accuracies": 0.625, "rewards/chosen": -0.07202590256929398, "rewards/margins": 0.0700637698173523, "rewards/rejected": -0.14208966493606567, "sft_loss": 0.7202589511871338, "step": 103 }, { "epoch": 0.15039768618944324, "grad_norm": 7.09212049587876, "learning_rate": 7.999980707095359e-06, "logits/chosen": 0.975654125213623, "logits/rejected": 0.6775309443473816, "logps/chosen": -0.8399592638015747, "logps/rejected": -1.3394160270690918, "loss": 0.8881, "odds_ratio_loss": 0.5169605016708374, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08399593830108643, "rewards/margins": 0.04994567483663559, "rewards/rejected": -0.13394160568714142, "sft_loss": 0.8399592638015747, "step": 104 }, { "epoch": 0.15184381778741865, "grad_norm": 3.2017348907968923, "learning_rate": 7.99996985485013e-06, "logits/chosen": 1.3523914813995361, "logits/rejected": 1.1231534481048584, "logps/chosen": -0.8947268724441528, "logps/rejected": -0.947187066078186, "loss": 0.9017, "odds_ratio_loss": 0.6707051992416382, "rewards/accuracies": 0.625, "rewards/chosen": -0.0894726812839508, "rewards/margins": 0.005246022716164589, "rewards/rejected": -0.09471870958805084, "sft_loss": 0.8947268724441528, "step": 105 }, { "epoch": 0.15328994938539406, "grad_norm": 5.116065229940842, "learning_rate": 7.999956591008177e-06, "logits/chosen": 0.9761826395988464, "logits/rejected": 0.8456287384033203, "logps/chosen": -0.9132041931152344, "logps/rejected": -1.2559480667114258, "loss": 0.8568, "odds_ratio_loss": 0.5640711784362793, "rewards/accuracies": 0.625, "rewards/chosen": -0.09132041782140732, "rewards/margins": 0.0342743881046772, "rewards/rejected": -0.12559480965137482, "sft_loss": 0.9132041931152344, "step": 106 }, { "epoch": 0.1547360809833695, "grad_norm": 4.402587157653877, "learning_rate": 7.999940915577498e-06, "logits/chosen": 1.186305046081543, "logits/rejected": 0.9557572603225708, "logps/chosen": -0.8537756204605103, "logps/rejected": -1.1455714702606201, "loss": 0.8694, "odds_ratio_loss": 0.5543307065963745, "rewards/accuracies": 0.75, "rewards/chosen": -0.08537755906581879, "rewards/margins": 0.029179591685533524, "rewards/rejected": -0.11455715447664261, "sft_loss": 0.8537756204605103, "step": 107 }, { "epoch": 0.1561822125813449, "grad_norm": 5.333316678653816, "learning_rate": 7.999922828567544e-06, "logits/chosen": 1.1584198474884033, "logits/rejected": 0.866698145866394, "logps/chosen": -0.7809406518936157, "logps/rejected": -1.0734394788742065, "loss": 1.0352, "odds_ratio_loss": 0.6391094326972961, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07809406518936157, "rewards/margins": 0.029249876737594604, "rewards/rejected": -0.10734394937753677, "sft_loss": 0.7809406518936157, "step": 108 }, { "epoch": 0.1576283441793203, "grad_norm": 4.181964422446741, "learning_rate": 7.999902329989218e-06, "logits/chosen": 1.1614594459533691, "logits/rejected": 0.9762617349624634, "logps/chosen": -0.751151978969574, "logps/rejected": -1.218088150024414, "loss": 1.0105, "odds_ratio_loss": 0.5704274773597717, "rewards/accuracies": 0.625, "rewards/chosen": -0.07511519640684128, "rewards/margins": 0.04669361934065819, "rewards/rejected": -0.12180881202220917, "sft_loss": 0.751151978969574, "step": 109 }, { "epoch": 0.15907447577729572, "grad_norm": 4.998106576378202, "learning_rate": 7.999879419854883e-06, "logits/chosen": 0.8305407762527466, "logits/rejected": 0.789115309715271, "logps/chosen": -0.7767509818077087, "logps/rejected": -1.8016185760498047, "loss": 0.9043, "odds_ratio_loss": 0.41118738055229187, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07767509669065475, "rewards/margins": 0.1024867594242096, "rewards/rejected": -0.18016186356544495, "sft_loss": 0.7767509818077087, "step": 110 }, { "epoch": 0.16052060737527116, "grad_norm": 2.7549476854728794, "learning_rate": 7.999854098178346e-06, "logits/chosen": 1.3881924152374268, "logits/rejected": 1.1433719396591187, "logps/chosen": -0.6934785842895508, "logps/rejected": -1.2178714275360107, "loss": 0.8135, "odds_ratio_loss": 0.49856168031692505, "rewards/accuracies": 0.75, "rewards/chosen": -0.06934786587953568, "rewards/margins": 0.052439287304878235, "rewards/rejected": -0.12178714573383331, "sft_loss": 0.6934785842895508, "step": 111 }, { "epoch": 0.16196673897324657, "grad_norm": 3.946293342352624, "learning_rate": 7.999826364974878e-06, "logits/chosen": 1.0656780004501343, "logits/rejected": 0.935299277305603, "logps/chosen": -0.7571520805358887, "logps/rejected": -1.3282155990600586, "loss": 0.982, "odds_ratio_loss": 0.48986226320266724, "rewards/accuracies": 0.625, "rewards/chosen": -0.07571521401405334, "rewards/margins": 0.057106345891952515, "rewards/rejected": -0.13282155990600586, "sft_loss": 0.7571520805358887, "step": 112 }, { "epoch": 0.16341287057122197, "grad_norm": 4.909329819670107, "learning_rate": 7.999796220261196e-06, "logits/chosen": 1.2778562307357788, "logits/rejected": 1.0329546928405762, "logps/chosen": -0.84651118516922, "logps/rejected": -1.1883376836776733, "loss": 0.7856, "odds_ratio_loss": 0.5662205219268799, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08465111255645752, "rewards/margins": 0.034182652831077576, "rewards/rejected": -0.11883377283811569, "sft_loss": 0.84651118516922, "step": 113 }, { "epoch": 0.1648590021691974, "grad_norm": 3.3384814962446705, "learning_rate": 7.999763664055477e-06, "logits/chosen": 1.0280776023864746, "logits/rejected": 0.9973467588424683, "logps/chosen": -0.8428975939750671, "logps/rejected": -1.4291057586669922, "loss": 0.7546, "odds_ratio_loss": 0.48032891750335693, "rewards/accuracies": 0.8125, "rewards/chosen": -0.08428976684808731, "rewards/margins": 0.05862081050872803, "rewards/rejected": -0.14291056990623474, "sft_loss": 0.8428975939750671, "step": 114 }, { "epoch": 0.16630513376717282, "grad_norm": 8.183910290224793, "learning_rate": 7.999728696377347e-06, "logits/chosen": 1.1547966003417969, "logits/rejected": 0.9317610859870911, "logps/chosen": -0.8324732184410095, "logps/rejected": -0.9054718017578125, "loss": 0.8927, "odds_ratio_loss": 0.7408847808837891, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08324733376502991, "rewards/margins": 0.007299853954464197, "rewards/rejected": -0.09054718911647797, "sft_loss": 0.8324732184410095, "step": 115 }, { "epoch": 0.16775126536514823, "grad_norm": 8.690597190815746, "learning_rate": 7.99969131724789e-06, "logits/chosen": 1.031779170036316, "logits/rejected": 0.9053889513015747, "logps/chosen": -0.647568941116333, "logps/rejected": -1.9913418292999268, "loss": 0.8437, "odds_ratio_loss": 0.4320135712623596, "rewards/accuracies": 0.875, "rewards/chosen": -0.06475690007209778, "rewards/margins": 0.13437728583812714, "rewards/rejected": -0.1991342008113861, "sft_loss": 0.647568941116333, "step": 116 }, { "epoch": 0.16919739696312364, "grad_norm": 3.5044517946016094, "learning_rate": 7.999651526689642e-06, "logits/chosen": 0.9917743802070618, "logits/rejected": 0.8863325119018555, "logps/chosen": -0.793087899684906, "logps/rejected": -1.0273563861846924, "loss": 0.8403, "odds_ratio_loss": 0.6240065693855286, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07930880039930344, "rewards/margins": 0.023426856845617294, "rewards/rejected": -0.10273565351963043, "sft_loss": 0.793087899684906, "step": 117 }, { "epoch": 0.17064352856109907, "grad_norm": 3.342116463726969, "learning_rate": 7.999609324726592e-06, "logits/chosen": 1.1911059617996216, "logits/rejected": 0.9521145820617676, "logps/chosen": -0.7328106760978699, "logps/rejected": -1.5902066230773926, "loss": 0.8601, "odds_ratio_loss": 0.40612202882766724, "rewards/accuracies": 0.75, "rewards/chosen": -0.07328107208013535, "rewards/margins": 0.08573959767818451, "rewards/rejected": -0.15902066230773926, "sft_loss": 0.7328106760978699, "step": 118 }, { "epoch": 0.17208966015907448, "grad_norm": 2.9956931508799247, "learning_rate": 7.999564711384184e-06, "logits/chosen": 0.7753307223320007, "logits/rejected": 0.7809234261512756, "logps/chosen": -0.8056610822677612, "logps/rejected": -1.2226719856262207, "loss": 0.8651, "odds_ratio_loss": 0.5064537525177002, "rewards/accuracies": 0.8125, "rewards/chosen": -0.08056611567735672, "rewards/margins": 0.04170108586549759, "rewards/rejected": -0.12226720154285431, "sft_loss": 0.8056610822677612, "step": 119 }, { "epoch": 0.1735357917570499, "grad_norm": 4.895879965141485, "learning_rate": 7.999517686689316e-06, "logits/chosen": 1.0470901727676392, "logits/rejected": 0.868865966796875, "logps/chosen": -0.701599657535553, "logps/rejected": -1.2374787330627441, "loss": 0.9475, "odds_ratio_loss": 0.5663205981254578, "rewards/accuracies": 0.75, "rewards/chosen": -0.07015997171401978, "rewards/margins": 0.0535879023373127, "rewards/rejected": -0.12374787032604218, "sft_loss": 0.701599657535553, "step": 120 }, { "epoch": 0.1749819233550253, "grad_norm": 3.877019988352723, "learning_rate": 7.999468250670339e-06, "logits/chosen": 0.8767350912094116, "logits/rejected": 0.8941935300827026, "logps/chosen": -0.7906222939491272, "logps/rejected": -1.284137487411499, "loss": 0.7921, "odds_ratio_loss": 0.4683469235897064, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07906222343444824, "rewards/margins": 0.0493515208363533, "rewards/rejected": -0.12841375172138214, "sft_loss": 0.7906222939491272, "step": 121 }, { "epoch": 0.17642805495300073, "grad_norm": 13.592899631151674, "learning_rate": 7.999416403357056e-06, "logits/chosen": 0.7820194959640503, "logits/rejected": 0.8491408824920654, "logps/chosen": -0.8906112909317017, "logps/rejected": -1.2906492948532104, "loss": 0.862, "odds_ratio_loss": 0.5522098541259766, "rewards/accuracies": 0.75, "rewards/chosen": -0.08906114101409912, "rewards/margins": 0.04000380262732506, "rewards/rejected": -0.12906494736671448, "sft_loss": 0.8906112909317017, "step": 122 }, { "epoch": 0.17787418655097614, "grad_norm": 2.9079992073602368, "learning_rate": 7.99936214478073e-06, "logits/chosen": 1.0372124910354614, "logits/rejected": 0.8244482278823853, "logps/chosen": -0.7030653953552246, "logps/rejected": -1.417474627494812, "loss": 0.7689, "odds_ratio_loss": 0.3927401304244995, "rewards/accuracies": 0.9375, "rewards/chosen": -0.07030653953552246, "rewards/margins": 0.0714409276843071, "rewards/rejected": -0.14174747467041016, "sft_loss": 0.7030653953552246, "step": 123 }, { "epoch": 0.17932031814895155, "grad_norm": 3.1826994675798077, "learning_rate": 7.999305474974071e-06, "logits/chosen": 1.1816697120666504, "logits/rejected": 0.942751407623291, "logps/chosen": -0.7592461109161377, "logps/rejected": -1.1128010749816895, "loss": 0.8271, "odds_ratio_loss": 0.5115275382995605, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07592461258172989, "rewards/margins": 0.03535548970103264, "rewards/rejected": -0.11128010600805283, "sft_loss": 0.7592461109161377, "step": 124 }, { "epoch": 0.18076644974692696, "grad_norm": 3.0381911136860293, "learning_rate": 7.999246393971247e-06, "logits/chosen": 1.0905792713165283, "logits/rejected": 0.7886046171188354, "logps/chosen": -0.7659871578216553, "logps/rejected": -1.5863971710205078, "loss": 0.8179, "odds_ratio_loss": 0.36050310730934143, "rewards/accuracies": 0.875, "rewards/chosen": -0.07659871876239777, "rewards/margins": 0.08204102516174316, "rewards/rejected": -0.15863972902297974, "sft_loss": 0.7659871578216553, "step": 125 }, { "epoch": 0.1822125813449024, "grad_norm": 10.206680986022551, "learning_rate": 7.999184901807875e-06, "logits/chosen": 0.8782986402511597, "logits/rejected": 0.9173144698143005, "logps/chosen": -0.8764703273773193, "logps/rejected": -1.0771946907043457, "loss": 0.9931, "odds_ratio_loss": 0.6936411261558533, "rewards/accuracies": 0.375, "rewards/chosen": -0.08764703571796417, "rewards/margins": 0.020072437822818756, "rewards/rejected": -0.10771947354078293, "sft_loss": 0.8764703273773193, "step": 126 }, { "epoch": 0.1836587129428778, "grad_norm": 3.1106884257684007, "learning_rate": 7.999120998521033e-06, "logits/chosen": 1.0604857206344604, "logits/rejected": 1.0624693632125854, "logps/chosen": -0.7787372469902039, "logps/rejected": -1.3081343173980713, "loss": 0.8462, "odds_ratio_loss": 0.47298046946525574, "rewards/accuracies": 0.75, "rewards/chosen": -0.07787372171878815, "rewards/margins": 0.05293971672654152, "rewards/rejected": -0.13081344962120056, "sft_loss": 0.7787372469902039, "step": 127 }, { "epoch": 0.18510484454085321, "grad_norm": 5.440440093591614, "learning_rate": 7.999054684149247e-06, "logits/chosen": 1.07607102394104, "logits/rejected": 1.0282245874404907, "logps/chosen": -0.7584375143051147, "logps/rejected": -1.117292046546936, "loss": 0.8178, "odds_ratio_loss": 0.5175266861915588, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07584375143051147, "rewards/margins": 0.035885464400053024, "rewards/rejected": -0.1117292046546936, "sft_loss": 0.7584375143051147, "step": 128 }, { "epoch": 0.18655097613882862, "grad_norm": 9.357508492499358, "learning_rate": 7.998985958732496e-06, "logits/chosen": 1.1318176984786987, "logits/rejected": 0.8266797065734863, "logps/chosen": -0.8652774095535278, "logps/rejected": -1.303208351135254, "loss": 0.9764, "odds_ratio_loss": 0.5121663212776184, "rewards/accuracies": 0.75, "rewards/chosen": -0.0865277424454689, "rewards/margins": 0.04379308968782425, "rewards/rejected": -0.13032083213329315, "sft_loss": 0.8652774095535278, "step": 129 }, { "epoch": 0.18799710773680406, "grad_norm": 4.527334579057526, "learning_rate": 7.998914822312218e-06, "logits/chosen": 0.8850480318069458, "logits/rejected": 0.8174777626991272, "logps/chosen": -0.9658189415931702, "logps/rejected": -1.1547739505767822, "loss": 0.9466, "odds_ratio_loss": 0.6745478510856628, "rewards/accuracies": 0.375, "rewards/chosen": -0.09658190608024597, "rewards/margins": 0.01889548823237419, "rewards/rejected": -0.11547739058732986, "sft_loss": 0.9658189415931702, "step": 130 }, { "epoch": 0.18944323933477947, "grad_norm": 4.443262215950921, "learning_rate": 7.998841274931302e-06, "logits/chosen": 1.0955193042755127, "logits/rejected": 0.7597486972808838, "logps/chosen": -0.7344867587089539, "logps/rejected": -1.3775670528411865, "loss": 0.8939, "odds_ratio_loss": 0.5716267824172974, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07344868034124374, "rewards/margins": 0.0643080472946167, "rewards/rejected": -0.13775672018527985, "sft_loss": 0.7344867587089539, "step": 131 }, { "epoch": 0.19088937093275488, "grad_norm": 2.6245053770051414, "learning_rate": 7.998765316634085e-06, "logits/chosen": 1.3198339939117432, "logits/rejected": 0.9343856573104858, "logps/chosen": -0.7423506379127502, "logps/rejected": -1.0867445468902588, "loss": 0.8236, "odds_ratio_loss": 0.5474746227264404, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07423506677150726, "rewards/margins": 0.034439388662576675, "rewards/rejected": -0.10867445170879364, "sft_loss": 0.7423506379127502, "step": 132 }, { "epoch": 0.19233550253073028, "grad_norm": 3.008883127727729, "learning_rate": 7.998686947466366e-06, "logits/chosen": 0.9286758899688721, "logits/rejected": 0.7316522002220154, "logps/chosen": -0.8677981495857239, "logps/rejected": -1.2070274353027344, "loss": 0.858, "odds_ratio_loss": 0.5611447691917419, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08677982538938522, "rewards/margins": 0.03392292559146881, "rewards/rejected": -0.12070275098085403, "sft_loss": 0.8677981495857239, "step": 133 }, { "epoch": 0.19378163412870572, "grad_norm": 3.9031706234078754, "learning_rate": 7.998606167475395e-06, "logits/chosen": 0.9033632874488831, "logits/rejected": 0.7890003323554993, "logps/chosen": -0.854728102684021, "logps/rejected": -1.2537704706192017, "loss": 0.8872, "odds_ratio_loss": 0.5175142288208008, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08547282218933105, "rewards/margins": 0.039904236793518066, "rewards/rejected": -0.12537705898284912, "sft_loss": 0.854728102684021, "step": 134 }, { "epoch": 0.19522776572668113, "grad_norm": 2.9440785803501335, "learning_rate": 7.998522976709873e-06, "logits/chosen": 1.0039453506469727, "logits/rejected": 0.9802931547164917, "logps/chosen": -0.7600377798080444, "logps/rejected": -1.3574674129486084, "loss": 0.8128, "odds_ratio_loss": 0.610292375087738, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0760037824511528, "rewards/margins": 0.05974297225475311, "rewards/rejected": -0.13574674725532532, "sft_loss": 0.7600377798080444, "step": 135 }, { "epoch": 0.19667389732465654, "grad_norm": 2.8581912638451694, "learning_rate": 7.998437375219955e-06, "logits/chosen": 1.2175956964492798, "logits/rejected": 0.8139463663101196, "logps/chosen": -0.7659615874290466, "logps/rejected": -1.644038200378418, "loss": 0.8783, "odds_ratio_loss": 0.49914222955703735, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07659615576267242, "rewards/margins": 0.08780766278505325, "rewards/rejected": -0.16440382599830627, "sft_loss": 0.7659615874290466, "step": 136 }, { "epoch": 0.19812002892263195, "grad_norm": 6.226413264357474, "learning_rate": 7.998349363057252e-06, "logits/chosen": 1.0425236225128174, "logits/rejected": 0.9942991137504578, "logps/chosen": -0.7867666482925415, "logps/rejected": -1.3995404243469238, "loss": 0.8907, "odds_ratio_loss": 0.5336313843727112, "rewards/accuracies": 0.625, "rewards/chosen": -0.07867667078971863, "rewards/margins": 0.06127737835049629, "rewards/rejected": -0.13995404541492462, "sft_loss": 0.7867666482925415, "step": 137 }, { "epoch": 0.19956616052060738, "grad_norm": 2.869697362833916, "learning_rate": 7.998258940274828e-06, "logits/chosen": 0.969634473323822, "logits/rejected": 0.621441662311554, "logps/chosen": -0.8326024413108826, "logps/rejected": -1.6405725479125977, "loss": 0.8682, "odds_ratio_loss": 0.5707529783248901, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08326023817062378, "rewards/margins": 0.08079702407121658, "rewards/rejected": -0.16405726969242096, "sft_loss": 0.8326024413108826, "step": 138 }, { "epoch": 0.2010122921185828, "grad_norm": 8.40598616315667, "learning_rate": 7.998166106927197e-06, "logits/chosen": 1.067244291305542, "logits/rejected": 0.7582840919494629, "logps/chosen": -0.5070884227752686, "logps/rejected": -1.4907238483428955, "loss": 0.8181, "odds_ratio_loss": 0.4089462459087372, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0507088378071785, "rewards/margins": 0.09836354106664658, "rewards/rejected": -0.14907237887382507, "sft_loss": 0.5070884227752686, "step": 139 }, { "epoch": 0.2024584237165582, "grad_norm": 3.2177545453756986, "learning_rate": 7.998070863070329e-06, "logits/chosen": 1.0560888051986694, "logits/rejected": 0.7042299509048462, "logps/chosen": -0.9449357390403748, "logps/rejected": -1.1625012159347534, "loss": 0.8684, "odds_ratio_loss": 0.5798974633216858, "rewards/accuracies": 0.75, "rewards/chosen": -0.0944935753941536, "rewards/margins": 0.021756542846560478, "rewards/rejected": -0.11625012010335922, "sft_loss": 0.9449357390403748, "step": 140 }, { "epoch": 0.2039045553145336, "grad_norm": 4.626119117506995, "learning_rate": 7.997973208761647e-06, "logits/chosen": 0.8979699611663818, "logits/rejected": 0.7739961743354797, "logps/chosen": -0.7614917159080505, "logps/rejected": -1.5159138441085815, "loss": 0.8879, "odds_ratio_loss": 0.48651716113090515, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07614917308092117, "rewards/margins": 0.07544222474098206, "rewards/rejected": -0.15159140527248383, "sft_loss": 0.7614917159080505, "step": 141 }, { "epoch": 0.20535068691250905, "grad_norm": 6.122175759295342, "learning_rate": 7.997873144060028e-06, "logits/chosen": 1.0175689458847046, "logits/rejected": 0.8373978734016418, "logps/chosen": -0.7784775495529175, "logps/rejected": -1.4234530925750732, "loss": 0.962, "odds_ratio_loss": 0.6008108854293823, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07784774899482727, "rewards/margins": 0.06449756026268005, "rewards/rejected": -0.14234532415866852, "sft_loss": 0.7784775495529175, "step": 142 }, { "epoch": 0.20679681851048445, "grad_norm": 4.8238500026764095, "learning_rate": 7.9977706690258e-06, "logits/chosen": 0.9935001134872437, "logits/rejected": 0.8581937551498413, "logps/chosen": -0.8578831553459167, "logps/rejected": -1.0968329906463623, "loss": 0.8869, "odds_ratio_loss": 0.6052335500717163, "rewards/accuracies": 0.625, "rewards/chosen": -0.0857883170247078, "rewards/margins": 0.023894988000392914, "rewards/rejected": -0.10968329757452011, "sft_loss": 0.8578831553459167, "step": 143 }, { "epoch": 0.20824295010845986, "grad_norm": 3.1206319457795875, "learning_rate": 7.997665783720749e-06, "logits/chosen": 1.1067547798156738, "logits/rejected": 1.0524673461914062, "logps/chosen": -0.6326704025268555, "logps/rejected": -1.2812092304229736, "loss": 0.8483, "odds_ratio_loss": 0.4811175763607025, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0632670447230339, "rewards/margins": 0.06485386937856674, "rewards/rejected": -0.12812092900276184, "sft_loss": 0.6326704025268555, "step": 144 }, { "epoch": 0.2096890817064353, "grad_norm": 3.7395254574099126, "learning_rate": 7.997558488208105e-06, "logits/chosen": 0.8425149917602539, "logits/rejected": 0.6503562927246094, "logps/chosen": -0.6809152960777283, "logps/rejected": -1.6081905364990234, "loss": 0.8512, "odds_ratio_loss": 0.4523545801639557, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06809153407812119, "rewards/margins": 0.09272752702236176, "rewards/rejected": -0.16081905364990234, "sft_loss": 0.6809152960777283, "step": 145 }, { "epoch": 0.2111352133044107, "grad_norm": 3.482516836820913, "learning_rate": 7.997448782552561e-06, "logits/chosen": 1.1837321519851685, "logits/rejected": 0.89334636926651, "logps/chosen": -0.8056322932243347, "logps/rejected": -1.4506659507751465, "loss": 0.8242, "odds_ratio_loss": 0.5507670640945435, "rewards/accuracies": 0.625, "rewards/chosen": -0.08056323230266571, "rewards/margins": 0.06450335681438446, "rewards/rejected": -0.14506658911705017, "sft_loss": 0.8056322932243347, "step": 146 }, { "epoch": 0.21258134490238612, "grad_norm": 3.3963256945903915, "learning_rate": 7.997336666820258e-06, "logits/chosen": 0.7776620388031006, "logits/rejected": 0.8272684216499329, "logps/chosen": -0.7493107318878174, "logps/rejected": -1.14316987991333, "loss": 0.8604, "odds_ratio_loss": 0.5952869057655334, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0749310702085495, "rewards/margins": 0.039385922253131866, "rewards/rejected": -0.11431698501110077, "sft_loss": 0.7493107318878174, "step": 147 }, { "epoch": 0.21402747650036152, "grad_norm": 3.3238714858314244, "learning_rate": 7.997222141078791e-06, "logits/chosen": 0.8280146718025208, "logits/rejected": 0.7190472483634949, "logps/chosen": -0.8145158290863037, "logps/rejected": -1.596473217010498, "loss": 0.8569, "odds_ratio_loss": 0.6677970886230469, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08145157992839813, "rewards/margins": 0.07819575071334839, "rewards/rejected": -0.15964734554290771, "sft_loss": 0.8145158290863037, "step": 148 }, { "epoch": 0.21547360809833696, "grad_norm": 3.8446953387308724, "learning_rate": 7.997105205397208e-06, "logits/chosen": 0.7918663024902344, "logits/rejected": 0.7498356103897095, "logps/chosen": -0.9449871182441711, "logps/rejected": -1.143221378326416, "loss": 0.8744, "odds_ratio_loss": 0.5902383327484131, "rewards/accuracies": 0.6875, "rewards/chosen": -0.09449870884418488, "rewards/margins": 0.019823430106043816, "rewards/rejected": -0.11432213336229324, "sft_loss": 0.9449871182441711, "step": 149 }, { "epoch": 0.21691973969631237, "grad_norm": 3.3011722687802316, "learning_rate": 7.99698585984601e-06, "logits/chosen": 1.0179299116134644, "logits/rejected": 0.8888053297996521, "logps/chosen": -0.9322119951248169, "logps/rejected": -1.32847261428833, "loss": 0.8966, "odds_ratio_loss": 0.7749242186546326, "rewards/accuracies": 0.4375, "rewards/chosen": -0.09322120249271393, "rewards/margins": 0.039626047015190125, "rewards/rejected": -0.13284724950790405, "sft_loss": 0.9322119951248169, "step": 150 }, { "epoch": 0.21836587129428778, "grad_norm": 4.782501865845731, "learning_rate": 7.99686410449715e-06, "logits/chosen": 1.043616533279419, "logits/rejected": 0.8453623056411743, "logps/chosen": -1.0205018520355225, "logps/rejected": -1.2420902252197266, "loss": 0.9077, "odds_ratio_loss": 0.6890058517456055, "rewards/accuracies": 0.625, "rewards/chosen": -0.10205018520355225, "rewards/margins": 0.02215883880853653, "rewards/rejected": -0.12420902401208878, "sft_loss": 1.0205018520355225, "step": 151 }, { "epoch": 0.2198120028922632, "grad_norm": 2.7453397904711867, "learning_rate": 7.996739939424036e-06, "logits/chosen": 1.0936996936798096, "logits/rejected": 0.7009330987930298, "logps/chosen": -0.9429874420166016, "logps/rejected": -1.2208938598632812, "loss": 0.8392, "odds_ratio_loss": 0.6516534686088562, "rewards/accuracies": 0.5, "rewards/chosen": -0.09429875016212463, "rewards/margins": 0.027790643274784088, "rewards/rejected": -0.12208940088748932, "sft_loss": 0.9429874420166016, "step": 152 }, { "epoch": 0.22125813449023862, "grad_norm": 3.5114275545087272, "learning_rate": 7.996613364701528e-06, "logits/chosen": 0.8619660139083862, "logits/rejected": 0.8545634746551514, "logps/chosen": -0.8546210527420044, "logps/rejected": -0.9204950332641602, "loss": 0.823, "odds_ratio_loss": 0.7684470415115356, "rewards/accuracies": 0.375, "rewards/chosen": -0.08546211570501328, "rewards/margins": 0.006587391719222069, "rewards/rejected": -0.0920495018362999, "sft_loss": 0.8546210527420044, "step": 153 }, { "epoch": 0.22270426608821403, "grad_norm": 3.1464188184377067, "learning_rate": 7.996484380405936e-06, "logits/chosen": 0.8472630977630615, "logits/rejected": 0.9261313080787659, "logps/chosen": -0.8409748673439026, "logps/rejected": -1.471449851989746, "loss": 0.8471, "odds_ratio_loss": 0.5609486699104309, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0840974897146225, "rewards/margins": 0.06304750591516495, "rewards/rejected": -0.14714500308036804, "sft_loss": 0.8409748673439026, "step": 154 }, { "epoch": 0.22415039768618944, "grad_norm": 3.82484646727686, "learning_rate": 7.996352986615026e-06, "logits/chosen": 1.1553988456726074, "logits/rejected": 0.9536395072937012, "logps/chosen": -0.8071169257164001, "logps/rejected": -2.1079392433166504, "loss": 1.0003, "odds_ratio_loss": 0.44714686274528503, "rewards/accuracies": 0.625, "rewards/chosen": -0.08071169257164001, "rewards/margins": 0.13008223474025726, "rewards/rejected": -0.21079392731189728, "sft_loss": 0.8071169257164001, "step": 155 }, { "epoch": 0.22559652928416485, "grad_norm": 3.3443773185383248, "learning_rate": 7.996219183408017e-06, "logits/chosen": 0.8596165180206299, "logits/rejected": 0.8327771425247192, "logps/chosen": -1.0019909143447876, "logps/rejected": -1.1135194301605225, "loss": 0.9495, "odds_ratio_loss": 0.7217904329299927, "rewards/accuracies": 0.4375, "rewards/chosen": -0.10019908845424652, "rewards/margins": 0.011152852326631546, "rewards/rejected": -0.11135194450616837, "sft_loss": 1.0019909143447876, "step": 156 }, { "epoch": 0.22704266088214028, "grad_norm": 3.253728818340114, "learning_rate": 7.99608297086558e-06, "logits/chosen": 0.7683792114257812, "logits/rejected": 0.7193489074707031, "logps/chosen": -0.7505009174346924, "logps/rejected": -1.5825207233428955, "loss": 0.8003, "odds_ratio_loss": 0.5120865702629089, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07505009323358536, "rewards/margins": 0.08320198208093643, "rewards/rejected": -0.1582520753145218, "sft_loss": 0.7505009174346924, "step": 157 }, { "epoch": 0.2284887924801157, "grad_norm": 4.3523818776563035, "learning_rate": 7.995944349069836e-06, "logits/chosen": 1.1126295328140259, "logits/rejected": 0.9180561900138855, "logps/chosen": -0.870684027671814, "logps/rejected": -1.4605320692062378, "loss": 0.8131, "odds_ratio_loss": 0.5564998388290405, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08706840872764587, "rewards/margins": 0.05898480489850044, "rewards/rejected": -0.14605320990085602, "sft_loss": 0.870684027671814, "step": 158 }, { "epoch": 0.2299349240780911, "grad_norm": 4.108594412816073, "learning_rate": 7.99580331810436e-06, "logits/chosen": 1.1916407346725464, "logits/rejected": 0.7693163156509399, "logps/chosen": -0.7917389869689941, "logps/rejected": -1.244872808456421, "loss": 0.877, "odds_ratio_loss": 0.5736300945281982, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07917389273643494, "rewards/margins": 0.04531338810920715, "rewards/rejected": -0.12448728084564209, "sft_loss": 0.7917389869689941, "step": 159 }, { "epoch": 0.2313810556760665, "grad_norm": 3.113607572889762, "learning_rate": 7.995659878054184e-06, "logits/chosen": 1.040024995803833, "logits/rejected": 0.8305615782737732, "logps/chosen": -0.8537604808807373, "logps/rejected": -1.1052323579788208, "loss": 0.8598, "odds_ratio_loss": 0.5960687398910522, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08537605404853821, "rewards/margins": 0.025147181004285812, "rewards/rejected": -0.11052322387695312, "sft_loss": 0.8537604808807373, "step": 160 }, { "epoch": 0.23282718727404195, "grad_norm": 4.094550792057405, "learning_rate": 7.995514029005786e-06, "logits/chosen": 0.8100274801254272, "logits/rejected": 0.7823082208633423, "logps/chosen": -0.8013307452201843, "logps/rejected": -1.2279610633850098, "loss": 0.9395, "odds_ratio_loss": 0.5334639549255371, "rewards/accuracies": 0.75, "rewards/chosen": -0.08013307303190231, "rewards/margins": 0.042663030326366425, "rewards/rejected": -0.12279611080884933, "sft_loss": 0.8013307452201843, "step": 161 }, { "epoch": 0.23427331887201736, "grad_norm": 4.417141454685987, "learning_rate": 7.995365771047098e-06, "logits/chosen": 1.0468742847442627, "logits/rejected": 0.8377524018287659, "logps/chosen": -0.9060525298118591, "logps/rejected": -1.0592972040176392, "loss": 0.8911, "odds_ratio_loss": 0.7184407711029053, "rewards/accuracies": 0.5, "rewards/chosen": -0.09060525894165039, "rewards/margins": 0.015324457548558712, "rewards/rejected": -0.10592971742153168, "sft_loss": 0.9060525298118591, "step": 162 }, { "epoch": 0.23571945046999276, "grad_norm": 4.991713895984588, "learning_rate": 7.995215104267506e-06, "logits/chosen": 0.8785092830657959, "logits/rejected": 0.7008171677589417, "logps/chosen": -0.9880103468894958, "logps/rejected": -1.4084423780441284, "loss": 0.954, "odds_ratio_loss": 0.6232841610908508, "rewards/accuracies": 0.875, "rewards/chosen": -0.09880103915929794, "rewards/margins": 0.042043209075927734, "rewards/rejected": -0.14084425568580627, "sft_loss": 0.9880103468894958, "step": 163 }, { "epoch": 0.23716558206796817, "grad_norm": 3.888510837008354, "learning_rate": 7.995062028757848e-06, "logits/chosen": 1.2152018547058105, "logits/rejected": 1.0892407894134521, "logps/chosen": -0.7016069293022156, "logps/rejected": -1.010518193244934, "loss": 0.8521, "odds_ratio_loss": 0.5655720233917236, "rewards/accuracies": 0.625, "rewards/chosen": -0.07016069442033768, "rewards/margins": 0.03089112415909767, "rewards/rejected": -0.10105182230472565, "sft_loss": 0.7016069293022156, "step": 164 }, { "epoch": 0.2386117136659436, "grad_norm": 3.047500446746817, "learning_rate": 7.994906544610413e-06, "logits/chosen": 1.074456810951233, "logits/rejected": 0.809201180934906, "logps/chosen": -0.6478066444396973, "logps/rejected": -1.6955187320709229, "loss": 0.8543, "odds_ratio_loss": 0.38769054412841797, "rewards/accuracies": 0.875, "rewards/chosen": -0.06478067487478256, "rewards/margins": 0.1047712117433548, "rewards/rejected": -0.16955187916755676, "sft_loss": 0.6478066444396973, "step": 165 }, { "epoch": 0.24005784526391902, "grad_norm": 2.6536028558426774, "learning_rate": 7.994748651918946e-06, "logits/chosen": 1.1055707931518555, "logits/rejected": 0.8093370795249939, "logps/chosen": -0.8095314502716064, "logps/rejected": -1.4664175510406494, "loss": 0.9225, "odds_ratio_loss": 0.664374828338623, "rewards/accuracies": 0.5, "rewards/chosen": -0.08095315098762512, "rewards/margins": 0.0656886100769043, "rewards/rejected": -0.14664176106452942, "sft_loss": 0.8095314502716064, "step": 166 }, { "epoch": 0.24150397686189443, "grad_norm": 3.541652037604111, "learning_rate": 7.994588350778638e-06, "logits/chosen": 1.0057575702667236, "logits/rejected": 0.8427650928497314, "logps/chosen": -0.9639623165130615, "logps/rejected": -1.1809089183807373, "loss": 0.9288, "odds_ratio_loss": 0.6998913288116455, "rewards/accuracies": 0.5, "rewards/chosen": -0.09639623761177063, "rewards/margins": 0.02169465646147728, "rewards/rejected": -0.11809088289737701, "sft_loss": 0.9639623165130615, "step": 167 }, { "epoch": 0.24295010845986983, "grad_norm": 4.277742047003352, "learning_rate": 7.994425641286135e-06, "logits/chosen": 1.0967094898223877, "logits/rejected": 1.0031440258026123, "logps/chosen": -0.7859099507331848, "logps/rejected": -1.1452839374542236, "loss": 0.8337, "odds_ratio_loss": 0.5716875791549683, "rewards/accuracies": 0.75, "rewards/chosen": -0.0785909965634346, "rewards/margins": 0.03593740612268448, "rewards/rejected": -0.11452840268611908, "sft_loss": 0.7859099507331848, "step": 168 }, { "epoch": 0.24439624005784527, "grad_norm": 2.8034050061730107, "learning_rate": 7.994260523539536e-06, "logits/chosen": 1.0453124046325684, "logits/rejected": 0.8825990557670593, "logps/chosen": -0.7164455652236938, "logps/rejected": -1.6012907028198242, "loss": 0.8723, "odds_ratio_loss": 0.4701683223247528, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07164455950260162, "rewards/margins": 0.0884845107793808, "rewards/rejected": -0.16012907028198242, "sft_loss": 0.7164455652236938, "step": 169 }, { "epoch": 0.24584237165582068, "grad_norm": 2.977337634451211, "learning_rate": 7.994092997638392e-06, "logits/chosen": 0.9832794666290283, "logits/rejected": 0.7748570442199707, "logps/chosen": -0.6102631092071533, "logps/rejected": -2.1352500915527344, "loss": 0.8238, "odds_ratio_loss": 0.2782694697380066, "rewards/accuracies": 0.875, "rewards/chosen": -0.06102630868554115, "rewards/margins": 0.15249869227409363, "rewards/rejected": -0.2135249823331833, "sft_loss": 0.6102631092071533, "step": 170 }, { "epoch": 0.2472885032537961, "grad_norm": 3.1791088490599693, "learning_rate": 7.993923063683702e-06, "logits/chosen": 1.1284598112106323, "logits/rejected": 0.8740060329437256, "logps/chosen": -0.739490807056427, "logps/rejected": -1.5526961088180542, "loss": 0.8523, "odds_ratio_loss": 0.546734094619751, "rewards/accuracies": 0.75, "rewards/chosen": -0.07394907623529434, "rewards/margins": 0.08132053166627884, "rewards/rejected": -0.15526960790157318, "sft_loss": 0.739490807056427, "step": 171 }, { "epoch": 0.24873463485177152, "grad_norm": 3.104876414975262, "learning_rate": 7.993750721777924e-06, "logits/chosen": 0.7685276865959167, "logits/rejected": 0.6663203239440918, "logps/chosen": -0.869566798210144, "logps/rejected": -1.5396361351013184, "loss": 0.9164, "odds_ratio_loss": 0.5057757496833801, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0869566798210144, "rewards/margins": 0.06700694561004639, "rewards/rejected": -0.1539636254310608, "sft_loss": 0.869566798210144, "step": 172 }, { "epoch": 0.2501807664497469, "grad_norm": 3.153797811030736, "learning_rate": 7.993575972024962e-06, "logits/chosen": 0.8820764422416687, "logits/rejected": 0.8640388250350952, "logps/chosen": -0.881617546081543, "logps/rejected": -1.2504996061325073, "loss": 0.898, "odds_ratio_loss": 0.6156437397003174, "rewards/accuracies": 0.625, "rewards/chosen": -0.08816175162792206, "rewards/margins": 0.036888204514980316, "rewards/rejected": -0.12504996359348297, "sft_loss": 0.881617546081543, "step": 173 }, { "epoch": 0.25162689804772237, "grad_norm": 3.111404000329643, "learning_rate": 7.99339881453017e-06, "logits/chosen": 0.9390738606452942, "logits/rejected": 0.9202344417572021, "logps/chosen": -0.7253485918045044, "logps/rejected": -1.4922535419464111, "loss": 0.8046, "odds_ratio_loss": 0.46386268734931946, "rewards/accuracies": 0.75, "rewards/chosen": -0.07253485172986984, "rewards/margins": 0.07669049501419067, "rewards/rejected": -0.1492253541946411, "sft_loss": 0.7253485918045044, "step": 174 }, { "epoch": 0.2530730296456978, "grad_norm": 3.9720902981170942, "learning_rate": 7.993219249400363e-06, "logits/chosen": 0.9836024045944214, "logits/rejected": 0.8066073060035706, "logps/chosen": -0.8062931299209595, "logps/rejected": -1.5868737697601318, "loss": 0.9287, "odds_ratio_loss": 0.5350762605667114, "rewards/accuracies": 0.75, "rewards/chosen": -0.08062931895256042, "rewards/margins": 0.07805806398391724, "rewards/rejected": -0.15868738293647766, "sft_loss": 0.8062931299209595, "step": 175 }, { "epoch": 0.2545191612436732, "grad_norm": 2.800439660609358, "learning_rate": 7.993037276743796e-06, "logits/chosen": 0.7656744122505188, "logits/rejected": 0.5632432699203491, "logps/chosen": -0.7139934301376343, "logps/rejected": -2.1812193393707275, "loss": 0.8189, "odds_ratio_loss": 0.3666784465312958, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07139935344457626, "rewards/margins": 0.14672258496284485, "rewards/rejected": -0.21812193095684052, "sft_loss": 0.7139934301376343, "step": 176 }, { "epoch": 0.2559652928416486, "grad_norm": 2.8937622627656037, "learning_rate": 7.992852896670184e-06, "logits/chosen": 0.9522703886032104, "logits/rejected": 0.8420579433441162, "logps/chosen": -1.0373575687408447, "logps/rejected": -1.0805912017822266, "loss": 0.9378, "odds_ratio_loss": 0.8096970319747925, "rewards/accuracies": 0.4375, "rewards/chosen": -0.10373575985431671, "rewards/margins": 0.0043233660981059074, "rewards/rejected": -0.1080591231584549, "sft_loss": 1.0373575687408447, "step": 177 }, { "epoch": 0.257411424439624, "grad_norm": 5.312025248990845, "learning_rate": 7.99266610929069e-06, "logits/chosen": 1.116875410079956, "logits/rejected": 0.8645721673965454, "logps/chosen": -0.8963755965232849, "logps/rejected": -1.5484733581542969, "loss": 0.7993, "odds_ratio_loss": 0.6069468259811401, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08963755518198013, "rewards/margins": 0.0652097761631012, "rewards/rejected": -0.15484732389450073, "sft_loss": 0.8963755965232849, "step": 178 }, { "epoch": 0.2588575560375994, "grad_norm": 4.10362786725861, "learning_rate": 7.992476914717928e-06, "logits/chosen": 0.9966639876365662, "logits/rejected": 0.9122011661529541, "logps/chosen": -0.7157900333404541, "logps/rejected": -1.8117486238479614, "loss": 0.8281, "odds_ratio_loss": 0.4310546815395355, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07157900184392929, "rewards/margins": 0.10959585756063461, "rewards/rejected": -0.1811748594045639, "sft_loss": 0.7157900333404541, "step": 179 }, { "epoch": 0.2603036876355748, "grad_norm": 6.129315995584233, "learning_rate": 7.992285313065964e-06, "logits/chosen": 0.8830561637878418, "logits/rejected": 0.8202065229415894, "logps/chosen": -0.9284470081329346, "logps/rejected": -1.648970603942871, "loss": 0.9707, "odds_ratio_loss": 0.7073562145233154, "rewards/accuracies": 0.5625, "rewards/chosen": -0.09284469485282898, "rewards/margins": 0.07205238938331604, "rewards/rejected": -0.16489706933498383, "sft_loss": 0.9284470081329346, "step": 180 }, { "epoch": 0.26174981923355023, "grad_norm": 3.8183141881814393, "learning_rate": 7.992091304450316e-06, "logits/chosen": 1.123767375946045, "logits/rejected": 0.7026354670524597, "logps/chosen": -0.8758699893951416, "logps/rejected": -2.0314223766326904, "loss": 0.7393, "odds_ratio_loss": 0.5088123083114624, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08758699893951416, "rewards/margins": 0.11555524170398712, "rewards/rejected": -0.20314225554466248, "sft_loss": 0.8758699893951416, "step": 181 }, { "epoch": 0.2631959508315257, "grad_norm": 3.181819086244578, "learning_rate": 7.991894888987954e-06, "logits/chosen": 0.8263757824897766, "logits/rejected": 0.6464723944664001, "logps/chosen": -0.6118509769439697, "logps/rejected": -1.7053591012954712, "loss": 0.7897, "odds_ratio_loss": 0.40566810965538025, "rewards/accuracies": 0.875, "rewards/chosen": -0.06118509918451309, "rewards/margins": 0.10935080051422119, "rewards/rejected": -0.17053590714931488, "sft_loss": 0.6118509769439697, "step": 182 }, { "epoch": 0.2646420824295011, "grad_norm": 2.7941883907120846, "learning_rate": 7.991696066797293e-06, "logits/chosen": 1.130903720855713, "logits/rejected": 0.7515724301338196, "logps/chosen": -0.7077884674072266, "logps/rejected": -1.0460158586502075, "loss": 0.8384, "odds_ratio_loss": 0.4854280948638916, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07077884674072266, "rewards/margins": 0.033822737634181976, "rewards/rejected": -0.10460158437490463, "sft_loss": 0.7077884674072266, "step": 183 }, { "epoch": 0.2660882140274765, "grad_norm": 3.922413376181467, "learning_rate": 7.991494837998209e-06, "logits/chosen": 0.867751955986023, "logits/rejected": 0.6655453443527222, "logps/chosen": -1.1125421524047852, "logps/rejected": -2.218876838684082, "loss": 0.9668, "odds_ratio_loss": 0.6620099544525146, "rewards/accuracies": 0.5625, "rewards/chosen": -0.11125421524047852, "rewards/margins": 0.1106334701180458, "rewards/rejected": -0.22188769280910492, "sft_loss": 1.1125421524047852, "step": 184 }, { "epoch": 0.2675343456254519, "grad_norm": 4.849556019250261, "learning_rate": 7.991291202712021e-06, "logits/chosen": 0.9399557113647461, "logits/rejected": 0.7142766714096069, "logps/chosen": -0.7886428833007812, "logps/rejected": -1.5420012474060059, "loss": 0.8707, "odds_ratio_loss": 0.5734947323799133, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07886429131031036, "rewards/margins": 0.07533583790063858, "rewards/rejected": -0.15420013666152954, "sft_loss": 0.7886428833007812, "step": 185 }, { "epoch": 0.26898047722342733, "grad_norm": 3.5359765528486298, "learning_rate": 7.991085161061502e-06, "logits/chosen": 0.9476644992828369, "logits/rejected": 0.8192164897918701, "logps/chosen": -0.6491098403930664, "logps/rejected": -1.3708826303482056, "loss": 0.8088, "odds_ratio_loss": 0.4414351284503937, "rewards/accuracies": 0.75, "rewards/chosen": -0.06491097807884216, "rewards/margins": 0.07217729091644287, "rewards/rejected": -0.13708826899528503, "sft_loss": 0.6491098403930664, "step": 186 }, { "epoch": 0.27042660882140274, "grad_norm": 3.5799876990998767, "learning_rate": 7.990876713170873e-06, "logits/chosen": 0.988240659236908, "logits/rejected": 0.8487038612365723, "logps/chosen": -0.7226072549819946, "logps/rejected": -1.360625982284546, "loss": 0.8501, "odds_ratio_loss": 0.538947343826294, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07226072996854782, "rewards/margins": 0.06380186975002289, "rewards/rejected": -0.1360626071691513, "sft_loss": 0.7226072549819946, "step": 187 }, { "epoch": 0.27187274041937814, "grad_norm": 2.952397694472701, "learning_rate": 7.990665859165812e-06, "logits/chosen": 0.9410344362258911, "logits/rejected": 0.8459163904190063, "logps/chosen": -0.8709695339202881, "logps/rejected": -1.0749214887619019, "loss": 0.8362, "odds_ratio_loss": 0.6080572009086609, "rewards/accuracies": 0.625, "rewards/chosen": -0.08709695935249329, "rewards/margins": 0.020395198836922646, "rewards/rejected": -0.10749214887619019, "sft_loss": 0.8709695339202881, "step": 188 }, { "epoch": 0.27331887201735355, "grad_norm": 3.0804169157983807, "learning_rate": 7.990452599173442e-06, "logits/chosen": 1.2123034000396729, "logits/rejected": 0.9961423873901367, "logps/chosen": -0.5125177502632141, "logps/rejected": -1.496875524520874, "loss": 0.862, "odds_ratio_loss": 0.3468582034111023, "rewards/accuracies": 0.8125, "rewards/chosen": -0.05125177651643753, "rewards/margins": 0.09843578189611435, "rewards/rejected": -0.14968755841255188, "sft_loss": 0.5125177502632141, "step": 189 }, { "epoch": 0.274765003615329, "grad_norm": 2.6526058029397577, "learning_rate": 7.990236933322337e-06, "logits/chosen": 0.8868533372879028, "logits/rejected": 0.6382319927215576, "logps/chosen": -0.8466877937316895, "logps/rejected": -1.705843448638916, "loss": 0.8327, "odds_ratio_loss": 0.407950222492218, "rewards/accuracies": 0.8125, "rewards/chosen": -0.08466878533363342, "rewards/margins": 0.08591558039188385, "rewards/rejected": -0.17058435082435608, "sft_loss": 0.8466877937316895, "step": 190 }, { "epoch": 0.2762111352133044, "grad_norm": 3.995125160090004, "learning_rate": 7.990018861742524e-06, "logits/chosen": 0.7121444940567017, "logits/rejected": 0.6306707859039307, "logps/chosen": -0.7519279718399048, "logps/rejected": -1.428523063659668, "loss": 0.8425, "odds_ratio_loss": 0.5476968288421631, "rewards/accuracies": 0.75, "rewards/chosen": -0.07519279420375824, "rewards/margins": 0.06765950471162796, "rewards/rejected": -0.1428523063659668, "sft_loss": 0.7519279718399048, "step": 191 }, { "epoch": 0.27765726681127983, "grad_norm": 3.632543289305855, "learning_rate": 7.989798384565478e-06, "logits/chosen": 0.7635389566421509, "logits/rejected": 0.72230064868927, "logps/chosen": -0.7457807064056396, "logps/rejected": -1.7796211242675781, "loss": 0.9186, "odds_ratio_loss": 0.4742932915687561, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07457807660102844, "rewards/margins": 0.10338405519723892, "rewards/rejected": -0.17796212434768677, "sft_loss": 0.7457807064056396, "step": 192 }, { "epoch": 0.27910339840925524, "grad_norm": 3.0980154406972362, "learning_rate": 7.989575501924127e-06, "logits/chosen": 0.8246858716011047, "logits/rejected": 0.5677071809768677, "logps/chosen": -0.7799168229103088, "logps/rejected": -1.816365122795105, "loss": 0.9227, "odds_ratio_loss": 0.5203930735588074, "rewards/accuracies": 0.625, "rewards/chosen": -0.07799168676137924, "rewards/margins": 0.10364483296871185, "rewards/rejected": -0.1816365271806717, "sft_loss": 0.7799168229103088, "step": 193 }, { "epoch": 0.28054953000723065, "grad_norm": 3.2989461799711544, "learning_rate": 7.989350213952848e-06, "logits/chosen": 0.9356462955474854, "logits/rejected": 0.629136323928833, "logps/chosen": -0.9007641673088074, "logps/rejected": -1.534816026687622, "loss": 0.899, "odds_ratio_loss": 0.549186646938324, "rewards/accuracies": 0.6875, "rewards/chosen": -0.09007642418146133, "rewards/margins": 0.06340517848730087, "rewards/rejected": -0.1534816026687622, "sft_loss": 0.9007641673088074, "step": 194 }, { "epoch": 0.28199566160520606, "grad_norm": 5.46581584343181, "learning_rate": 7.989122520787467e-06, "logits/chosen": 0.8825095891952515, "logits/rejected": 0.685544490814209, "logps/chosen": -1.0721142292022705, "logps/rejected": -1.4573118686676025, "loss": 0.9202, "odds_ratio_loss": 0.7042291164398193, "rewards/accuracies": 0.5625, "rewards/chosen": -0.10721142590045929, "rewards/margins": 0.038519762456417084, "rewards/rejected": -0.14573121070861816, "sft_loss": 1.0721142292022705, "step": 195 }, { "epoch": 0.28344179320318147, "grad_norm": 4.080158423120696, "learning_rate": 7.98889242256526e-06, "logits/chosen": 0.7583059668540955, "logits/rejected": 0.7805667519569397, "logps/chosen": -0.9448596239089966, "logps/rejected": -1.3803226947784424, "loss": 0.8473, "odds_ratio_loss": 0.5857207775115967, "rewards/accuracies": 0.625, "rewards/chosen": -0.09448596090078354, "rewards/margins": 0.04354630410671234, "rewards/rejected": -0.13803227245807648, "sft_loss": 0.9448596239089966, "step": 196 }, { "epoch": 0.2848879248011569, "grad_norm": 3.973133965232593, "learning_rate": 7.988659919424955e-06, "logits/chosen": 1.090093731880188, "logits/rejected": 0.8616019487380981, "logps/chosen": -0.7189903855323792, "logps/rejected": -1.4010587930679321, "loss": 0.7704, "odds_ratio_loss": 0.48269736766815186, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07189904153347015, "rewards/margins": 0.06820684671401978, "rewards/rejected": -0.14010588824748993, "sft_loss": 0.7189903855323792, "step": 197 }, { "epoch": 0.28633405639913234, "grad_norm": 5.238268762550638, "learning_rate": 7.988425011506729e-06, "logits/chosen": 0.8917097449302673, "logits/rejected": 0.768082857131958, "logps/chosen": -0.9902671575546265, "logps/rejected": -1.6686362028121948, "loss": 0.9195, "odds_ratio_loss": 0.5506667494773865, "rewards/accuracies": 0.75, "rewards/chosen": -0.09902672469615936, "rewards/margins": 0.06783689558506012, "rewards/rejected": -0.16686362028121948, "sft_loss": 0.9902671575546265, "step": 198 }, { "epoch": 0.28778018799710775, "grad_norm": 2.912157324642061, "learning_rate": 7.98818769895221e-06, "logits/chosen": 0.7989850044250488, "logits/rejected": 0.804793655872345, "logps/chosen": -0.7779511213302612, "logps/rejected": -1.3023908138275146, "loss": 0.8612, "odds_ratio_loss": 0.4791719913482666, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0777951180934906, "rewards/margins": 0.0524439737200737, "rewards/rejected": -0.1302390843629837, "sft_loss": 0.7779511213302612, "step": 199 }, { "epoch": 0.28922631959508316, "grad_norm": 2.5819984768363367, "learning_rate": 7.987947981904474e-06, "logits/chosen": 0.7809892892837524, "logits/rejected": 0.7020816802978516, "logps/chosen": -0.9111021161079407, "logps/rejected": -1.5197502374649048, "loss": 0.8977, "odds_ratio_loss": 0.5979732275009155, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0911102145910263, "rewards/margins": 0.06086481735110283, "rewards/rejected": -0.15197503566741943, "sft_loss": 0.9111021161079407, "step": 200 }, { "epoch": 0.29067245119305857, "grad_norm": 3.718047797689079, "learning_rate": 7.987705860508047e-06, "logits/chosen": 0.7863653898239136, "logits/rejected": 0.6933090090751648, "logps/chosen": -1.001847267150879, "logps/rejected": -1.0187654495239258, "loss": 0.9285, "odds_ratio_loss": 0.7681245803833008, "rewards/accuracies": 0.5625, "rewards/chosen": -0.10018472373485565, "rewards/margins": 0.0016918233595788479, "rewards/rejected": -0.10187654942274094, "sft_loss": 1.001847267150879, "step": 201 }, { "epoch": 0.292118582791034, "grad_norm": 3.2947050827622753, "learning_rate": 7.987461334908904e-06, "logits/chosen": 0.7659215927124023, "logits/rejected": 0.7039311528205872, "logps/chosen": -0.7966563105583191, "logps/rejected": -1.5445137023925781, "loss": 0.8617, "odds_ratio_loss": 0.47074514627456665, "rewards/accuracies": 0.875, "rewards/chosen": -0.0796656385064125, "rewards/margins": 0.0747857540845871, "rewards/rejected": -0.154451385140419, "sft_loss": 0.7966563105583191, "step": 202 }, { "epoch": 0.2935647143890094, "grad_norm": 2.750448225020009, "learning_rate": 7.98721440525447e-06, "logits/chosen": 0.6041221618652344, "logits/rejected": 0.571781575679779, "logps/chosen": -0.8432777523994446, "logps/rejected": -1.7520842552185059, "loss": 0.8824, "odds_ratio_loss": 0.4685782194137573, "rewards/accuracies": 0.75, "rewards/chosen": -0.08432777971029282, "rewards/margins": 0.09088063985109329, "rewards/rejected": -0.1752084195613861, "sft_loss": 0.8432777523994446, "step": 203 }, { "epoch": 0.2950108459869848, "grad_norm": 3.2302840293561395, "learning_rate": 7.986965071693625e-06, "logits/chosen": 0.8296738862991333, "logits/rejected": 0.7145066857337952, "logps/chosen": -0.9232079982757568, "logps/rejected": -1.2196871042251587, "loss": 0.9195, "odds_ratio_loss": 0.6474923491477966, "rewards/accuracies": 0.625, "rewards/chosen": -0.09232080727815628, "rewards/margins": 0.029647907242178917, "rewards/rejected": -0.12196871638298035, "sft_loss": 0.9232079982757568, "step": 204 }, { "epoch": 0.29645697758496026, "grad_norm": 3.1285063753356144, "learning_rate": 7.986713334376686e-06, "logits/chosen": 0.7811083793640137, "logits/rejected": 0.7227165102958679, "logps/chosen": -0.8592801094055176, "logps/rejected": -1.6300902366638184, "loss": 0.817, "odds_ratio_loss": 0.4264185428619385, "rewards/accuracies": 0.8125, "rewards/chosen": -0.08592800796031952, "rewards/margins": 0.07708100974559784, "rewards/rejected": -0.16300901770591736, "sft_loss": 0.8592801094055176, "step": 205 }, { "epoch": 0.29790310918293567, "grad_norm": 3.558090548670039, "learning_rate": 7.98645919345543e-06, "logits/chosen": 1.2186182737350464, "logits/rejected": 0.79095858335495, "logps/chosen": -0.8577883243560791, "logps/rejected": -1.7429773807525635, "loss": 0.9034, "odds_ratio_loss": 0.5151891112327576, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08577883243560791, "rewards/margins": 0.08851891756057739, "rewards/rejected": -0.1742977499961853, "sft_loss": 0.8577883243560791, "step": 206 }, { "epoch": 0.2993492407809111, "grad_norm": 2.9503574400099932, "learning_rate": 7.986202649083081e-06, "logits/chosen": 0.8998777270317078, "logits/rejected": 0.6546536087989807, "logps/chosen": -0.7888596057891846, "logps/rejected": -1.6412047147750854, "loss": 0.8684, "odds_ratio_loss": 0.5090416073799133, "rewards/accuracies": 0.5, "rewards/chosen": -0.07888597249984741, "rewards/margins": 0.08523451536893845, "rewards/rejected": -0.16412046551704407, "sft_loss": 0.7888596057891846, "step": 207 }, { "epoch": 0.3007953723788865, "grad_norm": 2.9896940410495003, "learning_rate": 7.985943701414308e-06, "logits/chosen": 0.8296674489974976, "logits/rejected": 0.6985807418823242, "logps/chosen": -0.7949290871620178, "logps/rejected": -1.3496679067611694, "loss": 0.9616, "odds_ratio_loss": 0.4149304926395416, "rewards/accuracies": 0.875, "rewards/chosen": -0.07949291169643402, "rewards/margins": 0.05547389015555382, "rewards/rejected": -0.13496679067611694, "sft_loss": 0.7949290871620178, "step": 208 }, { "epoch": 0.3022415039768619, "grad_norm": 2.8436164762025062, "learning_rate": 7.98568235060523e-06, "logits/chosen": 0.9186201095581055, "logits/rejected": 0.775266706943512, "logps/chosen": -0.7632952332496643, "logps/rejected": -1.2405225038528442, "loss": 0.8446, "odds_ratio_loss": 0.6041321158409119, "rewards/accuracies": 0.75, "rewards/chosen": -0.0763295367360115, "rewards/margins": 0.047722719609737396, "rewards/rejected": -0.1240522563457489, "sft_loss": 0.7632952332496643, "step": 209 }, { "epoch": 0.3036876355748373, "grad_norm": 3.2744747177577493, "learning_rate": 7.98541859681342e-06, "logits/chosen": 0.8311449289321899, "logits/rejected": 0.6091936826705933, "logps/chosen": -0.8429104685783386, "logps/rejected": -1.8527532815933228, "loss": 0.885, "odds_ratio_loss": 0.5193464756011963, "rewards/accuracies": 0.75, "rewards/chosen": -0.08429104834794998, "rewards/margins": 0.10098428279161453, "rewards/rejected": -0.18527531623840332, "sft_loss": 0.8429104685783386, "step": 210 }, { "epoch": 0.3051337671728127, "grad_norm": 3.581434079691926, "learning_rate": 7.985152440197896e-06, "logits/chosen": 0.924926221370697, "logits/rejected": 0.681089460849762, "logps/chosen": -0.7374269366264343, "logps/rejected": -1.5732078552246094, "loss": 0.8093, "odds_ratio_loss": 0.4177606701850891, "rewards/accuracies": 0.875, "rewards/chosen": -0.07374269515275955, "rewards/margins": 0.08357809484004974, "rewards/rejected": -0.1573207825422287, "sft_loss": 0.7374269366264343, "step": 211 }, { "epoch": 0.3065798987707881, "grad_norm": 4.3947384054323395, "learning_rate": 7.984883880919123e-06, "logits/chosen": 0.8097667694091797, "logits/rejected": 0.7048723697662354, "logps/chosen": -0.8589484691619873, "logps/rejected": -1.5178613662719727, "loss": 0.8306, "odds_ratio_loss": 0.5315617918968201, "rewards/accuracies": 0.75, "rewards/chosen": -0.08589484542608261, "rewards/margins": 0.06589128822088242, "rewards/rejected": -0.15178614854812622, "sft_loss": 0.8589484691619873, "step": 212 }, { "epoch": 0.3080260303687636, "grad_norm": 3.1907573370753695, "learning_rate": 7.984612919139015e-06, "logits/chosen": 0.6455193758010864, "logits/rejected": 0.5276200771331787, "logps/chosen": -0.7528071403503418, "logps/rejected": -1.862764596939087, "loss": 0.7911, "odds_ratio_loss": 0.4871300458908081, "rewards/accuracies": 0.75, "rewards/chosen": -0.07528071105480194, "rewards/margins": 0.11099573969841003, "rewards/rejected": -0.18627645075321198, "sft_loss": 0.7528071403503418, "step": 213 }, { "epoch": 0.309472161966739, "grad_norm": 3.0906834831413805, "learning_rate": 7.98433955502094e-06, "logits/chosen": 1.0261489152908325, "logits/rejected": 0.9212863445281982, "logps/chosen": -0.7404288053512573, "logps/rejected": -1.2891268730163574, "loss": 0.9422, "odds_ratio_loss": 0.729023814201355, "rewards/accuracies": 0.4375, "rewards/chosen": -0.07404288649559021, "rewards/margins": 0.054869815707206726, "rewards/rejected": -0.12891268730163574, "sft_loss": 0.7404288053512573, "step": 214 }, { "epoch": 0.3109182935647144, "grad_norm": 3.465475173558746, "learning_rate": 7.984063788729707e-06, "logits/chosen": 0.9017759561538696, "logits/rejected": 0.6762468814849854, "logps/chosen": -0.8559285402297974, "logps/rejected": -1.2730680704116821, "loss": 0.9338, "odds_ratio_loss": 0.6160497069358826, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0855928584933281, "rewards/margins": 0.041713956743478775, "rewards/rejected": -0.12730681896209717, "sft_loss": 0.8559285402297974, "step": 215 }, { "epoch": 0.3123644251626898, "grad_norm": 4.054453016381701, "learning_rate": 7.983785620431576e-06, "logits/chosen": 0.8824461102485657, "logits/rejected": 0.7006586790084839, "logps/chosen": -0.9869768619537354, "logps/rejected": -1.6781079769134521, "loss": 0.9413, "odds_ratio_loss": 0.609472393989563, "rewards/accuracies": 0.6875, "rewards/chosen": -0.09869769215583801, "rewards/margins": 0.0691131055355072, "rewards/rejected": -0.16781079769134521, "sft_loss": 0.9869768619537354, "step": 216 }, { "epoch": 0.3138105567606652, "grad_norm": 3.3550744195656486, "learning_rate": 7.98350505029426e-06, "logits/chosen": 0.831702470779419, "logits/rejected": 0.6789094805717468, "logps/chosen": -0.8098732233047485, "logps/rejected": -1.5542247295379639, "loss": 0.8176, "odds_ratio_loss": 0.5694811344146729, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0809873417019844, "rewards/margins": 0.07443515211343765, "rewards/rejected": -0.15542249381542206, "sft_loss": 0.8098732233047485, "step": 217 }, { "epoch": 0.3152566883586406, "grad_norm": 4.018954232761074, "learning_rate": 7.983222078486912e-06, "logits/chosen": 0.7321873903274536, "logits/rejected": 0.686294674873352, "logps/chosen": -0.7930043339729309, "logps/rejected": -1.2628358602523804, "loss": 0.8478, "odds_ratio_loss": 0.5880774259567261, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07930043339729309, "rewards/margins": 0.046983152627944946, "rewards/rejected": -0.12628358602523804, "sft_loss": 0.7930043339729309, "step": 218 }, { "epoch": 0.31670281995661603, "grad_norm": 2.788787607391925, "learning_rate": 7.982936705180138e-06, "logits/chosen": 0.8820023536682129, "logits/rejected": 0.6825209259986877, "logps/chosen": -0.6613825559616089, "logps/rejected": -1.4193311929702759, "loss": 0.7987, "odds_ratio_loss": 0.49998748302459717, "rewards/accuracies": 0.6875, "rewards/chosen": -0.06613826006650925, "rewards/margins": 0.07579487562179565, "rewards/rejected": -0.1419331282377243, "sft_loss": 0.6613825559616089, "step": 219 }, { "epoch": 0.31814895155459144, "grad_norm": 2.4168572760464793, "learning_rate": 7.98264893054599e-06, "logits/chosen": 0.8746920824050903, "logits/rejected": 0.6152170896530151, "logps/chosen": -0.8145027160644531, "logps/rejected": -1.6139273643493652, "loss": 0.812, "odds_ratio_loss": 0.5231243968009949, "rewards/accuracies": 0.75, "rewards/chosen": -0.08145026862621307, "rewards/margins": 0.07994245737791061, "rewards/rejected": -0.16139273345470428, "sft_loss": 0.8145027160644531, "step": 220 }, { "epoch": 0.3195950831525669, "grad_norm": 6.559029582787577, "learning_rate": 7.98235875475797e-06, "logits/chosen": 0.687819242477417, "logits/rejected": 0.5400627851486206, "logps/chosen": -0.811687171459198, "logps/rejected": -1.8423746824264526, "loss": 0.9222, "odds_ratio_loss": 0.4002645015716553, "rewards/accuracies": 0.875, "rewards/chosen": -0.08116871118545532, "rewards/margins": 0.1030687540769577, "rewards/rejected": -0.18423748016357422, "sft_loss": 0.811687171459198, "step": 221 }, { "epoch": 0.3210412147505423, "grad_norm": 4.26878072464604, "learning_rate": 7.982066177991022e-06, "logits/chosen": 0.7749350070953369, "logits/rejected": 0.6633896231651306, "logps/chosen": -0.8835655450820923, "logps/rejected": -1.1134123802185059, "loss": 0.9247, "odds_ratio_loss": 0.8716833591461182, "rewards/accuracies": 0.4375, "rewards/chosen": -0.08835654705762863, "rewards/margins": 0.02298467978835106, "rewards/rejected": -0.11134123802185059, "sft_loss": 0.8835655450820923, "step": 222 }, { "epoch": 0.3224873463485177, "grad_norm": 3.856415488368599, "learning_rate": 7.981771200421547e-06, "logits/chosen": 0.8121925592422485, "logits/rejected": 0.5447680354118347, "logps/chosen": -0.7949036359786987, "logps/rejected": -1.3006750345230103, "loss": 0.8606, "odds_ratio_loss": 0.5626305341720581, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07949037104845047, "rewards/margins": 0.05057713761925697, "rewards/rejected": -0.13006749749183655, "sft_loss": 0.7949036359786987, "step": 223 }, { "epoch": 0.32393347794649313, "grad_norm": 3.195463927964763, "learning_rate": 7.981473822227383e-06, "logits/chosen": 1.0078376531600952, "logits/rejected": 0.8154879212379456, "logps/chosen": -0.8676646947860718, "logps/rejected": -1.233111023902893, "loss": 0.8813, "odds_ratio_loss": 0.6121217608451843, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08676647394895554, "rewards/margins": 0.03654462844133377, "rewards/rejected": -0.1233111023902893, "sft_loss": 0.8676646947860718, "step": 224 }, { "epoch": 0.32537960954446854, "grad_norm": 5.3146650131117665, "learning_rate": 7.981174043587826e-06, "logits/chosen": 0.5916678309440613, "logits/rejected": 0.46369603276252747, "logps/chosen": -1.0442885160446167, "logps/rejected": -1.6847314834594727, "loss": 0.9145, "odds_ratio_loss": 0.5118202567100525, "rewards/accuracies": 0.75, "rewards/chosen": -0.10442885756492615, "rewards/margins": 0.06404431909322739, "rewards/rejected": -0.16847318410873413, "sft_loss": 1.0442885160446167, "step": 225 }, { "epoch": 0.32682574114244395, "grad_norm": 3.0340595722346992, "learning_rate": 7.98087186468361e-06, "logits/chosen": 0.6914336085319519, "logits/rejected": 0.5757249593734741, "logps/chosen": -0.9663025140762329, "logps/rejected": -0.9450004696846008, "loss": 0.9079, "odds_ratio_loss": 0.7738305926322937, "rewards/accuracies": 0.4375, "rewards/chosen": -0.09663024544715881, "rewards/margins": -0.002130193170160055, "rewards/rejected": -0.09450005739927292, "sft_loss": 0.9663025140762329, "step": 226 }, { "epoch": 0.32827187274041936, "grad_norm": 2.8726087856467557, "learning_rate": 7.98056728569692e-06, "logits/chosen": 0.8992648720741272, "logits/rejected": 0.8230560421943665, "logps/chosen": -0.772228479385376, "logps/rejected": -1.0922561883926392, "loss": 0.8579, "odds_ratio_loss": 0.5426236391067505, "rewards/accuracies": 0.75, "rewards/chosen": -0.07722284644842148, "rewards/margins": 0.0320027731359005, "rewards/rejected": -0.10922562330961227, "sft_loss": 0.772228479385376, "step": 227 }, { "epoch": 0.3297180043383948, "grad_norm": 3.7331750453817865, "learning_rate": 7.980260306811388e-06, "logits/chosen": 0.820920467376709, "logits/rejected": 0.6605263352394104, "logps/chosen": -0.8334155082702637, "logps/rejected": -1.178484559059143, "loss": 0.8927, "odds_ratio_loss": 0.5583294630050659, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0833415538072586, "rewards/margins": 0.0345069095492363, "rewards/rejected": -0.11784844845533371, "sft_loss": 0.8334155082702637, "step": 228 }, { "epoch": 0.33116413593637023, "grad_norm": 2.999700585683244, "learning_rate": 7.979950928212092e-06, "logits/chosen": 0.7122136354446411, "logits/rejected": 0.5927017331123352, "logps/chosen": -0.9525954723358154, "logps/rejected": -1.6945300102233887, "loss": 0.8914, "odds_ratio_loss": 0.6651919484138489, "rewards/accuracies": 0.4375, "rewards/chosen": -0.09525954723358154, "rewards/margins": 0.07419347018003464, "rewards/rejected": -0.16945302486419678, "sft_loss": 0.9525954723358154, "step": 229 }, { "epoch": 0.33261026753434564, "grad_norm": 3.173819627640913, "learning_rate": 7.97963915008556e-06, "logits/chosen": 0.8333603143692017, "logits/rejected": 0.542677640914917, "logps/chosen": -0.7230704426765442, "logps/rejected": -1.862441062927246, "loss": 0.8663, "odds_ratio_loss": 0.5365059971809387, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0723070502281189, "rewards/margins": 0.11393707245588303, "rewards/rejected": -0.18624411523342133, "sft_loss": 0.7230704426765442, "step": 230 }, { "epoch": 0.33405639913232105, "grad_norm": 7.7397925754093855, "learning_rate": 7.979324972619762e-06, "logits/chosen": 0.690934419631958, "logits/rejected": 0.5235450863838196, "logps/chosen": -0.7463875412940979, "logps/rejected": -1.8125139474868774, "loss": 0.818, "odds_ratio_loss": 0.42135879397392273, "rewards/accuracies": 0.75, "rewards/chosen": -0.07463876157999039, "rewards/margins": 0.10661264508962631, "rewards/rejected": -0.1812514215707779, "sft_loss": 0.7463875412940979, "step": 231 }, { "epoch": 0.33550253073029646, "grad_norm": 13.429401940993012, "learning_rate": 7.979008396004118e-06, "logits/chosen": 0.7407370805740356, "logits/rejected": 0.5966250896453857, "logps/chosen": -0.9655992388725281, "logps/rejected": -1.7845059633255005, "loss": 0.8857, "odds_ratio_loss": 0.49398142099380493, "rewards/accuracies": 0.6875, "rewards/chosen": -0.09655991941690445, "rewards/margins": 0.08189067989587784, "rewards/rejected": -0.17845061421394348, "sft_loss": 0.9655992388725281, "step": 232 }, { "epoch": 0.33694866232827186, "grad_norm": 2.784215244397478, "learning_rate": 7.978689420429491e-06, "logits/chosen": 0.784191370010376, "logits/rejected": 0.6294467449188232, "logps/chosen": -0.8379493951797485, "logps/rejected": -1.453858494758606, "loss": 0.7912, "odds_ratio_loss": 0.5905799865722656, "rewards/accuracies": 0.625, "rewards/chosen": -0.08379494398832321, "rewards/margins": 0.06159090995788574, "rewards/rejected": -0.14538586139678955, "sft_loss": 0.8379493951797485, "step": 233 }, { "epoch": 0.3383947939262473, "grad_norm": 3.809142821856424, "learning_rate": 7.978368046088197e-06, "logits/chosen": 0.7262183427810669, "logits/rejected": 0.6582277417182922, "logps/chosen": -1.0824716091156006, "logps/rejected": -1.1397984027862549, "loss": 0.9569, "odds_ratio_loss": 0.7705241441726685, "rewards/accuracies": 0.5, "rewards/chosen": -0.10824716836214066, "rewards/margins": 0.0057326690293848515, "rewards/rejected": -0.11397983133792877, "sft_loss": 1.0824716091156006, "step": 234 }, { "epoch": 0.3398409255242227, "grad_norm": 3.0346959899246713, "learning_rate": 7.978044273173988e-06, "logits/chosen": 0.8044057488441467, "logits/rejected": 0.6709400415420532, "logps/chosen": -0.8678100109100342, "logps/rejected": -1.0254902839660645, "loss": 0.852, "odds_ratio_loss": 0.7025970816612244, "rewards/accuracies": 0.625, "rewards/chosen": -0.08678101003170013, "rewards/margins": 0.015768028795719147, "rewards/rejected": -0.10254903137683868, "sft_loss": 0.8678100109100342, "step": 235 }, { "epoch": 0.34128705712219815, "grad_norm": 2.7268429100393785, "learning_rate": 7.977718101882074e-06, "logits/chosen": 0.8474991321563721, "logits/rejected": 0.6978375911712646, "logps/chosen": -0.6163941025733948, "logps/rejected": -1.8094571828842163, "loss": 0.8349, "odds_ratio_loss": 0.525054931640625, "rewards/accuracies": 0.625, "rewards/chosen": -0.06163940951228142, "rewards/margins": 0.11930631101131439, "rewards/rejected": -0.18094570934772491, "sft_loss": 0.6163941025733948, "step": 236 }, { "epoch": 0.34273318872017355, "grad_norm": 3.5101613560433056, "learning_rate": 7.977389532409099e-06, "logits/chosen": 0.6671175956726074, "logits/rejected": 0.49634212255477905, "logps/chosen": -0.7292395234107971, "logps/rejected": -1.6196175813674927, "loss": 0.8755, "odds_ratio_loss": 0.4010503888130188, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07292395830154419, "rewards/margins": 0.08903781324625015, "rewards/rejected": -0.16196177899837494, "sft_loss": 0.7292395234107971, "step": 237 }, { "epoch": 0.34417932031814896, "grad_norm": 2.803002663496337, "learning_rate": 7.977058564953163e-06, "logits/chosen": 0.8379513025283813, "logits/rejected": 0.715684175491333, "logps/chosen": -0.8583624362945557, "logps/rejected": -1.6935725212097168, "loss": 0.8992, "odds_ratio_loss": 0.5625948905944824, "rewards/accuracies": 0.625, "rewards/chosen": -0.0858362466096878, "rewards/margins": 0.08352100849151611, "rewards/rejected": -0.1693572700023651, "sft_loss": 0.8583624362945557, "step": 238 }, { "epoch": 0.34562545191612437, "grad_norm": 4.167844335180785, "learning_rate": 7.976725199713806e-06, "logits/chosen": 0.899992048740387, "logits/rejected": 0.6711959838867188, "logps/chosen": -0.7251548767089844, "logps/rejected": -1.621293306350708, "loss": 0.9368, "odds_ratio_loss": 0.5080724954605103, "rewards/accuracies": 0.75, "rewards/chosen": -0.07251548767089844, "rewards/margins": 0.08961383998394012, "rewards/rejected": -0.16212932765483856, "sft_loss": 0.7251548767089844, "step": 239 }, { "epoch": 0.3470715835140998, "grad_norm": 3.35087123326394, "learning_rate": 7.976389436892015e-06, "logits/chosen": 0.7564694881439209, "logits/rejected": 0.6249856948852539, "logps/chosen": -0.8904150128364563, "logps/rejected": -1.1712754964828491, "loss": 0.8692, "odds_ratio_loss": 0.5690667033195496, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08904150128364563, "rewards/margins": 0.02808605134487152, "rewards/rejected": -0.11712755262851715, "sft_loss": 0.8904150128364563, "step": 240 }, { "epoch": 0.3485177151120752, "grad_norm": 2.6406013197038387, "learning_rate": 7.976051276690223e-06, "logits/chosen": 0.6425853967666626, "logits/rejected": 0.5586930513381958, "logps/chosen": -0.7436127662658691, "logps/rejected": -2.275128126144409, "loss": 0.8125, "odds_ratio_loss": 0.403175950050354, "rewards/accuracies": 0.75, "rewards/chosen": -0.07436127960681915, "rewards/margins": 0.15315154194831848, "rewards/rejected": -0.22751282155513763, "sft_loss": 0.7436127662658691, "step": 241 }, { "epoch": 0.3499638467100506, "grad_norm": 4.369279972922537, "learning_rate": 7.975710719312306e-06, "logits/chosen": 0.6873583793640137, "logits/rejected": 0.5604276657104492, "logps/chosen": -0.7742418646812439, "logps/rejected": -1.7675644159317017, "loss": 0.895, "odds_ratio_loss": 0.5378227829933167, "rewards/accuracies": 0.625, "rewards/chosen": -0.07742418348789215, "rewards/margins": 0.09933225810527802, "rewards/rejected": -0.17675642669200897, "sft_loss": 0.7742418646812439, "step": 242 }, { "epoch": 0.351409978308026, "grad_norm": 3.4169304241180485, "learning_rate": 7.975367764963591e-06, "logits/chosen": 0.9993571043014526, "logits/rejected": 0.8231282234191895, "logps/chosen": -0.8939650058746338, "logps/rejected": -1.2862818241119385, "loss": 0.9304, "odds_ratio_loss": 0.6970391869544983, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08939650654792786, "rewards/margins": 0.039231669157743454, "rewards/rejected": -0.12862816452980042, "sft_loss": 0.8939650058746338, "step": 243 }, { "epoch": 0.35285610990600147, "grad_norm": 3.577479393127468, "learning_rate": 7.975022413850844e-06, "logits/chosen": 0.7503065466880798, "logits/rejected": 0.5771316289901733, "logps/chosen": -0.7733768224716187, "logps/rejected": -1.5073132514953613, "loss": 0.8007, "odds_ratio_loss": 0.4124954044818878, "rewards/accuracies": 0.9375, "rewards/chosen": -0.07733768224716187, "rewards/margins": 0.07339364290237427, "rewards/rejected": -0.15073132514953613, "sft_loss": 0.7733768224716187, "step": 244 }, { "epoch": 0.3543022415039769, "grad_norm": 3.836112525892133, "learning_rate": 7.974674666182281e-06, "logits/chosen": 0.8848112225532532, "logits/rejected": 0.8300377130508423, "logps/chosen": -0.752465546131134, "logps/rejected": -2.1321396827697754, "loss": 0.8716, "odds_ratio_loss": 0.5637364983558655, "rewards/accuracies": 0.625, "rewards/chosen": -0.07524655759334564, "rewards/margins": 0.13796743750572205, "rewards/rejected": -0.2132139950990677, "sft_loss": 0.752465546131134, "step": 245 }, { "epoch": 0.3557483731019523, "grad_norm": 3.1277493586154126, "learning_rate": 7.974324522167557e-06, "logits/chosen": 0.7861306667327881, "logits/rejected": 0.615057647228241, "logps/chosen": -0.9151326417922974, "logps/rejected": -1.5386865139007568, "loss": 0.8864, "odds_ratio_loss": 0.5509360432624817, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0915132611989975, "rewards/margins": 0.062355391681194305, "rewards/rejected": -0.1538686603307724, "sft_loss": 0.9151326417922974, "step": 246 }, { "epoch": 0.3571945046999277, "grad_norm": 3.506251057509387, "learning_rate": 7.973971982017775e-06, "logits/chosen": 0.8024786710739136, "logits/rejected": 0.5016666054725647, "logps/chosen": -0.6327417492866516, "logps/rejected": -1.9561928510665894, "loss": 0.8688, "odds_ratio_loss": 0.44218164682388306, "rewards/accuracies": 0.625, "rewards/chosen": -0.06327417492866516, "rewards/margins": 0.13234511017799377, "rewards/rejected": -0.19561928510665894, "sft_loss": 0.6327417492866516, "step": 247 }, { "epoch": 0.3586406362979031, "grad_norm": 2.6104103628429796, "learning_rate": 7.973617045945487e-06, "logits/chosen": 0.8094074726104736, "logits/rejected": 0.5848209857940674, "logps/chosen": -0.778999388217926, "logps/rejected": -1.5029215812683105, "loss": 0.8303, "odds_ratio_loss": 0.6139599084854126, "rewards/accuracies": 0.4375, "rewards/chosen": -0.07789994776248932, "rewards/margins": 0.07239222526550293, "rewards/rejected": -0.15029215812683105, "sft_loss": 0.778999388217926, "step": 248 }, { "epoch": 0.3600867678958785, "grad_norm": 3.8613735777872993, "learning_rate": 7.97325971416468e-06, "logits/chosen": 0.8423810601234436, "logits/rejected": 0.6365971565246582, "logps/chosen": -0.768801748752594, "logps/rejected": -2.2783043384552, "loss": 0.8586, "odds_ratio_loss": 0.37025052309036255, "rewards/accuracies": 0.875, "rewards/chosen": -0.07688017189502716, "rewards/margins": 0.15095025300979614, "rewards/rejected": -0.2278304398059845, "sft_loss": 0.768801748752594, "step": 249 }, { "epoch": 0.3615328994938539, "grad_norm": 5.012643427887363, "learning_rate": 7.972899986890796e-06, "logits/chosen": 0.7972968220710754, "logits/rejected": 0.7219531536102295, "logps/chosen": -1.2234079837799072, "logps/rejected": -1.1467794179916382, "loss": 1.0785, "odds_ratio_loss": 0.8847633600234985, "rewards/accuracies": 0.4375, "rewards/chosen": -0.12234079837799072, "rewards/margins": -0.007662854623049498, "rewards/rejected": -0.11467794328927994, "sft_loss": 1.2234079837799072, "step": 250 }, { "epoch": 0.36297903109182933, "grad_norm": 4.279210626470339, "learning_rate": 7.972537864340714e-06, "logits/chosen": 0.9484211206436157, "logits/rejected": 0.7891027331352234, "logps/chosen": -0.8660564422607422, "logps/rejected": -1.464170217514038, "loss": 0.8468, "odds_ratio_loss": 0.6933861970901489, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08660565316677094, "rewards/margins": 0.059811390936374664, "rewards/rejected": -0.146417036652565, "sft_loss": 0.8660564422607422, "step": 251 }, { "epoch": 0.3644251626898048, "grad_norm": 3.471495045207052, "learning_rate": 7.972173346732755e-06, "logits/chosen": 0.7789313793182373, "logits/rejected": 0.7130517959594727, "logps/chosen": -0.9272181391716003, "logps/rejected": -1.905896782875061, "loss": 0.8858, "odds_ratio_loss": 0.5296302437782288, "rewards/accuracies": 0.5625, "rewards/chosen": -0.09272181242704391, "rewards/margins": 0.09786785393953323, "rewards/rejected": -0.19058966636657715, "sft_loss": 0.9272181391716003, "step": 252 }, { "epoch": 0.3658712942877802, "grad_norm": 10.562055110820904, "learning_rate": 7.971806434286693e-06, "logits/chosen": 0.9116954803466797, "logits/rejected": 0.7316747307777405, "logps/chosen": -0.6885159015655518, "logps/rejected": -2.1187455654144287, "loss": 0.8137, "odds_ratio_loss": 0.40810999274253845, "rewards/accuracies": 0.6875, "rewards/chosen": -0.06885159015655518, "rewards/margins": 0.14302296936511993, "rewards/rejected": -0.21187454462051392, "sft_loss": 0.6885159015655518, "step": 253 }, { "epoch": 0.3673174258857556, "grad_norm": 2.417811083637075, "learning_rate": 7.97143712722374e-06, "logits/chosen": 0.7554264664649963, "logits/rejected": 0.5283613204956055, "logps/chosen": -0.8525446653366089, "logps/rejected": -1.62563157081604, "loss": 0.9782, "odds_ratio_loss": 0.5037074089050293, "rewards/accuracies": 0.75, "rewards/chosen": -0.08525446802377701, "rewards/margins": 0.07730869948863983, "rewards/rejected": -0.16256316006183624, "sft_loss": 0.8525446653366089, "step": 254 }, { "epoch": 0.368763557483731, "grad_norm": 2.4927832005447828, "learning_rate": 7.97106542576655e-06, "logits/chosen": 0.8163594007492065, "logits/rejected": 0.5682124495506287, "logps/chosen": -0.6700804829597473, "logps/rejected": -2.2717583179473877, "loss": 0.8972, "odds_ratio_loss": 0.33437198400497437, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06700804829597473, "rewards/margins": 0.16016778349876404, "rewards/rejected": -0.22717583179473877, "sft_loss": 0.6700804829597473, "step": 255 }, { "epoch": 0.37020968908170643, "grad_norm": 2.980750378676755, "learning_rate": 7.970691330139226e-06, "logits/chosen": 0.7557944655418396, "logits/rejected": 0.6307120323181152, "logps/chosen": -0.8848236203193665, "logps/rejected": -0.9924321174621582, "loss": 0.871, "odds_ratio_loss": 0.7175642251968384, "rewards/accuracies": 0.5, "rewards/chosen": -0.08848236501216888, "rewards/margins": 0.010760847479104996, "rewards/rejected": -0.09924320876598358, "sft_loss": 0.8848236203193665, "step": 256 }, { "epoch": 0.37165582067968184, "grad_norm": 4.391248806591652, "learning_rate": 7.97031484056731e-06, "logits/chosen": 0.7138476371765137, "logits/rejected": 0.6476942300796509, "logps/chosen": -0.7389509677886963, "logps/rejected": -1.5851773023605347, "loss": 0.864, "odds_ratio_loss": 0.41973942518234253, "rewards/accuracies": 0.875, "rewards/chosen": -0.07389508932828903, "rewards/margins": 0.08462263643741608, "rewards/rejected": -0.1585177183151245, "sft_loss": 0.7389509677886963, "step": 257 }, { "epoch": 0.37310195227765725, "grad_norm": 3.3143549158128818, "learning_rate": 7.96993595727779e-06, "logits/chosen": 0.6371474266052246, "logits/rejected": 0.5004527568817139, "logps/chosen": -0.7199169397354126, "logps/rejected": -2.65303111076355, "loss": 0.8483, "odds_ratio_loss": 0.44243472814559937, "rewards/accuracies": 0.75, "rewards/chosen": -0.0719916895031929, "rewards/margins": 0.1933114230632782, "rewards/rejected": -0.2653031349182129, "sft_loss": 0.7199169397354126, "step": 258 }, { "epoch": 0.3745480838756327, "grad_norm": 3.1321180531981687, "learning_rate": 7.969554680499097e-06, "logits/chosen": 0.6553201675415039, "logits/rejected": 0.6746339797973633, "logps/chosen": -0.8668511509895325, "logps/rejected": -1.6837364435195923, "loss": 0.9129, "odds_ratio_loss": 0.5618002414703369, "rewards/accuracies": 0.5, "rewards/chosen": -0.08668512105941772, "rewards/margins": 0.0816885307431221, "rewards/rejected": -0.16837365925312042, "sft_loss": 0.8668511509895325, "step": 259 }, { "epoch": 0.3759942154736081, "grad_norm": 7.57803343896559, "learning_rate": 7.969171010461101e-06, "logits/chosen": 0.6894788146018982, "logits/rejected": 0.5525285005569458, "logps/chosen": -0.7234008312225342, "logps/rejected": -1.6714537143707275, "loss": 0.7727, "odds_ratio_loss": 0.38769158720970154, "rewards/accuracies": 0.75, "rewards/chosen": -0.07234008610248566, "rewards/margins": 0.09480530768632889, "rewards/rejected": -0.16714540123939514, "sft_loss": 0.7234008312225342, "step": 260 }, { "epoch": 0.3774403470715835, "grad_norm": 3.0196548162625976, "learning_rate": 7.968784947395122e-06, "logits/chosen": 0.7908775210380554, "logits/rejected": 0.5918086171150208, "logps/chosen": -1.0163339376449585, "logps/rejected": -1.382534146308899, "loss": 0.9061, "odds_ratio_loss": 0.7340130805969238, "rewards/accuracies": 0.5, "rewards/chosen": -0.10163339227437973, "rewards/margins": 0.03662002459168434, "rewards/rejected": -0.13825342059135437, "sft_loss": 1.0163339376449585, "step": 261 }, { "epoch": 0.37888647866955893, "grad_norm": 4.090776023213169, "learning_rate": 7.968396491533914e-06, "logits/chosen": 0.7099351286888123, "logits/rejected": 0.5062960982322693, "logps/chosen": -1.0517055988311768, "logps/rejected": -1.2568538188934326, "loss": 0.9126, "odds_ratio_loss": 0.7793716788291931, "rewards/accuracies": 0.375, "rewards/chosen": -0.10517056286334991, "rewards/margins": 0.020514825358986855, "rewards/rejected": -0.12568539381027222, "sft_loss": 1.0517055988311768, "step": 262 }, { "epoch": 0.38033261026753434, "grad_norm": 4.6262771091993224, "learning_rate": 7.968005643111684e-06, "logits/chosen": 0.7514022588729858, "logits/rejected": 0.7052969336509705, "logps/chosen": -0.9725602865219116, "logps/rejected": -2.683314323425293, "loss": 0.898, "odds_ratio_loss": 0.6760257482528687, "rewards/accuracies": 0.4375, "rewards/chosen": -0.09725602716207504, "rewards/margins": 0.17107540369033813, "rewards/rejected": -0.2683314383029938, "sft_loss": 0.9725602865219116, "step": 263 }, { "epoch": 0.38177874186550975, "grad_norm": 3.091384295153508, "learning_rate": 7.967612402364071e-06, "logits/chosen": 0.7682317495346069, "logits/rejected": 0.7247217297554016, "logps/chosen": -0.8237577676773071, "logps/rejected": -1.758196234703064, "loss": 0.8667, "odds_ratio_loss": 0.7248599529266357, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08237577974796295, "rewards/margins": 0.0934438556432724, "rewards/rejected": -0.17581963539123535, "sft_loss": 0.8237577676773071, "step": 264 }, { "epoch": 0.38322487346348516, "grad_norm": 2.6477759807391825, "learning_rate": 7.967216769528166e-06, "logits/chosen": 0.6989853382110596, "logits/rejected": 0.5311872959136963, "logps/chosen": -0.8110822439193726, "logps/rejected": -2.2039005756378174, "loss": 0.9244, "odds_ratio_loss": 0.5130632519721985, "rewards/accuracies": 0.625, "rewards/chosen": -0.0811082199215889, "rewards/margins": 0.13928183913230896, "rewards/rejected": -0.22039005160331726, "sft_loss": 0.8110822439193726, "step": 265 }, { "epoch": 0.38467100506146057, "grad_norm": 3.867013203125484, "learning_rate": 7.966818744842494e-06, "logits/chosen": 0.8495751023292542, "logits/rejected": 0.74996018409729, "logps/chosen": -0.7023511528968811, "logps/rejected": -1.9245699644088745, "loss": 0.8556, "odds_ratio_loss": 0.46690189838409424, "rewards/accuracies": 0.75, "rewards/chosen": -0.07023511826992035, "rewards/margins": 0.12222187966108322, "rewards/rejected": -0.19245699048042297, "sft_loss": 0.7023511528968811, "step": 266 }, { "epoch": 0.38611713665943603, "grad_norm": 7.385204554151738, "learning_rate": 7.966418328547026e-06, "logits/chosen": 0.821699857711792, "logits/rejected": 0.6689359545707703, "logps/chosen": -0.5747587084770203, "logps/rejected": -1.5719361305236816, "loss": 0.9209, "odds_ratio_loss": 0.5139928460121155, "rewards/accuracies": 0.625, "rewards/chosen": -0.057475872337818146, "rewards/margins": 0.09971773624420166, "rewards/rejected": -0.1571936011314392, "sft_loss": 0.5747587084770203, "step": 267 }, { "epoch": 0.38756326825741144, "grad_norm": 3.0184638626608633, "learning_rate": 7.966015520883178e-06, "logits/chosen": 0.7042960524559021, "logits/rejected": 0.5639068484306335, "logps/chosen": -0.7901575565338135, "logps/rejected": -3.015544891357422, "loss": 0.7691, "odds_ratio_loss": 0.41159412264823914, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07901576161384583, "rewards/margins": 0.22253873944282532, "rewards/rejected": -0.30155447125434875, "sft_loss": 0.7901575565338135, "step": 268 }, { "epoch": 0.38900939985538685, "grad_norm": 4.235634529152362, "learning_rate": 7.965610322093798e-06, "logits/chosen": 0.7846585512161255, "logits/rejected": 0.711887001991272, "logps/chosen": -0.9673622250556946, "logps/rejected": -1.4234188795089722, "loss": 0.8955, "odds_ratio_loss": 0.6162651777267456, "rewards/accuracies": 0.625, "rewards/chosen": -0.09673622250556946, "rewards/margins": 0.04560566693544388, "rewards/rejected": -0.14234188199043274, "sft_loss": 0.9673622250556946, "step": 269 }, { "epoch": 0.39045553145336226, "grad_norm": 4.376119601183979, "learning_rate": 7.965202732423186e-06, "logits/chosen": 0.7393103837966919, "logits/rejected": 0.5519256591796875, "logps/chosen": -0.9471575021743774, "logps/rejected": -1.5052287578582764, "loss": 0.9612, "odds_ratio_loss": 0.5077100992202759, "rewards/accuracies": 0.8125, "rewards/chosen": -0.09471575170755386, "rewards/margins": 0.055807143449783325, "rewards/rejected": -0.1505228877067566, "sft_loss": 0.9471575021743774, "step": 270 }, { "epoch": 0.39190166305133767, "grad_norm": 3.621340032032785, "learning_rate": 7.96479275211708e-06, "logits/chosen": 0.9538942575454712, "logits/rejected": 0.7590014934539795, "logps/chosen": -0.7168223857879639, "logps/rejected": -1.5027300119400024, "loss": 0.8363, "odds_ratio_loss": 0.48245489597320557, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07168224453926086, "rewards/margins": 0.07859078049659729, "rewards/rejected": -0.15027301013469696, "sft_loss": 0.7168223857879639, "step": 271 }, { "epoch": 0.3933477946493131, "grad_norm": 3.2832296292942167, "learning_rate": 7.964380381422656e-06, "logits/chosen": 0.6526778340339661, "logits/rejected": 0.3193623423576355, "logps/chosen": -0.991958737373352, "logps/rejected": -1.9510480165481567, "loss": 0.865, "odds_ratio_loss": 0.51337730884552, "rewards/accuracies": 0.6875, "rewards/chosen": -0.09919588267803192, "rewards/margins": 0.09590893238782883, "rewards/rejected": -0.19510480761528015, "sft_loss": 0.991958737373352, "step": 272 }, { "epoch": 0.3947939262472885, "grad_norm": 2.696843063651091, "learning_rate": 7.963965620588536e-06, "logits/chosen": 0.7486572861671448, "logits/rejected": 0.5010773539543152, "logps/chosen": -0.8629076480865479, "logps/rejected": -1.6694220304489136, "loss": 0.7328, "odds_ratio_loss": 0.5291403532028198, "rewards/accuracies": 0.75, "rewards/chosen": -0.08629077672958374, "rewards/margins": 0.08065143972635269, "rewards/rejected": -0.16694220900535583, "sft_loss": 0.8629076480865479, "step": 273 }, { "epoch": 0.3962400578452639, "grad_norm": 3.1227033798437516, "learning_rate": 7.96354846986478e-06, "logits/chosen": 0.7784473299980164, "logits/rejected": 0.5128236413002014, "logps/chosen": -0.6188066005706787, "logps/rejected": -1.8390074968338013, "loss": 0.8487, "odds_ratio_loss": 0.3730120360851288, "rewards/accuracies": 0.75, "rewards/chosen": -0.06188066303730011, "rewards/margins": 0.12202008068561554, "rewards/rejected": -0.18390074372291565, "sft_loss": 0.6188066005706787, "step": 274 }, { "epoch": 0.39768618944323936, "grad_norm": 3.6086922714292418, "learning_rate": 7.963128929502889e-06, "logits/chosen": 0.8294479250907898, "logits/rejected": 0.664797842502594, "logps/chosen": -0.8582887053489685, "logps/rejected": -1.5387067794799805, "loss": 0.904, "odds_ratio_loss": 0.6132429838180542, "rewards/accuracies": 0.625, "rewards/chosen": -0.08582887053489685, "rewards/margins": 0.06804181635379791, "rewards/rejected": -0.15387068688869476, "sft_loss": 0.8582887053489685, "step": 275 }, { "epoch": 0.39913232104121477, "grad_norm": 3.5451162311098243, "learning_rate": 7.962706999755807e-06, "logits/chosen": 0.7107446193695068, "logits/rejected": 0.5381249189376831, "logps/chosen": -0.8531709909439087, "logps/rejected": -1.67955482006073, "loss": 0.9392, "odds_ratio_loss": 0.4603929817676544, "rewards/accuracies": 0.75, "rewards/chosen": -0.08531709760427475, "rewards/margins": 0.08263837546110153, "rewards/rejected": -0.16795547306537628, "sft_loss": 0.8531709909439087, "step": 276 }, { "epoch": 0.4005784526391902, "grad_norm": 2.388888186464326, "learning_rate": 7.962282680877915e-06, "logits/chosen": 0.8900540471076965, "logits/rejected": 0.5301176905632019, "logps/chosen": -0.6871480941772461, "logps/rejected": -2.295354127883911, "loss": 0.8126, "odds_ratio_loss": 0.3517339527606964, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06871479749679565, "rewards/margins": 0.1608206033706665, "rewards/rejected": -0.22953541576862335, "sft_loss": 0.6871480941772461, "step": 277 }, { "epoch": 0.4020245842371656, "grad_norm": 2.6315560832486775, "learning_rate": 7.96185597312504e-06, "logits/chosen": 0.9399300813674927, "logits/rejected": 0.6871672868728638, "logps/chosen": -0.6519705653190613, "logps/rejected": -2.3862929344177246, "loss": 0.8321, "odds_ratio_loss": 0.3415951430797577, "rewards/accuracies": 0.875, "rewards/chosen": -0.06519706547260284, "rewards/margins": 0.17343223094940186, "rewards/rejected": -0.2386292815208435, "sft_loss": 0.6519705653190613, "step": 278 }, { "epoch": 0.403470715835141, "grad_norm": 9.473931696319617, "learning_rate": 7.96142687675444e-06, "logits/chosen": 0.712382435798645, "logits/rejected": 0.525117039680481, "logps/chosen": -0.5526185035705566, "logps/rejected": -2.451003313064575, "loss": 0.8902, "odds_ratio_loss": 0.2894706130027771, "rewards/accuracies": 0.9375, "rewards/chosen": -0.055261846631765366, "rewards/margins": 0.1898384690284729, "rewards/rejected": -0.24510033428668976, "sft_loss": 0.5526185035705566, "step": 279 }, { "epoch": 0.4049168474331164, "grad_norm": 13.10604992325088, "learning_rate": 7.960995392024826e-06, "logits/chosen": 0.9458888173103333, "logits/rejected": 0.6861677169799805, "logps/chosen": -0.8119727373123169, "logps/rejected": -1.5805472135543823, "loss": 0.8809, "odds_ratio_loss": 0.576356828212738, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08119726926088333, "rewards/margins": 0.07685744762420654, "rewards/rejected": -0.15805472433567047, "sft_loss": 0.8119727373123169, "step": 280 }, { "epoch": 0.4063629790310918, "grad_norm": 2.21110470027196, "learning_rate": 7.960561519196334e-06, "logits/chosen": 0.7865252494812012, "logits/rejected": 0.5100930333137512, "logps/chosen": -0.7717297077178955, "logps/rejected": -2.293206214904785, "loss": 0.8973, "odds_ratio_loss": 0.43415284156799316, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07717297971248627, "rewards/margins": 0.15214766561985016, "rewards/rejected": -0.22932063043117523, "sft_loss": 0.7717297077178955, "step": 281 }, { "epoch": 0.4078091106290672, "grad_norm": 3.993304152364151, "learning_rate": 7.960125258530553e-06, "logits/chosen": 0.7694143056869507, "logits/rejected": 0.5560774207115173, "logps/chosen": -0.8932427167892456, "logps/rejected": -1.4928797483444214, "loss": 0.9099, "odds_ratio_loss": 0.5252251029014587, "rewards/accuracies": 0.75, "rewards/chosen": -0.08932427316904068, "rewards/margins": 0.05996370688080788, "rewards/rejected": -0.14928798377513885, "sft_loss": 0.8932427167892456, "step": 282 }, { "epoch": 0.4092552422270427, "grad_norm": 3.0189154414124775, "learning_rate": 7.959686610290504e-06, "logits/chosen": 0.5652546882629395, "logits/rejected": 0.5164136290550232, "logps/chosen": -0.7622696161270142, "logps/rejected": -1.2584534883499146, "loss": 0.8417, "odds_ratio_loss": 0.4685708284378052, "rewards/accuracies": 0.75, "rewards/chosen": -0.07622695714235306, "rewards/margins": 0.04961838945746422, "rewards/rejected": -0.12584535777568817, "sft_loss": 0.7622696161270142, "step": 283 }, { "epoch": 0.4107013738250181, "grad_norm": 2.5548285360039285, "learning_rate": 7.959245574740652e-06, "logits/chosen": 0.868388831615448, "logits/rejected": 0.6646077036857605, "logps/chosen": -0.7296291589736938, "logps/rejected": -1.3972752094268799, "loss": 0.7955, "odds_ratio_loss": 0.4898286461830139, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0729629248380661, "rewards/margins": 0.06676461547613144, "rewards/rejected": -0.13972753286361694, "sft_loss": 0.7296291589736938, "step": 284 }, { "epoch": 0.4121475054229935, "grad_norm": 4.044250113535489, "learning_rate": 7.958802152146895e-06, "logits/chosen": 0.8534368276596069, "logits/rejected": 0.5839318633079529, "logps/chosen": -0.7515661120414734, "logps/rejected": -1.5278918743133545, "loss": 0.8219, "odds_ratio_loss": 0.44184809923171997, "rewards/accuracies": 0.9375, "rewards/chosen": -0.07515661418437958, "rewards/margins": 0.07763257622718811, "rewards/rejected": -0.1527891904115677, "sft_loss": 0.7515661120414734, "step": 285 }, { "epoch": 0.4135936370209689, "grad_norm": 3.854922628972753, "learning_rate": 7.958356342776576e-06, "logits/chosen": 0.8270955085754395, "logits/rejected": 0.7576125860214233, "logps/chosen": -0.7885029315948486, "logps/rejected": -1.602339744567871, "loss": 0.8647, "odds_ratio_loss": 0.6078847050666809, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07885029166936874, "rewards/margins": 0.08138368278741837, "rewards/rejected": -0.1602339744567871, "sft_loss": 0.7885029315948486, "step": 286 }, { "epoch": 0.4150397686189443, "grad_norm": 2.97233807277169, "learning_rate": 7.957908146898477e-06, "logits/chosen": 0.8544989228248596, "logits/rejected": 0.6331669092178345, "logps/chosen": -0.9146354794502258, "logps/rejected": -1.7395083904266357, "loss": 0.9325, "odds_ratio_loss": 0.6201297044754028, "rewards/accuracies": 0.5625, "rewards/chosen": -0.09146355092525482, "rewards/margins": 0.08248727768659592, "rewards/rejected": -0.17395083606243134, "sft_loss": 0.9146354794502258, "step": 287 }, { "epoch": 0.4164859002169197, "grad_norm": 3.9764262611407064, "learning_rate": 7.957457564782816e-06, "logits/chosen": 0.7527596950531006, "logits/rejected": 0.619892418384552, "logps/chosen": -0.7813956141471863, "logps/rejected": -2.4308414459228516, "loss": 0.7837, "odds_ratio_loss": 0.547453761100769, "rewards/accuracies": 0.625, "rewards/chosen": -0.07813956588506699, "rewards/margins": 0.164944589138031, "rewards/rejected": -0.2430841624736786, "sft_loss": 0.7813956141471863, "step": 288 }, { "epoch": 0.41793203181489513, "grad_norm": 7.563020631354844, "learning_rate": 7.95700459670125e-06, "logits/chosen": 0.8600162863731384, "logits/rejected": 0.6054329872131348, "logps/chosen": -0.8736534714698792, "logps/rejected": -2.9901254177093506, "loss": 0.9142, "odds_ratio_loss": 0.5161182880401611, "rewards/accuracies": 0.75, "rewards/chosen": -0.08736535161733627, "rewards/margins": 0.211647167801857, "rewards/rejected": -0.29901254177093506, "sft_loss": 0.8736534714698792, "step": 289 }, { "epoch": 0.4193781634128706, "grad_norm": 2.4611000413126676, "learning_rate": 7.956549242926872e-06, "logits/chosen": 0.7825429439544678, "logits/rejected": 0.5044749975204468, "logps/chosen": -0.7324574589729309, "logps/rejected": -1.9676034450531006, "loss": 0.7825, "odds_ratio_loss": 0.4541637897491455, "rewards/accuracies": 0.75, "rewards/chosen": -0.07324574887752533, "rewards/margins": 0.12351461499929428, "rewards/rejected": -0.196760356426239, "sft_loss": 0.7324574589729309, "step": 290 }, { "epoch": 0.420824295010846, "grad_norm": 3.0288336601213315, "learning_rate": 7.956091503734223e-06, "logits/chosen": 0.7554039359092712, "logits/rejected": 0.5739782452583313, "logps/chosen": -0.5028554797172546, "logps/rejected": -3.680788993835449, "loss": 0.6984, "odds_ratio_loss": 0.28743448853492737, "rewards/accuracies": 0.9375, "rewards/chosen": -0.050285547971725464, "rewards/margins": 0.3177933394908905, "rewards/rejected": -0.36807888746261597, "sft_loss": 0.5028554797172546, "step": 291 }, { "epoch": 0.4222704266088214, "grad_norm": 3.5405260189111947, "learning_rate": 7.955631379399271e-06, "logits/chosen": 0.7954279780387878, "logits/rejected": 0.5444122552871704, "logps/chosen": -0.6753735542297363, "logps/rejected": -2.6563949584960938, "loss": 0.7875, "odds_ratio_loss": 0.43333977460861206, "rewards/accuracies": 0.75, "rewards/chosen": -0.06753735989332199, "rewards/margins": 0.19810214638710022, "rewards/rejected": -0.2656395137310028, "sft_loss": 0.6753735542297363, "step": 292 }, { "epoch": 0.4237165582067968, "grad_norm": 3.1754832413171004, "learning_rate": 7.955168870199428e-06, "logits/chosen": 0.7190636992454529, "logits/rejected": 0.5010616779327393, "logps/chosen": -0.7396849393844604, "logps/rejected": -2.7797493934631348, "loss": 0.9183, "odds_ratio_loss": 0.43546822667121887, "rewards/accuracies": 0.75, "rewards/chosen": -0.07396849989891052, "rewards/margins": 0.20400643348693848, "rewards/rejected": -0.277974933385849, "sft_loss": 0.7396849393844604, "step": 293 }, { "epoch": 0.42516268980477223, "grad_norm": 4.057110333121107, "learning_rate": 7.954703976413544e-06, "logits/chosen": 0.561728298664093, "logits/rejected": 0.47190630435943604, "logps/chosen": -0.9969494342803955, "logps/rejected": -1.62455153465271, "loss": 0.9558, "odds_ratio_loss": 0.6601800322532654, "rewards/accuracies": 0.5625, "rewards/chosen": -0.09969495236873627, "rewards/margins": 0.06276021897792816, "rewards/rejected": -0.16245517134666443, "sft_loss": 0.9969494342803955, "step": 294 }, { "epoch": 0.42660882140274764, "grad_norm": 2.864194448801957, "learning_rate": 7.954236698321901e-06, "logits/chosen": 0.6871510744094849, "logits/rejected": 0.465214341878891, "logps/chosen": -0.6832473278045654, "logps/rejected": -1.775686502456665, "loss": 0.7974, "odds_ratio_loss": 0.4238354563713074, "rewards/accuracies": 0.875, "rewards/chosen": -0.0683247298002243, "rewards/margins": 0.10924392938613892, "rewards/rejected": -0.17756864428520203, "sft_loss": 0.6832473278045654, "step": 295 }, { "epoch": 0.42805495300072305, "grad_norm": 4.022318915961337, "learning_rate": 7.953767036206228e-06, "logits/chosen": 0.548240602016449, "logits/rejected": 0.49791282415390015, "logps/chosen": -0.8850837349891663, "logps/rejected": -1.8481075763702393, "loss": 0.9559, "odds_ratio_loss": 0.4951096773147583, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08850837498903275, "rewards/margins": 0.09630238264799118, "rewards/rejected": -0.18481075763702393, "sft_loss": 0.8850837349891663, "step": 296 }, { "epoch": 0.42950108459869846, "grad_norm": 4.1252342060496066, "learning_rate": 7.953294990349683e-06, "logits/chosen": 0.6419621706008911, "logits/rejected": 0.636412501335144, "logps/chosen": -0.929691731929779, "logps/rejected": -2.140536069869995, "loss": 0.9064, "odds_ratio_loss": 0.6660383343696594, "rewards/accuracies": 0.625, "rewards/chosen": -0.09296917915344238, "rewards/margins": 0.12108444422483444, "rewards/rejected": -0.21405361592769623, "sft_loss": 0.929691731929779, "step": 297 }, { "epoch": 0.4309472161966739, "grad_norm": 6.395196153201281, "learning_rate": 7.952820561036864e-06, "logits/chosen": 0.6038861870765686, "logits/rejected": 0.4589577913284302, "logps/chosen": -0.7986758351325989, "logps/rejected": -2.17226505279541, "loss": 0.9359, "odds_ratio_loss": 0.5335122346878052, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07986757904291153, "rewards/margins": 0.13735893368721008, "rewards/rejected": -0.21722650527954102, "sft_loss": 0.7986758351325989, "step": 298 }, { "epoch": 0.43239334779464933, "grad_norm": 2.7876002435377765, "learning_rate": 7.952343748553806e-06, "logits/chosen": 0.6711763739585876, "logits/rejected": 0.6253769397735596, "logps/chosen": -0.820818305015564, "logps/rejected": -1.4959368705749512, "loss": 0.9121, "odds_ratio_loss": 0.5811915993690491, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08208183199167252, "rewards/margins": 0.06751186400651932, "rewards/rejected": -0.14959371089935303, "sft_loss": 0.820818305015564, "step": 299 }, { "epoch": 0.43383947939262474, "grad_norm": 2.9225860620617152, "learning_rate": 7.951864553187983e-06, "logits/chosen": 0.6169509291648865, "logits/rejected": 0.5235470533370972, "logps/chosen": -0.6802053451538086, "logps/rejected": -2.5945489406585693, "loss": 0.7323, "odds_ratio_loss": 0.3930429518222809, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0680205300450325, "rewards/margins": 0.19143438339233398, "rewards/rejected": -0.2594549059867859, "sft_loss": 0.6802053451538086, "step": 300 }, { "epoch": 0.43528561099060015, "grad_norm": 3.2161174399034347, "learning_rate": 7.951382975228301e-06, "logits/chosen": 0.7169187068939209, "logits/rejected": 0.7688310146331787, "logps/chosen": -0.8765419721603394, "logps/rejected": -1.1146844625473022, "loss": 0.9063, "odds_ratio_loss": 0.6021533012390137, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08765420317649841, "rewards/margins": 0.02381424978375435, "rewards/rejected": -0.11146844923496246, "sft_loss": 0.8765419721603394, "step": 301 }, { "epoch": 0.43673174258857556, "grad_norm": 3.0896141078168187, "learning_rate": 7.95089901496511e-06, "logits/chosen": 0.8363983631134033, "logits/rejected": 0.5107532739639282, "logps/chosen": -0.8250682353973389, "logps/rejected": -2.3571953773498535, "loss": 0.7644, "odds_ratio_loss": 0.5739777088165283, "rewards/accuracies": 0.625, "rewards/chosen": -0.08250682055950165, "rewards/margins": 0.15321271121501923, "rewards/rejected": -0.23571954667568207, "sft_loss": 0.8250682353973389, "step": 302 }, { "epoch": 0.43817787418655096, "grad_norm": 3.811865270406822, "learning_rate": 7.950412672690186e-06, "logits/chosen": 0.6645305752754211, "logits/rejected": 0.5417653322219849, "logps/chosen": -0.9253389835357666, "logps/rejected": -2.211357831954956, "loss": 0.9014, "odds_ratio_loss": 0.5837020874023438, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0925339013338089, "rewards/margins": 0.12860189378261566, "rewards/rejected": -0.22113579511642456, "sft_loss": 0.9253389835357666, "step": 303 }, { "epoch": 0.4396240057845264, "grad_norm": 2.672569700743552, "learning_rate": 7.94992394869675e-06, "logits/chosen": 0.7562464475631714, "logits/rejected": 0.536525309085846, "logps/chosen": -0.8378995656967163, "logps/rejected": -1.695633888244629, "loss": 0.8967, "odds_ratio_loss": 0.4167096018791199, "rewards/accuracies": 0.875, "rewards/chosen": -0.08378996700048447, "rewards/margins": 0.08577344566583633, "rewards/rejected": -0.1695634126663208, "sft_loss": 0.8378995656967163, "step": 304 }, { "epoch": 0.4410701373825018, "grad_norm": 3.008113742586865, "learning_rate": 7.949432843279453e-06, "logits/chosen": 0.7660605311393738, "logits/rejected": 0.599168598651886, "logps/chosen": -0.6577078700065613, "logps/rejected": -2.418144464492798, "loss": 0.8728, "odds_ratio_loss": 0.3436301648616791, "rewards/accuracies": 0.75, "rewards/chosen": -0.06577078998088837, "rewards/margins": 0.17604365944862366, "rewards/rejected": -0.24181444942951202, "sft_loss": 0.6577078700065613, "step": 305 }, { "epoch": 0.44251626898047725, "grad_norm": 2.916373711671278, "learning_rate": 7.948939356734385e-06, "logits/chosen": 0.7118316292762756, "logits/rejected": 0.5230697393417358, "logps/chosen": -0.6964635252952576, "logps/rejected": -1.8553109169006348, "loss": 0.9158, "odds_ratio_loss": 0.4708198308944702, "rewards/accuracies": 0.6875, "rewards/chosen": -0.06964635848999023, "rewards/margins": 0.11588473618030548, "rewards/rejected": -0.1855311095714569, "sft_loss": 0.6964635252952576, "step": 306 }, { "epoch": 0.44396240057845265, "grad_norm": 3.1516702895048816, "learning_rate": 7.948443489359071e-06, "logits/chosen": 0.8500151634216309, "logits/rejected": 0.686989426612854, "logps/chosen": -0.6953170895576477, "logps/rejected": -1.5989413261413574, "loss": 0.8902, "odds_ratio_loss": 0.3981609046459198, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06953170895576477, "rewards/margins": 0.09036242961883545, "rewards/rejected": -0.15989412367343903, "sft_loss": 0.6953170895576477, "step": 307 }, { "epoch": 0.44540853217642806, "grad_norm": 3.5710829304772584, "learning_rate": 7.947945241452475e-06, "logits/chosen": 0.5635668039321899, "logits/rejected": 0.46536314487457275, "logps/chosen": -0.8453367948532104, "logps/rejected": -1.4774147272109985, "loss": 0.8867, "odds_ratio_loss": 0.5234245657920837, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0845336765050888, "rewards/margins": 0.06320779025554657, "rewards/rejected": -0.14774146676063538, "sft_loss": 0.8453367948532104, "step": 308 }, { "epoch": 0.44685466377440347, "grad_norm": 3.390233696947453, "learning_rate": 7.947444613314986e-06, "logits/chosen": 0.7768136262893677, "logits/rejected": 0.660037100315094, "logps/chosen": -0.8382095098495483, "logps/rejected": -1.4200916290283203, "loss": 0.797, "odds_ratio_loss": 0.5195042490959167, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08382095396518707, "rewards/margins": 0.05818820744752884, "rewards/rejected": -0.1420091688632965, "sft_loss": 0.8382095098495483, "step": 309 }, { "epoch": 0.4483007953723789, "grad_norm": 4.509100866130907, "learning_rate": 7.94694160524844e-06, "logits/chosen": 0.5886682868003845, "logits/rejected": 0.384355753660202, "logps/chosen": -0.867601752281189, "logps/rejected": -1.5529288053512573, "loss": 0.8558, "odds_ratio_loss": 0.5688661932945251, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08676017820835114, "rewards/margins": 0.06853270530700684, "rewards/rejected": -0.15529288351535797, "sft_loss": 0.867601752281189, "step": 310 }, { "epoch": 0.4497469269703543, "grad_norm": 2.1647577543237664, "learning_rate": 7.946436217556099e-06, "logits/chosen": 0.6968039274215698, "logits/rejected": 0.5218100547790527, "logps/chosen": -0.693936288356781, "logps/rejected": -1.393923282623291, "loss": 0.7933, "odds_ratio_loss": 0.41789939999580383, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06939362734556198, "rewards/margins": 0.06999869644641876, "rewards/rejected": -0.13939233124256134, "sft_loss": 0.693936288356781, "step": 311 }, { "epoch": 0.4511930585683297, "grad_norm": 4.30465084520022, "learning_rate": 7.945928450542664e-06, "logits/chosen": 0.5718408226966858, "logits/rejected": 0.6034159064292908, "logps/chosen": -0.8760853409767151, "logps/rejected": -1.6363561153411865, "loss": 0.8284, "odds_ratio_loss": 0.40968573093414307, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08760854601860046, "rewards/margins": 0.07602708041667938, "rewards/rejected": -0.16363561153411865, "sft_loss": 0.8760853409767151, "step": 312 }, { "epoch": 0.45263919016630516, "grad_norm": 3.0022074366655866, "learning_rate": 7.94541830451427e-06, "logits/chosen": 0.5793265104293823, "logits/rejected": 0.42465144395828247, "logps/chosen": -0.5700391530990601, "logps/rejected": -2.0401134490966797, "loss": 0.8435, "odds_ratio_loss": 0.29582464694976807, "rewards/accuracies": 0.875, "rewards/chosen": -0.057003915309906006, "rewards/margins": 0.14700743556022644, "rewards/rejected": -0.20401133596897125, "sft_loss": 0.5700391530990601, "step": 313 }, { "epoch": 0.45408532176428057, "grad_norm": 2.481391841589562, "learning_rate": 7.944905779778487e-06, "logits/chosen": 0.6300640106201172, "logits/rejected": 0.5014770030975342, "logps/chosen": -0.6986908912658691, "logps/rejected": -1.7633607387542725, "loss": 0.7659, "odds_ratio_loss": 0.5082624554634094, "rewards/accuracies": 0.625, "rewards/chosen": -0.06986908614635468, "rewards/margins": 0.10646697133779526, "rewards/rejected": -0.17633606493473053, "sft_loss": 0.6986908912658691, "step": 314 }, { "epoch": 0.455531453362256, "grad_norm": 2.6200652171864083, "learning_rate": 7.944390876644317e-06, "logits/chosen": 0.5968663096427917, "logits/rejected": 0.5737828016281128, "logps/chosen": -0.7979955077171326, "logps/rejected": -1.5119889974594116, "loss": 0.8404, "odds_ratio_loss": 0.6428624391555786, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07979955524206161, "rewards/margins": 0.07139935344457626, "rewards/rejected": -0.15119890868663788, "sft_loss": 0.7979955077171326, "step": 315 }, { "epoch": 0.4569775849602314, "grad_norm": 3.2994965196511035, "learning_rate": 7.943873595422195e-06, "logits/chosen": 0.7686248421669006, "logits/rejected": 0.6836127638816833, "logps/chosen": -0.8711780309677124, "logps/rejected": -1.6719789505004883, "loss": 0.8572, "odds_ratio_loss": 0.4641047716140747, "rewards/accuracies": 0.75, "rewards/chosen": -0.08711780607700348, "rewards/margins": 0.08008009195327759, "rewards/rejected": -0.16719789803028107, "sft_loss": 0.8711780309677124, "step": 316 }, { "epoch": 0.4584237165582068, "grad_norm": 3.644622791431945, "learning_rate": 7.943353936423996e-06, "logits/chosen": 0.8206264972686768, "logits/rejected": 0.5815707445144653, "logps/chosen": -0.9250833988189697, "logps/rejected": -1.7713178396224976, "loss": 0.8729, "odds_ratio_loss": 0.5220272541046143, "rewards/accuracies": 0.625, "rewards/chosen": -0.09250834584236145, "rewards/margins": 0.08462343364953995, "rewards/rejected": -0.1771317720413208, "sft_loss": 0.9250833988189697, "step": 317 }, { "epoch": 0.4598698481561822, "grad_norm": 3.631087682077212, "learning_rate": 7.94283189996302e-06, "logits/chosen": 0.6323086619377136, "logits/rejected": 0.5804564952850342, "logps/chosen": -0.835005521774292, "logps/rejected": -1.089737892150879, "loss": 0.7905, "odds_ratio_loss": 0.609061062335968, "rewards/accuracies": 0.625, "rewards/chosen": -0.08350054919719696, "rewards/margins": 0.025473233312368393, "rewards/rejected": -0.10897378623485565, "sft_loss": 0.835005521774292, "step": 318 }, { "epoch": 0.4613159797541576, "grad_norm": 3.383821151178241, "learning_rate": 7.942307486354009e-06, "logits/chosen": 0.5765923857688904, "logits/rejected": 0.4810538589954376, "logps/chosen": -0.7016163468360901, "logps/rejected": -1.5007145404815674, "loss": 0.8345, "odds_ratio_loss": 0.41251441836357117, "rewards/accuracies": 0.875, "rewards/chosen": -0.07016163319349289, "rewards/margins": 0.07990982383489609, "rewards/rejected": -0.15007147192955017, "sft_loss": 0.7016163468360901, "step": 319 }, { "epoch": 0.462762111352133, "grad_norm": 6.031998545816025, "learning_rate": 7.94178069591313e-06, "logits/chosen": 0.7171785235404968, "logits/rejected": 0.6892584562301636, "logps/chosen": -0.8179836273193359, "logps/rejected": -1.0612622499465942, "loss": 0.8643, "odds_ratio_loss": 0.5907955169677734, "rewards/accuracies": 0.875, "rewards/chosen": -0.08179835230112076, "rewards/margins": 0.024327874183654785, "rewards/rejected": -0.10612623393535614, "sft_loss": 0.8179836273193359, "step": 320 }, { "epoch": 0.4642082429501085, "grad_norm": 3.4977656147220975, "learning_rate": 7.94125152895799e-06, "logits/chosen": 0.6936742067337036, "logits/rejected": 0.5295305848121643, "logps/chosen": -0.7622445821762085, "logps/rejected": -1.9071930646896362, "loss": 0.7984, "odds_ratio_loss": 0.4924374222755432, "rewards/accuracies": 0.75, "rewards/chosen": -0.07622446119785309, "rewards/margins": 0.11449483782052994, "rewards/rejected": -0.19071930646896362, "sft_loss": 0.7622445821762085, "step": 321 }, { "epoch": 0.4656543745480839, "grad_norm": 2.5042671601600475, "learning_rate": 7.940719985807624e-06, "logits/chosen": 0.7296179533004761, "logits/rejected": 0.49951764941215515, "logps/chosen": -0.6940515041351318, "logps/rejected": -1.7299174070358276, "loss": 0.7871, "odds_ratio_loss": 0.47463756799697876, "rewards/accuracies": 0.6875, "rewards/chosen": -0.06940515339374542, "rewards/margins": 0.1035865843296051, "rewards/rejected": -0.17299173772335052, "sft_loss": 0.6940515041351318, "step": 322 }, { "epoch": 0.4671005061460593, "grad_norm": 3.292946892095406, "learning_rate": 7.9401860667825e-06, "logits/chosen": 0.7062532305717468, "logits/rejected": 0.4862893223762512, "logps/chosen": -0.6891757249832153, "logps/rejected": -1.7076308727264404, "loss": 0.7987, "odds_ratio_loss": 0.35063761472702026, "rewards/accuracies": 0.875, "rewards/chosen": -0.06891757249832153, "rewards/margins": 0.10184551775455475, "rewards/rejected": -0.17076310515403748, "sft_loss": 0.6891757249832153, "step": 323 }, { "epoch": 0.4685466377440347, "grad_norm": 3.6446604374799274, "learning_rate": 7.939649772204524e-06, "logits/chosen": 0.6098726987838745, "logits/rejected": 0.44188717007637024, "logps/chosen": -0.9559364318847656, "logps/rejected": -1.3839476108551025, "loss": 0.8978, "odds_ratio_loss": 0.5117559432983398, "rewards/accuracies": 0.75, "rewards/chosen": -0.0955936461687088, "rewards/margins": 0.04280112311244011, "rewards/rejected": -0.1383947730064392, "sft_loss": 0.9559364318847656, "step": 324 }, { "epoch": 0.4699927693420101, "grad_norm": 3.584034339628172, "learning_rate": 7.939111102397025e-06, "logits/chosen": 0.6634198427200317, "logits/rejected": 0.5811032652854919, "logps/chosen": -0.6490356922149658, "logps/rejected": -1.5838490724563599, "loss": 0.8837, "odds_ratio_loss": 0.3766994774341583, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06490357220172882, "rewards/margins": 0.09348133206367493, "rewards/rejected": -0.15838490426540375, "sft_loss": 0.6490356922149658, "step": 325 }, { "epoch": 0.47143890093998553, "grad_norm": 3.5189379597603314, "learning_rate": 7.938570057684775e-06, "logits/chosen": 0.5340790748596191, "logits/rejected": 0.4121875464916229, "logps/chosen": -0.8325109481811523, "logps/rejected": -1.5845043659210205, "loss": 0.8627, "odds_ratio_loss": 0.49629390239715576, "rewards/accuracies": 0.75, "rewards/chosen": -0.08325108885765076, "rewards/margins": 0.07519934326410294, "rewards/rejected": -0.1584504395723343, "sft_loss": 0.8325109481811523, "step": 326 }, { "epoch": 0.47288503253796094, "grad_norm": 3.5375364033379837, "learning_rate": 7.938026638393967e-06, "logits/chosen": 0.75108802318573, "logits/rejected": 0.6010204553604126, "logps/chosen": -0.7769035696983337, "logps/rejected": -1.7428910732269287, "loss": 0.8074, "odds_ratio_loss": 0.4815066456794739, "rewards/accuracies": 0.75, "rewards/chosen": -0.07769035547971725, "rewards/margins": 0.09659874439239502, "rewards/rejected": -0.17428909242153168, "sft_loss": 0.7769035696983337, "step": 327 }, { "epoch": 0.47433116413593635, "grad_norm": 4.074796522105809, "learning_rate": 7.93748084485223e-06, "logits/chosen": 0.6220093965530396, "logits/rejected": 0.4709737300872803, "logps/chosen": -0.6528467535972595, "logps/rejected": -1.3154959678649902, "loss": 0.8344, "odds_ratio_loss": 0.4504534602165222, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06528467684984207, "rewards/margins": 0.06626491993665695, "rewards/rejected": -0.13154959678649902, "sft_loss": 0.6528467535972595, "step": 328 }, { "epoch": 0.4757772957339118, "grad_norm": 2.9413111640887206, "learning_rate": 7.936932677388629e-06, "logits/chosen": 0.48603928089141846, "logits/rejected": 0.4315299391746521, "logps/chosen": -0.9008134603500366, "logps/rejected": -1.2313823699951172, "loss": 0.8851, "odds_ratio_loss": 0.5827435255050659, "rewards/accuracies": 0.625, "rewards/chosen": -0.0900813490152359, "rewards/margins": 0.033056896179914474, "rewards/rejected": -0.12313823401927948, "sft_loss": 0.9008134603500366, "step": 329 }, { "epoch": 0.4772234273318872, "grad_norm": 4.2494599421762045, "learning_rate": 7.936382136333653e-06, "logits/chosen": 0.7024039030075073, "logits/rejected": 0.5576363801956177, "logps/chosen": -0.9647277593612671, "logps/rejected": -2.111638069152832, "loss": 0.9772, "odds_ratio_loss": 0.5717940330505371, "rewards/accuracies": 0.6875, "rewards/chosen": -0.09647278487682343, "rewards/margins": 0.11469104140996933, "rewards/rejected": -0.21116381883621216, "sft_loss": 0.9647277593612671, "step": 330 }, { "epoch": 0.4786695589298626, "grad_norm": 4.782077019617622, "learning_rate": 7.935829222019228e-06, "logits/chosen": 0.6256594657897949, "logits/rejected": 0.5600475072860718, "logps/chosen": -0.9119490385055542, "logps/rejected": -1.3465038537979126, "loss": 0.8941, "odds_ratio_loss": 0.6236871480941772, "rewards/accuracies": 0.625, "rewards/chosen": -0.09119491279125214, "rewards/margins": 0.04345548897981644, "rewards/rejected": -0.13465039432048798, "sft_loss": 0.9119490385055542, "step": 331 }, { "epoch": 0.48011569052783803, "grad_norm": 3.528445359856052, "learning_rate": 7.935273934778704e-06, "logits/chosen": 0.5765432715415955, "logits/rejected": 0.43297064304351807, "logps/chosen": -0.7337332963943481, "logps/rejected": -1.637449860572815, "loss": 0.7898, "odds_ratio_loss": 0.3902113437652588, "rewards/accuracies": 0.875, "rewards/chosen": -0.07337333261966705, "rewards/margins": 0.09037166088819504, "rewards/rejected": -0.1637450009584427, "sft_loss": 0.7337332963943481, "step": 332 }, { "epoch": 0.48156182212581344, "grad_norm": 2.9637343259088826, "learning_rate": 7.93471627494687e-06, "logits/chosen": 0.6577078700065613, "logits/rejected": 0.5420234203338623, "logps/chosen": -0.9077745676040649, "logps/rejected": -0.8908224105834961, "loss": 0.9123, "odds_ratio_loss": 0.7640987634658813, "rewards/accuracies": 0.4375, "rewards/chosen": -0.09077746421098709, "rewards/margins": -0.001695222221314907, "rewards/rejected": -0.08908224105834961, "sft_loss": 0.9077745676040649, "step": 333 }, { "epoch": 0.48300795372378885, "grad_norm": 3.087555470787611, "learning_rate": 7.934156242859939e-06, "logits/chosen": 0.5774292945861816, "logits/rejected": 0.5972946882247925, "logps/chosen": -0.7331134676933289, "logps/rejected": -2.0628983974456787, "loss": 0.8601, "odds_ratio_loss": 0.30700796842575073, "rewards/accuracies": 0.875, "rewards/chosen": -0.07331134378910065, "rewards/margins": 0.13297849893569946, "rewards/rejected": -0.2062898576259613, "sft_loss": 0.7331134676933289, "step": 334 }, { "epoch": 0.48445408532176426, "grad_norm": 3.454287106669069, "learning_rate": 7.933593838855558e-06, "logits/chosen": 0.45803093910217285, "logits/rejected": 0.3993680775165558, "logps/chosen": -1.1503312587738037, "logps/rejected": -1.1803433895111084, "loss": 1.0082, "odds_ratio_loss": 0.7515895962715149, "rewards/accuracies": 0.625, "rewards/chosen": -0.11503313481807709, "rewards/margins": 0.0030012130737304688, "rewards/rejected": -0.11803434044122696, "sft_loss": 1.1503312587738037, "step": 335 }, { "epoch": 0.48590021691973967, "grad_norm": 4.234184234884144, "learning_rate": 7.9330290632728e-06, "logits/chosen": 0.4678802490234375, "logits/rejected": 0.39046192169189453, "logps/chosen": -0.9035397171974182, "logps/rejected": -1.1667201519012451, "loss": 0.9164, "odds_ratio_loss": 0.623311460018158, "rewards/accuracies": 0.6875, "rewards/chosen": -0.09035398066043854, "rewards/margins": 0.02631804160773754, "rewards/rejected": -0.11667200922966003, "sft_loss": 0.9035397171974182, "step": 336 }, { "epoch": 0.48734634851771513, "grad_norm": 2.868596882817971, "learning_rate": 7.93246191645217e-06, "logits/chosen": 0.5796488523483276, "logits/rejected": 0.5661954283714294, "logps/chosen": -0.7045271396636963, "logps/rejected": -1.4447965621948242, "loss": 0.8614, "odds_ratio_loss": 0.4747796356678009, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0704527199268341, "rewards/margins": 0.07402694970369339, "rewards/rejected": -0.1444796621799469, "sft_loss": 0.7045271396636963, "step": 337 }, { "epoch": 0.48879248011569054, "grad_norm": 4.342722573935673, "learning_rate": 7.931892398735607e-06, "logits/chosen": 0.600724458694458, "logits/rejected": 0.5414420366287231, "logps/chosen": -0.7105461955070496, "logps/rejected": -1.3550055027008057, "loss": 0.8715, "odds_ratio_loss": 0.4188329875469208, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0710546150803566, "rewards/margins": 0.06444593518972397, "rewards/rejected": -0.13550056517124176, "sft_loss": 0.7105461955070496, "step": 338 }, { "epoch": 0.49023861171366595, "grad_norm": 4.582201948309204, "learning_rate": 7.931320510466472e-06, "logits/chosen": 0.5396424531936646, "logits/rejected": 0.3977088928222656, "logps/chosen": -0.9911842346191406, "logps/rejected": -1.341888427734375, "loss": 0.9325, "odds_ratio_loss": 0.741445004940033, "rewards/accuracies": 0.375, "rewards/chosen": -0.0991184264421463, "rewards/margins": 0.03507041931152344, "rewards/rejected": -0.13418884575366974, "sft_loss": 0.9911842346191406, "step": 339 }, { "epoch": 0.49168474331164136, "grad_norm": 2.782969535100825, "learning_rate": 7.930746251989558e-06, "logits/chosen": 0.5857880711555481, "logits/rejected": 0.46557682752609253, "logps/chosen": -0.8816125392913818, "logps/rejected": -1.6498929262161255, "loss": 0.8827, "odds_ratio_loss": 0.5967287421226501, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08816125243902206, "rewards/margins": 0.07682802528142929, "rewards/rejected": -0.16498929262161255, "sft_loss": 0.8816125392913818, "step": 340 }, { "epoch": 0.49313087490961677, "grad_norm": 3.50913863564006, "learning_rate": 7.930169623651092e-06, "logits/chosen": 0.7082901000976562, "logits/rejected": 0.5184527039527893, "logps/chosen": -0.6331555247306824, "logps/rejected": -2.131300449371338, "loss": 0.8071, "odds_ratio_loss": 0.5050643086433411, "rewards/accuracies": 0.625, "rewards/chosen": -0.06331555545330048, "rewards/margins": 0.14981448650360107, "rewards/rejected": -0.21313002705574036, "sft_loss": 0.6331555247306824, "step": 341 }, { "epoch": 0.4945770065075922, "grad_norm": 4.334338505420553, "learning_rate": 7.92959062579872e-06, "logits/chosen": 0.667914628982544, "logits/rejected": 0.5001978874206543, "logps/chosen": -0.7079198360443115, "logps/rejected": -1.6759718656539917, "loss": 0.8597, "odds_ratio_loss": 0.46850132942199707, "rewards/accuracies": 0.625, "rewards/chosen": -0.07079198956489563, "rewards/margins": 0.09680519998073578, "rewards/rejected": -0.1675971895456314, "sft_loss": 0.7079198360443115, "step": 342 }, { "epoch": 0.4960231381055676, "grad_norm": 10.026558896952501, "learning_rate": 7.929009258781526e-06, "logits/chosen": 0.5944724082946777, "logits/rejected": 0.4023160934448242, "logps/chosen": -0.758367657661438, "logps/rejected": -2.4440927505493164, "loss": 0.864, "odds_ratio_loss": 0.45333942770957947, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07583675533533096, "rewards/margins": 0.16857251524925232, "rewards/rejected": -0.24440926313400269, "sft_loss": 0.758367657661438, "step": 343 }, { "epoch": 0.49746926970354305, "grad_norm": 2.533667984333284, "learning_rate": 7.928425522950015e-06, "logits/chosen": 0.6907603144645691, "logits/rejected": 0.5649253129959106, "logps/chosen": -0.7110158205032349, "logps/rejected": -1.8234652280807495, "loss": 0.7522, "odds_ratio_loss": 0.40478286147117615, "rewards/accuracies": 0.75, "rewards/chosen": -0.0711015835404396, "rewards/margins": 0.11124493926763535, "rewards/rejected": -0.18234652280807495, "sft_loss": 0.7110158205032349, "step": 344 }, { "epoch": 0.49891540130151846, "grad_norm": 3.575246323886135, "learning_rate": 7.927839418656126e-06, "logits/chosen": 0.7409899830818176, "logits/rejected": 0.6180683970451355, "logps/chosen": -0.6791451573371887, "logps/rejected": -1.5107991695404053, "loss": 0.8689, "odds_ratio_loss": 0.472514271736145, "rewards/accuracies": 0.6875, "rewards/chosen": -0.06791451573371887, "rewards/margins": 0.08316539227962494, "rewards/rejected": -0.1510799080133438, "sft_loss": 0.6791451573371887, "step": 345 }, { "epoch": 0.5003615328994938, "grad_norm": 5.550086385061495, "learning_rate": 7.927250946253224e-06, "logits/chosen": 0.7460111379623413, "logits/rejected": 0.6480680704116821, "logps/chosen": -0.8033276796340942, "logps/rejected": -1.2967023849487305, "loss": 0.8245, "odds_ratio_loss": 0.5949506163597107, "rewards/accuracies": 0.5, "rewards/chosen": -0.08033277094364166, "rewards/margins": 0.0493374727666378, "rewards/rejected": -0.12967024743556976, "sft_loss": 0.8033276796340942, "step": 346 }, { "epoch": 0.5018076644974693, "grad_norm": 2.8018962595308285, "learning_rate": 7.926660106096098e-06, "logits/chosen": 0.6507607698440552, "logits/rejected": 0.5750927925109863, "logps/chosen": -0.6767846345901489, "logps/rejected": -1.9638330936431885, "loss": 0.8087, "odds_ratio_loss": 0.6127895712852478, "rewards/accuracies": 0.5625, "rewards/chosen": -0.06767846643924713, "rewards/margins": 0.12870484590530396, "rewards/rejected": -0.19638332724571228, "sft_loss": 0.6767846345901489, "step": 347 }, { "epoch": 0.5032537960954447, "grad_norm": 5.415107124571396, "learning_rate": 7.92606689854097e-06, "logits/chosen": 0.5848195552825928, "logits/rejected": 0.52767014503479, "logps/chosen": -0.9780311584472656, "logps/rejected": -1.3824843168258667, "loss": 0.886, "odds_ratio_loss": 0.5286108255386353, "rewards/accuracies": 0.8125, "rewards/chosen": -0.09780311584472656, "rewards/margins": 0.040445320308208466, "rewards/rejected": -0.13824842870235443, "sft_loss": 0.9780311584472656, "step": 348 }, { "epoch": 0.5046999276934201, "grad_norm": 3.154755409577211, "learning_rate": 7.925471323945487e-06, "logits/chosen": 0.6179808378219604, "logits/rejected": 0.5163712501525879, "logps/chosen": -0.9327211976051331, "logps/rejected": -1.4365931749343872, "loss": 0.9042, "odds_ratio_loss": 0.6557815074920654, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0932721197605133, "rewards/margins": 0.050387196242809296, "rewards/rejected": -0.1436593234539032, "sft_loss": 0.9327211976051331, "step": 349 }, { "epoch": 0.5061460592913956, "grad_norm": 2.6911740599565173, "learning_rate": 7.924873382668724e-06, "logits/chosen": 0.6831887364387512, "logits/rejected": 0.7219184041023254, "logps/chosen": -0.7544329166412354, "logps/rejected": -1.539219617843628, "loss": 0.859, "odds_ratio_loss": 0.5167781710624695, "rewards/accuracies": 0.625, "rewards/chosen": -0.07544328272342682, "rewards/margins": 0.07847868651151657, "rewards/rejected": -0.153921976685524, "sft_loss": 0.7544329166412354, "step": 350 }, { "epoch": 0.5075921908893709, "grad_norm": 3.156218942574542, "learning_rate": 7.924273075071177e-06, "logits/chosen": 0.6295790672302246, "logits/rejected": 0.4227325916290283, "logps/chosen": -0.878291666507721, "logps/rejected": -1.71316659450531, "loss": 0.8423, "odds_ratio_loss": 0.48217105865478516, "rewards/accuracies": 0.75, "rewards/chosen": -0.08782917261123657, "rewards/margins": 0.08348748087882996, "rewards/rejected": -0.17131665349006653, "sft_loss": 0.878291666507721, "step": 351 }, { "epoch": 0.5090383224873464, "grad_norm": 4.709022398652423, "learning_rate": 7.92367040151478e-06, "logits/chosen": 0.5759553909301758, "logits/rejected": 0.4466925859451294, "logps/chosen": -0.7179098129272461, "logps/rejected": -1.511012315750122, "loss": 1.0013, "odds_ratio_loss": 0.5645134449005127, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07179098576307297, "rewards/margins": 0.07931024581193924, "rewards/rejected": -0.1511012315750122, "sft_loss": 0.7179098129272461, "step": 352 }, { "epoch": 0.5104844540853217, "grad_norm": 4.067706710450371, "learning_rate": 7.923065362362885e-06, "logits/chosen": 0.4464551508426666, "logits/rejected": 0.48552900552749634, "logps/chosen": -0.8215190172195435, "logps/rejected": -2.112861156463623, "loss": 0.86, "odds_ratio_loss": 0.49512815475463867, "rewards/accuracies": 0.875, "rewards/chosen": -0.08215189725160599, "rewards/margins": 0.12913422286510468, "rewards/rejected": -0.21128612756729126, "sft_loss": 0.8215190172195435, "step": 353 }, { "epoch": 0.5119305856832972, "grad_norm": 3.7090631226933852, "learning_rate": 7.922457957980272e-06, "logits/chosen": 0.5708781480789185, "logits/rejected": 0.6100287437438965, "logps/chosen": -0.8479795455932617, "logps/rejected": -1.83680260181427, "loss": 0.8505, "odds_ratio_loss": 0.4224860668182373, "rewards/accuracies": 0.75, "rewards/chosen": -0.08479795604944229, "rewards/margins": 0.09888231754302979, "rewards/rejected": -0.18368026614189148, "sft_loss": 0.8479795455932617, "step": 354 }, { "epoch": 0.5133767172812725, "grad_norm": 2.474186214253982, "learning_rate": 7.921848188733146e-06, "logits/chosen": 0.6852070093154907, "logits/rejected": 0.43174925446510315, "logps/chosen": -0.6690998077392578, "logps/rejected": -2.255444288253784, "loss": 0.8793, "odds_ratio_loss": 0.4386477470397949, "rewards/accuracies": 0.5625, "rewards/chosen": -0.06690998375415802, "rewards/margins": 0.15863445401191711, "rewards/rejected": -0.22554443776607513, "sft_loss": 0.6690998077392578, "step": 355 }, { "epoch": 0.514822848879248, "grad_norm": 2.797892389259021, "learning_rate": 7.921236054989142e-06, "logits/chosen": 0.6899568438529968, "logits/rejected": 0.41177740693092346, "logps/chosen": -0.713912844657898, "logps/rejected": -2.1068050861358643, "loss": 0.8, "odds_ratio_loss": 0.390377402305603, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07139129191637039, "rewards/margins": 0.1392892301082611, "rewards/rejected": -0.2106805145740509, "sft_loss": 0.713912844657898, "step": 356 }, { "epoch": 0.5162689804772235, "grad_norm": 2.9882574854982016, "learning_rate": 7.920621557117316e-06, "logits/chosen": 0.6059397459030151, "logits/rejected": 0.4042537212371826, "logps/chosen": -0.8969763517379761, "logps/rejected": -1.3820322751998901, "loss": 0.8743, "odds_ratio_loss": 0.5645525455474854, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08969763666391373, "rewards/margins": 0.04850558564066887, "rewards/rejected": -0.1382032185792923, "sft_loss": 0.8969763517379761, "step": 357 }, { "epoch": 0.5177151120751988, "grad_norm": 2.9687671798718585, "learning_rate": 7.92000469548815e-06, "logits/chosen": 0.6877776384353638, "logits/rejected": 0.6707451343536377, "logps/chosen": -0.6282877326011658, "logps/rejected": -1.3870527744293213, "loss": 0.8462, "odds_ratio_loss": 0.42180609703063965, "rewards/accuracies": 0.75, "rewards/chosen": -0.06282877922058105, "rewards/margins": 0.07587650418281555, "rewards/rejected": -0.1387052834033966, "sft_loss": 0.6282877326011658, "step": 358 }, { "epoch": 0.5191612436731743, "grad_norm": 3.1784523709075825, "learning_rate": 7.919385470473554e-06, "logits/chosen": 0.6407051682472229, "logits/rejected": 0.5854879021644592, "logps/chosen": -0.7749360799789429, "logps/rejected": -1.426418662071228, "loss": 0.93, "odds_ratio_loss": 0.5467308163642883, "rewards/accuracies": 0.625, "rewards/chosen": -0.07749360799789429, "rewards/margins": 0.0651482492685318, "rewards/rejected": -0.1426418572664261, "sft_loss": 0.7749360799789429, "step": 359 }, { "epoch": 0.5206073752711496, "grad_norm": 2.4008458367496455, "learning_rate": 7.918763882446861e-06, "logits/chosen": 0.5963618755340576, "logits/rejected": 0.5182086229324341, "logps/chosen": -0.8708814382553101, "logps/rejected": -1.3293944597244263, "loss": 0.8985, "odds_ratio_loss": 0.623752236366272, "rewards/accuracies": 0.5, "rewards/chosen": -0.08708814531564713, "rewards/margins": 0.04585129767656326, "rewards/rejected": -0.13293945789337158, "sft_loss": 0.8708814382553101, "step": 360 }, { "epoch": 0.5220535068691251, "grad_norm": 4.443409229829391, "learning_rate": 7.918139931782827e-06, "logits/chosen": 0.5416839718818665, "logits/rejected": 0.5389788746833801, "logps/chosen": -0.9856581091880798, "logps/rejected": -1.6269183158874512, "loss": 1.0153, "odds_ratio_loss": 0.591167688369751, "rewards/accuracies": 0.5, "rewards/chosen": -0.09856581687927246, "rewards/margins": 0.06412601470947266, "rewards/rejected": -0.16269183158874512, "sft_loss": 0.9856581091880798, "step": 361 }, { "epoch": 0.5234996384671005, "grad_norm": 2.9817466376184916, "learning_rate": 7.917513618857637e-06, "logits/chosen": 0.6022881269454956, "logits/rejected": 0.4377667009830475, "logps/chosen": -0.9305622577667236, "logps/rejected": -1.7188301086425781, "loss": 0.8499, "odds_ratio_loss": 0.5162808895111084, "rewards/accuracies": 0.625, "rewards/chosen": -0.09305623173713684, "rewards/margins": 0.07882677763700485, "rewards/rejected": -0.1718830019235611, "sft_loss": 0.9305622577667236, "step": 362 }, { "epoch": 0.5249457700650759, "grad_norm": 2.9696420778180666, "learning_rate": 7.916884944048896e-06, "logits/chosen": 0.7431566119194031, "logits/rejected": 0.534424901008606, "logps/chosen": -0.8032225370407104, "logps/rejected": -1.4060473442077637, "loss": 0.8416, "odds_ratio_loss": 0.554038405418396, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0803222581744194, "rewards/margins": 0.060282476246356964, "rewards/rejected": -0.14060473442077637, "sft_loss": 0.8032225370407104, "step": 363 }, { "epoch": 0.5263919016630514, "grad_norm": 3.572425590012433, "learning_rate": 7.916253907735632e-06, "logits/chosen": 0.4249388575553894, "logits/rejected": 0.3504279851913452, "logps/chosen": -0.7644526958465576, "logps/rejected": -0.9701670408248901, "loss": 0.8393, "odds_ratio_loss": 0.6225709915161133, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07644525915384293, "rewards/margins": 0.02057143673300743, "rewards/rejected": -0.09701670706272125, "sft_loss": 0.7644526958465576, "step": 364 }, { "epoch": 0.5278380332610267, "grad_norm": 3.258313489123501, "learning_rate": 7.915620510298303e-06, "logits/chosen": 0.4414077401161194, "logits/rejected": 0.37516123056411743, "logps/chosen": -0.7575925588607788, "logps/rejected": -2.3582098484039307, "loss": 0.8239, "odds_ratio_loss": 0.36041927337646484, "rewards/accuracies": 0.75, "rewards/chosen": -0.07575926184654236, "rewards/margins": 0.16006171703338623, "rewards/rejected": -0.2358209788799286, "sft_loss": 0.7575925588607788, "step": 365 }, { "epoch": 0.5292841648590022, "grad_norm": 4.082631394266917, "learning_rate": 7.914984752118785e-06, "logits/chosen": 0.6929055452346802, "logits/rejected": 0.5584222078323364, "logps/chosen": -0.7469485402107239, "logps/rejected": -2.453676223754883, "loss": 0.8559, "odds_ratio_loss": 0.47400549054145813, "rewards/accuracies": 0.75, "rewards/chosen": -0.07469485700130463, "rewards/margins": 0.17067277431488037, "rewards/rejected": -0.2453676164150238, "sft_loss": 0.7469485402107239, "step": 366 }, { "epoch": 0.5307302964569776, "grad_norm": 2.7439308886460094, "learning_rate": 7.91434663358038e-06, "logits/chosen": 0.4863385260105133, "logits/rejected": 0.46878552436828613, "logps/chosen": -0.6547051668167114, "logps/rejected": -1.097205400466919, "loss": 0.8037, "odds_ratio_loss": 0.4927642345428467, "rewards/accuracies": 0.75, "rewards/chosen": -0.06547051668167114, "rewards/margins": 0.04425002634525299, "rewards/rejected": -0.10972055047750473, "sft_loss": 0.6547051668167114, "step": 367 }, { "epoch": 0.532176428054953, "grad_norm": 3.142695590792442, "learning_rate": 7.913706155067809e-06, "logits/chosen": 0.6892445683479309, "logits/rejected": 0.6153708100318909, "logps/chosen": -0.7438619136810303, "logps/rejected": -1.5297504663467407, "loss": 0.8149, "odds_ratio_loss": 0.44938647747039795, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07438620179891586, "rewards/margins": 0.07858884334564209, "rewards/rejected": -0.15297505259513855, "sft_loss": 0.7438619136810303, "step": 368 }, { "epoch": 0.5336225596529284, "grad_norm": 2.969554883142122, "learning_rate": 7.913063316967221e-06, "logits/chosen": 0.6924266815185547, "logits/rejected": 0.5393475294113159, "logps/chosen": -0.657360315322876, "logps/rejected": -1.4955742359161377, "loss": 0.8055, "odds_ratio_loss": 0.4052860140800476, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06573603302240372, "rewards/margins": 0.08382140100002289, "rewards/rejected": -0.149557426571846, "sft_loss": 0.657360315322876, "step": 369 }, { "epoch": 0.5350686912509038, "grad_norm": 7.132638265040183, "learning_rate": 7.912418119666187e-06, "logits/chosen": 0.44325125217437744, "logits/rejected": 0.4056926965713501, "logps/chosen": -0.9184268712997437, "logps/rejected": -1.4103426933288574, "loss": 0.9277, "odds_ratio_loss": 0.6221168041229248, "rewards/accuracies": 0.5, "rewards/chosen": -0.09184268862009048, "rewards/margins": 0.049191586673259735, "rewards/rejected": -0.14103427529335022, "sft_loss": 0.9184268712997437, "step": 370 }, { "epoch": 0.5365148228488793, "grad_norm": 2.845008287197183, "learning_rate": 7.911770563553694e-06, "logits/chosen": 0.6006841659545898, "logits/rejected": 0.5208953619003296, "logps/chosen": -0.6521604061126709, "logps/rejected": -1.7045867443084717, "loss": 0.9057, "odds_ratio_loss": 0.5297222137451172, "rewards/accuracies": 0.75, "rewards/chosen": -0.06521604210138321, "rewards/margins": 0.10524262487888336, "rewards/rejected": -0.17045867443084717, "sft_loss": 0.6521604061126709, "step": 371 }, { "epoch": 0.5379609544468547, "grad_norm": 9.071354400551943, "learning_rate": 7.911120649020162e-06, "logits/chosen": 0.4639233350753784, "logits/rejected": 0.262873113155365, "logps/chosen": -0.9230935573577881, "logps/rejected": -1.9133673906326294, "loss": 0.8553, "odds_ratio_loss": 0.4418998062610626, "rewards/accuracies": 0.8125, "rewards/chosen": -0.09230935573577881, "rewards/margins": 0.09902738034725189, "rewards/rejected": -0.1913367360830307, "sft_loss": 0.9230935573577881, "step": 372 }, { "epoch": 0.5394070860448301, "grad_norm": 2.9865341827725143, "learning_rate": 7.910468376457424e-06, "logits/chosen": 0.7367454767227173, "logits/rejected": 0.5884082317352295, "logps/chosen": -0.6778854131698608, "logps/rejected": -1.1167229413986206, "loss": 0.7755, "odds_ratio_loss": 0.4953039288520813, "rewards/accuracies": 0.75, "rewards/chosen": -0.06778854131698608, "rewards/margins": 0.043883755803108215, "rewards/rejected": -0.1116722971200943, "sft_loss": 0.6778854131698608, "step": 373 }, { "epoch": 0.5408532176428055, "grad_norm": 2.718373269169299, "learning_rate": 7.909813746258738e-06, "logits/chosen": 0.699302613735199, "logits/rejected": 0.5751674175262451, "logps/chosen": -0.8385717868804932, "logps/rejected": -2.344866991043091, "loss": 0.8182, "odds_ratio_loss": 0.4515349268913269, "rewards/accuracies": 0.75, "rewards/chosen": -0.08385718613862991, "rewards/margins": 0.15062952041625977, "rewards/rejected": -0.23448669910430908, "sft_loss": 0.8385717868804932, "step": 374 }, { "epoch": 0.5422993492407809, "grad_norm": 2.7568273854501055, "learning_rate": 7.909156758818782e-06, "logits/chosen": 0.6291428208351135, "logits/rejected": 0.5532090067863464, "logps/chosen": -0.9137899875640869, "logps/rejected": -1.4941704273223877, "loss": 0.7956, "odds_ratio_loss": 0.6681514978408813, "rewards/accuracies": 0.625, "rewards/chosen": -0.09137900918722153, "rewards/margins": 0.058038052171468735, "rewards/rejected": -0.14941705763339996, "sft_loss": 0.9137899875640869, "step": 375 }, { "epoch": 0.5437454808387563, "grad_norm": 2.910942491569993, "learning_rate": 7.908497414533658e-06, "logits/chosen": 0.6253929138183594, "logits/rejected": 0.4845582842826843, "logps/chosen": -0.6655830144882202, "logps/rejected": -1.6559513807296753, "loss": 0.8689, "odds_ratio_loss": 0.5183405876159668, "rewards/accuracies": 0.6875, "rewards/chosen": -0.06655830144882202, "rewards/margins": 0.09903683513402939, "rewards/rejected": -0.1655951291322708, "sft_loss": 0.6655830144882202, "step": 376 }, { "epoch": 0.5451916124367318, "grad_norm": 2.9429362391795815, "learning_rate": 7.907835713800883e-06, "logits/chosen": 0.6014878153800964, "logits/rejected": 0.5209006071090698, "logps/chosen": -0.7738833427429199, "logps/rejected": -1.4064111709594727, "loss": 0.7655, "odds_ratio_loss": 0.5940650701522827, "rewards/accuracies": 0.5, "rewards/chosen": -0.07738833129405975, "rewards/margins": 0.06325278431177139, "rewards/rejected": -0.14064112305641174, "sft_loss": 0.7738833427429199, "step": 377 }, { "epoch": 0.5466377440347071, "grad_norm": 2.559432919663948, "learning_rate": 7.907171657019403e-06, "logits/chosen": 0.5809941291809082, "logits/rejected": 0.6483409404754639, "logps/chosen": -1.0075054168701172, "logps/rejected": -1.088860273361206, "loss": 0.8069, "odds_ratio_loss": 0.6888061761856079, "rewards/accuracies": 0.5625, "rewards/chosen": -0.10075053572654724, "rewards/margins": 0.008135493844747543, "rewards/rejected": -0.10888603329658508, "sft_loss": 1.0075054168701172, "step": 378 }, { "epoch": 0.5480838756326826, "grad_norm": 3.50222797522594, "learning_rate": 7.906505244589581e-06, "logits/chosen": 0.5703312754631042, "logits/rejected": 0.46388378739356995, "logps/chosen": -0.9029073119163513, "logps/rejected": -1.9283422231674194, "loss": 0.8891, "odds_ratio_loss": 0.7023340463638306, "rewards/accuracies": 0.5, "rewards/chosen": -0.09029072523117065, "rewards/margins": 0.10254350304603577, "rewards/rejected": -0.19283424317836761, "sft_loss": 0.9029073119163513, "step": 379 }, { "epoch": 0.549530007230658, "grad_norm": 3.0937027814614506, "learning_rate": 7.905836476913197e-06, "logits/chosen": 0.6240139603614807, "logits/rejected": 0.6435034275054932, "logps/chosen": -0.7246947884559631, "logps/rejected": -1.4072096347808838, "loss": 0.9025, "odds_ratio_loss": 0.5814554691314697, "rewards/accuracies": 0.625, "rewards/chosen": -0.07246948033571243, "rewards/margins": 0.06825148314237595, "rewards/rejected": -0.14072097837924957, "sft_loss": 0.7246947884559631, "step": 380 }, { "epoch": 0.5509761388286334, "grad_norm": 5.572005139677108, "learning_rate": 7.905165354393453e-06, "logits/chosen": 0.5487345457077026, "logits/rejected": 0.4786183834075928, "logps/chosen": -1.0790622234344482, "logps/rejected": -2.643061876296997, "loss": 0.9764, "odds_ratio_loss": 0.6008350253105164, "rewards/accuracies": 0.6875, "rewards/chosen": -0.10790622979402542, "rewards/margins": 0.1563999503850937, "rewards/rejected": -0.2643061876296997, "sft_loss": 1.0790622234344482, "step": 381 }, { "epoch": 0.5524222704266089, "grad_norm": 2.7876168403419848, "learning_rate": 7.904491877434973e-06, "logits/chosen": 0.49730992317199707, "logits/rejected": 0.5880797505378723, "logps/chosen": -0.7353624701499939, "logps/rejected": -1.8596457242965698, "loss": 0.9641, "odds_ratio_loss": 0.5217285752296448, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07353624701499939, "rewards/margins": 0.11242832243442535, "rewards/rejected": -0.18596458435058594, "sft_loss": 0.7353624701499939, "step": 382 }, { "epoch": 0.5538684020245842, "grad_norm": 3.9163687919179146, "learning_rate": 7.903816046443798e-06, "logits/chosen": 0.6902502179145813, "logits/rejected": 0.5075284242630005, "logps/chosen": -0.7549622058868408, "logps/rejected": -1.832808017730713, "loss": 0.9054, "odds_ratio_loss": 0.5362709760665894, "rewards/accuracies": 0.625, "rewards/chosen": -0.07549622654914856, "rewards/margins": 0.10778456926345825, "rewards/rejected": -0.1832807958126068, "sft_loss": 0.7549622058868408, "step": 383 }, { "epoch": 0.5553145336225597, "grad_norm": 2.7712943773528322, "learning_rate": 7.903137861827391e-06, "logits/chosen": 0.5990560054779053, "logits/rejected": 0.45573779940605164, "logps/chosen": -0.8964800238609314, "logps/rejected": -1.046126127243042, "loss": 0.9278, "odds_ratio_loss": 0.6818205118179321, "rewards/accuracies": 0.625, "rewards/chosen": -0.08964800089597702, "rewards/margins": 0.014964621514081955, "rewards/rejected": -0.10461262613534927, "sft_loss": 0.8964800238609314, "step": 384 }, { "epoch": 0.556760665220535, "grad_norm": 4.917169249683361, "learning_rate": 7.902457323994629e-06, "logits/chosen": 0.6207394599914551, "logits/rejected": 0.6121277809143066, "logps/chosen": -0.8706837892532349, "logps/rejected": -1.966713547706604, "loss": 0.9161, "odds_ratio_loss": 0.6748664379119873, "rewards/accuracies": 0.625, "rewards/chosen": -0.08706837892532349, "rewards/margins": 0.10960298031568527, "rewards/rejected": -0.19667133688926697, "sft_loss": 0.8706837892532349, "step": 385 }, { "epoch": 0.5582067968185105, "grad_norm": 3.0006340407456142, "learning_rate": 7.901774433355812e-06, "logits/chosen": 0.7476394176483154, "logits/rejected": 0.5458194017410278, "logps/chosen": -0.9039993286132812, "logps/rejected": -2.477038621902466, "loss": 0.8978, "odds_ratio_loss": 0.7108010053634644, "rewards/accuracies": 0.5625, "rewards/chosen": -0.09039993584156036, "rewards/margins": 0.15730392932891846, "rewards/rejected": -0.24770388007164001, "sft_loss": 0.9039993286132812, "step": 386 }, { "epoch": 0.559652928416486, "grad_norm": 7.6882135807422705, "learning_rate": 7.901089190322656e-06, "logits/chosen": 0.5460501909255981, "logits/rejected": 0.4279562532901764, "logps/chosen": -0.9958518147468567, "logps/rejected": -1.7544076442718506, "loss": 0.8742, "odds_ratio_loss": 0.5986472368240356, "rewards/accuracies": 0.625, "rewards/chosen": -0.09958518296480179, "rewards/margins": 0.0758555680513382, "rewards/rejected": -0.17544077336788177, "sft_loss": 0.9958518147468567, "step": 387 }, { "epoch": 0.5610990600144613, "grad_norm": 2.977393496482833, "learning_rate": 7.900401595308299e-06, "logits/chosen": 0.5677440166473389, "logits/rejected": 0.32233792543411255, "logps/chosen": -0.8183796405792236, "logps/rejected": -2.062242031097412, "loss": 0.8109, "odds_ratio_loss": 0.4541066288948059, "rewards/accuracies": 0.75, "rewards/chosen": -0.081837959587574, "rewards/margins": 0.1243862509727478, "rewards/rejected": -0.2062242031097412, "sft_loss": 0.8183796405792236, "step": 388 }, { "epoch": 0.5625451916124368, "grad_norm": 2.416176246352007, "learning_rate": 7.899711648727295e-06, "logits/chosen": 0.4597778618335724, "logits/rejected": 0.40555012226104736, "logps/chosen": -0.8106697797775269, "logps/rejected": -2.3276708126068115, "loss": 0.8754, "odds_ratio_loss": 0.40332669019699097, "rewards/accuracies": 0.875, "rewards/chosen": -0.08106698095798492, "rewards/margins": 0.15170009434223175, "rewards/rejected": -0.23276707530021667, "sft_loss": 0.8106697797775269, "step": 389 }, { "epoch": 0.5639913232104121, "grad_norm": 4.587938500883625, "learning_rate": 7.899019350995612e-06, "logits/chosen": 0.3854691982269287, "logits/rejected": 0.32688039541244507, "logps/chosen": -0.8717947006225586, "logps/rejected": -2.0538129806518555, "loss": 0.8744, "odds_ratio_loss": 0.6730850338935852, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08717946708202362, "rewards/margins": 0.118201844394207, "rewards/rejected": -0.20538130402565002, "sft_loss": 0.8717947006225586, "step": 390 }, { "epoch": 0.5654374548083876, "grad_norm": 2.727564894902918, "learning_rate": 7.89832470253064e-06, "logits/chosen": 0.44197285175323486, "logits/rejected": 0.34067657589912415, "logps/chosen": -0.7974579334259033, "logps/rejected": -2.4840853214263916, "loss": 0.8007, "odds_ratio_loss": 0.5355666875839233, "rewards/accuracies": 0.625, "rewards/chosen": -0.07974579185247421, "rewards/margins": 0.16866274178028107, "rewards/rejected": -0.24840855598449707, "sft_loss": 0.7974579334259033, "step": 391 }, { "epoch": 0.5668835864063629, "grad_norm": 3.9564187454577597, "learning_rate": 7.89762770375119e-06, "logits/chosen": 0.5264127850532532, "logits/rejected": 0.39061862230300903, "logps/chosen": -0.8429474830627441, "logps/rejected": -1.5880827903747559, "loss": 0.8232, "odds_ratio_loss": 0.725482702255249, "rewards/accuracies": 0.4375, "rewards/chosen": -0.08429475873708725, "rewards/margins": 0.07451353222131729, "rewards/rejected": -0.15880829095840454, "sft_loss": 0.8429474830627441, "step": 392 }, { "epoch": 0.5683297180043384, "grad_norm": 3.6040585185208918, "learning_rate": 7.896928355077477e-06, "logits/chosen": 0.5754531621932983, "logits/rejected": 0.374263733625412, "logps/chosen": -0.7560480833053589, "logps/rejected": -2.024271011352539, "loss": 0.8564, "odds_ratio_loss": 0.48062875866889954, "rewards/accuracies": 0.75, "rewards/chosen": -0.07560480386018753, "rewards/margins": 0.12682229280471802, "rewards/rejected": -0.20242711901664734, "sft_loss": 0.7560480833053589, "step": 393 }, { "epoch": 0.5697758496023138, "grad_norm": 3.2999991845213326, "learning_rate": 7.896226656931146e-06, "logits/chosen": 0.548308789730072, "logits/rejected": 0.45042145252227783, "logps/chosen": -0.7207891941070557, "logps/rejected": -2.421163320541382, "loss": 0.8021, "odds_ratio_loss": 0.38493600487709045, "rewards/accuracies": 0.75, "rewards/chosen": -0.07207891345024109, "rewards/margins": 0.1700374186038971, "rewards/rejected": -0.24211633205413818, "sft_loss": 0.7207891941070557, "step": 394 }, { "epoch": 0.5712219812002892, "grad_norm": 2.5181280006116515, "learning_rate": 7.895522609735254e-06, "logits/chosen": 0.6037847995758057, "logits/rejected": 0.48602890968322754, "logps/chosen": -0.7324075102806091, "logps/rejected": -1.6046706438064575, "loss": 0.8519, "odds_ratio_loss": 0.5805519819259644, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07324075698852539, "rewards/margins": 0.08722630888223648, "rewards/rejected": -0.16046705842018127, "sft_loss": 0.7324075102806091, "step": 395 }, { "epoch": 0.5726681127982647, "grad_norm": 3.0548062329457064, "learning_rate": 7.894816213914271e-06, "logits/chosen": 0.6022318005561829, "logits/rejected": 0.5420160293579102, "logps/chosen": -0.8471623659133911, "logps/rejected": -1.6920993328094482, "loss": 0.8923, "odds_ratio_loss": 0.4801836609840393, "rewards/accuracies": 0.625, "rewards/chosen": -0.08471624553203583, "rewards/margins": 0.08449369668960571, "rewards/rejected": -0.16920992732048035, "sft_loss": 0.8471623659133911, "step": 396 }, { "epoch": 0.57411424439624, "grad_norm": 2.4424090387897763, "learning_rate": 7.894107469894086e-06, "logits/chosen": 0.5112386345863342, "logits/rejected": 0.35575413703918457, "logps/chosen": -0.6903353333473206, "logps/rejected": -2.252138614654541, "loss": 0.809, "odds_ratio_loss": 0.5777711868286133, "rewards/accuracies": 0.75, "rewards/chosen": -0.06903353333473206, "rewards/margins": 0.15618032217025757, "rewards/rejected": -0.22521387040615082, "sft_loss": 0.6903353333473206, "step": 397 }, { "epoch": 0.5755603759942155, "grad_norm": 4.353942299792494, "learning_rate": 7.893396378102005e-06, "logits/chosen": 0.4526655972003937, "logits/rejected": 0.384705126285553, "logps/chosen": -0.8826476335525513, "logps/rejected": -2.210721969604492, "loss": 0.9631, "odds_ratio_loss": 0.5307878255844116, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08826476335525513, "rewards/margins": 0.13280744850635529, "rewards/rejected": -0.22107219696044922, "sft_loss": 0.8826476335525513, "step": 398 }, { "epoch": 0.5770065075921909, "grad_norm": 2.9194418404101197, "learning_rate": 7.892682938966748e-06, "logits/chosen": 0.672846257686615, "logits/rejected": 0.4256134629249573, "logps/chosen": -0.8224917650222778, "logps/rejected": -1.2346034049987793, "loss": 0.885, "odds_ratio_loss": 0.6036404967308044, "rewards/accuracies": 0.625, "rewards/chosen": -0.08224917948246002, "rewards/margins": 0.041211169213056564, "rewards/rejected": -0.12346035242080688, "sft_loss": 0.8224917650222778, "step": 399 }, { "epoch": 0.5784526391901663, "grad_norm": 3.9247918318506834, "learning_rate": 7.891967152918447e-06, "logits/chosen": 0.5552332401275635, "logits/rejected": 0.4921160042285919, "logps/chosen": -0.8602915406227112, "logps/rejected": -2.5308303833007812, "loss": 0.8544, "odds_ratio_loss": 0.49064141511917114, "rewards/accuracies": 0.75, "rewards/chosen": -0.08602915704250336, "rewards/margins": 0.16705386340618134, "rewards/rejected": -0.2530830204486847, "sft_loss": 0.8602915406227112, "step": 400 }, { "epoch": 0.5798987707881417, "grad_norm": 2.5180141294825713, "learning_rate": 7.891249020388656e-06, "logits/chosen": 0.47350338101387024, "logits/rejected": 0.4336029291152954, "logps/chosen": -0.8087297677993774, "logps/rejected": -1.585395097732544, "loss": 0.8251, "odds_ratio_loss": 0.6435711979866028, "rewards/accuracies": 0.625, "rewards/chosen": -0.08087297528982162, "rewards/margins": 0.07766654342412949, "rewards/rejected": -0.15853950381278992, "sft_loss": 0.8087297677993774, "step": 401 }, { "epoch": 0.5813449023861171, "grad_norm": 2.812975306412341, "learning_rate": 7.890528541810339e-06, "logits/chosen": 0.4831019937992096, "logits/rejected": 0.428011029958725, "logps/chosen": -0.7527783513069153, "logps/rejected": -1.017602562904358, "loss": 0.85, "odds_ratio_loss": 0.5688704252243042, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07527783513069153, "rewards/margins": 0.02648242563009262, "rewards/rejected": -0.10176026821136475, "sft_loss": 0.7527783513069153, "step": 402 }, { "epoch": 0.5827910339840926, "grad_norm": 3.6350799321420135, "learning_rate": 7.889805717617872e-06, "logits/chosen": 0.5826254487037659, "logits/rejected": 0.5473474264144897, "logps/chosen": -0.7726538181304932, "logps/rejected": -2.026120185852051, "loss": 0.815, "odds_ratio_loss": 0.525509238243103, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07726538181304932, "rewards/margins": 0.12534663081169128, "rewards/rejected": -0.2026120275259018, "sft_loss": 0.7726538181304932, "step": 403 }, { "epoch": 0.584237165582068, "grad_norm": 2.955665410682994, "learning_rate": 7.889080548247051e-06, "logits/chosen": 0.4878218173980713, "logits/rejected": 0.4596579372882843, "logps/chosen": -0.6897256374359131, "logps/rejected": -1.9527850151062012, "loss": 0.8417, "odds_ratio_loss": 0.42767712473869324, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06897257268428802, "rewards/margins": 0.1263059377670288, "rewards/rejected": -0.19527849555015564, "sft_loss": 0.6897256374359131, "step": 404 }, { "epoch": 0.5856832971800434, "grad_norm": 3.3958916438819085, "learning_rate": 7.888353034135084e-06, "logits/chosen": 0.4562031328678131, "logits/rejected": 0.35089221596717834, "logps/chosen": -0.8547066450119019, "logps/rejected": -2.9385735988616943, "loss": 0.794, "odds_ratio_loss": 0.45870649814605713, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08547066152095795, "rewards/margins": 0.20838668942451477, "rewards/rejected": -0.2938573658466339, "sft_loss": 0.8547066450119019, "step": 405 }, { "epoch": 0.5871294287780188, "grad_norm": 2.851949471321928, "learning_rate": 7.88762317572059e-06, "logits/chosen": 0.4243275225162506, "logits/rejected": 0.452597975730896, "logps/chosen": -0.7892992496490479, "logps/rejected": -1.2699556350708008, "loss": 0.7885, "odds_ratio_loss": 0.5196892023086548, "rewards/accuracies": 0.875, "rewards/chosen": -0.07892993092536926, "rewards/margins": 0.048065636307001114, "rewards/rejected": -0.12699556350708008, "sft_loss": 0.7892992496490479, "step": 406 }, { "epoch": 0.5885755603759942, "grad_norm": 4.356841434270988, "learning_rate": 7.886890973443606e-06, "logits/chosen": 0.4870211184024811, "logits/rejected": 0.36551669239997864, "logps/chosen": -0.781580924987793, "logps/rejected": -2.3665194511413574, "loss": 0.7901, "odds_ratio_loss": 0.4650033414363861, "rewards/accuracies": 0.625, "rewards/chosen": -0.07815809547901154, "rewards/margins": 0.15849386155605316, "rewards/rejected": -0.2366519570350647, "sft_loss": 0.781580924987793, "step": 407 }, { "epoch": 0.5900216919739696, "grad_norm": 3.1252881359207123, "learning_rate": 7.886156427745576e-06, "logits/chosen": 0.5762686729431152, "logits/rejected": 0.41852471232414246, "logps/chosen": -0.7866231203079224, "logps/rejected": -1.6717575788497925, "loss": 0.844, "odds_ratio_loss": 0.5896944999694824, "rewards/accuracies": 0.625, "rewards/chosen": -0.07866232097148895, "rewards/margins": 0.08851346373558044, "rewards/rejected": -0.1671757698059082, "sft_loss": 0.7866231203079224, "step": 408 }, { "epoch": 0.591467823571945, "grad_norm": 3.24125061976931, "learning_rate": 7.885419539069362e-06, "logits/chosen": 0.5813404321670532, "logits/rejected": 0.3938213884830475, "logps/chosen": -0.7569571137428284, "logps/rejected": -2.2225003242492676, "loss": 0.8635, "odds_ratio_loss": 0.41640591621398926, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0756957158446312, "rewards/margins": 0.14655432105064392, "rewards/rejected": -0.2222500443458557, "sft_loss": 0.7569571137428284, "step": 409 }, { "epoch": 0.5929139551699205, "grad_norm": 2.8346587836313746, "learning_rate": 7.884680307859237e-06, "logits/chosen": 0.7509294152259827, "logits/rejected": 0.6562870144844055, "logps/chosen": -0.6297463178634644, "logps/rejected": -2.454871416091919, "loss": 0.7754, "odds_ratio_loss": 0.375518798828125, "rewards/accuracies": 0.75, "rewards/chosen": -0.06297463178634644, "rewards/margins": 0.18251252174377441, "rewards/rejected": -0.24548715353012085, "sft_loss": 0.6297463178634644, "step": 410 }, { "epoch": 0.5943600867678959, "grad_norm": 3.222409965720389, "learning_rate": 7.883938734560888e-06, "logits/chosen": 0.5145249366760254, "logits/rejected": 0.37488415837287903, "logps/chosen": -0.7391336560249329, "logps/rejected": -1.8347660303115845, "loss": 0.861, "odds_ratio_loss": 0.5218417048454285, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07391335815191269, "rewards/margins": 0.10956324636936188, "rewards/rejected": -0.18347659707069397, "sft_loss": 0.7391336560249329, "step": 411 }, { "epoch": 0.5958062183658713, "grad_norm": 3.965379970096697, "learning_rate": 7.88319481962141e-06, "logits/chosen": 0.5220549702644348, "logits/rejected": 0.38239559531211853, "logps/chosen": -0.7255507111549377, "logps/rejected": -2.4738118648529053, "loss": 0.8449, "odds_ratio_loss": 0.4144716262817383, "rewards/accuracies": 0.75, "rewards/chosen": -0.0725550651550293, "rewards/margins": 0.17482611536979675, "rewards/rejected": -0.24738118052482605, "sft_loss": 0.7255507111549377, "step": 412 }, { "epoch": 0.5972523499638467, "grad_norm": 4.452072359820141, "learning_rate": 7.882448563489313e-06, "logits/chosen": 0.4620935916900635, "logits/rejected": 0.4356992542743683, "logps/chosen": -0.8362157344818115, "logps/rejected": -2.1093122959136963, "loss": 0.8182, "odds_ratio_loss": 0.5006383061408997, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08362157642841339, "rewards/margins": 0.127309650182724, "rewards/rejected": -0.21093124151229858, "sft_loss": 0.8362157344818115, "step": 413 }, { "epoch": 0.5986984815618221, "grad_norm": 4.629463579221466, "learning_rate": 7.881699966614516e-06, "logits/chosen": 0.4516201913356781, "logits/rejected": 0.45159292221069336, "logps/chosen": -0.6810017824172974, "logps/rejected": -2.307833671569824, "loss": 0.8887, "odds_ratio_loss": 0.4952140748500824, "rewards/accuracies": 0.625, "rewards/chosen": -0.06810016930103302, "rewards/margins": 0.16268320381641388, "rewards/rejected": -0.2307833731174469, "sft_loss": 0.6810017824172974, "step": 414 }, { "epoch": 0.6001446131597975, "grad_norm": 3.2811814411404003, "learning_rate": 7.880949029448352e-06, "logits/chosen": 0.47481125593185425, "logits/rejected": 0.399766206741333, "logps/chosen": -0.6239845156669617, "logps/rejected": -2.5231199264526367, "loss": 0.7472, "odds_ratio_loss": 0.36092132329940796, "rewards/accuracies": 0.875, "rewards/chosen": -0.06239845231175423, "rewards/margins": 0.1899135261774063, "rewards/rejected": -0.2523120045661926, "sft_loss": 0.6239845156669617, "step": 415 }, { "epoch": 0.601590744757773, "grad_norm": 3.910382755288589, "learning_rate": 7.880195752443566e-06, "logits/chosen": 0.40132609009742737, "logits/rejected": 0.2246161550283432, "logps/chosen": -0.9353955984115601, "logps/rejected": -2.6664748191833496, "loss": 0.8951, "odds_ratio_loss": 0.38120150566101074, "rewards/accuracies": 0.6875, "rewards/chosen": -0.09353956580162048, "rewards/margins": 0.17310792207717896, "rewards/rejected": -0.26664748787879944, "sft_loss": 0.9353955984115601, "step": 416 }, { "epoch": 0.6030368763557483, "grad_norm": 3.83453742589943, "learning_rate": 7.879440136054307e-06, "logits/chosen": 0.516146719455719, "logits/rejected": 0.4591212272644043, "logps/chosen": -0.7768975496292114, "logps/rejected": -3.070139169692993, "loss": 0.8451, "odds_ratio_loss": 0.453329861164093, "rewards/accuracies": 0.75, "rewards/chosen": -0.0776897519826889, "rewards/margins": 0.22932417690753937, "rewards/rejected": -0.30701392889022827, "sft_loss": 0.7768975496292114, "step": 417 }, { "epoch": 0.6044830079537238, "grad_norm": 3.8217120040188464, "learning_rate": 7.878682180736142e-06, "logits/chosen": 0.739425778388977, "logits/rejected": 0.5814308524131775, "logps/chosen": -0.8185400366783142, "logps/rejected": -1.45823073387146, "loss": 0.7712, "odds_ratio_loss": 0.6225258111953735, "rewards/accuracies": 0.625, "rewards/chosen": -0.08185401558876038, "rewards/margins": 0.06396905332803726, "rewards/rejected": -0.14582306146621704, "sft_loss": 0.8185400366783142, "step": 418 }, { "epoch": 0.6059291395516992, "grad_norm": 3.184411206872811, "learning_rate": 7.877921886946046e-06, "logits/chosen": 0.41738444566726685, "logits/rejected": 0.22434985637664795, "logps/chosen": -0.9352853298187256, "logps/rejected": -2.829294443130493, "loss": 0.889, "odds_ratio_loss": 0.4922487139701843, "rewards/accuracies": 0.6875, "rewards/chosen": -0.09352853894233704, "rewards/margins": 0.18940091133117676, "rewards/rejected": -0.2829294502735138, "sft_loss": 0.9352853298187256, "step": 419 }, { "epoch": 0.6073752711496746, "grad_norm": 2.658817199141513, "learning_rate": 7.8771592551424e-06, "logits/chosen": 0.5982671976089478, "logits/rejected": 0.4165036082267761, "logps/chosen": -0.7658669948577881, "logps/rejected": -1.7320560216903687, "loss": 0.8374, "odds_ratio_loss": 0.4825802743434906, "rewards/accuracies": 0.625, "rewards/chosen": -0.07658669352531433, "rewards/margins": 0.09661891311407089, "rewards/rejected": -0.17320561408996582, "sft_loss": 0.7658669948577881, "step": 420 }, { "epoch": 0.6088214027476501, "grad_norm": 3.345391828041502, "learning_rate": 7.876394285785e-06, "logits/chosen": 0.5503162145614624, "logits/rejected": 0.34399694204330444, "logps/chosen": -0.7347284555435181, "logps/rejected": -2.7232933044433594, "loss": 0.7884, "odds_ratio_loss": 0.4280637502670288, "rewards/accuracies": 0.625, "rewards/chosen": -0.07347285747528076, "rewards/margins": 0.19885647296905518, "rewards/rejected": -0.27232933044433594, "sft_loss": 0.7347284555435181, "step": 421 }, { "epoch": 0.6102675343456254, "grad_norm": 3.386750433238105, "learning_rate": 7.875626979335047e-06, "logits/chosen": 0.4964909255504608, "logits/rejected": 0.4483870267868042, "logps/chosen": -0.8750501275062561, "logps/rejected": -1.7626843452453613, "loss": 0.7978, "odds_ratio_loss": 0.5853959918022156, "rewards/accuracies": 0.625, "rewards/chosen": -0.08750501275062561, "rewards/margins": 0.08876340091228485, "rewards/rejected": -0.17626842856407166, "sft_loss": 0.8750501275062561, "step": 422 }, { "epoch": 0.6117136659436009, "grad_norm": 2.4670916895507005, "learning_rate": 7.874857336255153e-06, "logits/chosen": 0.5551592707633972, "logits/rejected": 0.2982563376426697, "logps/chosen": -0.6951935291290283, "logps/rejected": -4.001288890838623, "loss": 0.7655, "odds_ratio_loss": 0.5873510837554932, "rewards/accuracies": 0.5, "rewards/chosen": -0.06951935589313507, "rewards/margins": 0.330609530210495, "rewards/rejected": -0.40012890100479126, "sft_loss": 0.6951935291290283, "step": 423 }, { "epoch": 0.6131597975415762, "grad_norm": 2.8129321951372703, "learning_rate": 7.874085357009341e-06, "logits/chosen": 0.6084282994270325, "logits/rejected": 0.42167001962661743, "logps/chosen": -0.6516855955123901, "logps/rejected": -1.563461184501648, "loss": 0.8, "odds_ratio_loss": 0.40489715337753296, "rewards/accuracies": 0.75, "rewards/chosen": -0.06516855955123901, "rewards/margins": 0.0911775529384613, "rewards/rejected": -0.15634611248970032, "sft_loss": 0.6516855955123901, "step": 424 }, { "epoch": 0.6146059291395517, "grad_norm": 3.283032567196892, "learning_rate": 7.873311042063038e-06, "logits/chosen": 0.554405927658081, "logits/rejected": 0.3872930705547333, "logps/chosen": -0.6961603164672852, "logps/rejected": -2.4004769325256348, "loss": 0.8816, "odds_ratio_loss": 0.4127447307109833, "rewards/accuracies": 0.75, "rewards/chosen": -0.06961603462696075, "rewards/margins": 0.17043164372444153, "rewards/rejected": -0.24004767835140228, "sft_loss": 0.6961603164672852, "step": 425 }, { "epoch": 0.6160520607375272, "grad_norm": 2.820835695498693, "learning_rate": 7.872534391883082e-06, "logits/chosen": 0.6225115656852722, "logits/rejected": 0.5951453447341919, "logps/chosen": -0.780193567276001, "logps/rejected": -1.2512143850326538, "loss": 0.9153, "odds_ratio_loss": 0.5581642389297485, "rewards/accuracies": 0.75, "rewards/chosen": -0.07801935821771622, "rewards/margins": 0.04710208252072334, "rewards/rejected": -0.12512142956256866, "sft_loss": 0.780193567276001, "step": 426 }, { "epoch": 0.6174981923355025, "grad_norm": 2.9845279410385177, "learning_rate": 7.87175540693772e-06, "logits/chosen": 0.5121057629585266, "logits/rejected": 0.4706208109855652, "logps/chosen": -0.8175680637359619, "logps/rejected": -1.6420925855636597, "loss": 0.86, "odds_ratio_loss": 0.5938406586647034, "rewards/accuracies": 0.625, "rewards/chosen": -0.08175680041313171, "rewards/margins": 0.08245246112346649, "rewards/rejected": -0.1642092615365982, "sft_loss": 0.8175680637359619, "step": 427 }, { "epoch": 0.618944323933478, "grad_norm": 2.756053493228294, "learning_rate": 7.870974087696601e-06, "logits/chosen": 0.5588035583496094, "logits/rejected": 0.4831167459487915, "logps/chosen": -0.7480887770652771, "logps/rejected": -1.4949398040771484, "loss": 0.8318, "odds_ratio_loss": 0.6169903874397278, "rewards/accuracies": 0.625, "rewards/chosen": -0.07480888068675995, "rewards/margins": 0.07468511164188385, "rewards/rejected": -0.1494939923286438, "sft_loss": 0.7480887770652771, "step": 428 }, { "epoch": 0.6203904555314533, "grad_norm": 6.440433342716773, "learning_rate": 7.870190434630788e-06, "logits/chosen": 0.5738497972488403, "logits/rejected": 0.48213326930999756, "logps/chosen": -0.9667487740516663, "logps/rejected": -1.965078353881836, "loss": 0.9166, "odds_ratio_loss": 0.6497290134429932, "rewards/accuracies": 0.625, "rewards/chosen": -0.09667487442493439, "rewards/margins": 0.0998329371213913, "rewards/rejected": -0.19650781154632568, "sft_loss": 0.9667487740516663, "step": 429 }, { "epoch": 0.6218365871294288, "grad_norm": 5.637365479715586, "learning_rate": 7.869404448212748e-06, "logits/chosen": 0.5685498118400574, "logits/rejected": 0.39901280403137207, "logps/chosen": -0.9792392253875732, "logps/rejected": -1.1231156587600708, "loss": 0.873, "odds_ratio_loss": 0.6865856647491455, "rewards/accuracies": 0.4375, "rewards/chosen": -0.09792391955852509, "rewards/margins": 0.014387642964720726, "rewards/rejected": -0.11231156438589096, "sft_loss": 0.9792392253875732, "step": 430 }, { "epoch": 0.6232827187274042, "grad_norm": 3.2594522263553385, "learning_rate": 7.868616128916355e-06, "logits/chosen": 0.4767334461212158, "logits/rejected": 0.5127293467521667, "logps/chosen": -0.6846072673797607, "logps/rejected": -1.3331453800201416, "loss": 0.7829, "odds_ratio_loss": 0.6704195737838745, "rewards/accuracies": 0.625, "rewards/chosen": -0.06846071779727936, "rewards/margins": 0.06485381722450256, "rewards/rejected": -0.13331454992294312, "sft_loss": 0.6846072673797607, "step": 431 }, { "epoch": 0.6247288503253796, "grad_norm": 12.94226676394391, "learning_rate": 7.86782547721689e-06, "logits/chosen": 0.3910233974456787, "logits/rejected": 0.2703205645084381, "logps/chosen": -1.022456169128418, "logps/rejected": -2.576681137084961, "loss": 1.0595, "odds_ratio_loss": 0.5771183967590332, "rewards/accuracies": 0.6875, "rewards/chosen": -0.10224561393260956, "rewards/margins": 0.15542250871658325, "rewards/rejected": -0.257668137550354, "sft_loss": 1.022456169128418, "step": 432 }, { "epoch": 0.6261749819233551, "grad_norm": 4.258235258049596, "learning_rate": 7.867032493591039e-06, "logits/chosen": 0.6012907028198242, "logits/rejected": 0.4136401116847992, "logps/chosen": -0.7054803967475891, "logps/rejected": -2.6366329193115234, "loss": 0.8079, "odds_ratio_loss": 0.5335126519203186, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07054804265499115, "rewards/margins": 0.1931152641773224, "rewards/rejected": -0.26366329193115234, "sft_loss": 0.7054803967475891, "step": 433 }, { "epoch": 0.6276211135213304, "grad_norm": 3.2881085900599967, "learning_rate": 7.866237178516895e-06, "logits/chosen": 0.45769360661506653, "logits/rejected": 0.4603123664855957, "logps/chosen": -0.828456461429596, "logps/rejected": -2.0454816818237305, "loss": 0.863, "odds_ratio_loss": 0.564410924911499, "rewards/accuracies": 0.5, "rewards/chosen": -0.08284565061330795, "rewards/margins": 0.12170251458883286, "rewards/rejected": -0.2045481652021408, "sft_loss": 0.828456461429596, "step": 434 }, { "epoch": 0.6290672451193059, "grad_norm": 2.9543081301550123, "learning_rate": 7.865439532473956e-06, "logits/chosen": 0.40643981099128723, "logits/rejected": 0.35865318775177, "logps/chosen": -0.9577921032905579, "logps/rejected": -2.695612668991089, "loss": 0.8432, "odds_ratio_loss": 0.48533615469932556, "rewards/accuracies": 0.6875, "rewards/chosen": -0.09577921777963638, "rewards/margins": 0.173782080411911, "rewards/rejected": -0.2695612907409668, "sft_loss": 0.9577921032905579, "step": 435 }, { "epoch": 0.6305133767172812, "grad_norm": 3.2712972136551715, "learning_rate": 7.864639555943128e-06, "logits/chosen": 0.6304845213890076, "logits/rejected": 0.3566408157348633, "logps/chosen": -0.8317915201187134, "logps/rejected": -3.221254348754883, "loss": 0.7553, "odds_ratio_loss": 0.5319594740867615, "rewards/accuracies": 0.75, "rewards/chosen": -0.08317915350198746, "rewards/margins": 0.23894628882408142, "rewards/rejected": -0.3221254348754883, "sft_loss": 0.8317915201187134, "step": 436 }, { "epoch": 0.6319595083152567, "grad_norm": 6.796669067502768, "learning_rate": 7.863837249406717e-06, "logits/chosen": 0.4694080352783203, "logits/rejected": 0.4095763862133026, "logps/chosen": -0.8717098832130432, "logps/rejected": -1.5735933780670166, "loss": 0.8834, "odds_ratio_loss": 0.6717262268066406, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08717098832130432, "rewards/margins": 0.07018835842609406, "rewards/rejected": -0.15735936164855957, "sft_loss": 0.8717098832130432, "step": 437 }, { "epoch": 0.6334056399132321, "grad_norm": 2.795098582170022, "learning_rate": 7.86303261334844e-06, "logits/chosen": 0.4883936941623688, "logits/rejected": 0.4044135510921478, "logps/chosen": -0.7085312604904175, "logps/rejected": -3.2201850414276123, "loss": 0.8154, "odds_ratio_loss": 0.5115565061569214, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07085312902927399, "rewards/margins": 0.25116536021232605, "rewards/rejected": -0.32201850414276123, "sft_loss": 0.7085312604904175, "step": 438 }, { "epoch": 0.6348517715112075, "grad_norm": 3.1284776560367185, "learning_rate": 7.86222564825341e-06, "logits/chosen": 0.47537481784820557, "logits/rejected": 0.49767816066741943, "logps/chosen": -0.9836025834083557, "logps/rejected": -1.391028642654419, "loss": 0.906, "odds_ratio_loss": 0.7164785265922546, "rewards/accuracies": 0.5, "rewards/chosen": -0.09836025536060333, "rewards/margins": 0.04074261337518692, "rewards/rejected": -0.13910287618637085, "sft_loss": 0.9836025834083557, "step": 439 }, { "epoch": 0.6362979031091829, "grad_norm": 2.8437761782591036, "learning_rate": 7.861416354608154e-06, "logits/chosen": 0.4136395752429962, "logits/rejected": 0.320941299200058, "logps/chosen": -0.8693005442619324, "logps/rejected": -1.1584815979003906, "loss": 0.8628, "odds_ratio_loss": 0.5910212993621826, "rewards/accuracies": 0.5625, "rewards/chosen": -0.086930051445961, "rewards/margins": 0.02891811728477478, "rewards/rejected": -0.11584816873073578, "sft_loss": 0.8693005442619324, "step": 440 }, { "epoch": 0.6377440347071583, "grad_norm": 4.251716982329854, "learning_rate": 7.860604732900595e-06, "logits/chosen": 0.761461615562439, "logits/rejected": 0.5725336074829102, "logps/chosen": -0.6326714754104614, "logps/rejected": -2.821256637573242, "loss": 0.8185, "odds_ratio_loss": 0.3858832120895386, "rewards/accuracies": 0.875, "rewards/chosen": -0.06326714903116226, "rewards/margins": 0.2188585251569748, "rewards/rejected": -0.28212568163871765, "sft_loss": 0.6326714754104614, "step": 441 }, { "epoch": 0.6391901663051338, "grad_norm": 2.983907575956472, "learning_rate": 7.859790783620066e-06, "logits/chosen": 0.4991647005081177, "logits/rejected": 0.37928247451782227, "logps/chosen": -0.757746160030365, "logps/rejected": -2.254284620285034, "loss": 0.8131, "odds_ratio_loss": 0.43578675389289856, "rewards/accuracies": 0.75, "rewards/chosen": -0.07577462494373322, "rewards/margins": 0.1496538519859314, "rewards/rejected": -0.2254284769296646, "sft_loss": 0.757746160030365, "step": 442 }, { "epoch": 0.6406362979031092, "grad_norm": 3.510678850486274, "learning_rate": 7.858974507257298e-06, "logits/chosen": 0.45033857226371765, "logits/rejected": 0.356212854385376, "logps/chosen": -0.8245335817337036, "logps/rejected": -2.268136978149414, "loss": 0.852, "odds_ratio_loss": 0.5200464129447937, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08245337009429932, "rewards/margins": 0.14436031877994537, "rewards/rejected": -0.2268136888742447, "sft_loss": 0.8245335817337036, "step": 443 }, { "epoch": 0.6420824295010846, "grad_norm": 2.7353859290225033, "learning_rate": 7.858155904304427e-06, "logits/chosen": 0.4843840003013611, "logits/rejected": 0.4002265930175781, "logps/chosen": -0.9183147549629211, "logps/rejected": -1.4296321868896484, "loss": 0.8456, "odds_ratio_loss": 0.7742018103599548, "rewards/accuracies": 0.5625, "rewards/chosen": -0.09183148294687271, "rewards/margins": 0.05113174393773079, "rewards/rejected": -0.1429632306098938, "sft_loss": 0.9183147549629211, "step": 444 }, { "epoch": 0.64352856109906, "grad_norm": 3.0770167286409973, "learning_rate": 7.85733497525499e-06, "logits/chosen": 0.4947357177734375, "logits/rejected": 0.4650039076805115, "logps/chosen": -0.8962004780769348, "logps/rejected": -1.226360559463501, "loss": 0.8767, "odds_ratio_loss": 0.5764116048812866, "rewards/accuracies": 0.625, "rewards/chosen": -0.08962005376815796, "rewards/margins": 0.03301601484417915, "rewards/rejected": -0.12263606488704681, "sft_loss": 0.8962004780769348, "step": 445 }, { "epoch": 0.6449746926970354, "grad_norm": 2.87017794293484, "learning_rate": 7.856511720603932e-06, "logits/chosen": 0.390936017036438, "logits/rejected": 0.3944934606552124, "logps/chosen": -0.8416546583175659, "logps/rejected": -1.379352331161499, "loss": 0.8475, "odds_ratio_loss": 0.5354892611503601, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08416546881198883, "rewards/margins": 0.053769778460264206, "rewards/rejected": -0.13793525099754333, "sft_loss": 0.8416546583175659, "step": 446 }, { "epoch": 0.6464208242950108, "grad_norm": 4.173759451782166, "learning_rate": 7.855686140847595e-06, "logits/chosen": 0.7082953453063965, "logits/rejected": 0.3387436270713806, "logps/chosen": -0.8564531207084656, "logps/rejected": -2.3805789947509766, "loss": 0.7679, "odds_ratio_loss": 0.4507424533367157, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08564531058073044, "rewards/margins": 0.15241259336471558, "rewards/rejected": -0.2380579113960266, "sft_loss": 0.8564531207084656, "step": 447 }, { "epoch": 0.6478669558929863, "grad_norm": 2.8517830713037955, "learning_rate": 7.854858236483722e-06, "logits/chosen": 0.593734085559845, "logits/rejected": 0.31093043088912964, "logps/chosen": -0.7490995526313782, "logps/rejected": -2.389633893966675, "loss": 0.8676, "odds_ratio_loss": 0.5279134511947632, "rewards/accuracies": 0.75, "rewards/chosen": -0.07490995526313782, "rewards/margins": 0.16405344009399414, "rewards/rejected": -0.23896339535713196, "sft_loss": 0.7490995526313782, "step": 448 }, { "epoch": 0.6493130874909617, "grad_norm": 3.821681729479148, "learning_rate": 7.854028008011463e-06, "logits/chosen": 0.7026189565658569, "logits/rejected": 0.4407723546028137, "logps/chosen": -0.7633951902389526, "logps/rejected": -2.1326398849487305, "loss": 0.7899, "odds_ratio_loss": 0.4903205335140228, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07633952051401138, "rewards/margins": 0.13692447543144226, "rewards/rejected": -0.21326400339603424, "sft_loss": 0.7633951902389526, "step": 449 }, { "epoch": 0.6507592190889371, "grad_norm": 5.133699682302873, "learning_rate": 7.853195455931362e-06, "logits/chosen": 0.642242431640625, "logits/rejected": 0.5476385951042175, "logps/chosen": -0.7822733521461487, "logps/rejected": -1.8922646045684814, "loss": 0.8471, "odds_ratio_loss": 0.4665636718273163, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07822733372449875, "rewards/margins": 0.11099912226200104, "rewards/rejected": -0.18922646343708038, "sft_loss": 0.7822733521461487, "step": 450 }, { "epoch": 0.6522053506869125, "grad_norm": 2.4945263473256696, "learning_rate": 7.85236058074537e-06, "logits/chosen": 0.43350091576576233, "logits/rejected": 0.348577082157135, "logps/chosen": -0.7209492921829224, "logps/rejected": -2.9272561073303223, "loss": 0.7655, "odds_ratio_loss": 0.4050930142402649, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07209491729736328, "rewards/margins": 0.2206306755542755, "rewards/rejected": -0.2927256226539612, "sft_loss": 0.7209492921829224, "step": 451 }, { "epoch": 0.6536514822848879, "grad_norm": 3.1603284424366227, "learning_rate": 7.851523382956839e-06, "logits/chosen": 0.685008704662323, "logits/rejected": 0.41446003317832947, "logps/chosen": -0.6122561693191528, "logps/rejected": -2.7564573287963867, "loss": 0.8216, "odds_ratio_loss": 0.4269762635231018, "rewards/accuracies": 0.625, "rewards/chosen": -0.061225611716508865, "rewards/margins": 0.2144201248884201, "rewards/rejected": -0.27564573287963867, "sft_loss": 0.6122561693191528, "step": 452 }, { "epoch": 0.6550976138828634, "grad_norm": 2.5123215280417264, "learning_rate": 7.850683863070513e-06, "logits/chosen": 0.5416068434715271, "logits/rejected": 0.39375370740890503, "logps/chosen": -0.8951384425163269, "logps/rejected": -2.091689109802246, "loss": 0.7807, "odds_ratio_loss": 0.598831832408905, "rewards/accuracies": 0.5, "rewards/chosen": -0.08951384574174881, "rewards/margins": 0.1196550726890564, "rewards/rejected": -0.2091689109802246, "sft_loss": 0.8951384425163269, "step": 453 }, { "epoch": 0.6565437454808387, "grad_norm": 2.737876764066823, "learning_rate": 7.849842021592546e-06, "logits/chosen": 0.3288041353225708, "logits/rejected": 0.3482604920864105, "logps/chosen": -0.9502256512641907, "logps/rejected": -1.9984798431396484, "loss": 0.9195, "odds_ratio_loss": 0.6456742882728577, "rewards/accuracies": 0.4375, "rewards/chosen": -0.09502257406711578, "rewards/margins": 0.10482542961835861, "rewards/rejected": -0.1998479962348938, "sft_loss": 0.9502256512641907, "step": 454 }, { "epoch": 0.6579898770788142, "grad_norm": 3.869771792844011, "learning_rate": 7.848997859030484e-06, "logits/chosen": 0.4661051034927368, "logits/rejected": 0.3330320119857788, "logps/chosen": -0.8484359979629517, "logps/rejected": -2.905519485473633, "loss": 0.9405, "odds_ratio_loss": 0.4373171925544739, "rewards/accuracies": 0.8125, "rewards/chosen": -0.08484360575675964, "rewards/margins": 0.2057083249092102, "rewards/rejected": -0.29055193066596985, "sft_loss": 0.8484359979629517, "step": 455 }, { "epoch": 0.6594360086767896, "grad_norm": 3.8838637117754558, "learning_rate": 7.84815137589328e-06, "logits/chosen": 0.4900085926055908, "logits/rejected": 0.449086457490921, "logps/chosen": -0.915755033493042, "logps/rejected": -1.9004579782485962, "loss": 0.9095, "odds_ratio_loss": 0.5265486836433411, "rewards/accuracies": 0.75, "rewards/chosen": -0.0915755107998848, "rewards/margins": 0.0984703004360199, "rewards/rejected": -0.1900458037853241, "sft_loss": 0.915755033493042, "step": 456 }, { "epoch": 0.660882140274765, "grad_norm": 2.8946868825853342, "learning_rate": 7.847302572691277e-06, "logits/chosen": 0.5905077457427979, "logits/rejected": 0.3706495761871338, "logps/chosen": -0.7773427963256836, "logps/rejected": -2.9122698307037354, "loss": 0.7367, "odds_ratio_loss": 0.49745047092437744, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07773428410291672, "rewards/margins": 0.21349269151687622, "rewards/rejected": -0.29122695326805115, "sft_loss": 0.7773427963256836, "step": 457 }, { "epoch": 0.6623282718727405, "grad_norm": 2.6139847805230563, "learning_rate": 7.846451449936224e-06, "logits/chosen": 0.4387049078941345, "logits/rejected": 0.3426019549369812, "logps/chosen": -0.7675788402557373, "logps/rejected": -1.365877389907837, "loss": 0.7461, "odds_ratio_loss": 0.5597048997879028, "rewards/accuracies": 0.75, "rewards/chosen": -0.07675788551568985, "rewards/margins": 0.05982984974980354, "rewards/rejected": -0.1365877389907837, "sft_loss": 0.7675788402557373, "step": 458 }, { "epoch": 0.6637744034707158, "grad_norm": 2.6779398241996044, "learning_rate": 7.845598008141267e-06, "logits/chosen": 0.4579883813858032, "logits/rejected": 0.3316115438938141, "logps/chosen": -0.6582392454147339, "logps/rejected": -3.040428638458252, "loss": 0.7372, "odds_ratio_loss": 0.4125193953514099, "rewards/accuracies": 0.875, "rewards/chosen": -0.06582392752170563, "rewards/margins": 0.2382189780473709, "rewards/rejected": -0.30404287576675415, "sft_loss": 0.6582392454147339, "step": 459 }, { "epoch": 0.6652205350686913, "grad_norm": 2.482458627766373, "learning_rate": 7.844742247820949e-06, "logits/chosen": 0.5820556282997131, "logits/rejected": 0.38141798973083496, "logps/chosen": -0.6435836553573608, "logps/rejected": -3.4378960132598877, "loss": 0.7705, "odds_ratio_loss": 0.36722418665885925, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06435836851596832, "rewards/margins": 0.2794312536716461, "rewards/rejected": -0.34378957748413086, "sft_loss": 0.6435836553573608, "step": 460 }, { "epoch": 0.6666666666666666, "grad_norm": 3.849254654154496, "learning_rate": 7.843884169491209e-06, "logits/chosen": 0.5479345917701721, "logits/rejected": 0.38080063462257385, "logps/chosen": -0.8304556608200073, "logps/rejected": -1.7188962697982788, "loss": 0.88, "odds_ratio_loss": 0.5580796599388123, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08304556459188461, "rewards/margins": 0.08884406089782715, "rewards/rejected": -0.17188964784145355, "sft_loss": 0.8304556608200073, "step": 461 }, { "epoch": 0.6681127982646421, "grad_norm": 3.2623131963394396, "learning_rate": 7.843023773669388e-06, "logits/chosen": 0.5372626185417175, "logits/rejected": 0.38888368010520935, "logps/chosen": -0.732601523399353, "logps/rejected": -3.131098747253418, "loss": 0.7745, "odds_ratio_loss": 0.5246361494064331, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07326015084981918, "rewards/margins": 0.2398497313261032, "rewards/rejected": -0.3131098747253418, "sft_loss": 0.732601523399353, "step": 462 }, { "epoch": 0.6695589298626174, "grad_norm": 6.67370859391976, "learning_rate": 7.842161060874221e-06, "logits/chosen": 0.6117855906486511, "logits/rejected": 0.46079525351524353, "logps/chosen": -0.6942752003669739, "logps/rejected": -3.08976674079895, "loss": 0.8945, "odds_ratio_loss": 0.34388789534568787, "rewards/accuracies": 0.875, "rewards/chosen": -0.06942752003669739, "rewards/margins": 0.2395491600036621, "rewards/rejected": -0.3089766800403595, "sft_loss": 0.6942752003669739, "step": 463 }, { "epoch": 0.6710050614605929, "grad_norm": 3.4901642146880527, "learning_rate": 7.841296031625842e-06, "logits/chosen": 0.5016286969184875, "logits/rejected": 0.30921778082847595, "logps/chosen": -0.8868895769119263, "logps/rejected": -1.7468254566192627, "loss": 0.9116, "odds_ratio_loss": 0.5753090381622314, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08868895471096039, "rewards/margins": 0.08599359542131424, "rewards/rejected": -0.17468255758285522, "sft_loss": 0.8868895769119263, "step": 464 }, { "epoch": 0.6724511930585684, "grad_norm": 2.705934552586253, "learning_rate": 7.840428686445777e-06, "logits/chosen": 0.3969650864601135, "logits/rejected": 0.37639883160591125, "logps/chosen": -0.7846730947494507, "logps/rejected": -3.3052570819854736, "loss": 0.899, "odds_ratio_loss": 0.578758180141449, "rewards/accuracies": 0.625, "rewards/chosen": -0.07846730202436447, "rewards/margins": 0.2520584166049957, "rewards/rejected": -0.3305257260799408, "sft_loss": 0.7846730947494507, "step": 465 }, { "epoch": 0.6738973246565437, "grad_norm": 3.8524541345226453, "learning_rate": 7.839559025856954e-06, "logits/chosen": 0.6053705811500549, "logits/rejected": 0.5268568992614746, "logps/chosen": -0.948915958404541, "logps/rejected": -2.0502045154571533, "loss": 0.9358, "odds_ratio_loss": 0.6905159950256348, "rewards/accuracies": 0.5625, "rewards/chosen": -0.09489160031080246, "rewards/margins": 0.11012885719537735, "rewards/rejected": -0.2050204575061798, "sft_loss": 0.948915958404541, "step": 466 }, { "epoch": 0.6753434562545192, "grad_norm": 2.5938676997175505, "learning_rate": 7.838687050383694e-06, "logits/chosen": 0.5891566276550293, "logits/rejected": 0.4464053809642792, "logps/chosen": -0.857430636882782, "logps/rejected": -1.6854695081710815, "loss": 0.853, "odds_ratio_loss": 0.5570505857467651, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08574306219816208, "rewards/margins": 0.0828038826584816, "rewards/rejected": -0.16854694485664368, "sft_loss": 0.857430636882782, "step": 467 }, { "epoch": 0.6767895878524945, "grad_norm": 3.2940178997552967, "learning_rate": 7.837812760551714e-06, "logits/chosen": 0.43278759717941284, "logits/rejected": 0.26053839921951294, "logps/chosen": -0.7142987251281738, "logps/rejected": -2.4090588092803955, "loss": 0.7569, "odds_ratio_loss": 0.4101409614086151, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07142987847328186, "rewards/margins": 0.16947603225708008, "rewards/rejected": -0.24090591073036194, "sft_loss": 0.7142987251281738, "step": 468 }, { "epoch": 0.67823571945047, "grad_norm": 3.7919377185149608, "learning_rate": 7.83693615688813e-06, "logits/chosen": 0.35429370403289795, "logits/rejected": 0.39123567938804626, "logps/chosen": -0.9550790190696716, "logps/rejected": -1.3350166082382202, "loss": 0.8351, "odds_ratio_loss": 0.614581286907196, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0955079048871994, "rewards/margins": 0.037993766367435455, "rewards/rejected": -0.13350167870521545, "sft_loss": 0.9550790190696716, "step": 469 }, { "epoch": 0.6796818510484454, "grad_norm": 2.91281834178183, "learning_rate": 7.836057239921444e-06, "logits/chosen": 0.5054612755775452, "logits/rejected": 0.43213528394699097, "logps/chosen": -0.9110418558120728, "logps/rejected": -1.7556121349334717, "loss": 0.8131, "odds_ratio_loss": 0.6161195039749146, "rewards/accuracies": 0.625, "rewards/chosen": -0.0911041870713234, "rewards/margins": 0.08445702493190765, "rewards/rejected": -0.17556121945381165, "sft_loss": 0.9110418558120728, "step": 470 }, { "epoch": 0.6811279826464208, "grad_norm": 5.212947929820761, "learning_rate": 7.835176010181563e-06, "logits/chosen": 0.5794857740402222, "logits/rejected": 0.452696293592453, "logps/chosen": -0.9335224628448486, "logps/rejected": -2.0056893825531006, "loss": 0.9011, "odds_ratio_loss": 0.5872938632965088, "rewards/accuracies": 0.6875, "rewards/chosen": -0.09335225075483322, "rewards/margins": 0.10721667855978012, "rewards/rejected": -0.20056892931461334, "sft_loss": 0.9335224628448486, "step": 471 }, { "epoch": 0.6825741142443963, "grad_norm": 4.005336153240001, "learning_rate": 7.834292468199781e-06, "logits/chosen": 0.40509578585624695, "logits/rejected": 0.4229750335216522, "logps/chosen": -0.8150092959403992, "logps/rejected": -2.13399600982666, "loss": 0.9764, "odds_ratio_loss": 0.48038750886917114, "rewards/accuracies": 0.75, "rewards/chosen": -0.08150092512369156, "rewards/margins": 0.1318986564874649, "rewards/rejected": -0.21339958906173706, "sft_loss": 0.8150092959403992, "step": 472 }, { "epoch": 0.6840202458423716, "grad_norm": 4.70726946640677, "learning_rate": 7.833406614508788e-06, "logits/chosen": 0.6632890105247498, "logits/rejected": 0.6132000684738159, "logps/chosen": -0.5284931063652039, "logps/rejected": -2.332587480545044, "loss": 0.7618, "odds_ratio_loss": 0.4303593635559082, "rewards/accuracies": 0.75, "rewards/chosen": -0.05284930765628815, "rewards/margins": 0.18040944635868073, "rewards/rejected": -0.23325875401496887, "sft_loss": 0.5284931063652039, "step": 473 }, { "epoch": 0.6854663774403471, "grad_norm": 3.324398929863063, "learning_rate": 7.832518449642672e-06, "logits/chosen": 0.422392874956131, "logits/rejected": 0.4441375434398651, "logps/chosen": -0.9545919895172119, "logps/rejected": -2.223881483078003, "loss": 0.9125, "odds_ratio_loss": 0.6443504095077515, "rewards/accuracies": 0.625, "rewards/chosen": -0.09545920789241791, "rewards/margins": 0.12692894041538239, "rewards/rejected": -0.2223881483078003, "sft_loss": 0.9545919895172119, "step": 474 }, { "epoch": 0.6869125090383225, "grad_norm": 4.571799028540172, "learning_rate": 7.831627974136907e-06, "logits/chosen": 0.4669177830219269, "logits/rejected": 0.43409720063209534, "logps/chosen": -0.9336066246032715, "logps/rejected": -2.756803512573242, "loss": 0.9133, "odds_ratio_loss": 0.5451022386550903, "rewards/accuracies": 0.75, "rewards/chosen": -0.09336065500974655, "rewards/margins": 0.18231970071792603, "rewards/rejected": -0.2756803631782532, "sft_loss": 0.9336066246032715, "step": 475 }, { "epoch": 0.6883586406362979, "grad_norm": 8.263383358535402, "learning_rate": 7.830735188528369e-06, "logits/chosen": 0.5980125069618225, "logits/rejected": 0.349201500415802, "logps/chosen": -0.5357162356376648, "logps/rejected": -4.337940216064453, "loss": 0.9422, "odds_ratio_loss": 0.31241413950920105, "rewards/accuracies": 0.75, "rewards/chosen": -0.05357161909341812, "rewards/margins": 0.3802223801612854, "rewards/rejected": -0.4337940216064453, "sft_loss": 0.5357162356376648, "step": 476 }, { "epoch": 0.6898047722342733, "grad_norm": 2.6729720908747536, "learning_rate": 7.829840093355315e-06, "logits/chosen": 0.41900211572647095, "logits/rejected": 0.4156907796859741, "logps/chosen": -0.8371658325195312, "logps/rejected": -1.4804648160934448, "loss": 0.8874, "odds_ratio_loss": 0.5720455050468445, "rewards/accuracies": 0.625, "rewards/chosen": -0.08371657878160477, "rewards/margins": 0.06432989984750748, "rewards/rejected": -0.14804649353027344, "sft_loss": 0.8371658325195312, "step": 477 }, { "epoch": 0.6912509038322487, "grad_norm": 2.996041581883456, "learning_rate": 7.828942689157407e-06, "logits/chosen": 0.4665514826774597, "logits/rejected": 0.43654048442840576, "logps/chosen": -0.8104965090751648, "logps/rejected": -2.2539286613464355, "loss": 0.8045, "odds_ratio_loss": 0.5409241318702698, "rewards/accuracies": 0.625, "rewards/chosen": -0.08104965090751648, "rewards/margins": 0.14434322714805603, "rewards/rejected": -0.2253929078578949, "sft_loss": 0.8104965090751648, "step": 478 }, { "epoch": 0.6926970354302241, "grad_norm": 2.886010343753202, "learning_rate": 7.82804297647569e-06, "logits/chosen": 0.39548736810684204, "logits/rejected": 0.25323015451431274, "logps/chosen": -0.8145580887794495, "logps/rejected": -1.8616585731506348, "loss": 0.7833, "odds_ratio_loss": 0.581149160861969, "rewards/accuracies": 0.625, "rewards/chosen": -0.08145581185817719, "rewards/margins": 0.10471004992723465, "rewards/rejected": -0.18616585433483124, "sft_loss": 0.8145580887794495, "step": 479 }, { "epoch": 0.6941431670281996, "grad_norm": 3.721831170544819, "learning_rate": 7.827140955852606e-06, "logits/chosen": 0.5488985776901245, "logits/rejected": 0.43202951550483704, "logps/chosen": -0.8996453285217285, "logps/rejected": -1.8889070749282837, "loss": 0.9119, "odds_ratio_loss": 0.5485165119171143, "rewards/accuracies": 0.625, "rewards/chosen": -0.08996453881263733, "rewards/margins": 0.09892617166042328, "rewards/rejected": -0.18889069557189941, "sft_loss": 0.8996453285217285, "step": 480 }, { "epoch": 0.695589298626175, "grad_norm": 3.8244041653006247, "learning_rate": 7.826236627831986e-06, "logits/chosen": 0.49204105138778687, "logits/rejected": 0.3208334147930145, "logps/chosen": -0.7775691151618958, "logps/rejected": -2.3301753997802734, "loss": 0.875, "odds_ratio_loss": 0.42941606044769287, "rewards/accuracies": 0.75, "rewards/chosen": -0.07775691151618958, "rewards/margins": 0.15526065230369568, "rewards/rejected": -0.23301756381988525, "sft_loss": 0.7775691151618958, "step": 481 }, { "epoch": 0.6970354302241504, "grad_norm": 2.607178353982076, "learning_rate": 7.825329992959054e-06, "logits/chosen": 0.4929500222206116, "logits/rejected": 0.40421661734580994, "logps/chosen": -0.8761095404624939, "logps/rejected": -1.5417531728744507, "loss": 0.8476, "odds_ratio_loss": 0.584408700466156, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08761096000671387, "rewards/margins": 0.06656436622142792, "rewards/rejected": -0.15417534112930298, "sft_loss": 0.8761095404624939, "step": 482 }, { "epoch": 0.6984815618221258, "grad_norm": 2.6174084912028563, "learning_rate": 7.82442105178042e-06, "logits/chosen": 0.6119585633277893, "logits/rejected": 0.5422635078430176, "logps/chosen": -0.7514777779579163, "logps/rejected": -2.8054733276367188, "loss": 0.7823, "odds_ratio_loss": 0.3922504782676697, "rewards/accuracies": 0.9375, "rewards/chosen": -0.07514777779579163, "rewards/margins": 0.20539957284927368, "rewards/rejected": -0.2805473506450653, "sft_loss": 0.7514777779579163, "step": 483 }, { "epoch": 0.6999276934201012, "grad_norm": 2.3537756879016785, "learning_rate": 7.823509804844091e-06, "logits/chosen": 0.5284101963043213, "logits/rejected": 0.3043856620788574, "logps/chosen": -0.8965635299682617, "logps/rejected": -1.57173490524292, "loss": 0.865, "odds_ratio_loss": 0.5315253734588623, "rewards/accuracies": 0.8125, "rewards/chosen": -0.08965635299682617, "rewards/margins": 0.06751712411642075, "rewards/rejected": -0.1571734994649887, "sft_loss": 0.8965635299682617, "step": 484 }, { "epoch": 0.7013738250180767, "grad_norm": 4.370410960026388, "learning_rate": 7.82259625269946e-06, "logits/chosen": 0.4739269018173218, "logits/rejected": 0.28087443113327026, "logps/chosen": -0.7558042407035828, "logps/rejected": -2.000213623046875, "loss": 0.8116, "odds_ratio_loss": 0.4949937164783478, "rewards/accuracies": 0.625, "rewards/chosen": -0.0755804255604744, "rewards/margins": 0.12444092333316803, "rewards/rejected": -0.20002135634422302, "sft_loss": 0.7558042407035828, "step": 485 }, { "epoch": 0.702819956616052, "grad_norm": 2.757296734529158, "learning_rate": 7.821680395897311e-06, "logits/chosen": 0.5641602873802185, "logits/rejected": 0.4281392991542816, "logps/chosen": -0.9164305925369263, "logps/rejected": -1.9298672676086426, "loss": 0.8482, "odds_ratio_loss": 0.5546359419822693, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0916430652141571, "rewards/margins": 0.10134366154670715, "rewards/rejected": -0.19298672676086426, "sft_loss": 0.9164305925369263, "step": 486 }, { "epoch": 0.7042660882140275, "grad_norm": 2.7370944482804203, "learning_rate": 7.820762234989819e-06, "logits/chosen": 0.5384199619293213, "logits/rejected": 0.41531190276145935, "logps/chosen": -0.8857967257499695, "logps/rejected": -1.7009695768356323, "loss": 0.9314, "odds_ratio_loss": 0.6683101654052734, "rewards/accuracies": 0.5, "rewards/chosen": -0.08857966959476471, "rewards/margins": 0.0815172791481018, "rewards/rejected": -0.1700969636440277, "sft_loss": 0.8857967257499695, "step": 487 }, { "epoch": 0.7057122198120029, "grad_norm": 3.6320189601532973, "learning_rate": 7.819841770530546e-06, "logits/chosen": 0.5773338675498962, "logits/rejected": 0.33606287837028503, "logps/chosen": -0.5831294059753418, "logps/rejected": -2.437774181365967, "loss": 0.8034, "odds_ratio_loss": 0.40286755561828613, "rewards/accuracies": 0.8125, "rewards/chosen": -0.05831294506788254, "rewards/margins": 0.18546447157859802, "rewards/rejected": -0.24377740919589996, "sft_loss": 0.5831294059753418, "step": 488 }, { "epoch": 0.7071583514099783, "grad_norm": 2.8677538750318003, "learning_rate": 7.818919003074443e-06, "logits/chosen": 0.42975157499313354, "logits/rejected": 0.3642490804195404, "logps/chosen": -0.8703562021255493, "logps/rejected": -1.5688755512237549, "loss": 0.8759, "odds_ratio_loss": 0.5489083528518677, "rewards/accuracies": 0.625, "rewards/chosen": -0.08703562617301941, "rewards/margins": 0.06985193490982056, "rewards/rejected": -0.15688756108283997, "sft_loss": 0.8703562021255493, "step": 489 }, { "epoch": 0.7086044830079538, "grad_norm": 3.346433651687049, "learning_rate": 7.817993933177848e-06, "logits/chosen": 0.4698405861854553, "logits/rejected": 0.3148632347583771, "logps/chosen": -0.8803499937057495, "logps/rejected": -2.480961561203003, "loss": 0.9126, "odds_ratio_loss": 0.6439063549041748, "rewards/accuracies": 0.625, "rewards/chosen": -0.08803500235080719, "rewards/margins": 0.16006116569042206, "rewards/rejected": -0.24809619784355164, "sft_loss": 0.8803499937057495, "step": 490 }, { "epoch": 0.7100506146059291, "grad_norm": 2.4161250021000606, "learning_rate": 7.817066561398493e-06, "logits/chosen": 0.6872197985649109, "logits/rejected": 0.5721977353096008, "logps/chosen": -0.6891111731529236, "logps/rejected": -2.3174729347229004, "loss": 0.8469, "odds_ratio_loss": 0.5914407968521118, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0689111202955246, "rewards/margins": 0.16283617913722992, "rewards/rejected": -0.23174728453159332, "sft_loss": 0.6891111731529236, "step": 491 }, { "epoch": 0.7114967462039046, "grad_norm": 2.375376214224217, "learning_rate": 7.81613688829549e-06, "logits/chosen": 0.5590274930000305, "logits/rejected": 0.4893609285354614, "logps/chosen": -0.8814407587051392, "logps/rejected": -1.0228970050811768, "loss": 1.008, "odds_ratio_loss": 0.6809512972831726, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08814407885074615, "rewards/margins": 0.014145624823868275, "rewards/rejected": -0.10228970646858215, "sft_loss": 0.8814407587051392, "step": 492 }, { "epoch": 0.7129428778018799, "grad_norm": 2.915036842751362, "learning_rate": 7.815204914429343e-06, "logits/chosen": 0.45566874742507935, "logits/rejected": 0.29462623596191406, "logps/chosen": -0.8483090400695801, "logps/rejected": -3.8663082122802734, "loss": 0.8399, "odds_ratio_loss": 0.5736794471740723, "rewards/accuracies": 0.625, "rewards/chosen": -0.08483090251684189, "rewards/margins": 0.3017999529838562, "rewards/rejected": -0.3866308331489563, "sft_loss": 0.8483090400695801, "step": 493 }, { "epoch": 0.7143890093998554, "grad_norm": 2.823588392434594, "learning_rate": 7.814270640361947e-06, "logits/chosen": 0.4194034934043884, "logits/rejected": 0.3192807734012604, "logps/chosen": -0.7508400678634644, "logps/rejected": -2.6305389404296875, "loss": 0.9102, "odds_ratio_loss": 0.37007543444633484, "rewards/accuracies": 0.875, "rewards/chosen": -0.07508400082588196, "rewards/margins": 0.1879698932170868, "rewards/rejected": -0.26305389404296875, "sft_loss": 0.7508400678634644, "step": 494 }, { "epoch": 0.7158351409978309, "grad_norm": 8.703869182596133, "learning_rate": 7.813334066656575e-06, "logits/chosen": 0.5819438099861145, "logits/rejected": 0.35457098484039307, "logps/chosen": -0.6189179420471191, "logps/rejected": -4.7622880935668945, "loss": 0.7467, "odds_ratio_loss": 0.27410566806793213, "rewards/accuracies": 0.9375, "rewards/chosen": -0.061891794204711914, "rewards/margins": 0.41433700919151306, "rewards/rejected": -0.476228803396225, "sft_loss": 0.6189179420471191, "step": 495 }, { "epoch": 0.7172812725958062, "grad_norm": 2.5095698313273447, "learning_rate": 7.812395193877891e-06, "logits/chosen": 0.6065265536308289, "logits/rejected": 0.36203110218048096, "logps/chosen": -0.8203286528587341, "logps/rejected": -1.367362141609192, "loss": 0.8163, "odds_ratio_loss": 0.5582252740859985, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08203285932540894, "rewards/margins": 0.054703351110219955, "rewards/rejected": -0.1367362141609192, "sft_loss": 0.8203286528587341, "step": 496 }, { "epoch": 0.7187274041937817, "grad_norm": 2.450038513585519, "learning_rate": 7.811454022591946e-06, "logits/chosen": 0.410911500453949, "logits/rejected": 0.3190882205963135, "logps/chosen": -0.840955913066864, "logps/rejected": -2.12227725982666, "loss": 0.861, "odds_ratio_loss": 0.4995306730270386, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08409559726715088, "rewards/margins": 0.12813213467597961, "rewards/rejected": -0.2122277319431305, "sft_loss": 0.840955913066864, "step": 497 }, { "epoch": 0.720173535791757, "grad_norm": 3.1149539308383627, "learning_rate": 7.810510553366177e-06, "logits/chosen": 0.6082262396812439, "logits/rejected": 0.48194149136543274, "logps/chosen": -0.8298695087432861, "logps/rejected": -1.5063502788543701, "loss": 0.8049, "odds_ratio_loss": 0.5610617399215698, "rewards/accuracies": 0.75, "rewards/chosen": -0.08298695832490921, "rewards/margins": 0.06764806807041168, "rewards/rejected": -0.1506350338459015, "sft_loss": 0.8298695087432861, "step": 498 }, { "epoch": 0.7216196673897325, "grad_norm": 4.575070600880403, "learning_rate": 7.809564786769403e-06, "logits/chosen": 0.42572611570358276, "logits/rejected": 0.5204564332962036, "logps/chosen": -0.7067459225654602, "logps/rejected": -1.7655752897262573, "loss": 0.8539, "odds_ratio_loss": 0.49053311347961426, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0706745982170105, "rewards/margins": 0.10588293522596359, "rewards/rejected": -0.1765575408935547, "sft_loss": 0.7067459225654602, "step": 499 }, { "epoch": 0.7230657989877078, "grad_norm": 4.362979902093869, "learning_rate": 7.808616723371828e-06, "logits/chosen": 0.5878921151161194, "logits/rejected": 0.39825379848480225, "logps/chosen": -0.6144382953643799, "logps/rejected": -1.6051030158996582, "loss": 0.7595, "odds_ratio_loss": 0.3771412968635559, "rewards/accuracies": 0.75, "rewards/chosen": -0.06144382804632187, "rewards/margins": 0.09906647354364395, "rewards/rejected": -0.16051030158996582, "sft_loss": 0.6144382953643799, "step": 500 }, { "epoch": 0.7245119305856833, "grad_norm": 2.838948408412676, "learning_rate": 7.807666363745048e-06, "logits/chosen": 0.4524368941783905, "logits/rejected": 0.24179279804229736, "logps/chosen": -0.6637477874755859, "logps/rejected": -1.8917555809020996, "loss": 0.9149, "odds_ratio_loss": 0.4682096838951111, "rewards/accuracies": 0.75, "rewards/chosen": -0.0663747787475586, "rewards/margins": 0.1228007823228836, "rewards/rejected": -0.1891755610704422, "sft_loss": 0.6637477874755859, "step": 501 }, { "epoch": 0.7259580621836587, "grad_norm": 3.244484568190041, "learning_rate": 7.806713708462036e-06, "logits/chosen": 0.31584107875823975, "logits/rejected": 0.3299151062965393, "logps/chosen": -0.8359942436218262, "logps/rejected": -1.7388925552368164, "loss": 0.9218, "odds_ratio_loss": 0.34620821475982666, "rewards/accuracies": 0.9375, "rewards/chosen": -0.08359942585229874, "rewards/margins": 0.09028984606266022, "rewards/rejected": -0.17388926446437836, "sft_loss": 0.8359942436218262, "step": 502 }, { "epoch": 0.7274041937816341, "grad_norm": 2.9413391620870204, "learning_rate": 7.805758758097152e-06, "logits/chosen": 0.39965227246284485, "logits/rejected": 0.2767530083656311, "logps/chosen": -0.682356059551239, "logps/rejected": -1.8037464618682861, "loss": 0.8816, "odds_ratio_loss": 0.42978233098983765, "rewards/accuracies": 0.75, "rewards/chosen": -0.0682356059551239, "rewards/margins": 0.11213904619216919, "rewards/rejected": -0.18037466704845428, "sft_loss": 0.682356059551239, "step": 503 }, { "epoch": 0.7288503253796096, "grad_norm": 7.08235782639055, "learning_rate": 7.804801513226138e-06, "logits/chosen": 0.6158980131149292, "logits/rejected": 0.4643901288509369, "logps/chosen": -0.6825918555259705, "logps/rejected": -1.3164550065994263, "loss": 0.9577, "odds_ratio_loss": 0.5534126162528992, "rewards/accuracies": 0.5625, "rewards/chosen": -0.06825917959213257, "rewards/margins": 0.06338632106781006, "rewards/rejected": -0.13164550065994263, "sft_loss": 0.6825918555259705, "step": 504 }, { "epoch": 0.7302964569775849, "grad_norm": 2.430837485754504, "learning_rate": 7.80384197442612e-06, "logits/chosen": 0.6757205724716187, "logits/rejected": 0.3712140619754791, "logps/chosen": -0.7965311408042908, "logps/rejected": -1.7287770509719849, "loss": 0.8777, "odds_ratio_loss": 0.5180415511131287, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07965311408042908, "rewards/margins": 0.09322459250688553, "rewards/rejected": -0.172877699136734, "sft_loss": 0.7965311408042908, "step": 505 }, { "epoch": 0.7317425885755604, "grad_norm": 2.461862933908863, "learning_rate": 7.802880142275609e-06, "logits/chosen": 0.5290156006813049, "logits/rejected": 0.36588141322135925, "logps/chosen": -0.6499413251876831, "logps/rejected": -2.212672472000122, "loss": 0.7377, "odds_ratio_loss": 0.4530958831310272, "rewards/accuracies": 0.75, "rewards/chosen": -0.06499413400888443, "rewards/margins": 0.15627311170101166, "rewards/rejected": -0.2212672382593155, "sft_loss": 0.6499413251876831, "step": 506 }, { "epoch": 0.7331887201735358, "grad_norm": 4.607958134692978, "learning_rate": 7.801916017354498e-06, "logits/chosen": 0.4226146936416626, "logits/rejected": 0.34249216318130493, "logps/chosen": -0.8650091290473938, "logps/rejected": -1.9593756198883057, "loss": 0.8793, "odds_ratio_loss": 0.5410417318344116, "rewards/accuracies": 0.625, "rewards/chosen": -0.08650091290473938, "rewards/margins": 0.10943662375211716, "rewards/rejected": -0.19593754410743713, "sft_loss": 0.8650091290473938, "step": 507 }, { "epoch": 0.7346348517715112, "grad_norm": 2.889267730930254, "learning_rate": 7.80094960024406e-06, "logits/chosen": 0.382773756980896, "logits/rejected": 0.24849705398082733, "logps/chosen": -0.9779189825057983, "logps/rejected": -2.239924430847168, "loss": 0.7804, "odds_ratio_loss": 0.5563200116157532, "rewards/accuracies": 0.75, "rewards/chosen": -0.0977918952703476, "rewards/margins": 0.12620052695274353, "rewards/rejected": -0.2239924520254135, "sft_loss": 0.9779189825057983, "step": 508 }, { "epoch": 0.7360809833694866, "grad_norm": 3.7010451738157886, "learning_rate": 7.799980891526951e-06, "logits/chosen": 0.48358017206192017, "logits/rejected": 0.28892093896865845, "logps/chosen": -0.7672395706176758, "logps/rejected": -2.660430431365967, "loss": 0.9052, "odds_ratio_loss": 0.34770241379737854, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07672396302223206, "rewards/margins": 0.18931908905506134, "rewards/rejected": -0.2660430669784546, "sft_loss": 0.7672395706176758, "step": 509 }, { "epoch": 0.737527114967462, "grad_norm": 2.965109666641044, "learning_rate": 7.799009891787211e-06, "logits/chosen": 0.4797486960887909, "logits/rejected": 0.34847331047058105, "logps/chosen": -0.7682068347930908, "logps/rejected": -1.6704812049865723, "loss": 0.7642, "odds_ratio_loss": 0.5312727093696594, "rewards/accuracies": 0.75, "rewards/chosen": -0.07682067900896072, "rewards/margins": 0.09022743999958038, "rewards/rejected": -0.1670481115579605, "sft_loss": 0.7682068347930908, "step": 510 }, { "epoch": 0.7389732465654375, "grad_norm": 4.029197098624356, "learning_rate": 7.798036601610256e-06, "logits/chosen": 0.3794160783290863, "logits/rejected": 0.309948205947876, "logps/chosen": -0.7439759969711304, "logps/rejected": -2.09332013130188, "loss": 0.8295, "odds_ratio_loss": 0.5097244381904602, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07439759373664856, "rewards/margins": 0.13493439555168152, "rewards/rejected": -0.20933200418949127, "sft_loss": 0.7439759969711304, "step": 511 }, { "epoch": 0.7404193781634129, "grad_norm": 2.667176937833829, "learning_rate": 7.79706102158289e-06, "logits/chosen": 0.5048097968101501, "logits/rejected": 0.3359774053096771, "logps/chosen": -0.9156619310379028, "logps/rejected": -1.9869725704193115, "loss": 0.9098, "odds_ratio_loss": 0.5073124766349792, "rewards/accuracies": 0.6875, "rewards/chosen": -0.09156619757413864, "rewards/margins": 0.10713106393814087, "rewards/rejected": -0.1986972540616989, "sft_loss": 0.9156619310379028, "step": 512 }, { "epoch": 0.7418655097613883, "grad_norm": 2.7993967833945868, "learning_rate": 7.796083152293293e-06, "logits/chosen": 0.5514634251594543, "logits/rejected": 0.49493858218193054, "logps/chosen": -0.719922661781311, "logps/rejected": -1.1581745147705078, "loss": 0.8478, "odds_ratio_loss": 0.5648511052131653, "rewards/accuracies": 0.625, "rewards/chosen": -0.07199226319789886, "rewards/margins": 0.0438251867890358, "rewards/rejected": -0.11581744998693466, "sft_loss": 0.719922661781311, "step": 513 }, { "epoch": 0.7433116413593637, "grad_norm": 3.1335701177478303, "learning_rate": 7.795102994331024e-06, "logits/chosen": 0.41231465339660645, "logits/rejected": 0.26936453580856323, "logps/chosen": -0.9713563323020935, "logps/rejected": -1.754350185394287, "loss": 0.8876, "odds_ratio_loss": 0.46331292390823364, "rewards/accuracies": 0.75, "rewards/chosen": -0.09713563323020935, "rewards/margins": 0.078299380838871, "rewards/rejected": -0.17543500661849976, "sft_loss": 0.9713563323020935, "step": 514 }, { "epoch": 0.7447577729573391, "grad_norm": 3.4077557927604154, "learning_rate": 7.794120548287026e-06, "logits/chosen": 0.5619039535522461, "logits/rejected": 0.44908225536346436, "logps/chosen": -0.8658353090286255, "logps/rejected": -1.4560352563858032, "loss": 0.8073, "odds_ratio_loss": 0.6640084981918335, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08658353984355927, "rewards/margins": 0.05901998281478882, "rewards/rejected": -0.14560352265834808, "sft_loss": 0.8658353090286255, "step": 515 }, { "epoch": 0.7462039045553145, "grad_norm": 2.278784725218634, "learning_rate": 7.793135814753618e-06, "logits/chosen": 0.5649205446243286, "logits/rejected": 0.4413209855556488, "logps/chosen": -0.7314971685409546, "logps/rejected": -1.7357701063156128, "loss": 0.8308, "odds_ratio_loss": 0.6118183135986328, "rewards/accuracies": 0.5, "rewards/chosen": -0.07314971834421158, "rewards/margins": 0.1004272997379303, "rewards/rejected": -0.17357701063156128, "sft_loss": 0.7314971685409546, "step": 516 }, { "epoch": 0.74765003615329, "grad_norm": 2.527228379509471, "learning_rate": 7.7921487943245e-06, "logits/chosen": 0.5564696192741394, "logits/rejected": 0.26475444436073303, "logps/chosen": -0.6380961537361145, "logps/rejected": -3.072185516357422, "loss": 0.7328, "odds_ratio_loss": 0.37601861357688904, "rewards/accuracies": 0.75, "rewards/chosen": -0.06380962580442429, "rewards/margins": 0.2434089481830597, "rewards/rejected": -0.3072185814380646, "sft_loss": 0.6380961537361145, "step": 517 }, { "epoch": 0.7490961677512654, "grad_norm": 4.417874578202541, "learning_rate": 7.791159487594752e-06, "logits/chosen": 0.37396425008773804, "logits/rejected": 0.2895720601081848, "logps/chosen": -0.9451796412467957, "logps/rejected": -1.155394434928894, "loss": 0.854, "odds_ratio_loss": 0.7048704028129578, "rewards/accuracies": 0.5, "rewards/chosen": -0.09451796114444733, "rewards/margins": 0.021021487191319466, "rewards/rejected": -0.11553944647312164, "sft_loss": 0.9451796412467957, "step": 518 }, { "epoch": 0.7505422993492408, "grad_norm": 4.6204210581385325, "learning_rate": 7.790167895160827e-06, "logits/chosen": 0.49021750688552856, "logits/rejected": 0.38616707921028137, "logps/chosen": -0.6952154636383057, "logps/rejected": -1.7352021932601929, "loss": 0.8745, "odds_ratio_loss": 0.521263062953949, "rewards/accuracies": 0.625, "rewards/chosen": -0.06952154636383057, "rewards/margins": 0.10399869829416275, "rewards/rejected": -0.17352023720741272, "sft_loss": 0.6952154636383057, "step": 519 }, { "epoch": 0.7519884309472162, "grad_norm": 7.408022711503893, "learning_rate": 7.789174017620563e-06, "logits/chosen": 0.5427068471908569, "logits/rejected": 0.5353628396987915, "logps/chosen": -0.7601156234741211, "logps/rejected": -1.1861733198165894, "loss": 0.8739, "odds_ratio_loss": 0.5829976797103882, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07601156085729599, "rewards/margins": 0.04260575771331787, "rewards/rejected": -0.11861732602119446, "sft_loss": 0.7601156234741211, "step": 520 }, { "epoch": 0.7534345625451916, "grad_norm": 3.002218429840675, "learning_rate": 7.788177855573172e-06, "logits/chosen": 0.4132567048072815, "logits/rejected": 0.40480029582977295, "logps/chosen": -0.5906501412391663, "logps/rejected": -1.6327810287475586, "loss": 0.8547, "odds_ratio_loss": 0.5081474781036377, "rewards/accuracies": 0.6875, "rewards/chosen": -0.059065017849206924, "rewards/margins": 0.10421308130025864, "rewards/rejected": -0.16327810287475586, "sft_loss": 0.5906501412391663, "step": 521 }, { "epoch": 0.754880694143167, "grad_norm": 4.557354676364852, "learning_rate": 7.787179409619243e-06, "logits/chosen": 0.39193394780158997, "logits/rejected": 0.36725348234176636, "logps/chosen": -0.7613071203231812, "logps/rejected": -1.2227325439453125, "loss": 0.8271, "odds_ratio_loss": 0.5340095162391663, "rewards/accuracies": 0.625, "rewards/chosen": -0.07613071799278259, "rewards/margins": 0.046142540872097015, "rewards/rejected": -0.12227325141429901, "sft_loss": 0.7613071203231812, "step": 522 }, { "epoch": 0.7563268257411424, "grad_norm": 3.041867939127662, "learning_rate": 7.786178680360743e-06, "logits/chosen": 0.4620596468448639, "logits/rejected": 0.37321341037750244, "logps/chosen": -0.9166240692138672, "logps/rejected": -1.4555108547210693, "loss": 0.9253, "odds_ratio_loss": 0.6333938837051392, "rewards/accuracies": 0.6875, "rewards/chosen": -0.09166240692138672, "rewards/margins": 0.053888678550720215, "rewards/rejected": -0.14555108547210693, "sft_loss": 0.9166240692138672, "step": 523 }, { "epoch": 0.7577729573391179, "grad_norm": 2.850718909419124, "learning_rate": 7.785175668401015e-06, "logits/chosen": 0.35010722279548645, "logits/rejected": 0.2687487006187439, "logps/chosen": -0.7346320748329163, "logps/rejected": -2.8913722038269043, "loss": 0.8036, "odds_ratio_loss": 0.31207412481307983, "rewards/accuracies": 0.875, "rewards/chosen": -0.07346320897340775, "rewards/margins": 0.21567398309707642, "rewards/rejected": -0.28913718461990356, "sft_loss": 0.7346320748329163, "step": 524 }, { "epoch": 0.7592190889370932, "grad_norm": 2.8661637700070517, "learning_rate": 7.784170374344778e-06, "logits/chosen": 0.3620089292526245, "logits/rejected": 0.28093022108078003, "logps/chosen": -0.947007417678833, "logps/rejected": -2.0669689178466797, "loss": 0.9131, "odds_ratio_loss": 0.604080855846405, "rewards/accuracies": 0.6875, "rewards/chosen": -0.09470075368881226, "rewards/margins": 0.11199614405632019, "rewards/rejected": -0.20669689774513245, "sft_loss": 0.947007417678833, "step": 525 }, { "epoch": 0.7606652205350687, "grad_norm": 3.1260980643123295, "learning_rate": 7.78316279879813e-06, "logits/chosen": 0.4290822148323059, "logits/rejected": 0.3005208373069763, "logps/chosen": -0.9603489637374878, "logps/rejected": -1.726524829864502, "loss": 0.9003, "odds_ratio_loss": 0.7454808950424194, "rewards/accuracies": 0.625, "rewards/chosen": -0.09603489935398102, "rewards/margins": 0.07661759853363037, "rewards/rejected": -0.1726524978876114, "sft_loss": 0.9603489637374878, "step": 526 }, { "epoch": 0.7621113521330442, "grad_norm": 3.225760777962045, "learning_rate": 7.78215294236854e-06, "logits/chosen": 0.41072916984558105, "logits/rejected": 0.38332122564315796, "logps/chosen": -0.7351124286651611, "logps/rejected": -1.3829286098480225, "loss": 0.889, "odds_ratio_loss": 0.4489685893058777, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07351124286651611, "rewards/margins": 0.06478161364793777, "rewards/rejected": -0.13829286396503448, "sft_loss": 0.7351124286651611, "step": 527 }, { "epoch": 0.7635574837310195, "grad_norm": 3.557799675646004, "learning_rate": 7.781140805664854e-06, "logits/chosen": 0.5594673752784729, "logits/rejected": 0.4795070290565491, "logps/chosen": -0.6471788883209229, "logps/rejected": -1.9761933088302612, "loss": 0.7664, "odds_ratio_loss": 0.5941170454025269, "rewards/accuracies": 0.5, "rewards/chosen": -0.06471788883209229, "rewards/margins": 0.13290144503116608, "rewards/rejected": -0.19761933386325836, "sft_loss": 0.6471788883209229, "step": 528 }, { "epoch": 0.765003615328995, "grad_norm": 2.6986867787865294, "learning_rate": 7.780126389297296e-06, "logits/chosen": 0.2951076626777649, "logits/rejected": 0.20684948563575745, "logps/chosen": -0.7162260413169861, "logps/rejected": -1.82748544216156, "loss": 0.8949, "odds_ratio_loss": 0.44935142993927, "rewards/accuracies": 0.75, "rewards/chosen": -0.07162261009216309, "rewards/margins": 0.11112594604492188, "rewards/rejected": -0.18274855613708496, "sft_loss": 0.7162260413169861, "step": 529 }, { "epoch": 0.7664497469269703, "grad_norm": 2.607178013962108, "learning_rate": 7.779109693877458e-06, "logits/chosen": 0.5780777931213379, "logits/rejected": 0.3079754710197449, "logps/chosen": -0.6577814221382141, "logps/rejected": -1.864539384841919, "loss": 0.802, "odds_ratio_loss": 0.4034327268600464, "rewards/accuracies": 0.6875, "rewards/chosen": -0.06577815115451813, "rewards/margins": 0.12067579478025436, "rewards/rejected": -0.1864539533853531, "sft_loss": 0.6577814221382141, "step": 530 }, { "epoch": 0.7678958785249458, "grad_norm": 3.7648940723064124, "learning_rate": 7.77809072001831e-06, "logits/chosen": 0.4569055140018463, "logits/rejected": 0.35255199670791626, "logps/chosen": -0.8331925868988037, "logps/rejected": -1.2831447124481201, "loss": 0.7794, "odds_ratio_loss": 0.624811053276062, "rewards/accuracies": 0.625, "rewards/chosen": -0.08331926167011261, "rewards/margins": 0.04499521851539612, "rewards/rejected": -0.12831448018550873, "sft_loss": 0.8331925868988037, "step": 531 }, { "epoch": 0.7693420101229211, "grad_norm": 3.154231228709698, "learning_rate": 7.777069468334197e-06, "logits/chosen": 0.3883509635925293, "logits/rejected": 0.4145001173019409, "logps/chosen": -0.6996007561683655, "logps/rejected": -1.6742178201675415, "loss": 0.8262, "odds_ratio_loss": 0.6708589196205139, "rewards/accuracies": 0.75, "rewards/chosen": -0.06996007263660431, "rewards/margins": 0.09746171534061432, "rewards/rejected": -0.16742177307605743, "sft_loss": 0.6996007561683655, "step": 532 }, { "epoch": 0.7707881417208966, "grad_norm": 2.81732861155204, "learning_rate": 7.776045939440835e-06, "logits/chosen": 0.2952539920806885, "logits/rejected": 0.20435559749603271, "logps/chosen": -1.0842608213424683, "logps/rejected": -1.9723320007324219, "loss": 0.9858, "odds_ratio_loss": 0.6315648555755615, "rewards/accuracies": 0.5625, "rewards/chosen": -0.10842607915401459, "rewards/margins": 0.088807113468647, "rewards/rejected": -0.197233185172081, "sft_loss": 1.0842608213424683, "step": 533 }, { "epoch": 0.7722342733188721, "grad_norm": 3.248807458733231, "learning_rate": 7.77502013395531e-06, "logits/chosen": 0.3924180865287781, "logits/rejected": 0.3698059320449829, "logps/chosen": -0.8218859434127808, "logps/rejected": -1.5072885751724243, "loss": 0.7944, "odds_ratio_loss": 0.3994704484939575, "rewards/accuracies": 0.875, "rewards/chosen": -0.08218859881162643, "rewards/margins": 0.06854026019573212, "rewards/rejected": -0.15072886645793915, "sft_loss": 0.8218859434127808, "step": 534 }, { "epoch": 0.7736804049168474, "grad_norm": 2.471150436216554, "learning_rate": 7.773992052496087e-06, "logits/chosen": 0.413666307926178, "logits/rejected": 0.315445214509964, "logps/chosen": -0.8587773442268372, "logps/rejected": -2.272141456604004, "loss": 0.9716, "odds_ratio_loss": 0.5251275897026062, "rewards/accuracies": 0.625, "rewards/chosen": -0.08587773889303207, "rewards/margins": 0.14133641123771667, "rewards/rejected": -0.22721417248249054, "sft_loss": 0.8587773442268372, "step": 535 }, { "epoch": 0.7751265365148229, "grad_norm": 6.0303720876003615, "learning_rate": 7.772961695683001e-06, "logits/chosen": 0.5290374159812927, "logits/rejected": 0.35298866033554077, "logps/chosen": -0.8901825547218323, "logps/rejected": -2.4757721424102783, "loss": 0.8743, "odds_ratio_loss": 0.5522928237915039, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08901825547218323, "rewards/margins": 0.1585589349269867, "rewards/rejected": -0.24757720530033112, "sft_loss": 0.8901825547218323, "step": 536 }, { "epoch": 0.7765726681127982, "grad_norm": 3.199778027441658, "learning_rate": 7.771929064137255e-06, "logits/chosen": 0.301146537065506, "logits/rejected": 0.21612121164798737, "logps/chosen": -0.7302257418632507, "logps/rejected": -3.005721092224121, "loss": 0.8393, "odds_ratio_loss": 0.3484118580818176, "rewards/accuracies": 0.875, "rewards/chosen": -0.07302258163690567, "rewards/margins": 0.22754952311515808, "rewards/rejected": -0.30057209730148315, "sft_loss": 0.7302257418632507, "step": 537 }, { "epoch": 0.7780187997107737, "grad_norm": 2.8558577962690075, "learning_rate": 7.77089415848143e-06, "logits/chosen": 0.3406679034233093, "logits/rejected": 0.22741743922233582, "logps/chosen": -0.7484648823738098, "logps/rejected": -2.0377368927001953, "loss": 0.8792, "odds_ratio_loss": 0.5459549427032471, "rewards/accuracies": 0.75, "rewards/chosen": -0.07484649121761322, "rewards/margins": 0.12892721593379974, "rewards/rejected": -0.20377370715141296, "sft_loss": 0.7484648823738098, "step": 538 }, { "epoch": 0.779464931308749, "grad_norm": 3.044240366400535, "learning_rate": 7.769856979339473e-06, "logits/chosen": 0.39080581068992615, "logits/rejected": 0.27354422211647034, "logps/chosen": -0.7717220783233643, "logps/rejected": -2.013183355331421, "loss": 0.9099, "odds_ratio_loss": 0.45660167932510376, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07717221230268478, "rewards/margins": 0.12414614111185074, "rewards/rejected": -0.20131835341453552, "sft_loss": 0.7717220783233643, "step": 539 }, { "epoch": 0.7809110629067245, "grad_norm": 2.5132725680789516, "learning_rate": 7.768817527336701e-06, "logits/chosen": 0.4074794352054596, "logits/rejected": 0.2864871323108673, "logps/chosen": -0.9486532211303711, "logps/rejected": -1.185182809829712, "loss": 0.9103, "odds_ratio_loss": 0.7375662326812744, "rewards/accuracies": 0.5, "rewards/chosen": -0.09486532211303711, "rewards/margins": 0.023652950301766396, "rewards/rejected": -0.11851827800273895, "sft_loss": 0.9486532211303711, "step": 540 }, { "epoch": 0.7823571945047, "grad_norm": 4.473324712516512, "learning_rate": 7.767775803099805e-06, "logits/chosen": 0.30603158473968506, "logits/rejected": 0.27318325638771057, "logps/chosen": -1.0894935131072998, "logps/rejected": -1.8977537155151367, "loss": 0.9864, "odds_ratio_loss": 0.675370454788208, "rewards/accuracies": 0.375, "rewards/chosen": -0.10894934833049774, "rewards/margins": 0.080826036632061, "rewards/rejected": -0.18977537751197815, "sft_loss": 1.0894935131072998, "step": 541 }, { "epoch": 0.7838033261026753, "grad_norm": 2.366314632530353, "learning_rate": 7.766731807256845e-06, "logits/chosen": 0.5116511583328247, "logits/rejected": 0.262421578168869, "logps/chosen": -0.7834107279777527, "logps/rejected": -3.154900550842285, "loss": 0.8295, "odds_ratio_loss": 0.4272821545600891, "rewards/accuracies": 0.75, "rewards/chosen": -0.07834108173847198, "rewards/margins": 0.2371489703655243, "rewards/rejected": -0.31549006700515747, "sft_loss": 0.7834107279777527, "step": 542 }, { "epoch": 0.7852494577006508, "grad_norm": 2.752280571379248, "learning_rate": 7.76568554043725e-06, "logits/chosen": 0.3837011754512787, "logits/rejected": 0.3628317415714264, "logps/chosen": -0.9774251580238342, "logps/rejected": -1.3284695148468018, "loss": 0.8892, "odds_ratio_loss": 0.6802763938903809, "rewards/accuracies": 0.625, "rewards/chosen": -0.09774252027273178, "rewards/margins": 0.03510444238781929, "rewards/rejected": -0.13284695148468018, "sft_loss": 0.9774251580238342, "step": 543 }, { "epoch": 0.7866955892986262, "grad_norm": 3.694066400970215, "learning_rate": 7.764637003271819e-06, "logits/chosen": 0.42650485038757324, "logits/rejected": 0.2961081564426422, "logps/chosen": -0.9184818863868713, "logps/rejected": -2.8604061603546143, "loss": 0.8954, "odds_ratio_loss": 0.5541467666625977, "rewards/accuracies": 0.5625, "rewards/chosen": -0.09184818714857101, "rewards/margins": 0.19419240951538086, "rewards/rejected": -0.28604060411453247, "sft_loss": 0.9184818863868713, "step": 544 }, { "epoch": 0.7881417208966016, "grad_norm": 3.5112933617172732, "learning_rate": 7.763586196392715e-06, "logits/chosen": 0.33292311429977417, "logits/rejected": 0.3208504915237427, "logps/chosen": -0.9027593731880188, "logps/rejected": -2.139615774154663, "loss": 0.8603, "odds_ratio_loss": 0.4521198570728302, "rewards/accuracies": 0.8125, "rewards/chosen": -0.09027593582868576, "rewards/margins": 0.12368564307689667, "rewards/rejected": -0.21396157145500183, "sft_loss": 0.9027593731880188, "step": 545 }, { "epoch": 0.789587852494577, "grad_norm": 2.9201291120405575, "learning_rate": 7.762533120433478e-06, "logits/chosen": 0.4251149594783783, "logits/rejected": 0.2440737783908844, "logps/chosen": -0.7995153665542603, "logps/rejected": -2.243067979812622, "loss": 0.8136, "odds_ratio_loss": 0.564355731010437, "rewards/accuracies": 0.625, "rewards/chosen": -0.07995154708623886, "rewards/margins": 0.14435526728630066, "rewards/rejected": -0.22430679202079773, "sft_loss": 0.7995153665542603, "step": 546 }, { "epoch": 0.7910339840925524, "grad_norm": 2.6748457009798443, "learning_rate": 7.761477776029008e-06, "logits/chosen": 0.4318455457687378, "logits/rejected": 0.2599557042121887, "logps/chosen": -0.7891461253166199, "logps/rejected": -2.2876057624816895, "loss": 0.8256, "odds_ratio_loss": 0.4860532283782959, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07891461253166199, "rewards/margins": 0.14984598755836487, "rewards/rejected": -0.22876060009002686, "sft_loss": 0.7891461253166199, "step": 547 }, { "epoch": 0.7924801156905278, "grad_norm": 5.4769117124092235, "learning_rate": 7.76042016381558e-06, "logits/chosen": 0.5130550265312195, "logits/rejected": 0.34844133257865906, "logps/chosen": -0.6933074593544006, "logps/rejected": -2.7397730350494385, "loss": 0.8176, "odds_ratio_loss": 0.48113542795181274, "rewards/accuracies": 0.75, "rewards/chosen": -0.06933074444532394, "rewards/margins": 0.20464655756950378, "rewards/rejected": -0.2739773094654083, "sft_loss": 0.6933074593544006, "step": 548 }, { "epoch": 0.7939262472885033, "grad_norm": 3.0348990843968546, "learning_rate": 7.759360284430827e-06, "logits/chosen": 0.4405141770839691, "logits/rejected": 0.265964537858963, "logps/chosen": -0.8159855008125305, "logps/rejected": -1.70670485496521, "loss": 0.8676, "odds_ratio_loss": 0.5362525582313538, "rewards/accuracies": 0.8125, "rewards/chosen": -0.08159855753183365, "rewards/margins": 0.08907192945480347, "rewards/rejected": -0.1706704944372177, "sft_loss": 0.8159855008125305, "step": 549 }, { "epoch": 0.7953723788864787, "grad_norm": 4.29621985900849, "learning_rate": 7.75829813851376e-06, "logits/chosen": 0.352047860622406, "logits/rejected": 0.25868427753448486, "logps/chosen": -0.719826877117157, "logps/rejected": -2.092642307281494, "loss": 0.7661, "odds_ratio_loss": 0.5927515625953674, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07198269665241241, "rewards/margins": 0.13728153705596924, "rewards/rejected": -0.20926421880722046, "sft_loss": 0.719826877117157, "step": 550 }, { "epoch": 0.7968185104844541, "grad_norm": 3.4005018103671176, "learning_rate": 7.757233726704747e-06, "logits/chosen": 0.24000917375087738, "logits/rejected": 0.2559305429458618, "logps/chosen": -0.9485635161399841, "logps/rejected": -2.154191493988037, "loss": 0.9052, "odds_ratio_loss": 0.5316140055656433, "rewards/accuracies": 0.75, "rewards/chosen": -0.09485635161399841, "rewards/margins": 0.12056280672550201, "rewards/rejected": -0.21541914343833923, "sft_loss": 0.9485635161399841, "step": 551 }, { "epoch": 0.7982646420824295, "grad_norm": 3.0623594326374404, "learning_rate": 7.756167049645526e-06, "logits/chosen": 0.4697762131690979, "logits/rejected": 0.27899396419525146, "logps/chosen": -0.8783195614814758, "logps/rejected": -2.365593910217285, "loss": 0.8236, "odds_ratio_loss": 0.6793208122253418, "rewards/accuracies": 0.5, "rewards/chosen": -0.08783195912837982, "rewards/margins": 0.1487274318933487, "rewards/rejected": -0.2365594208240509, "sft_loss": 0.8783195614814758, "step": 552 }, { "epoch": 0.7997107736804049, "grad_norm": 2.9804375568080834, "learning_rate": 7.755098107979202e-06, "logits/chosen": 0.4210900664329529, "logits/rejected": 0.3507734537124634, "logps/chosen": -0.7799772620201111, "logps/rejected": -2.2229514122009277, "loss": 0.8528, "odds_ratio_loss": 0.5192586779594421, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07799772918224335, "rewards/margins": 0.14429740607738495, "rewards/rejected": -0.2222951352596283, "sft_loss": 0.7799772620201111, "step": 553 }, { "epoch": 0.8011569052783803, "grad_norm": 4.236419244741945, "learning_rate": 7.754026902350242e-06, "logits/chosen": 0.3866078853607178, "logits/rejected": 0.3358733654022217, "logps/chosen": -0.9486645460128784, "logps/rejected": -1.8577195405960083, "loss": 0.9705, "odds_ratio_loss": 0.6812944412231445, "rewards/accuracies": 0.625, "rewards/chosen": -0.09486645460128784, "rewards/margins": 0.09090550243854523, "rewards/rejected": -0.18577195703983307, "sft_loss": 0.9486645460128784, "step": 554 }, { "epoch": 0.8026030368763557, "grad_norm": 5.48208070685878, "learning_rate": 7.752953433404482e-06, "logits/chosen": 0.749567449092865, "logits/rejected": 0.7038176655769348, "logps/chosen": -0.8758996725082397, "logps/rejected": -1.5970778465270996, "loss": 0.7799, "odds_ratio_loss": 0.5583348274230957, "rewards/accuracies": 0.625, "rewards/chosen": -0.08758997172117233, "rewards/margins": 0.07211781293153763, "rewards/rejected": -0.15970778465270996, "sft_loss": 0.8758996725082397, "step": 555 }, { "epoch": 0.8040491684743312, "grad_norm": 3.2105285366510823, "learning_rate": 7.75187770178912e-06, "logits/chosen": 0.5437260866165161, "logits/rejected": 0.4299187660217285, "logps/chosen": -0.784862756729126, "logps/rejected": -1.8833541870117188, "loss": 0.8746, "odds_ratio_loss": 0.4752628207206726, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07848627865314484, "rewards/margins": 0.10984915494918823, "rewards/rejected": -0.18833541870117188, "sft_loss": 0.784862756729126, "step": 556 }, { "epoch": 0.8054953000723066, "grad_norm": 3.2621428288627565, "learning_rate": 7.750799708152716e-06, "logits/chosen": 0.4966755211353302, "logits/rejected": 0.35313740372657776, "logps/chosen": -0.8801267147064209, "logps/rejected": -2.198486566543579, "loss": 0.9417, "odds_ratio_loss": 0.601882815361023, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08801267296075821, "rewards/margins": 0.13183598220348358, "rewards/rejected": -0.2198486477136612, "sft_loss": 0.8801267147064209, "step": 557 }, { "epoch": 0.806941431670282, "grad_norm": 2.756186174552345, "learning_rate": 7.749719453145202e-06, "logits/chosen": 0.2815437316894531, "logits/rejected": 0.2515462636947632, "logps/chosen": -0.8514240980148315, "logps/rejected": -2.015674352645874, "loss": 0.8876, "odds_ratio_loss": 0.6282274723052979, "rewards/accuracies": 0.625, "rewards/chosen": -0.08514241874217987, "rewards/margins": 0.11642500758171082, "rewards/rejected": -0.2015674114227295, "sft_loss": 0.8514240980148315, "step": 558 }, { "epoch": 0.8083875632682574, "grad_norm": 3.421431652763658, "learning_rate": 7.748636937417862e-06, "logits/chosen": 0.4633673429489136, "logits/rejected": 0.3824692964553833, "logps/chosen": -0.6419711112976074, "logps/rejected": -2.9679622650146484, "loss": 0.7585, "odds_ratio_loss": 0.4057493805885315, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0641971081495285, "rewards/margins": 0.23259912431240082, "rewards/rejected": -0.2967962324619293, "sft_loss": 0.6419711112976074, "step": 559 }, { "epoch": 0.8098336948662328, "grad_norm": 53.9455390293421, "learning_rate": 7.747552161623352e-06, "logits/chosen": 0.4784644544124603, "logits/rejected": 0.3171621263027191, "logps/chosen": -1.4930461645126343, "logps/rejected": -3.1993000507354736, "loss": 1.1239, "odds_ratio_loss": 0.9343154430389404, "rewards/accuracies": 0.6875, "rewards/chosen": -0.14930462837219238, "rewards/margins": 0.17062537372112274, "rewards/rejected": -0.3199300169944763, "sft_loss": 1.4930461645126343, "step": 560 }, { "epoch": 0.8112798264642083, "grad_norm": 5.943313559869521, "learning_rate": 7.746465126415685e-06, "logits/chosen": 0.43953937292099, "logits/rejected": 0.32243290543556213, "logps/chosen": -0.667884111404419, "logps/rejected": -2.532602071762085, "loss": 0.8931, "odds_ratio_loss": 0.32886582612991333, "rewards/accuracies": 0.9375, "rewards/chosen": -0.06678842008113861, "rewards/margins": 0.18647179007530212, "rewards/rejected": -0.25326019525527954, "sft_loss": 0.667884111404419, "step": 561 }, { "epoch": 0.8127259580621836, "grad_norm": 3.2132336268726296, "learning_rate": 7.74537583245024e-06, "logits/chosen": 0.5227065086364746, "logits/rejected": 0.35094642639160156, "logps/chosen": -0.7416081428527832, "logps/rejected": -2.211590528488159, "loss": 0.9376, "odds_ratio_loss": 0.43778154253959656, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07416081428527832, "rewards/margins": 0.14699825644493103, "rewards/rejected": -0.22115907073020935, "sft_loss": 0.7416081428527832, "step": 562 }, { "epoch": 0.8141720896601591, "grad_norm": 3.359980109883608, "learning_rate": 7.744284280383758e-06, "logits/chosen": 0.5760366916656494, "logits/rejected": 0.35373595356941223, "logps/chosen": -0.7241435050964355, "logps/rejected": -2.0765225887298584, "loss": 0.9294, "odds_ratio_loss": 0.28304675221443176, "rewards/accuracies": 0.875, "rewards/chosen": -0.0724143534898758, "rewards/margins": 0.1352379024028778, "rewards/rejected": -0.2076522558927536, "sft_loss": 0.7241435050964355, "step": 563 }, { "epoch": 0.8156182212581344, "grad_norm": 3.028393325761886, "learning_rate": 7.743190470874336e-06, "logits/chosen": 0.5186077356338501, "logits/rejected": 0.44535890221595764, "logps/chosen": -0.9656530618667603, "logps/rejected": -1.3157947063446045, "loss": 0.8103, "odds_ratio_loss": 0.697954535484314, "rewards/accuracies": 0.5625, "rewards/chosen": -0.09656532108783722, "rewards/margins": 0.035014159977436066, "rewards/rejected": -0.1315794736146927, "sft_loss": 0.9656530618667603, "step": 564 }, { "epoch": 0.8170643528561099, "grad_norm": 2.5015055682103386, "learning_rate": 7.74209440458144e-06, "logits/chosen": 0.46131306886672974, "logits/rejected": 0.43117648363113403, "logps/chosen": -0.8131046891212463, "logps/rejected": -0.8619694709777832, "loss": 0.8913, "odds_ratio_loss": 0.6825778484344482, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0813104659318924, "rewards/margins": 0.004886474460363388, "rewards/rejected": -0.08619694411754608, "sft_loss": 0.8131046891212463, "step": 565 }, { "epoch": 0.8185104844540854, "grad_norm": 2.552811543812281, "learning_rate": 7.740996082165889e-06, "logits/chosen": 0.39947623014450073, "logits/rejected": 0.3178212642669678, "logps/chosen": -0.8110004663467407, "logps/rejected": -1.336201548576355, "loss": 0.9081, "odds_ratio_loss": 0.7006158232688904, "rewards/accuracies": 0.5, "rewards/chosen": -0.08110004663467407, "rewards/margins": 0.052520107477903366, "rewards/rejected": -0.13362015783786774, "sft_loss": 0.8110004663467407, "step": 566 }, { "epoch": 0.8199566160520607, "grad_norm": 2.337977459115273, "learning_rate": 7.739895504289867e-06, "logits/chosen": 0.3927333354949951, "logits/rejected": 0.4031215310096741, "logps/chosen": -0.6630347371101379, "logps/rejected": -1.4980498552322388, "loss": 0.8835, "odds_ratio_loss": 0.5098182559013367, "rewards/accuracies": 0.625, "rewards/chosen": -0.06630347669124603, "rewards/margins": 0.08350151777267456, "rewards/rejected": -0.1498049944639206, "sft_loss": 0.6630347371101379, "step": 567 }, { "epoch": 0.8214027476500362, "grad_norm": 3.035229863767425, "learning_rate": 7.738792671616918e-06, "logits/chosen": 0.33209335803985596, "logits/rejected": 0.2999739944934845, "logps/chosen": -0.860519528388977, "logps/rejected": -1.6074646711349487, "loss": 0.8805, "odds_ratio_loss": 0.6031328439712524, "rewards/accuracies": 0.625, "rewards/chosen": -0.08605195581912994, "rewards/margins": 0.07469449192285538, "rewards/rejected": -0.16074645519256592, "sft_loss": 0.860519528388977, "step": 568 }, { "epoch": 0.8228488792480115, "grad_norm": 6.249110734978357, "learning_rate": 7.737687584811942e-06, "logits/chosen": 0.3918280601501465, "logits/rejected": 0.48867300152778625, "logps/chosen": -0.9784340262413025, "logps/rejected": -1.133239984512329, "loss": 0.8658, "odds_ratio_loss": 0.8345239758491516, "rewards/accuracies": 0.625, "rewards/chosen": -0.09784340858459473, "rewards/margins": 0.015480585396289825, "rewards/rejected": -0.11332399398088455, "sft_loss": 0.9784340262413025, "step": 569 }, { "epoch": 0.824295010845987, "grad_norm": 2.959486238121644, "learning_rate": 7.7365802445412e-06, "logits/chosen": 0.4703490734100342, "logits/rejected": 0.3433605134487152, "logps/chosen": -0.683617115020752, "logps/rejected": -1.978018879890442, "loss": 0.7636, "odds_ratio_loss": 0.4209952652454376, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06836170703172684, "rewards/margins": 0.12944017350673676, "rewards/rejected": -0.1978018879890442, "sft_loss": 0.683617115020752, "step": 570 }, { "epoch": 0.8257411424439624, "grad_norm": 4.554090104991005, "learning_rate": 7.735470651472312e-06, "logits/chosen": 0.5703073740005493, "logits/rejected": 0.4268415570259094, "logps/chosen": -0.6800241470336914, "logps/rejected": -1.6585583686828613, "loss": 0.7895, "odds_ratio_loss": 0.45704489946365356, "rewards/accuracies": 0.75, "rewards/chosen": -0.06800241768360138, "rewards/margins": 0.09785342216491699, "rewards/rejected": -0.16585583984851837, "sft_loss": 0.6800241470336914, "step": 571 }, { "epoch": 0.8271872740419378, "grad_norm": 2.822960883716033, "learning_rate": 7.734358806274256e-06, "logits/chosen": 0.4933173656463623, "logits/rejected": 0.3999761939048767, "logps/chosen": -0.7931399345397949, "logps/rejected": -1.2513625621795654, "loss": 0.7925, "odds_ratio_loss": 0.5663318634033203, "rewards/accuracies": 0.625, "rewards/chosen": -0.07931399345397949, "rewards/margins": 0.04582225903868675, "rewards/rejected": -0.12513625621795654, "sft_loss": 0.7931399345397949, "step": 572 }, { "epoch": 0.8286334056399133, "grad_norm": 2.648985570783827, "learning_rate": 7.733244709617369e-06, "logits/chosen": 0.46373143792152405, "logits/rejected": 0.29053449630737305, "logps/chosen": -0.7167465090751648, "logps/rejected": -1.855074405670166, "loss": 0.8, "odds_ratio_loss": 0.3622341454029083, "rewards/accuracies": 0.875, "rewards/chosen": -0.0716746523976326, "rewards/margins": 0.11383280158042908, "rewards/rejected": -0.18550744652748108, "sft_loss": 0.7167465090751648, "step": 573 }, { "epoch": 0.8300795372378886, "grad_norm": 3.7137584494805145, "learning_rate": 7.73212836217334e-06, "logits/chosen": 0.4620693624019623, "logits/rejected": 0.4092079997062683, "logps/chosen": -1.0050597190856934, "logps/rejected": -1.536287784576416, "loss": 0.8899, "odds_ratio_loss": 0.47553932666778564, "rewards/accuracies": 0.8125, "rewards/chosen": -0.10050597786903381, "rewards/margins": 0.05312279611825943, "rewards/rejected": -0.15362878143787384, "sft_loss": 1.0050597190856934, "step": 574 }, { "epoch": 0.8315256688358641, "grad_norm": 2.8877404863277096, "learning_rate": 7.731009764615223e-06, "logits/chosen": 0.4316241443157196, "logits/rejected": 0.4264254570007324, "logps/chosen": -0.805022120475769, "logps/rejected": -1.3257907629013062, "loss": 0.8333, "odds_ratio_loss": 0.5716394186019897, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0805022120475769, "rewards/margins": 0.052076857537031174, "rewards/rejected": -0.13257908821105957, "sft_loss": 0.805022120475769, "step": 575 }, { "epoch": 0.8329718004338394, "grad_norm": 6.1727650674380765, "learning_rate": 7.729888917617423e-06, "logits/chosen": 0.3446384370326996, "logits/rejected": 0.3820667564868927, "logps/chosen": -0.7798358201980591, "logps/rejected": -1.7499778270721436, "loss": 0.808, "odds_ratio_loss": 0.4026681184768677, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07798358052968979, "rewards/margins": 0.09701421111822128, "rewards/rejected": -0.17499780654907227, "sft_loss": 0.7798358201980591, "step": 576 }, { "epoch": 0.8344179320318149, "grad_norm": 2.625297803365516, "learning_rate": 7.728765821855703e-06, "logits/chosen": 0.34016144275665283, "logits/rejected": 0.2931269407272339, "logps/chosen": -1.023898959159851, "logps/rejected": -2.1351914405822754, "loss": 0.9166, "odds_ratio_loss": 0.6438050270080566, "rewards/accuracies": 0.5, "rewards/chosen": -0.10238990187644958, "rewards/margins": 0.11112925410270691, "rewards/rejected": -0.2135191559791565, "sft_loss": 1.023898959159851, "step": 577 }, { "epoch": 0.8358640636297903, "grad_norm": 2.5444970560001487, "learning_rate": 7.72764047800718e-06, "logits/chosen": 0.49933484196662903, "logits/rejected": 0.42723822593688965, "logps/chosen": -0.6545868515968323, "logps/rejected": -1.498138666152954, "loss": 0.8401, "odds_ratio_loss": 0.4437343180179596, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06545868515968323, "rewards/margins": 0.0843551903963089, "rewards/rejected": -0.14981386065483093, "sft_loss": 0.6545868515968323, "step": 578 }, { "epoch": 0.8373101952277657, "grad_norm": 2.3585819967991197, "learning_rate": 7.726512886750331e-06, "logits/chosen": 0.34519678354263306, "logits/rejected": 0.19996540248394012, "logps/chosen": -0.9542055726051331, "logps/rejected": -1.4532517194747925, "loss": 0.8652, "odds_ratio_loss": 0.5589162111282349, "rewards/accuracies": 0.5625, "rewards/chosen": -0.09542055428028107, "rewards/margins": 0.049904607236385345, "rewards/rejected": -0.14532515406608582, "sft_loss": 0.9542055726051331, "step": 579 }, { "epoch": 0.8387563268257412, "grad_norm": 5.125090865717826, "learning_rate": 7.725383048764985e-06, "logits/chosen": 0.42127755284309387, "logits/rejected": 0.2979139983654022, "logps/chosen": -0.8492597341537476, "logps/rejected": -1.7331501245498657, "loss": 0.9187, "odds_ratio_loss": 0.45528745651245117, "rewards/accuracies": 0.75, "rewards/chosen": -0.08492596447467804, "rewards/margins": 0.08838904649019241, "rewards/rejected": -0.17331500351428986, "sft_loss": 0.8492597341537476, "step": 580 }, { "epoch": 0.8402024584237165, "grad_norm": 3.7775925775826757, "learning_rate": 7.724250964732322e-06, "logits/chosen": 0.45021694898605347, "logits/rejected": 0.275540828704834, "logps/chosen": -0.9041640162467957, "logps/rejected": -1.424367904663086, "loss": 0.9197, "odds_ratio_loss": 0.5092793107032776, "rewards/accuracies": 0.625, "rewards/chosen": -0.09041640907526016, "rewards/margins": 0.05202038586139679, "rewards/rejected": -0.14243678748607635, "sft_loss": 0.9041640162467957, "step": 581 }, { "epoch": 0.841648590021692, "grad_norm": 2.3382318025798066, "learning_rate": 7.723116635334883e-06, "logits/chosen": 0.414359986782074, "logits/rejected": 0.33379101753234863, "logps/chosen": -0.781288743019104, "logps/rejected": -1.8517032861709595, "loss": 0.7569, "odds_ratio_loss": 0.47253549098968506, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0781288743019104, "rewards/margins": 0.10704146325588226, "rewards/rejected": -0.18517033755779266, "sft_loss": 0.781288743019104, "step": 582 }, { "epoch": 0.8430947216196674, "grad_norm": 3.016134749169285, "learning_rate": 7.721980061256557e-06, "logits/chosen": 0.5401621460914612, "logits/rejected": 0.3053189218044281, "logps/chosen": -0.893081545829773, "logps/rejected": -1.5628845691680908, "loss": 0.8887, "odds_ratio_loss": 0.5231793522834778, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08930815756320953, "rewards/margins": 0.06698029488325119, "rewards/rejected": -0.15628844499588013, "sft_loss": 0.893081545829773, "step": 583 }, { "epoch": 0.8445408532176428, "grad_norm": 3.4336842069827886, "learning_rate": 7.72084124318259e-06, "logits/chosen": 0.487199068069458, "logits/rejected": 0.4739711284637451, "logps/chosen": -0.9182305932044983, "logps/rejected": -1.005215048789978, "loss": 0.8489, "odds_ratio_loss": 0.7025805711746216, "rewards/accuracies": 0.5, "rewards/chosen": -0.09182305634021759, "rewards/margins": 0.008698442950844765, "rewards/rejected": -0.1005215048789978, "sft_loss": 0.9182305932044983, "step": 584 }, { "epoch": 0.8459869848156182, "grad_norm": 3.199872433035413, "learning_rate": 7.719700181799581e-06, "logits/chosen": 0.4485635757446289, "logits/rejected": 0.4251922369003296, "logps/chosen": -0.7368327379226685, "logps/rejected": -1.369962453842163, "loss": 0.808, "odds_ratio_loss": 0.582382082939148, "rewards/accuracies": 0.75, "rewards/chosen": -0.07368327677249908, "rewards/margins": 0.06331297755241394, "rewards/rejected": -0.13699625432491302, "sft_loss": 0.7368327379226685, "step": 585 }, { "epoch": 0.8474331164135936, "grad_norm": 2.7751053686439886, "learning_rate": 7.718556877795479e-06, "logits/chosen": 0.45472198724746704, "logits/rejected": 0.47626519203186035, "logps/chosen": -0.5254935622215271, "logps/rejected": -1.7815715074539185, "loss": 0.7916, "odds_ratio_loss": 0.420468270778656, "rewards/accuracies": 0.75, "rewards/chosen": -0.05254935473203659, "rewards/margins": 0.12560781836509705, "rewards/rejected": -0.17815715074539185, "sft_loss": 0.5254935622215271, "step": 586 }, { "epoch": 0.848879248011569, "grad_norm": 3.23820103346826, "learning_rate": 7.717411331859584e-06, "logits/chosen": 0.3615628182888031, "logits/rejected": 0.36034783720970154, "logps/chosen": -0.8735466003417969, "logps/rejected": -1.287621021270752, "loss": 0.9102, "odds_ratio_loss": 0.5822567939758301, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08735466003417969, "rewards/margins": 0.04140744358301163, "rewards/rejected": -0.12876209616661072, "sft_loss": 0.8735466003417969, "step": 587 }, { "epoch": 0.8503253796095445, "grad_norm": 5.647839940161303, "learning_rate": 7.716263544682553e-06, "logits/chosen": 0.5941022634506226, "logits/rejected": 0.5508843660354614, "logps/chosen": -0.7583089470863342, "logps/rejected": -1.2879139184951782, "loss": 0.8767, "odds_ratio_loss": 0.5073609352111816, "rewards/accuracies": 0.625, "rewards/chosen": -0.07583089917898178, "rewards/margins": 0.05296049639582634, "rewards/rejected": -0.12879139184951782, "sft_loss": 0.7583089470863342, "step": 588 }, { "epoch": 0.8517715112075199, "grad_norm": 3.480425009507815, "learning_rate": 7.715113516956389e-06, "logits/chosen": 0.36370134353637695, "logits/rejected": 0.40123385190963745, "logps/chosen": -0.7449005842208862, "logps/rejected": -1.24686861038208, "loss": 0.8231, "odds_ratio_loss": 0.6178784966468811, "rewards/accuracies": 0.5, "rewards/chosen": -0.07449005544185638, "rewards/margins": 0.050196800380945206, "rewards/rejected": -0.12468685954809189, "sft_loss": 0.7449005842208862, "step": 589 }, { "epoch": 0.8532176428054953, "grad_norm": 2.6798844944883706, "learning_rate": 7.71396124937445e-06, "logits/chosen": 0.2954362630844116, "logits/rejected": 0.346477210521698, "logps/chosen": -0.6796218752861023, "logps/rejected": -1.1093833446502686, "loss": 0.8734, "odds_ratio_loss": 0.5511021018028259, "rewards/accuracies": 0.6875, "rewards/chosen": -0.06796219199895859, "rewards/margins": 0.04297615587711334, "rewards/rejected": -0.11093834042549133, "sft_loss": 0.6796218752861023, "step": 590 }, { "epoch": 0.8546637744034707, "grad_norm": 3.2479796489504165, "learning_rate": 7.71280674263144e-06, "logits/chosen": 0.4030838906764984, "logits/rejected": 0.36736756563186646, "logps/chosen": -0.7332617044448853, "logps/rejected": -1.6842825412750244, "loss": 0.8104, "odds_ratio_loss": 0.40521979331970215, "rewards/accuracies": 0.875, "rewards/chosen": -0.07332617044448853, "rewards/margins": 0.09510207921266556, "rewards/rejected": -0.16842825710773468, "sft_loss": 0.7332617044448853, "step": 591 }, { "epoch": 0.8561099060014461, "grad_norm": 2.6431172294443743, "learning_rate": 7.71164999742342e-06, "logits/chosen": 0.5364017486572266, "logits/rejected": 0.36750340461730957, "logps/chosen": -0.9896173477172852, "logps/rejected": -1.9290273189544678, "loss": 0.8408, "odds_ratio_loss": 0.6732795238494873, "rewards/accuracies": 0.375, "rewards/chosen": -0.0989617332816124, "rewards/margins": 0.09394100308418274, "rewards/rejected": -0.19290274381637573, "sft_loss": 0.9896173477172852, "step": 592 }, { "epoch": 0.8575560375994216, "grad_norm": 2.5251085232268564, "learning_rate": 7.71049101444779e-06, "logits/chosen": 0.4465160071849823, "logits/rejected": 0.4120599627494812, "logps/chosen": -0.5574182868003845, "logps/rejected": -1.7155920267105103, "loss": 0.7582, "odds_ratio_loss": 0.405154824256897, "rewards/accuracies": 0.8125, "rewards/chosen": -0.05574183166027069, "rewards/margins": 0.1158173680305481, "rewards/rejected": -0.17155921459197998, "sft_loss": 0.5574182868003845, "step": 593 }, { "epoch": 0.8590021691973969, "grad_norm": 3.346546938807114, "learning_rate": 7.70932979440331e-06, "logits/chosen": 0.33852913975715637, "logits/rejected": 0.27749061584472656, "logps/chosen": -0.8209311962127686, "logps/rejected": -1.596746802330017, "loss": 0.887, "odds_ratio_loss": 0.4618699848651886, "rewards/accuracies": 0.8125, "rewards/chosen": -0.08209311217069626, "rewards/margins": 0.07758156955242157, "rewards/rejected": -0.15967468917369843, "sft_loss": 0.8209311962127686, "step": 594 }, { "epoch": 0.8604483007953724, "grad_norm": 5.284660640530955, "learning_rate": 7.708166337990082e-06, "logits/chosen": 0.5029542446136475, "logits/rejected": 0.47931772470474243, "logps/chosen": -0.8861660957336426, "logps/rejected": -1.6336941719055176, "loss": 0.8685, "odds_ratio_loss": 0.6479578018188477, "rewards/accuracies": 0.625, "rewards/chosen": -0.08861660957336426, "rewards/margins": 0.0747528076171875, "rewards/rejected": -0.16336941719055176, "sft_loss": 0.8861660957336426, "step": 595 }, { "epoch": 0.8618944323933478, "grad_norm": 2.6787514133901102, "learning_rate": 7.707000645909557e-06, "logits/chosen": 0.38561227917671204, "logits/rejected": 0.34946468472480774, "logps/chosen": -0.8105971813201904, "logps/rejected": -1.6445955038070679, "loss": 0.7593, "odds_ratio_loss": 0.625914454460144, "rewards/accuracies": 0.5, "rewards/chosen": -0.08105972409248352, "rewards/margins": 0.08339983969926834, "rewards/rejected": -0.16445955634117126, "sft_loss": 0.8105971813201904, "step": 596 }, { "epoch": 0.8633405639913232, "grad_norm": 2.562166564261387, "learning_rate": 7.705832718864537e-06, "logits/chosen": 0.4447406232357025, "logits/rejected": 0.40148672461509705, "logps/chosen": -0.7819239497184753, "logps/rejected": -1.901719093322754, "loss": 0.9083, "odds_ratio_loss": 0.48765087127685547, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07819239795207977, "rewards/margins": 0.11197951436042786, "rewards/rejected": -0.19017191231250763, "sft_loss": 0.7819239497184753, "step": 597 }, { "epoch": 0.8647866955892987, "grad_norm": 3.207797469050618, "learning_rate": 7.704662557559167e-06, "logits/chosen": 0.38521963357925415, "logits/rejected": 0.4216020107269287, "logps/chosen": -0.921389102935791, "logps/rejected": -1.601888656616211, "loss": 0.9603, "odds_ratio_loss": 0.6697303056716919, "rewards/accuracies": 0.5625, "rewards/chosen": -0.09213890880346298, "rewards/margins": 0.06804996728897095, "rewards/rejected": -0.16018888354301453, "sft_loss": 0.921389102935791, "step": 598 }, { "epoch": 0.866232827187274, "grad_norm": 4.035040836751041, "learning_rate": 7.703490162698945e-06, "logits/chosen": 0.45725634694099426, "logits/rejected": 0.3862098455429077, "logps/chosen": -0.7345454692840576, "logps/rejected": -1.4923635721206665, "loss": 0.7959, "odds_ratio_loss": 0.40088510513305664, "rewards/accuracies": 0.875, "rewards/chosen": -0.07345455139875412, "rewards/margins": 0.07578181475400925, "rewards/rejected": -0.14923638105392456, "sft_loss": 0.7345454692840576, "step": 599 }, { "epoch": 0.8676789587852495, "grad_norm": 2.5500836204304873, "learning_rate": 7.70231553499071e-06, "logits/chosen": 0.5151770710945129, "logits/rejected": 0.34262362122535706, "logps/chosen": -0.6712307929992676, "logps/rejected": -1.7133383750915527, "loss": 0.816, "odds_ratio_loss": 0.43570661544799805, "rewards/accuracies": 0.75, "rewards/chosen": -0.06712308526039124, "rewards/margins": 0.104210764169693, "rewards/rejected": -0.17133383452892303, "sft_loss": 0.6712307929992676, "step": 600 }, { "epoch": 0.8691250903832248, "grad_norm": 2.6121252911880544, "learning_rate": 7.701138675142651e-06, "logits/chosen": 0.4404212534427643, "logits/rejected": 0.3398100435733795, "logps/chosen": -0.7245041131973267, "logps/rejected": -2.0399234294891357, "loss": 0.8187, "odds_ratio_loss": 0.4228803515434265, "rewards/accuracies": 0.875, "rewards/chosen": -0.0724504142999649, "rewards/margins": 0.13154193758964539, "rewards/rejected": -0.2039923518896103, "sft_loss": 0.7245041131973267, "step": 601 }, { "epoch": 0.8705712219812003, "grad_norm": 3.895120218207068, "learning_rate": 7.6999595838643e-06, "logits/chosen": 0.3892238438129425, "logits/rejected": 0.3399989902973175, "logps/chosen": -0.8683582544326782, "logps/rejected": -1.7081042528152466, "loss": 0.8405, "odds_ratio_loss": 0.5292615294456482, "rewards/accuracies": 0.625, "rewards/chosen": -0.0868358165025711, "rewards/margins": 0.08397459983825684, "rewards/rejected": -0.17081041634082794, "sft_loss": 0.8683582544326782, "step": 602 }, { "epoch": 0.8720173535791758, "grad_norm": 4.916824856051119, "learning_rate": 7.698778261866536e-06, "logits/chosen": 0.4902600347995758, "logits/rejected": 0.43887144327163696, "logps/chosen": -0.7983700037002563, "logps/rejected": -1.311253547668457, "loss": 0.8184, "odds_ratio_loss": 0.4534938633441925, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07983700931072235, "rewards/margins": 0.05128835141658783, "rewards/rejected": -0.13112536072731018, "sft_loss": 0.7983700037002563, "step": 603 }, { "epoch": 0.8734634851771511, "grad_norm": 2.5621387956986723, "learning_rate": 7.697594709861582e-06, "logits/chosen": 0.5107913017272949, "logits/rejected": 0.45272332429885864, "logps/chosen": -0.8786731958389282, "logps/rejected": -1.683459758758545, "loss": 0.8494, "odds_ratio_loss": 0.521609902381897, "rewards/accuracies": 0.5, "rewards/chosen": -0.08786732703447342, "rewards/margins": 0.08047864586114883, "rewards/rejected": -0.16834597289562225, "sft_loss": 0.8786731958389282, "step": 604 }, { "epoch": 0.8749096167751266, "grad_norm": 4.078257021315965, "learning_rate": 7.696408928563004e-06, "logits/chosen": 0.43462055921554565, "logits/rejected": 0.38017183542251587, "logps/chosen": -0.9040985107421875, "logps/rejected": -1.7478702068328857, "loss": 0.8425, "odds_ratio_loss": 0.543043315410614, "rewards/accuracies": 0.6875, "rewards/chosen": -0.09040984511375427, "rewards/margins": 0.08437717705965042, "rewards/rejected": -0.1747870296239853, "sft_loss": 0.9040985107421875, "step": 605 }, { "epoch": 0.8763557483731019, "grad_norm": 3.110086029010687, "learning_rate": 7.695220918685718e-06, "logits/chosen": 0.40548551082611084, "logits/rejected": 0.33251649141311646, "logps/chosen": -0.7896912097930908, "logps/rejected": -1.9131922721862793, "loss": 0.8279, "odds_ratio_loss": 0.4588484764099121, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07896912097930908, "rewards/margins": 0.11235012114048004, "rewards/rejected": -0.19131924211978912, "sft_loss": 0.7896912097930908, "step": 606 }, { "epoch": 0.8778018799710774, "grad_norm": 3.45857776190995, "learning_rate": 7.694030680945978e-06, "logits/chosen": 0.5536810159683228, "logits/rejected": 0.4505231976509094, "logps/chosen": -0.640487790107727, "logps/rejected": -2.052018165588379, "loss": 0.847, "odds_ratio_loss": 0.43984222412109375, "rewards/accuracies": 0.875, "rewards/chosen": -0.06404877454042435, "rewards/margins": 0.14115305244922638, "rewards/rejected": -0.20520181953907013, "sft_loss": 0.640487790107727, "step": 607 }, { "epoch": 0.8792480115690527, "grad_norm": 3.5262340037551354, "learning_rate": 7.692838216061382e-06, "logits/chosen": 0.3820614218711853, "logits/rejected": 0.3396124839782715, "logps/chosen": -0.8231310248374939, "logps/rejected": -1.9909696578979492, "loss": 0.9157, "odds_ratio_loss": 0.4377727806568146, "rewards/accuracies": 0.8125, "rewards/chosen": -0.08231310546398163, "rewards/margins": 0.11678387224674225, "rewards/rejected": -0.19909697771072388, "sft_loss": 0.8231310248374939, "step": 608 }, { "epoch": 0.8806941431670282, "grad_norm": 2.4054024668451857, "learning_rate": 7.691643524750872e-06, "logits/chosen": 0.6013118028640747, "logits/rejected": 0.4211696982383728, "logps/chosen": -0.6587211489677429, "logps/rejected": -2.157447338104248, "loss": 0.9399, "odds_ratio_loss": 0.398333877325058, "rewards/accuracies": 0.75, "rewards/chosen": -0.06587211787700653, "rewards/margins": 0.14987263083457947, "rewards/rejected": -0.2157447338104248, "sft_loss": 0.6587211489677429, "step": 609 }, { "epoch": 0.8821402747650036, "grad_norm": 5.024109256266422, "learning_rate": 7.690446607734731e-06, "logits/chosen": 0.4865860939025879, "logits/rejected": 0.2488732635974884, "logps/chosen": -0.9007077813148499, "logps/rejected": -2.9339916706085205, "loss": 0.9398, "odds_ratio_loss": 0.4676297605037689, "rewards/accuracies": 0.625, "rewards/chosen": -0.09007078409194946, "rewards/margins": 0.20332840085029602, "rewards/rejected": -0.2933991849422455, "sft_loss": 0.9007077813148499, "step": 610 }, { "epoch": 0.883586406362979, "grad_norm": 2.4390306772314942, "learning_rate": 7.689247465734587e-06, "logits/chosen": 0.4354531168937683, "logits/rejected": 0.34002625942230225, "logps/chosen": -0.7873460054397583, "logps/rejected": -2.1244518756866455, "loss": 0.8282, "odds_ratio_loss": 0.440873384475708, "rewards/accuracies": 0.625, "rewards/chosen": -0.07873459905385971, "rewards/margins": 0.1337105929851532, "rewards/rejected": -0.2124451845884323, "sft_loss": 0.7873460054397583, "step": 611 }, { "epoch": 0.8850325379609545, "grad_norm": 3.041808355646371, "learning_rate": 7.688046099473404e-06, "logits/chosen": 0.36165985465049744, "logits/rejected": 0.34865519404411316, "logps/chosen": -0.7287914752960205, "logps/rejected": -2.1389732360839844, "loss": 0.7459, "odds_ratio_loss": 0.3866339921951294, "rewards/accuracies": 0.75, "rewards/chosen": -0.07287915050983429, "rewards/margins": 0.14101818203926086, "rewards/rejected": -0.21389730274677277, "sft_loss": 0.7287914752960205, "step": 612 }, { "epoch": 0.8864786695589298, "grad_norm": 2.8562172272392146, "learning_rate": 7.686842509675493e-06, "logits/chosen": 0.44806107878685, "logits/rejected": 0.28580302000045776, "logps/chosen": -0.6983122825622559, "logps/rejected": -1.8353773355484009, "loss": 0.7468, "odds_ratio_loss": 0.3887363076210022, "rewards/accuracies": 0.75, "rewards/chosen": -0.0698312297463417, "rewards/margins": 0.11370651423931122, "rewards/rejected": -0.18353773653507233, "sft_loss": 0.6983122825622559, "step": 613 }, { "epoch": 0.8879248011569053, "grad_norm": 3.892747952798006, "learning_rate": 7.6856366970665e-06, "logits/chosen": 0.32401058077812195, "logits/rejected": 0.2524191439151764, "logps/chosen": -0.6660705804824829, "logps/rejected": -2.403459072113037, "loss": 0.7928, "odds_ratio_loss": 0.375204473733902, "rewards/accuracies": 0.6875, "rewards/chosen": -0.06660705804824829, "rewards/margins": 0.17373886704444885, "rewards/rejected": -0.24034592509269714, "sft_loss": 0.6660705804824829, "step": 614 }, { "epoch": 0.8893709327548807, "grad_norm": 3.7158835902808973, "learning_rate": 7.68442866237342e-06, "logits/chosen": 0.3761048913002014, "logits/rejected": 0.2979498505592346, "logps/chosen": -0.8295606374740601, "logps/rejected": -1.7888013124465942, "loss": 0.7842, "odds_ratio_loss": 0.4067229628562927, "rewards/accuracies": 0.875, "rewards/chosen": -0.08295606076717377, "rewards/margins": 0.09592406451702118, "rewards/rejected": -0.17888012528419495, "sft_loss": 0.8295606374740601, "step": 615 }, { "epoch": 0.8908170643528561, "grad_norm": 2.4550407569809396, "learning_rate": 7.683218406324572e-06, "logits/chosen": 0.6669546365737915, "logits/rejected": 0.4045952558517456, "logps/chosen": -0.5585116744041443, "logps/rejected": -2.035814046859741, "loss": 0.7754, "odds_ratio_loss": 0.39348843693733215, "rewards/accuracies": 0.8125, "rewards/chosen": -0.05585116893053055, "rewards/margins": 0.1477302461862564, "rewards/rejected": -0.20358142256736755, "sft_loss": 0.5585116744041443, "step": 616 }, { "epoch": 0.8922631959508315, "grad_norm": 3.407732902695217, "learning_rate": 7.682005929649631e-06, "logits/chosen": 0.39748919010162354, "logits/rejected": 0.37334704399108887, "logps/chosen": -0.7694322466850281, "logps/rejected": -1.4826719760894775, "loss": 0.8091, "odds_ratio_loss": 0.4687405228614807, "rewards/accuracies": 0.625, "rewards/chosen": -0.07694321870803833, "rewards/margins": 0.07132397592067719, "rewards/rejected": -0.1482672095298767, "sft_loss": 0.7694322466850281, "step": 617 }, { "epoch": 0.8937093275488069, "grad_norm": 2.5017227518592233, "learning_rate": 7.680791233079603e-06, "logits/chosen": 0.3456028997898102, "logits/rejected": 0.3027806282043457, "logps/chosen": -0.7319939136505127, "logps/rejected": -1.6122633218765259, "loss": 0.777, "odds_ratio_loss": 0.4658835530281067, "rewards/accuracies": 0.75, "rewards/chosen": -0.07319939136505127, "rewards/margins": 0.08802695572376251, "rewards/rejected": -0.16122636198997498, "sft_loss": 0.7319939136505127, "step": 618 }, { "epoch": 0.8951554591467824, "grad_norm": 2.3166376522393146, "learning_rate": 7.67957431734683e-06, "logits/chosen": 0.3627810478210449, "logits/rejected": 0.2669435441493988, "logps/chosen": -0.9080690145492554, "logps/rejected": -1.8668326139450073, "loss": 0.8564, "odds_ratio_loss": 0.5684492588043213, "rewards/accuracies": 0.5625, "rewards/chosen": -0.09080690145492554, "rewards/margins": 0.09587635844945908, "rewards/rejected": -0.18668325245380402, "sft_loss": 0.9080690145492554, "step": 619 }, { "epoch": 0.8966015907447578, "grad_norm": 3.201147938968964, "learning_rate": 7.678355183184998e-06, "logits/chosen": 0.47244179248809814, "logits/rejected": 0.3506423830986023, "logps/chosen": -0.6035176515579224, "logps/rejected": -2.130152702331543, "loss": 0.8337, "odds_ratio_loss": 0.399160772562027, "rewards/accuracies": 0.75, "rewards/chosen": -0.060351770371198654, "rewards/margins": 0.15266351401805878, "rewards/rejected": -0.21301528811454773, "sft_loss": 0.6035176515579224, "step": 620 }, { "epoch": 0.8980477223427332, "grad_norm": 3.717093420401316, "learning_rate": 7.677133831329126e-06, "logits/chosen": 0.4847029447555542, "logits/rejected": 0.40981170535087585, "logps/chosen": -0.7285267114639282, "logps/rejected": -2.408964157104492, "loss": 0.7495, "odds_ratio_loss": 0.38725215196609497, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07285267114639282, "rewards/margins": 0.16804377734661102, "rewards/rejected": -0.24089643359184265, "sft_loss": 0.7285267114639282, "step": 621 }, { "epoch": 0.8994938539407086, "grad_norm": 13.31403718261638, "learning_rate": 7.675910262515571e-06, "logits/chosen": 0.3689773678779602, "logits/rejected": 0.31208884716033936, "logps/chosen": -0.7826531529426575, "logps/rejected": -1.995023488998413, "loss": 0.8353, "odds_ratio_loss": 0.44215285778045654, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07826531678438187, "rewards/margins": 0.12123703211545944, "rewards/rejected": -0.1995023488998413, "sft_loss": 0.7826531529426575, "step": 622 }, { "epoch": 0.900939985538684, "grad_norm": 2.5398626802834277, "learning_rate": 7.67468447748203e-06, "logits/chosen": 0.42934471368789673, "logits/rejected": 0.21536873281002045, "logps/chosen": -0.7805943489074707, "logps/rejected": -2.9530436992645264, "loss": 0.7891, "odds_ratio_loss": 0.5255002379417419, "rewards/accuracies": 0.625, "rewards/chosen": -0.07805943489074707, "rewards/margins": 0.217244952917099, "rewards/rejected": -0.29530438780784607, "sft_loss": 0.7805943489074707, "step": 623 }, { "epoch": 0.9023861171366594, "grad_norm": 2.70020042469918, "learning_rate": 7.67345647696753e-06, "logits/chosen": 0.5212641954421997, "logits/rejected": 0.42973682284355164, "logps/chosen": -0.6765854954719543, "logps/rejected": -1.6150991916656494, "loss": 0.9148, "odds_ratio_loss": 0.48807549476623535, "rewards/accuracies": 0.6875, "rewards/chosen": -0.06765855103731155, "rewards/margins": 0.09385137259960175, "rewards/rejected": -0.1615099161863327, "sft_loss": 0.6765854954719543, "step": 624 }, { "epoch": 0.9038322487346349, "grad_norm": 4.310558816009849, "learning_rate": 7.67222626171244e-06, "logits/chosen": 0.33585911989212036, "logits/rejected": 0.20145192742347717, "logps/chosen": -0.8163321018218994, "logps/rejected": -3.1550087928771973, "loss": 0.9581, "odds_ratio_loss": 0.38997989892959595, "rewards/accuracies": 0.875, "rewards/chosen": -0.08163321018218994, "rewards/margins": 0.23386766016483307, "rewards/rejected": -0.3155008852481842, "sft_loss": 0.8163321018218994, "step": 625 }, { "epoch": 0.9052783803326103, "grad_norm": 3.448422343794376, "learning_rate": 7.670993832458459e-06, "logits/chosen": 0.42779773473739624, "logits/rejected": 0.36584076285362244, "logps/chosen": -0.6566274762153625, "logps/rejected": -1.3442649841308594, "loss": 0.833, "odds_ratio_loss": 0.4265965521335602, "rewards/accuracies": 0.875, "rewards/chosen": -0.06566274911165237, "rewards/margins": 0.06876374781131744, "rewards/rejected": -0.13442650437355042, "sft_loss": 0.6566274762153625, "step": 626 }, { "epoch": 0.9067245119305857, "grad_norm": 2.9169308040479223, "learning_rate": 7.669759189948624e-06, "logits/chosen": 0.35135525465011597, "logits/rejected": 0.2400461733341217, "logps/chosen": -0.6346577405929565, "logps/rejected": -1.875277042388916, "loss": 0.7533, "odds_ratio_loss": 0.3021318018436432, "rewards/accuracies": 0.9375, "rewards/chosen": -0.06346577405929565, "rewards/margins": 0.1240619346499443, "rewards/rejected": -0.18752773106098175, "sft_loss": 0.6346577405929565, "step": 627 }, { "epoch": 0.9081706435285611, "grad_norm": 3.0982427667659076, "learning_rate": 7.668522334927307e-06, "logits/chosen": 0.30746403336524963, "logits/rejected": 0.20801600813865662, "logps/chosen": -0.8239056468009949, "logps/rejected": -1.7219469547271729, "loss": 0.8766, "odds_ratio_loss": 0.6280619502067566, "rewards/accuracies": 0.5, "rewards/chosen": -0.08239056169986725, "rewards/margins": 0.08980412036180496, "rewards/rejected": -0.1721946895122528, "sft_loss": 0.8239056468009949, "step": 628 }, { "epoch": 0.9096167751265365, "grad_norm": 3.907298148969337, "learning_rate": 7.667283268140211e-06, "logits/chosen": 0.4386427402496338, "logits/rejected": 0.31877654790878296, "logps/chosen": -0.8378483653068542, "logps/rejected": -1.461822509765625, "loss": 0.9106, "odds_ratio_loss": 0.6432319283485413, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08378484100103378, "rewards/margins": 0.06239742785692215, "rewards/rejected": -0.14618226885795593, "sft_loss": 0.8378483653068542, "step": 629 }, { "epoch": 0.911062906724512, "grad_norm": 2.7529705801934763, "learning_rate": 7.666041990334374e-06, "logits/chosen": 0.40260255336761475, "logits/rejected": 0.30121004581451416, "logps/chosen": -0.8997647166252136, "logps/rejected": -1.8492169380187988, "loss": 0.8114, "odds_ratio_loss": 0.4565718173980713, "rewards/accuracies": 0.75, "rewards/chosen": -0.0899764746427536, "rewards/margins": 0.09494520723819733, "rewards/rejected": -0.18492168188095093, "sft_loss": 0.8997647166252136, "step": 630 }, { "epoch": 0.9125090383224873, "grad_norm": 3.2383772151478882, "learning_rate": 7.664798502258167e-06, "logits/chosen": 0.43885284662246704, "logits/rejected": 0.4151266813278198, "logps/chosen": -0.717272162437439, "logps/rejected": -1.9099479913711548, "loss": 0.8819, "odds_ratio_loss": 0.4223041236400604, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0717272162437439, "rewards/margins": 0.11926757544279099, "rewards/rejected": -0.19099479913711548, "sft_loss": 0.717272162437439, "step": 631 }, { "epoch": 0.9139551699204628, "grad_norm": 2.8313172738379153, "learning_rate": 7.663552804661292e-06, "logits/chosen": 0.43072545528411865, "logits/rejected": 0.2662442922592163, "logps/chosen": -0.8863623738288879, "logps/rejected": -2.1999828815460205, "loss": 0.7894, "odds_ratio_loss": 0.40025269985198975, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08863624185323715, "rewards/margins": 0.13136205077171326, "rewards/rejected": -0.21999827027320862, "sft_loss": 0.8863623738288879, "step": 632 }, { "epoch": 0.9154013015184381, "grad_norm": 3.2231547445658433, "learning_rate": 7.662304898294789e-06, "logits/chosen": 0.34558531641960144, "logits/rejected": 0.25568896532058716, "logps/chosen": -0.9652397632598877, "logps/rejected": -2.154374599456787, "loss": 0.8856, "odds_ratio_loss": 0.6041095852851868, "rewards/accuracies": 0.5, "rewards/chosen": -0.09652397781610489, "rewards/margins": 0.11891345679759979, "rewards/rejected": -0.21543745696544647, "sft_loss": 0.9652397632598877, "step": 633 }, { "epoch": 0.9168474331164136, "grad_norm": 2.929910989196169, "learning_rate": 7.661054783911023e-06, "logits/chosen": 0.30516791343688965, "logits/rejected": 0.3138343095779419, "logps/chosen": -0.9684697389602661, "logps/rejected": -2.2495481967926025, "loss": 1.0268, "odds_ratio_loss": 0.6787380576133728, "rewards/accuracies": 0.75, "rewards/chosen": -0.09684698283672333, "rewards/margins": 0.12810784578323364, "rewards/rejected": -0.22495481371879578, "sft_loss": 0.9684697389602661, "step": 634 }, { "epoch": 0.918293564714389, "grad_norm": 3.5685552202873656, "learning_rate": 7.65980246226369e-06, "logits/chosen": 0.5167609453201294, "logits/rejected": 0.534476101398468, "logps/chosen": -0.8154505491256714, "logps/rejected": -1.6477975845336914, "loss": 0.8668, "odds_ratio_loss": 0.6014808416366577, "rewards/accuracies": 0.625, "rewards/chosen": -0.08154505491256714, "rewards/margins": 0.08323470503091812, "rewards/rejected": -0.16477975249290466, "sft_loss": 0.8154505491256714, "step": 635 }, { "epoch": 0.9197396963123644, "grad_norm": 3.791415677312705, "learning_rate": 7.658547934107826e-06, "logits/chosen": 0.38681352138519287, "logits/rejected": 0.3783404529094696, "logps/chosen": -0.852817714214325, "logps/rejected": -1.8300766944885254, "loss": 0.8929, "odds_ratio_loss": 0.5779595375061035, "rewards/accuracies": 0.625, "rewards/chosen": -0.08528177440166473, "rewards/margins": 0.09772589802742004, "rewards/rejected": -0.18300765752792358, "sft_loss": 0.852817714214325, "step": 636 }, { "epoch": 0.9211858279103399, "grad_norm": 3.3465232296765994, "learning_rate": 7.657291200199784e-06, "logits/chosen": 0.290289044380188, "logits/rejected": 0.3608376979827881, "logps/chosen": -0.7669796943664551, "logps/rejected": -2.2326064109802246, "loss": 0.8354, "odds_ratio_loss": 0.5988900661468506, "rewards/accuracies": 0.5, "rewards/chosen": -0.07669797539710999, "rewards/margins": 0.14656266570091248, "rewards/rejected": -0.22326062619686127, "sft_loss": 0.7669796943664551, "step": 637 }, { "epoch": 0.9226319595083152, "grad_norm": 4.414553916663672, "learning_rate": 7.656032261297255e-06, "logits/chosen": 0.26446521282196045, "logits/rejected": 0.16627632081508636, "logps/chosen": -0.7560668587684631, "logps/rejected": -2.14742374420166, "loss": 0.8415, "odds_ratio_loss": 0.4151613414287567, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07560668885707855, "rewards/margins": 0.1391356885433197, "rewards/rejected": -0.21474237740039825, "sft_loss": 0.7560668587684631, "step": 638 }, { "epoch": 0.9240780911062907, "grad_norm": 2.585907860504243, "learning_rate": 7.654771118159262e-06, "logits/chosen": 0.3260245621204376, "logits/rejected": 0.29659798741340637, "logps/chosen": -0.7077478766441345, "logps/rejected": -1.617417573928833, "loss": 0.7967, "odds_ratio_loss": 0.41377967596054077, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07077478617429733, "rewards/margins": 0.09096695482730865, "rewards/rejected": -0.16174176335334778, "sft_loss": 0.7077478766441345, "step": 639 }, { "epoch": 0.925524222704266, "grad_norm": 2.6907621885171924, "learning_rate": 7.653507771546148e-06, "logits/chosen": 0.4016415476799011, "logits/rejected": 0.19014433026313782, "logps/chosen": -0.6699740886688232, "logps/rejected": -2.6581130027770996, "loss": 0.8692, "odds_ratio_loss": 0.3492831885814667, "rewards/accuracies": 0.75, "rewards/chosen": -0.06699740886688232, "rewards/margins": 0.19881388545036316, "rewards/rejected": -0.2658112943172455, "sft_loss": 0.6699740886688232, "step": 640 }, { "epoch": 0.9269703543022415, "grad_norm": 4.452046547002235, "learning_rate": 7.652242222219593e-06, "logits/chosen": 0.297119677066803, "logits/rejected": 0.26825442910194397, "logps/chosen": -0.8774327039718628, "logps/rejected": -2.2502782344818115, "loss": 0.9753, "odds_ratio_loss": 0.49745067954063416, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08774326741695404, "rewards/margins": 0.1372845470905304, "rewards/rejected": -0.22502781450748444, "sft_loss": 0.8774327039718628, "step": 641 }, { "epoch": 0.928416485900217, "grad_norm": 3.1251697468215127, "learning_rate": 7.650974470942598e-06, "logits/chosen": 0.4491688013076782, "logits/rejected": 0.379191517829895, "logps/chosen": -0.9273581504821777, "logps/rejected": -1.4209566116333008, "loss": 0.8902, "odds_ratio_loss": 0.6889926195144653, "rewards/accuracies": 0.5625, "rewards/chosen": -0.09273581951856613, "rewards/margins": 0.049359846860170364, "rewards/rejected": -0.1420956701040268, "sft_loss": 0.9273581504821777, "step": 642 }, { "epoch": 0.9298626174981923, "grad_norm": 3.00570853026626, "learning_rate": 7.649704518479497e-06, "logits/chosen": 0.3809622526168823, "logits/rejected": 0.3525671660900116, "logps/chosen": -0.8433955907821655, "logps/rejected": -1.3788678646087646, "loss": 0.8706, "odds_ratio_loss": 0.634644627571106, "rewards/accuracies": 0.75, "rewards/chosen": -0.08433955907821655, "rewards/margins": 0.05354723334312439, "rewards/rejected": -0.13788677752017975, "sft_loss": 0.8433955907821655, "step": 643 }, { "epoch": 0.9313087490961678, "grad_norm": 2.7897702867657324, "learning_rate": 7.648432365595951e-06, "logits/chosen": 0.3524807393550873, "logits/rejected": 0.34861454367637634, "logps/chosen": -0.9019001722335815, "logps/rejected": -1.3493397235870361, "loss": 0.7982, "odds_ratio_loss": 0.5859642624855042, "rewards/accuracies": 0.625, "rewards/chosen": -0.09019001573324203, "rewards/margins": 0.044743962585926056, "rewards/rejected": -0.1349339783191681, "sft_loss": 0.9019001722335815, "step": 644 }, { "epoch": 0.9327548806941431, "grad_norm": 3.4031829380173746, "learning_rate": 7.647158013058943e-06, "logits/chosen": 0.4466613531112671, "logits/rejected": 0.3243465721607208, "logps/chosen": -0.8417915105819702, "logps/rejected": -2.339245319366455, "loss": 0.8632, "odds_ratio_loss": 0.5358899235725403, "rewards/accuracies": 0.5, "rewards/chosen": -0.08417915552854538, "rewards/margins": 0.1497454047203064, "rewards/rejected": -0.23392455279827118, "sft_loss": 0.8417915105819702, "step": 645 }, { "epoch": 0.9342010122921186, "grad_norm": 3.1004064500515955, "learning_rate": 7.645881461636784e-06, "logits/chosen": 0.44667521119117737, "logits/rejected": 0.2823071777820587, "logps/chosen": -0.6927672028541565, "logps/rejected": -2.4781577587127686, "loss": 0.7595, "odds_ratio_loss": 0.380462646484375, "rewards/accuracies": 0.75, "rewards/chosen": -0.06927672028541565, "rewards/margins": 0.17853903770446777, "rewards/rejected": -0.24781575798988342, "sft_loss": 0.6927672028541565, "step": 646 }, { "epoch": 0.935647143890094, "grad_norm": 2.6234045812685682, "learning_rate": 7.644602712099113e-06, "logits/chosen": 0.5283761024475098, "logits/rejected": 0.3219844102859497, "logps/chosen": -0.6988430023193359, "logps/rejected": -1.651008129119873, "loss": 0.7322, "odds_ratio_loss": 0.4112403988838196, "rewards/accuracies": 0.75, "rewards/chosen": -0.0698843002319336, "rewards/margins": 0.09521650522947311, "rewards/rejected": -0.1651008129119873, "sft_loss": 0.6988430023193359, "step": 647 }, { "epoch": 0.9370932754880694, "grad_norm": 5.239045092918419, "learning_rate": 7.643321765216894e-06, "logits/chosen": 0.32309383153915405, "logits/rejected": 0.30599987506866455, "logps/chosen": -0.9356340765953064, "logps/rejected": -1.8596378564834595, "loss": 0.9289, "odds_ratio_loss": 0.6126976013183594, "rewards/accuracies": 0.625, "rewards/chosen": -0.09356341511011124, "rewards/margins": 0.09240037202835083, "rewards/rejected": -0.18596379458904266, "sft_loss": 0.9356340765953064, "step": 648 }, { "epoch": 0.9385394070860448, "grad_norm": 2.4529826709658495, "learning_rate": 7.642038621762414e-06, "logits/chosen": 0.5031927227973938, "logits/rejected": 0.3548673093318939, "logps/chosen": -0.6429674625396729, "logps/rejected": -1.8498015403747559, "loss": 0.8475, "odds_ratio_loss": 0.46681496500968933, "rewards/accuracies": 0.6875, "rewards/chosen": -0.06429674476385117, "rewards/margins": 0.12068342417478561, "rewards/rejected": -0.18498015403747559, "sft_loss": 0.6429674625396729, "step": 649 }, { "epoch": 0.9399855386840202, "grad_norm": 4.412976604978248, "learning_rate": 7.640753282509284e-06, "logits/chosen": 0.2757370173931122, "logits/rejected": 0.2888236343860626, "logps/chosen": -0.78496915102005, "logps/rejected": -1.9232274293899536, "loss": 0.859, "odds_ratio_loss": 0.4362175464630127, "rewards/accuracies": 0.75, "rewards/chosen": -0.07849692553281784, "rewards/margins": 0.11382582783699036, "rewards/rejected": -0.1923227608203888, "sft_loss": 0.78496915102005, "step": 650 }, { "epoch": 0.9414316702819957, "grad_norm": 3.1829134737847937, "learning_rate": 7.639465748232439e-06, "logits/chosen": 0.3406945466995239, "logits/rejected": 0.34385547041893005, "logps/chosen": -0.8253358602523804, "logps/rejected": -1.9728519916534424, "loss": 0.8718, "odds_ratio_loss": 0.5355536937713623, "rewards/accuracies": 0.625, "rewards/chosen": -0.0825335830450058, "rewards/margins": 0.11475162208080292, "rewards/rejected": -0.1972852200269699, "sft_loss": 0.8253358602523804, "step": 651 }, { "epoch": 0.9428778018799711, "grad_norm": 4.11945466113238, "learning_rate": 7.638176019708141e-06, "logits/chosen": 0.45153433084487915, "logits/rejected": 0.3097097873687744, "logps/chosen": -0.7182214260101318, "logps/rejected": -2.8085572719573975, "loss": 0.8844, "odds_ratio_loss": 0.3755723834037781, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0718221515417099, "rewards/margins": 0.20903359353542328, "rewards/rejected": -0.2808557152748108, "sft_loss": 0.7182214260101318, "step": 652 }, { "epoch": 0.9443239334779465, "grad_norm": 4.169907813750463, "learning_rate": 7.63688409771397e-06, "logits/chosen": 0.3641658425331116, "logits/rejected": 0.16773995757102966, "logps/chosen": -0.7689434885978699, "logps/rejected": -2.3531601428985596, "loss": 0.9164, "odds_ratio_loss": 0.4541126489639282, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0768943503499031, "rewards/margins": 0.15842165052890778, "rewards/rejected": -0.23531600832939148, "sft_loss": 0.7689434885978699, "step": 653 }, { "epoch": 0.9457700650759219, "grad_norm": 2.8738626095421647, "learning_rate": 7.635589983028832e-06, "logits/chosen": 0.37339627742767334, "logits/rejected": 0.29968613386154175, "logps/chosen": -0.7189725637435913, "logps/rejected": -2.247807025909424, "loss": 0.8222, "odds_ratio_loss": 0.42104122042655945, "rewards/accuracies": 0.9375, "rewards/chosen": -0.07189725339412689, "rewards/margins": 0.15288342535495758, "rewards/rejected": -0.22478069365024567, "sft_loss": 0.7189725637435913, "step": 654 }, { "epoch": 0.9472161966738973, "grad_norm": 2.3715953476245004, "learning_rate": 7.634293676432953e-06, "logits/chosen": 0.44171804189682007, "logits/rejected": 0.37442782521247864, "logps/chosen": -0.6790784597396851, "logps/rejected": -1.6948163509368896, "loss": 0.8714, "odds_ratio_loss": 0.40542173385620117, "rewards/accuracies": 0.6875, "rewards/chosen": -0.06790784746408463, "rewards/margins": 0.10157379508018494, "rewards/rejected": -0.16948163509368896, "sft_loss": 0.6790784597396851, "step": 655 }, { "epoch": 0.9486623282718727, "grad_norm": 2.8909595515998308, "learning_rate": 7.63299517870788e-06, "logits/chosen": 0.27957409620285034, "logits/rejected": 0.2359994798898697, "logps/chosen": -0.8018577098846436, "logps/rejected": -1.7241246700286865, "loss": 0.8156, "odds_ratio_loss": 0.5142108201980591, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08018577098846436, "rewards/margins": 0.09222669899463654, "rewards/rejected": -0.1724124699831009, "sft_loss": 0.8018577098846436, "step": 656 }, { "epoch": 0.9501084598698482, "grad_norm": 3.057708923025605, "learning_rate": 7.631694490636483e-06, "logits/chosen": 0.5239850282669067, "logits/rejected": 0.3646376132965088, "logps/chosen": -0.6075431108474731, "logps/rejected": -3.623016357421875, "loss": 0.839, "odds_ratio_loss": 0.41288918256759644, "rewards/accuracies": 0.6875, "rewards/chosen": -0.06075431406497955, "rewards/margins": 0.3015473484992981, "rewards/rejected": -0.36230167746543884, "sft_loss": 0.6075431108474731, "step": 657 }, { "epoch": 0.9515545914678236, "grad_norm": 3.5053796147588305, "learning_rate": 7.630391613002953e-06, "logits/chosen": 0.4126778244972229, "logits/rejected": 0.3100075125694275, "logps/chosen": -0.7381374835968018, "logps/rejected": -3.2679390907287598, "loss": 0.8151, "odds_ratio_loss": 0.44963547587394714, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07381375133991241, "rewards/margins": 0.25298017263412476, "rewards/rejected": -0.326793909072876, "sft_loss": 0.7381374835968018, "step": 658 }, { "epoch": 0.953000723065799, "grad_norm": 3.229921172064812, "learning_rate": 7.629086546592797e-06, "logits/chosen": 0.39081621170043945, "logits/rejected": 0.26037153601646423, "logps/chosen": -0.8727468252182007, "logps/rejected": -1.5484097003936768, "loss": 0.7941, "odds_ratio_loss": 0.5154801607131958, "rewards/accuracies": 0.75, "rewards/chosen": -0.08727468550205231, "rewards/margins": 0.06756629049777985, "rewards/rejected": -0.15484097599983215, "sft_loss": 0.8727468252182007, "step": 659 }, { "epoch": 0.9544468546637744, "grad_norm": 2.933170422294456, "learning_rate": 7.6277792921928464e-06, "logits/chosen": 0.2233104109764099, "logits/rejected": 0.35420551896095276, "logps/chosen": -0.8203558921813965, "logps/rejected": -1.4535131454467773, "loss": 0.8687, "odds_ratio_loss": 0.6188152432441711, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08203558623790741, "rewards/margins": 0.0633157268166542, "rewards/rejected": -0.1453513205051422, "sft_loss": 0.8203558921813965, "step": 660 }, { "epoch": 0.9558929862617498, "grad_norm": 2.5813611799218976, "learning_rate": 7.6264698505912504e-06, "logits/chosen": 0.24971553683280945, "logits/rejected": 0.24186164140701294, "logps/chosen": -0.7713397741317749, "logps/rejected": -1.9475972652435303, "loss": 0.7881, "odds_ratio_loss": 0.46839070320129395, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07713396847248077, "rewards/margins": 0.11762575060129166, "rewards/rejected": -0.19475972652435303, "sft_loss": 0.7713397741317749, "step": 661 }, { "epoch": 0.9573391178597253, "grad_norm": 3.2723557761645616, "learning_rate": 7.625158222577474e-06, "logits/chosen": 0.3482891023159027, "logits/rejected": 0.2935275435447693, "logps/chosen": -0.7917287349700928, "logps/rejected": -1.6271872520446777, "loss": 0.8783, "odds_ratio_loss": 0.6316386461257935, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07917286455631256, "rewards/margins": 0.08354586362838745, "rewards/rejected": -0.1627187281847, "sft_loss": 0.7917287349700928, "step": 662 }, { "epoch": 0.9587852494577006, "grad_norm": 3.1102507204547845, "learning_rate": 7.623844408942304e-06, "logits/chosen": 0.377737820148468, "logits/rejected": 0.34229159355163574, "logps/chosen": -0.7180280685424805, "logps/rejected": -1.8476381301879883, "loss": 0.8331, "odds_ratio_loss": 0.4935680627822876, "rewards/accuracies": 0.75, "rewards/chosen": -0.07180280983448029, "rewards/margins": 0.11296100914478302, "rewards/rejected": -0.1847638040781021, "sft_loss": 0.7180280685424805, "step": 663 }, { "epoch": 0.9602313810556761, "grad_norm": 3.151008524024375, "learning_rate": 7.622528410477842e-06, "logits/chosen": 0.3907800018787384, "logits/rejected": 0.3244548738002777, "logps/chosen": -0.7783466577529907, "logps/rejected": -1.9935373067855835, "loss": 0.8665, "odds_ratio_loss": 0.5557862520217896, "rewards/accuracies": 0.75, "rewards/chosen": -0.07783466577529907, "rewards/margins": 0.1215190589427948, "rewards/rejected": -0.19935372471809387, "sft_loss": 0.7783466577529907, "step": 664 }, { "epoch": 0.9616775126536515, "grad_norm": 4.387593299955748, "learning_rate": 7.6212102279775115e-06, "logits/chosen": 0.3593447208404541, "logits/rejected": 0.3258175849914551, "logps/chosen": -0.7193015813827515, "logps/rejected": -1.7327250242233276, "loss": 0.8301, "odds_ratio_loss": 0.46600061655044556, "rewards/accuracies": 0.75, "rewards/chosen": -0.0719301626086235, "rewards/margins": 0.1013423502445221, "rewards/rejected": -0.1732725203037262, "sft_loss": 0.7193015813827515, "step": 665 }, { "epoch": 0.9631236442516269, "grad_norm": 2.390706539517393, "learning_rate": 7.6198898622360464e-06, "logits/chosen": 0.34965071082115173, "logits/rejected": 0.31699496507644653, "logps/chosen": -0.9958378672599792, "logps/rejected": -1.3755648136138916, "loss": 0.9733, "odds_ratio_loss": 0.7043642997741699, "rewards/accuracies": 0.4375, "rewards/chosen": -0.09958378970623016, "rewards/margins": 0.03797269985079765, "rewards/rejected": -0.13755649328231812, "sft_loss": 0.9958378672599792, "step": 666 }, { "epoch": 0.9645697758496024, "grad_norm": 3.905469796567993, "learning_rate": 7.6185673140495015e-06, "logits/chosen": 0.3465041518211365, "logits/rejected": 0.31788861751556396, "logps/chosen": -0.8372557163238525, "logps/rejected": -1.2869638204574585, "loss": 0.8411, "odds_ratio_loss": 0.5828957557678223, "rewards/accuracies": 0.625, "rewards/chosen": -0.08372557163238525, "rewards/margins": 0.04497080296278, "rewards/rejected": -0.12869638204574585, "sft_loss": 0.8372557163238525, "step": 667 }, { "epoch": 0.9660159074475777, "grad_norm": 2.875529991953273, "learning_rate": 7.617242584215246e-06, "logits/chosen": 0.4149461090564728, "logits/rejected": 0.2595219612121582, "logps/chosen": -0.6817153692245483, "logps/rejected": -2.7561120986938477, "loss": 0.7481, "odds_ratio_loss": 0.37974974513053894, "rewards/accuracies": 0.75, "rewards/chosen": -0.06817153841257095, "rewards/margins": 0.20743967592716217, "rewards/rejected": -0.2756112217903137, "sft_loss": 0.6817153692245483, "step": 668 }, { "epoch": 0.9674620390455532, "grad_norm": 2.3328224837698883, "learning_rate": 7.615915673531965e-06, "logits/chosen": 0.3929983973503113, "logits/rejected": 0.300082802772522, "logps/chosen": -0.7355288863182068, "logps/rejected": -2.5262303352355957, "loss": 0.9321, "odds_ratio_loss": 0.49977803230285645, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07355289161205292, "rewards/margins": 0.1790701150894165, "rewards/rejected": -0.2526229918003082, "sft_loss": 0.7355288863182068, "step": 669 }, { "epoch": 0.9689081706435285, "grad_norm": 2.540065505875259, "learning_rate": 7.614586582799658e-06, "logits/chosen": 0.4453916847705841, "logits/rejected": 0.40558886528015137, "logps/chosen": -0.7393182516098022, "logps/rejected": -1.651641607284546, "loss": 0.8151, "odds_ratio_loss": 0.47889044880867004, "rewards/accuracies": 0.75, "rewards/chosen": -0.07393182814121246, "rewards/margins": 0.09123234450817108, "rewards/rejected": -0.16516415774822235, "sft_loss": 0.7393182516098022, "step": 670 }, { "epoch": 0.970354302241504, "grad_norm": 2.219181410621246, "learning_rate": 7.6132553128196375e-06, "logits/chosen": 0.4777889847755432, "logits/rejected": 0.3341582417488098, "logps/chosen": -0.6518281698226929, "logps/rejected": -1.7046725749969482, "loss": 0.7995, "odds_ratio_loss": 0.5171917676925659, "rewards/accuracies": 0.625, "rewards/chosen": -0.06518281996250153, "rewards/margins": 0.1052844300866127, "rewards/rejected": -0.17046725749969482, "sft_loss": 0.6518281698226929, "step": 671 }, { "epoch": 0.9718004338394793, "grad_norm": 2.6826612009314426, "learning_rate": 7.6119218643945315e-06, "logits/chosen": 0.3291289806365967, "logits/rejected": 0.32001611590385437, "logps/chosen": -0.9242229461669922, "logps/rejected": -1.5439229011535645, "loss": 0.8439, "odds_ratio_loss": 0.7239330410957336, "rewards/accuracies": 0.5, "rewards/chosen": -0.09242229908704758, "rewards/margins": 0.06196998804807663, "rewards/rejected": -0.1543922871351242, "sft_loss": 0.9242229461669922, "step": 672 }, { "epoch": 0.9732465654374548, "grad_norm": 2.554322117110017, "learning_rate": 7.610586238328281e-06, "logits/chosen": 0.4449082314968109, "logits/rejected": 0.3627157211303711, "logps/chosen": -0.769290030002594, "logps/rejected": -1.377912998199463, "loss": 0.8809, "odds_ratio_loss": 0.5437467098236084, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0769289955496788, "rewards/margins": 0.06086229905486107, "rewards/rejected": -0.13779129087924957, "sft_loss": 0.769290030002594, "step": 673 }, { "epoch": 0.9746926970354303, "grad_norm": 4.82523082470742, "learning_rate": 7.60924843542614e-06, "logits/chosen": 0.3904574513435364, "logits/rejected": 0.2845170497894287, "logps/chosen": -0.8830059766769409, "logps/rejected": -3.303161144256592, "loss": 0.9391, "odds_ratio_loss": 0.39237675070762634, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08830060064792633, "rewards/margins": 0.24201549589633942, "rewards/rejected": -0.33031609654426575, "sft_loss": 0.8830059766769409, "step": 674 }, { "epoch": 0.9761388286334056, "grad_norm": 2.6405629501899375, "learning_rate": 7.607908456494675e-06, "logits/chosen": 0.5365076661109924, "logits/rejected": 0.4125050902366638, "logps/chosen": -0.6532673835754395, "logps/rejected": -1.4482712745666504, "loss": 0.8422, "odds_ratio_loss": 0.4444807171821594, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0653267428278923, "rewards/margins": 0.07950039207935333, "rewards/rejected": -0.14482714235782623, "sft_loss": 0.6532673835754395, "step": 675 }, { "epoch": 0.9775849602313811, "grad_norm": 3.3513421127086636, "learning_rate": 7.606566302341764e-06, "logits/chosen": 0.46091920137405396, "logits/rejected": 0.25358325242996216, "logps/chosen": -0.8063492178916931, "logps/rejected": -2.6329450607299805, "loss": 0.8453, "odds_ratio_loss": 0.4977782964706421, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08063492923974991, "rewards/margins": 0.1826595962047577, "rewards/rejected": -0.263294517993927, "sft_loss": 0.8063492178916931, "step": 676 }, { "epoch": 0.9790310918293564, "grad_norm": 2.5294773650031717, "learning_rate": 7.6052219737765975e-06, "logits/chosen": 0.478007435798645, "logits/rejected": 0.38246941566467285, "logps/chosen": -0.8104468584060669, "logps/rejected": -1.7242964506149292, "loss": 0.8154, "odds_ratio_loss": 0.520146369934082, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08104468137025833, "rewards/margins": 0.09138496220111847, "rewards/rejected": -0.1724296510219574, "sft_loss": 0.8104468584060669, "step": 677 }, { "epoch": 0.9804772234273319, "grad_norm": 3.661969667095535, "learning_rate": 7.6038754716096755e-06, "logits/chosen": 0.30862492322921753, "logits/rejected": 0.27289360761642456, "logps/chosen": -0.7575228810310364, "logps/rejected": -1.1974655389785767, "loss": 0.8244, "odds_ratio_loss": 0.545995831489563, "rewards/accuracies": 0.75, "rewards/chosen": -0.07575228810310364, "rewards/margins": 0.04399425908923149, "rewards/rejected": -0.11974655091762543, "sft_loss": 0.7575228810310364, "step": 678 }, { "epoch": 0.9819233550253073, "grad_norm": 2.836339221066449, "learning_rate": 7.60252679665281e-06, "logits/chosen": 0.35597071051597595, "logits/rejected": 0.3139832615852356, "logps/chosen": -0.7728620171546936, "logps/rejected": -1.354665994644165, "loss": 0.7193, "odds_ratio_loss": 0.5544801950454712, "rewards/accuracies": 0.625, "rewards/chosen": -0.07728619873523712, "rewards/margins": 0.05818040296435356, "rewards/rejected": -0.13546660542488098, "sft_loss": 0.7728620171546936, "step": 679 }, { "epoch": 0.9833694866232827, "grad_norm": 2.2847928594793943, "learning_rate": 7.601175949719122e-06, "logits/chosen": 0.3785718083381653, "logits/rejected": 0.25374460220336914, "logps/chosen": -0.6391298174858093, "logps/rejected": -2.5889670848846436, "loss": 0.7757, "odds_ratio_loss": 0.39246100187301636, "rewards/accuracies": 0.875, "rewards/chosen": -0.06391298770904541, "rewards/margins": 0.19498372077941895, "rewards/rejected": -0.25889670848846436, "sft_loss": 0.6391298174858093, "step": 680 }, { "epoch": 0.9848156182212582, "grad_norm": 3.2734066693082116, "learning_rate": 7.599822931623041e-06, "logits/chosen": 0.4401516020298004, "logits/rejected": 0.31636372208595276, "logps/chosen": -0.8324559330940247, "logps/rejected": -2.279355764389038, "loss": 0.8542, "odds_ratio_loss": 0.5873942375183105, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08324559032917023, "rewards/margins": 0.14468997716903687, "rewards/rejected": -0.2279355525970459, "sft_loss": 0.8324559330940247, "step": 681 }, { "epoch": 0.9862617498192335, "grad_norm": 3.4818714247714158, "learning_rate": 7.598467743180308e-06, "logits/chosen": 0.3637101352214813, "logits/rejected": 0.3162555694580078, "logps/chosen": -0.6616679430007935, "logps/rejected": -2.204662799835205, "loss": 0.8601, "odds_ratio_loss": 0.47493523359298706, "rewards/accuracies": 0.625, "rewards/chosen": -0.06616680324077606, "rewards/margins": 0.15429948270320892, "rewards/rejected": -0.22046628594398499, "sft_loss": 0.6616679430007935, "step": 682 }, { "epoch": 0.987707881417209, "grad_norm": 2.3892984192577122, "learning_rate": 7.597110385207969e-06, "logits/chosen": 0.31790465116500854, "logits/rejected": 0.26166340708732605, "logps/chosen": -0.9303499460220337, "logps/rejected": -1.437572956085205, "loss": 0.9003, "odds_ratio_loss": 0.6231876611709595, "rewards/accuracies": 0.5, "rewards/chosen": -0.09303499758243561, "rewards/margins": 0.05072229355573654, "rewards/rejected": -0.14375729858875275, "sft_loss": 0.9303499460220337, "step": 683 }, { "epoch": 0.9891540130151844, "grad_norm": 2.2284432203154685, "learning_rate": 7.5957508585243824e-06, "logits/chosen": 0.40829747915267944, "logits/rejected": 0.27027884125709534, "logps/chosen": -0.6149638891220093, "logps/rejected": -2.2985355854034424, "loss": 0.6659, "odds_ratio_loss": 0.340939998626709, "rewards/accuracies": 0.875, "rewards/chosen": -0.06149638444185257, "rewards/margins": 0.16835719347000122, "rewards/rejected": -0.2298535704612732, "sft_loss": 0.6149638891220093, "step": 684 }, { "epoch": 0.9906001446131598, "grad_norm": 2.95393549994262, "learning_rate": 7.594389163949211e-06, "logits/chosen": 0.3751344084739685, "logits/rejected": 0.24515630304813385, "logps/chosen": -0.6303630471229553, "logps/rejected": -2.502126693725586, "loss": 0.7415, "odds_ratio_loss": 0.3910840153694153, "rewards/accuracies": 0.6875, "rewards/chosen": -0.06303630769252777, "rewards/margins": 0.18717637658119202, "rewards/rejected": -0.250212699174881, "sft_loss": 0.6303630471229553, "step": 685 }, { "epoch": 0.9920462762111352, "grad_norm": 3.0543818604744923, "learning_rate": 7.593025302303426e-06, "logits/chosen": 0.22160741686820984, "logits/rejected": 0.19796130061149597, "logps/chosen": -0.818253755569458, "logps/rejected": -1.9009416103363037, "loss": 0.877, "odds_ratio_loss": 0.6066489219665527, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0818253830075264, "rewards/margins": 0.10826879739761353, "rewards/rejected": -0.19009417295455933, "sft_loss": 0.818253755569458, "step": 686 }, { "epoch": 0.9934924078091106, "grad_norm": 4.998255178745226, "learning_rate": 7.591659274409305e-06, "logits/chosen": 0.4371909201145172, "logits/rejected": 0.3654007613658905, "logps/chosen": -0.8172946572303772, "logps/rejected": -1.5242695808410645, "loss": 0.9026, "odds_ratio_loss": 0.6884384155273438, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0817294716835022, "rewards/margins": 0.07069748640060425, "rewards/rejected": -0.15242695808410645, "sft_loss": 0.8172946572303772, "step": 687 }, { "epoch": 0.9949385394070861, "grad_norm": 4.778633155537618, "learning_rate": 7.590291081090429e-06, "logits/chosen": 0.33778876066207886, "logits/rejected": 0.18438909947872162, "logps/chosen": -0.8597713708877563, "logps/rejected": -1.5066003799438477, "loss": 0.8564, "odds_ratio_loss": 0.5719389319419861, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08597713708877563, "rewards/margins": 0.06468289345502853, "rewards/rejected": -0.15066005289554596, "sft_loss": 0.8597713708877563, "step": 688 }, { "epoch": 0.9963846710050615, "grad_norm": 6.0189507993908276, "learning_rate": 7.588920723171691e-06, "logits/chosen": 0.33838024735450745, "logits/rejected": 0.2940466105937958, "logps/chosen": -0.8718283176422119, "logps/rejected": -1.8520734310150146, "loss": 0.9698, "odds_ratio_loss": 0.5115089416503906, "rewards/accuracies": 0.8125, "rewards/chosen": -0.08718283474445343, "rewards/margins": 0.09802451729774475, "rewards/rejected": -0.18520736694335938, "sft_loss": 0.8718283176422119, "step": 689 }, { "epoch": 0.9978308026030369, "grad_norm": 3.9502319107623367, "learning_rate": 7.5875482014792805e-06, "logits/chosen": 0.3570603132247925, "logits/rejected": 0.27360644936561584, "logps/chosen": -1.0185794830322266, "logps/rejected": -1.4535801410675049, "loss": 0.9067, "odds_ratio_loss": 0.65177321434021, "rewards/accuracies": 0.625, "rewards/chosen": -0.10185794532299042, "rewards/margins": 0.04350007325410843, "rewards/rejected": -0.14535802602767944, "sft_loss": 1.0185794830322266, "step": 690 }, { "epoch": 0.9992769342010123, "grad_norm": 7.674669788400658, "learning_rate": 7.586173516840698e-06, "logits/chosen": 0.1779128909111023, "logits/rejected": 0.17757825553417206, "logps/chosen": -0.9851481914520264, "logps/rejected": -1.3905221223831177, "loss": 0.8799, "odds_ratio_loss": 0.7719675302505493, "rewards/accuracies": 0.4375, "rewards/chosen": -0.09851481765508652, "rewards/margins": 0.04053738713264465, "rewards/rejected": -0.13905221223831177, "sft_loss": 0.9851481914520264, "step": 691 }, { "epoch": 1.0007230657989876, "grad_norm": 3.173077480414479, "learning_rate": 7.584796670084747e-06, "logits/chosen": 0.3374330997467041, "logits/rejected": 0.25602987408638, "logps/chosen": -0.7603847980499268, "logps/rejected": -2.37874436378479, "loss": 0.6604, "odds_ratio_loss": 0.4636649191379547, "rewards/accuracies": 0.625, "rewards/chosen": -0.07603848725557327, "rewards/margins": 0.1618359386920929, "rewards/rejected": -0.23787443339824677, "sft_loss": 0.7603847980499268, "step": 692 }, { "epoch": 1.002169197396963, "grad_norm": 2.9704017840618158, "learning_rate": 7.583417662041532e-06, "logits/chosen": 0.473114013671875, "logits/rejected": 0.2985457181930542, "logps/chosen": -0.4166085124015808, "logps/rejected": -2.621722459793091, "loss": 0.5093, "odds_ratio_loss": 0.25622743368148804, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0416608564555645, "rewards/margins": 0.22051140666007996, "rewards/rejected": -0.26217225193977356, "sft_loss": 0.4166085124015808, "step": 693 }, { "epoch": 1.0036153289949385, "grad_norm": 3.679055594373421, "learning_rate": 7.5820364935424625e-06, "logits/chosen": 0.36234307289123535, "logits/rejected": 0.2413337230682373, "logps/chosen": -0.49339836835861206, "logps/rejected": -2.096301555633545, "loss": 0.5615, "odds_ratio_loss": 0.36454418301582336, "rewards/accuracies": 0.75, "rewards/chosen": -0.049339838325977325, "rewards/margins": 0.16029031574726105, "rewards/rejected": -0.20963016152381897, "sft_loss": 0.49339836835861206, "step": 694 }, { "epoch": 1.005061460592914, "grad_norm": 2.4644440720589715, "learning_rate": 7.58065316542025e-06, "logits/chosen": 0.2052929401397705, "logits/rejected": 0.17279671132564545, "logps/chosen": -0.5167049169540405, "logps/rejected": -3.038837432861328, "loss": 0.5475, "odds_ratio_loss": 0.30443111062049866, "rewards/accuracies": 0.875, "rewards/chosen": -0.051670484244823456, "rewards/margins": 0.2522132694721222, "rewards/rejected": -0.30388376116752625, "sft_loss": 0.5167049169540405, "step": 695 }, { "epoch": 1.0065075921908895, "grad_norm": 2.2735633936891726, "learning_rate": 7.579267678508907e-06, "logits/chosen": 0.03438059240579605, "logits/rejected": 0.08168449997901917, "logps/chosen": -0.7009294033050537, "logps/rejected": -1.5276418924331665, "loss": 0.5529, "odds_ratio_loss": 0.3713332414627075, "rewards/accuracies": 0.875, "rewards/chosen": -0.07009293884038925, "rewards/margins": 0.08267124742269516, "rewards/rejected": -0.15276417136192322, "sft_loss": 0.7009294033050537, "step": 696 }, { "epoch": 1.0079537237888647, "grad_norm": 2.6197848918777447, "learning_rate": 7.577880033643751e-06, "logits/chosen": 0.01991932839155197, "logits/rejected": 0.004507867619395256, "logps/chosen": -0.6916311979293823, "logps/rejected": -2.359644651412964, "loss": 0.6266, "odds_ratio_loss": 0.3500153720378876, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06916312128305435, "rewards/margins": 0.1668013483285904, "rewards/rejected": -0.23596449196338654, "sft_loss": 0.6916311979293823, "step": 697 }, { "epoch": 1.0093998553868402, "grad_norm": 3.2406420094854553, "learning_rate": 7.576490231661397e-06, "logits/chosen": -0.03848964348435402, "logits/rejected": 0.02764774300158024, "logps/chosen": -0.5944284200668335, "logps/rejected": -1.3864145278930664, "loss": 0.6854, "odds_ratio_loss": 0.5450542569160461, "rewards/accuracies": 0.75, "rewards/chosen": -0.05944284424185753, "rewards/margins": 0.07919859886169434, "rewards/rejected": -0.13864144682884216, "sft_loss": 0.5944284200668335, "step": 698 }, { "epoch": 1.0108459869848156, "grad_norm": 2.6286685009567035, "learning_rate": 7.575098273399764e-06, "logits/chosen": -0.05605170875787735, "logits/rejected": -0.044075943529605865, "logps/chosen": -0.6372457146644592, "logps/rejected": -1.772578477859497, "loss": 0.5905, "odds_ratio_loss": 0.3773233890533447, "rewards/accuracies": 0.75, "rewards/chosen": -0.0637245699763298, "rewards/margins": 0.11353328824043274, "rewards/rejected": -0.17725783586502075, "sft_loss": 0.6372457146644592, "step": 699 }, { "epoch": 1.0122921185827911, "grad_norm": 2.617536762472655, "learning_rate": 7.573704159698065e-06, "logits/chosen": -0.05474071949720383, "logits/rejected": -0.07595973461866379, "logps/chosen": -0.6069769859313965, "logps/rejected": -2.064570903778076, "loss": 0.6547, "odds_ratio_loss": 0.35097628831863403, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06069770082831383, "rewards/margins": 0.14575940370559692, "rewards/rejected": -0.20645710825920105, "sft_loss": 0.6069769859313965, "step": 700 }, { "epoch": 1.0137382501807664, "grad_norm": 2.689800745017883, "learning_rate": 7.572307891396817e-06, "logits/chosen": -0.05100724846124649, "logits/rejected": 0.07264310121536255, "logps/chosen": -0.717523455619812, "logps/rejected": -2.394158363342285, "loss": 0.5874, "odds_ratio_loss": 0.36042433977127075, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07175233960151672, "rewards/margins": 0.16766351461410522, "rewards/rejected": -0.23941585421562195, "sft_loss": 0.717523455619812, "step": 701 }, { "epoch": 1.0151843817787418, "grad_norm": 2.535012066884644, "learning_rate": 7.570909469337838e-06, "logits/chosen": 0.06960493326187134, "logits/rejected": 0.07583092153072357, "logps/chosen": -0.49426883459091187, "logps/rejected": -2.2409791946411133, "loss": 0.6178, "odds_ratio_loss": 0.3533773720264435, "rewards/accuracies": 0.8125, "rewards/chosen": -0.049426883459091187, "rewards/margins": 0.17467105388641357, "rewards/rejected": -0.22409793734550476, "sft_loss": 0.49426883459091187, "step": 702 }, { "epoch": 1.0166305133767173, "grad_norm": 2.655301235799629, "learning_rate": 7.5695088943642415e-06, "logits/chosen": 0.01799941062927246, "logits/rejected": 0.00933268666267395, "logps/chosen": -0.3941487669944763, "logps/rejected": -2.9899024963378906, "loss": 0.5979, "odds_ratio_loss": 0.26885470747947693, "rewards/accuracies": 0.875, "rewards/chosen": -0.03941487893462181, "rewards/margins": 0.25957539677619934, "rewards/rejected": -0.29899024963378906, "sft_loss": 0.3941487669944763, "step": 703 }, { "epoch": 1.0180766449746927, "grad_norm": 2.5535543893804906, "learning_rate": 7.568106167320437e-06, "logits/chosen": 0.041138745844364166, "logits/rejected": 0.11178290843963623, "logps/chosen": -0.49738141894340515, "logps/rejected": -2.7035470008850098, "loss": 0.5773, "odds_ratio_loss": 0.2610205411911011, "rewards/accuracies": 0.9375, "rewards/chosen": -0.049738142639398575, "rewards/margins": 0.22061654925346375, "rewards/rejected": -0.270354688167572, "sft_loss": 0.49738141894340515, "step": 704 }, { "epoch": 1.0195227765726682, "grad_norm": 2.5907209857345923, "learning_rate": 7.566701289052136e-06, "logits/chosen": 0.0588398277759552, "logits/rejected": 0.020695263519883156, "logps/chosen": -0.6042892932891846, "logps/rejected": -3.8982067108154297, "loss": 0.5564, "odds_ratio_loss": 0.28044500946998596, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0604289248585701, "rewards/margins": 0.3293917775154114, "rewards/rejected": -0.3898206949234009, "sft_loss": 0.6042892932891846, "step": 705 }, { "epoch": 1.0209689081706435, "grad_norm": 6.346699372024468, "learning_rate": 7.565294260406343e-06, "logits/chosen": -0.01402386836707592, "logits/rejected": 0.020731184631586075, "logps/chosen": -0.517099916934967, "logps/rejected": -1.578346848487854, "loss": 0.5113, "odds_ratio_loss": 0.2911356985569, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05170999467372894, "rewards/margins": 0.10612468421459198, "rewards/rejected": -0.15783467888832092, "sft_loss": 0.517099916934967, "step": 706 }, { "epoch": 1.022415039768619, "grad_norm": 2.737750718755634, "learning_rate": 7.563885082231363e-06, "logits/chosen": 0.09974583983421326, "logits/rejected": 0.11834936589002609, "logps/chosen": -0.4971277415752411, "logps/rejected": -2.3825626373291016, "loss": 0.5577, "odds_ratio_loss": 0.30548495054244995, "rewards/accuracies": 0.875, "rewards/chosen": -0.04971277341246605, "rewards/margins": 0.18854348361492157, "rewards/rejected": -0.23825626075267792, "sft_loss": 0.4971277415752411, "step": 707 }, { "epoch": 1.0238611713665944, "grad_norm": 3.5057829756261785, "learning_rate": 7.562473755376792e-06, "logits/chosen": -0.1551908254623413, "logits/rejected": -0.17449763417243958, "logps/chosen": -0.6683604121208191, "logps/rejected": -3.447512149810791, "loss": 0.6055, "odds_ratio_loss": 0.3249449133872986, "rewards/accuracies": 0.875, "rewards/chosen": -0.06683603674173355, "rewards/margins": 0.27791517972946167, "rewards/rejected": -0.344751238822937, "sft_loss": 0.6683604121208191, "step": 708 }, { "epoch": 1.0253073029645698, "grad_norm": 2.578140669633971, "learning_rate": 7.561060280693528e-06, "logits/chosen": -0.23344504833221436, "logits/rejected": -0.031244784593582153, "logps/chosen": -0.4613468050956726, "logps/rejected": -1.805433988571167, "loss": 0.57, "odds_ratio_loss": 0.21366247534751892, "rewards/accuracies": 1.0, "rewards/chosen": -0.04613468050956726, "rewards/margins": 0.13440871238708496, "rewards/rejected": -0.18054340779781342, "sft_loss": 0.4613468050956726, "step": 709 }, { "epoch": 1.026753434562545, "grad_norm": 2.46611135338615, "learning_rate": 7.559644659033757e-06, "logits/chosen": -0.10052379965782166, "logits/rejected": -0.03007357195019722, "logps/chosen": -0.4927516579627991, "logps/rejected": -2.790343999862671, "loss": 0.571, "odds_ratio_loss": 0.2678479254245758, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04927516356110573, "rewards/margins": 0.22975923120975494, "rewards/rejected": -0.27903443574905396, "sft_loss": 0.4927516579627991, "step": 710 }, { "epoch": 1.0281995661605206, "grad_norm": 2.666752714440422, "learning_rate": 7.558226891250963e-06, "logits/chosen": -0.03222482651472092, "logits/rejected": 0.01601647585630417, "logps/chosen": -0.6681675314903259, "logps/rejected": -1.4737236499786377, "loss": 0.5683, "odds_ratio_loss": 0.353636771440506, "rewards/accuracies": 0.875, "rewards/chosen": -0.06681674718856812, "rewards/margins": 0.08055561035871506, "rewards/rejected": -0.14737236499786377, "sft_loss": 0.6681675314903259, "step": 711 }, { "epoch": 1.029645697758496, "grad_norm": 2.618017036869127, "learning_rate": 7.556806978199924e-06, "logits/chosen": 0.0519598163664341, "logits/rejected": 0.022058458998799324, "logps/chosen": -0.5769098997116089, "logps/rejected": -2.5604069232940674, "loss": 0.6182, "odds_ratio_loss": 0.3295339345932007, "rewards/accuracies": 0.75, "rewards/chosen": -0.05769098550081253, "rewards/margins": 0.1983497142791748, "rewards/rejected": -0.25604069232940674, "sft_loss": 0.5769098997116089, "step": 712 }, { "epoch": 1.0310918293564715, "grad_norm": 7.761185954898741, "learning_rate": 7.555384920736711e-06, "logits/chosen": 0.015996111556887627, "logits/rejected": 0.08699595928192139, "logps/chosen": -0.7711721658706665, "logps/rejected": -1.6923863887786865, "loss": 0.6644, "odds_ratio_loss": 0.4671747088432312, "rewards/accuracies": 0.75, "rewards/chosen": -0.0771172046661377, "rewards/margins": 0.0921214297413826, "rewards/rejected": -0.16923865675926208, "sft_loss": 0.7711721658706665, "step": 713 }, { "epoch": 1.032537960954447, "grad_norm": 2.6755880173033013, "learning_rate": 7.5539607197186875e-06, "logits/chosen": 0.07660658657550812, "logits/rejected": 0.01353158988058567, "logps/chosen": -0.6658707857131958, "logps/rejected": -3.14312481880188, "loss": 0.6126, "odds_ratio_loss": 0.2412908524274826, "rewards/accuracies": 0.9375, "rewards/chosen": -0.06658707559108734, "rewards/margins": 0.24772541224956512, "rewards/rejected": -0.31431248784065247, "sft_loss": 0.6658707857131958, "step": 714 }, { "epoch": 1.0339840925524222, "grad_norm": 2.439023237908597, "learning_rate": 7.552534376004511e-06, "logits/chosen": -0.02263808436691761, "logits/rejected": 0.05558781325817108, "logps/chosen": -0.3617287576198578, "logps/rejected": -3.1298577785491943, "loss": 0.5288, "odds_ratio_loss": 0.16875040531158447, "rewards/accuracies": 1.0, "rewards/chosen": -0.03617287427186966, "rewards/margins": 0.27681291103363037, "rewards/rejected": -0.31298577785491943, "sft_loss": 0.3617287576198578, "step": 715 }, { "epoch": 1.0354302241503976, "grad_norm": 2.6945317555257775, "learning_rate": 7.551105890454128e-06, "logits/chosen": -0.13312295079231262, "logits/rejected": -0.014301072806119919, "logps/chosen": -0.5835843682289124, "logps/rejected": -1.665818214416504, "loss": 0.6249, "odds_ratio_loss": 0.23966217041015625, "rewards/accuracies": 1.0, "rewards/chosen": -0.058358438313007355, "rewards/margins": 0.10822339355945587, "rewards/rejected": -0.16658182442188263, "sft_loss": 0.5835843682289124, "step": 716 }, { "epoch": 1.0368763557483731, "grad_norm": 4.069629175458383, "learning_rate": 7.549675263928776e-06, "logits/chosen": 0.05003291368484497, "logits/rejected": 0.09077559411525726, "logps/chosen": -0.5346865057945251, "logps/rejected": -1.7945709228515625, "loss": 0.5653, "odds_ratio_loss": 0.30477625131607056, "rewards/accuracies": 0.875, "rewards/chosen": -0.053468652069568634, "rewards/margins": 0.1259884536266327, "rewards/rejected": -0.17945709824562073, "sft_loss": 0.5346865057945251, "step": 717 }, { "epoch": 1.0383224873463486, "grad_norm": 2.5024788446043438, "learning_rate": 7.548242497290988e-06, "logits/chosen": -0.1343534290790558, "logits/rejected": -0.016086647287011147, "logps/chosen": -0.49459975957870483, "logps/rejected": -2.4642348289489746, "loss": 0.5703, "odds_ratio_loss": 0.23821872472763062, "rewards/accuracies": 1.0, "rewards/chosen": -0.049459975212812424, "rewards/margins": 0.19696351885795593, "rewards/rejected": -0.24642349779605865, "sft_loss": 0.49459975957870483, "step": 718 }, { "epoch": 1.0397686189443238, "grad_norm": 2.604294962061697, "learning_rate": 7.546807591404584e-06, "logits/chosen": -0.10261072218418121, "logits/rejected": -0.09555017948150635, "logps/chosen": -0.654711127281189, "logps/rejected": -2.6856062412261963, "loss": 0.639, "odds_ratio_loss": 0.36931928992271423, "rewards/accuracies": 0.75, "rewards/chosen": -0.0654711127281189, "rewards/margins": 0.20308950543403625, "rewards/rejected": -0.26856061816215515, "sft_loss": 0.654711127281189, "step": 719 }, { "epoch": 1.0412147505422993, "grad_norm": 2.5336512821889947, "learning_rate": 7.545370547134672e-06, "logits/chosen": -0.0048141926527023315, "logits/rejected": 0.013522947207093239, "logps/chosen": -0.48836880922317505, "logps/rejected": -1.8508555889129639, "loss": 0.5304, "odds_ratio_loss": 0.2670961618423462, "rewards/accuracies": 0.9375, "rewards/chosen": -0.048836879432201385, "rewards/margins": 0.1362486630678177, "rewards/rejected": -0.18508554995059967, "sft_loss": 0.48836880922317505, "step": 720 }, { "epoch": 1.0426608821402747, "grad_norm": 5.073824546241314, "learning_rate": 7.5439313653476546e-06, "logits/chosen": 0.010667698457837105, "logits/rejected": -0.007655080407857895, "logps/chosen": -0.6140876412391663, "logps/rejected": -2.0642848014831543, "loss": 0.6017, "odds_ratio_loss": 0.33399447798728943, "rewards/accuracies": 0.875, "rewards/chosen": -0.06140875816345215, "rewards/margins": 0.14501972496509552, "rewards/rejected": -0.20642849802970886, "sft_loss": 0.6140876412391663, "step": 721 }, { "epoch": 1.0441070137382502, "grad_norm": 2.4676977774963214, "learning_rate": 7.542490046911217e-06, "logits/chosen": 0.046404771506786346, "logits/rejected": 0.04437633231282234, "logps/chosen": -0.5226276516914368, "logps/rejected": -1.6676445007324219, "loss": 0.581, "odds_ratio_loss": 0.33058759570121765, "rewards/accuracies": 0.875, "rewards/chosen": -0.052262768149375916, "rewards/margins": 0.11450167745351791, "rewards/rejected": -0.16676445305347443, "sft_loss": 0.5226276516914368, "step": 722 }, { "epoch": 1.0455531453362257, "grad_norm": 2.393012743770882, "learning_rate": 7.541046592694336e-06, "logits/chosen": -0.11884389072656631, "logits/rejected": -0.012990422546863556, "logps/chosen": -0.638570249080658, "logps/rejected": -1.4692680835723877, "loss": 0.6703, "odds_ratio_loss": 0.34661900997161865, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06385702639818192, "rewards/margins": 0.08306978642940521, "rewards/rejected": -0.14692680537700653, "sft_loss": 0.638570249080658, "step": 723 }, { "epoch": 1.046999276934201, "grad_norm": 2.2437060230333428, "learning_rate": 7.539601003567277e-06, "logits/chosen": -0.043492406606674194, "logits/rejected": 0.05224483087658882, "logps/chosen": -0.7726094722747803, "logps/rejected": -1.4802862405776978, "loss": 0.6724, "odds_ratio_loss": 0.4070410132408142, "rewards/accuracies": 0.75, "rewards/chosen": -0.07726094871759415, "rewards/margins": 0.07076768577098846, "rewards/rejected": -0.1480286419391632, "sft_loss": 0.7726094722747803, "step": 724 }, { "epoch": 1.0484454085321764, "grad_norm": 2.202580998080793, "learning_rate": 7.538153280401589e-06, "logits/chosen": -0.04949672147631645, "logits/rejected": -0.10443629324436188, "logps/chosen": -0.668379008769989, "logps/rejected": -1.9327638149261475, "loss": 0.5646, "odds_ratio_loss": 0.39114710688591003, "rewards/accuracies": 0.875, "rewards/chosen": -0.06683789938688278, "rewards/margins": 0.12643848359584808, "rewards/rejected": -0.19327637553215027, "sft_loss": 0.668379008769989, "step": 725 }, { "epoch": 1.0498915401301518, "grad_norm": 2.601181457233383, "learning_rate": 7.536703424070111e-06, "logits/chosen": 0.06261063367128372, "logits/rejected": -0.016470249742269516, "logps/chosen": -0.48023730516433716, "logps/rejected": -2.6002039909362793, "loss": 0.4852, "odds_ratio_loss": 0.25949835777282715, "rewards/accuracies": 0.9375, "rewards/chosen": -0.048023734241724014, "rewards/margins": 0.2119966745376587, "rewards/rejected": -0.2600204050540924, "sft_loss": 0.48023730516433716, "step": 726 }, { "epoch": 1.0513376717281273, "grad_norm": 2.923410627265685, "learning_rate": 7.535251435446967e-06, "logits/chosen": 0.07905685156583786, "logits/rejected": 0.07296408712863922, "logps/chosen": -0.41822901368141174, "logps/rejected": -2.0072243213653564, "loss": 0.5235, "odds_ratio_loss": 0.25963878631591797, "rewards/accuracies": 0.9375, "rewards/chosen": -0.041822899132966995, "rewards/margins": 0.15889953076839447, "rewards/rejected": -0.20072242617607117, "sft_loss": 0.41822901368141174, "step": 727 }, { "epoch": 1.0527838033261028, "grad_norm": 4.27971490526211, "learning_rate": 7.533797315407566e-06, "logits/chosen": 0.053780484944581985, "logits/rejected": 0.010446615517139435, "logps/chosen": -0.5086668729782104, "logps/rejected": -2.1758975982666016, "loss": 0.6473, "odds_ratio_loss": 0.3123031556606293, "rewards/accuracies": 0.9375, "rewards/chosen": -0.050866689532995224, "rewards/margins": 0.16672305762767792, "rewards/rejected": -0.21758975088596344, "sft_loss": 0.5086668729782104, "step": 728 }, { "epoch": 1.054229934924078, "grad_norm": 3.2608327123149223, "learning_rate": 7.532341064828602e-06, "logits/chosen": -0.3278353214263916, "logits/rejected": -0.129352867603302, "logps/chosen": -0.6491460800170898, "logps/rejected": -1.670910120010376, "loss": 0.5573, "odds_ratio_loss": 0.30113694071769714, "rewards/accuracies": 0.9375, "rewards/chosen": -0.06491461396217346, "rewards/margins": 0.10217640548944473, "rewards/rejected": -0.1670910269021988, "sft_loss": 0.6491460800170898, "step": 729 }, { "epoch": 1.0556760665220535, "grad_norm": 3.004012904325533, "learning_rate": 7.530882684588055e-06, "logits/chosen": -0.07247396558523178, "logits/rejected": 0.033849820494651794, "logps/chosen": -0.6534922122955322, "logps/rejected": -2.295997381210327, "loss": 0.6167, "odds_ratio_loss": 0.3587084114551544, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06534922122955322, "rewards/margins": 0.16425049304962158, "rewards/rejected": -0.2295997142791748, "sft_loss": 0.6534922122955322, "step": 730 }, { "epoch": 1.057122198120029, "grad_norm": 3.5688118743402115, "learning_rate": 7.529422175565185e-06, "logits/chosen": 0.019199436530470848, "logits/rejected": 0.0648551732301712, "logps/chosen": -0.44681620597839355, "logps/rejected": -1.9060118198394775, "loss": 0.5512, "odds_ratio_loss": 0.30147257447242737, "rewards/accuracies": 0.875, "rewards/chosen": -0.044681623578071594, "rewards/margins": 0.1459195613861084, "rewards/rejected": -0.19060118496418, "sft_loss": 0.44681620597839355, "step": 731 }, { "epoch": 1.0585683297180044, "grad_norm": 2.4630310600048557, "learning_rate": 7.5279595386405426e-06, "logits/chosen": 0.041221290826797485, "logits/rejected": 0.041878592222929, "logps/chosen": -0.7346194982528687, "logps/rejected": -1.4957523345947266, "loss": 0.6413, "odds_ratio_loss": 0.49654725193977356, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07346194982528687, "rewards/margins": 0.07611328363418579, "rewards/rejected": -0.14957523345947266, "sft_loss": 0.7346194982528687, "step": 732 }, { "epoch": 1.0600144613159797, "grad_norm": 2.1602876897916063, "learning_rate": 7.526494774695953e-06, "logits/chosen": 0.05351543426513672, "logits/rejected": 0.06491318345069885, "logps/chosen": -0.6178892850875854, "logps/rejected": -1.1977730989456177, "loss": 0.6236, "odds_ratio_loss": 0.4457571804523468, "rewards/accuracies": 0.6875, "rewards/chosen": -0.061788931488990784, "rewards/margins": 0.057988375425338745, "rewards/rejected": -0.11977731436491013, "sft_loss": 0.6178892850875854, "step": 733 }, { "epoch": 1.0614605929139551, "grad_norm": 2.8924484227212877, "learning_rate": 7.525027884614532e-06, "logits/chosen": 0.014550477266311646, "logits/rejected": 0.09415071457624435, "logps/chosen": -0.5875763893127441, "logps/rejected": -1.687889575958252, "loss": 0.5808, "odds_ratio_loss": 0.3503105342388153, "rewards/accuracies": 0.875, "rewards/chosen": -0.05875764787197113, "rewards/margins": 0.11003130674362183, "rewards/rejected": -0.16878895461559296, "sft_loss": 0.5875763893127441, "step": 734 }, { "epoch": 1.0629067245119306, "grad_norm": 2.834196823758551, "learning_rate": 7.523558869280668e-06, "logits/chosen": -0.0009785722941160202, "logits/rejected": 0.036724962294101715, "logps/chosen": -0.4648720920085907, "logps/rejected": -1.547023892402649, "loss": 0.5905, "odds_ratio_loss": 0.2784779965877533, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04648720845580101, "rewards/margins": 0.10821517556905746, "rewards/rejected": -0.15470239520072937, "sft_loss": 0.4648720920085907, "step": 735 }, { "epoch": 1.064352856109906, "grad_norm": 2.285507014423719, "learning_rate": 7.52208772958004e-06, "logits/chosen": 0.021531209349632263, "logits/rejected": 0.05180025473237038, "logps/chosen": -0.7658559083938599, "logps/rejected": -1.4070403575897217, "loss": 0.6991, "odds_ratio_loss": 0.47670185565948486, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07658559828996658, "rewards/margins": 0.06411843001842499, "rewards/rejected": -0.14070403575897217, "sft_loss": 0.7658559083938599, "step": 736 }, { "epoch": 1.0657989877078815, "grad_norm": 2.1699866577767235, "learning_rate": 7.520614466399602e-06, "logits/chosen": 0.06271585822105408, "logits/rejected": 0.10178729146718979, "logps/chosen": -0.6396518349647522, "logps/rejected": -1.8695476055145264, "loss": 0.4857, "odds_ratio_loss": 0.29615530371665955, "rewards/accuracies": 0.875, "rewards/chosen": -0.06396518647670746, "rewards/margins": 0.12298958003520966, "rewards/rejected": -0.18695476651191711, "sft_loss": 0.6396518349647522, "step": 737 }, { "epoch": 1.0672451193058567, "grad_norm": 2.5201426723175215, "learning_rate": 7.5191390806275905e-06, "logits/chosen": 0.06206812709569931, "logits/rejected": 0.05045921355485916, "logps/chosen": -0.5444098114967346, "logps/rejected": -1.3108839988708496, "loss": 0.5319, "odds_ratio_loss": 0.4091678857803345, "rewards/accuracies": 0.75, "rewards/chosen": -0.05444097891449928, "rewards/margins": 0.07664742320775986, "rewards/rejected": -0.13108840584754944, "sft_loss": 0.5444098114967346, "step": 738 }, { "epoch": 1.0686912509038322, "grad_norm": 2.661287000165096, "learning_rate": 7.51766157315352e-06, "logits/chosen": 0.017679838463664055, "logits/rejected": 0.16645830869674683, "logps/chosen": -0.5121591091156006, "logps/rejected": -2.511974334716797, "loss": 0.5386, "odds_ratio_loss": 0.25777196884155273, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05121590942144394, "rewards/margins": 0.19998154044151306, "rewards/rejected": -0.2511974573135376, "sft_loss": 0.5121591091156006, "step": 739 }, { "epoch": 1.0701373825018077, "grad_norm": 2.6654097422186958, "learning_rate": 7.516181944868187e-06, "logits/chosen": 0.06915304809808731, "logits/rejected": 0.07672876119613647, "logps/chosen": -0.7108054757118225, "logps/rejected": -2.896374225616455, "loss": 0.614, "odds_ratio_loss": 0.3021408021450043, "rewards/accuracies": 0.875, "rewards/chosen": -0.07108054310083389, "rewards/margins": 0.21855685114860535, "rewards/rejected": -0.28963741660118103, "sft_loss": 0.7108054757118225, "step": 740 }, { "epoch": 1.0715835140997831, "grad_norm": 2.5983221718366853, "learning_rate": 7.514700196663663e-06, "logits/chosen": 0.04233922064304352, "logits/rejected": 0.10317760705947876, "logps/chosen": -0.5160056948661804, "logps/rejected": -1.6412593126296997, "loss": 0.5438, "odds_ratio_loss": 0.31687411665916443, "rewards/accuracies": 0.875, "rewards/chosen": -0.05160056799650192, "rewards/margins": 0.11252538114786148, "rewards/rejected": -0.1641259491443634, "sft_loss": 0.5160056948661804, "step": 741 }, { "epoch": 1.0730296456977584, "grad_norm": 2.5631716723122224, "learning_rate": 7.5132163294332995e-06, "logits/chosen": 0.0766778364777565, "logits/rejected": 0.09308762103319168, "logps/chosen": -0.5432895421981812, "logps/rejected": -2.4959678649902344, "loss": 0.566, "odds_ratio_loss": 0.273905485868454, "rewards/accuracies": 0.9375, "rewards/chosen": -0.054328951984643936, "rewards/margins": 0.19526784121990204, "rewards/rejected": -0.24959678947925568, "sft_loss": 0.5432895421981812, "step": 742 }, { "epoch": 1.0744757772957338, "grad_norm": 2.7943114290109494, "learning_rate": 7.511730344071727e-06, "logits/chosen": 0.29873916506767273, "logits/rejected": 0.2477722316980362, "logps/chosen": -0.286681592464447, "logps/rejected": -3.2388148307800293, "loss": 0.5915, "odds_ratio_loss": 0.17670558393001556, "rewards/accuracies": 0.9375, "rewards/chosen": -0.02866816148161888, "rewards/margins": 0.29521334171295166, "rewards/rejected": -0.32388150691986084, "sft_loss": 0.286681592464447, "step": 743 }, { "epoch": 1.0759219088937093, "grad_norm": 2.4260752328507977, "learning_rate": 7.51024224147485e-06, "logits/chosen": 0.12139531970024109, "logits/rejected": 0.1723383665084839, "logps/chosen": -0.549209475517273, "logps/rejected": -1.6002026796340942, "loss": 0.6727, "odds_ratio_loss": 0.2690895199775696, "rewards/accuracies": 0.875, "rewards/chosen": -0.05492095276713371, "rewards/margins": 0.10509932041168213, "rewards/rejected": -0.16002027690410614, "sft_loss": 0.549209475517273, "step": 744 }, { "epoch": 1.0773680404916848, "grad_norm": 2.8830174008099245, "learning_rate": 7.508752022539854e-06, "logits/chosen": 0.0439901128411293, "logits/rejected": 0.009160804562270641, "logps/chosen": -0.6056515574455261, "logps/rejected": -2.096651792526245, "loss": 0.6268, "odds_ratio_loss": 0.4086237847805023, "rewards/accuracies": 0.6875, "rewards/chosen": -0.06056516245007515, "rewards/margins": 0.14910002052783966, "rewards/rejected": -0.2096651792526245, "sft_loss": 0.6056515574455261, "step": 745 }, { "epoch": 1.0788141720896602, "grad_norm": 7.200080989258661, "learning_rate": 7.507259688165195e-06, "logits/chosen": 0.06999029964208603, "logits/rejected": 0.10017455369234085, "logps/chosen": -0.5238281488418579, "logps/rejected": -2.3946316242218018, "loss": 0.5687, "odds_ratio_loss": 0.28676459193229675, "rewards/accuracies": 0.875, "rewards/chosen": -0.05238281562924385, "rewards/margins": 0.18708035349845886, "rewards/rejected": -0.2394631803035736, "sft_loss": 0.5238281488418579, "step": 746 }, { "epoch": 1.0802603036876355, "grad_norm": 2.443321073791897, "learning_rate": 7.5057652392506066e-06, "logits/chosen": 0.2201310396194458, "logits/rejected": 0.10155685991048813, "logps/chosen": -0.5948346853256226, "logps/rejected": -1.850401759147644, "loss": 0.6237, "odds_ratio_loss": 0.38340234756469727, "rewards/accuracies": 0.8125, "rewards/chosen": -0.059483468532562256, "rewards/margins": 0.12555670738220215, "rewards/rejected": -0.1850401759147644, "sft_loss": 0.5948346853256226, "step": 747 }, { "epoch": 1.081706435285611, "grad_norm": 2.7951657976898896, "learning_rate": 7.504268676697099e-06, "logits/chosen": 0.15670828521251678, "logits/rejected": 0.20218618214130402, "logps/chosen": -0.4286814332008362, "logps/rejected": -2.867215156555176, "loss": 0.5752, "odds_ratio_loss": 0.1948789358139038, "rewards/accuracies": 1.0, "rewards/chosen": -0.04286814481019974, "rewards/margins": 0.2438533902168274, "rewards/rejected": -0.28672152757644653, "sft_loss": 0.4286814332008362, "step": 748 }, { "epoch": 1.0831525668835864, "grad_norm": 3.036970670942331, "learning_rate": 7.502770001406956e-06, "logits/chosen": -0.09760545194149017, "logits/rejected": 0.01785401999950409, "logps/chosen": -0.5453016757965088, "logps/rejected": -2.2281641960144043, "loss": 0.6201, "odds_ratio_loss": 0.2581316828727722, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05453016608953476, "rewards/margins": 0.16828623414039612, "rewards/rejected": -0.22281640768051147, "sft_loss": 0.5453016757965088, "step": 749 }, { "epoch": 1.0845986984815619, "grad_norm": 2.6823863807236212, "learning_rate": 7.501269214283732e-06, "logits/chosen": 0.018087085336446762, "logits/rejected": 0.0732310563325882, "logps/chosen": -0.8942903280258179, "logps/rejected": -1.6640185117721558, "loss": 0.6898, "odds_ratio_loss": 0.4530031085014343, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08942904323339462, "rewards/margins": 0.07697281241416931, "rewards/rejected": -0.16640184819698334, "sft_loss": 0.8942903280258179, "step": 750 }, { "epoch": 1.0860448300795373, "grad_norm": 2.3463543940654468, "learning_rate": 7.499766316232259e-06, "logits/chosen": 0.04176183417439461, "logits/rejected": 0.04234351962804794, "logps/chosen": -0.5866740942001343, "logps/rejected": -1.2051106691360474, "loss": 0.6237, "odds_ratio_loss": 0.3826124668121338, "rewards/accuracies": 0.875, "rewards/chosen": -0.058667413890361786, "rewards/margins": 0.06184365600347519, "rewards/rejected": -0.12051106244325638, "sft_loss": 0.5866740942001343, "step": 751 }, { "epoch": 1.0874909616775126, "grad_norm": 2.638412242735269, "learning_rate": 7.49826130815864e-06, "logits/chosen": 0.11069029569625854, "logits/rejected": 0.10534046590328217, "logps/chosen": -0.4467243254184723, "logps/rejected": -2.7915725708007812, "loss": 0.6219, "odds_ratio_loss": 0.1533384621143341, "rewards/accuracies": 1.0, "rewards/chosen": -0.04467243328690529, "rewards/margins": 0.23448482155799866, "rewards/rejected": -0.27915725111961365, "sft_loss": 0.4467243254184723, "step": 752 }, { "epoch": 1.088937093275488, "grad_norm": 2.062633106407356, "learning_rate": 7.496754190970249e-06, "logits/chosen": 0.2177714705467224, "logits/rejected": 0.1637977957725525, "logps/chosen": -0.5180121064186096, "logps/rejected": -2.5437209606170654, "loss": 0.6282, "odds_ratio_loss": 0.25341200828552246, "rewards/accuracies": 0.875, "rewards/chosen": -0.05180121213197708, "rewards/margins": 0.20257088541984558, "rewards/rejected": -0.25437209010124207, "sft_loss": 0.5180121064186096, "step": 753 }, { "epoch": 1.0903832248734635, "grad_norm": 2.5677209683454394, "learning_rate": 7.495244965575734e-06, "logits/chosen": 0.11416684091091156, "logits/rejected": 0.13222752511501312, "logps/chosen": -0.4176730215549469, "logps/rejected": -2.855869770050049, "loss": 0.499, "odds_ratio_loss": 0.3281542658805847, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04176730290055275, "rewards/margins": 0.2438196837902069, "rewards/rejected": -0.28558698296546936, "sft_loss": 0.4176730215549469, "step": 754 }, { "epoch": 1.091829356471439, "grad_norm": 2.6069186629139196, "learning_rate": 7.49373363288501e-06, "logits/chosen": -0.022992167621850967, "logits/rejected": 0.004823219031095505, "logps/chosen": -0.7831302881240845, "logps/rejected": -1.6426312923431396, "loss": 0.5694, "odds_ratio_loss": 0.41426563262939453, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07831303775310516, "rewards/margins": 0.0859500989317894, "rewards/rejected": -0.16426314413547516, "sft_loss": 0.7831302881240845, "step": 755 }, { "epoch": 1.0932754880694142, "grad_norm": 3.6538860464685903, "learning_rate": 7.492220193809267e-06, "logits/chosen": 0.08986733853816986, "logits/rejected": 0.05370461195707321, "logps/chosen": -0.5365122556686401, "logps/rejected": -3.8356852531433105, "loss": 0.6559, "odds_ratio_loss": 0.2820737659931183, "rewards/accuracies": 0.875, "rewards/chosen": -0.05365122854709625, "rewards/margins": 0.329917311668396, "rewards/rejected": -0.38356852531433105, "sft_loss": 0.5365122556686401, "step": 756 }, { "epoch": 1.0947216196673897, "grad_norm": 2.3375387449236524, "learning_rate": 7.490704649260963e-06, "logits/chosen": 0.13479389250278473, "logits/rejected": 0.2163029909133911, "logps/chosen": -0.3963888883590698, "logps/rejected": -2.7100229263305664, "loss": 0.5486, "odds_ratio_loss": 0.16003967821598053, "rewards/accuracies": 1.0, "rewards/chosen": -0.03963889181613922, "rewards/margins": 0.23136338591575623, "rewards/rejected": -0.27100229263305664, "sft_loss": 0.3963888883590698, "step": 757 }, { "epoch": 1.0961677512653651, "grad_norm": 2.7826733677141187, "learning_rate": 7.489187000153825e-06, "logits/chosen": 0.09706145524978638, "logits/rejected": 0.16182109713554382, "logps/chosen": -0.4988633096218109, "logps/rejected": -2.425071954727173, "loss": 0.6046, "odds_ratio_loss": 0.2742428183555603, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04988633841276169, "rewards/margins": 0.1926208883523941, "rewards/rejected": -0.2425072193145752, "sft_loss": 0.4988633096218109, "step": 758 }, { "epoch": 1.0976138828633406, "grad_norm": 2.7639210117774953, "learning_rate": 7.48766724740285e-06, "logits/chosen": 0.12796220183372498, "logits/rejected": 0.06461986899375916, "logps/chosen": -0.5522404909133911, "logps/rejected": -2.7422850131988525, "loss": 0.5448, "odds_ratio_loss": 0.33710646629333496, "rewards/accuracies": 0.875, "rewards/chosen": -0.05522404983639717, "rewards/margins": 0.21900448203086853, "rewards/rejected": -0.2742285132408142, "sft_loss": 0.5522404909133911, "step": 759 }, { "epoch": 1.099060014461316, "grad_norm": 3.8969453348992515, "learning_rate": 7.486145391924301e-06, "logits/chosen": 0.05818912759423256, "logits/rejected": 0.05282029137015343, "logps/chosen": -0.6544702649116516, "logps/rejected": -2.393183708190918, "loss": 0.6318, "odds_ratio_loss": 0.40100622177124023, "rewards/accuracies": 0.75, "rewards/chosen": -0.06544703245162964, "rewards/margins": 0.17387133836746216, "rewards/rejected": -0.2393183708190918, "sft_loss": 0.6544702649116516, "step": 760 }, { "epoch": 1.1005061460592913, "grad_norm": 2.7628775670375063, "learning_rate": 7.4846214346357125e-06, "logits/chosen": 0.05507563427090645, "logits/rejected": 0.0078296959400177, "logps/chosen": -0.527977466583252, "logps/rejected": -2.508857488632202, "loss": 0.5549, "odds_ratio_loss": 0.32088613510131836, "rewards/accuracies": 0.9375, "rewards/chosen": -0.052797745913267136, "rewards/margins": 0.19808803498744965, "rewards/rejected": -0.2508857548236847, "sft_loss": 0.527977466583252, "step": 761 }, { "epoch": 1.1019522776572668, "grad_norm": 2.542282535738457, "learning_rate": 7.483095376455884e-06, "logits/chosen": 0.29207634925842285, "logits/rejected": 0.18721196055412292, "logps/chosen": -0.42134177684783936, "logps/rejected": -3.063565492630005, "loss": 0.5765, "odds_ratio_loss": 0.17249009013175964, "rewards/accuracies": 0.9375, "rewards/chosen": -0.042134176939725876, "rewards/margins": 0.2642224133014679, "rewards/rejected": -0.3063565790653229, "sft_loss": 0.42134177684783936, "step": 762 }, { "epoch": 1.1033984092552422, "grad_norm": 3.9252015141252015, "learning_rate": 7.481567218304878e-06, "logits/chosen": 0.16148284077644348, "logits/rejected": 0.16037528216838837, "logps/chosen": -0.37327027320861816, "logps/rejected": -2.7958102226257324, "loss": 0.5319, "odds_ratio_loss": 0.21088604629039764, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03732702508568764, "rewards/margins": 0.24225401878356934, "rewards/rejected": -0.2795810401439667, "sft_loss": 0.37327027320861816, "step": 763 }, { "epoch": 1.1048445408532177, "grad_norm": 4.315528891824625, "learning_rate": 7.480036961104031e-06, "logits/chosen": -0.07114061713218689, "logits/rejected": 0.013804474845528603, "logps/chosen": -0.6288735866546631, "logps/rejected": -2.1085119247436523, "loss": 0.6512, "odds_ratio_loss": 0.39338162541389465, "rewards/accuracies": 0.75, "rewards/chosen": -0.06288735568523407, "rewards/margins": 0.14796383678913116, "rewards/rejected": -0.21085119247436523, "sft_loss": 0.6288735866546631, "step": 764 }, { "epoch": 1.106290672451193, "grad_norm": 3.0374677766883047, "learning_rate": 7.478504605775938e-06, "logits/chosen": 0.09024432301521301, "logits/rejected": 0.11920122802257538, "logps/chosen": -0.5411593914031982, "logps/rejected": -1.7794485092163086, "loss": 0.5823, "odds_ratio_loss": 0.328127384185791, "rewards/accuracies": 0.8125, "rewards/chosen": -0.054115939885377884, "rewards/margins": 0.12382891029119492, "rewards/rejected": -0.1779448539018631, "sft_loss": 0.5411593914031982, "step": 765 }, { "epoch": 1.1077368040491684, "grad_norm": 2.8560879970949675, "learning_rate": 7.476970153244463e-06, "logits/chosen": 0.13054589927196503, "logits/rejected": 0.06883732229471207, "logps/chosen": -0.6385205388069153, "logps/rejected": -1.957822561264038, "loss": 0.5768, "odds_ratio_loss": 0.41158097982406616, "rewards/accuracies": 0.6875, "rewards/chosen": -0.06385205686092377, "rewards/margins": 0.13193020224571228, "rewards/rejected": -0.19578225910663605, "sft_loss": 0.6385205388069153, "step": 766 }, { "epoch": 1.1091829356471439, "grad_norm": 3.8575247023602643, "learning_rate": 7.475433604434734e-06, "logits/chosen": 0.09822223335504532, "logits/rejected": -0.0020382339134812355, "logps/chosen": -0.553131103515625, "logps/rejected": -2.6227946281433105, "loss": 0.6335, "odds_ratio_loss": 0.3468872308731079, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0553131066262722, "rewards/margins": 0.2069663405418396, "rewards/rejected": -0.2622794508934021, "sft_loss": 0.553131103515625, "step": 767 }, { "epoch": 1.1106290672451193, "grad_norm": 2.4229550650167684, "learning_rate": 7.47389496027314e-06, "logits/chosen": 0.013593245297670364, "logits/rejected": 0.09762951731681824, "logps/chosen": -0.38503605127334595, "logps/rejected": -3.5363712310791016, "loss": 0.5369, "odds_ratio_loss": 0.1493338942527771, "rewards/accuracies": 1.0, "rewards/chosen": -0.038503602147102356, "rewards/margins": 0.3151334822177887, "rewards/rejected": -0.35363709926605225, "sft_loss": 0.38503605127334595, "step": 768 }, { "epoch": 1.1120751988430948, "grad_norm": 2.434271111123222, "learning_rate": 7.472354221687337e-06, "logits/chosen": 0.11509215831756592, "logits/rejected": -0.01175488531589508, "logps/chosen": -0.5181520581245422, "logps/rejected": -2.47695255279541, "loss": 0.5855, "odds_ratio_loss": 0.24836497008800507, "rewards/accuracies": 1.0, "rewards/chosen": -0.051815204322338104, "rewards/margins": 0.19588005542755127, "rewards/rejected": -0.24769528210163116, "sft_loss": 0.5181520581245422, "step": 769 }, { "epoch": 1.11352133044107, "grad_norm": 3.6024883785529798, "learning_rate": 7.470811389606241e-06, "logits/chosen": 0.1907249093055725, "logits/rejected": 0.28744474053382874, "logps/chosen": -0.3892439901828766, "logps/rejected": -2.2846317291259766, "loss": 0.5206, "odds_ratio_loss": 0.24296937882900238, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03892439603805542, "rewards/margins": 0.18953877687454224, "rewards/rejected": -0.22846317291259766, "sft_loss": 0.3892439901828766, "step": 770 }, { "epoch": 1.1149674620390455, "grad_norm": 2.9734534681452938, "learning_rate": 7.469266464960032e-06, "logits/chosen": -0.009543132036924362, "logits/rejected": 0.025686249136924744, "logps/chosen": -0.5033233761787415, "logps/rejected": -3.2725722789764404, "loss": 0.5662, "odds_ratio_loss": 0.23238584399223328, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05033233389258385, "rewards/margins": 0.27692487835884094, "rewards/rejected": -0.3272572159767151, "sft_loss": 0.5033233761787415, "step": 771 }, { "epoch": 1.116413593637021, "grad_norm": 2.6868039101284205, "learning_rate": 7.4677194486801504e-06, "logits/chosen": 0.0987258031964302, "logits/rejected": 0.18745985627174377, "logps/chosen": -0.5120230913162231, "logps/rejected": -1.8366214036941528, "loss": 0.5464, "odds_ratio_loss": 0.3096643090248108, "rewards/accuracies": 0.9375, "rewards/chosen": -0.051202308386564255, "rewards/margins": 0.1324598342180252, "rewards/rejected": -0.18366214632987976, "sft_loss": 0.5120230913162231, "step": 772 }, { "epoch": 1.1178597252349964, "grad_norm": 3.2648658058180646, "learning_rate": 7.466170341699298e-06, "logits/chosen": 0.12391200661659241, "logits/rejected": 0.07651515305042267, "logps/chosen": -0.5479176640510559, "logps/rejected": -2.7681431770324707, "loss": 0.6005, "odds_ratio_loss": 0.3141166567802429, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05479177087545395, "rewards/margins": 0.22202253341674805, "rewards/rejected": -0.2768143117427826, "sft_loss": 0.5479176640510559, "step": 773 }, { "epoch": 1.119305856832972, "grad_norm": 2.1969045008343535, "learning_rate": 7.464619144951436e-06, "logits/chosen": 0.1988317370414734, "logits/rejected": 0.11384803801774979, "logps/chosen": -0.6396727561950684, "logps/rejected": -2.1379432678222656, "loss": 0.5548, "odds_ratio_loss": 0.4122881293296814, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0639672800898552, "rewards/margins": 0.14982706308364868, "rewards/rejected": -0.21379435062408447, "sft_loss": 0.6396727561950684, "step": 774 }, { "epoch": 1.1207519884309471, "grad_norm": 2.828734292348446, "learning_rate": 7.463065859371789e-06, "logits/chosen": 0.024105386808514595, "logits/rejected": -0.05560684576630592, "logps/chosen": -0.58314049243927, "logps/rejected": -2.818758487701416, "loss": 0.5745, "odds_ratio_loss": 0.41460278630256653, "rewards/accuracies": 0.75, "rewards/chosen": -0.05831405147910118, "rewards/margins": 0.22356180846691132, "rewards/rejected": -0.2818758487701416, "sft_loss": 0.58314049243927, "step": 775 }, { "epoch": 1.1221981200289226, "grad_norm": 2.5439099335428206, "learning_rate": 7.461510485896838e-06, "logits/chosen": 0.2544099688529968, "logits/rejected": 0.20363222062587738, "logps/chosen": -0.4311085343360901, "logps/rejected": -2.491203546524048, "loss": 0.6513, "odds_ratio_loss": 0.28176209330558777, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04311085492372513, "rewards/margins": 0.20600950717926025, "rewards/rejected": -0.24912036955356598, "sft_loss": 0.4311085343360901, "step": 776 }, { "epoch": 1.123644251626898, "grad_norm": 3.5321805261897374, "learning_rate": 7.4599530254643205e-06, "logits/chosen": 0.054844632744789124, "logits/rejected": 0.0795421302318573, "logps/chosen": -0.49303340911865234, "logps/rejected": -2.2153918743133545, "loss": 0.6372, "odds_ratio_loss": 0.3127673864364624, "rewards/accuracies": 0.8125, "rewards/chosen": -0.049303337931632996, "rewards/margins": 0.1722358614206314, "rewards/rejected": -0.22153916954994202, "sft_loss": 0.49303340911865234, "step": 777 }, { "epoch": 1.1250903832248735, "grad_norm": 9.818583319726136, "learning_rate": 7.45839347901324e-06, "logits/chosen": 0.10133037716150284, "logits/rejected": 0.08598774671554565, "logps/chosen": -0.41913947463035583, "logps/rejected": -3.2657482624053955, "loss": 0.596, "odds_ratio_loss": 0.2532481849193573, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0419139489531517, "rewards/margins": 0.28466087579727173, "rewards/rejected": -0.3265748620033264, "sft_loss": 0.41913947463035583, "step": 778 }, { "epoch": 1.126536514822849, "grad_norm": 2.5273673577209186, "learning_rate": 7.45683184748385e-06, "logits/chosen": 0.10568895936012268, "logits/rejected": 0.07520011067390442, "logps/chosen": -0.4639187753200531, "logps/rejected": -4.35772180557251, "loss": 0.5471, "odds_ratio_loss": 0.18750569224357605, "rewards/accuracies": 1.0, "rewards/chosen": -0.04639187827706337, "rewards/margins": 0.3893802762031555, "rewards/rejected": -0.4357721507549286, "sft_loss": 0.4639187753200531, "step": 779 }, { "epoch": 1.1279826464208242, "grad_norm": 3.3847158927408816, "learning_rate": 7.455268131817664e-06, "logits/chosen": 0.11961061507463455, "logits/rejected": 0.11430468410253525, "logps/chosen": -0.4800012707710266, "logps/rejected": -1.999869704246521, "loss": 0.491, "odds_ratio_loss": 0.2718590497970581, "rewards/accuracies": 0.875, "rewards/chosen": -0.04800013080239296, "rewards/margins": 0.15198683738708496, "rewards/rejected": -0.19998696446418762, "sft_loss": 0.4800012707710266, "step": 780 }, { "epoch": 1.1294287780187997, "grad_norm": 2.26973956764143, "learning_rate": 7.453702332957454e-06, "logits/chosen": 0.0754300132393837, "logits/rejected": 0.048578303307294846, "logps/chosen": -0.6013798713684082, "logps/rejected": -1.9893176555633545, "loss": 0.5393, "odds_ratio_loss": 0.4172818660736084, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06013799458742142, "rewards/margins": 0.13879379630088806, "rewards/rejected": -0.19893178343772888, "sft_loss": 0.6013798713684082, "step": 781 }, { "epoch": 1.1308749096167752, "grad_norm": 3.0760734312283624, "learning_rate": 7.452134451847243e-06, "logits/chosen": 0.1234007477760315, "logits/rejected": 0.10730813443660736, "logps/chosen": -0.5656208395957947, "logps/rejected": -2.4852113723754883, "loss": 0.5875, "odds_ratio_loss": 0.31027752161026, "rewards/accuracies": 0.9375, "rewards/chosen": -0.056562088429927826, "rewards/margins": 0.19195906817913055, "rewards/rejected": -0.24852114915847778, "sft_loss": 0.5656208395957947, "step": 782 }, { "epoch": 1.1323210412147506, "grad_norm": 3.3526925530268326, "learning_rate": 7.450564489432315e-06, "logits/chosen": 0.04157966375350952, "logits/rejected": 0.0403023287653923, "logps/chosen": -0.6040517091751099, "logps/rejected": -1.2740135192871094, "loss": 0.6748, "odds_ratio_loss": 0.41726475954055786, "rewards/accuracies": 0.8125, "rewards/chosen": -0.060405176132917404, "rewards/margins": 0.06699618697166443, "rewards/rejected": -0.12740135192871094, "sft_loss": 0.6040517091751099, "step": 783 }, { "epoch": 1.1337671728127259, "grad_norm": 3.0802366326947204, "learning_rate": 7.448992446659204e-06, "logits/chosen": 0.14116020500659943, "logits/rejected": 0.15150752663612366, "logps/chosen": -0.6432009339332581, "logps/rejected": -3.107823133468628, "loss": 0.624, "odds_ratio_loss": 0.2854066789150238, "rewards/accuracies": 0.875, "rewards/chosen": -0.06432008743286133, "rewards/margins": 0.24646221101284027, "rewards/rejected": -0.3107823133468628, "sft_loss": 0.6432009339332581, "step": 784 }, { "epoch": 1.1352133044107013, "grad_norm": 2.9008034505623685, "learning_rate": 7.447418324475702e-06, "logits/chosen": -0.07773000001907349, "logits/rejected": 0.016000304371118546, "logps/chosen": -0.4436779022216797, "logps/rejected": -3.0289247035980225, "loss": 0.5249, "odds_ratio_loss": 0.2261512130498886, "rewards/accuracies": 0.875, "rewards/chosen": -0.04436779022216797, "rewards/margins": 0.25852468609809875, "rewards/rejected": -0.3028924763202667, "sft_loss": 0.4436779022216797, "step": 785 }, { "epoch": 1.1366594360086768, "grad_norm": 2.5909707064380596, "learning_rate": 7.445842123830853e-06, "logits/chosen": -0.016650903970003128, "logits/rejected": -0.010309025645256042, "logps/chosen": -0.5804932117462158, "logps/rejected": -1.5151318311691284, "loss": 0.5625, "odds_ratio_loss": 0.3440878689289093, "rewards/accuracies": 0.8125, "rewards/chosen": -0.05804932489991188, "rewards/margins": 0.09346386790275574, "rewards/rejected": -0.15151318907737732, "sft_loss": 0.5804932117462158, "step": 786 }, { "epoch": 1.1381055676066523, "grad_norm": 2.2178481077560717, "learning_rate": 7.444263845674953e-06, "logits/chosen": 0.03409305587410927, "logits/rejected": 0.06542815268039703, "logps/chosen": -0.7013732194900513, "logps/rejected": -2.4514522552490234, "loss": 0.5767, "odds_ratio_loss": 0.32921674847602844, "rewards/accuracies": 0.875, "rewards/chosen": -0.07013732194900513, "rewards/margins": 0.1750079095363617, "rewards/rejected": -0.24514521658420563, "sft_loss": 0.7013732194900513, "step": 787 }, { "epoch": 1.1395516992046275, "grad_norm": 3.168676722580113, "learning_rate": 7.442683490959554e-06, "logits/chosen": 0.009714031592011452, "logits/rejected": 0.07314697653055191, "logps/chosen": -0.7520695924758911, "logps/rejected": -1.4773638248443604, "loss": 0.6163, "odds_ratio_loss": 0.3795466721057892, "rewards/accuracies": 0.875, "rewards/chosen": -0.07520696520805359, "rewards/margins": 0.07252941280603409, "rewards/rejected": -0.14773637056350708, "sft_loss": 0.7520695924758911, "step": 788 }, { "epoch": 1.140997830802603, "grad_norm": 2.231023146887912, "learning_rate": 7.441101060637456e-06, "logits/chosen": 0.011076090857386589, "logits/rejected": 0.0065444353967905045, "logps/chosen": -0.4639368951320648, "logps/rejected": -2.8974857330322266, "loss": 0.5276, "odds_ratio_loss": 0.358600378036499, "rewards/accuracies": 0.875, "rewards/chosen": -0.04639369249343872, "rewards/margins": 0.2433549016714096, "rewards/rejected": -0.2897486090660095, "sft_loss": 0.4639368951320648, "step": 789 }, { "epoch": 1.1424439624005784, "grad_norm": 5.751344294959958, "learning_rate": 7.4395165556627115e-06, "logits/chosen": 0.044419676065444946, "logits/rejected": 0.0467277392745018, "logps/chosen": -0.5675772428512573, "logps/rejected": -3.3737549781799316, "loss": 0.6823, "odds_ratio_loss": 0.3221891224384308, "rewards/accuracies": 0.875, "rewards/chosen": -0.05675772577524185, "rewards/margins": 0.28061774373054504, "rewards/rejected": -0.3373754918575287, "sft_loss": 0.5675772428512573, "step": 790 }, { "epoch": 1.143890093998554, "grad_norm": 3.157815374086038, "learning_rate": 7.437929976990625e-06, "logits/chosen": 0.07242487370967865, "logits/rejected": 0.06932821869850159, "logps/chosen": -0.6816798448562622, "logps/rejected": -2.0356240272521973, "loss": 0.6112, "odds_ratio_loss": 0.31875520944595337, "rewards/accuracies": 0.9375, "rewards/chosen": -0.06816798448562622, "rewards/margins": 0.1353944092988968, "rewards/rejected": -0.203562393784523, "sft_loss": 0.6816798448562622, "step": 791 }, { "epoch": 1.1453362255965294, "grad_norm": 2.399288961917676, "learning_rate": 7.436341325577753e-06, "logits/chosen": 0.006493567489087582, "logits/rejected": 0.05797387287020683, "logps/chosen": -0.5998603701591492, "logps/rejected": -1.7945928573608398, "loss": 0.5849, "odds_ratio_loss": 0.2764272689819336, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05998603627085686, "rewards/margins": 0.11947324872016907, "rewards/rejected": -0.17945930361747742, "sft_loss": 0.5998603701591492, "step": 792 }, { "epoch": 1.1467823571945046, "grad_norm": 4.92991125737335, "learning_rate": 7.434750602381896e-06, "logits/chosen": -0.006598275154829025, "logits/rejected": 0.03047710284590721, "logps/chosen": -0.6850670576095581, "logps/rejected": -2.417117118835449, "loss": 0.6757, "odds_ratio_loss": 0.5608515739440918, "rewards/accuracies": 0.75, "rewards/chosen": -0.06850671023130417, "rewards/margins": 0.17320501804351807, "rewards/rejected": -0.24171173572540283, "sft_loss": 0.6850670576095581, "step": 793 }, { "epoch": 1.14822848879248, "grad_norm": 3.028327047199182, "learning_rate": 7.433157808362109e-06, "logits/chosen": 0.031274694949388504, "logits/rejected": 0.07910322397947311, "logps/chosen": -0.4861868619918823, "logps/rejected": -2.6531078815460205, "loss": 0.563, "odds_ratio_loss": 0.21675142645835876, "rewards/accuracies": 1.0, "rewards/chosen": -0.04861868917942047, "rewards/margins": 0.21669211983680725, "rewards/rejected": -0.26531079411506653, "sft_loss": 0.4861868619918823, "step": 794 }, { "epoch": 1.1496746203904555, "grad_norm": 2.7428039098034787, "learning_rate": 7.4315629444786934e-06, "logits/chosen": 0.13038958609104156, "logits/rejected": 0.16042017936706543, "logps/chosen": -0.6496018171310425, "logps/rejected": -2.7725348472595215, "loss": 0.6591, "odds_ratio_loss": 0.3653820753097534, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06496018916368484, "rewards/margins": 0.21229329705238342, "rewards/rejected": -0.27725350856781006, "sft_loss": 0.6496018171310425, "step": 795 }, { "epoch": 1.151120751988431, "grad_norm": 2.3190293075287585, "learning_rate": 7.429966011693198e-06, "logits/chosen": 0.09569834172725677, "logits/rejected": 0.13140498101711273, "logps/chosen": -0.5810202360153198, "logps/rejected": -1.6585171222686768, "loss": 0.6426, "odds_ratio_loss": 0.25168225169181824, "rewards/accuracies": 1.0, "rewards/chosen": -0.05810202658176422, "rewards/margins": 0.10774968564510345, "rewards/rejected": -0.16585171222686768, "sft_loss": 0.5810202360153198, "step": 796 }, { "epoch": 1.1525668835864065, "grad_norm": 2.3257384032215023, "learning_rate": 7.428367010968418e-06, "logits/chosen": 0.17940768599510193, "logits/rejected": 0.1755165457725525, "logps/chosen": -0.4974741041660309, "logps/rejected": -2.3814783096313477, "loss": 0.575, "odds_ratio_loss": 0.2930542230606079, "rewards/accuracies": 0.875, "rewards/chosen": -0.04974740743637085, "rewards/margins": 0.18840041756629944, "rewards/rejected": -0.2381478101015091, "sft_loss": 0.4974741041660309, "step": 797 }, { "epoch": 1.1540130151843817, "grad_norm": 2.33840083888642, "learning_rate": 7.4267659432684e-06, "logits/chosen": 0.06194993481040001, "logits/rejected": 0.02340729907155037, "logps/chosen": -0.5812244415283203, "logps/rejected": -3.1519532203674316, "loss": 0.6231, "odds_ratio_loss": 0.27667036652565, "rewards/accuracies": 0.875, "rewards/chosen": -0.05812244117259979, "rewards/margins": 0.2570728659629822, "rewards/rejected": -0.31519532203674316, "sft_loss": 0.5812244415283203, "step": 798 }, { "epoch": 1.1554591467823572, "grad_norm": 3.148046088506335, "learning_rate": 7.4251628095584325e-06, "logits/chosen": -0.011250527575612068, "logits/rejected": 0.017998933792114258, "logps/chosen": -0.693738579750061, "logps/rejected": -1.7688831090927124, "loss": 0.6544, "odds_ratio_loss": 0.3419437110424042, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06937386095523834, "rewards/margins": 0.10751446336507797, "rewards/rejected": -0.17688831686973572, "sft_loss": 0.693738579750061, "step": 799 }, { "epoch": 1.1569052783803326, "grad_norm": 2.437296250008247, "learning_rate": 7.4235576108050495e-06, "logits/chosen": 0.1309782713651657, "logits/rejected": 0.13392174243927002, "logps/chosen": -0.4447363615036011, "logps/rejected": -3.2417688369750977, "loss": 0.5679, "odds_ratio_loss": 0.36193907260894775, "rewards/accuracies": 0.8125, "rewards/chosen": -0.044473640620708466, "rewards/margins": 0.2797032594680786, "rewards/rejected": -0.3241769075393677, "sft_loss": 0.4447363615036011, "step": 800 }, { "epoch": 1.158351409978308, "grad_norm": 2.7438204360634346, "learning_rate": 7.4219503479760325e-06, "logits/chosen": 0.15036681294441223, "logits/rejected": 0.11659523844718933, "logps/chosen": -0.4682498574256897, "logps/rejected": -2.816537380218506, "loss": 0.5587, "odds_ratio_loss": 0.28212669491767883, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04682498425245285, "rewards/margins": 0.23482874035835266, "rewards/rejected": -0.2816537320613861, "sft_loss": 0.4682498574256897, "step": 801 }, { "epoch": 1.1597975415762836, "grad_norm": 2.2929808725485543, "learning_rate": 7.420341022040405e-06, "logits/chosen": 0.018691712990403175, "logits/rejected": 0.004425849765539169, "logps/chosen": -0.5483542084693909, "logps/rejected": -3.6711413860321045, "loss": 0.5833, "odds_ratio_loss": 0.2860555350780487, "rewards/accuracies": 0.875, "rewards/chosen": -0.05483543127775192, "rewards/margins": 0.31227871775627136, "rewards/rejected": -0.3671141266822815, "sft_loss": 0.5483542084693909, "step": 802 }, { "epoch": 1.1612436731742588, "grad_norm": 3.1911949847406413, "learning_rate": 7.418729633968439e-06, "logits/chosen": 0.16952145099639893, "logits/rejected": 0.15931960940361023, "logps/chosen": -0.4818491041660309, "logps/rejected": -2.875197410583496, "loss": 0.6413, "odds_ratio_loss": 0.2592041790485382, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04818491265177727, "rewards/margins": 0.239334836602211, "rewards/rejected": -0.28751975297927856, "sft_loss": 0.4818491041660309, "step": 803 }, { "epoch": 1.1626898047722343, "grad_norm": 2.459591917048973, "learning_rate": 7.4171161847316424e-06, "logits/chosen": 0.12029114365577698, "logits/rejected": 0.16648927330970764, "logps/chosen": -0.6787177920341492, "logps/rejected": -2.847053050994873, "loss": 0.6632, "odds_ratio_loss": 0.30066680908203125, "rewards/accuracies": 0.875, "rewards/chosen": -0.06787177175283432, "rewards/margins": 0.21683353185653687, "rewards/rejected": -0.2847052812576294, "sft_loss": 0.6787177920341492, "step": 804 }, { "epoch": 1.1641359363702097, "grad_norm": 2.819839180987813, "learning_rate": 7.4155006753027715e-06, "logits/chosen": 0.21553045511245728, "logits/rejected": 0.16459545493125916, "logps/chosen": -0.5797298550605774, "logps/rejected": -2.6545183658599854, "loss": 0.6086, "odds_ratio_loss": 0.349483847618103, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0579729899764061, "rewards/margins": 0.2074788510799408, "rewards/rejected": -0.2654518485069275, "sft_loss": 0.5797298550605774, "step": 805 }, { "epoch": 1.1655820679681852, "grad_norm": 3.649699933535626, "learning_rate": 7.413883106655823e-06, "logits/chosen": 0.1408543586730957, "logits/rejected": 0.151872456073761, "logps/chosen": -0.5444905757904053, "logps/rejected": -1.8509986400604248, "loss": 0.5559, "odds_ratio_loss": 0.28263750672340393, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05444905534386635, "rewards/margins": 0.1306508183479309, "rewards/rejected": -0.18509986996650696, "sft_loss": 0.5444905757904053, "step": 806 }, { "epoch": 1.1670281995661604, "grad_norm": 2.8521602435109106, "learning_rate": 7.412263479766034e-06, "logits/chosen": 0.0004207249730825424, "logits/rejected": 0.07325251400470734, "logps/chosen": -0.6310700178146362, "logps/rejected": -2.819793939590454, "loss": 0.5759, "odds_ratio_loss": 0.371545672416687, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06310699880123138, "rewards/margins": 0.21887239813804626, "rewards/rejected": -0.28197938203811646, "sft_loss": 0.6310700178146362, "step": 807 }, { "epoch": 1.168474331164136, "grad_norm": 2.9617169421526275, "learning_rate": 7.410641795609885e-06, "logits/chosen": 0.32813215255737305, "logits/rejected": 0.11179277300834656, "logps/chosen": -0.6126865148544312, "logps/rejected": -2.45556378364563, "loss": 0.5838, "odds_ratio_loss": 0.2806455194950104, "rewards/accuracies": 0.875, "rewards/chosen": -0.061268653720617294, "rewards/margins": 0.18428772687911987, "rewards/rejected": -0.24555638432502747, "sft_loss": 0.6126865148544312, "step": 808 }, { "epoch": 1.1699204627621114, "grad_norm": 3.51193121019222, "learning_rate": 7.409018055165095e-06, "logits/chosen": 0.24320703744888306, "logits/rejected": 0.14701107144355774, "logps/chosen": -0.5555135011672974, "logps/rejected": -3.6711976528167725, "loss": 0.6075, "odds_ratio_loss": 0.30618131160736084, "rewards/accuracies": 0.8125, "rewards/chosen": -0.055551350116729736, "rewards/margins": 0.3115684390068054, "rewards/rejected": -0.36711978912353516, "sft_loss": 0.5555135011672974, "step": 809 }, { "epoch": 1.1713665943600868, "grad_norm": 2.854322284446154, "learning_rate": 7.407392259410623e-06, "logits/chosen": 0.11030896008014679, "logits/rejected": 0.07109300792217255, "logps/chosen": -0.535825252532959, "logps/rejected": -3.356602907180786, "loss": 0.6089, "odds_ratio_loss": 0.37696099281311035, "rewards/accuracies": 0.75, "rewards/chosen": -0.05358252674341202, "rewards/margins": 0.28207775950431824, "rewards/rejected": -0.33566030859947205, "sft_loss": 0.535825252532959, "step": 810 }, { "epoch": 1.172812725958062, "grad_norm": 2.6810454261200536, "learning_rate": 7.405764409326668e-06, "logits/chosen": 0.024707181379199028, "logits/rejected": 0.15931369364261627, "logps/chosen": -0.5429731607437134, "logps/rejected": -2.546696186065674, "loss": 0.6071, "odds_ratio_loss": 0.2758614718914032, "rewards/accuracies": 0.9375, "rewards/chosen": -0.054297320544719696, "rewards/margins": 0.20037232339382172, "rewards/rejected": -0.2546696364879608, "sft_loss": 0.5429731607437134, "step": 811 }, { "epoch": 1.1742588575560375, "grad_norm": 2.8967656734297815, "learning_rate": 7.404134505894665e-06, "logits/chosen": 0.18780213594436646, "logits/rejected": 0.11726965010166168, "logps/chosen": -0.44569364190101624, "logps/rejected": -2.6580047607421875, "loss": 0.5756, "odds_ratio_loss": 0.2095184624195099, "rewards/accuracies": 1.0, "rewards/chosen": -0.04456936568021774, "rewards/margins": 0.2212311178445816, "rewards/rejected": -0.26580050587654114, "sft_loss": 0.44569364190101624, "step": 812 }, { "epoch": 1.175704989154013, "grad_norm": 2.7702135625243844, "learning_rate": 7.40250255009729e-06, "logits/chosen": 0.09605167806148529, "logits/rejected": 0.2212832123041153, "logps/chosen": -0.4400288462638855, "logps/rejected": -2.868157148361206, "loss": 0.591, "odds_ratio_loss": 0.21612593531608582, "rewards/accuracies": 1.0, "rewards/chosen": -0.04400289058685303, "rewards/margins": 0.2428128570318222, "rewards/rejected": -0.28681573271751404, "sft_loss": 0.4400288462638855, "step": 813 }, { "epoch": 1.1771511207519885, "grad_norm": 2.589780825174727, "learning_rate": 7.400868542918457e-06, "logits/chosen": 0.23280946910381317, "logits/rejected": 0.2035413384437561, "logps/chosen": -0.6458877325057983, "logps/rejected": -2.4575438499450684, "loss": 0.5889, "odds_ratio_loss": 0.3527429699897766, "rewards/accuracies": 0.875, "rewards/chosen": -0.0645887702703476, "rewards/margins": 0.18116562068462372, "rewards/rejected": -0.2457543909549713, "sft_loss": 0.6458877325057983, "step": 814 }, { "epoch": 1.178597252349964, "grad_norm": 2.5623900104319626, "learning_rate": 7.399232485343311e-06, "logits/chosen": 0.10259507596492767, "logits/rejected": 0.1167682632803917, "logps/chosen": -0.8443832397460938, "logps/rejected": -1.3338778018951416, "loss": 0.7028, "odds_ratio_loss": 0.6011531352996826, "rewards/accuracies": 0.625, "rewards/chosen": -0.08443832397460938, "rewards/margins": 0.0489494651556015, "rewards/rejected": -0.13338778913021088, "sft_loss": 0.8443832397460938, "step": 815 }, { "epoch": 1.1800433839479392, "grad_norm": 2.5950621753937493, "learning_rate": 7.397594378358241e-06, "logits/chosen": 0.25614750385284424, "logits/rejected": 0.22538936138153076, "logps/chosen": -0.4889935255050659, "logps/rejected": -3.658846139907837, "loss": 0.5713, "odds_ratio_loss": 0.1725495159626007, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04889935255050659, "rewards/margins": 0.31698527932167053, "rewards/rejected": -0.36588460206985474, "sft_loss": 0.4889935255050659, "step": 816 }, { "epoch": 1.1814895155459146, "grad_norm": 2.562573118975313, "learning_rate": 7.395954222950866e-06, "logits/chosen": 0.12951630353927612, "logits/rejected": 0.21345645189285278, "logps/chosen": -0.39483872056007385, "logps/rejected": -2.5382630825042725, "loss": 0.5061, "odds_ratio_loss": 0.25803595781326294, "rewards/accuracies": 0.8125, "rewards/chosen": -0.03948386758565903, "rewards/margins": 0.21434244513511658, "rewards/rejected": -0.2538263201713562, "sft_loss": 0.39483872056007385, "step": 817 }, { "epoch": 1.18293564714389, "grad_norm": 2.3614280189896744, "learning_rate": 7.394312020110042e-06, "logits/chosen": 0.17008808255195618, "logits/rejected": 0.10077522695064545, "logps/chosen": -0.5804603099822998, "logps/rejected": -1.993128776550293, "loss": 0.555, "odds_ratio_loss": 0.31265169382095337, "rewards/accuracies": 0.8125, "rewards/chosen": -0.05804603174328804, "rewards/margins": 0.1412668526172638, "rewards/rejected": -0.19931289553642273, "sft_loss": 0.5804603099822998, "step": 818 }, { "epoch": 1.1843817787418656, "grad_norm": 4.26288185974698, "learning_rate": 7.392667770825859e-06, "logits/chosen": 0.12806616723537445, "logits/rejected": 0.12684689462184906, "logps/chosen": -0.6775280237197876, "logps/rejected": -2.6817450523376465, "loss": 0.6758, "odds_ratio_loss": 0.36490458250045776, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06775280088186264, "rewards/margins": 0.20042170584201813, "rewards/rejected": -0.2681744694709778, "sft_loss": 0.6775280237197876, "step": 819 }, { "epoch": 1.185827910339841, "grad_norm": 6.992906851248464, "learning_rate": 7.391021476089641e-06, "logits/chosen": 0.15885204076766968, "logits/rejected": 0.08956670761108398, "logps/chosen": -0.48606452345848083, "logps/rejected": -1.5799554586410522, "loss": 0.605, "odds_ratio_loss": 0.2400798499584198, "rewards/accuracies": 1.0, "rewards/chosen": -0.04860645532608032, "rewards/margins": 0.10938909649848938, "rewards/rejected": -0.1579955518245697, "sft_loss": 0.48606452345848083, "step": 820 }, { "epoch": 1.1872740419378163, "grad_norm": 2.8411437399780244, "learning_rate": 7.389373136893947e-06, "logits/chosen": 0.056294236332178116, "logits/rejected": 0.06409749388694763, "logps/chosen": -0.6919089555740356, "logps/rejected": -1.5181670188903809, "loss": 0.6048, "odds_ratio_loss": 0.4656546115875244, "rewards/accuracies": 0.75, "rewards/chosen": -0.06919088959693909, "rewards/margins": 0.08262581378221512, "rewards/rejected": -0.1518167108297348, "sft_loss": 0.6919089555740356, "step": 821 }, { "epoch": 1.1887201735357917, "grad_norm": 4.461930888679296, "learning_rate": 7.3877227542325645e-06, "logits/chosen": 0.15600307285785675, "logits/rejected": 0.11605434864759445, "logps/chosen": -0.47259920835494995, "logps/rejected": -2.2307705879211426, "loss": 0.569, "odds_ratio_loss": 0.34623655676841736, "rewards/accuracies": 0.875, "rewards/chosen": -0.04725992679595947, "rewards/margins": 0.17581716179847717, "rewards/rejected": -0.22307708859443665, "sft_loss": 0.47259920835494995, "step": 822 }, { "epoch": 1.1901663051337672, "grad_norm": 2.3378993173232394, "learning_rate": 7.3860703291005154e-06, "logits/chosen": -0.04555685818195343, "logits/rejected": -0.017071541398763657, "logps/chosen": -0.59052574634552, "logps/rejected": -3.355835199356079, "loss": 0.7088, "odds_ratio_loss": 0.3438900113105774, "rewards/accuracies": 0.8125, "rewards/chosen": -0.05905257910490036, "rewards/margins": 0.276530921459198, "rewards/rejected": -0.33558350801467896, "sft_loss": 0.59052574634552, "step": 823 }, { "epoch": 1.1916124367317427, "grad_norm": 5.750195138604537, "learning_rate": 7.384415862494055e-06, "logits/chosen": 0.09526326507329941, "logits/rejected": 0.1360977292060852, "logps/chosen": -0.7659902572631836, "logps/rejected": -4.108973026275635, "loss": 0.6252, "odds_ratio_loss": 0.395515114068985, "rewards/accuracies": 0.875, "rewards/chosen": -0.07659903168678284, "rewards/margins": 0.3342982530593872, "rewards/rejected": -0.41089728474617004, "sft_loss": 0.7659902572631836, "step": 824 }, { "epoch": 1.1930585683297181, "grad_norm": 2.948269959006474, "learning_rate": 7.382759355410666e-06, "logits/chosen": 0.178156316280365, "logits/rejected": 0.2407047152519226, "logps/chosen": -0.5645558834075928, "logps/rejected": -2.222418785095215, "loss": 0.549, "odds_ratio_loss": 0.2562811076641083, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0564555898308754, "rewards/margins": 0.16578629612922668, "rewards/rejected": -0.22224187850952148, "sft_loss": 0.5645558834075928, "step": 825 }, { "epoch": 1.1945046999276934, "grad_norm": 2.874090383621196, "learning_rate": 7.381100808849063e-06, "logits/chosen": 0.10233473777770996, "logits/rejected": 0.19571229815483093, "logps/chosen": -0.39320099353790283, "logps/rejected": -2.9694581031799316, "loss": 0.5638, "odds_ratio_loss": 0.1621977984905243, "rewards/accuracies": 1.0, "rewards/chosen": -0.03932010382413864, "rewards/margins": 0.2576256990432739, "rewards/rejected": -0.29694584012031555, "sft_loss": 0.39320099353790283, "step": 826 }, { "epoch": 1.1959508315256688, "grad_norm": 2.9133806547736727, "learning_rate": 7.379440223809189e-06, "logits/chosen": 0.23779156804084778, "logits/rejected": 0.08777079731225967, "logps/chosen": -0.49866682291030884, "logps/rejected": -2.9157798290252686, "loss": 0.6402, "odds_ratio_loss": 0.27906733751296997, "rewards/accuracies": 0.875, "rewards/chosen": -0.049866683781147, "rewards/margins": 0.24171128869056702, "rewards/rejected": -0.2915779948234558, "sft_loss": 0.49866682291030884, "step": 827 }, { "epoch": 1.1973969631236443, "grad_norm": 2.2335902201771214, "learning_rate": 7.377777601292219e-06, "logits/chosen": 0.2867448031902313, "logits/rejected": 0.14885875582695007, "logps/chosen": -0.5436640381813049, "logps/rejected": -3.3524608612060547, "loss": 0.6432, "odds_ratio_loss": 0.27895376086235046, "rewards/accuracies": 0.875, "rewards/chosen": -0.05436640605330467, "rewards/margins": 0.2808796763420105, "rewards/rejected": -0.3352460563182831, "sft_loss": 0.5436640381813049, "step": 828 }, { "epoch": 1.1988430947216198, "grad_norm": 3.199011954022099, "learning_rate": 7.376112942300552e-06, "logits/chosen": 0.015715528279542923, "logits/rejected": 0.06243829429149628, "logps/chosen": -0.7299444675445557, "logps/rejected": -1.2420305013656616, "loss": 0.6999, "odds_ratio_loss": 0.4474690854549408, "rewards/accuracies": 0.875, "rewards/chosen": -0.07299444824457169, "rewards/margins": 0.051208604127168655, "rewards/rejected": -0.12420305609703064, "sft_loss": 0.7299444675445557, "step": 829 }, { "epoch": 1.200289226319595, "grad_norm": 2.4971244149041527, "learning_rate": 7.374446247837818e-06, "logits/chosen": 0.08180706202983856, "logits/rejected": 0.046968236565589905, "logps/chosen": -0.6631790399551392, "logps/rejected": -1.8908724784851074, "loss": 0.6524, "odds_ratio_loss": 0.3295513987541199, "rewards/accuracies": 0.875, "rewards/chosen": -0.06631790101528168, "rewards/margins": 0.12276935577392578, "rewards/rejected": -0.18908724188804626, "sft_loss": 0.6631790399551392, "step": 830 }, { "epoch": 1.2017353579175705, "grad_norm": 2.174741373156066, "learning_rate": 7.372777518908874e-06, "logits/chosen": 0.08737226575613022, "logits/rejected": 0.004193238914012909, "logps/chosen": -0.6524173021316528, "logps/rejected": -2.7812983989715576, "loss": 0.5948, "odds_ratio_loss": 0.36923202872276306, "rewards/accuracies": 0.8125, "rewards/chosen": -0.065241739153862, "rewards/margins": 0.21288813650608063, "rewards/rejected": -0.2781298756599426, "sft_loss": 0.6524173021316528, "step": 831 }, { "epoch": 1.203181489515546, "grad_norm": 2.7310295441448114, "learning_rate": 7.371106756519802e-06, "logits/chosen": 0.173319473862648, "logits/rejected": 0.08408096432685852, "logps/chosen": -0.6770601272583008, "logps/rejected": -2.8480212688446045, "loss": 0.6284, "odds_ratio_loss": 0.3516097664833069, "rewards/accuracies": 0.875, "rewards/chosen": -0.06770601868629456, "rewards/margins": 0.21709612011909485, "rewards/rejected": -0.2848021388053894, "sft_loss": 0.6770601272583008, "step": 832 }, { "epoch": 1.2046276211135214, "grad_norm": 7.808474205778759, "learning_rate": 7.369433961677911e-06, "logits/chosen": 0.0008435901254415512, "logits/rejected": -0.0506550632417202, "logps/chosen": -0.8189026117324829, "logps/rejected": -1.491754412651062, "loss": 0.6586, "odds_ratio_loss": 0.4845430850982666, "rewards/accuracies": 0.875, "rewards/chosen": -0.08189025521278381, "rewards/margins": 0.06728518754243851, "rewards/rejected": -0.14917545020580292, "sft_loss": 0.8189026117324829, "step": 833 }, { "epoch": 1.2060737527114966, "grad_norm": 2.728802488582667, "learning_rate": 7.367759135391736e-06, "logits/chosen": 0.08259500563144684, "logits/rejected": 0.11314639449119568, "logps/chosen": -0.6674097776412964, "logps/rejected": -2.400236129760742, "loss": 0.6741, "odds_ratio_loss": 0.31944167613983154, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0667409747838974, "rewards/margins": 0.17328262329101562, "rewards/rejected": -0.24002361297607422, "sft_loss": 0.6674097776412964, "step": 834 }, { "epoch": 1.207519884309472, "grad_norm": 3.9207051998045026, "learning_rate": 7.366082278671035e-06, "logits/chosen": 0.026229331269860268, "logits/rejected": 0.06131180375814438, "logps/chosen": -0.5891038179397583, "logps/rejected": -2.125972032546997, "loss": 0.6595, "odds_ratio_loss": 0.28627675771713257, "rewards/accuracies": 0.875, "rewards/chosen": -0.05891038104891777, "rewards/margins": 0.15368682146072388, "rewards/rejected": -0.21259720623493195, "sft_loss": 0.5891038179397583, "step": 835 }, { "epoch": 1.2089660159074476, "grad_norm": 2.5431982766029866, "learning_rate": 7.364403392526792e-06, "logits/chosen": 0.0870114117860794, "logits/rejected": 0.11938456445932388, "logps/chosen": -0.618964433670044, "logps/rejected": -1.597392201423645, "loss": 0.6361, "odds_ratio_loss": 0.33390533924102783, "rewards/accuracies": 0.9375, "rewards/chosen": -0.061896443367004395, "rewards/margins": 0.09784276783466339, "rewards/rejected": -0.15973922610282898, "sft_loss": 0.618964433670044, "step": 836 }, { "epoch": 1.210412147505423, "grad_norm": 4.077563662601138, "learning_rate": 7.362722477971212e-06, "logits/chosen": 0.16533887386322021, "logits/rejected": 0.19099020957946777, "logps/chosen": -0.6469172239303589, "logps/rejected": -1.096304178237915, "loss": 0.6717, "odds_ratio_loss": 0.458132803440094, "rewards/accuracies": 0.75, "rewards/chosen": -0.06469172239303589, "rewards/margins": 0.044938698410987854, "rewards/rejected": -0.10963042080402374, "sft_loss": 0.6469172239303589, "step": 837 }, { "epoch": 1.2118582791033985, "grad_norm": 4.144574279732616, "learning_rate": 7.3610395360177265e-06, "logits/chosen": 0.23874233663082123, "logits/rejected": 0.1909375786781311, "logps/chosen": -0.4308343529701233, "logps/rejected": -4.907786846160889, "loss": 0.6455, "odds_ratio_loss": 0.2326105535030365, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04308343678712845, "rewards/margins": 0.447695255279541, "rewards/rejected": -0.49077871441841125, "sft_loss": 0.4308343529701233, "step": 838 }, { "epoch": 1.2133044107013737, "grad_norm": 2.572402444131962, "learning_rate": 7.359354567680988e-06, "logits/chosen": 0.02161214128136635, "logits/rejected": 1.9058585166931152e-05, "logps/chosen": -0.6865906715393066, "logps/rejected": -3.0387065410614014, "loss": 0.571, "odds_ratio_loss": 0.32448068261146545, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06865906715393066, "rewards/margins": 0.2352115958929062, "rewards/rejected": -0.30387064814567566, "sft_loss": 0.6865906715393066, "step": 839 }, { "epoch": 1.2147505422993492, "grad_norm": 2.639023550297937, "learning_rate": 7.357667573976868e-06, "logits/chosen": 0.1299152374267578, "logits/rejected": 0.1438194215297699, "logps/chosen": -0.6337484121322632, "logps/rejected": -2.1571292877197266, "loss": 0.6384, "odds_ratio_loss": 0.32844048738479614, "rewards/accuracies": 0.875, "rewards/chosen": -0.0633748471736908, "rewards/margins": 0.15233808755874634, "rewards/rejected": -0.21571293473243713, "sft_loss": 0.6337484121322632, "step": 840 }, { "epoch": 1.2161966738973247, "grad_norm": 2.355187398625417, "learning_rate": 7.355978555922462e-06, "logits/chosen": 0.17293739318847656, "logits/rejected": 0.14240939915180206, "logps/chosen": -0.7368587255477905, "logps/rejected": -1.4114584922790527, "loss": 0.6926, "odds_ratio_loss": 0.5368320345878601, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07368587702512741, "rewards/margins": 0.06745997071266174, "rewards/rejected": -0.14114584028720856, "sft_loss": 0.7368587255477905, "step": 841 }, { "epoch": 1.2176428054953001, "grad_norm": 2.3515135229234385, "learning_rate": 7.354287514536086e-06, "logits/chosen": -0.12064902484416962, "logits/rejected": -0.07363397628068924, "logps/chosen": -0.639882504940033, "logps/rejected": -2.084611177444458, "loss": 0.6079, "odds_ratio_loss": 0.3934566378593445, "rewards/accuracies": 0.875, "rewards/chosen": -0.06398826092481613, "rewards/margins": 0.1444728672504425, "rewards/rejected": -0.20846112072467804, "sft_loss": 0.639882504940033, "step": 842 }, { "epoch": 1.2190889370932756, "grad_norm": 5.7804051421541995, "learning_rate": 7.352594450837275e-06, "logits/chosen": 0.14320078492164612, "logits/rejected": 0.13621798157691956, "logps/chosen": -0.6584324240684509, "logps/rejected": -3.753617763519287, "loss": 0.6103, "odds_ratio_loss": 0.31743061542510986, "rewards/accuracies": 0.875, "rewards/chosen": -0.06584323942661285, "rewards/margins": 0.3095185458660126, "rewards/rejected": -0.37536177039146423, "sft_loss": 0.6584324240684509, "step": 843 }, { "epoch": 1.2205350686912508, "grad_norm": 2.2438557449285605, "learning_rate": 7.350899365846783e-06, "logits/chosen": 0.0793205052614212, "logits/rejected": 0.06837757676839828, "logps/chosen": -0.6042904853820801, "logps/rejected": -3.4073262214660645, "loss": 0.5716, "odds_ratio_loss": 0.26326024532318115, "rewards/accuracies": 0.9375, "rewards/chosen": -0.060429058969020844, "rewards/margins": 0.2803035378456116, "rewards/rejected": -0.3407326340675354, "sft_loss": 0.6042904853820801, "step": 844 }, { "epoch": 1.2219812002892263, "grad_norm": 5.631401098009144, "learning_rate": 7.349202260586583e-06, "logits/chosen": 0.16910803318023682, "logits/rejected": 0.1033957228064537, "logps/chosen": -0.5252476930618286, "logps/rejected": -3.475262403488159, "loss": 0.5537, "odds_ratio_loss": 0.36323004961013794, "rewards/accuracies": 0.875, "rewards/chosen": -0.052524764090776443, "rewards/margins": 0.29500147700309753, "rewards/rejected": -0.3475262522697449, "sft_loss": 0.5252476930618286, "step": 845 }, { "epoch": 1.2234273318872018, "grad_norm": 2.3919449248178375, "learning_rate": 7.3475031360798675e-06, "logits/chosen": 0.021080223843455315, "logits/rejected": 0.05137103796005249, "logps/chosen": -0.7238802909851074, "logps/rejected": -1.834083080291748, "loss": 0.6581, "odds_ratio_loss": 0.3619506359100342, "rewards/accuracies": 0.875, "rewards/chosen": -0.07238802313804626, "rewards/margins": 0.1110202968120575, "rewards/rejected": -0.18340831995010376, "sft_loss": 0.7238802909851074, "step": 846 }, { "epoch": 1.2248734634851772, "grad_norm": 2.401756254973007, "learning_rate": 7.345801993351043e-06, "logits/chosen": 0.1136443018913269, "logits/rejected": 0.09337516129016876, "logps/chosen": -0.508891761302948, "logps/rejected": -1.8448481559753418, "loss": 0.5723, "odds_ratio_loss": 0.3853185772895813, "rewards/accuracies": 0.8125, "rewards/chosen": -0.05088917538523674, "rewards/margins": 0.13359564542770386, "rewards/rejected": -0.1844848245382309, "sft_loss": 0.508891761302948, "step": 847 }, { "epoch": 1.2263195950831527, "grad_norm": 2.210151721154268, "learning_rate": 7.344098833425736e-06, "logits/chosen": 0.04942498356103897, "logits/rejected": 0.1343136578798294, "logps/chosen": -0.5119268298149109, "logps/rejected": -3.112112522125244, "loss": 0.6502, "odds_ratio_loss": 0.22073283791542053, "rewards/accuracies": 1.0, "rewards/chosen": -0.05119268596172333, "rewards/margins": 0.26001858711242676, "rewards/rejected": -0.3112112283706665, "sft_loss": 0.5119268298149109, "step": 848 }, { "epoch": 1.227765726681128, "grad_norm": 2.186487322506283, "learning_rate": 7.342393657330786e-06, "logits/chosen": 0.08392804116010666, "logits/rejected": 0.09755400568246841, "logps/chosen": -0.6143007874488831, "logps/rejected": -2.7268826961517334, "loss": 0.6268, "odds_ratio_loss": 0.3477764427661896, "rewards/accuracies": 0.875, "rewards/chosen": -0.061430081725120544, "rewards/margins": 0.21125821769237518, "rewards/rejected": -0.27268826961517334, "sft_loss": 0.6143007874488831, "step": 849 }, { "epoch": 1.2292118582791034, "grad_norm": 2.5756144440337003, "learning_rate": 7.340686466094253e-06, "logits/chosen": 0.01365756243467331, "logits/rejected": -0.015208684839308262, "logps/chosen": -0.6614384055137634, "logps/rejected": -1.732398271560669, "loss": 0.59, "odds_ratio_loss": 0.4114297926425934, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06614383310079575, "rewards/margins": 0.10709600150585175, "rewards/rejected": -0.1732398420572281, "sft_loss": 0.6614384055137634, "step": 850 }, { "epoch": 1.2306579898770789, "grad_norm": 3.2850407881963646, "learning_rate": 7.338977260745408e-06, "logits/chosen": 0.06653488427400589, "logits/rejected": 0.11970080435276031, "logps/chosen": -0.6355282068252563, "logps/rejected": -2.975282669067383, "loss": 0.6661, "odds_ratio_loss": 0.18445342779159546, "rewards/accuracies": 1.0, "rewards/chosen": -0.06355281919240952, "rewards/margins": 0.23397547006607056, "rewards/rejected": -0.29752829670906067, "sft_loss": 0.6355282068252563, "step": 851 }, { "epoch": 1.2321041214750543, "grad_norm": 3.3087787505793074, "learning_rate": 7.337266042314736e-06, "logits/chosen": -0.11909240484237671, "logits/rejected": -0.003269646316766739, "logps/chosen": -0.5766974687576294, "logps/rejected": -1.5956358909606934, "loss": 0.5398, "odds_ratio_loss": 0.3452081084251404, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0576697513461113, "rewards/margins": 0.1018938422203064, "rewards/rejected": -0.1595635861158371, "sft_loss": 0.5766974687576294, "step": 852 }, { "epoch": 1.2335502530730296, "grad_norm": 2.2720728280558182, "learning_rate": 7.335552811833938e-06, "logits/chosen": 0.09772266447544098, "logits/rejected": 0.05156873166561127, "logps/chosen": -0.7244196534156799, "logps/rejected": -1.5867480039596558, "loss": 0.5883, "odds_ratio_loss": 0.4903804659843445, "rewards/accuracies": 0.75, "rewards/chosen": -0.072441965341568, "rewards/margins": 0.08623284101486206, "rewards/rejected": -0.15867480635643005, "sft_loss": 0.7244196534156799, "step": 853 }, { "epoch": 1.234996384671005, "grad_norm": 4.799589917880964, "learning_rate": 7.333837570335926e-06, "logits/chosen": 0.17452943325042725, "logits/rejected": 0.13533267378807068, "logps/chosen": -0.41512399911880493, "logps/rejected": -4.827322959899902, "loss": 0.603, "odds_ratio_loss": 0.2235361635684967, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04151239991188049, "rewards/margins": 0.44121992588043213, "rewards/rejected": -0.4827323257923126, "sft_loss": 0.41512399911880493, "step": 854 }, { "epoch": 1.2364425162689805, "grad_norm": 2.1998808847443017, "learning_rate": 7.332120318854828e-06, "logits/chosen": 0.16299928724765778, "logits/rejected": 0.12603971362113953, "logps/chosen": -0.6076937317848206, "logps/rejected": -2.386244297027588, "loss": 0.6123, "odds_ratio_loss": 0.3471486568450928, "rewards/accuracies": 0.75, "rewards/chosen": -0.060769371688365936, "rewards/margins": 0.17785507440567017, "rewards/rejected": -0.2386244535446167, "sft_loss": 0.6076937317848206, "step": 855 }, { "epoch": 1.237888647866956, "grad_norm": 4.333732122428708, "learning_rate": 7.330401058425978e-06, "logits/chosen": 0.1115557998418808, "logits/rejected": 0.03767241910099983, "logps/chosen": -0.6454582214355469, "logps/rejected": -4.050790309906006, "loss": 0.5584, "odds_ratio_loss": 0.39332228899002075, "rewards/accuracies": 0.75, "rewards/chosen": -0.06454582512378693, "rewards/margins": 0.34053319692611694, "rewards/rejected": -0.4050790071487427, "sft_loss": 0.6454582214355469, "step": 856 }, { "epoch": 1.2393347794649312, "grad_norm": 2.325898041501977, "learning_rate": 7.328679790085928e-06, "logits/chosen": 0.233146071434021, "logits/rejected": 0.11350773274898529, "logps/chosen": -0.45263344049453735, "logps/rejected": -2.9898529052734375, "loss": 0.5872, "odds_ratio_loss": 0.2763618230819702, "rewards/accuracies": 0.875, "rewards/chosen": -0.045263346284627914, "rewards/margins": 0.2537219226360321, "rewards/rejected": -0.2989852726459503, "sft_loss": 0.45263344049453735, "step": 857 }, { "epoch": 1.2407809110629067, "grad_norm": 3.1302938440980874, "learning_rate": 7.326956514872434e-06, "logits/chosen": -0.018604222685098648, "logits/rejected": 0.06933964788913727, "logps/chosen": -0.48552218079566956, "logps/rejected": -1.1437679529190063, "loss": 0.6535, "odds_ratio_loss": 0.3590165376663208, "rewards/accuracies": 0.8125, "rewards/chosen": -0.048552218824625015, "rewards/margins": 0.06582456827163696, "rewards/rejected": -0.11437679827213287, "sft_loss": 0.48552218079566956, "step": 858 }, { "epoch": 1.2422270426608821, "grad_norm": 2.353039250439809, "learning_rate": 7.325231233824465e-06, "logits/chosen": 0.09792876243591309, "logits/rejected": 0.12561389803886414, "logps/chosen": -0.5376712679862976, "logps/rejected": -2.5611867904663086, "loss": 0.5828, "odds_ratio_loss": 0.2520514130592346, "rewards/accuracies": 0.9375, "rewards/chosen": -0.053767129778862, "rewards/margins": 0.20235158503055573, "rewards/rejected": -0.2561187148094177, "sft_loss": 0.5376712679862976, "step": 859 }, { "epoch": 1.2436731742588576, "grad_norm": 2.7402418071243697, "learning_rate": 7.323503947982203e-06, "logits/chosen": 0.22483845055103302, "logits/rejected": 0.20443016290664673, "logps/chosen": -0.34170234203338623, "logps/rejected": -2.6574883460998535, "loss": 0.5197, "odds_ratio_loss": 0.19146178662776947, "rewards/accuracies": 0.9375, "rewards/chosen": -0.034170232713222504, "rewards/margins": 0.23157860338687897, "rewards/rejected": -0.2657488286495209, "sft_loss": 0.34170234203338623, "step": 860 }, { "epoch": 1.245119305856833, "grad_norm": 2.1857929122043616, "learning_rate": 7.3217746583870315e-06, "logits/chosen": 0.0867963433265686, "logits/rejected": 0.08815450221300125, "logps/chosen": -0.49371716380119324, "logps/rejected": -3.03228497505188, "loss": 0.6737, "odds_ratio_loss": 0.2313099354505539, "rewards/accuracies": 1.0, "rewards/chosen": -0.049371711909770966, "rewards/margins": 0.2538568079471588, "rewards/rejected": -0.303228497505188, "sft_loss": 0.49371716380119324, "step": 861 }, { "epoch": 1.2465654374548083, "grad_norm": 4.024529157035028, "learning_rate": 7.3200433660815474e-06, "logits/chosen": 0.11046599596738815, "logits/rejected": 0.059477321803569794, "logps/chosen": -0.6523793935775757, "logps/rejected": -1.849184513092041, "loss": 0.6839, "odds_ratio_loss": 0.36109447479248047, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06523793935775757, "rewards/margins": 0.11968051642179489, "rewards/rejected": -0.18491846323013306, "sft_loss": 0.6523793935775757, "step": 862 }, { "epoch": 1.2480115690527838, "grad_norm": 3.176670393350689, "learning_rate": 7.318310072109552e-06, "logits/chosen": 0.07282102853059769, "logits/rejected": 0.19558274745941162, "logps/chosen": -0.5407617688179016, "logps/rejected": -2.6236157417297363, "loss": 0.6409, "odds_ratio_loss": 0.20967018604278564, "rewards/accuracies": 1.0, "rewards/chosen": -0.0540761798620224, "rewards/margins": 0.2082854062318802, "rewards/rejected": -0.2623615860939026, "sft_loss": 0.5407617688179016, "step": 863 }, { "epoch": 1.2494577006507592, "grad_norm": 2.8304016651309754, "learning_rate": 7.3165747775160555e-06, "logits/chosen": 0.1903497278690338, "logits/rejected": 0.07230844348669052, "logps/chosen": -0.712563693523407, "logps/rejected": -3.380535364151001, "loss": 0.5968, "odds_ratio_loss": 0.35233545303344727, "rewards/accuracies": 0.75, "rewards/chosen": -0.0712563768029213, "rewards/margins": 0.26679715514183044, "rewards/rejected": -0.33805355429649353, "sft_loss": 0.712563693523407, "step": 864 }, { "epoch": 1.2509038322487347, "grad_norm": 2.4209488023330996, "learning_rate": 7.3148374833472746e-06, "logits/chosen": 0.1617758572101593, "logits/rejected": 0.08497782796621323, "logps/chosen": -0.6493286490440369, "logps/rejected": -3.3906636238098145, "loss": 0.6008, "odds_ratio_loss": 0.4235643148422241, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06493286788463593, "rewards/margins": 0.27413350343704224, "rewards/rejected": -0.33906635642051697, "sft_loss": 0.6493286490440369, "step": 865 }, { "epoch": 1.2523499638467102, "grad_norm": 2.499704939420737, "learning_rate": 7.313098190650627e-06, "logits/chosen": -0.037707388401031494, "logits/rejected": 0.029249371960759163, "logps/chosen": -0.7335867881774902, "logps/rejected": -1.2428430318832397, "loss": 0.6596, "odds_ratio_loss": 0.46961063146591187, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0733586773276329, "rewards/margins": 0.050925616174936295, "rewards/rejected": -0.1242842972278595, "sft_loss": 0.7335867881774902, "step": 866 }, { "epoch": 1.2537960954446854, "grad_norm": 3.162650037572366, "learning_rate": 7.311356900474743e-06, "logits/chosen": 0.18778613209724426, "logits/rejected": 0.14763246476650238, "logps/chosen": -0.5227529406547546, "logps/rejected": -2.191002607345581, "loss": 0.5999, "odds_ratio_loss": 0.2827909588813782, "rewards/accuracies": 0.9375, "rewards/chosen": -0.052275292575359344, "rewards/margins": 0.16682496666908264, "rewards/rejected": -0.2191002517938614, "sft_loss": 0.5227529406547546, "step": 867 }, { "epoch": 1.2552422270426609, "grad_norm": 3.1734084703080048, "learning_rate": 7.30961361386945e-06, "logits/chosen": 0.22620001435279846, "logits/rejected": 0.28974610567092896, "logps/chosen": -0.631101131439209, "logps/rejected": -1.6538848876953125, "loss": 0.5449, "odds_ratio_loss": 0.3684981167316437, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0631101131439209, "rewards/margins": 0.10227837413549423, "rewards/rejected": -0.16538847982883453, "sft_loss": 0.631101131439209, "step": 868 }, { "epoch": 1.2566883586406363, "grad_norm": 3.5174474662803465, "learning_rate": 7.307868331885783e-06, "logits/chosen": 0.15043975412845612, "logits/rejected": 0.09274716675281525, "logps/chosen": -0.5229210257530212, "logps/rejected": -3.506356954574585, "loss": 0.5961, "odds_ratio_loss": 0.30983737111091614, "rewards/accuracies": 0.9375, "rewards/chosen": -0.052292101085186005, "rewards/margins": 0.29834359884262085, "rewards/rejected": -0.35063570737838745, "sft_loss": 0.5229210257530212, "step": 869 }, { "epoch": 1.2581344902386118, "grad_norm": 3.329767820424892, "learning_rate": 7.306121055575979e-06, "logits/chosen": 0.17091235518455505, "logits/rejected": 0.05310794711112976, "logps/chosen": -0.48885834217071533, "logps/rejected": -2.341912269592285, "loss": 0.5332, "odds_ratio_loss": 0.3056949973106384, "rewards/accuracies": 0.8125, "rewards/chosen": -0.04888583719730377, "rewards/margins": 0.1853054016828537, "rewards/rejected": -0.23419123888015747, "sft_loss": 0.48885834217071533, "step": 870 }, { "epoch": 1.2595806218365873, "grad_norm": 2.421934812884417, "learning_rate": 7.304371785993478e-06, "logits/chosen": 0.07035940140485764, "logits/rejected": 0.057187557220458984, "logps/chosen": -0.5910875797271729, "logps/rejected": -3.1442012786865234, "loss": 0.655, "odds_ratio_loss": 0.26056718826293945, "rewards/accuracies": 0.875, "rewards/chosen": -0.059108760207891464, "rewards/margins": 0.25531139969825745, "rewards/rejected": -0.3144201636314392, "sft_loss": 0.5910875797271729, "step": 871 }, { "epoch": 1.2610267534345625, "grad_norm": 5.4366957185017055, "learning_rate": 7.302620524192919e-06, "logits/chosen": 0.1306525021791458, "logits/rejected": -0.046006545424461365, "logps/chosen": -0.4835171401500702, "logps/rejected": -3.9955079555511475, "loss": 0.5438, "odds_ratio_loss": 0.2657640874385834, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0483517125248909, "rewards/margins": 0.35119912028312683, "rewards/rejected": -0.39955076575279236, "sft_loss": 0.4835171401500702, "step": 872 }, { "epoch": 1.262472885032538, "grad_norm": 4.732128151149686, "learning_rate": 7.300867271230147e-06, "logits/chosen": -0.02715367265045643, "logits/rejected": 0.008712584152817726, "logps/chosen": -0.7222077250480652, "logps/rejected": -2.355595827102661, "loss": 0.5656, "odds_ratio_loss": 0.2727832794189453, "rewards/accuracies": 0.9375, "rewards/chosen": -0.07222076505422592, "rewards/margins": 0.1633388102054596, "rewards/rejected": -0.2355595827102661, "sft_loss": 0.7222077250480652, "step": 873 }, { "epoch": 1.2639190166305134, "grad_norm": 2.946079569555965, "learning_rate": 7.299112028162202e-06, "logits/chosen": 0.12276051938533783, "logits/rejected": 0.09081517904996872, "logps/chosen": -0.6572151184082031, "logps/rejected": -3.947697877883911, "loss": 0.6366, "odds_ratio_loss": 0.37073010206222534, "rewards/accuracies": 0.875, "rewards/chosen": -0.06572151184082031, "rewards/margins": 0.3290482759475708, "rewards/rejected": -0.3947698175907135, "sft_loss": 0.6572151184082031, "step": 874 }, { "epoch": 1.2653651482284887, "grad_norm": 3.824117606600312, "learning_rate": 7.297354796047329e-06, "logits/chosen": 0.1744898557662964, "logits/rejected": 0.1692255735397339, "logps/chosen": -0.4023602604866028, "logps/rejected": -3.4393415451049805, "loss": 0.6831, "odds_ratio_loss": 0.22353234887123108, "rewards/accuracies": 1.0, "rewards/chosen": -0.04023602232336998, "rewards/margins": 0.3036981225013733, "rewards/rejected": -0.34393414855003357, "sft_loss": 0.4023602604866028, "step": 875 }, { "epoch": 1.2668112798264641, "grad_norm": 2.9230987462682227, "learning_rate": 7.295595575944968e-06, "logits/chosen": 0.11851765215396881, "logits/rejected": 0.15573816001415253, "logps/chosen": -0.5971537232398987, "logps/rejected": -2.4105782508850098, "loss": 0.6128, "odds_ratio_loss": 0.3604203164577484, "rewards/accuracies": 0.8125, "rewards/chosen": -0.05971537530422211, "rewards/margins": 0.1813424527645111, "rewards/rejected": -0.24105781316757202, "sft_loss": 0.5971537232398987, "step": 876 }, { "epoch": 1.2682574114244396, "grad_norm": 3.7641940234842592, "learning_rate": 7.293834368915762e-06, "logits/chosen": 0.17587612569332123, "logits/rejected": 0.11586427688598633, "logps/chosen": -0.6683433651924133, "logps/rejected": -1.2942430973052979, "loss": 0.6081, "odds_ratio_loss": 0.40443292260169983, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06683434545993805, "rewards/margins": 0.06258997321128845, "rewards/rejected": -0.1294243186712265, "sft_loss": 0.6683433651924133, "step": 877 }, { "epoch": 1.269703543022415, "grad_norm": 2.7147629034863483, "learning_rate": 7.292071176021546e-06, "logits/chosen": 0.1754606068134308, "logits/rejected": 0.17133712768554688, "logps/chosen": -0.6982166767120361, "logps/rejected": -2.0608537197113037, "loss": 0.6528, "odds_ratio_loss": 0.39273586869239807, "rewards/accuracies": 0.875, "rewards/chosen": -0.06982167065143585, "rewards/margins": 0.13626369833946228, "rewards/rejected": -0.20608535408973694, "sft_loss": 0.6982166767120361, "step": 878 }, { "epoch": 1.2711496746203905, "grad_norm": 2.362146428523586, "learning_rate": 7.2903059983253575e-06, "logits/chosen": 0.3036377429962158, "logits/rejected": 0.21588896214962006, "logps/chosen": -0.5209940671920776, "logps/rejected": -2.4526965618133545, "loss": 0.5407, "odds_ratio_loss": 0.33587196469306946, "rewards/accuracies": 0.875, "rewards/chosen": -0.052099406719207764, "rewards/margins": 0.19317024946212769, "rewards/rejected": -0.24526965618133545, "sft_loss": 0.5209940671920776, "step": 879 }, { "epoch": 1.2725958062183658, "grad_norm": 2.5454177758478496, "learning_rate": 7.288538836891428e-06, "logits/chosen": 0.06093733385205269, "logits/rejected": 0.07876378297805786, "logps/chosen": -0.6048973798751831, "logps/rejected": -3.196272611618042, "loss": 0.5978, "odds_ratio_loss": 0.28094661235809326, "rewards/accuracies": 0.9375, "rewards/chosen": -0.06048973649740219, "rewards/margins": 0.25913751125335693, "rewards/rejected": -0.31962722539901733, "sft_loss": 0.6048973798751831, "step": 880 }, { "epoch": 1.2740419378163412, "grad_norm": 4.714093464026552, "learning_rate": 7.286769692785185e-06, "logits/chosen": 0.12928709387779236, "logits/rejected": 0.0929189920425415, "logps/chosen": -0.6182774305343628, "logps/rejected": -1.788148045539856, "loss": 0.6916, "odds_ratio_loss": 0.5657252073287964, "rewards/accuracies": 0.6875, "rewards/chosen": -0.061827752739191055, "rewards/margins": 0.11698705703020096, "rewards/rejected": -0.17881479859352112, "sft_loss": 0.6182774305343628, "step": 881 }, { "epoch": 1.2754880694143167, "grad_norm": 2.425544165899364, "learning_rate": 7.284998567073254e-06, "logits/chosen": 0.1605134755373001, "logits/rejected": 0.12401594966650009, "logps/chosen": -0.4902043640613556, "logps/rejected": -2.4860270023345947, "loss": 0.6171, "odds_ratio_loss": 0.24903835356235504, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0490204393863678, "rewards/margins": 0.1995822638273239, "rewards/rejected": -0.2486027032136917, "sft_loss": 0.4902043640613556, "step": 882 }, { "epoch": 1.2769342010122922, "grad_norm": 2.4177056064353937, "learning_rate": 7.283225460823452e-06, "logits/chosen": 0.15278860926628113, "logits/rejected": 0.09883365780115128, "logps/chosen": -0.4777308702468872, "logps/rejected": -3.172642707824707, "loss": 0.6086, "odds_ratio_loss": 0.2730526626110077, "rewards/accuracies": 1.0, "rewards/chosen": -0.0477730855345726, "rewards/margins": 0.26949119567871094, "rewards/rejected": -0.31726425886154175, "sft_loss": 0.4777308702468872, "step": 883 }, { "epoch": 1.2783803326102676, "grad_norm": 2.870771462027815, "learning_rate": 7.281450375104792e-06, "logits/chosen": 0.11632952094078064, "logits/rejected": 0.08604128658771515, "logps/chosen": -0.7169528007507324, "logps/rejected": -2.1212775707244873, "loss": 0.6727, "odds_ratio_loss": 0.43706566095352173, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07169528305530548, "rewards/margins": 0.1404324769973755, "rewards/rejected": -0.21212777495384216, "sft_loss": 0.7169528007507324, "step": 884 }, { "epoch": 1.2798264642082429, "grad_norm": 2.654421440453263, "learning_rate": 7.2796733109874785e-06, "logits/chosen": 0.16258105635643005, "logits/rejected": 0.170535147190094, "logps/chosen": -0.5291702747344971, "logps/rejected": -2.99528431892395, "loss": 0.6692, "odds_ratio_loss": 0.2050587683916092, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05291702598333359, "rewards/margins": 0.24661138653755188, "rewards/rejected": -0.29952841997146606, "sft_loss": 0.5291702747344971, "step": 885 }, { "epoch": 1.2812725958062183, "grad_norm": 6.212461882229058, "learning_rate": 7.277894269542912e-06, "logits/chosen": 0.1574457436800003, "logits/rejected": 0.021832166239619255, "logps/chosen": -0.5372164249420166, "logps/rejected": -3.0358946323394775, "loss": 0.682, "odds_ratio_loss": 0.29545292258262634, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05372164770960808, "rewards/margins": 0.24986781179904938, "rewards/rejected": -0.30358946323394775, "sft_loss": 0.5372164249420166, "step": 886 }, { "epoch": 1.2827187274041938, "grad_norm": 2.733381355486248, "learning_rate": 7.2761132518436825e-06, "logits/chosen": 0.1425001621246338, "logits/rejected": 0.16708292067050934, "logps/chosen": -0.498007595539093, "logps/rejected": -2.4287235736846924, "loss": 0.551, "odds_ratio_loss": 0.21452642977237701, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04980075731873512, "rewards/margins": 0.19307160377502441, "rewards/rejected": -0.24287235736846924, "sft_loss": 0.498007595539093, "step": 887 }, { "epoch": 1.2841648590021693, "grad_norm": 2.302702730393573, "learning_rate": 7.274330258963571e-06, "logits/chosen": 0.2424948513507843, "logits/rejected": 0.10700159519910812, "logps/chosen": -0.43079856038093567, "logps/rejected": -3.0819058418273926, "loss": 0.6015, "odds_ratio_loss": 0.19200022518634796, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04307985678315163, "rewards/margins": 0.26511070132255554, "rewards/rejected": -0.30819055438041687, "sft_loss": 0.43079856038093567, "step": 888 }, { "epoch": 1.2856109906001447, "grad_norm": 2.612798051257584, "learning_rate": 7.272545291977551e-06, "logits/chosen": 0.08881522715091705, "logits/rejected": 0.05702493339776993, "logps/chosen": -0.5296792387962341, "logps/rejected": -3.994205951690674, "loss": 0.6511, "odds_ratio_loss": 0.289590448141098, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05296792834997177, "rewards/margins": 0.34645265340805054, "rewards/rejected": -0.3994206190109253, "sft_loss": 0.5296792387962341, "step": 889 }, { "epoch": 1.28705712219812, "grad_norm": 2.9782311108855244, "learning_rate": 7.270758351961787e-06, "logits/chosen": 0.026416288688778877, "logits/rejected": 0.03395572304725647, "logps/chosen": -0.6140929460525513, "logps/rejected": -2.65456223487854, "loss": 0.5723, "odds_ratio_loss": 0.3684936761856079, "rewards/accuracies": 0.75, "rewards/chosen": -0.06140929460525513, "rewards/margins": 0.20404693484306335, "rewards/rejected": -0.2654562294483185, "sft_loss": 0.6140929460525513, "step": 890 }, { "epoch": 1.2885032537960954, "grad_norm": 2.589171345600719, "learning_rate": 7.268969439993631e-06, "logits/chosen": 0.1916169971227646, "logits/rejected": 0.04407678171992302, "logps/chosen": -0.5033664703369141, "logps/rejected": -2.073901891708374, "loss": 0.7008, "odds_ratio_loss": 0.2652108669281006, "rewards/accuracies": 0.875, "rewards/chosen": -0.05033664405345917, "rewards/margins": 0.15705353021621704, "rewards/rejected": -0.2073901891708374, "sft_loss": 0.5033664703369141, "step": 891 }, { "epoch": 1.289949385394071, "grad_norm": 5.374295907094805, "learning_rate": 7.267178557151625e-06, "logits/chosen": 0.142109215259552, "logits/rejected": 0.191066175699234, "logps/chosen": -0.4820294976234436, "logps/rejected": -2.943162441253662, "loss": 0.6246, "odds_ratio_loss": 0.3385387361049652, "rewards/accuracies": 0.8125, "rewards/chosen": -0.04820295050740242, "rewards/margins": 0.24611330032348633, "rewards/rejected": -0.29431626200675964, "sft_loss": 0.4820294976234436, "step": 892 }, { "epoch": 1.2913955169920464, "grad_norm": 3.3410095444556873, "learning_rate": 7.265385704515498e-06, "logits/chosen": 0.05013919621706009, "logits/rejected": 0.16960841417312622, "logps/chosen": -0.7405865788459778, "logps/rejected": -2.0694758892059326, "loss": 0.6052, "odds_ratio_loss": 0.41710177063941956, "rewards/accuracies": 0.875, "rewards/chosen": -0.0740586593747139, "rewards/margins": 0.13288892805576324, "rewards/rejected": -0.20694757997989655, "sft_loss": 0.7405865788459778, "step": 893 }, { "epoch": 1.2928416485900218, "grad_norm": 2.5037443157900823, "learning_rate": 7.263590883166168e-06, "logits/chosen": 0.10009223222732544, "logits/rejected": 0.14507119357585907, "logps/chosen": -0.433633029460907, "logps/rejected": -1.8324575424194336, "loss": 0.594, "odds_ratio_loss": 0.27205413579940796, "rewards/accuracies": 1.0, "rewards/chosen": -0.0433633029460907, "rewards/margins": 0.13988244533538818, "rewards/rejected": -0.18324576318264008, "sft_loss": 0.433633029460907, "step": 894 }, { "epoch": 1.294287780187997, "grad_norm": 2.6618437917755156, "learning_rate": 7.2617940941857395e-06, "logits/chosen": -0.0036581484600901604, "logits/rejected": 0.12767058610916138, "logps/chosen": -0.5401878356933594, "logps/rejected": -2.6623241901397705, "loss": 0.5868, "odds_ratio_loss": 0.21899092197418213, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0540187805891037, "rewards/margins": 0.2122136503458023, "rewards/rejected": -0.266232430934906, "sft_loss": 0.5401878356933594, "step": 895 }, { "epoch": 1.2957339117859725, "grad_norm": 3.7716031429476997, "learning_rate": 7.259995338657504e-06, "logits/chosen": 0.06534141302108765, "logits/rejected": 0.06475174427032471, "logps/chosen": -0.6003227829933167, "logps/rejected": -2.5519890785217285, "loss": 0.7213, "odds_ratio_loss": 0.39002418518066406, "rewards/accuracies": 0.75, "rewards/chosen": -0.060032278299331665, "rewards/margins": 0.19516664743423462, "rewards/rejected": -0.25519895553588867, "sft_loss": 0.6003227829933167, "step": 896 }, { "epoch": 1.297180043383948, "grad_norm": 2.794184064220952, "learning_rate": 7.258194617665937e-06, "logits/chosen": 0.1161397323012352, "logits/rejected": 0.14191779494285583, "logps/chosen": -0.6059271097183228, "logps/rejected": -2.318582534790039, "loss": 0.6352, "odds_ratio_loss": 0.42859724164009094, "rewards/accuracies": 0.875, "rewards/chosen": -0.060592710971832275, "rewards/margins": 0.17126552760601044, "rewards/rejected": -0.2318582534790039, "sft_loss": 0.6059271097183228, "step": 897 }, { "epoch": 1.2986261749819232, "grad_norm": 2.3562388178644915, "learning_rate": 7.256391932296701e-06, "logits/chosen": -0.09829920530319214, "logits/rejected": 0.0967685803771019, "logps/chosen": -0.5340638160705566, "logps/rejected": -2.656466245651245, "loss": 0.6395, "odds_ratio_loss": 0.22315004467964172, "rewards/accuracies": 1.0, "rewards/chosen": -0.053406380116939545, "rewards/margins": 0.21224026381969452, "rewards/rejected": -0.26564663648605347, "sft_loss": 0.5340638160705566, "step": 898 }, { "epoch": 1.3000723065798987, "grad_norm": 3.4290264601170573, "learning_rate": 7.25458728363664e-06, "logits/chosen": 0.10071888566017151, "logits/rejected": 0.07806149125099182, "logps/chosen": -0.6966016888618469, "logps/rejected": -3.5601463317871094, "loss": 0.6549, "odds_ratio_loss": 0.31283316016197205, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06966017186641693, "rewards/margins": 0.2863544821739197, "rewards/rejected": -0.3560146689414978, "sft_loss": 0.6966016888618469, "step": 899 }, { "epoch": 1.3015184381778742, "grad_norm": 2.5914762182916586, "learning_rate": 7.252780672773785e-06, "logits/chosen": 0.0018359817331656814, "logits/rejected": 0.041274283081293106, "logps/chosen": -0.592832624912262, "logps/rejected": -2.7108519077301025, "loss": 0.6497, "odds_ratio_loss": 0.2760257422924042, "rewards/accuracies": 1.0, "rewards/chosen": -0.059283267706632614, "rewards/margins": 0.2118019312620163, "rewards/rejected": -0.2710852026939392, "sft_loss": 0.592832624912262, "step": 900 }, { "epoch": 1.3029645697758496, "grad_norm": 2.512930115177367, "learning_rate": 7.250972100797347e-06, "logits/chosen": 0.14943164587020874, "logits/rejected": 0.04612865671515465, "logps/chosen": -0.7253419160842896, "logps/rejected": -2.070033073425293, "loss": 0.5876, "odds_ratio_loss": 0.45559167861938477, "rewards/accuracies": 0.75, "rewards/chosen": -0.07253418862819672, "rewards/margins": 0.13446910679340363, "rewards/rejected": -0.20700329542160034, "sft_loss": 0.7253419160842896, "step": 901 }, { "epoch": 1.304410701373825, "grad_norm": 3.628486682129814, "learning_rate": 7.249161568797722e-06, "logits/chosen": 0.04626443237066269, "logits/rejected": -0.00022936612367630005, "logps/chosen": -0.6973298788070679, "logps/rejected": -2.7048377990722656, "loss": 0.5538, "odds_ratio_loss": 0.39978471398353577, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06973298639059067, "rewards/margins": 0.20075079798698425, "rewards/rejected": -0.2704837918281555, "sft_loss": 0.6973298788070679, "step": 902 }, { "epoch": 1.3058568329718003, "grad_norm": 2.7165152924208398, "learning_rate": 7.247349077866486e-06, "logits/chosen": 0.13136953115463257, "logits/rejected": 0.08058802038431168, "logps/chosen": -0.6339420676231384, "logps/rejected": -2.620114326477051, "loss": 0.5215, "odds_ratio_loss": 0.3651321530342102, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0633942037820816, "rewards/margins": 0.19861723482608795, "rewards/rejected": -0.26201143860816956, "sft_loss": 0.6339420676231384, "step": 903 }, { "epoch": 1.3073029645697758, "grad_norm": 3.1412086704512645, "learning_rate": 7.245534629096397e-06, "logits/chosen": 0.06389408558607101, "logits/rejected": 0.13316801190376282, "logps/chosen": -0.3657557964324951, "logps/rejected": -2.257955551147461, "loss": 0.5489, "odds_ratio_loss": 0.22655794024467468, "rewards/accuracies": 1.0, "rewards/chosen": -0.03657558187842369, "rewards/margins": 0.18921998143196106, "rewards/rejected": -0.22579556703567505, "sft_loss": 0.3657557964324951, "step": 904 }, { "epoch": 1.3087490961677513, "grad_norm": 2.643111326392046, "learning_rate": 7.243718223581391e-06, "logits/chosen": 0.060490552335977554, "logits/rejected": 0.008018707856535912, "logps/chosen": -0.6461340188980103, "logps/rejected": -2.293785572052002, "loss": 0.659, "odds_ratio_loss": 0.3631659746170044, "rewards/accuracies": 0.875, "rewards/chosen": -0.06461340188980103, "rewards/margins": 0.1647651493549347, "rewards/rejected": -0.22937855124473572, "sft_loss": 0.6461340188980103, "step": 905 }, { "epoch": 1.3101952277657267, "grad_norm": 2.9794545360738995, "learning_rate": 7.241899862416588e-06, "logits/chosen": 0.041527822613716125, "logits/rejected": 0.18100914359092712, "logps/chosen": -0.37341946363449097, "logps/rejected": -1.93304443359375, "loss": 0.5409, "odds_ratio_loss": 0.19359731674194336, "rewards/accuracies": 1.0, "rewards/chosen": -0.03734194487333298, "rewards/margins": 0.1559625118970871, "rewards/rejected": -0.19330444931983948, "sft_loss": 0.37341946363449097, "step": 906 }, { "epoch": 1.3116413593637022, "grad_norm": 3.9441615451488916, "learning_rate": 7.240079546698284e-06, "logits/chosen": 0.03994975611567497, "logits/rejected": 0.034951452165842056, "logps/chosen": -0.6786578297615051, "logps/rejected": -3.219226360321045, "loss": 0.5901, "odds_ratio_loss": 0.3564690351486206, "rewards/accuracies": 0.875, "rewards/chosen": -0.06786578893661499, "rewards/margins": 0.254056841135025, "rewards/rejected": -0.32192263007164, "sft_loss": 0.6786578297615051, "step": 907 }, { "epoch": 1.3130874909616774, "grad_norm": 2.812788200690957, "learning_rate": 7.238257277523955e-06, "logits/chosen": -0.03691191226243973, "logits/rejected": 0.054480381309986115, "logps/chosen": -0.570400595664978, "logps/rejected": -2.1054439544677734, "loss": 0.593, "odds_ratio_loss": 0.3295222520828247, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05704005807638168, "rewards/margins": 0.15350434184074402, "rewards/rejected": -0.2105444073677063, "sft_loss": 0.570400595664978, "step": 908 }, { "epoch": 1.314533622559653, "grad_norm": 2.3743896241886797, "learning_rate": 7.23643305599225e-06, "logits/chosen": 0.10392654687166214, "logits/rejected": 0.07878941297531128, "logps/chosen": -0.6637530326843262, "logps/rejected": -2.511354446411133, "loss": 0.6289, "odds_ratio_loss": 0.3708875775337219, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06637530028820038, "rewards/margins": 0.1847601681947708, "rewards/rejected": -0.2511354684829712, "sft_loss": 0.6637530326843262, "step": 909 }, { "epoch": 1.3159797541576284, "grad_norm": 2.565095852577169, "learning_rate": 7.234606883203004e-06, "logits/chosen": -0.02966320887207985, "logits/rejected": -0.02640990912914276, "logps/chosen": -0.674669086933136, "logps/rejected": -2.56156325340271, "loss": 0.6611, "odds_ratio_loss": 0.26282304525375366, "rewards/accuracies": 1.0, "rewards/chosen": -0.06746691465377808, "rewards/margins": 0.18868939578533173, "rewards/rejected": -0.256156325340271, "sft_loss": 0.674669086933136, "step": 910 }, { "epoch": 1.3174258857556038, "grad_norm": 2.7836971947588776, "learning_rate": 7.23277876025722e-06, "logits/chosen": 0.20673127472400665, "logits/rejected": 0.08467914164066315, "logps/chosen": -0.353419691324234, "logps/rejected": -3.728929042816162, "loss": 0.429, "odds_ratio_loss": 0.10951358824968338, "rewards/accuracies": 1.0, "rewards/chosen": -0.03534197062253952, "rewards/margins": 0.33755093812942505, "rewards/rejected": -0.37289291620254517, "sft_loss": 0.353419691324234, "step": 911 }, { "epoch": 1.3188720173535793, "grad_norm": 2.454408699083897, "learning_rate": 7.230948688257083e-06, "logits/chosen": 0.24181370437145233, "logits/rejected": 0.17572006583213806, "logps/chosen": -0.5705441236495972, "logps/rejected": -1.9255404472351074, "loss": 0.5852, "odds_ratio_loss": 0.3566300868988037, "rewards/accuracies": 0.875, "rewards/chosen": -0.057054419070482254, "rewards/margins": 0.13549962639808655, "rewards/rejected": -0.1925540566444397, "sft_loss": 0.5705441236495972, "step": 912 }, { "epoch": 1.3203181489515545, "grad_norm": 2.658117655292024, "learning_rate": 7.2291166683059465e-06, "logits/chosen": 0.27265405654907227, "logits/rejected": 0.27807876467704773, "logps/chosen": -0.3936464786529541, "logps/rejected": -2.4130916595458984, "loss": 0.5982, "odds_ratio_loss": 0.26774898171424866, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03936465084552765, "rewards/margins": 0.2019445151090622, "rewards/rejected": -0.24130916595458984, "sft_loss": 0.3936464786529541, "step": 913 }, { "epoch": 1.32176428054953, "grad_norm": 2.597200659458208, "learning_rate": 7.227282701508345e-06, "logits/chosen": 0.1615338921546936, "logits/rejected": 0.2152920812368393, "logps/chosen": -0.6008027791976929, "logps/rejected": -2.2669618129730225, "loss": 0.5978, "odds_ratio_loss": 0.33722782135009766, "rewards/accuracies": 0.9375, "rewards/chosen": -0.060080282390117645, "rewards/margins": 0.16661590337753296, "rewards/rejected": -0.22669617831707, "sft_loss": 0.6008027791976929, "step": 914 }, { "epoch": 1.3232104121475055, "grad_norm": 3.5437125466151405, "learning_rate": 7.225446788969983e-06, "logits/chosen": 0.03311977535486221, "logits/rejected": 0.09436703473329544, "logps/chosen": -0.7546591758728027, "logps/rejected": -2.4398608207702637, "loss": 0.6545, "odds_ratio_loss": 0.3189430832862854, "rewards/accuracies": 0.75, "rewards/chosen": -0.07546591758728027, "rewards/margins": 0.16852018237113953, "rewards/rejected": -0.2439860850572586, "sft_loss": 0.7546591758728027, "step": 915 }, { "epoch": 1.324656543745481, "grad_norm": 2.3380832431302094, "learning_rate": 7.22360893179774e-06, "logits/chosen": 0.15004731714725494, "logits/rejected": 0.20780208706855774, "logps/chosen": -0.5689486265182495, "logps/rejected": -2.762641429901123, "loss": 0.6252, "odds_ratio_loss": 0.3046174645423889, "rewards/accuracies": 0.875, "rewards/chosen": -0.05689486116170883, "rewards/margins": 0.21936927735805511, "rewards/rejected": -0.27626413106918335, "sft_loss": 0.5689486265182495, "step": 916 }, { "epoch": 1.3261026753434564, "grad_norm": 2.4482868229892945, "learning_rate": 7.221769131099664e-06, "logits/chosen": 0.15187448263168335, "logits/rejected": 0.10933353006839752, "logps/chosen": -0.4541112184524536, "logps/rejected": -1.784976601600647, "loss": 0.5475, "odds_ratio_loss": 0.3130142092704773, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0454111248254776, "rewards/margins": 0.13308654725551605, "rewards/rejected": -0.17849765717983246, "sft_loss": 0.4541112184524536, "step": 917 }, { "epoch": 1.3275488069414316, "grad_norm": 2.1760698534951124, "learning_rate": 7.219927387984981e-06, "logits/chosen": 0.30233603715896606, "logits/rejected": 0.2338351458311081, "logps/chosen": -0.4559868276119232, "logps/rejected": -3.7631051540374756, "loss": 0.5335, "odds_ratio_loss": 0.2917534112930298, "rewards/accuracies": 0.875, "rewards/chosen": -0.04559868201613426, "rewards/margins": 0.33071184158325195, "rewards/rejected": -0.3763105273246765, "sft_loss": 0.4559868276119232, "step": 918 }, { "epoch": 1.328994938539407, "grad_norm": 2.4506090033415626, "learning_rate": 7.2180837035640835e-06, "logits/chosen": 0.04991018399596214, "logits/rejected": 0.04235024377703667, "logps/chosen": -0.6620197892189026, "logps/rejected": -1.66459059715271, "loss": 0.5907, "odds_ratio_loss": 0.5594837665557861, "rewards/accuracies": 0.75, "rewards/chosen": -0.06620198488235474, "rewards/margins": 0.10025707632303238, "rewards/rejected": -0.16645905375480652, "sft_loss": 0.6620197892189026, "step": 919 }, { "epoch": 1.3304410701373826, "grad_norm": 4.1490860488057555, "learning_rate": 7.216238078948535e-06, "logits/chosen": 0.08021122962236404, "logits/rejected": 0.06495549529790878, "logps/chosen": -0.7230328321456909, "logps/rejected": -1.6988046169281006, "loss": 0.6312, "odds_ratio_loss": 0.4206200838088989, "rewards/accuracies": 0.875, "rewards/chosen": -0.07230328768491745, "rewards/margins": 0.09757716953754425, "rewards/rejected": -0.1698804497718811, "sft_loss": 0.7230328321456909, "step": 920 }, { "epoch": 1.3318872017353578, "grad_norm": 2.1950249688867856, "learning_rate": 7.214390515251072e-06, "logits/chosen": 0.02401835471391678, "logits/rejected": 0.13640937209129333, "logps/chosen": -0.628595232963562, "logps/rejected": -3.0257411003112793, "loss": 0.5807, "odds_ratio_loss": 0.2951260209083557, "rewards/accuracies": 0.9375, "rewards/chosen": -0.06285952031612396, "rewards/margins": 0.2397146075963974, "rewards/rejected": -0.30257412791252136, "sft_loss": 0.628595232963562, "step": 921 }, { "epoch": 1.3333333333333333, "grad_norm": 2.8819789495190573, "learning_rate": 7.212541013585593e-06, "logits/chosen": 0.06210823729634285, "logits/rejected": 0.04636611416935921, "logps/chosen": -0.6891711950302124, "logps/rejected": -1.5676841735839844, "loss": 0.5844, "odds_ratio_loss": 0.4012836515903473, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06891711801290512, "rewards/margins": 0.08785130083560944, "rewards/rejected": -0.15676842629909515, "sft_loss": 0.6891711950302124, "step": 922 }, { "epoch": 1.3347794649313087, "grad_norm": 3.2852191324633906, "learning_rate": 7.210689575067174e-06, "logits/chosen": 0.03468220680952072, "logits/rejected": 0.02965848706662655, "logps/chosen": -0.7979564070701599, "logps/rejected": -1.7514419555664062, "loss": 0.7483, "odds_ratio_loss": 0.4156520962715149, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07979562878608704, "rewards/margins": 0.09534855931997299, "rewards/rejected": -0.17514419555664062, "sft_loss": 0.7979564070701599, "step": 923 }, { "epoch": 1.3362255965292842, "grad_norm": 4.75037918024199, "learning_rate": 7.2088362008120525e-06, "logits/chosen": 0.05110059678554535, "logits/rejected": -0.019120914861559868, "logps/chosen": -0.7032926082611084, "logps/rejected": -2.6311984062194824, "loss": 0.6983, "odds_ratio_loss": 0.3601962625980377, "rewards/accuracies": 0.875, "rewards/chosen": -0.07032926380634308, "rewards/margins": 0.19279058277606964, "rewards/rejected": -0.2631198465824127, "sft_loss": 0.7032926082611084, "step": 924 }, { "epoch": 1.3376717281272597, "grad_norm": 2.6554971310087008, "learning_rate": 7.206980891937634e-06, "logits/chosen": 0.11156775057315826, "logits/rejected": 0.059639737010002136, "logps/chosen": -0.6611694097518921, "logps/rejected": -1.7360694408416748, "loss": 0.6662, "odds_ratio_loss": 0.388332724571228, "rewards/accuracies": 0.875, "rewards/chosen": -0.06611695140600204, "rewards/margins": 0.10748998820781708, "rewards/rejected": -0.17360693216323853, "sft_loss": 0.6611694097518921, "step": 925 }, { "epoch": 1.339117859725235, "grad_norm": 2.1000856373583217, "learning_rate": 7.205123649562491e-06, "logits/chosen": 0.048644740134477615, "logits/rejected": 0.09357769787311554, "logps/chosen": -0.6007069945335388, "logps/rejected": -1.6124356985092163, "loss": 0.5757, "odds_ratio_loss": 0.2972748875617981, "rewards/accuracies": 0.9375, "rewards/chosen": -0.06007070094347, "rewards/margins": 0.10117286443710327, "rewards/rejected": -0.16124355792999268, "sft_loss": 0.6007069945335388, "step": 926 }, { "epoch": 1.3405639913232104, "grad_norm": 2.4909605286439116, "learning_rate": 7.203264474806363e-06, "logits/chosen": 0.036845333874225616, "logits/rejected": 0.03189665079116821, "logps/chosen": -0.8164952993392944, "logps/rejected": -1.740440845489502, "loss": 0.6713, "odds_ratio_loss": 0.4842316210269928, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0816495344042778, "rewards/margins": 0.09239454567432404, "rewards/rejected": -0.17404407262802124, "sft_loss": 0.8164952993392944, "step": 927 }, { "epoch": 1.3420101229211858, "grad_norm": 2.867368406936184, "learning_rate": 7.201403368790153e-06, "logits/chosen": 0.009860752150416374, "logits/rejected": -0.03470249101519585, "logps/chosen": -0.7068488597869873, "logps/rejected": -1.9816157817840576, "loss": 0.612, "odds_ratio_loss": 0.38851380348205566, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07068488001823425, "rewards/margins": 0.12747670710086823, "rewards/rejected": -0.19816158711910248, "sft_loss": 0.7068488597869873, "step": 928 }, { "epoch": 1.3434562545191613, "grad_norm": 2.561085257631517, "learning_rate": 7.199540332635929e-06, "logits/chosen": 0.16725574433803558, "logits/rejected": 0.08495479077100754, "logps/chosen": -0.5896700024604797, "logps/rejected": -2.8246240615844727, "loss": 0.6371, "odds_ratio_loss": 0.2792496085166931, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05896700173616409, "rewards/margins": 0.22349542379379272, "rewards/rejected": -0.2824624180793762, "sft_loss": 0.5896700024604797, "step": 929 }, { "epoch": 1.3449023861171367, "grad_norm": 3.454641080740334, "learning_rate": 7.197675367466921e-06, "logits/chosen": 0.13279861211776733, "logits/rejected": 0.07949304580688477, "logps/chosen": -0.5868549942970276, "logps/rejected": -2.453871965408325, "loss": 0.6217, "odds_ratio_loss": 0.280853807926178, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05868549272418022, "rewards/margins": 0.1867016851902008, "rewards/rejected": -0.24538719654083252, "sft_loss": 0.5868549942970276, "step": 930 }, { "epoch": 1.346348517715112, "grad_norm": 2.357731459721896, "learning_rate": 7.195808474407526e-06, "logits/chosen": 0.23046283423900604, "logits/rejected": 0.19778576493263245, "logps/chosen": -0.5050675868988037, "logps/rejected": -3.1081252098083496, "loss": 0.6304, "odds_ratio_loss": 0.292201966047287, "rewards/accuracies": 0.875, "rewards/chosen": -0.05050676316022873, "rewards/margins": 0.2603057622909546, "rewards/rejected": -0.3108125329017639, "sft_loss": 0.5050675868988037, "step": 931 }, { "epoch": 1.3477946493130875, "grad_norm": 2.7649321853544895, "learning_rate": 7.193939654583298e-06, "logits/chosen": 0.08632639050483704, "logits/rejected": 0.1114271879196167, "logps/chosen": -0.5845703482627869, "logps/rejected": -3.52717924118042, "loss": 0.6325, "odds_ratio_loss": 0.2684318423271179, "rewards/accuracies": 0.9375, "rewards/chosen": -0.058457035571336746, "rewards/margins": 0.2942608594894409, "rewards/rejected": -0.35271787643432617, "sft_loss": 0.5845703482627869, "step": 932 }, { "epoch": 1.349240780911063, "grad_norm": 2.848582481930005, "learning_rate": 7.192068909120959e-06, "logits/chosen": 0.23675987124443054, "logits/rejected": 0.15245428681373596, "logps/chosen": -0.5717525482177734, "logps/rejected": -2.8358075618743896, "loss": 0.5965, "odds_ratio_loss": 0.27822452783584595, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05717525631189346, "rewards/margins": 0.22640548646450043, "rewards/rejected": -0.2835807800292969, "sft_loss": 0.5717525482177734, "step": 933 }, { "epoch": 1.3506869125090384, "grad_norm": 2.2955814268257035, "learning_rate": 7.190196239148383e-06, "logits/chosen": 0.04583890736103058, "logits/rejected": 0.12824928760528564, "logps/chosen": -0.5103280544281006, "logps/rejected": -2.381923198699951, "loss": 0.5477, "odds_ratio_loss": 0.2340468466281891, "rewards/accuracies": 1.0, "rewards/chosen": -0.05103280395269394, "rewards/margins": 0.18715953826904297, "rewards/rejected": -0.2381923347711563, "sft_loss": 0.5103280544281006, "step": 934 }, { "epoch": 1.3521330441070138, "grad_norm": 2.265898663876626, "learning_rate": 7.188321645794614e-06, "logits/chosen": 0.10190585255622864, "logits/rejected": 0.072527676820755, "logps/chosen": -0.7817630171775818, "logps/rejected": -1.4952585697174072, "loss": 0.7559, "odds_ratio_loss": 0.4879745841026306, "rewards/accuracies": 0.75, "rewards/chosen": -0.07817629724740982, "rewards/margins": 0.07134956121444702, "rewards/rejected": -0.14952586591243744, "sft_loss": 0.7817630171775818, "step": 935 }, { "epoch": 1.353579175704989, "grad_norm": 2.720364384274246, "learning_rate": 7.186445130189851e-06, "logits/chosen": 0.142786905169487, "logits/rejected": 0.1314203292131424, "logps/chosen": -0.472442626953125, "logps/rejected": -3.0655393600463867, "loss": 0.6144, "odds_ratio_loss": 0.26142001152038574, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04724426195025444, "rewards/margins": 0.25930964946746826, "rewards/rejected": -0.3065539002418518, "sft_loss": 0.472442626953125, "step": 936 }, { "epoch": 1.3550253073029646, "grad_norm": 2.175857876943466, "learning_rate": 7.184566693465451e-06, "logits/chosen": 0.1783963143825531, "logits/rejected": 0.04779674485325813, "logps/chosen": -0.7196685671806335, "logps/rejected": -2.550107955932617, "loss": 0.5999, "odds_ratio_loss": 0.36530670523643494, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07196685671806335, "rewards/margins": 0.1830439418554306, "rewards/rejected": -0.25501078367233276, "sft_loss": 0.7196685671806335, "step": 937 }, { "epoch": 1.35647143890094, "grad_norm": 2.3984769925628706, "learning_rate": 7.182686336753932e-06, "logits/chosen": 0.025604430586099625, "logits/rejected": 0.07506309449672699, "logps/chosen": -0.6966655254364014, "logps/rejected": -2.069488286972046, "loss": 0.6765, "odds_ratio_loss": 0.30220383405685425, "rewards/accuracies": 0.875, "rewards/chosen": -0.0696665421128273, "rewards/margins": 0.13728229701519012, "rewards/rejected": -0.20694883167743683, "sft_loss": 0.6966655254364014, "step": 938 }, { "epoch": 1.3579175704989155, "grad_norm": 2.0636491573130056, "learning_rate": 7.180804061188965e-06, "logits/chosen": 0.2072528898715973, "logits/rejected": 0.1830851286649704, "logps/chosen": -0.5181276798248291, "logps/rejected": -4.006014823913574, "loss": 0.576, "odds_ratio_loss": 0.28068408370018005, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05181277170777321, "rewards/margins": 0.34878870844841003, "rewards/rejected": -0.40060147643089294, "sft_loss": 0.5181276798248291, "step": 939 }, { "epoch": 1.359363702096891, "grad_norm": 2.2952421934808593, "learning_rate": 7.1789198679053835e-06, "logits/chosen": 0.17554627358913422, "logits/rejected": 0.06563098728656769, "logps/chosen": -0.6369041204452515, "logps/rejected": -3.788663387298584, "loss": 0.6079, "odds_ratio_loss": 0.19372212886810303, "rewards/accuracies": 1.0, "rewards/chosen": -0.0636904165148735, "rewards/margins": 0.3151758909225464, "rewards/rejected": -0.3788663446903229, "sft_loss": 0.6369041204452515, "step": 940 }, { "epoch": 1.3608098336948662, "grad_norm": 2.335130582600968, "learning_rate": 7.177033758039174e-06, "logits/chosen": 0.079743891954422, "logits/rejected": 0.04777732491493225, "logps/chosen": -0.5177881717681885, "logps/rejected": -4.678028106689453, "loss": 0.5689, "odds_ratio_loss": 0.3156812787055969, "rewards/accuracies": 0.875, "rewards/chosen": -0.05177881568670273, "rewards/margins": 0.41602402925491333, "rewards/rejected": -0.46780288219451904, "sft_loss": 0.5177881717681885, "step": 941 }, { "epoch": 1.3622559652928417, "grad_norm": 3.0332594020601626, "learning_rate": 7.175145732727481e-06, "logits/chosen": 0.23156288266181946, "logits/rejected": 0.20619621872901917, "logps/chosen": -0.46926769614219666, "logps/rejected": -2.526423215866089, "loss": 0.5919, "odds_ratio_loss": 0.3614828288555145, "rewards/accuracies": 0.8125, "rewards/chosen": -0.046926770359277725, "rewards/margins": 0.20571555197238922, "rewards/rejected": -0.25264233350753784, "sft_loss": 0.46926769614219666, "step": 942 }, { "epoch": 1.3637020968908171, "grad_norm": 3.2546479492806704, "learning_rate": 7.1732557931085986e-06, "logits/chosen": 0.06294719874858856, "logits/rejected": 0.19816607236862183, "logps/chosen": -0.5671209096908569, "logps/rejected": -1.2771296501159668, "loss": 0.5765, "odds_ratio_loss": 0.4410284757614136, "rewards/accuracies": 0.8125, "rewards/chosen": -0.056712083518505096, "rewards/margins": 0.07100087404251099, "rewards/rejected": -0.12771296501159668, "sft_loss": 0.5671209096908569, "step": 943 }, { "epoch": 1.3651482284887924, "grad_norm": 2.39680766363605, "learning_rate": 7.17136394032198e-06, "logits/chosen": 0.17434678971767426, "logits/rejected": 0.10099100321531296, "logps/chosen": -0.4977617561817169, "logps/rejected": -2.2785232067108154, "loss": 0.5481, "odds_ratio_loss": 0.41038426756858826, "rewards/accuracies": 0.6875, "rewards/chosen": -0.04977617412805557, "rewards/margins": 0.1780761480331421, "rewards/rejected": -0.22785231471061707, "sft_loss": 0.4977617561817169, "step": 944 }, { "epoch": 1.3665943600867678, "grad_norm": 2.440564784775663, "learning_rate": 7.16947017550823e-06, "logits/chosen": 0.0626385509967804, "logits/rejected": 0.029109565541148186, "logps/chosen": -0.6074717044830322, "logps/rejected": -3.7257726192474365, "loss": 0.661, "odds_ratio_loss": 0.27803951501846313, "rewards/accuracies": 0.875, "rewards/chosen": -0.0607471764087677, "rewards/margins": 0.3118301033973694, "rewards/rejected": -0.3725772798061371, "sft_loss": 0.6074717044830322, "step": 945 }, { "epoch": 1.3680404916847433, "grad_norm": 7.122163738225102, "learning_rate": 7.167574499809108e-06, "logits/chosen": 0.1047414243221283, "logits/rejected": 0.05279163271188736, "logps/chosen": -0.6560348868370056, "logps/rejected": -3.7505457401275635, "loss": 0.6665, "odds_ratio_loss": 0.3180413544178009, "rewards/accuracies": 0.9375, "rewards/chosen": -0.06560348719358444, "rewards/margins": 0.30945104360580444, "rewards/rejected": -0.37505456805229187, "sft_loss": 0.6560348868370056, "step": 946 }, { "epoch": 1.3694866232827188, "grad_norm": 17.543573177125726, "learning_rate": 7.165676914367522e-06, "logits/chosen": 0.08533883094787598, "logits/rejected": 0.14580032229423523, "logps/chosen": -0.6218971610069275, "logps/rejected": -3.34824275970459, "loss": 0.5894, "odds_ratio_loss": 0.30703213810920715, "rewards/accuracies": 0.9375, "rewards/chosen": -0.06218971684575081, "rewards/margins": 0.2726345658302307, "rewards/rejected": -0.33482426404953003, "sft_loss": 0.6218971610069275, "step": 947 }, { "epoch": 1.3709327548806942, "grad_norm": 2.5915141272193307, "learning_rate": 7.163777420327534e-06, "logits/chosen": 0.1597047746181488, "logits/rejected": 0.11482205241918564, "logps/chosen": -0.6220844388008118, "logps/rejected": -2.246950149536133, "loss": 0.5908, "odds_ratio_loss": 0.3186954855918884, "rewards/accuracies": 1.0, "rewards/chosen": -0.06220844388008118, "rewards/margins": 0.16248659789562225, "rewards/rejected": -0.22469504177570343, "sft_loss": 0.6220844388008118, "step": 948 }, { "epoch": 1.3723788864786695, "grad_norm": 2.493790722665044, "learning_rate": 7.161876018834357e-06, "logits/chosen": 0.23967748880386353, "logits/rejected": 0.19121602177619934, "logps/chosen": -0.6689061522483826, "logps/rejected": -2.2310941219329834, "loss": 0.6469, "odds_ratio_loss": 0.3700863718986511, "rewards/accuracies": 0.875, "rewards/chosen": -0.06689061969518661, "rewards/margins": 0.15621879696846008, "rewards/rejected": -0.2231094092130661, "sft_loss": 0.6689061522483826, "step": 949 }, { "epoch": 1.373825018076645, "grad_norm": 7.095619235390716, "learning_rate": 7.159972711034352e-06, "logits/chosen": 0.3317505717277527, "logits/rejected": 0.25593841075897217, "logps/chosen": -0.5780055522918701, "logps/rejected": -1.6080982685089111, "loss": 0.5444, "odds_ratio_loss": 0.36196255683898926, "rewards/accuracies": 0.8125, "rewards/chosen": -0.05780055373907089, "rewards/margins": 0.10300928354263306, "rewards/rejected": -0.16080984473228455, "sft_loss": 0.5780055522918701, "step": 950 }, { "epoch": 1.3752711496746204, "grad_norm": 2.4424076920707174, "learning_rate": 7.15806749807503e-06, "logits/chosen": 0.2837293744087219, "logits/rejected": 0.27560287714004517, "logps/chosen": -0.3303763270378113, "logps/rejected": -1.49053156375885, "loss": 0.5996, "odds_ratio_loss": 0.24135808646678925, "rewards/accuracies": 0.875, "rewards/chosen": -0.03303763270378113, "rewards/margins": 0.11601553857326508, "rewards/rejected": -0.1490531712770462, "sft_loss": 0.3303763270378113, "step": 951 }, { "epoch": 1.3767172812725958, "grad_norm": 2.8452765697499354, "learning_rate": 7.156160381105051e-06, "logits/chosen": 0.12051470577716827, "logits/rejected": 0.08477576822042465, "logps/chosen": -0.5957016944885254, "logps/rejected": -2.136902093887329, "loss": 0.5906, "odds_ratio_loss": 0.3252412676811218, "rewards/accuracies": 0.8125, "rewards/chosen": -0.05957017466425896, "rewards/margins": 0.1541200429201126, "rewards/rejected": -0.21369022130966187, "sft_loss": 0.5957016944885254, "step": 952 }, { "epoch": 1.3781634128705713, "grad_norm": 2.5765936064861776, "learning_rate": 7.154251361274225e-06, "logits/chosen": -0.002856435254216194, "logits/rejected": 0.07068169862031937, "logps/chosen": -0.6157888770103455, "logps/rejected": -1.3842309713363647, "loss": 0.5797, "odds_ratio_loss": 0.3665923476219177, "rewards/accuracies": 0.875, "rewards/chosen": -0.061578892171382904, "rewards/margins": 0.0768442153930664, "rewards/rejected": -0.1384231150150299, "sft_loss": 0.6157888770103455, "step": 953 }, { "epoch": 1.3796095444685466, "grad_norm": 2.2398769523605786, "learning_rate": 7.152340439733504e-06, "logits/chosen": 0.13600611686706543, "logits/rejected": 0.10630634427070618, "logps/chosen": -0.5609400868415833, "logps/rejected": -1.998949408531189, "loss": 0.5757, "odds_ratio_loss": 0.23014606535434723, "rewards/accuracies": 0.875, "rewards/chosen": -0.056094009429216385, "rewards/margins": 0.14380092918872833, "rewards/rejected": -0.19989493489265442, "sft_loss": 0.5609400868415833, "step": 954 }, { "epoch": 1.381055676066522, "grad_norm": 2.613749506790682, "learning_rate": 7.1504276176349925e-06, "logits/chosen": 0.22844244539737701, "logits/rejected": 0.22849160432815552, "logps/chosen": -0.4829053282737732, "logps/rejected": -1.5053625106811523, "loss": 0.5361, "odds_ratio_loss": 0.26122379302978516, "rewards/accuracies": 0.875, "rewards/chosen": -0.04829053580760956, "rewards/margins": 0.10224571079015732, "rewards/rejected": -0.15053623914718628, "sft_loss": 0.4829053282737732, "step": 955 }, { "epoch": 1.3825018076644975, "grad_norm": 2.4554120773971757, "learning_rate": 7.148512896131937e-06, "logits/chosen": 0.09752713143825531, "logits/rejected": 0.1276295930147171, "logps/chosen": -0.6276790499687195, "logps/rejected": -1.9808611869812012, "loss": 0.6139, "odds_ratio_loss": 0.24553599953651428, "rewards/accuracies": 1.0, "rewards/chosen": -0.06276790797710419, "rewards/margins": 0.13531821966171265, "rewards/rejected": -0.19808611273765564, "sft_loss": 0.6276790499687195, "step": 956 }, { "epoch": 1.383947939262473, "grad_norm": 4.827788032322935, "learning_rate": 7.146596276378728e-06, "logits/chosen": 0.2864004969596863, "logits/rejected": 0.24811071157455444, "logps/chosen": -0.5591001510620117, "logps/rejected": -1.2078362703323364, "loss": 0.4646, "odds_ratio_loss": 0.32582181692123413, "rewards/accuracies": 1.0, "rewards/chosen": -0.05591001361608505, "rewards/margins": 0.06487361341714859, "rewards/rejected": -0.12078362703323364, "sft_loss": 0.5591001510620117, "step": 957 }, { "epoch": 1.3853940708604484, "grad_norm": 2.9075717065401308, "learning_rate": 7.1446777595309066e-06, "logits/chosen": 0.21857096254825592, "logits/rejected": 0.12259276211261749, "logps/chosen": -0.6916131973266602, "logps/rejected": -1.2286429405212402, "loss": 0.6633, "odds_ratio_loss": 0.5306054353713989, "rewards/accuracies": 0.625, "rewards/chosen": -0.0691613182425499, "rewards/margins": 0.053702980279922485, "rewards/rejected": -0.12286430597305298, "sft_loss": 0.6916131973266602, "step": 958 }, { "epoch": 1.3868402024584237, "grad_norm": 2.4754404143488333, "learning_rate": 7.1427573467451515e-06, "logits/chosen": 0.1782752275466919, "logits/rejected": 0.18330281972885132, "logps/chosen": -0.7293274402618408, "logps/rejected": -1.8634823560714722, "loss": 0.7318, "odds_ratio_loss": 0.28276628255844116, "rewards/accuracies": 0.9375, "rewards/chosen": -0.07293274253606796, "rewards/margins": 0.11341549456119537, "rewards/rejected": -0.18634825944900513, "sft_loss": 0.7293274402618408, "step": 959 }, { "epoch": 1.3882863340563991, "grad_norm": 2.2520057424020927, "learning_rate": 7.140835039179288e-06, "logits/chosen": 0.14758704602718353, "logits/rejected": 0.1023918017745018, "logps/chosen": -0.5791318416595459, "logps/rejected": -1.516669511795044, "loss": 0.5407, "odds_ratio_loss": 0.3355981111526489, "rewards/accuracies": 0.875, "rewards/chosen": -0.05791318789124489, "rewards/margins": 0.09375376999378204, "rewards/rejected": -0.15166696906089783, "sft_loss": 0.5791318416595459, "step": 960 }, { "epoch": 1.3897324656543746, "grad_norm": 2.3823196591157902, "learning_rate": 7.138910837992281e-06, "logits/chosen": 0.15106113255023956, "logits/rejected": 0.05747794359922409, "logps/chosen": -0.6319140791893005, "logps/rejected": -1.544478178024292, "loss": 0.7169, "odds_ratio_loss": 0.39006781578063965, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06319140642881393, "rewards/margins": 0.09125641733407974, "rewards/rejected": -0.15444782376289368, "sft_loss": 0.6319140791893005, "step": 961 }, { "epoch": 1.39117859725235, "grad_norm": 3.1485603341157806, "learning_rate": 7.1369847443442394e-06, "logits/chosen": 0.11428692191839218, "logits/rejected": 0.1583063304424286, "logps/chosen": -0.43781501054763794, "logps/rejected": -1.672081708908081, "loss": 0.5578, "odds_ratio_loss": 0.20121005177497864, "rewards/accuracies": 1.0, "rewards/chosen": -0.043781500309705734, "rewards/margins": 0.12342666834592819, "rewards/rejected": -0.16720816493034363, "sft_loss": 0.43781501054763794, "step": 962 }, { "epoch": 1.3926247288503255, "grad_norm": 2.7598488840824196, "learning_rate": 7.135056759396411e-06, "logits/chosen": 0.1512761414051056, "logits/rejected": 0.16234086453914642, "logps/chosen": -0.47584232687950134, "logps/rejected": -2.192073106765747, "loss": 0.6127, "odds_ratio_loss": 0.24130618572235107, "rewards/accuracies": 0.9375, "rewards/chosen": -0.047584228217601776, "rewards/margins": 0.171623095870018, "rewards/rejected": -0.21920731663703918, "sft_loss": 0.47584232687950134, "step": 963 }, { "epoch": 1.3940708604483008, "grad_norm": 2.549294497737792, "learning_rate": 7.133126884311187e-06, "logits/chosen": 0.15712814033031464, "logits/rejected": 0.09098143875598907, "logps/chosen": -0.38123244047164917, "logps/rejected": -2.0281896591186523, "loss": 0.4884, "odds_ratio_loss": 0.26970845460891724, "rewards/accuracies": 0.875, "rewards/chosen": -0.038123246282339096, "rewards/margins": 0.16469572484493256, "rewards/rejected": -0.20281897485256195, "sft_loss": 0.38123244047164917, "step": 964 }, { "epoch": 1.3955169920462762, "grad_norm": 4.069834024322856, "learning_rate": 7.131195120252096e-06, "logits/chosen": -0.048756882548332214, "logits/rejected": -0.008307691663503647, "logps/chosen": -0.5556820631027222, "logps/rejected": -1.4905530214309692, "loss": 0.6153, "odds_ratio_loss": 0.33696621656417847, "rewards/accuracies": 0.875, "rewards/chosen": -0.05556820333003998, "rewards/margins": 0.09348709136247635, "rewards/rejected": -0.14905530214309692, "sft_loss": 0.5556820631027222, "step": 965 }, { "epoch": 1.3969631236442517, "grad_norm": 3.272715217782266, "learning_rate": 7.129261468383804e-06, "logits/chosen": 0.13486486673355103, "logits/rejected": 0.18454203009605408, "logps/chosen": -0.454858660697937, "logps/rejected": -1.9876313209533691, "loss": 0.5522, "odds_ratio_loss": 0.2198922336101532, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04548586905002594, "rewards/margins": 0.15327726304531097, "rewards/rejected": -0.19876313209533691, "sft_loss": 0.454858660697937, "step": 966 }, { "epoch": 1.398409255242227, "grad_norm": 2.9672392480045553, "learning_rate": 7.127325929872119e-06, "logits/chosen": 0.10959649085998535, "logits/rejected": 0.0439496710896492, "logps/chosen": -0.6432375907897949, "logps/rejected": -1.7530879974365234, "loss": 0.5961, "odds_ratio_loss": 0.3384247124195099, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06432375311851501, "rewards/margins": 0.11098504066467285, "rewards/rejected": -0.17530880868434906, "sft_loss": 0.6432375907897949, "step": 967 }, { "epoch": 1.3998553868402024, "grad_norm": 3.7393635299161994, "learning_rate": 7.125388505883983e-06, "logits/chosen": 0.14931072294712067, "logits/rejected": 0.10823297500610352, "logps/chosen": -0.6483991742134094, "logps/rejected": -2.3597779273986816, "loss": 0.6251, "odds_ratio_loss": 0.35210758447647095, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0648399218916893, "rewards/margins": 0.17113786935806274, "rewards/rejected": -0.23597779870033264, "sft_loss": 0.6483991742134094, "step": 968 }, { "epoch": 1.4013015184381779, "grad_norm": 14.620071753017426, "learning_rate": 7.123449197587477e-06, "logits/chosen": 0.2409316450357437, "logits/rejected": 0.16842088103294373, "logps/chosen": -0.4753984808921814, "logps/rejected": -1.9124274253845215, "loss": 0.5743, "odds_ratio_loss": 0.3033334016799927, "rewards/accuracies": 0.875, "rewards/chosen": -0.0475398488342762, "rewards/margins": 0.14370287954807281, "rewards/rejected": -0.19124272465705872, "sft_loss": 0.4753984808921814, "step": 969 }, { "epoch": 1.4027476500361533, "grad_norm": 2.45277025388321, "learning_rate": 7.121508006151817e-06, "logits/chosen": 0.20841699838638306, "logits/rejected": 0.15673436224460602, "logps/chosen": -0.4456249475479126, "logps/rejected": -1.9759278297424316, "loss": 0.5027, "odds_ratio_loss": 0.22154033184051514, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04456249624490738, "rewards/margins": 0.15303027629852295, "rewards/rejected": -0.19759276509284973, "sft_loss": 0.4456249475479126, "step": 970 }, { "epoch": 1.4041937816341288, "grad_norm": 3.378013540864925, "learning_rate": 7.119564932747353e-06, "logits/chosen": 0.1502024084329605, "logits/rejected": 0.1177692785859108, "logps/chosen": -0.7510949969291687, "logps/rejected": -1.452868938446045, "loss": 0.6338, "odds_ratio_loss": 0.3943096399307251, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07510949671268463, "rewards/margins": 0.07017740607261658, "rewards/rejected": -0.1452869176864624, "sft_loss": 0.7510949969291687, "step": 971 }, { "epoch": 1.405639913232104, "grad_norm": 2.4471392780527443, "learning_rate": 7.1176199785455744e-06, "logits/chosen": 0.18596996366977692, "logits/rejected": 0.20554053783416748, "logps/chosen": -0.48409661650657654, "logps/rejected": -2.281468152999878, "loss": 0.5936, "odds_ratio_loss": 0.18068525195121765, "rewards/accuracies": 1.0, "rewards/chosen": -0.04840966686606407, "rewards/margins": 0.1797371655702591, "rewards/rejected": -0.22814682126045227, "sft_loss": 0.48409661650657654, "step": 972 }, { "epoch": 1.4070860448300795, "grad_norm": 2.145919907848024, "learning_rate": 7.115673144719098e-06, "logits/chosen": 0.22358013689517975, "logits/rejected": 0.1526821106672287, "logps/chosen": -0.764935314655304, "logps/rejected": -1.8448175191879272, "loss": 0.6393, "odds_ratio_loss": 0.3151146173477173, "rewards/accuracies": 0.875, "rewards/chosen": -0.0764935314655304, "rewards/margins": 0.10798821598291397, "rewards/rejected": -0.18448176980018616, "sft_loss": 0.764935314655304, "step": 973 }, { "epoch": 1.408532176428055, "grad_norm": 2.4421624822159766, "learning_rate": 7.11372443244168e-06, "logits/chosen": 0.1208169162273407, "logits/rejected": 0.003757679834961891, "logps/chosen": -0.6918614506721497, "logps/rejected": -1.6558613777160645, "loss": 0.721, "odds_ratio_loss": 0.3674412965774536, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06918615102767944, "rewards/margins": 0.09639999270439148, "rewards/rejected": -0.16558612883090973, "sft_loss": 0.6918614506721497, "step": 974 }, { "epoch": 1.4099783080260304, "grad_norm": 2.323616865318886, "learning_rate": 7.111773842888204e-06, "logits/chosen": 0.08812505751848221, "logits/rejected": 0.11653533577919006, "logps/chosen": -0.4842926859855652, "logps/rejected": -1.6144311428070068, "loss": 0.6273, "odds_ratio_loss": 0.17716938257217407, "rewards/accuracies": 1.0, "rewards/chosen": -0.04842927306890488, "rewards/margins": 0.1130138412117958, "rewards/rejected": -0.16144311428070068, "sft_loss": 0.4842926859855652, "step": 975 }, { "epoch": 1.4114244396240059, "grad_norm": 2.9872065011223112, "learning_rate": 7.109821377234688e-06, "logits/chosen": 0.057052284479141235, "logits/rejected": 0.0494823083281517, "logps/chosen": -0.7413144111633301, "logps/rejected": -1.2311071157455444, "loss": 0.6643, "odds_ratio_loss": 0.48893070220947266, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07413144409656525, "rewards/margins": 0.0489792674779892, "rewards/rejected": -0.12311071157455444, "sft_loss": 0.7413144111633301, "step": 976 }, { "epoch": 1.4128705712219811, "grad_norm": 2.8564528181411064, "learning_rate": 7.107867036658283e-06, "logits/chosen": 0.12099157273769379, "logits/rejected": 0.14747996628284454, "logps/chosen": -0.5671837329864502, "logps/rejected": -2.8476688861846924, "loss": 0.6619, "odds_ratio_loss": 0.2223397195339203, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0567183718085289, "rewards/margins": 0.22804851830005646, "rewards/rejected": -0.28476688265800476, "sft_loss": 0.5671837329864502, "step": 977 }, { "epoch": 1.4143167028199566, "grad_norm": 3.8616731526720796, "learning_rate": 7.105910822337266e-06, "logits/chosen": 0.24294635653495789, "logits/rejected": 0.23327285051345825, "logps/chosen": -0.4823381304740906, "logps/rejected": -1.6777340173721313, "loss": 0.6131, "odds_ratio_loss": 0.33123984932899475, "rewards/accuracies": 0.9375, "rewards/chosen": -0.048233818262815475, "rewards/margins": 0.11953960359096527, "rewards/rejected": -0.16777342557907104, "sft_loss": 0.4823381304740906, "step": 978 }, { "epoch": 1.415762834417932, "grad_norm": 2.2192337031178804, "learning_rate": 7.103952735451047e-06, "logits/chosen": 0.26770561933517456, "logits/rejected": 0.33913475275039673, "logps/chosen": -0.4104244112968445, "logps/rejected": -1.8166509866714478, "loss": 0.5658, "odds_ratio_loss": 0.3066392242908478, "rewards/accuracies": 0.875, "rewards/chosen": -0.04104244336485863, "rewards/margins": 0.14062266051769257, "rewards/rejected": -0.1816651076078415, "sft_loss": 0.4104244112968445, "step": 979 }, { "epoch": 1.4172089660159075, "grad_norm": 2.734924337643227, "learning_rate": 7.1019927771801625e-06, "logits/chosen": 0.14210444688796997, "logits/rejected": 0.16518303751945496, "logps/chosen": -0.7257510423660278, "logps/rejected": -1.8980352878570557, "loss": 0.6179, "odds_ratio_loss": 0.2865678369998932, "rewards/accuracies": 0.9375, "rewards/chosen": -0.07257509976625443, "rewards/margins": 0.1172284185886383, "rewards/rejected": -0.18980352580547333, "sft_loss": 0.7257510423660278, "step": 980 }, { "epoch": 1.418655097613883, "grad_norm": 2.229503219289486, "learning_rate": 7.10003094870628e-06, "logits/chosen": 0.13261494040489197, "logits/rejected": 0.061438750475645065, "logps/chosen": -0.610904335975647, "logps/rejected": -1.613142490386963, "loss": 0.6427, "odds_ratio_loss": 0.38070380687713623, "rewards/accuracies": 0.75, "rewards/chosen": -0.061090435832738876, "rewards/margins": 0.10022380948066711, "rewards/rejected": -0.1613142490386963, "sft_loss": 0.610904335975647, "step": 981 }, { "epoch": 1.4201012292118582, "grad_norm": 4.653721481898526, "learning_rate": 7.0980672512121925e-06, "logits/chosen": 0.08789413422346115, "logits/rejected": 0.12038551270961761, "logps/chosen": -0.615592360496521, "logps/rejected": -1.3449037075042725, "loss": 0.645, "odds_ratio_loss": 0.3649972677230835, "rewards/accuracies": 0.875, "rewards/chosen": -0.06155924126505852, "rewards/margins": 0.07293112576007843, "rewards/rejected": -0.13449037075042725, "sft_loss": 0.615592360496521, "step": 982 }, { "epoch": 1.4215473608098337, "grad_norm": 7.576850107740403, "learning_rate": 7.096101685881821e-06, "logits/chosen": 0.16142991185188293, "logits/rejected": 0.14472821354866028, "logps/chosen": -0.5334713459014893, "logps/rejected": -1.8715717792510986, "loss": 0.6974, "odds_ratio_loss": 0.2439751923084259, "rewards/accuracies": 0.9375, "rewards/chosen": -0.053347136825323105, "rewards/margins": 0.13381005823612213, "rewards/rejected": -0.18715719878673553, "sft_loss": 0.5334713459014893, "step": 983 }, { "epoch": 1.4229934924078091, "grad_norm": 3.518443568999813, "learning_rate": 7.094134253900212e-06, "logits/chosen": 0.12732619047164917, "logits/rejected": 0.24472256004810333, "logps/chosen": -0.572356104850769, "logps/rejected": -1.8468163013458252, "loss": 0.5554, "odds_ratio_loss": 0.30827876925468445, "rewards/accuracies": 0.9375, "rewards/chosen": -0.057235606014728546, "rewards/margins": 0.1274460256099701, "rewards/rejected": -0.18468163907527924, "sft_loss": 0.572356104850769, "step": 984 }, { "epoch": 1.4244396240057844, "grad_norm": 2.719443297095395, "learning_rate": 7.092164956453539e-06, "logits/chosen": 0.23533201217651367, "logits/rejected": 0.1809999793767929, "logps/chosen": -0.5789510011672974, "logps/rejected": -1.5870184898376465, "loss": 0.6231, "odds_ratio_loss": 0.33866560459136963, "rewards/accuracies": 0.875, "rewards/chosen": -0.05789510905742645, "rewards/margins": 0.10080674290657043, "rewards/rejected": -0.1587018370628357, "sft_loss": 0.5789510011672974, "step": 985 }, { "epoch": 1.42588575560376, "grad_norm": 5.778385803733022, "learning_rate": 7.090193794729095e-06, "logits/chosen": 0.008287119679152966, "logits/rejected": 0.03246738016605377, "logps/chosen": -0.5857095718383789, "logps/rejected": -2.002685070037842, "loss": 0.7155, "odds_ratio_loss": 0.2693645656108856, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05857095867395401, "rewards/margins": 0.14169755578041077, "rewards/rejected": -0.20026850700378418, "sft_loss": 0.5857095718383789, "step": 986 }, { "epoch": 1.4273318872017353, "grad_norm": 2.484397723351678, "learning_rate": 7.088220769915304e-06, "logits/chosen": 0.04477142542600632, "logits/rejected": 0.02228371426463127, "logps/chosen": -0.6996700167655945, "logps/rejected": -1.581622838973999, "loss": 0.6102, "odds_ratio_loss": 0.41134923696517944, "rewards/accuracies": 0.9375, "rewards/chosen": -0.06996700912714005, "rewards/margins": 0.08819527924060822, "rewards/rejected": -0.15816228091716766, "sft_loss": 0.6996700167655945, "step": 987 }, { "epoch": 1.4287780187997108, "grad_norm": 3.1362962214467363, "learning_rate": 7.086245883201709e-06, "logits/chosen": -0.01240419689565897, "logits/rejected": -0.013412795960903168, "logps/chosen": -0.6005573272705078, "logps/rejected": -1.2886971235275269, "loss": 0.7403, "odds_ratio_loss": 0.4563630223274231, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06005573272705078, "rewards/margins": 0.0688139945268631, "rewards/rejected": -0.12886972725391388, "sft_loss": 0.6005573272705078, "step": 988 }, { "epoch": 1.4302241503976862, "grad_norm": 2.7302199943502963, "learning_rate": 7.084269135778976e-06, "logits/chosen": 0.27440059185028076, "logits/rejected": 0.17620548605918884, "logps/chosen": -0.5781697034835815, "logps/rejected": -1.6051629781723022, "loss": 0.599, "odds_ratio_loss": 0.3173554539680481, "rewards/accuracies": 0.9375, "rewards/chosen": -0.057816971093416214, "rewards/margins": 0.10269933193922043, "rewards/rejected": -0.16051630675792694, "sft_loss": 0.5781697034835815, "step": 989 }, { "epoch": 1.4316702819956615, "grad_norm": 2.211744246354969, "learning_rate": 7.082290528838895e-06, "logits/chosen": 0.19083952903747559, "logits/rejected": 0.14187321066856384, "logps/chosen": -0.6545434594154358, "logps/rejected": -1.9486523866653442, "loss": 0.5823, "odds_ratio_loss": 0.32819491624832153, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06545434892177582, "rewards/margins": 0.12941089272499084, "rewards/rejected": -0.19486522674560547, "sft_loss": 0.6545434594154358, "step": 990 }, { "epoch": 1.433116413593637, "grad_norm": 2.8739397638479582, "learning_rate": 7.080310063574374e-06, "logits/chosen": 0.2413957715034485, "logits/rejected": 0.2167605757713318, "logps/chosen": -0.7022543549537659, "logps/rejected": -1.8416874408721924, "loss": 0.6775, "odds_ratio_loss": 0.4233511984348297, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07022543251514435, "rewards/margins": 0.11394332349300385, "rewards/rejected": -0.1841687560081482, "sft_loss": 0.7022543549537659, "step": 991 }, { "epoch": 1.4345625451916124, "grad_norm": 4.233643812008143, "learning_rate": 7.078327741179443e-06, "logits/chosen": 0.31747961044311523, "logits/rejected": 0.17522543668746948, "logps/chosen": -0.526762843132019, "logps/rejected": -1.6177246570587158, "loss": 0.6189, "odds_ratio_loss": 0.3080407977104187, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05267628654837608, "rewards/margins": 0.10909618437290192, "rewards/rejected": -0.1617724597454071, "sft_loss": 0.526762843132019, "step": 992 }, { "epoch": 1.4360086767895879, "grad_norm": 3.555697493007301, "learning_rate": 7.076343562849253e-06, "logits/chosen": 0.18947246670722961, "logits/rejected": 0.11990557610988617, "logps/chosen": -0.6089348793029785, "logps/rejected": -3.1226184368133545, "loss": 0.6423, "odds_ratio_loss": 0.21913611888885498, "rewards/accuracies": 0.875, "rewards/chosen": -0.06089348718523979, "rewards/margins": 0.25136834383010864, "rewards/rejected": -0.31226181983947754, "sft_loss": 0.6089348793029785, "step": 993 }, { "epoch": 1.4374548083875633, "grad_norm": 2.4046589860985788, "learning_rate": 7.074357529780071e-06, "logits/chosen": 0.12996569275856018, "logits/rejected": 0.2138558328151703, "logps/chosen": -0.6799343228340149, "logps/rejected": -1.2040917873382568, "loss": 0.6446, "odds_ratio_loss": 0.37377679347991943, "rewards/accuracies": 0.9375, "rewards/chosen": -0.06799343228340149, "rewards/margins": 0.05241573974490166, "rewards/rejected": -0.12040917575359344, "sft_loss": 0.6799343228340149, "step": 994 }, { "epoch": 1.4389009399855386, "grad_norm": 2.5068905804152304, "learning_rate": 7.072369643169284e-06, "logits/chosen": 0.28199514746665955, "logits/rejected": 0.1579236090183258, "logps/chosen": -0.6818425059318542, "logps/rejected": -1.374591588973999, "loss": 0.6167, "odds_ratio_loss": 0.4469667971134186, "rewards/accuracies": 0.75, "rewards/chosen": -0.0681842491030693, "rewards/margins": 0.0692749172449112, "rewards/rejected": -0.1374591588973999, "sft_loss": 0.6818425059318542, "step": 995 }, { "epoch": 1.440347071583514, "grad_norm": 2.8897518619748825, "learning_rate": 7.070379904215396e-06, "logits/chosen": 0.19853602349758148, "logits/rejected": 0.11660370975732803, "logps/chosen": -0.47910040616989136, "logps/rejected": -1.618467092514038, "loss": 0.591, "odds_ratio_loss": 0.30910736322402954, "rewards/accuracies": 0.8125, "rewards/chosen": -0.047910042107105255, "rewards/margins": 0.11393667757511139, "rewards/rejected": -0.16184672713279724, "sft_loss": 0.47910040616989136, "step": 996 }, { "epoch": 1.4417932031814895, "grad_norm": 4.590190762321148, "learning_rate": 7.0683883141180295e-06, "logits/chosen": 0.16932472586631775, "logits/rejected": 0.23921875655651093, "logps/chosen": -0.5061404705047607, "logps/rejected": -1.450538992881775, "loss": 0.6689, "odds_ratio_loss": 0.2410426139831543, "rewards/accuracies": 1.0, "rewards/chosen": -0.05061405152082443, "rewards/margins": 0.09443984925746918, "rewards/rejected": -0.145053893327713, "sft_loss": 0.5061404705047607, "step": 997 }, { "epoch": 1.443239334779465, "grad_norm": 2.3532961841410716, "learning_rate": 7.06639487407792e-06, "logits/chosen": 0.16211381554603577, "logits/rejected": 0.14582449197769165, "logps/chosen": -0.38793298602104187, "logps/rejected": -1.8462269306182861, "loss": 0.4933, "odds_ratio_loss": 0.15614688396453857, "rewards/accuracies": 1.0, "rewards/chosen": -0.038793303072452545, "rewards/margins": 0.14582940936088562, "rewards/rejected": -0.18462270498275757, "sft_loss": 0.38793298602104187, "step": 998 }, { "epoch": 1.4446854663774404, "grad_norm": 2.4403398082029564, "learning_rate": 7.06439958529692e-06, "logits/chosen": 0.24929878115653992, "logits/rejected": 0.20911167562007904, "logps/chosen": -0.5529198050498962, "logps/rejected": -2.3418209552764893, "loss": 0.5847, "odds_ratio_loss": 0.20496883988380432, "rewards/accuracies": 0.9375, "rewards/chosen": -0.055291980504989624, "rewards/margins": 0.17889009416103363, "rewards/rejected": -0.23418208956718445, "sft_loss": 0.5529198050498962, "step": 999 }, { "epoch": 1.4461315979754157, "grad_norm": 2.645683265131475, "learning_rate": 7.062402448977997e-06, "logits/chosen": 0.3289748430252075, "logits/rejected": 0.20327892899513245, "logps/chosen": -0.5288233757019043, "logps/rejected": -2.5970919132232666, "loss": 0.6075, "odds_ratio_loss": 0.23200330138206482, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05288233980536461, "rewards/margins": 0.20682686567306519, "rewards/rejected": -0.2597091794013977, "sft_loss": 0.5288233757019043, "step": 1000 }, { "epoch": 1.4475777295733911, "grad_norm": 2.7552725794710082, "learning_rate": 7.0604034663252326e-06, "logits/chosen": 0.13092300295829773, "logits/rejected": 0.15963725745677948, "logps/chosen": -0.37244975566864014, "logps/rejected": -1.9033265113830566, "loss": 0.5478, "odds_ratio_loss": 0.23867227137088776, "rewards/accuracies": 0.9375, "rewards/chosen": -0.037244975566864014, "rewards/margins": 0.15308767557144165, "rewards/rejected": -0.19033265113830566, "sft_loss": 0.37244975566864014, "step": 1001 }, { "epoch": 1.4490238611713666, "grad_norm": 4.113907222826934, "learning_rate": 7.058402638543819e-06, "logits/chosen": 0.2033143788576126, "logits/rejected": 0.23652614653110504, "logps/chosen": -0.5085000991821289, "logps/rejected": -1.6749879121780396, "loss": 0.6418, "odds_ratio_loss": 0.25362229347229004, "rewards/accuracies": 1.0, "rewards/chosen": -0.05085001513361931, "rewards/margins": 0.11664877831935883, "rewards/rejected": -0.16749879717826843, "sft_loss": 0.5085000991821289, "step": 1002 }, { "epoch": 1.450469992769342, "grad_norm": 2.6623407856341283, "learning_rate": 7.056399966840065e-06, "logits/chosen": 0.18851172924041748, "logits/rejected": 0.18449491262435913, "logps/chosen": -0.5473241806030273, "logps/rejected": -2.25048828125, "loss": 0.5852, "odds_ratio_loss": 0.2814570367336273, "rewards/accuracies": 0.8125, "rewards/chosen": -0.054732419550418854, "rewards/margins": 0.1703163981437683, "rewards/rejected": -0.22504881024360657, "sft_loss": 0.5473241806030273, "step": 1003 }, { "epoch": 1.4519161243673175, "grad_norm": 2.2832553471825956, "learning_rate": 7.0543954524213885e-06, "logits/chosen": 0.19176509976387024, "logits/rejected": 0.17191959917545319, "logps/chosen": -0.4636141359806061, "logps/rejected": -2.2305266857147217, "loss": 0.6144, "odds_ratio_loss": 0.19034519791603088, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04636141285300255, "rewards/margins": 0.17669126391410828, "rewards/rejected": -0.22305268049240112, "sft_loss": 0.4636141359806061, "step": 1004 }, { "epoch": 1.4533622559652928, "grad_norm": 2.8776457997498053, "learning_rate": 7.052389096496316e-06, "logits/chosen": 0.1522367298603058, "logits/rejected": 0.15638092160224915, "logps/chosen": -0.7201976180076599, "logps/rejected": -1.3056902885437012, "loss": 0.6707, "odds_ratio_loss": 0.4569261372089386, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07201976329088211, "rewards/margins": 0.058549270033836365, "rewards/rejected": -0.13056901097297668, "sft_loss": 0.7201976180076599, "step": 1005 }, { "epoch": 1.4548083875632682, "grad_norm": 3.901836611275651, "learning_rate": 7.0503809002744895e-06, "logits/chosen": 0.1361616551876068, "logits/rejected": 0.1412799060344696, "logps/chosen": -0.4152643084526062, "logps/rejected": -1.949552059173584, "loss": 0.6122, "odds_ratio_loss": 0.2301965057849884, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0415264368057251, "rewards/margins": 0.15342876315116882, "rewards/rejected": -0.19495519995689392, "sft_loss": 0.4152643084526062, "step": 1006 }, { "epoch": 1.4562545191612437, "grad_norm": 2.2885218862113406, "learning_rate": 7.048370864966658e-06, "logits/chosen": 0.09770112484693527, "logits/rejected": 0.16095119714736938, "logps/chosen": -0.5029357671737671, "logps/rejected": -1.595658540725708, "loss": 0.5774, "odds_ratio_loss": 0.27672871947288513, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05029357224702835, "rewards/margins": 0.10927227884531021, "rewards/rejected": -0.15956585109233856, "sft_loss": 0.5029357671737671, "step": 1007 }, { "epoch": 1.457700650759219, "grad_norm": 2.7412792848860024, "learning_rate": 7.046358991784679e-06, "logits/chosen": 0.3045928478240967, "logits/rejected": 0.1931784301996231, "logps/chosen": -0.5084326267242432, "logps/rejected": -1.954370379447937, "loss": 0.6619, "odds_ratio_loss": 0.23789116740226746, "rewards/accuracies": 0.9375, "rewards/chosen": -0.050843268632888794, "rewards/margins": 0.14459377527236938, "rewards/rejected": -0.19543704390525818, "sft_loss": 0.5084326267242432, "step": 1008 }, { "epoch": 1.4591467823571946, "grad_norm": 2.6518257487154897, "learning_rate": 7.044345281941517e-06, "logits/chosen": 0.19667373597621918, "logits/rejected": 0.026554403826594353, "logps/chosen": -0.6434774398803711, "logps/rejected": -1.9622483253479004, "loss": 0.6395, "odds_ratio_loss": 0.33053648471832275, "rewards/accuracies": 0.875, "rewards/chosen": -0.06434774398803711, "rewards/margins": 0.1318770945072174, "rewards/rejected": -0.19622483849525452, "sft_loss": 0.6434774398803711, "step": 1009 }, { "epoch": 1.4605929139551699, "grad_norm": 3.306538949734956, "learning_rate": 7.042329736651247e-06, "logits/chosen": 0.26415181159973145, "logits/rejected": 0.1740158200263977, "logps/chosen": -0.6276966333389282, "logps/rejected": -2.046126127243042, "loss": 0.6323, "odds_ratio_loss": 0.3550504148006439, "rewards/accuracies": 0.75, "rewards/chosen": -0.06276966631412506, "rewards/margins": 0.14184294641017914, "rewards/rejected": -0.2046126127243042, "sft_loss": 0.6276966333389282, "step": 1010 }, { "epoch": 1.4620390455531453, "grad_norm": 2.57776723354512, "learning_rate": 7.040312357129047e-06, "logits/chosen": 0.1909094750881195, "logits/rejected": 0.24289001524448395, "logps/chosen": -0.5022028684616089, "logps/rejected": -1.813218116760254, "loss": 0.581, "odds_ratio_loss": 0.2843645215034485, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05022028833627701, "rewards/margins": 0.1311015486717224, "rewards/rejected": -0.18132182955741882, "sft_loss": 0.5022028684616089, "step": 1011 }, { "epoch": 1.4634851771511208, "grad_norm": 2.7761880864873154, "learning_rate": 7.038293144591204e-06, "logits/chosen": 0.20759516954421997, "logits/rejected": 0.14761096239089966, "logps/chosen": -0.577498733997345, "logps/rejected": -1.8540127277374268, "loss": 0.6301, "odds_ratio_loss": 0.3021618127822876, "rewards/accuracies": 0.875, "rewards/chosen": -0.057749874889850616, "rewards/margins": 0.1276514083147049, "rewards/rejected": -0.18540127575397491, "sft_loss": 0.577498733997345, "step": 1012 }, { "epoch": 1.464931308749096, "grad_norm": 2.277520562849688, "learning_rate": 7.036272100255109e-06, "logits/chosen": 0.249737948179245, "logits/rejected": 0.18658365309238434, "logps/chosen": -0.5908975005149841, "logps/rejected": -1.8330168724060059, "loss": 0.6079, "odds_ratio_loss": 0.3356654644012451, "rewards/accuracies": 0.8125, "rewards/chosen": -0.05908975377678871, "rewards/margins": 0.12421193718910217, "rewards/rejected": -0.18330168724060059, "sft_loss": 0.5908975005149841, "step": 1013 }, { "epoch": 1.4663774403470715, "grad_norm": 2.5201576999568163, "learning_rate": 7.034249225339255e-06, "logits/chosen": 0.11815465986728668, "logits/rejected": -0.03885069862008095, "logps/chosen": -0.5782637000083923, "logps/rejected": -2.2459661960601807, "loss": 0.6193, "odds_ratio_loss": 0.3164394497871399, "rewards/accuracies": 0.875, "rewards/chosen": -0.05782637372612953, "rewards/margins": 0.16677024960517883, "rewards/rejected": -0.22459663450717926, "sft_loss": 0.5782637000083923, "step": 1014 }, { "epoch": 1.467823571945047, "grad_norm": 2.9866144671540003, "learning_rate": 7.032224521063243e-06, "logits/chosen": 0.23842084407806396, "logits/rejected": 0.1553962379693985, "logps/chosen": -0.4561951160430908, "logps/rejected": -2.000938892364502, "loss": 0.598, "odds_ratio_loss": 0.20124277472496033, "rewards/accuracies": 1.0, "rewards/chosen": -0.04561951383948326, "rewards/margins": 0.15447436273097992, "rewards/rejected": -0.20009386539459229, "sft_loss": 0.4561951160430908, "step": 1015 }, { "epoch": 1.4692697035430224, "grad_norm": 2.69333811024608, "learning_rate": 7.030197988647774e-06, "logits/chosen": 0.1152530089020729, "logits/rejected": 0.0506892129778862, "logps/chosen": -0.5420433282852173, "logps/rejected": -2.2813198566436768, "loss": 0.5693, "odds_ratio_loss": 0.2778359651565552, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05420433729887009, "rewards/margins": 0.1739276647567749, "rewards/rejected": -0.2281319797039032, "sft_loss": 0.5420433282852173, "step": 1016 }, { "epoch": 1.470715835140998, "grad_norm": 2.325614102351017, "learning_rate": 7.028169629314653e-06, "logits/chosen": 0.10290016233921051, "logits/rejected": 0.09782204031944275, "logps/chosen": -0.5482643246650696, "logps/rejected": -1.9553083181381226, "loss": 0.6292, "odds_ratio_loss": 0.24048736691474915, "rewards/accuracies": 1.0, "rewards/chosen": -0.054826442152261734, "rewards/margins": 0.14070439338684082, "rewards/rejected": -0.19553083181381226, "sft_loss": 0.5482643246650696, "step": 1017 }, { "epoch": 1.4721619667389731, "grad_norm": 2.272861105925204, "learning_rate": 7.026139444286783e-06, "logits/chosen": 0.12405059486627579, "logits/rejected": 0.12237481027841568, "logps/chosen": -0.6233857870101929, "logps/rejected": -1.7711641788482666, "loss": 0.5714, "odds_ratio_loss": 0.30120688676834106, "rewards/accuracies": 0.9375, "rewards/chosen": -0.06233857944607735, "rewards/margins": 0.1147778332233429, "rewards/rejected": -0.17711640894412994, "sft_loss": 0.6233857870101929, "step": 1018 }, { "epoch": 1.4736080983369486, "grad_norm": 3.7017997314571276, "learning_rate": 7.0241074347881725e-06, "logits/chosen": 0.20509743690490723, "logits/rejected": 0.19157272577285767, "logps/chosen": -0.5758723020553589, "logps/rejected": -1.6775627136230469, "loss": 0.571, "odds_ratio_loss": 0.35033929347991943, "rewards/accuracies": 0.875, "rewards/chosen": -0.05758722871541977, "rewards/margins": 0.11016905307769775, "rewards/rejected": -0.16775627434253693, "sft_loss": 0.5758723020553589, "step": 1019 }, { "epoch": 1.475054229934924, "grad_norm": 3.740076627765479, "learning_rate": 7.022073602043926e-06, "logits/chosen": 0.31628650426864624, "logits/rejected": 0.24296697974205017, "logps/chosen": -0.5375529527664185, "logps/rejected": -2.6180481910705566, "loss": 0.6056, "odds_ratio_loss": 0.3072022795677185, "rewards/accuracies": 0.875, "rewards/chosen": -0.053755298256874084, "rewards/margins": 0.20804953575134277, "rewards/rejected": -0.26180481910705566, "sft_loss": 0.5375529527664185, "step": 1020 }, { "epoch": 1.4765003615328995, "grad_norm": 2.916082710897773, "learning_rate": 7.020037947280249e-06, "logits/chosen": 0.2506396174430847, "logits/rejected": 0.11263955384492874, "logps/chosen": -0.6686195135116577, "logps/rejected": -1.8187741041183472, "loss": 0.596, "odds_ratio_loss": 0.3449232876300812, "rewards/accuracies": 0.875, "rewards/chosen": -0.06686195731163025, "rewards/margins": 0.11501546204090118, "rewards/rejected": -0.18187743425369263, "sft_loss": 0.6686195135116577, "step": 1021 }, { "epoch": 1.477946493130875, "grad_norm": 3.9390911957773134, "learning_rate": 7.018000471724446e-06, "logits/chosen": 0.10807211697101593, "logits/rejected": 0.22087359428405762, "logps/chosen": -0.5579795837402344, "logps/rejected": -2.2236127853393555, "loss": 0.6239, "odds_ratio_loss": 0.2716135084629059, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05579796060919762, "rewards/margins": 0.16656331717967987, "rewards/rejected": -0.2223612666130066, "sft_loss": 0.5579795837402344, "step": 1022 }, { "epoch": 1.4793926247288502, "grad_norm": 2.5788918246446126, "learning_rate": 7.01596117660492e-06, "logits/chosen": 0.04662645608186722, "logits/rejected": 0.1416071653366089, "logps/chosen": -0.6649520397186279, "logps/rejected": -1.8460800647735596, "loss": 0.5739, "odds_ratio_loss": 0.3914756178855896, "rewards/accuracies": 0.75, "rewards/chosen": -0.06649520993232727, "rewards/margins": 0.11811280250549316, "rewards/rejected": -0.18460801243782043, "sft_loss": 0.6649520397186279, "step": 1023 }, { "epoch": 1.4808387563268257, "grad_norm": 2.4501783550989713, "learning_rate": 7.013920063151166e-06, "logits/chosen": 0.1949535608291626, "logits/rejected": 0.0820920318365097, "logps/chosen": -0.6624947786331177, "logps/rejected": -2.2682080268859863, "loss": 0.6292, "odds_ratio_loss": 0.3945419192314148, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06624948233366013, "rewards/margins": 0.16057133674621582, "rewards/rejected": -0.22682081162929535, "sft_loss": 0.6624947786331177, "step": 1024 }, { "epoch": 1.4822848879248012, "grad_norm": 2.2241593032463887, "learning_rate": 7.011877132593781e-06, "logits/chosen": 0.19115297496318817, "logits/rejected": 0.19649362564086914, "logps/chosen": -0.5488402843475342, "logps/rejected": -1.9641447067260742, "loss": 0.5042, "odds_ratio_loss": 0.251476526260376, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05488403141498566, "rewards/margins": 0.14153045415878296, "rewards/rejected": -0.19641447067260742, "sft_loss": 0.5488402843475342, "step": 1025 }, { "epoch": 1.4837310195227766, "grad_norm": 2.9155833133769176, "learning_rate": 7.009832386164456e-06, "logits/chosen": 0.15561585128307343, "logits/rejected": 0.19798195362091064, "logps/chosen": -0.6533185243606567, "logps/rejected": -1.3767894506454468, "loss": 0.6188, "odds_ratio_loss": 0.3969898521900177, "rewards/accuracies": 0.875, "rewards/chosen": -0.0653318539261818, "rewards/margins": 0.07234710454940796, "rewards/rejected": -0.13767895102500916, "sft_loss": 0.6533185243606567, "step": 1026 }, { "epoch": 1.485177151120752, "grad_norm": 2.6169665641699913, "learning_rate": 7.007785825095975e-06, "logits/chosen": 0.23924939334392548, "logits/rejected": 0.019013479351997375, "logps/chosen": -0.521590530872345, "logps/rejected": -2.0993692874908447, "loss": 0.7109, "odds_ratio_loss": 0.24000337719917297, "rewards/accuracies": 0.875, "rewards/chosen": -0.05215904861688614, "rewards/margins": 0.15777787566184998, "rewards/rejected": -0.2099369466304779, "sft_loss": 0.521590530872345, "step": 1027 }, { "epoch": 1.4866232827187273, "grad_norm": 2.572626461324335, "learning_rate": 7.005737450622219e-06, "logits/chosen": 0.299623966217041, "logits/rejected": 0.24995410442352295, "logps/chosen": -0.533481776714325, "logps/rejected": -1.580140471458435, "loss": 0.5954, "odds_ratio_loss": 0.3327270746231079, "rewards/accuracies": 0.8125, "rewards/chosen": -0.05334818363189697, "rewards/margins": 0.1046658605337143, "rewards/rejected": -0.15801402926445007, "sft_loss": 0.533481776714325, "step": 1028 }, { "epoch": 1.4880694143167028, "grad_norm": 3.5453397201056243, "learning_rate": 7.003687263978158e-06, "logits/chosen": 0.18590807914733887, "logits/rejected": 0.2126408964395523, "logps/chosen": -0.6051197052001953, "logps/rejected": -2.3243978023529053, "loss": 0.5805, "odds_ratio_loss": 0.38674530386924744, "rewards/accuracies": 0.875, "rewards/chosen": -0.06051196902990341, "rewards/margins": 0.171927809715271, "rewards/rejected": -0.232439786195755, "sft_loss": 0.6051197052001953, "step": 1029 }, { "epoch": 1.4895155459146783, "grad_norm": 2.8003747097713227, "learning_rate": 7.00163526639986e-06, "logits/chosen": 0.14941300451755524, "logits/rejected": 0.11249898374080658, "logps/chosen": -0.7737295031547546, "logps/rejected": -1.510383129119873, "loss": 0.6962, "odds_ratio_loss": 0.39545488357543945, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0773729458451271, "rewards/margins": 0.07366538047790527, "rewards/rejected": -0.15103831887245178, "sft_loss": 0.7737295031547546, "step": 1030 }, { "epoch": 1.4909616775126535, "grad_norm": 2.6587374392731475, "learning_rate": 6.99958145912448e-06, "logits/chosen": 0.18349449336528778, "logits/rejected": 0.10890393704175949, "logps/chosen": -0.5819097757339478, "logps/rejected": -1.5076624155044556, "loss": 0.5803, "odds_ratio_loss": 0.40786978602409363, "rewards/accuracies": 0.75, "rewards/chosen": -0.058190979063510895, "rewards/margins": 0.09257525205612183, "rewards/rejected": -0.15076623857021332, "sft_loss": 0.5819097757339478, "step": 1031 }, { "epoch": 1.4924078091106292, "grad_norm": 2.1458834769890203, "learning_rate": 6.997525843390267e-06, "logits/chosen": 0.0792083889245987, "logits/rejected": 0.06461001932621002, "logps/chosen": -0.6367547512054443, "logps/rejected": -1.3028634786605835, "loss": 0.6463, "odds_ratio_loss": 0.3900574743747711, "rewards/accuracies": 0.875, "rewards/chosen": -0.06367547810077667, "rewards/margins": 0.06661087274551392, "rewards/rejected": -0.1302863508462906, "sft_loss": 0.6367547512054443, "step": 1032 }, { "epoch": 1.4938539407086044, "grad_norm": 2.4642215991009517, "learning_rate": 6.995468420436559e-06, "logits/chosen": 0.1517665535211563, "logits/rejected": 0.15160293877124786, "logps/chosen": -0.6312042474746704, "logps/rejected": -2.0376627445220947, "loss": 0.684, "odds_ratio_loss": 0.3216949701309204, "rewards/accuracies": 0.875, "rewards/chosen": -0.06312042474746704, "rewards/margins": 0.1406458467245102, "rewards/rejected": -0.20376628637313843, "sft_loss": 0.6312042474746704, "step": 1033 }, { "epoch": 1.49530007230658, "grad_norm": 2.5076819118432874, "learning_rate": 6.993409191503783e-06, "logits/chosen": 0.239852637052536, "logits/rejected": 0.12884551286697388, "logps/chosen": -0.4071224331855774, "logps/rejected": -2.5471112728118896, "loss": 0.6061, "odds_ratio_loss": 0.25786054134368896, "rewards/accuracies": 0.875, "rewards/chosen": -0.04071224480867386, "rewards/margins": 0.21399888396263123, "rewards/rejected": -0.2547111213207245, "sft_loss": 0.4071224331855774, "step": 1034 }, { "epoch": 1.4967462039045554, "grad_norm": 4.028839475322059, "learning_rate": 6.991348157833457e-06, "logits/chosen": 0.10707180202007294, "logits/rejected": 0.08338991552591324, "logps/chosen": -0.5520052909851074, "logps/rejected": -2.078350067138672, "loss": 0.604, "odds_ratio_loss": 0.3524037003517151, "rewards/accuracies": 0.8125, "rewards/chosen": -0.05520053207874298, "rewards/margins": 0.15263448655605316, "rewards/rejected": -0.20783501863479614, "sft_loss": 0.5520052909851074, "step": 1035 }, { "epoch": 1.4981923355025306, "grad_norm": 2.3833532208842256, "learning_rate": 6.9892853206681864e-06, "logits/chosen": 0.15900209546089172, "logits/rejected": 0.098246268928051, "logps/chosen": -0.6327139139175415, "logps/rejected": -1.6600315570831299, "loss": 0.6718, "odds_ratio_loss": 0.3264049291610718, "rewards/accuracies": 0.9375, "rewards/chosen": -0.06327139586210251, "rewards/margins": 0.10273175686597824, "rewards/rejected": -0.16600315272808075, "sft_loss": 0.6327139139175415, "step": 1036 }, { "epoch": 1.499638467100506, "grad_norm": 4.421414597775642, "learning_rate": 6.987220681251663e-06, "logits/chosen": 0.0970354974269867, "logits/rejected": 0.07867510616779327, "logps/chosen": -0.5139919519424438, "logps/rejected": -2.057021379470825, "loss": 0.658, "odds_ratio_loss": 0.21215665340423584, "rewards/accuracies": 1.0, "rewards/chosen": -0.05139920115470886, "rewards/margins": 0.1543029397726059, "rewards/rejected": -0.20570214092731476, "sft_loss": 0.5139919519424438, "step": 1037 }, { "epoch": 1.5010845986984815, "grad_norm": 2.874565400720609, "learning_rate": 6.985154240828665e-06, "logits/chosen": 0.2262597382068634, "logits/rejected": 0.1943497508764267, "logps/chosen": -0.6259772181510925, "logps/rejected": -2.125296115875244, "loss": 0.7194, "odds_ratio_loss": 0.4203740954399109, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06259772181510925, "rewards/margins": 0.1499318927526474, "rewards/rejected": -0.21252959966659546, "sft_loss": 0.6259772181510925, "step": 1038 }, { "epoch": 1.502530730296457, "grad_norm": 2.6558933441242214, "learning_rate": 6.983086000645057e-06, "logits/chosen": 0.07608388364315033, "logits/rejected": 0.15391358733177185, "logps/chosen": -0.7084933519363403, "logps/rejected": -1.6010119915008545, "loss": 0.6625, "odds_ratio_loss": 0.30498605966567993, "rewards/accuracies": 0.9375, "rewards/chosen": -0.07084932923316956, "rewards/margins": 0.08925186842679977, "rewards/rejected": -0.16010120511054993, "sft_loss": 0.7084933519363403, "step": 1039 }, { "epoch": 1.5039768618944325, "grad_norm": 2.6321061758591617, "learning_rate": 6.981015961947788e-06, "logits/chosen": 0.1412767767906189, "logits/rejected": 0.13829348981380463, "logps/chosen": -0.5603752136230469, "logps/rejected": -2.2956314086914062, "loss": 0.5655, "odds_ratio_loss": 0.3607153296470642, "rewards/accuracies": 0.8125, "rewards/chosen": -0.056037526577711105, "rewards/margins": 0.1735256314277649, "rewards/rejected": -0.2295631468296051, "sft_loss": 0.5603752136230469, "step": 1040 }, { "epoch": 1.5054229934924077, "grad_norm": 3.0828053228799006, "learning_rate": 6.978944125984895e-06, "logits/chosen": 0.19295093417167664, "logits/rejected": 0.14091432094573975, "logps/chosen": -0.5975869297981262, "logps/rejected": -2.6924171447753906, "loss": 0.6764, "odds_ratio_loss": 0.2563125789165497, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05975869297981262, "rewards/margins": 0.20948302745819092, "rewards/rejected": -0.26924172043800354, "sft_loss": 0.5975869297981262, "step": 1041 }, { "epoch": 1.5068691250903832, "grad_norm": 3.960558914590278, "learning_rate": 6.976870494005492e-06, "logits/chosen": 0.19148896634578705, "logits/rejected": 0.0627632588148117, "logps/chosen": -0.5534959435462952, "logps/rejected": -2.0796003341674805, "loss": 0.6236, "odds_ratio_loss": 0.3466914892196655, "rewards/accuracies": 0.8125, "rewards/chosen": -0.055349595844745636, "rewards/margins": 0.1526104211807251, "rewards/rejected": -0.20796000957489014, "sft_loss": 0.5534959435462952, "step": 1042 }, { "epoch": 1.5083152566883586, "grad_norm": 2.361686297673546, "learning_rate": 6.974795067259781e-06, "logits/chosen": 0.0762028768658638, "logits/rejected": 0.11023451387882233, "logps/chosen": -0.5732434988021851, "logps/rejected": -1.5401384830474854, "loss": 0.6878, "odds_ratio_loss": 0.29356640577316284, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0573243573307991, "rewards/margins": 0.09668950736522675, "rewards/rejected": -0.15401385724544525, "sft_loss": 0.5732434988021851, "step": 1043 }, { "epoch": 1.509761388286334, "grad_norm": 2.458530408887522, "learning_rate": 6.972717846999046e-06, "logits/chosen": 0.14236396551132202, "logits/rejected": 0.10859329998493195, "logps/chosen": -0.4144444465637207, "logps/rejected": -1.902043104171753, "loss": 0.622, "odds_ratio_loss": 0.23717351257801056, "rewards/accuracies": 1.0, "rewards/chosen": -0.04144444689154625, "rewards/margins": 0.1487598568201065, "rewards/rejected": -0.19020430743694305, "sft_loss": 0.4144444465637207, "step": 1044 }, { "epoch": 1.5112075198843096, "grad_norm": 2.3087641052869894, "learning_rate": 6.97063883447565e-06, "logits/chosen": 0.10838791728019714, "logits/rejected": 0.11068686097860336, "logps/chosen": -0.49994957447052, "logps/rejected": -2.1742091178894043, "loss": 0.5046, "odds_ratio_loss": 0.2922423481941223, "rewards/accuracies": 0.875, "rewards/chosen": -0.04999496415257454, "rewards/margins": 0.1674259603023529, "rewards/rejected": -0.21742090582847595, "sft_loss": 0.49994957447052, "step": 1045 }, { "epoch": 1.5126536514822848, "grad_norm": 2.3997435361179327, "learning_rate": 6.968558030943035e-06, "logits/chosen": 0.045848749577999115, "logits/rejected": 0.08902572095394135, "logps/chosen": -0.6003144383430481, "logps/rejected": -1.817967176437378, "loss": 0.5955, "odds_ratio_loss": 0.33355093002319336, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06003144755959511, "rewards/margins": 0.12176527082920074, "rewards/rejected": -0.18179671466350555, "sft_loss": 0.6003144383430481, "step": 1046 }, { "epoch": 1.5140997830802603, "grad_norm": 2.465062809755268, "learning_rate": 6.966475437655728e-06, "logits/chosen": 0.18887464702129364, "logits/rejected": 0.1661260426044464, "logps/chosen": -0.45803892612457275, "logps/rejected": -1.855370044708252, "loss": 0.5773, "odds_ratio_loss": 0.2459450662136078, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04580388963222504, "rewards/margins": 0.13973310589790344, "rewards/rejected": -0.18553701043128967, "sft_loss": 0.45803892612457275, "step": 1047 }, { "epoch": 1.5155459146782357, "grad_norm": 2.7122560174653088, "learning_rate": 6.964391055869331e-06, "logits/chosen": 0.20639120042324066, "logits/rejected": 0.24513506889343262, "logps/chosen": -0.5997314453125, "logps/rejected": -1.2009388208389282, "loss": 0.5093, "odds_ratio_loss": 0.5465415120124817, "rewards/accuracies": 0.8125, "rewards/chosen": -0.05997314304113388, "rewards/margins": 0.06012075021862984, "rewards/rejected": -0.12009389698505402, "sft_loss": 0.5997314453125, "step": 1048 }, { "epoch": 1.516992046276211, "grad_norm": 3.499958743730523, "learning_rate": 6.962304886840526e-06, "logits/chosen": 0.11560941487550735, "logits/rejected": 0.1723850816488266, "logps/chosen": -0.5111711025238037, "logps/rejected": -1.831732153892517, "loss": 0.6253, "odds_ratio_loss": 0.19189368188381195, "rewards/accuracies": 1.0, "rewards/chosen": -0.05111711472272873, "rewards/margins": 0.1320561021566391, "rewards/rejected": -0.18317320942878723, "sft_loss": 0.5111711025238037, "step": 1049 }, { "epoch": 1.5184381778741867, "grad_norm": 2.490232127901872, "learning_rate": 6.960216931827072e-06, "logits/chosen": 0.13617388904094696, "logits/rejected": 0.10104858875274658, "logps/chosen": -0.8242969512939453, "logps/rejected": -1.2352579832077026, "loss": 0.6404, "odds_ratio_loss": 0.5068786144256592, "rewards/accuracies": 0.8125, "rewards/chosen": -0.08242969959974289, "rewards/margins": 0.04109610244631767, "rewards/rejected": -0.12352579832077026, "sft_loss": 0.8242969512939453, "step": 1050 }, { "epoch": 1.519884309472162, "grad_norm": 2.796815752969695, "learning_rate": 6.958127192087805e-06, "logits/chosen": 0.2978774905204773, "logits/rejected": 0.21057304739952087, "logps/chosen": -0.498077392578125, "logps/rejected": -1.7246334552764893, "loss": 0.5587, "odds_ratio_loss": 0.334107905626297, "rewards/accuracies": 0.875, "rewards/chosen": -0.04980773851275444, "rewards/margins": 0.12265560030937195, "rewards/rejected": -0.1724633276462555, "sft_loss": 0.498077392578125, "step": 1051 }, { "epoch": 1.5213304410701374, "grad_norm": 2.2782092009922708, "learning_rate": 6.956035668882636e-06, "logits/chosen": 0.010676529258489609, "logits/rejected": 0.0010472461581230164, "logps/chosen": -0.6882982850074768, "logps/rejected": -2.3437445163726807, "loss": 0.6361, "odds_ratio_loss": 0.33234259486198425, "rewards/accuracies": 0.875, "rewards/chosen": -0.06882983446121216, "rewards/margins": 0.16554459929466248, "rewards/rejected": -0.23437443375587463, "sft_loss": 0.6882982850074768, "step": 1052 }, { "epoch": 1.5227765726681128, "grad_norm": 3.2140920809453655, "learning_rate": 6.953942363472554e-06, "logits/chosen": 0.13342228531837463, "logits/rejected": 0.05354118347167969, "logps/chosen": -0.6020887494087219, "logps/rejected": -2.461318254470825, "loss": 0.6204, "odds_ratio_loss": 0.32788577675819397, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06020887941122055, "rewards/margins": 0.18592296540737152, "rewards/rejected": -0.24613183736801147, "sft_loss": 0.6020887494087219, "step": 1053 }, { "epoch": 1.524222704266088, "grad_norm": 2.5019712775378666, "learning_rate": 6.951847277119618e-06, "logits/chosen": 0.2589821219444275, "logits/rejected": 0.16210149228572845, "logps/chosen": -0.510164737701416, "logps/rejected": -2.564856767654419, "loss": 0.5104, "odds_ratio_loss": 0.23116162419319153, "rewards/accuracies": 1.0, "rewards/chosen": -0.05101647228002548, "rewards/margins": 0.20546920597553253, "rewards/rejected": -0.2564856708049774, "sft_loss": 0.510164737701416, "step": 1054 }, { "epoch": 1.5256688358640638, "grad_norm": 2.152655482579084, "learning_rate": 6.949750411086965e-06, "logits/chosen": 0.34317436814308167, "logits/rejected": 0.10291274636983871, "logps/chosen": -0.7262193560600281, "logps/rejected": -1.5096874237060547, "loss": 0.7065, "odds_ratio_loss": 0.4528844952583313, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07262193411588669, "rewards/margins": 0.07834681868553162, "rewards/rejected": -0.1509687602519989, "sft_loss": 0.7262193560600281, "step": 1055 }, { "epoch": 1.527114967462039, "grad_norm": 3.50447530153194, "learning_rate": 6.947651766638804e-06, "logits/chosen": 0.12302163243293762, "logits/rejected": 0.13556215167045593, "logps/chosen": -0.5117670297622681, "logps/rejected": -2.444344997406006, "loss": 0.5551, "odds_ratio_loss": 0.15113696455955505, "rewards/accuracies": 1.0, "rewards/chosen": -0.051176704466342926, "rewards/margins": 0.19325780868530273, "rewards/rejected": -0.24443453550338745, "sft_loss": 0.5117670297622681, "step": 1056 }, { "epoch": 1.5285610990600145, "grad_norm": 2.435293117738104, "learning_rate": 6.945551345040414e-06, "logits/chosen": 0.15510180592536926, "logits/rejected": 0.11878678947687149, "logps/chosen": -0.5824716687202454, "logps/rejected": -1.9384113550186157, "loss": 0.5609, "odds_ratio_loss": 0.35332125425338745, "rewards/accuracies": 0.875, "rewards/chosen": -0.058247171342372894, "rewards/margins": 0.1355939656496048, "rewards/rejected": -0.1938411295413971, "sft_loss": 0.5824716687202454, "step": 1057 }, { "epoch": 1.53000723065799, "grad_norm": 2.588323654734541, "learning_rate": 6.943449147558148e-06, "logits/chosen": 0.2299852967262268, "logits/rejected": 0.2677266597747803, "logps/chosen": -0.4605291187763214, "logps/rejected": -1.5798819065093994, "loss": 0.6116, "odds_ratio_loss": 0.2766846716403961, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04605291038751602, "rewards/margins": 0.11193528026342392, "rewards/rejected": -0.15798819065093994, "sft_loss": 0.4605291187763214, "step": 1058 }, { "epoch": 1.5314533622559652, "grad_norm": 3.837009839441936, "learning_rate": 6.941345175459428e-06, "logits/chosen": 0.1960982233285904, "logits/rejected": 0.1517314910888672, "logps/chosen": -0.6805843114852905, "logps/rejected": -1.9845657348632812, "loss": 0.6312, "odds_ratio_loss": 0.44597482681274414, "rewards/accuracies": 0.875, "rewards/chosen": -0.06805843114852905, "rewards/margins": 0.13039812445640564, "rewards/rejected": -0.1984565705060959, "sft_loss": 0.6805843114852905, "step": 1059 }, { "epoch": 1.5328994938539409, "grad_norm": 3.5474173037398278, "learning_rate": 6.939239430012747e-06, "logits/chosen": 0.1270655393600464, "logits/rejected": 0.16382458806037903, "logps/chosen": -0.6372203230857849, "logps/rejected": -1.725688099861145, "loss": 0.5645, "odds_ratio_loss": 0.3358212411403656, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06372203677892685, "rewards/margins": 0.10884677618741989, "rewards/rejected": -0.17256881296634674, "sft_loss": 0.6372203230857849, "step": 1060 }, { "epoch": 1.534345625451916, "grad_norm": 2.6777737468006597, "learning_rate": 6.937131912487666e-06, "logits/chosen": 0.12652812898159027, "logits/rejected": 0.17299723625183105, "logps/chosen": -0.5954207181930542, "logps/rejected": -1.8514387607574463, "loss": 0.6188, "odds_ratio_loss": 0.2636999487876892, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05954207852482796, "rewards/margins": 0.12560181319713593, "rewards/rejected": -0.18514388799667358, "sft_loss": 0.5954207181930542, "step": 1061 }, { "epoch": 1.5357917570498916, "grad_norm": 3.721568398574073, "learning_rate": 6.935022624154818e-06, "logits/chosen": 0.014476214535534382, "logits/rejected": 0.025586508214473724, "logps/chosen": -0.6806229948997498, "logps/rejected": -2.057115077972412, "loss": 0.6268, "odds_ratio_loss": 0.3748878836631775, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06806230545043945, "rewards/margins": 0.13764920830726624, "rewards/rejected": -0.2057115137577057, "sft_loss": 0.6806229948997498, "step": 1062 }, { "epoch": 1.537237888647867, "grad_norm": 3.415662609051976, "learning_rate": 6.9329115662858965e-06, "logits/chosen": 0.2381383776664734, "logits/rejected": 0.13723555207252502, "logps/chosen": -0.5097121000289917, "logps/rejected": -2.628650188446045, "loss": 0.617, "odds_ratio_loss": 0.269598126411438, "rewards/accuracies": 0.875, "rewards/chosen": -0.05097120627760887, "rewards/margins": 0.21189382672309875, "rewards/rejected": -0.2628650367259979, "sft_loss": 0.5097121000289917, "step": 1063 }, { "epoch": 1.5386840202458423, "grad_norm": 2.3924126847399028, "learning_rate": 6.9307987401536694e-06, "logits/chosen": 0.22002063691616058, "logits/rejected": 0.19519548118114471, "logps/chosen": -0.516993522644043, "logps/rejected": -2.4856157302856445, "loss": 0.5586, "odds_ratio_loss": 0.23298753798007965, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05169935151934624, "rewards/margins": 0.19686222076416016, "rewards/rejected": -0.2485615760087967, "sft_loss": 0.516993522644043, "step": 1064 }, { "epoch": 1.5401301518438177, "grad_norm": 2.492354637645805, "learning_rate": 6.928684147031967e-06, "logits/chosen": 0.20798169076442719, "logits/rejected": 0.20337769389152527, "logps/chosen": -0.5130513310432434, "logps/rejected": -1.760741949081421, "loss": 0.6231, "odds_ratio_loss": 0.29917263984680176, "rewards/accuracies": 0.875, "rewards/chosen": -0.0513051338493824, "rewards/margins": 0.12476906925439835, "rewards/rejected": -0.17607420682907104, "sft_loss": 0.5130513310432434, "step": 1065 }, { "epoch": 1.5415762834417932, "grad_norm": 2.8580063984243185, "learning_rate": 6.926567788195683e-06, "logits/chosen": 0.12441393733024597, "logits/rejected": 0.12534169852733612, "logps/chosen": -0.8271850943565369, "logps/rejected": -1.329588770866394, "loss": 0.6965, "odds_ratio_loss": 0.5016704797744751, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08271851390600204, "rewards/margins": 0.05024036392569542, "rewards/rejected": -0.13295888900756836, "sft_loss": 0.8271850943565369, "step": 1066 }, { "epoch": 1.5430224150397687, "grad_norm": 2.088204535572692, "learning_rate": 6.9244496649207814e-06, "logits/chosen": 0.29973694682121277, "logits/rejected": 0.19213856756687164, "logps/chosen": -0.476146399974823, "logps/rejected": -2.555558204650879, "loss": 0.5292, "odds_ratio_loss": 0.2622685432434082, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04761463776230812, "rewards/margins": 0.2079411745071411, "rewards/rejected": -0.25555580854415894, "sft_loss": 0.476146399974823, "step": 1067 }, { "epoch": 1.5444685466377441, "grad_norm": 2.6496817862444964, "learning_rate": 6.922329778484284e-06, "logits/chosen": 0.12427856773138046, "logits/rejected": -0.0361967608332634, "logps/chosen": -0.7265552282333374, "logps/rejected": -1.805168867111206, "loss": 0.6306, "odds_ratio_loss": 0.512405514717102, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07265552878379822, "rewards/margins": 0.10786135494709015, "rewards/rejected": -0.18051689863204956, "sft_loss": 0.7265552282333374, "step": 1068 }, { "epoch": 1.5459146782357194, "grad_norm": 2.3428447174051024, "learning_rate": 6.920208130164279e-06, "logits/chosen": 0.1944482922554016, "logits/rejected": 0.12993068993091583, "logps/chosen": -0.5531301498413086, "logps/rejected": -1.8621083498001099, "loss": 0.6635, "odds_ratio_loss": 0.31757286190986633, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05531301349401474, "rewards/margins": 0.13089781999588013, "rewards/rejected": -0.18621084094047546, "sft_loss": 0.5531301498413086, "step": 1069 }, { "epoch": 1.5473608098336948, "grad_norm": 2.4592542393577137, "learning_rate": 6.9180847212399185e-06, "logits/chosen": 0.11052542924880981, "logits/rejected": 0.12636902928352356, "logps/chosen": -0.5426141023635864, "logps/rejected": -2.2695884704589844, "loss": 0.6373, "odds_ratio_loss": 0.26784971356391907, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05426141619682312, "rewards/margins": 0.17269743978977203, "rewards/rejected": -0.22695885598659515, "sft_loss": 0.5426141023635864, "step": 1070 }, { "epoch": 1.5488069414316703, "grad_norm": 2.171573456977925, "learning_rate": 6.91595955299141e-06, "logits/chosen": 0.19434921443462372, "logits/rejected": 0.1056637316942215, "logps/chosen": -0.5363348722457886, "logps/rejected": -1.7876577377319336, "loss": 0.5617, "odds_ratio_loss": 0.32317835092544556, "rewards/accuracies": 0.8125, "rewards/chosen": -0.05363348871469498, "rewards/margins": 0.12513227760791779, "rewards/rejected": -0.17876575887203217, "sft_loss": 0.5363348722457886, "step": 1071 }, { "epoch": 1.5502530730296455, "grad_norm": 3.1058433998181942, "learning_rate": 6.913832626700027e-06, "logits/chosen": 0.02380383014678955, "logits/rejected": 0.06460568308830261, "logps/chosen": -0.5088622570037842, "logps/rejected": -2.6462490558624268, "loss": 0.5913, "odds_ratio_loss": 0.242878720164299, "rewards/accuracies": 0.875, "rewards/chosen": -0.05088622868061066, "rewards/margins": 0.21373867988586426, "rewards/rejected": -0.2646248936653137, "sft_loss": 0.5088622570037842, "step": 1072 }, { "epoch": 1.5516992046276212, "grad_norm": 2.543394825286057, "learning_rate": 6.911703943648101e-06, "logits/chosen": 0.12439356744289398, "logits/rejected": 0.07908182591199875, "logps/chosen": -0.7427908182144165, "logps/rejected": -1.9743529558181763, "loss": 0.6456, "odds_ratio_loss": 0.44264400005340576, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07427908480167389, "rewards/margins": 0.12315621227025986, "rewards/rejected": -0.19743528962135315, "sft_loss": 0.7427908182144165, "step": 1073 }, { "epoch": 1.5531453362255965, "grad_norm": 2.859886564508446, "learning_rate": 6.909573505119022e-06, "logits/chosen": 0.18945147097110748, "logits/rejected": 0.028836995363235474, "logps/chosen": -0.509463906288147, "logps/rejected": -2.2200417518615723, "loss": 0.5975, "odds_ratio_loss": 0.354769229888916, "rewards/accuracies": 0.8125, "rewards/chosen": -0.05094639211893082, "rewards/margins": 0.171057790517807, "rewards/rejected": -0.22200417518615723, "sft_loss": 0.509463906288147, "step": 1074 }, { "epoch": 1.554591467823572, "grad_norm": 2.4603604127071237, "learning_rate": 6.907441312397242e-06, "logits/chosen": 0.07920490205287933, "logits/rejected": 0.10876142978668213, "logps/chosen": -0.6252812743186951, "logps/rejected": -1.5969560146331787, "loss": 0.6832, "odds_ratio_loss": 0.32766029238700867, "rewards/accuracies": 0.9375, "rewards/chosen": -0.06252812594175339, "rewards/margins": 0.0971674695611, "rewards/rejected": -0.1596955955028534, "sft_loss": 0.6252812743186951, "step": 1075 }, { "epoch": 1.5560375994215474, "grad_norm": 2.718584331639608, "learning_rate": 6.905307366768266e-06, "logits/chosen": 0.09336543083190918, "logits/rejected": 0.10105445981025696, "logps/chosen": -0.7363986968994141, "logps/rejected": -1.756248116493225, "loss": 0.6651, "odds_ratio_loss": 0.4169667363166809, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0736398696899414, "rewards/margins": 0.10198494046926498, "rewards/rejected": -0.175624817609787, "sft_loss": 0.7363986968994141, "step": 1076 }, { "epoch": 1.5574837310195226, "grad_norm": 2.5631268510701535, "learning_rate": 6.903171669518657e-06, "logits/chosen": 0.21750575304031372, "logits/rejected": 0.13447445631027222, "logps/chosen": -0.5367184281349182, "logps/rejected": -3.1012187004089355, "loss": 0.6973, "odds_ratio_loss": 0.31506919860839844, "rewards/accuracies": 0.875, "rewards/chosen": -0.05367184057831764, "rewards/margins": 0.2564500570297241, "rewards/rejected": -0.31012189388275146, "sft_loss": 0.5367184281349182, "step": 1077 }, { "epoch": 1.5589298626174983, "grad_norm": 2.749705246651489, "learning_rate": 6.901034221936037e-06, "logits/chosen": 0.21169041097164154, "logits/rejected": 0.12119154632091522, "logps/chosen": -0.3760773539543152, "logps/rejected": -2.7587168216705322, "loss": 0.4959, "odds_ratio_loss": 0.24344316124916077, "rewards/accuracies": 0.875, "rewards/chosen": -0.03760773688554764, "rewards/margins": 0.23826396465301514, "rewards/rejected": -0.2758716940879822, "sft_loss": 0.3760773539543152, "step": 1078 }, { "epoch": 1.5603759942154736, "grad_norm": 2.376515470109904, "learning_rate": 6.898895025309078e-06, "logits/chosen": 0.25765448808670044, "logits/rejected": 0.2227647304534912, "logps/chosen": -0.5679758191108704, "logps/rejected": -2.064207077026367, "loss": 0.6137, "odds_ratio_loss": 0.2876105308532715, "rewards/accuracies": 0.9375, "rewards/chosen": -0.056797582656145096, "rewards/margins": 0.14962315559387207, "rewards/rejected": -0.20642071962356567, "sft_loss": 0.5679758191108704, "step": 1079 }, { "epoch": 1.561822125813449, "grad_norm": 2.409181380059003, "learning_rate": 6.896754080927515e-06, "logits/chosen": 0.25729578733444214, "logits/rejected": 0.1790364384651184, "logps/chosen": -0.6595759391784668, "logps/rejected": -3.047138214111328, "loss": 0.6201, "odds_ratio_loss": 0.4079166352748871, "rewards/accuracies": 0.75, "rewards/chosen": -0.06595759838819504, "rewards/margins": 0.2387562245130539, "rewards/rejected": -0.30471381545066833, "sft_loss": 0.6595759391784668, "step": 1080 }, { "epoch": 1.5632682574114245, "grad_norm": 2.841839465101232, "learning_rate": 6.894611390082125e-06, "logits/chosen": 0.3018430471420288, "logits/rejected": 0.23976141214370728, "logps/chosen": -0.4250287413597107, "logps/rejected": -2.774780511856079, "loss": 0.6541, "odds_ratio_loss": 0.2535403370857239, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04250287264585495, "rewards/margins": 0.23497521877288818, "rewards/rejected": -0.27747809886932373, "sft_loss": 0.4250287413597107, "step": 1081 }, { "epoch": 1.5647143890093997, "grad_norm": 2.3848242853645742, "learning_rate": 6.892466954064748e-06, "logits/chosen": 0.17624327540397644, "logits/rejected": 0.035079099237918854, "logps/chosen": -0.7374420166015625, "logps/rejected": -2.1065616607666016, "loss": 0.6227, "odds_ratio_loss": 0.5102405548095703, "rewards/accuracies": 0.625, "rewards/chosen": -0.07374420762062073, "rewards/margins": 0.13691195845603943, "rewards/rejected": -0.21065616607666016, "sft_loss": 0.7374420166015625, "step": 1082 }, { "epoch": 1.5661605206073754, "grad_norm": 2.2507360178739555, "learning_rate": 6.890320774168272e-06, "logits/chosen": 0.09431849420070648, "logits/rejected": 0.09603389352560043, "logps/chosen": -0.671610414981842, "logps/rejected": -2.1536717414855957, "loss": 0.6743, "odds_ratio_loss": 0.33693233132362366, "rewards/accuracies": 0.875, "rewards/chosen": -0.06716103851795197, "rewards/margins": 0.14820614457130432, "rewards/rejected": -0.2153671830892563, "sft_loss": 0.671610414981842, "step": 1083 }, { "epoch": 1.5676066522053507, "grad_norm": 4.758544005698199, "learning_rate": 6.8881728516866365e-06, "logits/chosen": 0.06953014433383942, "logits/rejected": 0.10708625614643097, "logps/chosen": -0.5058231353759766, "logps/rejected": -2.6862740516662598, "loss": 0.5229, "odds_ratio_loss": 0.2551036477088928, "rewards/accuracies": 0.875, "rewards/chosen": -0.050582315772771835, "rewards/margins": 0.21804511547088623, "rewards/rejected": -0.268627405166626, "sft_loss": 0.5058231353759766, "step": 1084 }, { "epoch": 1.5690527838033261, "grad_norm": 2.8356454390086867, "learning_rate": 6.886023187914831e-06, "logits/chosen": 0.1127920001745224, "logits/rejected": 0.04135732352733612, "logps/chosen": -0.6294536590576172, "logps/rejected": -2.6099953651428223, "loss": 0.5941, "odds_ratio_loss": 0.38581383228302, "rewards/accuracies": 0.875, "rewards/chosen": -0.06294536590576172, "rewards/margins": 0.19805417954921722, "rewards/rejected": -0.26099956035614014, "sft_loss": 0.6294536590576172, "step": 1085 }, { "epoch": 1.5704989154013016, "grad_norm": 2.409003843426103, "learning_rate": 6.8838717841488995e-06, "logits/chosen": 0.18301844596862793, "logits/rejected": 0.1595679372549057, "logps/chosen": -0.6605107188224792, "logps/rejected": -2.446531295776367, "loss": 0.6903, "odds_ratio_loss": 0.2724457085132599, "rewards/accuracies": 1.0, "rewards/chosen": -0.06605107337236404, "rewards/margins": 0.17860206961631775, "rewards/rejected": -0.2446531355381012, "sft_loss": 0.6605107188224792, "step": 1086 }, { "epoch": 1.5719450469992768, "grad_norm": 2.439223193709018, "learning_rate": 6.881718641685926e-06, "logits/chosen": 0.14739452302455902, "logits/rejected": 0.11320450901985168, "logps/chosen": -0.5645332932472229, "logps/rejected": -3.358351945877075, "loss": 0.5989, "odds_ratio_loss": 0.2953460216522217, "rewards/accuracies": 0.8125, "rewards/chosen": -0.05645333230495453, "rewards/margins": 0.2793818712234497, "rewards/rejected": -0.33583518862724304, "sft_loss": 0.5645332932472229, "step": 1087 }, { "epoch": 1.5733911785972523, "grad_norm": 2.3436482899115942, "learning_rate": 6.879563761824052e-06, "logits/chosen": 0.05163494125008583, "logits/rejected": 0.04702293127775192, "logps/chosen": -0.7239556312561035, "logps/rejected": -2.1285696029663086, "loss": 0.6696, "odds_ratio_loss": 0.37814947962760925, "rewards/accuracies": 0.875, "rewards/chosen": -0.07239556312561035, "rewards/margins": 0.14046140015125275, "rewards/rejected": -0.2128569781780243, "sft_loss": 0.7239556312561035, "step": 1088 }, { "epoch": 1.5748373101952278, "grad_norm": 2.36411335058339, "learning_rate": 6.877407145862461e-06, "logits/chosen": 0.18641921877861023, "logits/rejected": 0.05207761377096176, "logps/chosen": -0.5632745027542114, "logps/rejected": -2.345430612564087, "loss": 0.6053, "odds_ratio_loss": 0.3601348400115967, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0563274510204792, "rewards/margins": 0.1782156229019165, "rewards/rejected": -0.2345430552959442, "sft_loss": 0.5632745027542114, "step": 1089 }, { "epoch": 1.5762834417932032, "grad_norm": 3.1847878741876943, "learning_rate": 6.875248795101386e-06, "logits/chosen": 0.20281922817230225, "logits/rejected": 0.14336425065994263, "logps/chosen": -0.582189679145813, "logps/rejected": -1.8264280557632446, "loss": 0.5495, "odds_ratio_loss": 0.3409522473812103, "rewards/accuracies": 0.8125, "rewards/chosen": -0.05821897089481354, "rewards/margins": 0.12442383170127869, "rewards/rejected": -0.18264278769493103, "sft_loss": 0.582189679145813, "step": 1090 }, { "epoch": 1.5777295733911787, "grad_norm": 3.329674278967477, "learning_rate": 6.873088710842103e-06, "logits/chosen": 0.07660141587257385, "logits/rejected": 0.06340264528989792, "logps/chosen": -0.49803632497787476, "logps/rejected": -2.5776054859161377, "loss": 0.575, "odds_ratio_loss": 0.2812206745147705, "rewards/accuracies": 0.9375, "rewards/chosen": -0.049803633242845535, "rewards/margins": 0.20795691013336182, "rewards/rejected": -0.25776055455207825, "sft_loss": 0.49803632497787476, "step": 1091 }, { "epoch": 1.579175704989154, "grad_norm": 3.3946195116434192, "learning_rate": 6.870926894386936e-06, "logits/chosen": 0.14749659597873688, "logits/rejected": 0.03280261904001236, "logps/chosen": -0.3851412236690521, "logps/rejected": -3.3120474815368652, "loss": 0.5268, "odds_ratio_loss": 0.21843570470809937, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03851412236690521, "rewards/margins": 0.292690634727478, "rewards/rejected": -0.33120474219322205, "sft_loss": 0.3851412236690521, "step": 1092 }, { "epoch": 1.5806218365871294, "grad_norm": 2.263707967091026, "learning_rate": 6.868763347039252e-06, "logits/chosen": 0.12011094391345978, "logits/rejected": 0.0620017871260643, "logps/chosen": -0.5656850934028625, "logps/rejected": -2.0197243690490723, "loss": 0.5298, "odds_ratio_loss": 0.33666715025901794, "rewards/accuracies": 0.8125, "rewards/chosen": -0.056568510830402374, "rewards/margins": 0.1454039216041565, "rewards/rejected": -0.20197243988513947, "sft_loss": 0.5656850934028625, "step": 1093 }, { "epoch": 1.5820679681851049, "grad_norm": 3.0913874233343446, "learning_rate": 6.8665980701034604e-06, "logits/chosen": 0.17975221574306488, "logits/rejected": 0.07172837853431702, "logps/chosen": -0.4119968116283417, "logps/rejected": -2.78731632232666, "loss": 0.6669, "odds_ratio_loss": 0.3397271931171417, "rewards/accuracies": 0.8125, "rewards/chosen": -0.04119968041777611, "rewards/margins": 0.23753196001052856, "rewards/rejected": -0.27873164415359497, "sft_loss": 0.4119968116283417, "step": 1094 }, { "epoch": 1.58351409978308, "grad_norm": 3.449949589857646, "learning_rate": 6.864431064885018e-06, "logits/chosen": 0.10938216745853424, "logits/rejected": 0.12328256666660309, "logps/chosen": -0.724564254283905, "logps/rejected": -2.3423516750335693, "loss": 0.6826, "odds_ratio_loss": 0.3296443819999695, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07245641946792603, "rewards/margins": 0.1617787629365921, "rewards/rejected": -0.23423519730567932, "sft_loss": 0.724564254283905, "step": 1095 }, { "epoch": 1.5849602313810558, "grad_norm": 2.585591290579989, "learning_rate": 6.862262332690416e-06, "logits/chosen": 0.1535097360610962, "logits/rejected": 0.0930045023560524, "logps/chosen": -0.5395622253417969, "logps/rejected": -2.67804217338562, "loss": 0.6028, "odds_ratio_loss": 0.2423020452260971, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05395621806383133, "rewards/margins": 0.2138480246067047, "rewards/rejected": -0.26780423521995544, "sft_loss": 0.5395622253417969, "step": 1096 }, { "epoch": 1.586406362979031, "grad_norm": 2.731714970233483, "learning_rate": 6.860091874827196e-06, "logits/chosen": 0.15666761994361877, "logits/rejected": 0.1512334793806076, "logps/chosen": -0.49038049578666687, "logps/rejected": -2.745338201522827, "loss": 0.6536, "odds_ratio_loss": 0.3623482286930084, "rewards/accuracies": 0.875, "rewards/chosen": -0.04903804883360863, "rewards/margins": 0.22549577057361603, "rewards/rejected": -0.27453380823135376, "sft_loss": 0.49038049578666687, "step": 1097 }, { "epoch": 1.5878524945770065, "grad_norm": 2.586427523762545, "learning_rate": 6.85791969260393e-06, "logits/chosen": 0.18899771571159363, "logits/rejected": 0.17124459147453308, "logps/chosen": -0.5725775957107544, "logps/rejected": -2.258521556854248, "loss": 0.5769, "odds_ratio_loss": 0.34468019008636475, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0572577565908432, "rewards/margins": 0.1685943752527237, "rewards/rejected": -0.2258521318435669, "sft_loss": 0.5725775957107544, "step": 1098 }, { "epoch": 1.589298626174982, "grad_norm": 2.359047347217126, "learning_rate": 6.855745787330238e-06, "logits/chosen": 0.19358870387077332, "logits/rejected": 0.17605549097061157, "logps/chosen": -0.5376795530319214, "logps/rejected": -2.1253416538238525, "loss": 0.5648, "odds_ratio_loss": 0.33196595311164856, "rewards/accuracies": 0.8125, "rewards/chosen": -0.05376795679330826, "rewards/margins": 0.15876621007919312, "rewards/rejected": -0.21253415942192078, "sft_loss": 0.5376795530319214, "step": 1099 }, { "epoch": 1.5907447577729572, "grad_norm": 2.9412329763848697, "learning_rate": 6.853570160316777e-06, "logits/chosen": 0.11650137603282928, "logits/rejected": 0.1291441023349762, "logps/chosen": -0.6347838640213013, "logps/rejected": -1.5328726768493652, "loss": 0.6059, "odds_ratio_loss": 0.38221707940101624, "rewards/accuracies": 0.875, "rewards/chosen": -0.06347838789224625, "rewards/margins": 0.0898088812828064, "rewards/rejected": -0.15328726172447205, "sft_loss": 0.6347838640213013, "step": 1100 }, { "epoch": 1.592190889370933, "grad_norm": 2.3477841646172286, "learning_rate": 6.851392812875236e-06, "logits/chosen": 0.16941529512405396, "logits/rejected": 0.1017862856388092, "logps/chosen": -0.5886316299438477, "logps/rejected": -1.5656369924545288, "loss": 0.6633, "odds_ratio_loss": 0.3969772756099701, "rewards/accuracies": 0.8125, "rewards/chosen": -0.058863162994384766, "rewards/margins": 0.09770055115222931, "rewards/rejected": -0.15656371414661407, "sft_loss": 0.5886316299438477, "step": 1101 }, { "epoch": 1.5936370209689081, "grad_norm": 2.4546950838412056, "learning_rate": 6.84921374631835e-06, "logits/chosen": 0.24961897730827332, "logits/rejected": 0.03723708540201187, "logps/chosen": -0.602834165096283, "logps/rejected": -4.140806674957275, "loss": 0.6096, "odds_ratio_loss": 0.28800731897354126, "rewards/accuracies": 0.875, "rewards/chosen": -0.060283418744802475, "rewards/margins": 0.3537972569465637, "rewards/rejected": -0.4140806794166565, "sft_loss": 0.602834165096283, "step": 1102 }, { "epoch": 1.5950831525668836, "grad_norm": 2.5069717615115414, "learning_rate": 6.847032961959884e-06, "logits/chosen": 0.15000823140144348, "logits/rejected": 0.24704432487487793, "logps/chosen": -0.5027948617935181, "logps/rejected": -1.290198802947998, "loss": 0.6305, "odds_ratio_loss": 0.3084346652030945, "rewards/accuracies": 0.9375, "rewards/chosen": -0.050279490649700165, "rewards/margins": 0.07874040305614471, "rewards/rejected": -0.12901988625526428, "sft_loss": 0.5027948617935181, "step": 1103 }, { "epoch": 1.596529284164859, "grad_norm": 3.4065840087438204, "learning_rate": 6.844850461114643e-06, "logits/chosen": 0.22534215450286865, "logits/rejected": 0.1937633752822876, "logps/chosen": -0.44374150037765503, "logps/rejected": -2.513273000717163, "loss": 0.5895, "odds_ratio_loss": 0.23521637916564941, "rewards/accuracies": 0.875, "rewards/chosen": -0.04437415301799774, "rewards/margins": 0.20695313811302185, "rewards/rejected": -0.2513273060321808, "sft_loss": 0.44374150037765503, "step": 1104 }, { "epoch": 1.5979754157628343, "grad_norm": 2.506183913454602, "learning_rate": 6.842666245098462e-06, "logits/chosen": 0.1503123641014099, "logits/rejected": 0.10985986888408661, "logps/chosen": -0.6794801950454712, "logps/rejected": -2.6882190704345703, "loss": 0.7458, "odds_ratio_loss": 0.2798730134963989, "rewards/accuracies": 0.9375, "rewards/chosen": -0.06794802099466324, "rewards/margins": 0.20087388157844543, "rewards/rejected": -0.26882192492485046, "sft_loss": 0.6794801950454712, "step": 1105 }, { "epoch": 1.59942154736081, "grad_norm": 3.2357897333744905, "learning_rate": 6.840480315228214e-06, "logits/chosen": 0.18373164534568787, "logits/rejected": 0.07605935633182526, "logps/chosen": -0.4748189449310303, "logps/rejected": -3.4461898803710938, "loss": 0.6415, "odds_ratio_loss": 0.3127560615539551, "rewards/accuracies": 0.75, "rewards/chosen": -0.04748189449310303, "rewards/margins": 0.2971371114253998, "rewards/rejected": -0.3446190357208252, "sft_loss": 0.4748189449310303, "step": 1106 }, { "epoch": 1.6008676789587852, "grad_norm": 2.616072647630207, "learning_rate": 6.838292672821806e-06, "logits/chosen": 0.29636842012405396, "logits/rejected": 0.11761577427387238, "logps/chosen": -0.5896111726760864, "logps/rejected": -3.0412216186523438, "loss": 0.6376, "odds_ratio_loss": 0.3512306213378906, "rewards/accuracies": 0.75, "rewards/chosen": -0.05896111577749252, "rewards/margins": 0.2451610416173935, "rewards/rejected": -0.3041221499443054, "sft_loss": 0.5896111726760864, "step": 1107 }, { "epoch": 1.6023138105567607, "grad_norm": 2.846990331140939, "learning_rate": 6.836103319198175e-06, "logits/chosen": 0.20525681972503662, "logits/rejected": 0.1730688512325287, "logps/chosen": -0.68004310131073, "logps/rejected": -1.7271138429641724, "loss": 0.6933, "odds_ratio_loss": 0.38755208253860474, "rewards/accuracies": 0.75, "rewards/chosen": -0.068004310131073, "rewards/margins": 0.10470708459615707, "rewards/rejected": -0.17271138727664948, "sft_loss": 0.68004310131073, "step": 1108 }, { "epoch": 1.6037599421547362, "grad_norm": 2.5244611709953664, "learning_rate": 6.833912255677289e-06, "logits/chosen": 0.05130451172590256, "logits/rejected": 0.19735708832740784, "logps/chosen": -0.6317247152328491, "logps/rejected": -2.590360164642334, "loss": 0.6713, "odds_ratio_loss": 0.25427278876304626, "rewards/accuracies": 1.0, "rewards/chosen": -0.06317247450351715, "rewards/margins": 0.1958635449409485, "rewards/rejected": -0.25903603434562683, "sft_loss": 0.6317247152328491, "step": 1109 }, { "epoch": 1.6052060737527114, "grad_norm": 2.5093017639783732, "learning_rate": 6.8317194835801505e-06, "logits/chosen": 0.16803748905658722, "logits/rejected": 0.15267445147037506, "logps/chosen": -0.4448273181915283, "logps/rejected": -2.7201156616210938, "loss": 0.5178, "odds_ratio_loss": 0.2554229199886322, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04448273032903671, "rewards/margins": 0.22752884030342102, "rewards/rejected": -0.27201157808303833, "sft_loss": 0.4448273181915283, "step": 1110 }, { "epoch": 1.6066522053506869, "grad_norm": 2.686052793049035, "learning_rate": 6.829525004228788e-06, "logits/chosen": 0.0710282251238823, "logits/rejected": 0.016714416444301605, "logps/chosen": -0.4681667983531952, "logps/rejected": -2.477705240249634, "loss": 0.5899, "odds_ratio_loss": 0.32242822647094727, "rewards/accuracies": 0.875, "rewards/chosen": -0.04681668430566788, "rewards/margins": 0.2009538859128952, "rewards/rejected": -0.2477705478668213, "sft_loss": 0.4681667983531952, "step": 1111 }, { "epoch": 1.6080983369486623, "grad_norm": 2.9752745108948813, "learning_rate": 6.827328818946263e-06, "logits/chosen": 0.006719652563333511, "logits/rejected": 0.06880474090576172, "logps/chosen": -0.7231383919715881, "logps/rejected": -2.544945478439331, "loss": 0.6282, "odds_ratio_loss": 0.22746002674102783, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0723138377070427, "rewards/margins": 0.182180717587471, "rewards/rejected": -0.2544945478439331, "sft_loss": 0.7231383919715881, "step": 1112 }, { "epoch": 1.6095444685466378, "grad_norm": 2.1692853490318122, "learning_rate": 6.825130929056662e-06, "logits/chosen": 0.14772772789001465, "logits/rejected": 0.10956289619207382, "logps/chosen": -0.7819063663482666, "logps/rejected": -1.8836016654968262, "loss": 0.7105, "odds_ratio_loss": 0.3727669417858124, "rewards/accuracies": 0.875, "rewards/chosen": -0.0781906396150589, "rewards/margins": 0.11016951501369476, "rewards/rejected": -0.18836016952991486, "sft_loss": 0.7819063663482666, "step": 1113 }, { "epoch": 1.6109906001446133, "grad_norm": 2.489057678072517, "learning_rate": 6.822931335885103e-06, "logits/chosen": 0.1797943413257599, "logits/rejected": 0.13633441925048828, "logps/chosen": -0.5514078140258789, "logps/rejected": -2.322840929031372, "loss": 0.6057, "odds_ratio_loss": 0.2773057520389557, "rewards/accuracies": 0.875, "rewards/chosen": -0.05514077842235565, "rewards/margins": 0.17714330554008484, "rewards/rejected": -0.2322840839624405, "sft_loss": 0.5514078140258789, "step": 1114 }, { "epoch": 1.6124367317425885, "grad_norm": 2.105562163406456, "learning_rate": 6.820730040757728e-06, "logits/chosen": 0.15017375349998474, "logits/rejected": 0.1785978227853775, "logps/chosen": -0.5378727316856384, "logps/rejected": -2.191357135772705, "loss": 0.6226, "odds_ratio_loss": 0.2285824716091156, "rewards/accuracies": 1.0, "rewards/chosen": -0.05378727242350578, "rewards/margins": 0.16534847021102905, "rewards/rejected": -0.21913574635982513, "sft_loss": 0.5378727316856384, "step": 1115 }, { "epoch": 1.613882863340564, "grad_norm": 2.3044143455434924, "learning_rate": 6.818527045001705e-06, "logits/chosen": 0.16752856969833374, "logits/rejected": 0.16827546060085297, "logps/chosen": -0.4314478933811188, "logps/rejected": -2.3377957344055176, "loss": 0.6104, "odds_ratio_loss": 0.2084464430809021, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04314478486776352, "rewards/margins": 0.19063478708267212, "rewards/rejected": -0.23377957940101624, "sft_loss": 0.4314478933811188, "step": 1116 }, { "epoch": 1.6153289949385394, "grad_norm": 2.3550945594014574, "learning_rate": 6.816322349945229e-06, "logits/chosen": 0.14617860317230225, "logits/rejected": 0.09834670275449753, "logps/chosen": -0.5126814246177673, "logps/rejected": -2.7283544540405273, "loss": 0.5873, "odds_ratio_loss": 0.17719094455242157, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05126814544200897, "rewards/margins": 0.221567302942276, "rewards/rejected": -0.2728354334831238, "sft_loss": 0.5126814246177673, "step": 1117 }, { "epoch": 1.6167751265365147, "grad_norm": 2.853329800579316, "learning_rate": 6.81411595691752e-06, "logits/chosen": 0.1595417559146881, "logits/rejected": 0.09003555774688721, "logps/chosen": -0.5111287832260132, "logps/rejected": -4.566333293914795, "loss": 0.5822, "odds_ratio_loss": 0.19126391410827637, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05111287906765938, "rewards/margins": 0.4055204391479492, "rewards/rejected": -0.4566333293914795, "sft_loss": 0.5111287832260132, "step": 1118 }, { "epoch": 1.6182212581344904, "grad_norm": 4.368835734538797, "learning_rate": 6.81190786724882e-06, "logits/chosen": 0.13393574953079224, "logits/rejected": 0.23001378774642944, "logps/chosen": -0.532265305519104, "logps/rejected": -2.221224308013916, "loss": 0.5696, "odds_ratio_loss": 0.27427342534065247, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0532265342772007, "rewards/margins": 0.1688959002494812, "rewards/rejected": -0.2221224159002304, "sft_loss": 0.532265305519104, "step": 1119 }, { "epoch": 1.6196673897324656, "grad_norm": 2.3014882949802646, "learning_rate": 6.809698082270394e-06, "logits/chosen": 0.08405912667512894, "logits/rejected": 0.14118604362010956, "logps/chosen": -0.742213249206543, "logps/rejected": -2.146195888519287, "loss": 0.6572, "odds_ratio_loss": 0.441084086894989, "rewards/accuracies": 0.75, "rewards/chosen": -0.07422132790088654, "rewards/margins": 0.14039826393127441, "rewards/rejected": -0.21461959183216095, "sft_loss": 0.742213249206543, "step": 1120 }, { "epoch": 1.621113521330441, "grad_norm": 4.67097973693336, "learning_rate": 6.80748660331453e-06, "logits/chosen": 0.1618950515985489, "logits/rejected": 0.035197075456380844, "logps/chosen": -0.7067070603370667, "logps/rejected": -1.7793760299682617, "loss": 0.6408, "odds_ratio_loss": 0.4611864686012268, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0706707015633583, "rewards/margins": 0.10726691782474518, "rewards/rejected": -0.17793762683868408, "sft_loss": 0.7067070603370667, "step": 1121 }, { "epoch": 1.6225596529284165, "grad_norm": 2.6910652991942787, "learning_rate": 6.8052734317145356e-06, "logits/chosen": 0.16007086634635925, "logits/rejected": 0.24231232702732086, "logps/chosen": -0.4963790774345398, "logps/rejected": -3.519035816192627, "loss": 0.6319, "odds_ratio_loss": 0.2605344355106354, "rewards/accuracies": 0.875, "rewards/chosen": -0.04963790625333786, "rewards/margins": 0.3022657036781311, "rewards/rejected": -0.35190361738204956, "sft_loss": 0.4963790774345398, "step": 1122 }, { "epoch": 1.6240057845263918, "grad_norm": 2.36256991290105, "learning_rate": 6.803058568804742e-06, "logits/chosen": 0.16033251583576202, "logits/rejected": 0.12260282039642334, "logps/chosen": -0.5150243639945984, "logps/rejected": -2.330399513244629, "loss": 0.4986, "odds_ratio_loss": 0.34891653060913086, "rewards/accuracies": 0.8125, "rewards/chosen": -0.05150243639945984, "rewards/margins": 0.18153752386569977, "rewards/rejected": -0.2330399751663208, "sft_loss": 0.5150243639945984, "step": 1123 }, { "epoch": 1.6254519161243675, "grad_norm": 2.621102035932219, "learning_rate": 6.800842015920496e-06, "logits/chosen": 0.13604751229286194, "logits/rejected": 0.04528624191880226, "logps/chosen": -0.5994237661361694, "logps/rejected": -3.107229709625244, "loss": 0.6421, "odds_ratio_loss": 0.3047102689743042, "rewards/accuracies": 0.875, "rewards/chosen": -0.059942372143268585, "rewards/margins": 0.2507806122303009, "rewards/rejected": -0.3107229471206665, "sft_loss": 0.5994237661361694, "step": 1124 }, { "epoch": 1.6268980477223427, "grad_norm": 3.3189950012529845, "learning_rate": 6.798623774398169e-06, "logits/chosen": 0.2003846913576126, "logits/rejected": 0.11310932040214539, "logps/chosen": -0.5863832831382751, "logps/rejected": -1.5205748081207275, "loss": 0.6413, "odds_ratio_loss": 0.2823806405067444, "rewards/accuracies": 1.0, "rewards/chosen": -0.058638330549001694, "rewards/margins": 0.0934191569685936, "rewards/rejected": -0.1520574986934662, "sft_loss": 0.5863832831382751, "step": 1125 }, { "epoch": 1.6283441793203182, "grad_norm": 2.6471591529472693, "learning_rate": 6.796403845575145e-06, "logits/chosen": 0.2325344830751419, "logits/rejected": 0.1505952775478363, "logps/chosen": -0.40177321434020996, "logps/rejected": -2.4784207344055176, "loss": 0.6045, "odds_ratio_loss": 0.22741401195526123, "rewards/accuracies": 0.9375, "rewards/chosen": -0.040177322924137115, "rewards/margins": 0.20766472816467285, "rewards/rejected": -0.24784202873706818, "sft_loss": 0.40177321434020996, "step": 1126 }, { "epoch": 1.6297903109182936, "grad_norm": 2.475300022674049, "learning_rate": 6.794182230789827e-06, "logits/chosen": 0.16944287717342377, "logits/rejected": 0.20057857036590576, "logps/chosen": -0.5337626338005066, "logps/rejected": -1.8407460451126099, "loss": 0.5948, "odds_ratio_loss": 0.33305466175079346, "rewards/accuracies": 0.875, "rewards/chosen": -0.05337626859545708, "rewards/margins": 0.1306983232498169, "rewards/rejected": -0.18407458066940308, "sft_loss": 0.5337626338005066, "step": 1127 }, { "epoch": 1.6312364425162689, "grad_norm": 2.4279942191938293, "learning_rate": 6.7919589313816355e-06, "logits/chosen": 0.27863115072250366, "logits/rejected": 0.20137888193130493, "logps/chosen": -0.5784075260162354, "logps/rejected": -2.008815288543701, "loss": 0.5877, "odds_ratio_loss": 0.3455614745616913, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05784075707197189, "rewards/margins": 0.14304079115390778, "rewards/rejected": -0.20088155567646027, "sft_loss": 0.5784075260162354, "step": 1128 }, { "epoch": 1.6326825741142446, "grad_norm": 2.486755988814461, "learning_rate": 6.789733948691006e-06, "logits/chosen": 0.12318692356348038, "logits/rejected": 0.09866747260093689, "logps/chosen": -0.6427417397499084, "logps/rejected": -2.290992259979248, "loss": 0.6232, "odds_ratio_loss": 0.3308793306350708, "rewards/accuracies": 0.9375, "rewards/chosen": -0.06427416950464249, "rewards/margins": 0.16482506692409515, "rewards/rejected": -0.22909922897815704, "sft_loss": 0.6427417397499084, "step": 1129 }, { "epoch": 1.6341287057122198, "grad_norm": 8.705640420104292, "learning_rate": 6.787507284059388e-06, "logits/chosen": 0.3377228379249573, "logits/rejected": 0.33408480882644653, "logps/chosen": -0.5667027235031128, "logps/rejected": -1.6456166505813599, "loss": 0.6168, "odds_ratio_loss": 0.39584583044052124, "rewards/accuracies": 0.8125, "rewards/chosen": -0.05667027086019516, "rewards/margins": 0.10789139568805695, "rewards/rejected": -0.1645616590976715, "sft_loss": 0.5667027235031128, "step": 1130 }, { "epoch": 1.6355748373101953, "grad_norm": 2.23403133866239, "learning_rate": 6.785278938829248e-06, "logits/chosen": 0.18635720014572144, "logits/rejected": 0.12744563817977905, "logps/chosen": -0.5257724523544312, "logps/rejected": -3.5002336502075195, "loss": 0.5913, "odds_ratio_loss": 0.22851410508155823, "rewards/accuracies": 0.9375, "rewards/chosen": -0.052577245980501175, "rewards/margins": 0.2974461317062378, "rewards/rejected": -0.3500233590602875, "sft_loss": 0.5257724523544312, "step": 1131 }, { "epoch": 1.6370209689081707, "grad_norm": 2.2838844158071114, "learning_rate": 6.7830489143440625e-06, "logits/chosen": 0.24182364344596863, "logits/rejected": 0.12117785215377808, "logps/chosen": -0.5080547332763672, "logps/rejected": -1.8937900066375732, "loss": 0.6495, "odds_ratio_loss": 0.305306077003479, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0508054718375206, "rewards/margins": 0.1385735273361206, "rewards/rejected": -0.1893789917230606, "sft_loss": 0.5080547332763672, "step": 1132 }, { "epoch": 1.638467100506146, "grad_norm": 2.492923349789528, "learning_rate": 6.78081721194832e-06, "logits/chosen": 0.12716682255268097, "logits/rejected": 0.19468384981155396, "logps/chosen": -0.5700826644897461, "logps/rejected": -2.5006954669952393, "loss": 0.7108, "odds_ratio_loss": 0.24998816847801208, "rewards/accuracies": 0.875, "rewards/chosen": -0.05700826644897461, "rewards/margins": 0.19306129217147827, "rewards/rejected": -0.2500695586204529, "sft_loss": 0.5700826644897461, "step": 1133 }, { "epoch": 1.6399132321041214, "grad_norm": 2.473546132314206, "learning_rate": 6.778583832987524e-06, "logits/chosen": 0.21075762808322906, "logits/rejected": 0.07315226644277573, "logps/chosen": -0.5984592437744141, "logps/rejected": -2.3020665645599365, "loss": 0.6367, "odds_ratio_loss": 0.3468858599662781, "rewards/accuracies": 0.875, "rewards/chosen": -0.059845924377441406, "rewards/margins": 0.1703607439994812, "rewards/rejected": -0.2302066683769226, "sft_loss": 0.5984592437744141, "step": 1134 }, { "epoch": 1.641359363702097, "grad_norm": 3.815784113822362, "learning_rate": 6.776348778808187e-06, "logits/chosen": 0.2276933491230011, "logits/rejected": 0.20840172469615936, "logps/chosen": -0.4322483241558075, "logps/rejected": -2.1296682357788086, "loss": 0.5527, "odds_ratio_loss": 0.18725760281085968, "rewards/accuracies": 1.0, "rewards/chosen": -0.04322483763098717, "rewards/margins": 0.16974200308322906, "rewards/rejected": -0.21296685934066772, "sft_loss": 0.4322483241558075, "step": 1135 }, { "epoch": 1.6428054953000724, "grad_norm": 3.230042856607609, "learning_rate": 6.774112050757831e-06, "logits/chosen": 0.22059807181358337, "logits/rejected": 0.19416289031505585, "logps/chosen": -0.4906231760978699, "logps/rejected": -2.541626214981079, "loss": 0.6078, "odds_ratio_loss": 0.32527580857276917, "rewards/accuracies": 0.8125, "rewards/chosen": -0.04906231909990311, "rewards/margins": 0.20510032773017883, "rewards/rejected": -0.25416263937950134, "sft_loss": 0.4906231760978699, "step": 1136 }, { "epoch": 1.6442516268980478, "grad_norm": 2.507231602313924, "learning_rate": 6.771873650184987e-06, "logits/chosen": 0.2306595742702484, "logits/rejected": 0.17716917395591736, "logps/chosen": -0.6218141317367554, "logps/rejected": -3.662498950958252, "loss": 0.6342, "odds_ratio_loss": 0.29755860567092896, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06218141317367554, "rewards/margins": 0.30406850576400757, "rewards/rejected": -0.3662499189376831, "sft_loss": 0.6218141317367554, "step": 1137 }, { "epoch": 1.645697758496023, "grad_norm": 2.577574144850942, "learning_rate": 6.769633578439196e-06, "logits/chosen": 0.22073982656002045, "logits/rejected": 0.14277009665966034, "logps/chosen": -0.49371635913848877, "logps/rejected": -1.553049087524414, "loss": 0.5629, "odds_ratio_loss": 0.36073967814445496, "rewards/accuracies": 0.875, "rewards/chosen": -0.049371637403964996, "rewards/margins": 0.105933278799057, "rewards/rejected": -0.1553049087524414, "sft_loss": 0.49371635913848877, "step": 1138 }, { "epoch": 1.6471438900939985, "grad_norm": 2.5217495034916904, "learning_rate": 6.767391836871006e-06, "logits/chosen": 0.25824111700057983, "logits/rejected": 0.2234843224287033, "logps/chosen": -0.48958465456962585, "logps/rejected": -2.2131435871124268, "loss": 0.5043, "odds_ratio_loss": 0.2931782007217407, "rewards/accuracies": 0.9375, "rewards/chosen": -0.048958465456962585, "rewards/margins": 0.17235590517520905, "rewards/rejected": -0.22131435573101044, "sft_loss": 0.48958465456962585, "step": 1139 }, { "epoch": 1.648590021691974, "grad_norm": 3.0287506272655373, "learning_rate": 6.76514842683197e-06, "logits/chosen": 0.08717759698629379, "logits/rejected": 0.17295514047145844, "logps/chosen": -0.5925130844116211, "logps/rejected": -2.349668264389038, "loss": 0.6824, "odds_ratio_loss": 0.28350913524627686, "rewards/accuracies": 0.875, "rewards/chosen": -0.05925130844116211, "rewards/margins": 0.17571553587913513, "rewards/rejected": -0.23496684432029724, "sft_loss": 0.5925130844116211, "step": 1140 }, { "epoch": 1.6500361532899492, "grad_norm": 2.4412616506760427, "learning_rate": 6.7629033496746485e-06, "logits/chosen": 0.20098035037517548, "logits/rejected": 0.20969319343566895, "logps/chosen": -0.5476840138435364, "logps/rejected": -2.703965663909912, "loss": 0.6569, "odds_ratio_loss": 0.23726245760917664, "rewards/accuracies": 1.0, "rewards/chosen": -0.0547684021294117, "rewards/margins": 0.21562819182872772, "rewards/rejected": -0.2703965902328491, "sft_loss": 0.5476840138435364, "step": 1141 }, { "epoch": 1.651482284887925, "grad_norm": 3.32524625474085, "learning_rate": 6.760656606752608e-06, "logits/chosen": 0.19986768066883087, "logits/rejected": 0.2439718395471573, "logps/chosen": -0.5939256548881531, "logps/rejected": -3.133114814758301, "loss": 0.6345, "odds_ratio_loss": 0.3041260540485382, "rewards/accuracies": 0.875, "rewards/chosen": -0.05939256399869919, "rewards/margins": 0.25391891598701477, "rewards/rejected": -0.31331151723861694, "sft_loss": 0.5939256548881531, "step": 1142 }, { "epoch": 1.6529284164859002, "grad_norm": 2.9130226961401613, "learning_rate": 6.758408199420418e-06, "logits/chosen": 0.1656503677368164, "logits/rejected": 0.16937926411628723, "logps/chosen": -0.4139644503593445, "logps/rejected": -2.241116762161255, "loss": 0.5212, "odds_ratio_loss": 0.20857828855514526, "rewards/accuracies": 1.0, "rewards/chosen": -0.04139644652605057, "rewards/margins": 0.18271522223949432, "rewards/rejected": -0.2241116613149643, "sft_loss": 0.4139644503593445, "step": 1143 }, { "epoch": 1.6543745480838756, "grad_norm": 3.058127435624459, "learning_rate": 6.75615812903365e-06, "logits/chosen": 0.23680804669857025, "logits/rejected": 0.1664726287126541, "logps/chosen": -0.6967654228210449, "logps/rejected": -1.6403802633285522, "loss": 0.571, "odds_ratio_loss": 0.4093691408634186, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06967654079198837, "rewards/margins": 0.09436149895191193, "rewards/rejected": -0.1640380322933197, "sft_loss": 0.6967654228210449, "step": 1144 }, { "epoch": 1.655820679681851, "grad_norm": 2.1730215190277935, "learning_rate": 6.7539063969488825e-06, "logits/chosen": 0.18492895364761353, "logits/rejected": 0.26141905784606934, "logps/chosen": -0.5463985800743103, "logps/rejected": -1.4454739093780518, "loss": 0.6343, "odds_ratio_loss": 0.34149205684661865, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05463985726237297, "rewards/margins": 0.08990754932165146, "rewards/rejected": -0.14454740285873413, "sft_loss": 0.5463985800743103, "step": 1145 }, { "epoch": 1.6572668112798263, "grad_norm": 3.374659455042283, "learning_rate": 6.75165300452369e-06, "logits/chosen": 0.11994520574808121, "logits/rejected": 0.09264302998781204, "logps/chosen": -0.6896888613700867, "logps/rejected": -2.469841480255127, "loss": 0.5971, "odds_ratio_loss": 0.4484861493110657, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06896888464689255, "rewards/margins": 0.17801526188850403, "rewards/rejected": -0.24698415398597717, "sft_loss": 0.6896888613700867, "step": 1146 }, { "epoch": 1.658712942877802, "grad_norm": 2.7494237048983106, "learning_rate": 6.749397953116654e-06, "logits/chosen": 0.21989181637763977, "logits/rejected": 0.13300833106040955, "logps/chosen": -0.5863887071609497, "logps/rejected": -2.8034584522247314, "loss": 0.5857, "odds_ratio_loss": 0.419039785861969, "rewards/accuracies": 0.875, "rewards/chosen": -0.05863887071609497, "rewards/margins": 0.22170698642730713, "rewards/rejected": -0.2803458273410797, "sft_loss": 0.5863887071609497, "step": 1147 }, { "epoch": 1.6601590744757773, "grad_norm": 2.464174573411112, "learning_rate": 6.747141244087352e-06, "logits/chosen": 0.2038121223449707, "logits/rejected": 0.08812229335308075, "logps/chosen": -0.5297778844833374, "logps/rejected": -2.8625032901763916, "loss": 0.6452, "odds_ratio_loss": 0.19093012809753418, "rewards/accuracies": 0.875, "rewards/chosen": -0.0529777929186821, "rewards/margins": 0.23327253758907318, "rewards/rejected": -0.2862503230571747, "sft_loss": 0.5297778844833374, "step": 1148 }, { "epoch": 1.6616052060737527, "grad_norm": 2.275862629901525, "learning_rate": 6.744882878796362e-06, "logits/chosen": 0.2904704511165619, "logits/rejected": 0.29210197925567627, "logps/chosen": -0.47496497631073, "logps/rejected": -2.142214298248291, "loss": 0.5979, "odds_ratio_loss": 0.3119732737541199, "rewards/accuracies": 0.875, "rewards/chosen": -0.047496501356363297, "rewards/margins": 0.16672493517398834, "rewards/rejected": -0.21422143280506134, "sft_loss": 0.47496497631073, "step": 1149 }, { "epoch": 1.6630513376717282, "grad_norm": 2.4348073492732447, "learning_rate": 6.742622858605262e-06, "logits/chosen": 0.06364156305789948, "logits/rejected": 0.03990757837891579, "logps/chosen": -0.48284998536109924, "logps/rejected": -4.278642654418945, "loss": 0.6012, "odds_ratio_loss": 0.14767947793006897, "rewards/accuracies": 1.0, "rewards/chosen": -0.048285000026226044, "rewards/margins": 0.37957924604415894, "rewards/rejected": -0.42786428332328796, "sft_loss": 0.48284998536109924, "step": 1150 }, { "epoch": 1.6644974692697034, "grad_norm": 2.5895585584395073, "learning_rate": 6.740361184876625e-06, "logits/chosen": 0.1997375339269638, "logits/rejected": 0.17401760816574097, "logps/chosen": -0.4806000888347626, "logps/rejected": -1.8395509719848633, "loss": 0.5594, "odds_ratio_loss": 0.27543121576309204, "rewards/accuracies": 0.875, "rewards/chosen": -0.048060011118650436, "rewards/margins": 0.13589510321617126, "rewards/rejected": -0.1839551031589508, "sft_loss": 0.4806000888347626, "step": 1151 }, { "epoch": 1.6659436008676791, "grad_norm": 4.701884489199648, "learning_rate": 6.738097858974024e-06, "logits/chosen": 0.19600707292556763, "logits/rejected": 0.16965457797050476, "logps/chosen": -0.6757255792617798, "logps/rejected": -1.1500645875930786, "loss": 0.6207, "odds_ratio_loss": 0.6113768815994263, "rewards/accuracies": 0.625, "rewards/chosen": -0.06757256388664246, "rewards/margins": 0.04743390157818794, "rewards/rejected": -0.1150064542889595, "sft_loss": 0.6757255792617798, "step": 1152 }, { "epoch": 1.6673897324656544, "grad_norm": 3.175165956336856, "learning_rate": 6.735832882262026e-06, "logits/chosen": 0.08555848896503448, "logits/rejected": 0.10591746121644974, "logps/chosen": -0.4486284852027893, "logps/rejected": -2.222221612930298, "loss": 0.5803, "odds_ratio_loss": 0.1979960799217224, "rewards/accuracies": 1.0, "rewards/chosen": -0.04486284777522087, "rewards/margins": 0.17735931277275085, "rewards/rejected": -0.22222216427326202, "sft_loss": 0.4486284852027893, "step": 1153 }, { "epoch": 1.6688358640636298, "grad_norm": 2.720757064167482, "learning_rate": 6.733566256106193e-06, "logits/chosen": 0.4870707392692566, "logits/rejected": 0.2844332158565521, "logps/chosen": -0.6252628564834595, "logps/rejected": -2.486476421356201, "loss": 0.6303, "odds_ratio_loss": 0.3258809447288513, "rewards/accuracies": 0.75, "rewards/chosen": -0.06252628564834595, "rewards/margins": 0.1861213743686676, "rewards/rejected": -0.24864766001701355, "sft_loss": 0.6252628564834595, "step": 1154 }, { "epoch": 1.6702819956616053, "grad_norm": 5.187324442141512, "learning_rate": 6.731297981873086e-06, "logits/chosen": 0.1666257381439209, "logits/rejected": 0.1535015106201172, "logps/chosen": -0.6065875887870789, "logps/rejected": -2.051072359085083, "loss": 0.6106, "odds_ratio_loss": 0.43608808517456055, "rewards/accuracies": 0.6875, "rewards/chosen": -0.060658760368824005, "rewards/margins": 0.14444848895072937, "rewards/rejected": -0.20510724186897278, "sft_loss": 0.6065875887870789, "step": 1155 }, { "epoch": 1.6717281272595805, "grad_norm": 2.1684485335228354, "learning_rate": 6.729028060930251e-06, "logits/chosen": 0.358548104763031, "logits/rejected": 0.1484028548002243, "logps/chosen": -0.485321581363678, "logps/rejected": -3.6741294860839844, "loss": 0.589, "odds_ratio_loss": 0.25323089957237244, "rewards/accuracies": 0.875, "rewards/chosen": -0.0485321581363678, "rewards/margins": 0.3188807964324951, "rewards/rejected": -0.3674129545688629, "sft_loss": 0.485321581363678, "step": 1156 }, { "epoch": 1.673174258857556, "grad_norm": 2.46870459613307, "learning_rate": 6.726756494646235e-06, "logits/chosen": 0.16199025511741638, "logits/rejected": 0.1509033590555191, "logps/chosen": -0.6030094027519226, "logps/rejected": -1.4246817827224731, "loss": 0.5609, "odds_ratio_loss": 0.36534902453422546, "rewards/accuracies": 0.875, "rewards/chosen": -0.06030093878507614, "rewards/margins": 0.08216723799705505, "rewards/rejected": -0.1424681842327118, "sft_loss": 0.6030094027519226, "step": 1157 }, { "epoch": 1.6746203904555315, "grad_norm": 2.4726592245004633, "learning_rate": 6.7244832843905725e-06, "logits/chosen": 0.17424005270004272, "logits/rejected": 0.15898647904396057, "logps/chosen": -0.5712554454803467, "logps/rejected": -2.1054673194885254, "loss": 0.5444, "odds_ratio_loss": 0.21559767425060272, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05712554603815079, "rewards/margins": 0.15342120826244354, "rewards/rejected": -0.21054676175117493, "sft_loss": 0.5712554454803467, "step": 1158 }, { "epoch": 1.676066522053507, "grad_norm": 3.5416468577213345, "learning_rate": 6.72220843153379e-06, "logits/chosen": 0.2255643904209137, "logits/rejected": 0.13521726429462433, "logps/chosen": -0.5240362882614136, "logps/rejected": -2.168708324432373, "loss": 0.6036, "odds_ratio_loss": 0.3438939154148102, "rewards/accuracies": 0.8125, "rewards/chosen": -0.052403632551431656, "rewards/margins": 0.16446718573570251, "rewards/rejected": -0.21687081456184387, "sft_loss": 0.5240362882614136, "step": 1159 }, { "epoch": 1.6775126536514824, "grad_norm": 3.254244381492882, "learning_rate": 6.719931937447407e-06, "logits/chosen": 0.3357830047607422, "logits/rejected": 0.259264200925827, "logps/chosen": -0.542752206325531, "logps/rejected": -3.2315144538879395, "loss": 0.707, "odds_ratio_loss": 0.31859856843948364, "rewards/accuracies": 0.875, "rewards/chosen": -0.05427522212266922, "rewards/margins": 0.26887619495391846, "rewards/rejected": -0.3231514096260071, "sft_loss": 0.542752206325531, "step": 1160 }, { "epoch": 1.6789587852494576, "grad_norm": 5.511096848650093, "learning_rate": 6.717653803503928e-06, "logits/chosen": 0.22886249423027039, "logits/rejected": 0.11704693734645844, "logps/chosen": -0.5869529843330383, "logps/rejected": -2.572026252746582, "loss": 0.5864, "odds_ratio_loss": 0.27222052216529846, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05869529768824577, "rewards/margins": 0.19850732386112213, "rewards/rejected": -0.2572026252746582, "sft_loss": 0.5869529843330383, "step": 1161 }, { "epoch": 1.680404916847433, "grad_norm": 2.5036032853106014, "learning_rate": 6.71537403107685e-06, "logits/chosen": 0.15184980630874634, "logits/rejected": 0.1305277943611145, "logps/chosen": -0.5590934157371521, "logps/rejected": -2.7704367637634277, "loss": 0.5799, "odds_ratio_loss": 0.2579389810562134, "rewards/accuracies": 0.875, "rewards/chosen": -0.05590933561325073, "rewards/margins": 0.22113433480262756, "rewards/rejected": -0.2770436704158783, "sft_loss": 0.5590934157371521, "step": 1162 }, { "epoch": 1.6818510484454086, "grad_norm": 2.6005042356521564, "learning_rate": 6.713092621540655e-06, "logits/chosen": 0.28715190291404724, "logits/rejected": 0.11853550374507904, "logps/chosen": -0.7113485932350159, "logps/rejected": -2.0638599395751953, "loss": 0.6124, "odds_ratio_loss": 0.4370325803756714, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07113486528396606, "rewards/margins": 0.13525114953517914, "rewards/rejected": -0.206385999917984, "sft_loss": 0.7113485932350159, "step": 1163 }, { "epoch": 1.6832971800433838, "grad_norm": 2.3848381421994724, "learning_rate": 6.7108095762708136e-06, "logits/chosen": 0.1905509978532791, "logits/rejected": 0.1692703366279602, "logps/chosen": -0.550454318523407, "logps/rejected": -4.018769264221191, "loss": 0.5935, "odds_ratio_loss": 0.2899460792541504, "rewards/accuracies": 0.9375, "rewards/chosen": -0.055045437067747116, "rewards/margins": 0.3468315005302429, "rewards/rejected": -0.40187692642211914, "sft_loss": 0.550454318523407, "step": 1164 }, { "epoch": 1.6847433116413595, "grad_norm": 2.7571571011622105, "learning_rate": 6.708524896643782e-06, "logits/chosen": 0.188361257314682, "logits/rejected": 0.0712515264749527, "logps/chosen": -0.5353450775146484, "logps/rejected": -2.714508533477783, "loss": 0.634, "odds_ratio_loss": 0.28433114290237427, "rewards/accuracies": 0.9375, "rewards/chosen": -0.053534507751464844, "rewards/margins": 0.2179163247346878, "rewards/rejected": -0.27145081758499146, "sft_loss": 0.5353450775146484, "step": 1165 }, { "epoch": 1.6861894432393347, "grad_norm": 2.3320851121282438, "learning_rate": 6.706238584037003e-06, "logits/chosen": 0.15112724900245667, "logits/rejected": 0.12381379306316376, "logps/chosen": -0.4415856897830963, "logps/rejected": -3.0355641841888428, "loss": 0.5931, "odds_ratio_loss": 0.18794691562652588, "rewards/accuracies": 1.0, "rewards/chosen": -0.04415857046842575, "rewards/margins": 0.25939783453941345, "rewards/rejected": -0.3035564124584198, "sft_loss": 0.4415856897830963, "step": 1166 }, { "epoch": 1.6876355748373102, "grad_norm": 2.506114425073133, "learning_rate": 6.703950639828903e-06, "logits/chosen": 0.2930244207382202, "logits/rejected": 0.28212878108024597, "logps/chosen": -0.5601966977119446, "logps/rejected": -2.481189489364624, "loss": 0.627, "odds_ratio_loss": 0.3866164982318878, "rewards/accuracies": 0.75, "rewards/chosen": -0.05601967126131058, "rewards/margins": 0.19209925830364227, "rewards/rejected": -0.24811893701553345, "sft_loss": 0.5601966977119446, "step": 1167 }, { "epoch": 1.6890817064352857, "grad_norm": 2.3644437186432645, "learning_rate": 6.701661065398892e-06, "logits/chosen": 0.17681562900543213, "logits/rejected": 0.14664041996002197, "logps/chosen": -0.5699477195739746, "logps/rejected": -1.8158708810806274, "loss": 0.6535, "odds_ratio_loss": 0.3121950030326843, "rewards/accuracies": 0.8125, "rewards/chosen": -0.05699477344751358, "rewards/margins": 0.12459231168031693, "rewards/rejected": -0.1815870851278305, "sft_loss": 0.5699477195739746, "step": 1168 }, { "epoch": 1.690527838033261, "grad_norm": 4.165250675002018, "learning_rate": 6.699369862127362e-06, "logits/chosen": 0.26053327322006226, "logits/rejected": 0.1782192438840866, "logps/chosen": -0.5276187658309937, "logps/rejected": -2.0793633460998535, "loss": 0.4889, "odds_ratio_loss": 0.39804819226264954, "rewards/accuracies": 0.6875, "rewards/chosen": -0.05276188254356384, "rewards/margins": 0.15517446398735046, "rewards/rejected": -0.2079363316297531, "sft_loss": 0.5276187658309937, "step": 1169 }, { "epoch": 1.6919739696312366, "grad_norm": 2.7717506547854094, "learning_rate": 6.69707703139569e-06, "logits/chosen": 0.3379908800125122, "logits/rejected": 0.1853041648864746, "logps/chosen": -0.6339027881622314, "logps/rejected": -2.206711530685425, "loss": 0.6035, "odds_ratio_loss": 0.3447216749191284, "rewards/accuracies": 0.875, "rewards/chosen": -0.06339027732610703, "rewards/margins": 0.15728086233139038, "rewards/rejected": -0.220671147108078, "sft_loss": 0.6339027881622314, "step": 1170 }, { "epoch": 1.6934201012292118, "grad_norm": 2.876125870078734, "learning_rate": 6.694782574586229e-06, "logits/chosen": 0.14711743593215942, "logits/rejected": 0.1626882553100586, "logps/chosen": -0.43197083473205566, "logps/rejected": -3.0947062969207764, "loss": 0.6135, "odds_ratio_loss": 0.23374246060848236, "rewards/accuracies": 0.875, "rewards/chosen": -0.04319708049297333, "rewards/margins": 0.266273558139801, "rewards/rejected": -0.30947065353393555, "sft_loss": 0.43197083473205566, "step": 1171 }, { "epoch": 1.6948662328271873, "grad_norm": 3.1337461461664975, "learning_rate": 6.692486493082317e-06, "logits/chosen": 0.19641858339309692, "logits/rejected": 0.11144060641527176, "logps/chosen": -0.6843913793563843, "logps/rejected": -2.083221435546875, "loss": 0.6497, "odds_ratio_loss": 0.38456079363822937, "rewards/accuracies": 0.75, "rewards/chosen": -0.06843913346529007, "rewards/margins": 0.13988301157951355, "rewards/rejected": -0.20832215249538422, "sft_loss": 0.6843913793563843, "step": 1172 }, { "epoch": 1.6963123644251628, "grad_norm": 3.552236706747717, "learning_rate": 6.690188788268273e-06, "logits/chosen": 0.21632656455039978, "logits/rejected": 0.115239217877388, "logps/chosen": -0.6581906676292419, "logps/rejected": -1.6244016885757446, "loss": 0.66, "odds_ratio_loss": 0.46846601366996765, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06581906229257584, "rewards/margins": 0.09662109613418579, "rewards/rejected": -0.16244015097618103, "sft_loss": 0.6581906676292419, "step": 1173 }, { "epoch": 1.697758496023138, "grad_norm": 2.557436742334174, "learning_rate": 6.687889461529386e-06, "logits/chosen": 0.2814568877220154, "logits/rejected": 0.18137472867965698, "logps/chosen": -0.5548322200775146, "logps/rejected": -2.5669074058532715, "loss": 0.6203, "odds_ratio_loss": 0.26014259457588196, "rewards/accuracies": 0.9375, "rewards/chosen": -0.055483222007751465, "rewards/margins": 0.20120754837989807, "rewards/rejected": -0.25669077038764954, "sft_loss": 0.5548322200775146, "step": 1174 }, { "epoch": 1.6992046276211137, "grad_norm": 3.737744307891954, "learning_rate": 6.685588514251934e-06, "logits/chosen": 0.1883585900068283, "logits/rejected": 0.11404610425233841, "logps/chosen": -0.5273705124855042, "logps/rejected": -2.0571155548095703, "loss": 0.6086, "odds_ratio_loss": 0.21898625791072845, "rewards/accuracies": 0.9375, "rewards/chosen": -0.052737053483724594, "rewards/margins": 0.15297451615333557, "rewards/rejected": -0.20571157336235046, "sft_loss": 0.5273705124855042, "step": 1175 }, { "epoch": 1.700650759219089, "grad_norm": 2.514643381538017, "learning_rate": 6.6832859478231635e-06, "logits/chosen": 0.1959303468465805, "logits/rejected": 0.08744394034147263, "logps/chosen": -0.5547116994857788, "logps/rejected": -2.5926737785339355, "loss": 0.6747, "odds_ratio_loss": 0.31549614667892456, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05547117441892624, "rewards/margins": 0.20379620790481567, "rewards/rejected": -0.2592673897743225, "sft_loss": 0.5547116994857788, "step": 1176 }, { "epoch": 1.7020968908170644, "grad_norm": 2.6698203453703973, "learning_rate": 6.680981763631303e-06, "logits/chosen": 0.12705998122692108, "logits/rejected": 0.08648968487977982, "logps/chosen": -0.5197083353996277, "logps/rejected": -3.4079229831695557, "loss": 0.5662, "odds_ratio_loss": 0.25399935245513916, "rewards/accuracies": 0.875, "rewards/chosen": -0.051970839500427246, "rewards/margins": 0.2888214588165283, "rewards/rejected": -0.34079229831695557, "sft_loss": 0.5197083353996277, "step": 1177 }, { "epoch": 1.7035430224150399, "grad_norm": 4.278657923028944, "learning_rate": 6.6786759630655505e-06, "logits/chosen": -0.007065432146191597, "logits/rejected": 0.04697701707482338, "logps/chosen": -0.5839925408363342, "logps/rejected": -2.6068286895751953, "loss": 0.6194, "odds_ratio_loss": 0.24275022745132446, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0583992525935173, "rewards/margins": 0.20228362083435059, "rewards/rejected": -0.2606828808784485, "sft_loss": 0.5839925408363342, "step": 1178 }, { "epoch": 1.704989154013015, "grad_norm": 2.3362879167020836, "learning_rate": 6.676368547516084e-06, "logits/chosen": 0.22390501201152802, "logits/rejected": 0.16717907786369324, "logps/chosen": -0.5782084465026855, "logps/rejected": -2.7273147106170654, "loss": 0.6032, "odds_ratio_loss": 0.24216079711914062, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05782084912061691, "rewards/margins": 0.214910626411438, "rewards/rejected": -0.2727314829826355, "sft_loss": 0.5782084465026855, "step": 1179 }, { "epoch": 1.7064352856109906, "grad_norm": 3.2234699582467075, "learning_rate": 6.674059518374052e-06, "logits/chosen": 0.13133449852466583, "logits/rejected": 0.12285110354423523, "logps/chosen": -0.5841748118400574, "logps/rejected": -1.5109751224517822, "loss": 0.6516, "odds_ratio_loss": 0.34580686688423157, "rewards/accuracies": 0.875, "rewards/chosen": -0.058417484164237976, "rewards/margins": 0.09268002957105637, "rewards/rejected": -0.15109750628471375, "sft_loss": 0.5841748118400574, "step": 1180 }, { "epoch": 1.707881417208966, "grad_norm": 3.8578564678205725, "learning_rate": 6.671748877031577e-06, "logits/chosen": 0.17093250155448914, "logits/rejected": 0.1525716334581375, "logps/chosen": -0.7296780347824097, "logps/rejected": -3.0437965393066406, "loss": 0.6778, "odds_ratio_loss": 0.31801509857177734, "rewards/accuracies": 0.875, "rewards/chosen": -0.07296780496835709, "rewards/margins": 0.23141184449195862, "rewards/rejected": -0.3043796420097351, "sft_loss": 0.7296780347824097, "step": 1181 }, { "epoch": 1.7093275488069413, "grad_norm": 2.8550366156495226, "learning_rate": 6.6694366248817544e-06, "logits/chosen": 0.07962992042303085, "logits/rejected": 0.10816764831542969, "logps/chosen": -0.5177446007728577, "logps/rejected": -3.558342456817627, "loss": 0.6032, "odds_ratio_loss": 0.2068100869655609, "rewards/accuracies": 0.875, "rewards/chosen": -0.051774464547634125, "rewards/margins": 0.304059773683548, "rewards/rejected": -0.3558342158794403, "sft_loss": 0.5177446007728577, "step": 1182 }, { "epoch": 1.710773680404917, "grad_norm": 2.7311591788385896, "learning_rate": 6.667122763318648e-06, "logits/chosen": 0.23495501279830933, "logits/rejected": 0.21480917930603027, "logps/chosen": -0.5341576337814331, "logps/rejected": -2.850303888320923, "loss": 0.6349, "odds_ratio_loss": 0.2973088324069977, "rewards/accuracies": 0.875, "rewards/chosen": -0.05341576412320137, "rewards/margins": 0.2316146194934845, "rewards/rejected": -0.2850303649902344, "sft_loss": 0.5341576337814331, "step": 1183 }, { "epoch": 1.7122198120028922, "grad_norm": 2.3366324777160488, "learning_rate": 6.664807293737293e-06, "logits/chosen": 0.22794067859649658, "logits/rejected": 0.20764786005020142, "logps/chosen": -0.5206387639045715, "logps/rejected": -2.451005458831787, "loss": 0.6059, "odds_ratio_loss": 0.226045623421669, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05206387862563133, "rewards/margins": 0.19303664565086365, "rewards/rejected": -0.24510052800178528, "sft_loss": 0.5206387639045715, "step": 1184 }, { "epoch": 1.7136659436008677, "grad_norm": 2.9344919439096446, "learning_rate": 6.662490217533697e-06, "logits/chosen": 0.18199431896209717, "logits/rejected": 0.08673392236232758, "logps/chosen": -0.6532351970672607, "logps/rejected": -3.143551826477051, "loss": 0.6625, "odds_ratio_loss": 0.4036434292793274, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06532351672649384, "rewards/margins": 0.2490316778421402, "rewards/rejected": -0.31435519456863403, "sft_loss": 0.6532351970672607, "step": 1185 }, { "epoch": 1.7151120751988431, "grad_norm": 3.033539871570757, "learning_rate": 6.660171536104833e-06, "logits/chosen": 0.13163414597511292, "logits/rejected": 0.13655593991279602, "logps/chosen": -0.6226808428764343, "logps/rejected": -3.9001150131225586, "loss": 0.5912, "odds_ratio_loss": 0.14715641736984253, "rewards/accuracies": 0.9375, "rewards/chosen": -0.06226808577775955, "rewards/margins": 0.32774341106414795, "rewards/rejected": -0.3900114893913269, "sft_loss": 0.6226808428764343, "step": 1186 }, { "epoch": 1.7165582067968184, "grad_norm": 2.935200053719907, "learning_rate": 6.6578512508486425e-06, "logits/chosen": 0.17753368616104126, "logits/rejected": 0.0922786295413971, "logps/chosen": -0.5860269069671631, "logps/rejected": -1.86979341506958, "loss": 0.5886, "odds_ratio_loss": 0.35702764987945557, "rewards/accuracies": 0.875, "rewards/chosen": -0.05860269442200661, "rewards/margins": 0.12837664783000946, "rewards/rejected": -0.18697935342788696, "sft_loss": 0.5860269069671631, "step": 1187 }, { "epoch": 1.718004338394794, "grad_norm": 2.983899984553299, "learning_rate": 6.655529363164033e-06, "logits/chosen": 0.236445814371109, "logits/rejected": 0.2024727165699005, "logps/chosen": -0.5069154500961304, "logps/rejected": -2.2717783451080322, "loss": 0.7185, "odds_ratio_loss": 0.2174503207206726, "rewards/accuracies": 0.9375, "rewards/chosen": -0.050691548734903336, "rewards/margins": 0.1764862984418869, "rewards/rejected": -0.22717782855033875, "sft_loss": 0.5069154500961304, "step": 1188 }, { "epoch": 1.7194504699927693, "grad_norm": 3.5801911999906015, "learning_rate": 6.653205874450881e-06, "logits/chosen": 0.2863970398902893, "logits/rejected": 0.28775012493133545, "logps/chosen": -0.3487379848957062, "logps/rejected": -2.2851011753082275, "loss": 0.5455, "odds_ratio_loss": 0.23209895193576813, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03487379476428032, "rewards/margins": 0.19363632798194885, "rewards/rejected": -0.22851011157035828, "sft_loss": 0.3487379848957062, "step": 1189 }, { "epoch": 1.7208966015907448, "grad_norm": 2.235271967911421, "learning_rate": 6.650880786110026e-06, "logits/chosen": 0.13730399310588837, "logits/rejected": 0.10951994359493256, "logps/chosen": -0.6758534908294678, "logps/rejected": -2.470485210418701, "loss": 0.6652, "odds_ratio_loss": 0.3634181320667267, "rewards/accuracies": 0.75, "rewards/chosen": -0.06758534908294678, "rewards/margins": 0.17946317791938782, "rewards/rejected": -0.2470485270023346, "sft_loss": 0.6758534908294678, "step": 1190 }, { "epoch": 1.7223427331887202, "grad_norm": 2.135760086462582, "learning_rate": 6.6485540995432715e-06, "logits/chosen": 0.20422452688217163, "logits/rejected": 0.14295119047164917, "logps/chosen": -0.6017346978187561, "logps/rejected": -1.865221619606018, "loss": 0.5539, "odds_ratio_loss": 0.22437524795532227, "rewards/accuracies": 1.0, "rewards/chosen": -0.06017346307635307, "rewards/margins": 0.12634870409965515, "rewards/rejected": -0.18652215600013733, "sft_loss": 0.6017346978187561, "step": 1191 }, { "epoch": 1.7237888647866955, "grad_norm": 2.445534213047608, "learning_rate": 6.6462258161533854e-06, "logits/chosen": 0.18580156564712524, "logits/rejected": 0.10908015072345734, "logps/chosen": -0.5201189517974854, "logps/rejected": -2.300976276397705, "loss": 0.5537, "odds_ratio_loss": 0.29101625084877014, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05201189965009689, "rewards/margins": 0.17808572947978973, "rewards/rejected": -0.23009763658046722, "sft_loss": 0.5201189517974854, "step": 1192 }, { "epoch": 1.7252349963846711, "grad_norm": 2.7089892319390536, "learning_rate": 6.6438959373440995e-06, "logits/chosen": 0.10451790690422058, "logits/rejected": 0.152940571308136, "logps/chosen": -0.445254385471344, "logps/rejected": -2.340458393096924, "loss": 0.5893, "odds_ratio_loss": 0.17854472994804382, "rewards/accuracies": 1.0, "rewards/chosen": -0.04452543705701828, "rewards/margins": 0.18952038884162903, "rewards/rejected": -0.2340458333492279, "sft_loss": 0.445254385471344, "step": 1193 }, { "epoch": 1.7266811279826464, "grad_norm": 3.341107839803183, "learning_rate": 6.641564464520107e-06, "logits/chosen": 0.08841860294342041, "logits/rejected": 0.07997505366802216, "logps/chosen": -0.9160062074661255, "logps/rejected": -2.7940661907196045, "loss": 0.6897, "odds_ratio_loss": 0.35329318046569824, "rewards/accuracies": 0.8125, "rewards/chosen": -0.09160062670707703, "rewards/margins": 0.18780598044395447, "rewards/rejected": -0.2794066369533539, "sft_loss": 0.9160062074661255, "step": 1194 }, { "epoch": 1.7281272595806219, "grad_norm": 2.9141882593293986, "learning_rate": 6.6392313990870606e-06, "logits/chosen": 0.058398887515068054, "logits/rejected": 0.1468515694141388, "logps/chosen": -0.6274632215499878, "logps/rejected": -1.6033984422683716, "loss": 0.6933, "odds_ratio_loss": 0.3369959592819214, "rewards/accuracies": 0.875, "rewards/chosen": -0.0627463236451149, "rewards/margins": 0.0975935310125351, "rewards/rejected": -0.1603398323059082, "sft_loss": 0.6274632215499878, "step": 1195 }, { "epoch": 1.7295733911785973, "grad_norm": 3.9876324584086067, "learning_rate": 6.636896742451573e-06, "logits/chosen": 0.139909565448761, "logits/rejected": 0.08156219869852066, "logps/chosen": -0.6069450378417969, "logps/rejected": -2.3972322940826416, "loss": 0.6371, "odds_ratio_loss": 0.25904297828674316, "rewards/accuracies": 0.875, "rewards/chosen": -0.06069450080394745, "rewards/margins": 0.1790287345647812, "rewards/rejected": -0.23972323536872864, "sft_loss": 0.6069450378417969, "step": 1196 }, { "epoch": 1.7310195227765726, "grad_norm": 3.4932750748545254, "learning_rate": 6.634560496021219e-06, "logits/chosen": 0.17661643028259277, "logits/rejected": 0.10911494493484497, "logps/chosen": -0.524027943611145, "logps/rejected": -1.6556886434555054, "loss": 0.6024, "odds_ratio_loss": 0.2998625636100769, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0524027943611145, "rewards/margins": 0.11316606402397156, "rewards/rejected": -0.16556887328624725, "sft_loss": 0.524027943611145, "step": 1197 }, { "epoch": 1.7324656543745482, "grad_norm": 2.3743801625118564, "learning_rate": 6.632222661204529e-06, "logits/chosen": 0.11895874887704849, "logits/rejected": 0.021969705820083618, "logps/chosen": -0.45106062293052673, "logps/rejected": -2.416294574737549, "loss": 0.5822, "odds_ratio_loss": 0.26725825667381287, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04510606452822685, "rewards/margins": 0.19652342796325684, "rewards/rejected": -0.24162951111793518, "sft_loss": 0.45106062293052673, "step": 1198 }, { "epoch": 1.7339117859725235, "grad_norm": 4.233318357941509, "learning_rate": 6.629883239410995e-06, "logits/chosen": 0.3369359076023102, "logits/rejected": 0.1930331587791443, "logps/chosen": -0.5083121061325073, "logps/rejected": -2.8376667499542236, "loss": 0.6137, "odds_ratio_loss": 0.2584281861782074, "rewards/accuracies": 1.0, "rewards/chosen": -0.05083121359348297, "rewards/margins": 0.23293545842170715, "rewards/rejected": -0.28376662731170654, "sft_loss": 0.5083121061325073, "step": 1199 }, { "epoch": 1.735357917570499, "grad_norm": 3.619796068905745, "learning_rate": 6.62754223205106e-06, "logits/chosen": 0.15451756119728088, "logits/rejected": 0.08340193331241608, "logps/chosen": -0.6563261151313782, "logps/rejected": -2.759770631790161, "loss": 0.6097, "odds_ratio_loss": 0.2600904107093811, "rewards/accuracies": 1.0, "rewards/chosen": -0.06563261151313782, "rewards/margins": 0.21034446358680725, "rewards/rejected": -0.27597707509994507, "sft_loss": 0.6563261151313782, "step": 1200 }, { "epoch": 1.7368040491684744, "grad_norm": 3.0571929060012524, "learning_rate": 6.625199640536127e-06, "logits/chosen": 0.20956231653690338, "logits/rejected": 0.13567481935024261, "logps/chosen": -0.5605679154396057, "logps/rejected": -2.7481789588928223, "loss": 0.5964, "odds_ratio_loss": 0.3873436748981476, "rewards/accuracies": 0.8125, "rewards/chosen": -0.05605679005384445, "rewards/margins": 0.2187611162662506, "rewards/rejected": -0.27481788396835327, "sft_loss": 0.5605679154396057, "step": 1201 }, { "epoch": 1.7382501807664497, "grad_norm": 2.9617368021465893, "learning_rate": 6.622855466278554e-06, "logits/chosen": 0.20532143115997314, "logits/rejected": 0.1692730188369751, "logps/chosen": -0.4614868462085724, "logps/rejected": -3.6775882244110107, "loss": 0.5834, "odds_ratio_loss": 0.18730753660202026, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04614868387579918, "rewards/margins": 0.32161015272140503, "rewards/rejected": -0.3677588403224945, "sft_loss": 0.4614868462085724, "step": 1202 }, { "epoch": 1.7396963123644251, "grad_norm": 2.5667355858388703, "learning_rate": 6.620509710691653e-06, "logits/chosen": 0.23696063458919525, "logits/rejected": 0.16394846141338348, "logps/chosen": -0.657241702079773, "logps/rejected": -2.8840408325195312, "loss": 0.7009, "odds_ratio_loss": 0.24847227334976196, "rewards/accuracies": 1.0, "rewards/chosen": -0.06572417169809341, "rewards/margins": 0.22267991304397583, "rewards/rejected": -0.28840407729148865, "sft_loss": 0.657241702079773, "step": 1203 }, { "epoch": 1.7411424439624006, "grad_norm": 2.907884753323847, "learning_rate": 6.618162375189687e-06, "logits/chosen": 0.054735876619815826, "logits/rejected": 0.011121401563286781, "logps/chosen": -0.5988122820854187, "logps/rejected": -1.7168229818344116, "loss": 0.5172, "odds_ratio_loss": 0.361611008644104, "rewards/accuracies": 0.75, "rewards/chosen": -0.05988123267889023, "rewards/margins": 0.11180106550455093, "rewards/rejected": -0.17168228328227997, "sft_loss": 0.5988122820854187, "step": 1204 }, { "epoch": 1.7425885755603758, "grad_norm": 2.744172673068131, "learning_rate": 6.615813461187873e-06, "logits/chosen": 0.1834808886051178, "logits/rejected": 0.13022924959659576, "logps/chosen": -0.5365532636642456, "logps/rejected": -1.4286985397338867, "loss": 0.5744, "odds_ratio_loss": 0.36767858266830444, "rewards/accuracies": 0.875, "rewards/chosen": -0.05365532636642456, "rewards/margins": 0.08921454101800919, "rewards/rejected": -0.14286985993385315, "sft_loss": 0.5365532636642456, "step": 1205 }, { "epoch": 1.7440347071583515, "grad_norm": 2.4576341795144443, "learning_rate": 6.6134629701023805e-06, "logits/chosen": 0.15828804671764374, "logits/rejected": 0.03511790186166763, "logps/chosen": -0.8053830862045288, "logps/rejected": -1.2731399536132812, "loss": 0.6786, "odds_ratio_loss": 0.47556188702583313, "rewards/accuracies": 0.75, "rewards/chosen": -0.0805383026599884, "rewards/margins": 0.0467756986618042, "rewards/rejected": -0.1273140013217926, "sft_loss": 0.8053830862045288, "step": 1206 }, { "epoch": 1.7454808387563268, "grad_norm": 2.7041504951107727, "learning_rate": 6.611110903350331e-06, "logits/chosen": 0.2982510030269623, "logits/rejected": 0.17835605144500732, "logps/chosen": -0.5107125043869019, "logps/rejected": -2.9214985370635986, "loss": 0.5927, "odds_ratio_loss": 0.44440388679504395, "rewards/accuracies": 0.6875, "rewards/chosen": -0.051071248948574066, "rewards/margins": 0.24107861518859863, "rewards/rejected": -0.2921498715877533, "sft_loss": 0.5107125043869019, "step": 1207 }, { "epoch": 1.7469269703543022, "grad_norm": 2.8209460199314704, "learning_rate": 6.608757262349792e-06, "logits/chosen": 0.06634090840816498, "logits/rejected": 0.13024091720581055, "logps/chosen": -0.6781923770904541, "logps/rejected": -1.6680748462677002, "loss": 0.6659, "odds_ratio_loss": 0.3676949739456177, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06781924515962601, "rewards/margins": 0.09898824989795685, "rewards/rejected": -0.16680750250816345, "sft_loss": 0.6781923770904541, "step": 1208 }, { "epoch": 1.7483731019522777, "grad_norm": 5.057921935760771, "learning_rate": 6.606402048519783e-06, "logits/chosen": 0.03448230028152466, "logits/rejected": 0.09395559132099152, "logps/chosen": -0.6829879283905029, "logps/rejected": -2.3134713172912598, "loss": 0.5778, "odds_ratio_loss": 0.3463206887245178, "rewards/accuracies": 0.875, "rewards/chosen": -0.06829879432916641, "rewards/margins": 0.16304834187030792, "rewards/rejected": -0.23134714365005493, "sft_loss": 0.6829879283905029, "step": 1209 }, { "epoch": 1.749819233550253, "grad_norm": 2.5596838109500104, "learning_rate": 6.604045263280273e-06, "logits/chosen": 0.12063422054052353, "logits/rejected": 0.09866572916507721, "logps/chosen": -0.7138389348983765, "logps/rejected": -2.4420652389526367, "loss": 0.6281, "odds_ratio_loss": 0.36622458696365356, "rewards/accuracies": 0.75, "rewards/chosen": -0.07138389348983765, "rewards/margins": 0.17282262444496155, "rewards/rejected": -0.2442065179347992, "sft_loss": 0.7138389348983765, "step": 1210 }, { "epoch": 1.7512653651482286, "grad_norm": 3.407537712357489, "learning_rate": 6.601686908052176e-06, "logits/chosen": 0.10398457199335098, "logits/rejected": 0.04173152521252632, "logps/chosen": -0.580781102180481, "logps/rejected": -2.662278175354004, "loss": 0.5826, "odds_ratio_loss": 0.3597915470600128, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0580781027674675, "rewards/margins": 0.208149716258049, "rewards/rejected": -0.2662278413772583, "sft_loss": 0.580781102180481, "step": 1211 }, { "epoch": 1.7527114967462039, "grad_norm": 2.6514455590176254, "learning_rate": 6.599326984257351e-06, "logits/chosen": 0.19883418083190918, "logits/rejected": 0.188827246427536, "logps/chosen": -0.5180667638778687, "logps/rejected": -2.4371070861816406, "loss": 0.575, "odds_ratio_loss": 0.32393041253089905, "rewards/accuracies": 0.8125, "rewards/chosen": -0.05180668085813522, "rewards/margins": 0.19190403819084167, "rewards/rejected": -0.2437107264995575, "sft_loss": 0.5180667638778687, "step": 1212 }, { "epoch": 1.7541576283441793, "grad_norm": 2.418453470557911, "learning_rate": 6.596965493318606e-06, "logits/chosen": 0.1408243477344513, "logits/rejected": 0.1452597677707672, "logps/chosen": -0.5804046988487244, "logps/rejected": -1.683534860610962, "loss": 0.6441, "odds_ratio_loss": 0.35653501749038696, "rewards/accuracies": 0.8125, "rewards/chosen": -0.058040473610162735, "rewards/margins": 0.11031302809715271, "rewards/rejected": -0.16835349798202515, "sft_loss": 0.5804046988487244, "step": 1213 }, { "epoch": 1.7556037599421548, "grad_norm": 2.8936534377819982, "learning_rate": 6.594602436659695e-06, "logits/chosen": 0.29035723209381104, "logits/rejected": 0.13355976343154907, "logps/chosen": -0.3346521854400635, "logps/rejected": -4.254483699798584, "loss": 0.5783, "odds_ratio_loss": 0.22945931553840637, "rewards/accuracies": 0.875, "rewards/chosen": -0.033465221524238586, "rewards/margins": 0.3919832110404968, "rewards/rejected": -0.4254484176635742, "sft_loss": 0.3346521854400635, "step": 1214 }, { "epoch": 1.75704989154013, "grad_norm": 2.1734385221887362, "learning_rate": 6.592237815705309e-06, "logits/chosen": 0.2960423529148102, "logits/rejected": 0.13452109694480896, "logps/chosen": -0.42409226298332214, "logps/rejected": -1.8257529735565186, "loss": 0.5485, "odds_ratio_loss": 0.3245893716812134, "rewards/accuracies": 0.75, "rewards/chosen": -0.042409226298332214, "rewards/margins": 0.14016607403755188, "rewards/rejected": -0.1825753152370453, "sft_loss": 0.42409226298332214, "step": 1215 }, { "epoch": 1.7584960231381057, "grad_norm": 2.4855257032936935, "learning_rate": 6.589871631881092e-06, "logits/chosen": 0.20803236961364746, "logits/rejected": 0.21837976574897766, "logps/chosen": -0.6946249008178711, "logps/rejected": -1.5261632204055786, "loss": 0.5728, "odds_ratio_loss": 0.3592318594455719, "rewards/accuracies": 0.9375, "rewards/chosen": -0.06946249306201935, "rewards/margins": 0.08315382152795792, "rewards/rejected": -0.15261633694171906, "sft_loss": 0.6946249008178711, "step": 1216 }, { "epoch": 1.759942154736081, "grad_norm": 2.447862716688948, "learning_rate": 6.587503886613619e-06, "logits/chosen": 0.23648259043693542, "logits/rejected": 0.26576292514801025, "logps/chosen": -0.5033541321754456, "logps/rejected": -2.461588144302368, "loss": 0.6131, "odds_ratio_loss": 0.20790676772594452, "rewards/accuracies": 0.9375, "rewards/chosen": -0.050335414707660675, "rewards/margins": 0.19582340121269226, "rewards/rejected": -0.24615880846977234, "sft_loss": 0.5033541321754456, "step": 1217 }, { "epoch": 1.7613882863340564, "grad_norm": 3.1404167129848672, "learning_rate": 6.585134581330419e-06, "logits/chosen": 0.1707933247089386, "logits/rejected": 0.17154398560523987, "logps/chosen": -0.5736587047576904, "logps/rejected": -2.2020938396453857, "loss": 0.6267, "odds_ratio_loss": 0.22983211278915405, "rewards/accuracies": 0.875, "rewards/chosen": -0.05736587569117546, "rewards/margins": 0.1628435105085373, "rewards/rejected": -0.22020938992500305, "sft_loss": 0.5736587047576904, "step": 1218 }, { "epoch": 1.7628344179320319, "grad_norm": 2.398668193202477, "learning_rate": 6.58276371745995e-06, "logits/chosen": 0.20608599483966827, "logits/rejected": 0.1394387036561966, "logps/chosen": -0.4304734766483307, "logps/rejected": -2.14104962348938, "loss": 0.5412, "odds_ratio_loss": 0.319057434797287, "rewards/accuracies": 0.875, "rewards/chosen": -0.04304734617471695, "rewards/margins": 0.17105762660503387, "rewards/rejected": -0.21410496532917023, "sft_loss": 0.4304734766483307, "step": 1219 }, { "epoch": 1.7642805495300071, "grad_norm": 2.6458674834216316, "learning_rate": 6.580391296431617e-06, "logits/chosen": 0.20387428998947144, "logits/rejected": 0.04644143208861351, "logps/chosen": -0.5995526313781738, "logps/rejected": -1.390120506286621, "loss": 0.662, "odds_ratio_loss": 0.32246649265289307, "rewards/accuracies": 1.0, "rewards/chosen": -0.05995526909828186, "rewards/margins": 0.07905678451061249, "rewards/rejected": -0.13901205360889435, "sft_loss": 0.5995526313781738, "step": 1220 }, { "epoch": 1.7657266811279828, "grad_norm": 2.30503961685648, "learning_rate": 6.578017319675762e-06, "logits/chosen": 0.10869728773832321, "logits/rejected": 0.039436303079128265, "logps/chosen": -0.6155337691307068, "logps/rejected": -2.0125937461853027, "loss": 0.6198, "odds_ratio_loss": 0.3492213785648346, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06155337765812874, "rewards/margins": 0.13970601558685303, "rewards/rejected": -0.20125938951969147, "sft_loss": 0.6155337691307068, "step": 1221 }, { "epoch": 1.767172812725958, "grad_norm": 2.893877860045242, "learning_rate": 6.5756417886236625e-06, "logits/chosen": 0.16021108627319336, "logits/rejected": 0.0937863439321518, "logps/chosen": -0.6446142792701721, "logps/rejected": -1.6120359897613525, "loss": 0.6553, "odds_ratio_loss": 0.35699713230133057, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06446143239736557, "rewards/margins": 0.09674215316772461, "rewards/rejected": -0.16120359301567078, "sft_loss": 0.6446142792701721, "step": 1222 }, { "epoch": 1.7686189443239335, "grad_norm": 3.040730778349726, "learning_rate": 6.573264704707537e-06, "logits/chosen": 0.15876901149749756, "logits/rejected": 0.04192943871021271, "logps/chosen": -0.5770949125289917, "logps/rejected": -2.769012451171875, "loss": 0.5405, "odds_ratio_loss": 0.22492100298404694, "rewards/accuracies": 1.0, "rewards/chosen": -0.05770949274301529, "rewards/margins": 0.2191917598247528, "rewards/rejected": -0.2769012451171875, "sft_loss": 0.5770949125289917, "step": 1223 }, { "epoch": 1.770065075921909, "grad_norm": 2.3456343253279677, "learning_rate": 6.570886069360535e-06, "logits/chosen": 0.1821683943271637, "logits/rejected": 0.19179019331932068, "logps/chosen": -0.567207396030426, "logps/rejected": -1.8817782402038574, "loss": 0.583, "odds_ratio_loss": 0.2690046429634094, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05672074481844902, "rewards/margins": 0.13145706057548523, "rewards/rejected": -0.18817779421806335, "sft_loss": 0.567207396030426, "step": 1224 }, { "epoch": 1.7715112075198842, "grad_norm": 2.683339111567326, "learning_rate": 6.568505884016749e-06, "logits/chosen": 0.13196396827697754, "logits/rejected": 0.0811544805765152, "logps/chosen": -0.527839183807373, "logps/rejected": -2.4704227447509766, "loss": 0.5611, "odds_ratio_loss": 0.3083456754684448, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05278391391038895, "rewards/margins": 0.19425836205482483, "rewards/rejected": -0.24704228341579437, "sft_loss": 0.527839183807373, "step": 1225 }, { "epoch": 1.7729573391178597, "grad_norm": 2.136034880015943, "learning_rate": 6.566124150111197e-06, "logits/chosen": 0.08882991969585419, "logits/rejected": 0.0734814703464508, "logps/chosen": -0.6307113170623779, "logps/rejected": -1.749953031539917, "loss": 0.5799, "odds_ratio_loss": 0.2958693206310272, "rewards/accuracies": 0.9375, "rewards/chosen": -0.06307113170623779, "rewards/margins": 0.1119241863489151, "rewards/rejected": -0.1749953031539917, "sft_loss": 0.6307113170623779, "step": 1226 }, { "epoch": 1.7744034707158352, "grad_norm": 2.617436250062296, "learning_rate": 6.56374086907984e-06, "logits/chosen": 0.2238091230392456, "logits/rejected": 0.1482618749141693, "logps/chosen": -0.38627102971076965, "logps/rejected": -4.749565601348877, "loss": 0.5507, "odds_ratio_loss": 0.20005583763122559, "rewards/accuracies": 0.9375, "rewards/chosen": -0.038627102971076965, "rewards/margins": 0.4363294839859009, "rewards/rejected": -0.47495660185813904, "sft_loss": 0.38627102971076965, "step": 1227 }, { "epoch": 1.7758496023138104, "grad_norm": 2.529425804367851, "learning_rate": 6.561356042359563e-06, "logits/chosen": 0.1380154937505722, "logits/rejected": 0.11311140656471252, "logps/chosen": -0.7418189644813538, "logps/rejected": -1.8061516284942627, "loss": 0.6726, "odds_ratio_loss": 0.4076150953769684, "rewards/accuracies": 0.875, "rewards/chosen": -0.07418189942836761, "rewards/margins": 0.10643327236175537, "rewards/rejected": -0.18061517179012299, "sft_loss": 0.7418189644813538, "step": 1228 }, { "epoch": 1.777295733911786, "grad_norm": 2.3388201206907326, "learning_rate": 6.558969671388189e-06, "logits/chosen": 0.35083937644958496, "logits/rejected": 0.23928841948509216, "logps/chosen": -0.47196346521377563, "logps/rejected": -2.6147122383117676, "loss": 0.6066, "odds_ratio_loss": 0.3234768211841583, "rewards/accuracies": 0.875, "rewards/chosen": -0.04719635099172592, "rewards/margins": 0.21427488327026367, "rewards/rejected": -0.2614712119102478, "sft_loss": 0.47196346521377563, "step": 1229 }, { "epoch": 1.7787418655097613, "grad_norm": 2.396729475929612, "learning_rate": 6.55658175760447e-06, "logits/chosen": 0.20722784101963043, "logits/rejected": 0.18240517377853394, "logps/chosen": -0.35860294103622437, "logps/rejected": -1.4275670051574707, "loss": 0.5159, "odds_ratio_loss": 0.22660349309444427, "rewards/accuracies": 1.0, "rewards/chosen": -0.03586029261350632, "rewards/margins": 0.10689640045166016, "rewards/rejected": -0.14275670051574707, "sft_loss": 0.35860294103622437, "step": 1230 }, { "epoch": 1.7801879971077368, "grad_norm": 2.088719841010738, "learning_rate": 6.554192302448087e-06, "logits/chosen": 0.18596374988555908, "logits/rejected": 0.09559060633182526, "logps/chosen": -0.5696253776550293, "logps/rejected": -2.7666914463043213, "loss": 0.5767, "odds_ratio_loss": 0.27393674850463867, "rewards/accuracies": 0.875, "rewards/chosen": -0.05696253478527069, "rewards/margins": 0.21970660984516144, "rewards/rejected": -0.27666914463043213, "sft_loss": 0.5696253776550293, "step": 1231 }, { "epoch": 1.7816341287057122, "grad_norm": 3.5145170842289533, "learning_rate": 6.551801307359653e-06, "logits/chosen": 0.2097698450088501, "logits/rejected": 0.1429743766784668, "logps/chosen": -0.5215898156166077, "logps/rejected": -2.5869057178497314, "loss": 0.6702, "odds_ratio_loss": 0.2010580152273178, "rewards/accuracies": 1.0, "rewards/chosen": -0.05215898156166077, "rewards/margins": 0.2065315991640091, "rewards/rejected": -0.25869059562683105, "sft_loss": 0.5215898156166077, "step": 1232 }, { "epoch": 1.7830802603036875, "grad_norm": 2.300180857740447, "learning_rate": 6.549408773780706e-06, "logits/chosen": 0.16524893045425415, "logits/rejected": 0.10692907124757767, "logps/chosen": -0.552807629108429, "logps/rejected": -3.7471842765808105, "loss": 0.5814, "odds_ratio_loss": 0.2621445655822754, "rewards/accuracies": 0.875, "rewards/chosen": -0.055280767381191254, "rewards/margins": 0.3194376826286316, "rewards/rejected": -0.37471842765808105, "sft_loss": 0.552807629108429, "step": 1233 }, { "epoch": 1.7845263919016632, "grad_norm": 3.1641439654511587, "learning_rate": 6.5470147031537134e-06, "logits/chosen": 0.11416902393102646, "logits/rejected": 0.1004326269030571, "logps/chosen": -0.6147681474685669, "logps/rejected": -3.4372403621673584, "loss": 0.6441, "odds_ratio_loss": 0.20263680815696716, "rewards/accuracies": 1.0, "rewards/chosen": -0.06147681921720505, "rewards/margins": 0.28224724531173706, "rewards/rejected": -0.3437240421772003, "sft_loss": 0.6147681474685669, "step": 1234 }, { "epoch": 1.7859725234996384, "grad_norm": 2.4720750541905154, "learning_rate": 6.544619096922071e-06, "logits/chosen": 0.07047523558139801, "logits/rejected": 0.19488292932510376, "logps/chosen": -0.6933853030204773, "logps/rejected": -1.8614749908447266, "loss": 0.7145, "odds_ratio_loss": 0.314864844083786, "rewards/accuracies": 0.875, "rewards/chosen": -0.06933853030204773, "rewards/margins": 0.1168089509010315, "rewards/rejected": -0.18614749610424042, "sft_loss": 0.6933853030204773, "step": 1235 }, { "epoch": 1.7874186550976139, "grad_norm": 2.6188623851621187, "learning_rate": 6.542221956530099e-06, "logits/chosen": 0.3162124752998352, "logits/rejected": 0.10680659115314484, "logps/chosen": -0.819428563117981, "logps/rejected": -2.5189719200134277, "loss": 0.7024, "odds_ratio_loss": 0.48061349987983704, "rewards/accuracies": 0.625, "rewards/chosen": -0.0819428488612175, "rewards/margins": 0.1699543297290802, "rewards/rejected": -0.2518971860408783, "sft_loss": 0.819428563117981, "step": 1236 }, { "epoch": 1.7888647866955893, "grad_norm": 2.2671401247198686, "learning_rate": 6.539823283423041e-06, "logits/chosen": 0.1904761642217636, "logits/rejected": 0.22632457315921783, "logps/chosen": -0.7415924072265625, "logps/rejected": -2.621068000793457, "loss": 0.5859, "odds_ratio_loss": 0.3327312469482422, "rewards/accuracies": 0.875, "rewards/chosen": -0.07415923476219177, "rewards/margins": 0.1879475712776184, "rewards/rejected": -0.2621068060398102, "sft_loss": 0.7415924072265625, "step": 1237 }, { "epoch": 1.7903109182935646, "grad_norm": 2.466767010915428, "learning_rate": 6.537423079047064e-06, "logits/chosen": 0.25809118151664734, "logits/rejected": 0.24010270833969116, "logps/chosen": -0.5368831157684326, "logps/rejected": -2.134767770767212, "loss": 0.5884, "odds_ratio_loss": 0.36311042308807373, "rewards/accuracies": 0.75, "rewards/chosen": -0.05368831753730774, "rewards/margins": 0.15978845953941345, "rewards/rejected": -0.2134767770767212, "sft_loss": 0.5368831157684326, "step": 1238 }, { "epoch": 1.7917570498915403, "grad_norm": 2.6957973304314486, "learning_rate": 6.5350213448492645e-06, "logits/chosen": 0.10721335560083389, "logits/rejected": 0.15869364142417908, "logps/chosen": -0.532486081123352, "logps/rejected": -1.3220763206481934, "loss": 0.5876, "odds_ratio_loss": 0.4261801838874817, "rewards/accuracies": 0.8125, "rewards/chosen": -0.053248606622219086, "rewards/margins": 0.07895903289318085, "rewards/rejected": -0.13220764696598053, "sft_loss": 0.532486081123352, "step": 1239 }, { "epoch": 1.7932031814895155, "grad_norm": 2.4573630337591537, "learning_rate": 6.532618082277654e-06, "logits/chosen": 0.20036396384239197, "logits/rejected": 0.13425952196121216, "logps/chosen": -0.4425850212574005, "logps/rejected": -2.856372833251953, "loss": 0.5854, "odds_ratio_loss": 0.1853070706129074, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04425850138068199, "rewards/margins": 0.2413787692785263, "rewards/rejected": -0.2856372594833374, "sft_loss": 0.4425850212574005, "step": 1240 }, { "epoch": 1.794649313087491, "grad_norm": 2.7306661642453274, "learning_rate": 6.5302132927811695e-06, "logits/chosen": 0.1338634490966797, "logits/rejected": 0.06804355978965759, "logps/chosen": -0.7215875387191772, "logps/rejected": -1.886422872543335, "loss": 0.646, "odds_ratio_loss": 0.38540273904800415, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07215875387191772, "rewards/margins": 0.11648353934288025, "rewards/rejected": -0.18864230811595917, "sft_loss": 0.7215875387191772, "step": 1241 }, { "epoch": 1.7960954446854664, "grad_norm": 2.728754891284883, "learning_rate": 6.527806977809667e-06, "logits/chosen": 0.030140642076730728, "logits/rejected": 0.05486735701560974, "logps/chosen": -0.7998731136322021, "logps/rejected": -1.3197684288024902, "loss": 0.659, "odds_ratio_loss": 0.4492250084877014, "rewards/accuracies": 0.875, "rewards/chosen": -0.07998731732368469, "rewards/margins": 0.051989540457725525, "rewards/rejected": -0.13197685778141022, "sft_loss": 0.7998731136322021, "step": 1242 }, { "epoch": 1.7975415762834417, "grad_norm": 2.9607327160654076, "learning_rate": 6.525399138813923e-06, "logits/chosen": 0.13286061584949493, "logits/rejected": 0.1681133359670639, "logps/chosen": -0.7185732126235962, "logps/rejected": -1.5053967237472534, "loss": 0.7056, "odds_ratio_loss": 0.6407934427261353, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07185731083154678, "rewards/margins": 0.07868236303329468, "rewards/rejected": -0.15053966641426086, "sft_loss": 0.7185732126235962, "step": 1243 }, { "epoch": 1.7989877078814174, "grad_norm": 2.3600545000552464, "learning_rate": 6.522989777245632e-06, "logits/chosen": 0.29583603143692017, "logits/rejected": 0.11987383663654327, "logps/chosen": -0.46510180830955505, "logps/rejected": -1.4625095129013062, "loss": 0.6672, "odds_ratio_loss": 0.403626024723053, "rewards/accuracies": 0.8125, "rewards/chosen": -0.046510182321071625, "rewards/margins": 0.09974077343940735, "rewards/rejected": -0.14625096321105957, "sft_loss": 0.46510180830955505, "step": 1244 }, { "epoch": 1.8004338394793926, "grad_norm": 2.551051522105615, "learning_rate": 6.5205788945574084e-06, "logits/chosen": 0.0593152791261673, "logits/rejected": 0.020416993647813797, "logps/chosen": -0.5379363894462585, "logps/rejected": -2.7240047454833984, "loss": 0.5762, "odds_ratio_loss": 0.3037707209587097, "rewards/accuracies": 0.875, "rewards/chosen": -0.053793638944625854, "rewards/margins": 0.2186068594455719, "rewards/rejected": -0.27240049839019775, "sft_loss": 0.5379363894462585, "step": 1245 }, { "epoch": 1.801879971077368, "grad_norm": 2.8134198982380667, "learning_rate": 6.518166492202781e-06, "logits/chosen": 0.09442038834095001, "logits/rejected": 0.13767805695533752, "logps/chosen": -0.6870551109313965, "logps/rejected": -2.2993569374084473, "loss": 0.7367, "odds_ratio_loss": 0.3380257189273834, "rewards/accuracies": 0.9375, "rewards/chosen": -0.06870551407337189, "rewards/margins": 0.161230206489563, "rewards/rejected": -0.22993570566177368, "sft_loss": 0.6870551109313965, "step": 1246 }, { "epoch": 1.8033261026753435, "grad_norm": 2.7343970380232827, "learning_rate": 6.5157525716361975e-06, "logits/chosen": 0.06337403506040573, "logits/rejected": 0.032632552087306976, "logps/chosen": -0.7269030809402466, "logps/rejected": -1.3690733909606934, "loss": 0.6109, "odds_ratio_loss": 0.4972846508026123, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07269031554460526, "rewards/margins": 0.06421701610088348, "rewards/rejected": -0.13690733909606934, "sft_loss": 0.7269030809402466, "step": 1247 }, { "epoch": 1.8047722342733188, "grad_norm": 3.0777568617671682, "learning_rate": 6.513337134313019e-06, "logits/chosen": 0.14764419198036194, "logits/rejected": 0.12649638950824738, "logps/chosen": -0.5623830556869507, "logps/rejected": -2.9645442962646484, "loss": 0.5848, "odds_ratio_loss": 0.21953721344470978, "rewards/accuracies": 1.0, "rewards/chosen": -0.05623830854892731, "rewards/margins": 0.24021612107753754, "rewards/rejected": -0.29645445942878723, "sft_loss": 0.5623830556869507, "step": 1248 }, { "epoch": 1.8062183658712943, "grad_norm": 2.732656690768169, "learning_rate": 6.5109201816895204e-06, "logits/chosen": 0.07473330199718475, "logits/rejected": 0.08126173168420792, "logps/chosen": -0.6446163058280945, "logps/rejected": -2.1125130653381348, "loss": 0.6244, "odds_ratio_loss": 0.3653605580329895, "rewards/accuracies": 0.875, "rewards/chosen": -0.06446163356304169, "rewards/margins": 0.14678969979286194, "rewards/rejected": -0.21125133335590363, "sft_loss": 0.6446163058280945, "step": 1249 }, { "epoch": 1.8076644974692697, "grad_norm": 2.6894663583372616, "learning_rate": 6.508501715222895e-06, "logits/chosen": 0.1402525007724762, "logits/rejected": 0.057519227266311646, "logps/chosen": -0.5445053577423096, "logps/rejected": -2.487497568130493, "loss": 0.6504, "odds_ratio_loss": 0.2824597656726837, "rewards/accuracies": 0.875, "rewards/chosen": -0.054450541734695435, "rewards/margins": 0.19429922103881836, "rewards/rejected": -0.2487497478723526, "sft_loss": 0.5445053577423096, "step": 1250 }, { "epoch": 1.809110629067245, "grad_norm": 2.860737934069698, "learning_rate": 6.506081736371241e-06, "logits/chosen": 0.12336266785860062, "logits/rejected": 0.1148252934217453, "logps/chosen": -0.5299996137619019, "logps/rejected": -2.73490047454834, "loss": 0.5286, "odds_ratio_loss": 0.2534593641757965, "rewards/accuracies": 0.875, "rewards/chosen": -0.052999965846538544, "rewards/margins": 0.22049006819725037, "rewards/rejected": -0.2734900414943695, "sft_loss": 0.5299996137619019, "step": 1251 }, { "epoch": 1.8105567606652206, "grad_norm": 2.0169373424978363, "learning_rate": 6.503660246593574e-06, "logits/chosen": 0.10713335871696472, "logits/rejected": 0.052206408232450485, "logps/chosen": -0.6295415759086609, "logps/rejected": -2.7591824531555176, "loss": 0.5845, "odds_ratio_loss": 0.279104620218277, "rewards/accuracies": 0.9375, "rewards/chosen": -0.06295415759086609, "rewards/margins": 0.21296407282352448, "rewards/rejected": -0.27591824531555176, "sft_loss": 0.6295415759086609, "step": 1252 }, { "epoch": 1.8120028922631959, "grad_norm": 3.6466169903218963, "learning_rate": 6.50123724734982e-06, "logits/chosen": 0.2897685766220093, "logits/rejected": 0.18411767482757568, "logps/chosen": -0.4855603575706482, "logps/rejected": -3.1457629203796387, "loss": 0.5398, "odds_ratio_loss": 0.24788373708724976, "rewards/accuracies": 0.875, "rewards/chosen": -0.04855603724718094, "rewards/margins": 0.2660202383995056, "rewards/rejected": -0.31457629799842834, "sft_loss": 0.4855603575706482, "step": 1253 }, { "epoch": 1.8134490238611713, "grad_norm": 2.810165738303995, "learning_rate": 6.498812740100815e-06, "logits/chosen": 0.08351315557956696, "logits/rejected": 0.05155708268284798, "logps/chosen": -0.7639514207839966, "logps/rejected": -1.6764659881591797, "loss": 0.714, "odds_ratio_loss": 0.48868927359580994, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07639515399932861, "rewards/margins": 0.09125145524740219, "rewards/rejected": -0.1676466017961502, "sft_loss": 0.7639514207839966, "step": 1254 }, { "epoch": 1.8148951554591468, "grad_norm": 4.447263370216222, "learning_rate": 6.496386726308301e-06, "logits/chosen": 0.21318885684013367, "logits/rejected": 0.1973818689584732, "logps/chosen": -0.6246731877326965, "logps/rejected": -2.5932018756866455, "loss": 0.618, "odds_ratio_loss": 0.3143559396266937, "rewards/accuracies": 0.75, "rewards/chosen": -0.06246732175350189, "rewards/margins": 0.1968528926372528, "rewards/rejected": -0.2593201994895935, "sft_loss": 0.6246731877326965, "step": 1255 }, { "epoch": 1.816341287057122, "grad_norm": 2.1529265886576345, "learning_rate": 6.493959207434934e-06, "logits/chosen": 0.19203035533428192, "logits/rejected": 0.2155546247959137, "logps/chosen": -0.6295595765113831, "logps/rejected": -2.2532200813293457, "loss": 0.691, "odds_ratio_loss": 0.2799103856086731, "rewards/accuracies": 0.9375, "rewards/chosen": -0.06295596063137054, "rewards/margins": 0.16236604750156403, "rewards/rejected": -0.22532200813293457, "sft_loss": 0.6295595765113831, "step": 1256 }, { "epoch": 1.8177874186550977, "grad_norm": 2.483176547201568, "learning_rate": 6.491530184944272e-06, "logits/chosen": 0.18721802532672882, "logits/rejected": 0.003281711135059595, "logps/chosen": -0.4738430380821228, "logps/rejected": -4.528861045837402, "loss": 0.5752, "odds_ratio_loss": 0.24636805057525635, "rewards/accuracies": 0.875, "rewards/chosen": -0.04738430306315422, "rewards/margins": 0.4055017828941345, "rewards/rejected": -0.45288610458374023, "sft_loss": 0.4738430380821228, "step": 1257 }, { "epoch": 1.819233550253073, "grad_norm": 2.4614873082947146, "learning_rate": 6.48909966030078e-06, "logits/chosen": 0.20001399517059326, "logits/rejected": 0.04840739816427231, "logps/chosen": -0.7521265745162964, "logps/rejected": -2.608405828475952, "loss": 0.5783, "odds_ratio_loss": 0.3967435956001282, "rewards/accuracies": 0.875, "rewards/chosen": -0.07521265745162964, "rewards/margins": 0.18562793731689453, "rewards/rejected": -0.26084059476852417, "sft_loss": 0.7521265745162964, "step": 1258 }, { "epoch": 1.8206796818510484, "grad_norm": 2.989994442994001, "learning_rate": 6.4866676349698334e-06, "logits/chosen": 0.23923185467720032, "logits/rejected": 0.3120484948158264, "logps/chosen": -0.4351702630519867, "logps/rejected": -1.782483458518982, "loss": 0.5075, "odds_ratio_loss": 0.2519921362400055, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04351702705025673, "rewards/margins": 0.13473132252693176, "rewards/rejected": -0.1782483607530594, "sft_loss": 0.4351702630519867, "step": 1259 }, { "epoch": 1.822125813449024, "grad_norm": 2.9573254382883682, "learning_rate": 6.484234110417709e-06, "logits/chosen": 0.16368578374385834, "logits/rejected": 0.11058539897203445, "logps/chosen": -0.7165206074714661, "logps/rejected": -1.5497503280639648, "loss": 0.7133, "odds_ratio_loss": 0.4126344323158264, "rewards/accuracies": 0.9375, "rewards/chosen": -0.07165206223726273, "rewards/margins": 0.08332297950983047, "rewards/rejected": -0.1549750417470932, "sft_loss": 0.7165206074714661, "step": 1260 }, { "epoch": 1.8235719450469992, "grad_norm": 2.9020896841357358, "learning_rate": 6.481799088111588e-06, "logits/chosen": 0.22353379428386688, "logits/rejected": 0.06521797180175781, "logps/chosen": -0.6017415523529053, "logps/rejected": -2.608680248260498, "loss": 0.5805, "odds_ratio_loss": 0.3434939980506897, "rewards/accuracies": 0.75, "rewards/chosen": -0.060174159705638885, "rewards/margins": 0.20069387555122375, "rewards/rejected": -0.26086804270744324, "sft_loss": 0.6017415523529053, "step": 1261 }, { "epoch": 1.8250180766449748, "grad_norm": 2.4314449265663725, "learning_rate": 6.4793625695195525e-06, "logits/chosen": 0.1931626796722412, "logits/rejected": 0.15090645849704742, "logps/chosen": -0.5738415122032166, "logps/rejected": -2.4022316932678223, "loss": 0.6279, "odds_ratio_loss": 0.21190491318702698, "rewards/accuracies": 0.9375, "rewards/chosen": -0.057384148240089417, "rewards/margins": 0.18283900618553162, "rewards/rejected": -0.24022316932678223, "sft_loss": 0.5738415122032166, "step": 1262 }, { "epoch": 1.82646420824295, "grad_norm": 2.6562218026914826, "learning_rate": 6.476924556110589e-06, "logits/chosen": 0.13250023126602173, "logits/rejected": 0.06583386659622192, "logps/chosen": -0.6148098707199097, "logps/rejected": -2.1790497303009033, "loss": 0.5669, "odds_ratio_loss": 0.25489309430122375, "rewards/accuracies": 0.9375, "rewards/chosen": -0.061480991542339325, "rewards/margins": 0.15642398595809937, "rewards/rejected": -0.2179049700498581, "sft_loss": 0.6148098707199097, "step": 1263 }, { "epoch": 1.8279103398409255, "grad_norm": 2.863752051638278, "learning_rate": 6.474485049354587e-06, "logits/chosen": 0.08306652307510376, "logits/rejected": 0.08208741247653961, "logps/chosen": -0.5236082673072815, "logps/rejected": -3.5160677433013916, "loss": 0.636, "odds_ratio_loss": 0.21715307235717773, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05236082896590233, "rewards/margins": 0.2992459535598755, "rewards/rejected": -0.3516067862510681, "sft_loss": 0.5236082673072815, "step": 1264 }, { "epoch": 1.829356471438901, "grad_norm": 2.9820042330346967, "learning_rate": 6.4720440507223314e-06, "logits/chosen": 0.14499638974666595, "logits/rejected": 0.05099831521511078, "logps/chosen": -0.5267215967178345, "logps/rejected": -3.3054587841033936, "loss": 0.5866, "odds_ratio_loss": 0.247196227312088, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05267215520143509, "rewards/margins": 0.2778737545013428, "rewards/rejected": -0.33054590225219727, "sft_loss": 0.5267215967178345, "step": 1265 }, { "epoch": 1.8308026030368763, "grad_norm": 2.3343928851027855, "learning_rate": 6.469601561685512e-06, "logits/chosen": 0.1534407138824463, "logits/rejected": 0.02046523243188858, "logps/chosen": -0.5311034917831421, "logps/rejected": -2.802785873413086, "loss": 0.614, "odds_ratio_loss": 0.2740580141544342, "rewards/accuracies": 0.875, "rewards/chosen": -0.05311034992337227, "rewards/margins": 0.2271682620048523, "rewards/rejected": -0.28027862310409546, "sft_loss": 0.5311034917831421, "step": 1266 }, { "epoch": 1.8322487346348517, "grad_norm": 2.2828013182128624, "learning_rate": 6.467157583716712e-06, "logits/chosen": 0.0914180725812912, "logits/rejected": 0.07987754046916962, "logps/chosen": -0.693658709526062, "logps/rejected": -1.6100306510925293, "loss": 0.6032, "odds_ratio_loss": 0.3653374910354614, "rewards/accuracies": 0.875, "rewards/chosen": -0.06936587393283844, "rewards/margins": 0.09163719415664673, "rewards/rejected": -0.16100306808948517, "sft_loss": 0.693658709526062, "step": 1267 }, { "epoch": 1.8336948662328272, "grad_norm": 2.7682382563234533, "learning_rate": 6.464712118289418e-06, "logits/chosen": 0.09248815476894379, "logits/rejected": 0.09033681452274323, "logps/chosen": -0.6010981798171997, "logps/rejected": -1.8052887916564941, "loss": 0.5682, "odds_ratio_loss": 0.3576500415802002, "rewards/accuracies": 0.875, "rewards/chosen": -0.06010981649160385, "rewards/margins": 0.12041905522346497, "rewards/rejected": -0.18052886426448822, "sft_loss": 0.6010981798171997, "step": 1268 }, { "epoch": 1.8351409978308026, "grad_norm": 2.191095438710336, "learning_rate": 6.462265166878006e-06, "logits/chosen": 0.1091507226228714, "logits/rejected": 0.11014742404222488, "logps/chosen": -0.547124981880188, "logps/rejected": -2.7218005657196045, "loss": 0.5802, "odds_ratio_loss": 0.2503480017185211, "rewards/accuracies": 0.9375, "rewards/chosen": -0.054712504148483276, "rewards/margins": 0.2174675613641739, "rewards/rejected": -0.27218008041381836, "sft_loss": 0.547124981880188, "step": 1269 }, { "epoch": 1.836587129428778, "grad_norm": 2.3132104589718154, "learning_rate": 6.459816730957756e-06, "logits/chosen": 0.11147591471672058, "logits/rejected": 0.04091275855898857, "logps/chosen": -0.6386622190475464, "logps/rejected": -1.4583815336227417, "loss": 0.5806, "odds_ratio_loss": 0.48449695110321045, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06386622041463852, "rewards/margins": 0.08197192847728729, "rewards/rejected": -0.1458381563425064, "sft_loss": 0.6386622190475464, "step": 1270 }, { "epoch": 1.8380332610267534, "grad_norm": 2.6157790210799816, "learning_rate": 6.457366812004837e-06, "logits/chosen": 0.18917174637317657, "logits/rejected": 0.10391932725906372, "logps/chosen": -0.4687146246433258, "logps/rejected": -3.772552728652954, "loss": 0.6262, "odds_ratio_loss": 0.1897575557231903, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04687146842479706, "rewards/margins": 0.3303838074207306, "rewards/rejected": -0.37725526094436646, "sft_loss": 0.4687146246433258, "step": 1271 }, { "epoch": 1.8394793926247288, "grad_norm": 3.0406032129446814, "learning_rate": 6.4549154114963155e-06, "logits/chosen": -0.025457292795181274, "logits/rejected": 0.042470939457416534, "logps/chosen": -0.5330725312232971, "logps/rejected": -2.3417413234710693, "loss": 0.5894, "odds_ratio_loss": 0.34376680850982666, "rewards/accuracies": 0.8125, "rewards/chosen": -0.05330725386738777, "rewards/margins": 0.18086686730384827, "rewards/rejected": -0.23417413234710693, "sft_loss": 0.5330725312232971, "step": 1272 }, { "epoch": 1.8409255242227043, "grad_norm": 2.4045595864109015, "learning_rate": 6.452462530910148e-06, "logits/chosen": 0.23581625521183014, "logits/rejected": 0.13572809100151062, "logps/chosen": -0.47560757398605347, "logps/rejected": -3.1639788150787354, "loss": 0.6038, "odds_ratio_loss": 0.24863043427467346, "rewards/accuracies": 1.0, "rewards/chosen": -0.047560758888721466, "rewards/margins": 0.2688371539115906, "rewards/rejected": -0.31639787554740906, "sft_loss": 0.47560757398605347, "step": 1273 }, { "epoch": 1.8423716558206795, "grad_norm": 2.593914431539175, "learning_rate": 6.4500081717251874e-06, "logits/chosen": 0.19233283400535583, "logits/rejected": 0.12806811928749084, "logps/chosen": -0.5220276117324829, "logps/rejected": -2.091871976852417, "loss": 0.6913, "odds_ratio_loss": 0.29791516065597534, "rewards/accuracies": 0.875, "rewards/chosen": -0.05220276862382889, "rewards/margins": 0.15698441863059998, "rewards/rejected": -0.20918720960617065, "sft_loss": 0.5220276117324829, "step": 1274 }, { "epoch": 1.8438177874186552, "grad_norm": 2.2558787495791526, "learning_rate": 6.447552335421175e-06, "logits/chosen": 0.15237554907798767, "logits/rejected": 0.09454990923404694, "logps/chosen": -0.46872109174728394, "logps/rejected": -3.8205952644348145, "loss": 0.5324, "odds_ratio_loss": 0.2774191200733185, "rewards/accuracies": 0.875, "rewards/chosen": -0.04687211290001869, "rewards/margins": 0.3351873755455017, "rewards/rejected": -0.3820595145225525, "sft_loss": 0.46872109174728394, "step": 1275 }, { "epoch": 1.8452639190166304, "grad_norm": 2.803061466485772, "learning_rate": 6.4450950234787445e-06, "logits/chosen": 0.18218779563903809, "logits/rejected": 0.1619456708431244, "logps/chosen": -0.4943692684173584, "logps/rejected": -2.0796353816986084, "loss": 0.583, "odds_ratio_loss": 0.33534127473831177, "rewards/accuracies": 0.875, "rewards/chosen": -0.04943692684173584, "rewards/margins": 0.15852662920951843, "rewards/rejected": -0.20796355605125427, "sft_loss": 0.4943692684173584, "step": 1276 }, { "epoch": 1.846710050614606, "grad_norm": 2.877597865288587, "learning_rate": 6.442636237379417e-06, "logits/chosen": 0.16758492588996887, "logits/rejected": 0.027434751391410828, "logps/chosen": -0.5394682884216309, "logps/rejected": -3.1126046180725098, "loss": 0.5703, "odds_ratio_loss": 0.30693167448043823, "rewards/accuracies": 0.875, "rewards/chosen": -0.053946830332279205, "rewards/margins": 0.25731363892555237, "rewards/rejected": -0.311260461807251, "sft_loss": 0.5394682884216309, "step": 1277 }, { "epoch": 1.8481561822125814, "grad_norm": 2.952969926759398, "learning_rate": 6.440175978605605e-06, "logits/chosen": 0.1769622266292572, "logits/rejected": 0.18280473351478577, "logps/chosen": -0.46669626235961914, "logps/rejected": -2.5780460834503174, "loss": 0.5815, "odds_ratio_loss": 0.2586829960346222, "rewards/accuracies": 0.875, "rewards/chosen": -0.046669624745845795, "rewards/margins": 0.21113497018814087, "rewards/rejected": -0.25780460238456726, "sft_loss": 0.46669626235961914, "step": 1278 }, { "epoch": 1.8496023138105566, "grad_norm": 2.457451794775069, "learning_rate": 6.437714248640608e-06, "logits/chosen": 0.19974656403064728, "logits/rejected": 0.11863253265619278, "logps/chosen": -0.6293502449989319, "logps/rejected": -2.235076427459717, "loss": 0.542, "odds_ratio_loss": 0.3751421570777893, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06293503195047379, "rewards/margins": 0.16057263314723969, "rewards/rejected": -0.22350764274597168, "sft_loss": 0.6293502449989319, "step": 1279 }, { "epoch": 1.8510484454085323, "grad_norm": 2.59547059999874, "learning_rate": 6.435251048968611e-06, "logits/chosen": 0.07283070683479309, "logits/rejected": 0.07713057100772858, "logps/chosen": -0.5544098019599915, "logps/rejected": -2.1915676593780518, "loss": 0.6361, "odds_ratio_loss": 0.24329525232315063, "rewards/accuracies": 0.9375, "rewards/chosen": -0.055440980941057205, "rewards/margins": 0.16371577978134155, "rewards/rejected": -0.21915677189826965, "sft_loss": 0.5544098019599915, "step": 1280 }, { "epoch": 1.8524945770065075, "grad_norm": 3.4915957074434916, "learning_rate": 6.432786381074686e-06, "logits/chosen": 0.17533960938453674, "logits/rejected": 0.16550162434577942, "logps/chosen": -0.541409432888031, "logps/rejected": -1.7533543109893799, "loss": 0.5982, "odds_ratio_loss": 0.4174317419528961, "rewards/accuracies": 0.75, "rewards/chosen": -0.05414094403386116, "rewards/margins": 0.12119448930025101, "rewards/rejected": -0.17533543705940247, "sft_loss": 0.541409432888031, "step": 1281 }, { "epoch": 1.853940708604483, "grad_norm": 2.211714065977566, "learning_rate": 6.430320246444793e-06, "logits/chosen": 0.25783729553222656, "logits/rejected": 0.13165737688541412, "logps/chosen": -0.3393861651420593, "logps/rejected": -3.9043896198272705, "loss": 0.5108, "odds_ratio_loss": 0.2384585589170456, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03393861651420593, "rewards/margins": 0.35650038719177246, "rewards/rejected": -0.390438973903656, "sft_loss": 0.3393861651420593, "step": 1282 }, { "epoch": 1.8553868402024585, "grad_norm": 2.241084791507465, "learning_rate": 6.427852646565771e-06, "logits/chosen": 0.27508994936943054, "logits/rejected": 0.23502448201179504, "logps/chosen": -0.541483461856842, "logps/rejected": -2.7447710037231445, "loss": 0.5657, "odds_ratio_loss": 0.3992454707622528, "rewards/accuracies": 0.8125, "rewards/chosen": -0.054148346185684204, "rewards/margins": 0.22032874822616577, "rewards/rejected": -0.27447709441185, "sft_loss": 0.541483461856842, "step": 1283 }, { "epoch": 1.8568329718004337, "grad_norm": 2.6425814567398365, "learning_rate": 6.425383582925345e-06, "logits/chosen": 0.1416487693786621, "logits/rejected": 0.12380547821521759, "logps/chosen": -0.5558061599731445, "logps/rejected": -1.249847412109375, "loss": 0.6625, "odds_ratio_loss": 0.3178814649581909, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05558061599731445, "rewards/margins": 0.06940412521362305, "rewards/rejected": -0.1249847412109375, "sft_loss": 0.5558061599731445, "step": 1284 }, { "epoch": 1.8582791033984094, "grad_norm": 3.655460612608677, "learning_rate": 6.4229130570121255e-06, "logits/chosen": 0.13592982292175293, "logits/rejected": 0.1365361362695694, "logps/chosen": -0.6427457928657532, "logps/rejected": -2.124833822250366, "loss": 0.6917, "odds_ratio_loss": 0.3293951749801636, "rewards/accuracies": 0.9375, "rewards/chosen": -0.06427457928657532, "rewards/margins": 0.1482088267803192, "rewards/rejected": -0.21248340606689453, "sft_loss": 0.6427457928657532, "step": 1285 }, { "epoch": 1.8597252349963846, "grad_norm": 2.4326228387465503, "learning_rate": 6.420441070315599e-06, "logits/chosen": 0.2087874710559845, "logits/rejected": 0.11434569954872131, "logps/chosen": -0.5145087838172913, "logps/rejected": -2.841360092163086, "loss": 0.6276, "odds_ratio_loss": 0.31676119565963745, "rewards/accuracies": 0.8125, "rewards/chosen": -0.051450878381729126, "rewards/margins": 0.23268508911132812, "rewards/rejected": -0.28413599729537964, "sft_loss": 0.5145087838172913, "step": 1286 }, { "epoch": 1.86117136659436, "grad_norm": 2.443729267777704, "learning_rate": 6.417967624326136e-06, "logits/chosen": -0.01459294743835926, "logits/rejected": -0.00034431740641593933, "logps/chosen": -0.7525098323822021, "logps/rejected": -2.1756627559661865, "loss": 0.6109, "odds_ratio_loss": 0.3534230887889862, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07525098323822021, "rewards/margins": 0.14231529831886292, "rewards/rejected": -0.21756626665592194, "sft_loss": 0.7525098323822021, "step": 1287 }, { "epoch": 1.8626174981923356, "grad_norm": 2.7232168710965805, "learning_rate": 6.415492720534988e-06, "logits/chosen": 0.11816499382257462, "logits/rejected": 0.13505424559116364, "logps/chosen": -0.7191867232322693, "logps/rejected": -2.004683494567871, "loss": 0.6509, "odds_ratio_loss": 0.3239571750164032, "rewards/accuracies": 1.0, "rewards/chosen": -0.07191868126392365, "rewards/margins": 0.12854966521263123, "rewards/rejected": -0.20046836137771606, "sft_loss": 0.7191867232322693, "step": 1288 }, { "epoch": 1.8640636297903108, "grad_norm": 2.3846639533778924, "learning_rate": 6.413016360434282e-06, "logits/chosen": 0.17629383504390717, "logits/rejected": 0.07719264179468155, "logps/chosen": -0.6611538529396057, "logps/rejected": -1.976793646812439, "loss": 0.5742, "odds_ratio_loss": 0.4173961281776428, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0661153793334961, "rewards/margins": 0.1315639764070511, "rewards/rejected": -0.19767937064170837, "sft_loss": 0.6611538529396057, "step": 1289 }, { "epoch": 1.8655097613882863, "grad_norm": 2.540831158377396, "learning_rate": 6.410538545517026e-06, "logits/chosen": 0.22472399473190308, "logits/rejected": 0.1605992168188095, "logps/chosen": -0.4334484934806824, "logps/rejected": -3.1653952598571777, "loss": 0.6147, "odds_ratio_loss": 0.2583789527416229, "rewards/accuracies": 0.875, "rewards/chosen": -0.04334484785795212, "rewards/margins": 0.27319473028182983, "rewards/rejected": -0.31653958559036255, "sft_loss": 0.4334484934806824, "step": 1290 }, { "epoch": 1.8669558929862617, "grad_norm": 3.0085220127490437, "learning_rate": 6.408059277277102e-06, "logits/chosen": 0.05498197674751282, "logits/rejected": 0.14584404230117798, "logps/chosen": -0.7110510468482971, "logps/rejected": -1.6284350156784058, "loss": 0.676, "odds_ratio_loss": 0.3031470477581024, "rewards/accuracies": 1.0, "rewards/chosen": -0.07110510021448135, "rewards/margins": 0.09173840284347534, "rewards/rejected": -0.1628435105085373, "sft_loss": 0.7110510468482971, "step": 1291 }, { "epoch": 1.8684020245842372, "grad_norm": 2.4136664476356846, "learning_rate": 6.4055785572092715e-06, "logits/chosen": 0.14040209352970123, "logits/rejected": 0.006925854831933975, "logps/chosen": -0.5283491015434265, "logps/rejected": -3.145298480987549, "loss": 0.6115, "odds_ratio_loss": 0.244784876704216, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05283490940928459, "rewards/margins": 0.26169493794441223, "rewards/rejected": -0.3145298361778259, "sft_loss": 0.5283491015434265, "step": 1292 }, { "epoch": 1.8698481561822127, "grad_norm": 2.153459001550623, "learning_rate": 6.40309638680917e-06, "logits/chosen": 0.07764284312725067, "logits/rejected": 0.1401481181383133, "logps/chosen": -0.47514212131500244, "logps/rejected": -1.8801544904708862, "loss": 0.5984, "odds_ratio_loss": 0.3149198889732361, "rewards/accuracies": 0.8125, "rewards/chosen": -0.047514207661151886, "rewards/margins": 0.1405012607574463, "rewards/rejected": -0.18801546096801758, "sft_loss": 0.47514212131500244, "step": 1293 }, { "epoch": 1.871294287780188, "grad_norm": 3.0673694959311733, "learning_rate": 6.400612767573306e-06, "logits/chosen": 0.10028165578842163, "logits/rejected": 0.045115672051906586, "logps/chosen": -0.6269665360450745, "logps/rejected": -3.0080552101135254, "loss": 0.6535, "odds_ratio_loss": 0.3017050623893738, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06269665062427521, "rewards/margins": 0.23810890316963196, "rewards/rejected": -0.30080553889274597, "sft_loss": 0.6269665360450745, "step": 1294 }, { "epoch": 1.8727404193781634, "grad_norm": 2.6138455099005022, "learning_rate": 6.398127700999064e-06, "logits/chosen": 0.04801095277070999, "logits/rejected": 0.027741190046072006, "logps/chosen": -0.5394948124885559, "logps/rejected": -2.7543153762817383, "loss": 0.5348, "odds_ratio_loss": 0.3202323913574219, "rewards/accuracies": 0.8125, "rewards/chosen": -0.05394947901368141, "rewards/margins": 0.2214820683002472, "rewards/rejected": -0.2754315733909607, "sft_loss": 0.5394948124885559, "step": 1295 }, { "epoch": 1.8741865509761388, "grad_norm": 2.6384210048459638, "learning_rate": 6.395641188584699e-06, "logits/chosen": 0.05813899263739586, "logits/rejected": 0.067634716629982, "logps/chosen": -0.47403401136398315, "logps/rejected": -2.7208306789398193, "loss": 0.6532, "odds_ratio_loss": 0.35417869687080383, "rewards/accuracies": 0.875, "rewards/chosen": -0.047403402626514435, "rewards/margins": 0.22467969357967377, "rewards/rejected": -0.2720831036567688, "sft_loss": 0.47403401136398315, "step": 1296 }, { "epoch": 1.875632682574114, "grad_norm": 2.443336263784716, "learning_rate": 6.393153231829341e-06, "logits/chosen": 0.1275801658630371, "logits/rejected": 0.07651050388813019, "logps/chosen": -0.5848337411880493, "logps/rejected": -3.3461596965789795, "loss": 0.5843, "odds_ratio_loss": 0.3497033417224884, "rewards/accuracies": 0.8125, "rewards/chosen": -0.05848337337374687, "rewards/margins": 0.27613261342048645, "rewards/rejected": -0.3346160054206848, "sft_loss": 0.5848337411880493, "step": 1297 }, { "epoch": 1.8770788141720898, "grad_norm": 3.6724827423590862, "learning_rate": 6.390663832232985e-06, "logits/chosen": 0.16066929697990417, "logits/rejected": 0.13951550424098969, "logps/chosen": -0.6801087856292725, "logps/rejected": -2.111433744430542, "loss": 0.6903, "odds_ratio_loss": 0.39818501472473145, "rewards/accuracies": 0.875, "rewards/chosen": -0.06801088154315948, "rewards/margins": 0.14313249289989471, "rewards/rejected": -0.2111433744430542, "sft_loss": 0.6801087856292725, "step": 1298 }, { "epoch": 1.878524945770065, "grad_norm": 2.4997279980859366, "learning_rate": 6.3881729912965006e-06, "logits/chosen": 0.18264645338058472, "logits/rejected": 0.10208077728748322, "logps/chosen": -0.5250051021575928, "logps/rejected": -3.1066970825195312, "loss": 0.5647, "odds_ratio_loss": 0.3017498254776001, "rewards/accuracies": 0.875, "rewards/chosen": -0.052500516176223755, "rewards/margins": 0.25816917419433594, "rewards/rejected": -0.3106696903705597, "sft_loss": 0.5250051021575928, "step": 1299 }, { "epoch": 1.8799710773680405, "grad_norm": 3.1095920261150978, "learning_rate": 6.385680710521624e-06, "logits/chosen": 0.1434398889541626, "logits/rejected": 0.07962372153997421, "logps/chosen": -0.5778891444206238, "logps/rejected": -3.653951406478882, "loss": 0.5938, "odds_ratio_loss": 0.24841637909412384, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0577889159321785, "rewards/margins": 0.30760622024536133, "rewards/rejected": -0.36539512872695923, "sft_loss": 0.5778891444206238, "step": 1300 }, { "epoch": 1.881417208966016, "grad_norm": 2.381125913980573, "learning_rate": 6.383186991410964e-06, "logits/chosen": 0.1245647668838501, "logits/rejected": 0.010384336113929749, "logps/chosen": -0.7906035780906677, "logps/rejected": -1.5489108562469482, "loss": 0.6896, "odds_ratio_loss": 0.4644030034542084, "rewards/accuracies": 0.75, "rewards/chosen": -0.07906036078929901, "rewards/margins": 0.07583072781562805, "rewards/rejected": -0.15489107370376587, "sft_loss": 0.7906035780906677, "step": 1301 }, { "epoch": 1.8828633405639912, "grad_norm": 2.2563598663691558, "learning_rate": 6.38069183546799e-06, "logits/chosen": 0.20938679575920105, "logits/rejected": 0.15651705861091614, "logps/chosen": -0.7142879366874695, "logps/rejected": -3.187528371810913, "loss": 0.5619, "odds_ratio_loss": 0.44818058609962463, "rewards/accuracies": 0.75, "rewards/chosen": -0.07142879068851471, "rewards/margins": 0.24732404947280884, "rewards/rejected": -0.31875282526016235, "sft_loss": 0.7142879366874695, "step": 1302 }, { "epoch": 1.8843094721619669, "grad_norm": 2.9268345750498637, "learning_rate": 6.378195244197042e-06, "logits/chosen": 0.08001869916915894, "logits/rejected": 0.06469704955816269, "logps/chosen": -0.6338894367218018, "logps/rejected": -3.332848072052002, "loss": 0.6408, "odds_ratio_loss": 0.320722758769989, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06338894367218018, "rewards/margins": 0.26989588141441345, "rewards/rejected": -0.33328482508659363, "sft_loss": 0.6338894367218018, "step": 1303 }, { "epoch": 1.8857556037599421, "grad_norm": 2.980340525854677, "learning_rate": 6.3756972191033244e-06, "logits/chosen": 0.002490525133907795, "logits/rejected": 0.008151497691869736, "logps/chosen": -0.6807718873023987, "logps/rejected": -2.1325416564941406, "loss": 0.608, "odds_ratio_loss": 0.3046268820762634, "rewards/accuracies": 0.875, "rewards/chosen": -0.0680771917104721, "rewards/margins": 0.1451769769191742, "rewards/rejected": -0.2132541537284851, "sft_loss": 0.6807718873023987, "step": 1304 }, { "epoch": 1.8872017353579176, "grad_norm": 2.3608345245842104, "learning_rate": 6.373197761692905e-06, "logits/chosen": 0.13456328213214874, "logits/rejected": 0.11488880962133408, "logps/chosen": -0.5167301893234253, "logps/rejected": -1.9352409839630127, "loss": 0.5643, "odds_ratio_loss": 0.4348183870315552, "rewards/accuracies": 0.75, "rewards/chosen": -0.05167302116751671, "rewards/margins": 0.14185106754302979, "rewards/rejected": -0.1935240924358368, "sft_loss": 0.5167301893234253, "step": 1305 }, { "epoch": 1.888647866955893, "grad_norm": 2.7186771660256435, "learning_rate": 6.370696873472715e-06, "logits/chosen": 0.25382813811302185, "logits/rejected": 0.13196587562561035, "logps/chosen": -0.5394778251647949, "logps/rejected": -2.625441789627075, "loss": 0.5759, "odds_ratio_loss": 0.37160181999206543, "rewards/accuracies": 0.75, "rewards/chosen": -0.05394778400659561, "rewards/margins": 0.2085963934659958, "rewards/rejected": -0.262544184923172, "sft_loss": 0.5394778251647949, "step": 1306 }, { "epoch": 1.8900939985538683, "grad_norm": 2.425429249657979, "learning_rate": 6.368194555950552e-06, "logits/chosen": 0.12498855590820312, "logits/rejected": 0.08333387970924377, "logps/chosen": -0.6716573238372803, "logps/rejected": -2.1420390605926514, "loss": 0.6824, "odds_ratio_loss": 0.3968257009983063, "rewards/accuracies": 0.75, "rewards/chosen": -0.06716573238372803, "rewards/margins": 0.14703817665576935, "rewards/rejected": -0.21420392394065857, "sft_loss": 0.6716573238372803, "step": 1307 }, { "epoch": 1.891540130151844, "grad_norm": 2.8376022318759104, "learning_rate": 6.365690810635072e-06, "logits/chosen": 0.18870142102241516, "logits/rejected": -0.01581069827079773, "logps/chosen": -0.5011342763900757, "logps/rejected": -3.2665085792541504, "loss": 0.6059, "odds_ratio_loss": 0.31004223227500916, "rewards/accuracies": 0.8125, "rewards/chosen": -0.05011342838406563, "rewards/margins": 0.2765374481678009, "rewards/rejected": -0.3266508877277374, "sft_loss": 0.5011342763900757, "step": 1308 }, { "epoch": 1.8929862617498192, "grad_norm": 2.551693877142118, "learning_rate": 6.363185639035791e-06, "logits/chosen": 0.2654971778392792, "logits/rejected": 0.09708055108785629, "logps/chosen": -0.5613073110580444, "logps/rejected": -2.9234206676483154, "loss": 0.5443, "odds_ratio_loss": 0.3154491186141968, "rewards/accuracies": 0.875, "rewards/chosen": -0.05613073334097862, "rewards/margins": 0.23621134459972382, "rewards/rejected": -0.29234206676483154, "sft_loss": 0.5613073110580444, "step": 1309 }, { "epoch": 1.8944323933477947, "grad_norm": 2.7390713645360782, "learning_rate": 6.360679042663085e-06, "logits/chosen": 0.16496847569942474, "logits/rejected": 0.13807308673858643, "logps/chosen": -0.594467043876648, "logps/rejected": -2.4058032035827637, "loss": 0.6005, "odds_ratio_loss": 0.3193429708480835, "rewards/accuracies": 0.875, "rewards/chosen": -0.059446703642606735, "rewards/margins": 0.18113362789154053, "rewards/rejected": -0.24058032035827637, "sft_loss": 0.594467043876648, "step": 1310 }, { "epoch": 1.8958785249457701, "grad_norm": 2.1716752579277094, "learning_rate": 6.3581710230281935e-06, "logits/chosen": 0.05845704302191734, "logits/rejected": 0.05051730200648308, "logps/chosen": -0.7212303876876831, "logps/rejected": -2.8155322074890137, "loss": 0.6325, "odds_ratio_loss": 0.3787981867790222, "rewards/accuracies": 0.75, "rewards/chosen": -0.07212303578853607, "rewards/margins": 0.20943017303943634, "rewards/rejected": -0.2815532088279724, "sft_loss": 0.7212303876876831, "step": 1311 }, { "epoch": 1.8973246565437454, "grad_norm": 3.4651701286123586, "learning_rate": 6.355661581643209e-06, "logits/chosen": 0.16038264334201813, "logits/rejected": 0.09201866388320923, "logps/chosen": -0.6783140897750854, "logps/rejected": -2.29701566696167, "loss": 0.7313, "odds_ratio_loss": 0.44825479388237, "rewards/accuracies": 0.625, "rewards/chosen": -0.06783141195774078, "rewards/margins": 0.16187018156051636, "rewards/rejected": -0.22970159351825714, "sft_loss": 0.6783140897750854, "step": 1312 }, { "epoch": 1.8987707881417208, "grad_norm": 2.8433098050638557, "learning_rate": 6.353150720021084e-06, "logits/chosen": 0.1522006392478943, "logits/rejected": 0.09702038764953613, "logps/chosen": -0.4841322898864746, "logps/rejected": -3.95847225189209, "loss": 0.5584, "odds_ratio_loss": 0.20565840601921082, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0484132282435894, "rewards/margins": 0.34743404388427734, "rewards/rejected": -0.39584726095199585, "sft_loss": 0.4841322898864746, "step": 1313 }, { "epoch": 1.9002169197396963, "grad_norm": 2.74621567318202, "learning_rate": 6.350638439675626e-06, "logits/chosen": -0.0204104445874691, "logits/rejected": 0.0010041743516921997, "logps/chosen": -0.7974636554718018, "logps/rejected": -1.3563494682312012, "loss": 0.6811, "odds_ratio_loss": 0.5164413452148438, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07974636554718018, "rewards/margins": 0.055888570845127106, "rewards/rejected": -0.13563494384288788, "sft_loss": 0.7974636554718018, "step": 1314 }, { "epoch": 1.9016630513376718, "grad_norm": 3.2230719639122585, "learning_rate": 6.348124742121497e-06, "logits/chosen": 0.19585879147052765, "logits/rejected": 0.12719781696796417, "logps/chosen": -0.47810548543930054, "logps/rejected": -3.516087055206299, "loss": 0.5758, "odds_ratio_loss": 0.18576735258102417, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04781055450439453, "rewards/margins": 0.3037981390953064, "rewards/rejected": -0.3516086935997009, "sft_loss": 0.47810548543930054, "step": 1315 }, { "epoch": 1.9031091829356472, "grad_norm": 2.3405960760811966, "learning_rate": 6.345609628874216e-06, "logits/chosen": 0.07255364954471588, "logits/rejected": 0.1415763944387436, "logps/chosen": -0.5264100432395935, "logps/rejected": -2.2038300037384033, "loss": 0.6285, "odds_ratio_loss": 0.3210826516151428, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05264100432395935, "rewards/margins": 0.16774199903011322, "rewards/rejected": -0.22038300335407257, "sft_loss": 0.5264100432395935, "step": 1316 }, { "epoch": 1.9045553145336225, "grad_norm": 4.048582332028306, "learning_rate": 6.3430931014501546e-06, "logits/chosen": 0.1995735615491867, "logits/rejected": 0.09358995407819748, "logps/chosen": -0.6856232285499573, "logps/rejected": -1.2745683193206787, "loss": 0.5704, "odds_ratio_loss": 0.48545917868614197, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0685623288154602, "rewards/margins": 0.058894507586956024, "rewards/rejected": -0.12745684385299683, "sft_loss": 0.6856232285499573, "step": 1317 }, { "epoch": 1.906001446131598, "grad_norm": 2.1869595703546247, "learning_rate": 6.340575161366536e-06, "logits/chosen": 0.19228671491146088, "logits/rejected": 0.07713081687688828, "logps/chosen": -0.6164018511772156, "logps/rejected": -2.873373508453369, "loss": 0.6288, "odds_ratio_loss": 0.27122145891189575, "rewards/accuracies": 0.875, "rewards/chosen": -0.061640188097953796, "rewards/margins": 0.22569714486598969, "rewards/rejected": -0.28733736276626587, "sft_loss": 0.6164018511772156, "step": 1318 }, { "epoch": 1.9074475777295734, "grad_norm": 2.5534553089237977, "learning_rate": 6.338055810141433e-06, "logits/chosen": 0.10392449051141739, "logits/rejected": 0.09642945230007172, "logps/chosen": -0.5694040656089783, "logps/rejected": -3.596972942352295, "loss": 0.6739, "odds_ratio_loss": 0.22980889678001404, "rewards/accuracies": 1.0, "rewards/chosen": -0.05694040656089783, "rewards/margins": 0.3027569055557251, "rewards/rejected": -0.35969728231430054, "sft_loss": 0.5694040656089783, "step": 1319 }, { "epoch": 1.9088937093275486, "grad_norm": 2.8970511977043274, "learning_rate": 6.335535049293776e-06, "logits/chosen": 0.18930402398109436, "logits/rejected": 0.11592543870210648, "logps/chosen": -0.5666981935501099, "logps/rejected": -2.564934253692627, "loss": 0.5861, "odds_ratio_loss": 0.340043842792511, "rewards/accuracies": 0.875, "rewards/chosen": -0.05666981637477875, "rewards/margins": 0.19982360303401947, "rewards/rejected": -0.2564934194087982, "sft_loss": 0.5666981935501099, "step": 1320 }, { "epoch": 1.9103398409255243, "grad_norm": 3.1968079824765163, "learning_rate": 6.333012880343339e-06, "logits/chosen": 0.13316094875335693, "logits/rejected": 0.09439437091350555, "logps/chosen": -0.5278730392456055, "logps/rejected": -1.6159777641296387, "loss": 0.5628, "odds_ratio_loss": 0.32977184653282166, "rewards/accuracies": 0.875, "rewards/chosen": -0.05278730392456055, "rewards/margins": 0.10881047695875168, "rewards/rejected": -0.16159778833389282, "sft_loss": 0.5278730392456055, "step": 1321 }, { "epoch": 1.9117859725234996, "grad_norm": 2.250481769511735, "learning_rate": 6.330489304810747e-06, "logits/chosen": 0.18630638718605042, "logits/rejected": 0.07742702960968018, "logps/chosen": -0.6264196634292603, "logps/rejected": -2.363523244857788, "loss": 0.6726, "odds_ratio_loss": 0.37035539746284485, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06264197081327438, "rewards/margins": 0.17371037602424622, "rewards/rejected": -0.23635233938694, "sft_loss": 0.6264196634292603, "step": 1322 }, { "epoch": 1.913232104121475, "grad_norm": 2.3131159606518175, "learning_rate": 6.327964324217474e-06, "logits/chosen": 0.13363704085350037, "logits/rejected": 0.026563134044408798, "logps/chosen": -0.5144316554069519, "logps/rejected": -2.5232536792755127, "loss": 0.5649, "odds_ratio_loss": 0.3306152820587158, "rewards/accuracies": 0.875, "rewards/chosen": -0.05144317075610161, "rewards/margins": 0.200882226228714, "rewards/rejected": -0.2523253858089447, "sft_loss": 0.5144316554069519, "step": 1323 }, { "epoch": 1.9146782357194505, "grad_norm": 2.839594699946253, "learning_rate": 6.325437940085839e-06, "logits/chosen": 0.10742896050214767, "logits/rejected": 0.08482453227043152, "logps/chosen": -0.4993060231208801, "logps/rejected": -3.533346176147461, "loss": 0.5296, "odds_ratio_loss": 0.31563615798950195, "rewards/accuracies": 0.875, "rewards/chosen": -0.04993060231208801, "rewards/margins": 0.3034040331840515, "rewards/rejected": -0.35333460569381714, "sft_loss": 0.4993060231208801, "step": 1324 }, { "epoch": 1.9161243673174257, "grad_norm": 2.500177382400634, "learning_rate": 6.32291015393901e-06, "logits/chosen": 0.1152525320649147, "logits/rejected": 0.0816318467259407, "logps/chosen": -0.48759815096855164, "logps/rejected": -2.2603230476379395, "loss": 0.5793, "odds_ratio_loss": 0.20722666382789612, "rewards/accuracies": 1.0, "rewards/chosen": -0.0487598180770874, "rewards/margins": 0.1772724837064743, "rewards/rejected": -0.2260323166847229, "sft_loss": 0.48759815096855164, "step": 1325 }, { "epoch": 1.9175704989154014, "grad_norm": 2.533564293757888, "learning_rate": 6.320380967300996e-06, "logits/chosen": 0.19266647100448608, "logits/rejected": 0.022887878119945526, "logps/chosen": -0.5034148097038269, "logps/rejected": -3.819204092025757, "loss": 0.5919, "odds_ratio_loss": 0.2226625382900238, "rewards/accuracies": 1.0, "rewards/chosen": -0.05034147948026657, "rewards/margins": 0.33157891035079956, "rewards/rejected": -0.38192039728164673, "sft_loss": 0.5034148097038269, "step": 1326 }, { "epoch": 1.9190166305133767, "grad_norm": 2.921269282131583, "learning_rate": 6.317850381696657e-06, "logits/chosen": 0.2217884510755539, "logits/rejected": 0.01924968883395195, "logps/chosen": -0.4081079363822937, "logps/rejected": -4.638675689697266, "loss": 0.5773, "odds_ratio_loss": 0.2604500651359558, "rewards/accuracies": 0.8125, "rewards/chosen": -0.04081079363822937, "rewards/margins": 0.4230567514896393, "rewards/rejected": -0.46386754512786865, "sft_loss": 0.4081079363822937, "step": 1327 }, { "epoch": 1.9204627621113521, "grad_norm": 2.3841994801404214, "learning_rate": 6.31531839865169e-06, "logits/chosen": 0.08403509855270386, "logits/rejected": 0.02859571948647499, "logps/chosen": -0.5892415642738342, "logps/rejected": -3.3557963371276855, "loss": 0.6229, "odds_ratio_loss": 0.41986650228500366, "rewards/accuracies": 0.8125, "rewards/chosen": -0.05892416089773178, "rewards/margins": 0.2766554653644562, "rewards/rejected": -0.33557960391044617, "sft_loss": 0.5892415642738342, "step": 1328 }, { "epoch": 1.9219088937093276, "grad_norm": 4.499602679189735, "learning_rate": 6.3127850196926365e-06, "logits/chosen": 0.02900572493672371, "logits/rejected": 0.0013004057109355927, "logps/chosen": -0.49205368757247925, "logps/rejected": -3.4965710639953613, "loss": 0.5968, "odds_ratio_loss": 0.22342023253440857, "rewards/accuracies": 0.9375, "rewards/chosen": -0.049205370247364044, "rewards/margins": 0.30045175552368164, "rewards/rejected": -0.3496571183204651, "sft_loss": 0.49205368757247925, "step": 1329 }, { "epoch": 1.9233550253073028, "grad_norm": 2.51797982968202, "learning_rate": 6.31025024634688e-06, "logits/chosen": 0.2063034176826477, "logits/rejected": 0.13551932573318481, "logps/chosen": -0.6059345006942749, "logps/rejected": -3.6007964611053467, "loss": 0.5624, "odds_ratio_loss": 0.19518573582172394, "rewards/accuracies": 1.0, "rewards/chosen": -0.06059345230460167, "rewards/margins": 0.2994861900806427, "rewards/rejected": -0.36007964611053467, "sft_loss": 0.6059345006942749, "step": 1330 }, { "epoch": 1.9248011569052785, "grad_norm": 3.447774341323875, "learning_rate": 6.307714080142648e-06, "logits/chosen": 0.11149915307760239, "logits/rejected": 0.19055677950382233, "logps/chosen": -0.4802173376083374, "logps/rejected": -3.084043025970459, "loss": 0.644, "odds_ratio_loss": 0.44687554240226746, "rewards/accuracies": 0.8125, "rewards/chosen": -0.04802173376083374, "rewards/margins": 0.26038259267807007, "rewards/rejected": -0.3084043264389038, "sft_loss": 0.4802173376083374, "step": 1331 }, { "epoch": 1.9262472885032538, "grad_norm": 3.9481918292240095, "learning_rate": 6.305176522609001e-06, "logits/chosen": 0.10856892168521881, "logits/rejected": 0.09776373207569122, "logps/chosen": -0.5553045272827148, "logps/rejected": -2.3343453407287598, "loss": 0.6648, "odds_ratio_loss": 0.25999295711517334, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05553045868873596, "rewards/margins": 0.17790409922599792, "rewards/rejected": -0.2334345579147339, "sft_loss": 0.5553045272827148, "step": 1332 }, { "epoch": 1.9276934201012292, "grad_norm": 2.152983324315511, "learning_rate": 6.302637575275842e-06, "logits/chosen": 0.14477403461933136, "logits/rejected": 0.12126276642084122, "logps/chosen": -0.6557214260101318, "logps/rejected": -1.5948143005371094, "loss": 0.6043, "odds_ratio_loss": 0.44841280579566956, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06557214260101318, "rewards/margins": 0.09390929341316223, "rewards/rejected": -0.15948143601417542, "sft_loss": 0.6557214260101318, "step": 1333 }, { "epoch": 1.9291395516992047, "grad_norm": 3.301517579732635, "learning_rate": 6.300097239673915e-06, "logits/chosen": 0.17113935947418213, "logits/rejected": 0.16045597195625305, "logps/chosen": -0.5597264766693115, "logps/rejected": -1.5276334285736084, "loss": 0.5554, "odds_ratio_loss": 0.41156938672065735, "rewards/accuracies": 0.75, "rewards/chosen": -0.055972643196582794, "rewards/margins": 0.09679071605205536, "rewards/rejected": -0.15276335179805756, "sft_loss": 0.5597264766693115, "step": 1334 }, { "epoch": 1.93058568329718, "grad_norm": 2.5923227720502537, "learning_rate": 6.297555517334794e-06, "logits/chosen": 0.04513344168663025, "logits/rejected": 0.0036378642544150352, "logps/chosen": -0.7068974375724792, "logps/rejected": -1.3616538047790527, "loss": 0.7021, "odds_ratio_loss": 0.46141964197158813, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07068974524736404, "rewards/margins": 0.06547562777996063, "rewards/rejected": -0.13616538047790527, "sft_loss": 0.7068974375724792, "step": 1335 }, { "epoch": 1.9320318148951554, "grad_norm": 2.5908665206389503, "learning_rate": 6.295012409790896e-06, "logits/chosen": 0.07112018764019012, "logits/rejected": 0.14503023028373718, "logps/chosen": -0.4185260236263275, "logps/rejected": -2.6089491844177246, "loss": 0.6039, "odds_ratio_loss": 0.3201354146003723, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04185260087251663, "rewards/margins": 0.2190423458814621, "rewards/rejected": -0.2608949542045593, "sft_loss": 0.4185260236263275, "step": 1336 }, { "epoch": 1.9334779464931309, "grad_norm": 2.5569688339171632, "learning_rate": 6.2924679185754684e-06, "logits/chosen": 0.17154423892498016, "logits/rejected": 0.0873059630393982, "logps/chosen": -0.6691811084747314, "logps/rejected": -1.198056936264038, "loss": 0.6591, "odds_ratio_loss": 0.3893122673034668, "rewards/accuracies": 0.875, "rewards/chosen": -0.06691811233758926, "rewards/margins": 0.05288759991526604, "rewards/rejected": -0.119805708527565, "sft_loss": 0.6691811084747314, "step": 1337 }, { "epoch": 1.9349240780911063, "grad_norm": 4.174361185121562, "learning_rate": 6.289922045222594e-06, "logits/chosen": 0.0211828351020813, "logits/rejected": 0.07563084363937378, "logps/chosen": -0.5859172940254211, "logps/rejected": -2.894932270050049, "loss": 0.5613, "odds_ratio_loss": 0.22862787544727325, "rewards/accuracies": 1.0, "rewards/chosen": -0.058591730892658234, "rewards/margins": 0.23090147972106934, "rewards/rejected": -0.28949323296546936, "sft_loss": 0.5859172940254211, "step": 1338 }, { "epoch": 1.9363702096890818, "grad_norm": 2.338603996374857, "learning_rate": 6.28737479126719e-06, "logits/chosen": 0.2083335667848587, "logits/rejected": 0.13322553038597107, "logps/chosen": -0.37943235039711, "logps/rejected": -3.214156150817871, "loss": 0.555, "odds_ratio_loss": 0.2722666561603546, "rewards/accuracies": 0.875, "rewards/chosen": -0.03794323652982712, "rewards/margins": 0.2834724187850952, "rewards/rejected": -0.32141566276550293, "sft_loss": 0.37943235039711, "step": 1339 }, { "epoch": 1.937816341287057, "grad_norm": 11.15462592256021, "learning_rate": 6.284826158245005e-06, "logits/chosen": 0.06666195392608643, "logits/rejected": 0.0736611858010292, "logps/chosen": -0.7771252989768982, "logps/rejected": -1.527148962020874, "loss": 0.6956, "odds_ratio_loss": 0.3552093803882599, "rewards/accuracies": 0.875, "rewards/chosen": -0.0777125358581543, "rewards/margins": 0.07500237226486206, "rewards/rejected": -0.15271490812301636, "sft_loss": 0.7771252989768982, "step": 1340 }, { "epoch": 1.9392624728850325, "grad_norm": 2.82099850918232, "learning_rate": 6.28227614769262e-06, "logits/chosen": 0.15024471282958984, "logits/rejected": 0.096395343542099, "logps/chosen": -0.5148159861564636, "logps/rejected": -2.127951145172119, "loss": 0.599, "odds_ratio_loss": 0.26387378573417664, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05148159712553024, "rewards/margins": 0.16131353378295898, "rewards/rejected": -0.21279510855674744, "sft_loss": 0.5148159861564636, "step": 1341 }, { "epoch": 1.940708604483008, "grad_norm": 2.379780310975218, "learning_rate": 6.279724761147445e-06, "logits/chosen": 0.09975308179855347, "logits/rejected": 0.05713462457060814, "logps/chosen": -0.3695414662361145, "logps/rejected": -3.6172609329223633, "loss": 0.5196, "odds_ratio_loss": 0.2230728268623352, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03695414215326309, "rewards/margins": 0.3247719407081604, "rewards/rejected": -0.3617260754108429, "sft_loss": 0.3695414662361145, "step": 1342 }, { "epoch": 1.9421547360809832, "grad_norm": 3.000388458997451, "learning_rate": 6.2771720001477216e-06, "logits/chosen": 0.16662685573101044, "logits/rejected": 0.060050975531339645, "logps/chosen": -0.6600314974784851, "logps/rejected": -4.327170372009277, "loss": 0.5381, "odds_ratio_loss": 0.2681175768375397, "rewards/accuracies": 0.875, "rewards/chosen": -0.06600315123796463, "rewards/margins": 0.36671388149261475, "rewards/rejected": -0.43271705508232117, "sft_loss": 0.6600314974784851, "step": 1343 }, { "epoch": 1.943600867678959, "grad_norm": 4.460845864755663, "learning_rate": 6.2746178662325176e-06, "logits/chosen": 0.2377113699913025, "logits/rejected": 0.05871110036969185, "logps/chosen": -0.52542644739151, "logps/rejected": -3.663177251815796, "loss": 0.6501, "odds_ratio_loss": 0.24479225277900696, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05254264175891876, "rewards/margins": 0.31377506256103516, "rewards/rejected": -0.3663177192211151, "sft_loss": 0.52542644739151, "step": 1344 }, { "epoch": 1.9450469992769341, "grad_norm": 2.6669621761593154, "learning_rate": 6.2720623609417315e-06, "logits/chosen": 0.1335269808769226, "logits/rejected": 0.08458232134580612, "logps/chosen": -0.49498024582862854, "logps/rejected": -3.2982349395751953, "loss": 0.5686, "odds_ratio_loss": 0.20877742767333984, "rewards/accuracies": 0.9375, "rewards/chosen": -0.049498021602630615, "rewards/margins": 0.2803254723548889, "rewards/rejected": -0.3298235237598419, "sft_loss": 0.49498024582862854, "step": 1345 }, { "epoch": 1.9464931308749096, "grad_norm": 2.3682656918705414, "learning_rate": 6.269505485816084e-06, "logits/chosen": 0.05494852364063263, "logits/rejected": 0.10123536735773087, "logps/chosen": -0.5091966986656189, "logps/rejected": -2.195608615875244, "loss": 0.5588, "odds_ratio_loss": 0.2521820366382599, "rewards/accuracies": 1.0, "rewards/chosen": -0.05091967061161995, "rewards/margins": 0.16864119470119476, "rewards/rejected": -0.21956084668636322, "sft_loss": 0.5091966986656189, "step": 1346 }, { "epoch": 1.947939262472885, "grad_norm": 2.6512013563553287, "learning_rate": 6.266947242397129e-06, "logits/chosen": -0.017354674637317657, "logits/rejected": 0.13395394384860992, "logps/chosen": -0.38874998688697815, "logps/rejected": -2.445746898651123, "loss": 0.4976, "odds_ratio_loss": 0.14418071508407593, "rewards/accuracies": 1.0, "rewards/chosen": -0.038874998688697815, "rewards/margins": 0.20569969713687897, "rewards/rejected": -0.24457469582557678, "sft_loss": 0.38874998688697815, "step": 1347 }, { "epoch": 1.9493853940708603, "grad_norm": 2.1651142096226916, "learning_rate": 6.264387632227237e-06, "logits/chosen": 0.15398314595222473, "logits/rejected": 0.08990249782800674, "logps/chosen": -0.5985828638076782, "logps/rejected": -3.3371827602386475, "loss": 0.5798, "odds_ratio_loss": 0.4579890966415405, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0598582923412323, "rewards/margins": 0.27385997772216797, "rewards/rejected": -0.33371827006340027, "sft_loss": 0.5985828638076782, "step": 1348 }, { "epoch": 1.950831525668836, "grad_norm": 2.1798011502799253, "learning_rate": 6.261826656849608e-06, "logits/chosen": 0.07891194522380829, "logits/rejected": 0.01930670440196991, "logps/chosen": -0.6371974945068359, "logps/rejected": -3.8084051609039307, "loss": 0.6145, "odds_ratio_loss": 0.38298970460891724, "rewards/accuracies": 0.875, "rewards/chosen": -0.06371975690126419, "rewards/margins": 0.3171207010746002, "rewards/rejected": -0.3808404505252838, "sft_loss": 0.6371974945068359, "step": 1349 }, { "epoch": 1.9522776572668112, "grad_norm": 2.6217473434650818, "learning_rate": 6.259264317808265e-06, "logits/chosen": 0.15782146155834198, "logits/rejected": 0.04384998604655266, "logps/chosen": -0.597080409526825, "logps/rejected": -1.9142271280288696, "loss": 0.6438, "odds_ratio_loss": 0.3650417923927307, "rewards/accuracies": 0.6875, "rewards/chosen": -0.059708043932914734, "rewards/margins": 0.13171467185020447, "rewards/rejected": -0.19142268598079681, "sft_loss": 0.597080409526825, "step": 1350 }, { "epoch": 1.9537237888647867, "grad_norm": 2.260839269208856, "learning_rate": 6.256700616648049e-06, "logits/chosen": 0.02536643110215664, "logits/rejected": -0.08087426424026489, "logps/chosen": -0.6350283026695251, "logps/rejected": -3.553034782409668, "loss": 0.6017, "odds_ratio_loss": 0.40396812558174133, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06350283324718475, "rewards/margins": 0.2918006479740143, "rewards/rejected": -0.3553035259246826, "sft_loss": 0.6350283026695251, "step": 1351 }, { "epoch": 1.9551699204627622, "grad_norm": 2.755672651848687, "learning_rate": 6.254135554914628e-06, "logits/chosen": 0.10055305063724518, "logits/rejected": 0.028218144550919533, "logps/chosen": -0.6204773187637329, "logps/rejected": -2.3922319412231445, "loss": 0.5954, "odds_ratio_loss": 0.26733872294425964, "rewards/accuracies": 0.9375, "rewards/chosen": -0.06204773113131523, "rewards/margins": 0.17717546224594116, "rewards/rejected": -0.2392231971025467, "sft_loss": 0.6204773187637329, "step": 1352 }, { "epoch": 1.9566160520607374, "grad_norm": 4.454723264605439, "learning_rate": 6.251569134154482e-06, "logits/chosen": 0.10258796811103821, "logits/rejected": 0.07780814170837402, "logps/chosen": -0.6633048057556152, "logps/rejected": -2.7840261459350586, "loss": 0.615, "odds_ratio_loss": 0.28764694929122925, "rewards/accuracies": 0.875, "rewards/chosen": -0.06633048504590988, "rewards/margins": 0.21207213401794434, "rewards/rejected": -0.2784026265144348, "sft_loss": 0.6633048057556152, "step": 1353 }, { "epoch": 1.958062183658713, "grad_norm": 2.7603204339165597, "learning_rate": 6.2490013559149215e-06, "logits/chosen": 0.10790781676769257, "logits/rejected": 0.08358579128980637, "logps/chosen": -0.6533021330833435, "logps/rejected": -1.621474027633667, "loss": 0.6408, "odds_ratio_loss": 0.42883116006851196, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06533021479845047, "rewards/margins": 0.09681720286607742, "rewards/rejected": -0.1621474176645279, "sft_loss": 0.6533021330833435, "step": 1354 }, { "epoch": 1.9595083152566883, "grad_norm": 2.125375605273302, "learning_rate": 6.246432221744068e-06, "logits/chosen": 0.1881817877292633, "logits/rejected": 0.1281939148902893, "logps/chosen": -0.5085922479629517, "logps/rejected": -3.4811835289001465, "loss": 0.5576, "odds_ratio_loss": 0.25084131956100464, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05085922032594681, "rewards/margins": 0.297259122133255, "rewards/rejected": -0.3481183350086212, "sft_loss": 0.5085922479629517, "step": 1355 }, { "epoch": 1.9609544468546638, "grad_norm": 2.20344338079031, "learning_rate": 6.2438617331908616e-06, "logits/chosen": 0.11778862029314041, "logits/rejected": 0.0436352975666523, "logps/chosen": -0.5298951864242554, "logps/rejected": -2.8205578327178955, "loss": 0.5737, "odds_ratio_loss": 0.2629122734069824, "rewards/accuracies": 0.9375, "rewards/chosen": -0.052989523857831955, "rewards/margins": 0.22906626760959625, "rewards/rejected": -0.2820557951927185, "sft_loss": 0.5298951864242554, "step": 1356 }, { "epoch": 1.9624005784526393, "grad_norm": 2.1196838584709465, "learning_rate": 6.241289891805059e-06, "logits/chosen": 0.017896192148327827, "logits/rejected": 0.0749349296092987, "logps/chosen": -0.5861757397651672, "logps/rejected": -2.2477173805236816, "loss": 0.589, "odds_ratio_loss": 0.31650716066360474, "rewards/accuracies": 0.875, "rewards/chosen": -0.05861757695674896, "rewards/margins": 0.166154146194458, "rewards/rejected": -0.22477173805236816, "sft_loss": 0.5861757397651672, "step": 1357 }, { "epoch": 1.9638467100506145, "grad_norm": 2.4776207306510662, "learning_rate": 6.238716699137233e-06, "logits/chosen": 0.0893697589635849, "logits/rejected": -0.046085745096206665, "logps/chosen": -0.6362526416778564, "logps/rejected": -3.7252695560455322, "loss": 0.5203, "odds_ratio_loss": 0.29234158992767334, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0636252611875534, "rewards/margins": 0.30890169739723206, "rewards/rejected": -0.3725269138813019, "sft_loss": 0.6362526416778564, "step": 1358 }, { "epoch": 1.96529284164859, "grad_norm": 2.6960848499910863, "learning_rate": 6.23614215673877e-06, "logits/chosen": 0.12152606248855591, "logits/rejected": 0.026878921315073967, "logps/chosen": -0.6179218888282776, "logps/rejected": -1.8269139528274536, "loss": 0.6289, "odds_ratio_loss": 0.3888968229293823, "rewards/accuracies": 0.875, "rewards/chosen": -0.06179218739271164, "rewards/margins": 0.12089921534061432, "rewards/rejected": -0.18269139528274536, "sft_loss": 0.6179218888282776, "step": 1359 }, { "epoch": 1.9667389732465654, "grad_norm": 2.598237230889151, "learning_rate": 6.233566266161874e-06, "logits/chosen": 0.09245370328426361, "logits/rejected": 0.12901175022125244, "logps/chosen": -0.6184776425361633, "logps/rejected": -2.0056989192962646, "loss": 0.569, "odds_ratio_loss": 0.3535006642341614, "rewards/accuracies": 0.875, "rewards/chosen": -0.061847761273384094, "rewards/margins": 0.13872212171554565, "rewards/rejected": -0.20056991279125214, "sft_loss": 0.6184776425361633, "step": 1360 }, { "epoch": 1.968185104844541, "grad_norm": 2.58726516689127, "learning_rate": 6.230989028959558e-06, "logits/chosen": -0.00898485817015171, "logits/rejected": 0.036827899515628815, "logps/chosen": -0.7747241258621216, "logps/rejected": -2.6281819343566895, "loss": 0.6434, "odds_ratio_loss": 0.3512718677520752, "rewards/accuracies": 0.875, "rewards/chosen": -0.07747241109609604, "rewards/margins": 0.18534579873085022, "rewards/rejected": -0.26281821727752686, "sft_loss": 0.7747241258621216, "step": 1361 }, { "epoch": 1.9696312364425164, "grad_norm": 2.662372117558744, "learning_rate": 6.228410446685645e-06, "logits/chosen": 0.06157606095075607, "logits/rejected": -0.07763740420341492, "logps/chosen": -0.8153557777404785, "logps/rejected": -3.0472562313079834, "loss": 0.7715, "odds_ratio_loss": 0.40258070826530457, "rewards/accuracies": 0.75, "rewards/chosen": -0.08153557777404785, "rewards/margins": 0.223190039396286, "rewards/rejected": -0.30472564697265625, "sft_loss": 0.8153557777404785, "step": 1362 }, { "epoch": 1.9710773680404916, "grad_norm": 2.437914328745462, "learning_rate": 6.225830520894776e-06, "logits/chosen": 0.045589592307806015, "logits/rejected": 0.004592648707330227, "logps/chosen": -0.6443754434585571, "logps/rejected": -3.426424741744995, "loss": 0.6514, "odds_ratio_loss": 0.39619654417037964, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06443755328655243, "rewards/margins": 0.27820488810539246, "rewards/rejected": -0.3426424562931061, "sft_loss": 0.6443754434585571, "step": 1363 }, { "epoch": 1.972523499638467, "grad_norm": 3.9002817508631162, "learning_rate": 6.2232492531423945e-06, "logits/chosen": 0.17493419349193573, "logits/rejected": 0.04828786849975586, "logps/chosen": -0.6880456209182739, "logps/rejected": -3.114243984222412, "loss": 0.615, "odds_ratio_loss": 0.3803248405456543, "rewards/accuracies": 0.875, "rewards/chosen": -0.06880456209182739, "rewards/margins": 0.2426198124885559, "rewards/rejected": -0.3114243745803833, "sft_loss": 0.6880456209182739, "step": 1364 }, { "epoch": 1.9739696312364425, "grad_norm": 2.7356981494522916, "learning_rate": 6.22066664498476e-06, "logits/chosen": 0.09148862957954407, "logits/rejected": 0.030430784448981285, "logps/chosen": -0.6174599528312683, "logps/rejected": -1.8549262285232544, "loss": 0.6543, "odds_ratio_loss": 0.3864895701408386, "rewards/accuracies": 0.875, "rewards/chosen": -0.06174599006772041, "rewards/margins": 0.12374662607908249, "rewards/rejected": -0.1854926198720932, "sft_loss": 0.6174599528312683, "step": 1365 }, { "epoch": 1.9754157628344178, "grad_norm": 2.2797866265947957, "learning_rate": 6.218082697978934e-06, "logits/chosen": 0.20202934741973877, "logits/rejected": 0.13954401016235352, "logps/chosen": -0.5369247198104858, "logps/rejected": -2.2839553356170654, "loss": 0.5875, "odds_ratio_loss": 0.3428112864494324, "rewards/accuracies": 0.875, "rewards/chosen": -0.053692467510700226, "rewards/margins": 0.17470306158065796, "rewards/rejected": -0.2283955216407776, "sft_loss": 0.5369247198104858, "step": 1366 }, { "epoch": 1.9768618944323935, "grad_norm": 3.3190838962022573, "learning_rate": 6.215497413682786e-06, "logits/chosen": 0.17240914702415466, "logits/rejected": 0.08776585757732391, "logps/chosen": -0.6855835914611816, "logps/rejected": -2.02882981300354, "loss": 0.6874, "odds_ratio_loss": 0.3461560606956482, "rewards/accuracies": 0.9375, "rewards/chosen": -0.06855836510658264, "rewards/margins": 0.13432462513446808, "rewards/rejected": -0.20288299024105072, "sft_loss": 0.6855835914611816, "step": 1367 }, { "epoch": 1.9783080260303687, "grad_norm": 3.256355322872998, "learning_rate": 6.212910793654999e-06, "logits/chosen": 0.061617545783519745, "logits/rejected": 0.03875245153903961, "logps/chosen": -0.5626049637794495, "logps/rejected": -2.379176616668701, "loss": 0.6345, "odds_ratio_loss": 0.30977410078048706, "rewards/accuracies": 0.9375, "rewards/chosen": -0.056260496377944946, "rewards/margins": 0.18165718019008636, "rewards/rejected": -0.23791766166687012, "sft_loss": 0.5626049637794495, "step": 1368 }, { "epoch": 1.9797541576283442, "grad_norm": 2.622988724915116, "learning_rate": 6.2103228394550515e-06, "logits/chosen": 0.06107413023710251, "logits/rejected": 0.10450088977813721, "logps/chosen": -0.6807230710983276, "logps/rejected": -2.148853302001953, "loss": 0.5965, "odds_ratio_loss": 0.32345831394195557, "rewards/accuracies": 0.875, "rewards/chosen": -0.06807231903076172, "rewards/margins": 0.14681300520896912, "rewards/rejected": -0.21488532423973083, "sft_loss": 0.6807230710983276, "step": 1369 }, { "epoch": 1.9812002892263196, "grad_norm": 2.682008253337646, "learning_rate": 6.207733552643231e-06, "logits/chosen": 0.1140088215470314, "logits/rejected": 0.08846345543861389, "logps/chosen": -0.5670576691627502, "logps/rejected": -1.7721221446990967, "loss": 0.6365, "odds_ratio_loss": 0.4360979199409485, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0567057728767395, "rewards/margins": 0.12050645053386688, "rewards/rejected": -0.1772122085094452, "sft_loss": 0.5670576691627502, "step": 1370 }, { "epoch": 1.9826464208242949, "grad_norm": 2.952914326179049, "learning_rate": 6.205142934780632e-06, "logits/chosen": 0.09416116774082184, "logits/rejected": 0.07221397757530212, "logps/chosen": -0.5539889931678772, "logps/rejected": -3.7700295448303223, "loss": 0.5511, "odds_ratio_loss": 0.27834856510162354, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05539890378713608, "rewards/margins": 0.32160407304763794, "rewards/rejected": -0.3770029544830322, "sft_loss": 0.5539889931678772, "step": 1371 }, { "epoch": 1.9840925524222706, "grad_norm": 2.4206064871782917, "learning_rate": 6.202550987429142e-06, "logits/chosen": 0.029989825561642647, "logits/rejected": -0.0034413645043969154, "logps/chosen": -0.8195029497146606, "logps/rejected": -2.111359119415283, "loss": 0.6337, "odds_ratio_loss": 0.43149256706237793, "rewards/accuracies": 0.75, "rewards/chosen": -0.08195029199123383, "rewards/margins": 0.12918563187122345, "rewards/rejected": -0.21113592386245728, "sft_loss": 0.8195029497146606, "step": 1372 }, { "epoch": 1.9855386840202458, "grad_norm": 2.5068243281617093, "learning_rate": 6.1999577121514595e-06, "logits/chosen": 0.15260952711105347, "logits/rejected": 0.10358820110559464, "logps/chosen": -0.6531432867050171, "logps/rejected": -2.2045655250549316, "loss": 0.6213, "odds_ratio_loss": 0.37667161226272583, "rewards/accuracies": 0.875, "rewards/chosen": -0.06531432271003723, "rewards/margins": 0.15514221787452698, "rewards/rejected": -0.2204565405845642, "sft_loss": 0.6531432867050171, "step": 1373 }, { "epoch": 1.9869848156182213, "grad_norm": 2.5425623444210954, "learning_rate": 6.197363110511078e-06, "logits/chosen": 0.1662987619638443, "logits/rejected": 0.020264727994799614, "logps/chosen": -0.485507607460022, "logps/rejected": -3.171699047088623, "loss": 0.6155, "odds_ratio_loss": 0.26024413108825684, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04855076223611832, "rewards/margins": 0.2686191499233246, "rewards/rejected": -0.3171699047088623, "sft_loss": 0.485507607460022, "step": 1374 }, { "epoch": 1.9884309472161967, "grad_norm": 3.1329413200606866, "learning_rate": 6.194767184072296e-06, "logits/chosen": 0.12748688459396362, "logits/rejected": 0.06533270329236984, "logps/chosen": -0.6616427302360535, "logps/rejected": -3.427320718765259, "loss": 0.5622, "odds_ratio_loss": 0.3506993353366852, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06616427004337311, "rewards/margins": 0.27656784653663635, "rewards/rejected": -0.34273210167884827, "sft_loss": 0.6616427302360535, "step": 1375 }, { "epoch": 1.989877078814172, "grad_norm": 2.888890196942516, "learning_rate": 6.192169934400202e-06, "logits/chosen": 0.14859473705291748, "logits/rejected": 0.07147763669490814, "logps/chosen": -0.39317965507507324, "logps/rejected": -3.1786046028137207, "loss": 0.5502, "odds_ratio_loss": 0.2858788073062897, "rewards/accuracies": 0.9375, "rewards/chosen": -0.039317965507507324, "rewards/margins": 0.27854251861572266, "rewards/rejected": -0.31786048412323, "sft_loss": 0.39317965507507324, "step": 1376 }, { "epoch": 1.9913232104121477, "grad_norm": 2.493676042638736, "learning_rate": 6.189571363060691e-06, "logits/chosen": 0.10260988026857376, "logits/rejected": 0.022080600261688232, "logps/chosen": -0.37363216280937195, "logps/rejected": -3.6419897079467773, "loss": 0.6115, "odds_ratio_loss": 0.17057818174362183, "rewards/accuracies": 0.9375, "rewards/chosen": -0.037363216280937195, "rewards/margins": 0.3268357515335083, "rewards/rejected": -0.3641989827156067, "sft_loss": 0.37363216280937195, "step": 1377 }, { "epoch": 1.992769342010123, "grad_norm": 2.5488337687194655, "learning_rate": 6.18697147162045e-06, "logits/chosen": 0.1628027707338333, "logits/rejected": 0.15313595533370972, "logps/chosen": -0.5107381343841553, "logps/rejected": -3.4903712272644043, "loss": 0.5691, "odds_ratio_loss": 0.26869532465934753, "rewards/accuracies": 1.0, "rewards/chosen": -0.05107381194829941, "rewards/margins": 0.29796332120895386, "rewards/rejected": -0.3490371108055115, "sft_loss": 0.5107381343841553, "step": 1378 }, { "epoch": 1.9942154736080984, "grad_norm": 2.990348500728237, "learning_rate": 6.184370261646964e-06, "logits/chosen": 0.25367701053619385, "logits/rejected": 0.05598670244216919, "logps/chosen": -0.5730169415473938, "logps/rejected": -4.140180587768555, "loss": 0.6438, "odds_ratio_loss": 0.31483423709869385, "rewards/accuracies": 0.875, "rewards/chosen": -0.05730169266462326, "rewards/margins": 0.3567163944244385, "rewards/rejected": -0.41401803493499756, "sft_loss": 0.5730169415473938, "step": 1379 }, { "epoch": 1.9956616052060738, "grad_norm": 2.4464817090945754, "learning_rate": 6.181767734708512e-06, "logits/chosen": 0.13885369896888733, "logits/rejected": 0.1410420686006546, "logps/chosen": -0.7233635187149048, "logps/rejected": -1.2208565473556519, "loss": 0.6577, "odds_ratio_loss": 0.4889734983444214, "rewards/accuracies": 0.75, "rewards/chosen": -0.072336345911026, "rewards/margins": 0.04974930360913277, "rewards/rejected": -0.12208565324544907, "sft_loss": 0.7233635187149048, "step": 1380 }, { "epoch": 1.997107736804049, "grad_norm": 2.624728985693753, "learning_rate": 6.179163892374164e-06, "logits/chosen": 0.14182862639427185, "logits/rejected": -0.06210314854979515, "logps/chosen": -0.7554327845573425, "logps/rejected": -3.2970387935638428, "loss": 0.6579, "odds_ratio_loss": 0.3780558109283447, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07554327696561813, "rewards/margins": 0.254160612821579, "rewards/rejected": -0.3297038972377777, "sft_loss": 0.7554327845573425, "step": 1381 }, { "epoch": 1.9985538684020245, "grad_norm": 2.500628814454199, "learning_rate": 6.176558736213793e-06, "logits/chosen": 0.09440597891807556, "logits/rejected": 0.0984802171587944, "logps/chosen": -0.535467267036438, "logps/rejected": -2.6558072566986084, "loss": 0.531, "odds_ratio_loss": 0.30303633213043213, "rewards/accuracies": 0.875, "rewards/chosen": -0.0535467267036438, "rewards/margins": 0.21203403174877167, "rewards/rejected": -0.2655807435512543, "sft_loss": 0.535467267036438, "step": 1382 }, { "epoch": 2.0, "grad_norm": 3.28712021596782, "learning_rate": 6.173952267798052e-06, "logits/chosen": 0.2147250473499298, "logits/rejected": 0.11980797350406647, "logps/chosen": -0.5156925916671753, "logps/rejected": -3.986497402191162, "loss": 0.5173, "odds_ratio_loss": 0.28765448927879333, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05156925693154335, "rewards/margins": 0.34708043932914734, "rewards/rejected": -0.3986497223377228, "sft_loss": 0.5156925916671753, "step": 1383 }, { "epoch": 2.0014461315979752, "grad_norm": 3.2315398678148486, "learning_rate": 6.171344488698393e-06, "logits/chosen": 0.12111207842826843, "logits/rejected": 0.17495957016944885, "logps/chosen": -0.3766588866710663, "logps/rejected": -2.8388314247131348, "loss": 0.382, "odds_ratio_loss": 0.17233169078826904, "rewards/accuracies": 1.0, "rewards/chosen": -0.03766588866710663, "rewards/margins": 0.246217280626297, "rewards/rejected": -0.28388315439224243, "sft_loss": 0.3766588866710663, "step": 1384 }, { "epoch": 2.002892263195951, "grad_norm": 3.189280598961191, "learning_rate": 6.168735400487054e-06, "logits/chosen": 0.03559612110257149, "logits/rejected": 0.03742482513189316, "logps/chosen": -0.5957620739936829, "logps/rejected": -2.5465335845947266, "loss": 0.475, "odds_ratio_loss": 0.32061469554901123, "rewards/accuracies": 0.875, "rewards/chosen": -0.059576209634542465, "rewards/margins": 0.19507713615894318, "rewards/rejected": -0.25465336441993713, "sft_loss": 0.5957620739936829, "step": 1385 }, { "epoch": 2.004338394793926, "grad_norm": 3.0677554248229746, "learning_rate": 6.166125004737065e-06, "logits/chosen": -0.16084589064121246, "logits/rejected": -0.14135894179344177, "logps/chosen": -0.32411518692970276, "logps/rejected": -2.226219415664673, "loss": 0.3491, "odds_ratio_loss": 0.1918307989835739, "rewards/accuracies": 0.9375, "rewards/chosen": -0.032411519438028336, "rewards/margins": 0.19021041691303253, "rewards/rejected": -0.22262193262577057, "sft_loss": 0.32411518692970276, "step": 1386 }, { "epoch": 2.005784526391902, "grad_norm": 2.1849423111948445, "learning_rate": 6.163513303022243e-06, "logits/chosen": -0.19721360504627228, "logits/rejected": -0.12379816919565201, "logps/chosen": -0.26573044061660767, "logps/rejected": -4.349565505981445, "loss": 0.3741, "odds_ratio_loss": 0.1065993681550026, "rewards/accuracies": 1.0, "rewards/chosen": -0.026573043316602707, "rewards/margins": 0.40838348865509033, "rewards/rejected": -0.43495655059814453, "sft_loss": 0.26573044061660767, "step": 1387 }, { "epoch": 2.007230657989877, "grad_norm": 2.3768163135616973, "learning_rate": 6.160900296917193e-06, "logits/chosen": -0.33068203926086426, "logits/rejected": -0.24483588337898254, "logps/chosen": -0.3788696527481079, "logps/rejected": -2.304666519165039, "loss": 0.3719, "odds_ratio_loss": 0.1597655564546585, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03788696601986885, "rewards/margins": 0.19257968664169312, "rewards/rejected": -0.23046666383743286, "sft_loss": 0.3788696527481079, "step": 1388 }, { "epoch": 2.0086767895878523, "grad_norm": 2.5696151099744426, "learning_rate": 6.158285987997306e-06, "logits/chosen": -0.3501858115196228, "logits/rejected": -0.3419753909111023, "logps/chosen": -0.3169471323490143, "logps/rejected": -3.3578648567199707, "loss": 0.3397, "odds_ratio_loss": 0.16776108741760254, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03169471397995949, "rewards/margins": 0.30409175157546997, "rewards/rejected": -0.33578646183013916, "sft_loss": 0.3169471323490143, "step": 1389 }, { "epoch": 2.010122921185828, "grad_norm": 3.936520281332721, "learning_rate": 6.155670377838758e-06, "logits/chosen": -0.6566485166549683, "logits/rejected": -0.5000695586204529, "logps/chosen": -0.3887198567390442, "logps/rejected": -3.2703821659088135, "loss": 0.4191, "odds_ratio_loss": 0.18304914236068726, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03887198492884636, "rewards/margins": 0.28816622495651245, "rewards/rejected": -0.3270382285118103, "sft_loss": 0.3887198567390442, "step": 1390 }, { "epoch": 2.0115690527838033, "grad_norm": 6.474734950245381, "learning_rate": 6.153053468018511e-06, "logits/chosen": -0.536896288394928, "logits/rejected": -0.40524619817733765, "logps/chosen": -0.3791639804840088, "logps/rejected": -2.7076427936553955, "loss": 0.4267, "odds_ratio_loss": 0.10726868361234665, "rewards/accuracies": 1.0, "rewards/chosen": -0.037916399538517, "rewards/margins": 0.2328478991985321, "rewards/rejected": -0.2707642912864685, "sft_loss": 0.3791639804840088, "step": 1391 }, { "epoch": 2.013015184381779, "grad_norm": 6.726741776150189, "learning_rate": 6.15043526011431e-06, "logits/chosen": -0.6585301756858826, "logits/rejected": -0.3784843683242798, "logps/chosen": -0.44925493001937866, "logps/rejected": -2.590120792388916, "loss": 0.4351, "odds_ratio_loss": 0.19256196916103363, "rewards/accuracies": 0.9375, "rewards/chosen": -0.044925495982170105, "rewards/margins": 0.21408656239509583, "rewards/rejected": -0.2590120732784271, "sft_loss": 0.44925493001937866, "step": 1392 }, { "epoch": 2.014461315979754, "grad_norm": 3.9009213533091125, "learning_rate": 6.14781575570468e-06, "logits/chosen": -0.5194917321205139, "logits/rejected": -0.2270890772342682, "logps/chosen": -0.31589779257774353, "logps/rejected": -5.69291877746582, "loss": 0.3387, "odds_ratio_loss": 0.0934254378080368, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03158978000283241, "rewards/margins": 0.5377020239830017, "rewards/rejected": -0.5692918300628662, "sft_loss": 0.31589779257774353, "step": 1393 }, { "epoch": 2.0159074475777294, "grad_norm": 2.9063940794517955, "learning_rate": 6.145194956368932e-06, "logits/chosen": -0.52901291847229, "logits/rejected": -0.3962056636810303, "logps/chosen": -0.24508774280548096, "logps/rejected": -4.180789470672607, "loss": 0.3288, "odds_ratio_loss": 0.12533631920814514, "rewards/accuracies": 1.0, "rewards/chosen": -0.024508774280548096, "rewards/margins": 0.3935701847076416, "rewards/rejected": -0.4180789589881897, "sft_loss": 0.24508774280548096, "step": 1394 }, { "epoch": 2.017353579175705, "grad_norm": 3.1811792443833067, "learning_rate": 6.142572863687157e-06, "logits/chosen": -0.24593108892440796, "logits/rejected": -0.17790842056274414, "logps/chosen": -0.28362900018692017, "logps/rejected": -3.8639004230499268, "loss": 0.4598, "odds_ratio_loss": 0.1365346908569336, "rewards/accuracies": 1.0, "rewards/chosen": -0.028362900018692017, "rewards/margins": 0.3580271601676941, "rewards/rejected": -0.3863900601863861, "sft_loss": 0.28362900018692017, "step": 1395 }, { "epoch": 2.0187997107736804, "grad_norm": 2.742354366098211, "learning_rate": 6.13994947924022e-06, "logits/chosen": -0.4542756974697113, "logits/rejected": -0.5005182027816772, "logps/chosen": -0.4230310916900635, "logps/rejected": -4.989255905151367, "loss": 0.3642, "odds_ratio_loss": 0.2016012817621231, "rewards/accuracies": 1.0, "rewards/chosen": -0.04230311140418053, "rewards/margins": 0.45662248134613037, "rewards/rejected": -0.4989256262779236, "sft_loss": 0.4230310916900635, "step": 1396 }, { "epoch": 2.0202458423716556, "grad_norm": 2.683983578822834, "learning_rate": 6.137324804609774e-06, "logits/chosen": -0.19099709391593933, "logits/rejected": -0.16608811914920807, "logps/chosen": -0.2979000210762024, "logps/rejected": -4.687519550323486, "loss": 0.4058, "odds_ratio_loss": 0.09216830134391785, "rewards/accuracies": 1.0, "rewards/chosen": -0.029789999127388, "rewards/margins": 0.4389619529247284, "rewards/rejected": -0.4687519669532776, "sft_loss": 0.2979000210762024, "step": 1397 }, { "epoch": 2.0216919739696313, "grad_norm": 3.327282280218847, "learning_rate": 6.134698841378243e-06, "logits/chosen": -0.3360438644886017, "logits/rejected": -0.18919801712036133, "logps/chosen": -0.4767918288707733, "logps/rejected": -3.2631969451904297, "loss": 0.3656, "odds_ratio_loss": 0.15634480118751526, "rewards/accuracies": 1.0, "rewards/chosen": -0.04767918586730957, "rewards/margins": 0.278640478849411, "rewards/rejected": -0.32631969451904297, "sft_loss": 0.4767918288707733, "step": 1398 }, { "epoch": 2.0231381055676065, "grad_norm": 2.2233945899908285, "learning_rate": 6.132071591128829e-06, "logits/chosen": -0.2488166093826294, "logits/rejected": -0.12372294068336487, "logps/chosen": -0.39679771661758423, "logps/rejected": -3.140080451965332, "loss": 0.3152, "odds_ratio_loss": 0.1701274961233139, "rewards/accuracies": 1.0, "rewards/chosen": -0.03967977315187454, "rewards/margins": 0.2743282914161682, "rewards/rejected": -0.31400808691978455, "sft_loss": 0.39679771661758423, "step": 1399 }, { "epoch": 2.0245842371655822, "grad_norm": 2.6076662390403236, "learning_rate": 6.129443055445512e-06, "logits/chosen": -0.29236453771591187, "logits/rejected": -0.30255061388015747, "logps/chosen": -0.315972238779068, "logps/rejected": -3.8869261741638184, "loss": 0.3984, "odds_ratio_loss": 0.1734756976366043, "rewards/accuracies": 1.0, "rewards/chosen": -0.03159722685813904, "rewards/margins": 0.3570953607559204, "rewards/rejected": -0.38869261741638184, "sft_loss": 0.315972238779068, "step": 1400 }, { "epoch": 2.0260303687635575, "grad_norm": 2.3586036539147797, "learning_rate": 6.1268132359130475e-06, "logits/chosen": -0.5538555383682251, "logits/rejected": -0.3169947564601898, "logps/chosen": -0.4706674814224243, "logps/rejected": -3.450441360473633, "loss": 0.3946, "odds_ratio_loss": 0.12920330464839935, "rewards/accuracies": 1.0, "rewards/chosen": -0.04706674814224243, "rewards/margins": 0.29797738790512085, "rewards/rejected": -0.34504416584968567, "sft_loss": 0.4706674814224243, "step": 1401 }, { "epoch": 2.0274765003615327, "grad_norm": 2.4006021186149544, "learning_rate": 6.12418213411696e-06, "logits/chosen": -0.32977399230003357, "logits/rejected": -0.2614250183105469, "logps/chosen": -0.234191432595253, "logps/rejected": -2.427402973175049, "loss": 0.3291, "odds_ratio_loss": 0.10665269196033478, "rewards/accuracies": 1.0, "rewards/chosen": -0.02341914176940918, "rewards/margins": 0.21932116150856018, "rewards/rejected": -0.24274030327796936, "sft_loss": 0.234191432595253, "step": 1402 }, { "epoch": 2.0289226319595084, "grad_norm": 2.4765029159664924, "learning_rate": 6.121549751643554e-06, "logits/chosen": -0.4366395175457001, "logits/rejected": -0.2806912660598755, "logps/chosen": -0.3349638879299164, "logps/rejected": -2.964669704437256, "loss": 0.3604, "odds_ratio_loss": 0.10340587794780731, "rewards/accuracies": 1.0, "rewards/chosen": -0.03349638730287552, "rewards/margins": 0.26297056674957275, "rewards/rejected": -0.2964669466018677, "sft_loss": 0.3349638879299164, "step": 1403 }, { "epoch": 2.0303687635574836, "grad_norm": 2.285256352398014, "learning_rate": 6.118916090079901e-06, "logits/chosen": -0.35529831051826477, "logits/rejected": -0.43684202432632446, "logps/chosen": -0.19356098771095276, "logps/rejected": -2.894097328186035, "loss": 0.3484, "odds_ratio_loss": 0.06699930876493454, "rewards/accuracies": 1.0, "rewards/chosen": -0.019356099888682365, "rewards/margins": 0.2700536251068115, "rewards/rejected": -0.28940972685813904, "sft_loss": 0.19356098771095276, "step": 1404 }, { "epoch": 2.0318148951554593, "grad_norm": 2.7019259490766685, "learning_rate": 6.116281151013846e-06, "logits/chosen": -0.5889612436294556, "logits/rejected": -0.3919902443885803, "logps/chosen": -0.31004148721694946, "logps/rejected": -4.649913311004639, "loss": 0.3577, "odds_ratio_loss": 0.11446838825941086, "rewards/accuracies": 1.0, "rewards/chosen": -0.031004149466753006, "rewards/margins": 0.43398720026016235, "rewards/rejected": -0.46499139070510864, "sft_loss": 0.31004148721694946, "step": 1405 }, { "epoch": 2.0332610267534346, "grad_norm": 2.244900017647067, "learning_rate": 6.113644936034002e-06, "logits/chosen": -0.5037296414375305, "logits/rejected": -0.39371100068092346, "logps/chosen": -0.43020951747894287, "logps/rejected": -2.473050832748413, "loss": 0.3748, "odds_ratio_loss": 0.1621953845024109, "rewards/accuracies": 1.0, "rewards/chosen": -0.04302094876766205, "rewards/margins": 0.20428414642810822, "rewards/rejected": -0.24730511009693146, "sft_loss": 0.43020951747894287, "step": 1406 }, { "epoch": 2.03470715835141, "grad_norm": 2.474297866515971, "learning_rate": 6.111007446729754e-06, "logits/chosen": -0.8331849575042725, "logits/rejected": -0.4606708288192749, "logps/chosen": -0.2885475754737854, "logps/rejected": -4.700456619262695, "loss": 0.3241, "odds_ratio_loss": 0.06865033507347107, "rewards/accuracies": 1.0, "rewards/chosen": -0.02885475754737854, "rewards/margins": 0.4411908984184265, "rewards/rejected": -0.47004562616348267, "sft_loss": 0.2885475754737854, "step": 1407 }, { "epoch": 2.0361532899493855, "grad_norm": 4.328881723447609, "learning_rate": 6.108368684691255e-06, "logits/chosen": -0.3607381284236908, "logits/rejected": -0.3933161497116089, "logps/chosen": -0.5243850946426392, "logps/rejected": -2.1831188201904297, "loss": 0.3454, "odds_ratio_loss": 0.21166378259658813, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05243851616978645, "rewards/margins": 0.16587337851524353, "rewards/rejected": -0.2183118760585785, "sft_loss": 0.5243850946426392, "step": 1408 }, { "epoch": 2.0375994215473607, "grad_norm": 2.6476216733494073, "learning_rate": 6.105728651509423e-06, "logits/chosen": -0.26458385586738586, "logits/rejected": -0.1522885262966156, "logps/chosen": -0.3708568513393402, "logps/rejected": -3.2693662643432617, "loss": 0.3426, "odds_ratio_loss": 0.14628729224205017, "rewards/accuracies": 1.0, "rewards/chosen": -0.03708568960428238, "rewards/margins": 0.28985095024108887, "rewards/rejected": -0.32693663239479065, "sft_loss": 0.3708568513393402, "step": 1409 }, { "epoch": 2.0390455531453364, "grad_norm": 2.284226764361494, "learning_rate": 6.103087348775945e-06, "logits/chosen": -0.2937602996826172, "logits/rejected": -0.39900609850883484, "logps/chosen": -0.5060720443725586, "logps/rejected": -2.6557836532592773, "loss": 0.3781, "odds_ratio_loss": 0.194400817155838, "rewards/accuracies": 1.0, "rewards/chosen": -0.05060720071196556, "rewards/margins": 0.2149711549282074, "rewards/rejected": -0.26557838916778564, "sft_loss": 0.5060720443725586, "step": 1410 }, { "epoch": 2.0404916847433117, "grad_norm": 2.6244235607829243, "learning_rate": 6.100444778083271e-06, "logits/chosen": -0.46006476879119873, "logits/rejected": -0.16463829576969147, "logps/chosen": -0.4287043809890747, "logps/rejected": -2.332899332046509, "loss": 0.4342, "odds_ratio_loss": 0.13950975239276886, "rewards/accuracies": 1.0, "rewards/chosen": -0.04287043958902359, "rewards/margins": 0.1904195100069046, "rewards/rejected": -0.2332899272441864, "sft_loss": 0.4287043809890747, "step": 1411 }, { "epoch": 2.041937816341287, "grad_norm": 2.2183067171761057, "learning_rate": 6.097800941024618e-06, "logits/chosen": -0.4148803651332855, "logits/rejected": -0.4651082754135132, "logps/chosen": -0.39564990997314453, "logps/rejected": -2.487851858139038, "loss": 0.3184, "odds_ratio_loss": 0.15196757018566132, "rewards/accuracies": 1.0, "rewards/chosen": -0.03956499695777893, "rewards/margins": 0.20922018587589264, "rewards/rejected": -0.24878518283367157, "sft_loss": 0.39564990997314453, "step": 1412 }, { "epoch": 2.0433839479392626, "grad_norm": 2.5032801033265715, "learning_rate": 6.095155839193964e-06, "logits/chosen": -0.2763396203517914, "logits/rejected": -0.12382631003856659, "logps/chosen": -0.28163591027259827, "logps/rejected": -4.132618427276611, "loss": 0.3437, "odds_ratio_loss": 0.08366397768259048, "rewards/accuracies": 1.0, "rewards/chosen": -0.028163593262434006, "rewards/margins": 0.3850982189178467, "rewards/rejected": -0.4132618308067322, "sft_loss": 0.28163591027259827, "step": 1413 }, { "epoch": 2.044830079537238, "grad_norm": 2.667578076549321, "learning_rate": 6.092509474186052e-06, "logits/chosen": -0.40817150473594666, "logits/rejected": -0.4003009796142578, "logps/chosen": -0.459774911403656, "logps/rejected": -1.49889075756073, "loss": 0.4754, "odds_ratio_loss": 0.22171664237976074, "rewards/accuracies": 1.0, "rewards/chosen": -0.04597749188542366, "rewards/margins": 0.10391158610582352, "rewards/rejected": -0.14988906681537628, "sft_loss": 0.459774911403656, "step": 1414 }, { "epoch": 2.0462762111352135, "grad_norm": 4.948341598400315, "learning_rate": 6.089861847596385e-06, "logits/chosen": -0.3838014006614685, "logits/rejected": -0.32643434405326843, "logps/chosen": -0.42405328154563904, "logps/rejected": -3.091123104095459, "loss": 0.4266, "odds_ratio_loss": 0.1393635869026184, "rewards/accuracies": 1.0, "rewards/chosen": -0.04240532964468002, "rewards/margins": 0.2667069733142853, "rewards/rejected": -0.3091123402118683, "sft_loss": 0.42405328154563904, "step": 1415 }, { "epoch": 2.0477223427331888, "grad_norm": 2.333675906831936, "learning_rate": 6.087212961021226e-06, "logits/chosen": -0.361023873090744, "logits/rejected": -0.31981074810028076, "logps/chosen": -0.2632179856300354, "logps/rejected": -3.056180000305176, "loss": 0.3845, "odds_ratio_loss": 0.1083202064037323, "rewards/accuracies": 1.0, "rewards/chosen": -0.02632179856300354, "rewards/margins": 0.2792961597442627, "rewards/rejected": -0.3056179881095886, "sft_loss": 0.2632179856300354, "step": 1416 }, { "epoch": 2.049168474331164, "grad_norm": 2.291013338955284, "learning_rate": 6.084562816057599e-06, "logits/chosen": -0.35837870836257935, "logits/rejected": -0.29917028546333313, "logps/chosen": -0.22934255003929138, "logps/rejected": -3.9777276515960693, "loss": 0.3472, "odds_ratio_loss": 0.0663267970085144, "rewards/accuracies": 1.0, "rewards/chosen": -0.02293425425887108, "rewards/margins": 0.37483853101730347, "rewards/rejected": -0.39777275919914246, "sft_loss": 0.22934255003929138, "step": 1417 }, { "epoch": 2.0506146059291397, "grad_norm": 2.361170250846157, "learning_rate": 6.081911414303286e-06, "logits/chosen": -0.3557944595813751, "logits/rejected": -0.2875556945800781, "logps/chosen": -0.33980539441108704, "logps/rejected": -2.679823398590088, "loss": 0.3357, "odds_ratio_loss": 0.13775214552879333, "rewards/accuracies": 1.0, "rewards/chosen": -0.033980537205934525, "rewards/margins": 0.23400181531906128, "rewards/rejected": -0.2679823637008667, "sft_loss": 0.33980539441108704, "step": 1418 }, { "epoch": 2.052060737527115, "grad_norm": 2.658491704323656, "learning_rate": 6.0792587573568285e-06, "logits/chosen": -0.3484780192375183, "logits/rejected": -0.21401377022266388, "logps/chosen": -0.3252895474433899, "logps/rejected": -3.0992331504821777, "loss": 0.3577, "odds_ratio_loss": 0.08834249526262283, "rewards/accuracies": 1.0, "rewards/chosen": -0.03252895176410675, "rewards/margins": 0.2773943543434143, "rewards/rejected": -0.30992335081100464, "sft_loss": 0.3252895474433899, "step": 1419 }, { "epoch": 2.05350686912509, "grad_norm": 2.505632092772328, "learning_rate": 6.076604846817522e-06, "logits/chosen": -0.46591854095458984, "logits/rejected": -0.24125395715236664, "logps/chosen": -0.39621445536613464, "logps/rejected": -3.352841854095459, "loss": 0.3723, "odds_ratio_loss": 0.10687874257564545, "rewards/accuracies": 1.0, "rewards/chosen": -0.039621446281671524, "rewards/margins": 0.2956627607345581, "rewards/rejected": -0.33528417348861694, "sft_loss": 0.39621445536613464, "step": 1420 }, { "epoch": 2.054953000723066, "grad_norm": 2.2675780954478486, "learning_rate": 6.073949684285419e-06, "logits/chosen": -0.2978203296661377, "logits/rejected": -0.15997031331062317, "logps/chosen": -0.3860008716583252, "logps/rejected": -2.842381000518799, "loss": 0.332, "odds_ratio_loss": 0.17229007184505463, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03860008716583252, "rewards/margins": 0.24563804268836975, "rewards/rejected": -0.28423812985420227, "sft_loss": 0.3860008716583252, "step": 1421 }, { "epoch": 2.056399132321041, "grad_norm": 2.3078172574413096, "learning_rate": 6.071293271361327e-06, "logits/chosen": -0.36375710368156433, "logits/rejected": -0.291500061750412, "logps/chosen": -0.37237662076950073, "logps/rejected": -3.426270008087158, "loss": 0.3802, "odds_ratio_loss": 0.14352582395076752, "rewards/accuracies": 1.0, "rewards/chosen": -0.037237659096717834, "rewards/margins": 0.3053893446922302, "rewards/rejected": -0.34262698888778687, "sft_loss": 0.37237662076950073, "step": 1422 }, { "epoch": 2.057845263919017, "grad_norm": 1.983808931410135, "learning_rate": 6.068635609646808e-06, "logits/chosen": -0.41622287034988403, "logits/rejected": -0.169271320104599, "logps/chosen": -0.3853681683540344, "logps/rejected": -4.620948791503906, "loss": 0.3248, "odds_ratio_loss": 0.05672647804021835, "rewards/accuracies": 1.0, "rewards/chosen": -0.03853682056069374, "rewards/margins": 0.4235580861568451, "rewards/rejected": -0.4620949327945709, "sft_loss": 0.3853681683540344, "step": 1423 }, { "epoch": 2.059291395516992, "grad_norm": 2.335324198672471, "learning_rate": 6.065976700744174e-06, "logits/chosen": -0.231398805975914, "logits/rejected": -0.194851815700531, "logps/chosen": -0.39015644788742065, "logps/rejected": -4.776452541351318, "loss": 0.3797, "odds_ratio_loss": 0.1511651873588562, "rewards/accuracies": 1.0, "rewards/chosen": -0.039015647023916245, "rewards/margins": 0.4386296272277832, "rewards/rejected": -0.47764530777931213, "sft_loss": 0.39015644788742065, "step": 1424 }, { "epoch": 2.0607375271149673, "grad_norm": 1.9881483716659663, "learning_rate": 6.063316546256494e-06, "logits/chosen": -0.4184289574623108, "logits/rejected": -0.23517775535583496, "logps/chosen": -0.2277361899614334, "logps/rejected": -3.0598249435424805, "loss": 0.3115, "odds_ratio_loss": 0.08884076774120331, "rewards/accuracies": 1.0, "rewards/chosen": -0.0227736197412014, "rewards/margins": 0.2832088768482208, "rewards/rejected": -0.3059825003147125, "sft_loss": 0.2277361899614334, "step": 1425 }, { "epoch": 2.062183658712943, "grad_norm": 3.2525491666588797, "learning_rate": 6.060655147787583e-06, "logits/chosen": -0.6589328646659851, "logits/rejected": -0.32301610708236694, "logps/chosen": -0.2796526551246643, "logps/rejected": -2.624934673309326, "loss": 0.3279, "odds_ratio_loss": 0.08123660087585449, "rewards/accuracies": 1.0, "rewards/chosen": -0.02796526625752449, "rewards/margins": 0.23452815413475037, "rewards/rejected": -0.26249343156814575, "sft_loss": 0.2796526551246643, "step": 1426 }, { "epoch": 2.063629790310918, "grad_norm": 2.345153662950682, "learning_rate": 6.057992506942011e-06, "logits/chosen": -0.40048494935035706, "logits/rejected": -0.2785194218158722, "logps/chosen": -0.248963862657547, "logps/rejected": -2.053396224975586, "loss": 0.3094, "odds_ratio_loss": 0.08460384607315063, "rewards/accuracies": 1.0, "rewards/chosen": -0.02489638887345791, "rewards/margins": 0.18044325709342957, "rewards/rejected": -0.20533964037895203, "sft_loss": 0.248963862657547, "step": 1427 }, { "epoch": 2.065075921908894, "grad_norm": 2.48367237998288, "learning_rate": 6.05532862532509e-06, "logits/chosen": -0.39829522371292114, "logits/rejected": -0.17285127937793732, "logps/chosen": -0.3011299669742584, "logps/rejected": -2.470442295074463, "loss": 0.3419, "odds_ratio_loss": 0.060505881905555725, "rewards/accuracies": 1.0, "rewards/chosen": -0.03011299856007099, "rewards/margins": 0.21693125367164612, "rewards/rejected": -0.24704423546791077, "sft_loss": 0.3011299669742584, "step": 1428 }, { "epoch": 2.066522053506869, "grad_norm": 2.6112651161735503, "learning_rate": 6.052663504542885e-06, "logits/chosen": -0.23426532745361328, "logits/rejected": -0.22717681527137756, "logps/chosen": -0.4564788341522217, "logps/rejected": -3.6011054515838623, "loss": 0.363, "odds_ratio_loss": 0.15331970155239105, "rewards/accuracies": 1.0, "rewards/chosen": -0.04564788565039635, "rewards/margins": 0.31446266174316406, "rewards/rejected": -0.3601105511188507, "sft_loss": 0.4564788341522217, "step": 1429 }, { "epoch": 2.0679681851048444, "grad_norm": 2.6695083429349715, "learning_rate": 6.049997146202209e-06, "logits/chosen": -0.2030077427625656, "logits/rejected": -0.09532079100608826, "logps/chosen": -0.28371351957321167, "logps/rejected": -3.22945499420166, "loss": 0.3147, "odds_ratio_loss": 0.10474459081888199, "rewards/accuracies": 0.9375, "rewards/chosen": -0.028371348977088928, "rewards/margins": 0.29457414150238037, "rewards/rejected": -0.3229454755783081, "sft_loss": 0.28371351957321167, "step": 1430 }, { "epoch": 2.06941431670282, "grad_norm": 2.0842316658257376, "learning_rate": 6.047329551910618e-06, "logits/chosen": -0.42209625244140625, "logits/rejected": -0.3098366856575012, "logps/chosen": -0.44695577025413513, "logps/rejected": -4.16575813293457, "loss": 0.3566, "odds_ratio_loss": 0.1542935073375702, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04469558224081993, "rewards/margins": 0.3718802332878113, "rewards/rejected": -0.41657575964927673, "sft_loss": 0.44695577025413513, "step": 1431 }, { "epoch": 2.0708604483007953, "grad_norm": 2.558852343608781, "learning_rate": 6.044660723276416e-06, "logits/chosen": -0.49510613083839417, "logits/rejected": -0.3418270945549011, "logps/chosen": -0.4254521131515503, "logps/rejected": -3.873826742172241, "loss": 0.3418, "odds_ratio_loss": 0.10353498160839081, "rewards/accuracies": 1.0, "rewards/chosen": -0.04254521429538727, "rewards/margins": 0.3448374569416046, "rewards/rejected": -0.38738271594047546, "sft_loss": 0.4254521131515503, "step": 1432 }, { "epoch": 2.072306579898771, "grad_norm": 2.2228242975974393, "learning_rate": 6.0419906619086485e-06, "logits/chosen": -0.4619770646095276, "logits/rejected": -0.21492421627044678, "logps/chosen": -0.23521602153778076, "logps/rejected": -3.90215802192688, "loss": 0.3316, "odds_ratio_loss": 0.0810660645365715, "rewards/accuracies": 1.0, "rewards/chosen": -0.023521605879068375, "rewards/margins": 0.36669421195983887, "rewards/rejected": -0.39021581411361694, "sft_loss": 0.23521602153778076, "step": 1433 }, { "epoch": 2.0737527114967462, "grad_norm": 2.5406277424592143, "learning_rate": 6.0393193694171055e-06, "logits/chosen": -0.32404616475105286, "logits/rejected": -0.1625586450099945, "logps/chosen": -0.26816701889038086, "logps/rejected": -4.354163646697998, "loss": 0.4009, "odds_ratio_loss": 0.08176937699317932, "rewards/accuracies": 1.0, "rewards/chosen": -0.026816701516509056, "rewards/margins": 0.4085996747016907, "rewards/rejected": -0.43541640043258667, "sft_loss": 0.26816701889038086, "step": 1434 }, { "epoch": 2.0751988430947215, "grad_norm": 2.2669801066940907, "learning_rate": 6.03664684741232e-06, "logits/chosen": -0.5940994024276733, "logits/rejected": -0.299757182598114, "logps/chosen": -0.3598395586013794, "logps/rejected": -3.4677209854125977, "loss": 0.3712, "odds_ratio_loss": 0.1251244992017746, "rewards/accuracies": 1.0, "rewards/chosen": -0.03598395735025406, "rewards/margins": 0.3107881546020508, "rewards/rejected": -0.34677213430404663, "sft_loss": 0.3598395586013794, "step": 1435 }, { "epoch": 2.076644974692697, "grad_norm": 2.1993829720352602, "learning_rate": 6.033973097505564e-06, "logits/chosen": -0.197866290807724, "logits/rejected": -0.18637999892234802, "logps/chosen": -0.3552109897136688, "logps/rejected": -4.12385892868042, "loss": 0.3592, "odds_ratio_loss": 0.19755232334136963, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03552110120654106, "rewards/margins": 0.3768647611141205, "rewards/rejected": -0.41238588094711304, "sft_loss": 0.3552109897136688, "step": 1436 }, { "epoch": 2.0780911062906724, "grad_norm": 2.4029872434354282, "learning_rate": 6.031298121308852e-06, "logits/chosen": -0.24159984290599823, "logits/rejected": -0.18060526251792908, "logps/chosen": -0.2984180152416229, "logps/rejected": -3.943260908126831, "loss": 0.3652, "odds_ratio_loss": 0.1224970817565918, "rewards/accuracies": 0.9375, "rewards/chosen": -0.029841801151633263, "rewards/margins": 0.3644842803478241, "rewards/rejected": -0.3943260908126831, "sft_loss": 0.2984180152416229, "step": 1437 }, { "epoch": 2.0795372378886476, "grad_norm": 2.058087744744759, "learning_rate": 6.028621920434938e-06, "logits/chosen": -0.18899688124656677, "logits/rejected": -0.17238955199718475, "logps/chosen": -0.4569585621356964, "logps/rejected": -2.871255397796631, "loss": 0.3386, "odds_ratio_loss": 0.1777493953704834, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04569585993885994, "rewards/margins": 0.24142968654632568, "rewards/rejected": -0.28712552785873413, "sft_loss": 0.4569585621356964, "step": 1438 }, { "epoch": 2.0809833694866233, "grad_norm": 2.7713822465891975, "learning_rate": 6.025944496497313e-06, "logits/chosen": -0.2562764286994934, "logits/rejected": -0.2689245939254761, "logps/chosen": -0.3553205728530884, "logps/rejected": -3.7182862758636475, "loss": 0.3025, "odds_ratio_loss": 0.16768378019332886, "rewards/accuracies": 1.0, "rewards/chosen": -0.03553205728530884, "rewards/margins": 0.33629655838012695, "rewards/rejected": -0.3718286156654358, "sft_loss": 0.3553205728530884, "step": 1439 }, { "epoch": 2.0824295010845986, "grad_norm": 2.104472864919027, "learning_rate": 6.023265851110206e-06, "logits/chosen": -0.4192018210887909, "logits/rejected": -0.4185807406902313, "logps/chosen": -0.29165175557136536, "logps/rejected": -3.254685878753662, "loss": 0.3423, "odds_ratio_loss": 0.07294730842113495, "rewards/accuracies": 1.0, "rewards/chosen": -0.029165174812078476, "rewards/margins": 0.2963034212589264, "rewards/rejected": -0.32546859979629517, "sft_loss": 0.29165175557136536, "step": 1440 }, { "epoch": 2.0838756326825743, "grad_norm": 3.2957542835935714, "learning_rate": 6.0205859858885815e-06, "logits/chosen": -0.38088786602020264, "logits/rejected": -0.3934791088104248, "logps/chosen": -0.3900391757488251, "logps/rejected": -3.510143756866455, "loss": 0.3766, "odds_ratio_loss": 0.16783195734024048, "rewards/accuracies": 0.9375, "rewards/chosen": -0.039003919810056686, "rewards/margins": 0.31201043725013733, "rewards/rejected": -0.3510143458843231, "sft_loss": 0.3900391757488251, "step": 1441 }, { "epoch": 2.0853217642805495, "grad_norm": 2.578364476031665, "learning_rate": 6.01790490244814e-06, "logits/chosen": -0.4493975043296814, "logits/rejected": -0.4192134737968445, "logps/chosen": -0.5234689712524414, "logps/rejected": -4.9513840675354, "loss": 0.4525, "odds_ratio_loss": 0.19005721807479858, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05234689638018608, "rewards/margins": 0.44279155135154724, "rewards/rejected": -0.4951384365558624, "sft_loss": 0.5234689712524414, "step": 1442 }, { "epoch": 2.0867678958785247, "grad_norm": 2.396996991834125, "learning_rate": 6.015222602405318e-06, "logits/chosen": -0.337810754776001, "logits/rejected": -0.22689113020896912, "logps/chosen": -0.2907380759716034, "logps/rejected": -3.4063007831573486, "loss": 0.3455, "odds_ratio_loss": 0.12362713366746902, "rewards/accuracies": 1.0, "rewards/chosen": -0.0290738046169281, "rewards/margins": 0.31155624985694885, "rewards/rejected": -0.34063005447387695, "sft_loss": 0.2907380759716034, "step": 1443 }, { "epoch": 2.0882140274765004, "grad_norm": 2.5822318052964803, "learning_rate": 6.012539087377284e-06, "logits/chosen": -0.3404845595359802, "logits/rejected": -0.3742299973964691, "logps/chosen": -0.4546850323677063, "logps/rejected": -2.706575870513916, "loss": 0.4444, "odds_ratio_loss": 0.17479830980300903, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04546850547194481, "rewards/margins": 0.22518908977508545, "rewards/rejected": -0.27065759897232056, "sft_loss": 0.4546850323677063, "step": 1444 }, { "epoch": 2.0896601590744757, "grad_norm": 2.504565887985123, "learning_rate": 6.009854358981938e-06, "logits/chosen": -0.27679046988487244, "logits/rejected": -0.3251495361328125, "logps/chosen": -0.38980117440223694, "logps/rejected": -3.832094192504883, "loss": 0.3848, "odds_ratio_loss": 0.14260970056056976, "rewards/accuracies": 1.0, "rewards/chosen": -0.03898011893033981, "rewards/margins": 0.3442293107509613, "rewards/rejected": -0.3832094073295593, "sft_loss": 0.38980117440223694, "step": 1445 }, { "epoch": 2.0911062906724514, "grad_norm": 2.287041085168791, "learning_rate": 6.007168418837913e-06, "logits/chosen": -0.342728853225708, "logits/rejected": -0.27378779649734497, "logps/chosen": -0.31904324889183044, "logps/rejected": -4.140244007110596, "loss": 0.3465, "odds_ratio_loss": 0.11946651339530945, "rewards/accuracies": 1.0, "rewards/chosen": -0.03190432861447334, "rewards/margins": 0.38212013244628906, "rewards/rejected": -0.4140244126319885, "sft_loss": 0.31904324889183044, "step": 1446 }, { "epoch": 2.0925524222704266, "grad_norm": 2.4471257161139164, "learning_rate": 6.004481268564573e-06, "logits/chosen": -0.32866159081459045, "logits/rejected": -0.27318888902664185, "logps/chosen": -0.34691765904426575, "logps/rejected": -4.629284858703613, "loss": 0.2858, "odds_ratio_loss": 0.07531234622001648, "rewards/accuracies": 1.0, "rewards/chosen": -0.034691765904426575, "rewards/margins": 0.428236722946167, "rewards/rejected": -0.4629284739494324, "sft_loss": 0.34691765904426575, "step": 1447 }, { "epoch": 2.093998553868402, "grad_norm": 2.5341871999397068, "learning_rate": 6.001792909782012e-06, "logits/chosen": -0.17833292484283447, "logits/rejected": -0.31797996163368225, "logps/chosen": -0.4669630229473114, "logps/rejected": -2.347073793411255, "loss": 0.4726, "odds_ratio_loss": 0.23080308735370636, "rewards/accuracies": 0.875, "rewards/chosen": -0.04669630154967308, "rewards/margins": 0.18801110982894897, "rewards/rejected": -0.23470738530158997, "sft_loss": 0.4669630229473114, "step": 1448 }, { "epoch": 2.0954446854663775, "grad_norm": 2.0149281802026104, "learning_rate": 5.999103344111049e-06, "logits/chosen": -0.282225638628006, "logits/rejected": -0.33347731828689575, "logps/chosen": -0.3228495419025421, "logps/rejected": -4.6756062507629395, "loss": 0.312, "odds_ratio_loss": 0.15524053573608398, "rewards/accuracies": 1.0, "rewards/chosen": -0.03228495270013809, "rewards/margins": 0.4352756440639496, "rewards/rejected": -0.4675605893135071, "sft_loss": 0.3228495419025421, "step": 1449 }, { "epoch": 2.0968908170643528, "grad_norm": 2.4196352705276296, "learning_rate": 5.996412573173233e-06, "logits/chosen": -0.23691165447235107, "logits/rejected": -0.2045428454875946, "logps/chosen": -0.3596211075782776, "logps/rejected": -3.006347179412842, "loss": 0.4048, "odds_ratio_loss": 0.17852452397346497, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03596211224794388, "rewards/margins": 0.2646726071834564, "rewards/rejected": -0.3006346821784973, "sft_loss": 0.3596211075782776, "step": 1450 }, { "epoch": 2.0983369486623284, "grad_norm": 4.531344093999202, "learning_rate": 5.993720598590844e-06, "logits/chosen": -0.40281057357788086, "logits/rejected": -0.30446839332580566, "logps/chosen": -0.3508280813694, "logps/rejected": -1.9880702495574951, "loss": 0.3769, "odds_ratio_loss": 0.14897975325584412, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03508280962705612, "rewards/margins": 0.16372422873973846, "rewards/rejected": -0.198807030916214, "sft_loss": 0.3508280813694, "step": 1451 }, { "epoch": 2.0997830802603037, "grad_norm": 3.298538569192066, "learning_rate": 5.99102742198688e-06, "logits/chosen": -0.3555454909801483, "logits/rejected": -0.2171410322189331, "logps/chosen": -0.2860006093978882, "logps/rejected": -3.838912010192871, "loss": 0.4106, "odds_ratio_loss": 0.12151093780994415, "rewards/accuracies": 0.9375, "rewards/chosen": -0.028600063174962997, "rewards/margins": 0.3552911877632141, "rewards/rejected": -0.383891224861145, "sft_loss": 0.2860006093978882, "step": 1452 }, { "epoch": 2.101229211858279, "grad_norm": 3.7818647661180673, "learning_rate": 5.988333044985067e-06, "logits/chosen": -0.2495804727077484, "logits/rejected": -0.17438268661499023, "logps/chosen": -0.24221912026405334, "logps/rejected": -2.4278950691223145, "loss": 0.304, "odds_ratio_loss": 0.12932734191417694, "rewards/accuracies": 1.0, "rewards/chosen": -0.024221913889050484, "rewards/margins": 0.21856757998466492, "rewards/rejected": -0.24278950691223145, "sft_loss": 0.24221912026405334, "step": 1453 }, { "epoch": 2.1026753434562546, "grad_norm": 3.043471182443482, "learning_rate": 5.985637469209855e-06, "logits/chosen": -0.2824966609477997, "logits/rejected": -0.23998260498046875, "logps/chosen": -0.22306549549102783, "logps/rejected": -3.71874737739563, "loss": 0.3302, "odds_ratio_loss": 0.0718691423535347, "rewards/accuracies": 1.0, "rewards/chosen": -0.022306548431515694, "rewards/margins": 0.3495681881904602, "rewards/rejected": -0.37187474966049194, "sft_loss": 0.22306549549102783, "step": 1454 }, { "epoch": 2.10412147505423, "grad_norm": 2.343444348205682, "learning_rate": 5.98294069628642e-06, "logits/chosen": -0.3612365126609802, "logits/rejected": -0.3207022547721863, "logps/chosen": -0.4752647578716278, "logps/rejected": -2.5952839851379395, "loss": 0.4237, "odds_ratio_loss": 0.17657390236854553, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04752647876739502, "rewards/margins": 0.21200190484523773, "rewards/rejected": -0.25952839851379395, "sft_loss": 0.4752647578716278, "step": 1455 }, { "epoch": 2.1055676066522055, "grad_norm": 2.4742086731738633, "learning_rate": 5.980242727840653e-06, "logits/chosen": -0.5590451955795288, "logits/rejected": -0.43150633573532104, "logps/chosen": -0.3560371696949005, "logps/rejected": -2.1207826137542725, "loss": 0.3667, "odds_ratio_loss": 0.12519478797912598, "rewards/accuracies": 1.0, "rewards/chosen": -0.03560371696949005, "rewards/margins": 0.17647455632686615, "rewards/rejected": -0.2120782434940338, "sft_loss": 0.3560371696949005, "step": 1456 }, { "epoch": 2.107013738250181, "grad_norm": 3.0913457397602664, "learning_rate": 5.9775435654991695e-06, "logits/chosen": -0.321668416261673, "logits/rejected": -0.2340753823518753, "logps/chosen": -0.3525116443634033, "logps/rejected": -1.9900145530700684, "loss": 0.3869, "odds_ratio_loss": 0.17652732133865356, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03525116667151451, "rewards/margins": 0.1637502759695053, "rewards/rejected": -0.19900144636631012, "sft_loss": 0.3525116443634033, "step": 1457 }, { "epoch": 2.108459869848156, "grad_norm": 2.3776487240970665, "learning_rate": 5.974843210889306e-06, "logits/chosen": -0.3362390398979187, "logits/rejected": -0.26001688838005066, "logps/chosen": -0.3976631462574005, "logps/rejected": -3.968487024307251, "loss": 0.3903, "odds_ratio_loss": 0.1420903205871582, "rewards/accuracies": 1.0, "rewards/chosen": -0.03976631909608841, "rewards/margins": 0.3570823669433594, "rewards/rejected": -0.3968486487865448, "sft_loss": 0.3976631462574005, "step": 1458 }, { "epoch": 2.1099060014461317, "grad_norm": 2.1848053409122445, "learning_rate": 5.972141665639116e-06, "logits/chosen": -0.35201942920684814, "logits/rejected": -0.48823869228363037, "logps/chosen": -0.3853376507759094, "logps/rejected": -2.537548542022705, "loss": 0.3747, "odds_ratio_loss": 0.12692534923553467, "rewards/accuracies": 1.0, "rewards/chosen": -0.038533765822649, "rewards/margins": 0.2152210772037506, "rewards/rejected": -0.2537548542022705, "sft_loss": 0.3853376507759094, "step": 1459 }, { "epoch": 2.111352133044107, "grad_norm": 2.33061695960309, "learning_rate": 5.969438931377368e-06, "logits/chosen": -0.3261950612068176, "logits/rejected": -0.22967484593391418, "logps/chosen": -0.28880342841148376, "logps/rejected": -2.5488944053649902, "loss": 0.3145, "odds_ratio_loss": 0.11012070626020432, "rewards/accuracies": 1.0, "rewards/chosen": -0.028880342841148376, "rewards/margins": 0.2260090857744217, "rewards/rejected": -0.25488942861557007, "sft_loss": 0.28880342841148376, "step": 1460 }, { "epoch": 2.112798264642082, "grad_norm": 2.415827008373336, "learning_rate": 5.966735009733555e-06, "logits/chosen": -0.15220189094543457, "logits/rejected": -0.09951989352703094, "logps/chosen": -0.2214992344379425, "logps/rejected": -3.8833041191101074, "loss": 0.3814, "odds_ratio_loss": 0.15547379851341248, "rewards/accuracies": 0.9375, "rewards/chosen": -0.02214992418885231, "rewards/margins": 0.3661804795265198, "rewards/rejected": -0.3883304297924042, "sft_loss": 0.2214992344379425, "step": 1461 }, { "epoch": 2.114244396240058, "grad_norm": 2.60807066935678, "learning_rate": 5.964029902337881e-06, "logits/chosen": -0.2591196298599243, "logits/rejected": -0.23985594511032104, "logps/chosen": -0.3921408951282501, "logps/rejected": -1.9706977605819702, "loss": 0.4367, "odds_ratio_loss": 0.20559149980545044, "rewards/accuracies": 1.0, "rewards/chosen": -0.03921408951282501, "rewards/margins": 0.15785571932792664, "rewards/rejected": -0.19706979393959045, "sft_loss": 0.3921408951282501, "step": 1462 }, { "epoch": 2.115690527838033, "grad_norm": 2.414112330767816, "learning_rate": 5.961323610821263e-06, "logits/chosen": -0.4906052350997925, "logits/rejected": -0.37421441078186035, "logps/chosen": -0.33694005012512207, "logps/rejected": -2.2455477714538574, "loss": 0.3451, "odds_ratio_loss": 0.12408454716205597, "rewards/accuracies": 1.0, "rewards/chosen": -0.033694006502628326, "rewards/margins": 0.19086073338985443, "rewards/rejected": -0.22455476224422455, "sft_loss": 0.33694005012512207, "step": 1463 }, { "epoch": 2.117136659436009, "grad_norm": 2.341502677921619, "learning_rate": 5.9586161368153345e-06, "logits/chosen": -0.23700487613677979, "logits/rejected": -0.24998445808887482, "logps/chosen": -0.32917293906211853, "logps/rejected": -2.9448232650756836, "loss": 0.338, "odds_ratio_loss": 0.17031408846378326, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03291729465126991, "rewards/margins": 0.26156502962112427, "rewards/rejected": -0.2944823205471039, "sft_loss": 0.32917293906211853, "step": 1464 }, { "epoch": 2.118582791033984, "grad_norm": 2.8595164304602863, "learning_rate": 5.955907481952444e-06, "logits/chosen": -0.3763851523399353, "logits/rejected": -0.27793562412261963, "logps/chosen": -0.49938255548477173, "logps/rejected": -4.238722324371338, "loss": 0.3731, "odds_ratio_loss": 0.14284662902355194, "rewards/accuracies": 1.0, "rewards/chosen": -0.04993825778365135, "rewards/margins": 0.37393397092819214, "rewards/rejected": -0.4238722324371338, "sft_loss": 0.49938255548477173, "step": 1465 }, { "epoch": 2.1200289226319593, "grad_norm": 5.9339196597381045, "learning_rate": 5.953197647865646e-06, "logits/chosen": -0.30029064416885376, "logits/rejected": -0.2469271421432495, "logps/chosen": -0.2845560908317566, "logps/rejected": -5.233601093292236, "loss": 0.3089, "odds_ratio_loss": 0.06689879298210144, "rewards/accuracies": 1.0, "rewards/chosen": -0.028455613180994987, "rewards/margins": 0.494904488325119, "rewards/rejected": -0.5233600735664368, "sft_loss": 0.2845560908317566, "step": 1466 }, { "epoch": 2.121475054229935, "grad_norm": 5.434321085409511, "learning_rate": 5.950486636188713e-06, "logits/chosen": -0.36320340633392334, "logits/rejected": -0.23057013750076294, "logps/chosen": -0.3520301580429077, "logps/rejected": -4.251169204711914, "loss": 0.353, "odds_ratio_loss": 0.1139448881149292, "rewards/accuracies": 1.0, "rewards/chosen": -0.03520301356911659, "rewards/margins": 0.3899139165878296, "rewards/rejected": -0.4251168966293335, "sft_loss": 0.3520301580429077, "step": 1467 }, { "epoch": 2.1229211858279102, "grad_norm": 2.4788779481832988, "learning_rate": 5.947774448556123e-06, "logits/chosen": -0.2565758526325226, "logits/rejected": -0.16842395067214966, "logps/chosen": -0.3628906011581421, "logps/rejected": -3.6560139656066895, "loss": 0.3845, "odds_ratio_loss": 0.1105215921998024, "rewards/accuracies": 1.0, "rewards/chosen": -0.03628905862569809, "rewards/margins": 0.3293122947216034, "rewards/rejected": -0.3656013607978821, "sft_loss": 0.3628906011581421, "step": 1468 }, { "epoch": 2.124367317425886, "grad_norm": 2.086746606792482, "learning_rate": 5.9450610866030635e-06, "logits/chosen": -0.24368371069431305, "logits/rejected": -0.16405165195465088, "logps/chosen": -0.41052013635635376, "logps/rejected": -3.302800416946411, "loss": 0.3025, "odds_ratio_loss": 0.16078418493270874, "rewards/accuracies": 1.0, "rewards/chosen": -0.041052013635635376, "rewards/margins": 0.28922802209854126, "rewards/rejected": -0.330280065536499, "sft_loss": 0.41052013635635376, "step": 1469 }, { "epoch": 2.125813449023861, "grad_norm": 3.696865328122287, "learning_rate": 5.94234655196543e-06, "logits/chosen": -0.3694218397140503, "logits/rejected": -0.20579954981803894, "logps/chosen": -0.2565337121486664, "logps/rejected": -3.989961624145508, "loss": 0.3041, "odds_ratio_loss": 0.09575256705284119, "rewards/accuracies": 1.0, "rewards/chosen": -0.02565336972475052, "rewards/margins": 0.3733427822589874, "rewards/rejected": -0.39899617433547974, "sft_loss": 0.2565337121486664, "step": 1470 }, { "epoch": 2.1272595806218364, "grad_norm": 2.1564596951388095, "learning_rate": 5.939630846279828e-06, "logits/chosen": -0.40536144375801086, "logits/rejected": -0.37199535965919495, "logps/chosen": -0.379259318113327, "logps/rejected": -4.299976825714111, "loss": 0.3549, "odds_ratio_loss": 0.14220932126045227, "rewards/accuracies": 1.0, "rewards/chosen": -0.03792593628168106, "rewards/margins": 0.3920717239379883, "rewards/rejected": -0.42999768257141113, "sft_loss": 0.379259318113327, "step": 1471 }, { "epoch": 2.128705712219812, "grad_norm": 2.2649955437637272, "learning_rate": 5.936913971183562e-06, "logits/chosen": -0.29743102192878723, "logits/rejected": -0.19528326392173767, "logps/chosen": -0.336579829454422, "logps/rejected": -4.535792350769043, "loss": 0.3418, "odds_ratio_loss": 0.11933722347021103, "rewards/accuracies": 1.0, "rewards/chosen": -0.0336579829454422, "rewards/margins": 0.41992127895355225, "rewards/rejected": -0.45357927680015564, "sft_loss": 0.336579829454422, "step": 1472 }, { "epoch": 2.1301518438177873, "grad_norm": 2.1566457466271616, "learning_rate": 5.93419592831465e-06, "logits/chosen": -0.24219074845314026, "logits/rejected": -0.10176640748977661, "logps/chosen": -0.296125590801239, "logps/rejected": -3.7083725929260254, "loss": 0.3531, "odds_ratio_loss": 0.14927887916564941, "rewards/accuracies": 1.0, "rewards/chosen": -0.02961255982518196, "rewards/margins": 0.341224730014801, "rewards/rejected": -0.3708372712135315, "sft_loss": 0.296125590801239, "step": 1473 }, { "epoch": 2.131597975415763, "grad_norm": 2.641384013878491, "learning_rate": 5.9314767193118104e-06, "logits/chosen": -0.28196150064468384, "logits/rejected": -0.15774741768836975, "logps/chosen": -0.3431814908981323, "logps/rejected": -3.5857958793640137, "loss": 0.3147, "odds_ratio_loss": 0.1359584480524063, "rewards/accuracies": 1.0, "rewards/chosen": -0.03431814908981323, "rewards/margins": 0.3242614269256592, "rewards/rejected": -0.3585796058177948, "sft_loss": 0.3431814908981323, "step": 1474 }, { "epoch": 2.1330441070137383, "grad_norm": 2.1937742000667018, "learning_rate": 5.928756345814462e-06, "logits/chosen": -0.15431058406829834, "logits/rejected": -0.1452016532421112, "logps/chosen": -0.38092872500419617, "logps/rejected": -3.261220932006836, "loss": 0.3528, "odds_ratio_loss": 0.1584779918193817, "rewards/accuracies": 0.9375, "rewards/chosen": -0.038092873990535736, "rewards/margins": 0.2880292236804962, "rewards/rejected": -0.32612210512161255, "sft_loss": 0.38092872500419617, "step": 1475 }, { "epoch": 2.1344902386117135, "grad_norm": 2.7322141667155297, "learning_rate": 5.926034809462729e-06, "logits/chosen": -0.2722333073616028, "logits/rejected": -0.28116655349731445, "logps/chosen": -0.3736817240715027, "logps/rejected": -3.1287436485290527, "loss": 0.3954, "odds_ratio_loss": 0.16877394914627075, "rewards/accuracies": 1.0, "rewards/chosen": -0.03736817464232445, "rewards/margins": 0.2755061984062195, "rewards/rejected": -0.31287437677383423, "sft_loss": 0.3736817240715027, "step": 1476 }, { "epoch": 2.135936370209689, "grad_norm": 2.3384100846293454, "learning_rate": 5.923312111897437e-06, "logits/chosen": -0.20643384754657745, "logits/rejected": -0.19397512078285217, "logps/chosen": -0.3226177394390106, "logps/rejected": -3.1748647689819336, "loss": 0.3832, "odds_ratio_loss": 0.13731184601783752, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03226177394390106, "rewards/margins": 0.2852247357368469, "rewards/rejected": -0.3174864947795868, "sft_loss": 0.3226177394390106, "step": 1477 }, { "epoch": 2.1373825018076644, "grad_norm": 2.691733868567432, "learning_rate": 5.92058825476011e-06, "logits/chosen": -0.3043122887611389, "logits/rejected": -0.2385656237602234, "logps/chosen": -0.39572885632514954, "logps/rejected": -2.609856128692627, "loss": 0.3954, "odds_ratio_loss": 0.11852793395519257, "rewards/accuracies": 0.9375, "rewards/chosen": -0.039572883397340775, "rewards/margins": 0.2214127480983734, "rewards/rejected": -0.2609856128692627, "sft_loss": 0.39572885632514954, "step": 1478 }, { "epoch": 2.13882863340564, "grad_norm": 3.5489740621337864, "learning_rate": 5.917863239692969e-06, "logits/chosen": -0.20991955697536469, "logits/rejected": -0.19747528433799744, "logps/chosen": -0.34138545393943787, "logps/rejected": -3.09604549407959, "loss": 0.4404, "odds_ratio_loss": 0.13940584659576416, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03413854539394379, "rewards/margins": 0.27546602487564087, "rewards/rejected": -0.30960455536842346, "sft_loss": 0.34138545393943787, "step": 1479 }, { "epoch": 2.1402747650036154, "grad_norm": 2.1460462624407914, "learning_rate": 5.91513706833894e-06, "logits/chosen": -0.24288667738437653, "logits/rejected": -0.15092161297798157, "logps/chosen": -0.2745353877544403, "logps/rejected": -2.8250560760498047, "loss": 0.3642, "odds_ratio_loss": 0.09830009937286377, "rewards/accuracies": 1.0, "rewards/chosen": -0.02745353803038597, "rewards/margins": 0.25505203008651733, "rewards/rejected": -0.2825055718421936, "sft_loss": 0.2745353877544403, "step": 1480 }, { "epoch": 2.1417208966015906, "grad_norm": 2.89276313645248, "learning_rate": 5.912409742341639e-06, "logits/chosen": -0.2257198840379715, "logits/rejected": -0.28929704427719116, "logps/chosen": -0.5430160760879517, "logps/rejected": -2.884078025817871, "loss": 0.3624, "odds_ratio_loss": 0.18084710836410522, "rewards/accuracies": 1.0, "rewards/chosen": -0.05430160462856293, "rewards/margins": 0.23410621285438538, "rewards/rejected": -0.2884078323841095, "sft_loss": 0.5430160760879517, "step": 1481 }, { "epoch": 2.1431670281995663, "grad_norm": 2.12580286870257, "learning_rate": 5.909681263345382e-06, "logits/chosen": -0.26066428422927856, "logits/rejected": -0.2458367943763733, "logps/chosen": -0.46075600385665894, "logps/rejected": -2.822026252746582, "loss": 0.4077, "odds_ratio_loss": 0.18911300599575043, "rewards/accuracies": 1.0, "rewards/chosen": -0.04607560113072395, "rewards/margins": 0.23612701892852783, "rewards/rejected": -0.2822026312351227, "sft_loss": 0.46075600385665894, "step": 1482 }, { "epoch": 2.1446131597975415, "grad_norm": 2.339258086988403, "learning_rate": 5.906951632995179e-06, "logits/chosen": -0.22002863883972168, "logits/rejected": -0.16895297169685364, "logps/chosen": -0.39282089471817017, "logps/rejected": -4.94498872756958, "loss": 0.3477, "odds_ratio_loss": 0.11626096069812775, "rewards/accuracies": 1.0, "rewards/chosen": -0.039282094687223434, "rewards/margins": 0.45521676540374756, "rewards/rejected": -0.4944988489151001, "sft_loss": 0.39282089471817017, "step": 1483 }, { "epoch": 2.1460592913955168, "grad_norm": 2.4373005337957516, "learning_rate": 5.904220852936733e-06, "logits/chosen": -0.253675639629364, "logits/rejected": -0.33716779947280884, "logps/chosen": -0.2549727261066437, "logps/rejected": -3.052334785461426, "loss": 0.2846, "odds_ratio_loss": 0.13065417110919952, "rewards/accuracies": 1.0, "rewards/chosen": -0.02549727074801922, "rewards/margins": 0.2797362208366394, "rewards/rejected": -0.3052334785461426, "sft_loss": 0.2549727261066437, "step": 1484 }, { "epoch": 2.1475054229934925, "grad_norm": 3.249695223924806, "learning_rate": 5.901488924816444e-06, "logits/chosen": -0.3323328495025635, "logits/rejected": -0.2721770405769348, "logps/chosen": -0.4279077351093292, "logps/rejected": -2.4622364044189453, "loss": 0.4213, "odds_ratio_loss": 0.1369471549987793, "rewards/accuracies": 1.0, "rewards/chosen": -0.04279077425599098, "rewards/margins": 0.2034328728914261, "rewards/rejected": -0.24622364342212677, "sft_loss": 0.4279077351093292, "step": 1485 }, { "epoch": 2.1489515545914677, "grad_norm": 2.2438085957378293, "learning_rate": 5.8987558502814e-06, "logits/chosen": -0.3333353102207184, "logits/rejected": -0.27355486154556274, "logps/chosen": -0.482401043176651, "logps/rejected": -3.146805763244629, "loss": 0.4227, "odds_ratio_loss": 0.19827474653720856, "rewards/accuracies": 1.0, "rewards/chosen": -0.04824010655283928, "rewards/margins": 0.2664404511451721, "rewards/rejected": -0.3146805763244629, "sft_loss": 0.482401043176651, "step": 1486 }, { "epoch": 2.1503976861894434, "grad_norm": 2.104417928949804, "learning_rate": 5.896021630979382e-06, "logits/chosen": -0.35297325253486633, "logits/rejected": -0.2755223512649536, "logps/chosen": -0.3497896194458008, "logps/rejected": -2.348954439163208, "loss": 0.3253, "odds_ratio_loss": 0.1740923374891281, "rewards/accuracies": 1.0, "rewards/chosen": -0.0349789634346962, "rewards/margins": 0.19991648197174072, "rewards/rejected": -0.23489545285701752, "sft_loss": 0.3497896194458008, "step": 1487 }, { "epoch": 2.1518438177874186, "grad_norm": 2.600438723649929, "learning_rate": 5.89328626855886e-06, "logits/chosen": -0.3274020850658417, "logits/rejected": -0.19287540018558502, "logps/chosen": -0.2810543477535248, "logps/rejected": -3.3650529384613037, "loss": 0.3803, "odds_ratio_loss": 0.06862045079469681, "rewards/accuracies": 1.0, "rewards/chosen": -0.02810543403029442, "rewards/margins": 0.30839988589286804, "rewards/rejected": -0.33650532364845276, "sft_loss": 0.2810543477535248, "step": 1488 }, { "epoch": 2.153289949385394, "grad_norm": 2.4433889180529382, "learning_rate": 5.890549764668996e-06, "logits/chosen": -0.1400214284658432, "logits/rejected": -0.08871597051620483, "logps/chosen": -0.2968231737613678, "logps/rejected": -2.3680202960968018, "loss": 0.2994, "odds_ratio_loss": 0.1344771385192871, "rewards/accuracies": 1.0, "rewards/chosen": -0.02968231588602066, "rewards/margins": 0.20711973309516907, "rewards/rejected": -0.23680204153060913, "sft_loss": 0.2968231737613678, "step": 1489 }, { "epoch": 2.1547360809833696, "grad_norm": 2.2686277694593144, "learning_rate": 5.88781212095964e-06, "logits/chosen": -0.4212700426578522, "logits/rejected": -0.21845176815986633, "logps/chosen": -0.2443973571062088, "logps/rejected": -5.354547500610352, "loss": 0.3728, "odds_ratio_loss": 0.08256202936172485, "rewards/accuracies": 1.0, "rewards/chosen": -0.02443973906338215, "rewards/margins": 0.5110150575637817, "rewards/rejected": -0.5354547500610352, "sft_loss": 0.2443973571062088, "step": 1490 }, { "epoch": 2.156182212581345, "grad_norm": 2.32926426146957, "learning_rate": 5.885073339081323e-06, "logits/chosen": -0.28272032737731934, "logits/rejected": -0.24804599583148956, "logps/chosen": -0.3671835660934448, "logps/rejected": -3.007756233215332, "loss": 0.3437, "odds_ratio_loss": 0.12326133251190186, "rewards/accuracies": 1.0, "rewards/chosen": -0.03671835735440254, "rewards/margins": 0.2640572488307953, "rewards/rejected": -0.3007756173610687, "sft_loss": 0.3671835660934448, "step": 1491 }, { "epoch": 2.1576283441793205, "grad_norm": 3.877286822110988, "learning_rate": 5.882333420685269e-06, "logits/chosen": -0.5978209376335144, "logits/rejected": -0.43089497089385986, "logps/chosen": -0.2918180227279663, "logps/rejected": -3.2058868408203125, "loss": 0.3835, "odds_ratio_loss": 0.0905960351228714, "rewards/accuracies": 1.0, "rewards/chosen": -0.02918180450797081, "rewards/margins": 0.29140689969062805, "rewards/rejected": -0.32058870792388916, "sft_loss": 0.2918180227279663, "step": 1492 }, { "epoch": 2.1590744757772957, "grad_norm": 2.453498067923628, "learning_rate": 5.879592367423386e-06, "logits/chosen": -0.2288958728313446, "logits/rejected": -0.15676091611385345, "logps/chosen": -0.4409548044204712, "logps/rejected": -2.5945446491241455, "loss": 0.4106, "odds_ratio_loss": 0.132335364818573, "rewards/accuracies": 1.0, "rewards/chosen": -0.044095478951931, "rewards/margins": 0.21535900235176086, "rewards/rejected": -0.25945448875427246, "sft_loss": 0.4409548044204712, "step": 1493 }, { "epoch": 2.160520607375271, "grad_norm": 2.689912629165323, "learning_rate": 5.876850180948265e-06, "logits/chosen": -0.30421751737594604, "logits/rejected": -0.23876774311065674, "logps/chosen": -0.3396947979927063, "logps/rejected": -3.2614731788635254, "loss": 0.405, "odds_ratio_loss": 0.12139807641506195, "rewards/accuracies": 1.0, "rewards/chosen": -0.03396947681903839, "rewards/margins": 0.29217785596847534, "rewards/rejected": -0.32614731788635254, "sft_loss": 0.3396947979927063, "step": 1494 }, { "epoch": 2.1619667389732466, "grad_norm": 2.4501357659326675, "learning_rate": 5.8741068629131785e-06, "logits/chosen": -0.18467435240745544, "logits/rejected": -0.16658593714237213, "logps/chosen": -0.29923176765441895, "logps/rejected": -6.522655487060547, "loss": 0.3052, "odds_ratio_loss": 0.10115586221218109, "rewards/accuracies": 1.0, "rewards/chosen": -0.029923178255558014, "rewards/margins": 0.6223424077033997, "rewards/rejected": -0.6522656083106995, "sft_loss": 0.29923176765441895, "step": 1495 }, { "epoch": 2.163412870571222, "grad_norm": 2.286318868410751, "learning_rate": 5.871362414972084e-06, "logits/chosen": -0.1483822911977768, "logits/rejected": -0.19763171672821045, "logps/chosen": -0.3210403323173523, "logps/rejected": -2.6171536445617676, "loss": 0.3945, "odds_ratio_loss": 0.14660733938217163, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03210403397679329, "rewards/margins": 0.229611337184906, "rewards/rejected": -0.2617153525352478, "sft_loss": 0.3210403323173523, "step": 1496 }, { "epoch": 2.1648590021691976, "grad_norm": 2.561828911466251, "learning_rate": 5.8686168387796205e-06, "logits/chosen": -0.2733272314071655, "logits/rejected": -0.2274172306060791, "logps/chosen": -0.33266696333885193, "logps/rejected": -2.2120425701141357, "loss": 0.4399, "odds_ratio_loss": 0.178709477186203, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03326670080423355, "rewards/margins": 0.18793755769729614, "rewards/rejected": -0.2212042361497879, "sft_loss": 0.33266696333885193, "step": 1497 }, { "epoch": 2.166305133767173, "grad_norm": 2.9375646923284338, "learning_rate": 5.865870135991107e-06, "logits/chosen": -0.30630001425743103, "logits/rejected": -0.19453690946102142, "logps/chosen": -0.21391445398330688, "logps/rejected": -2.3469486236572266, "loss": 0.3543, "odds_ratio_loss": 0.10182206332683563, "rewards/accuracies": 1.0, "rewards/chosen": -0.02139144390821457, "rewards/margins": 0.21330343186855316, "rewards/rejected": -0.23469488322734833, "sft_loss": 0.21391445398330688, "step": 1498 }, { "epoch": 2.167751265365148, "grad_norm": 2.2293176746225645, "learning_rate": 5.863122308262538e-06, "logits/chosen": -0.36253878474235535, "logits/rejected": -0.217836394906044, "logps/chosen": -0.28018245100975037, "logps/rejected": -3.587547779083252, "loss": 0.2916, "odds_ratio_loss": 0.07377097755670547, "rewards/accuracies": 1.0, "rewards/chosen": -0.028018245473504066, "rewards/margins": 0.33073654770851135, "rewards/rejected": -0.35875481367111206, "sft_loss": 0.28018245100975037, "step": 1499 }, { "epoch": 2.1691973969631237, "grad_norm": 2.1990139631710544, "learning_rate": 5.86037335725059e-06, "logits/chosen": -0.2960984408855438, "logits/rejected": -0.17636118829250336, "logps/chosen": -0.36348897218704224, "logps/rejected": -3.5222487449645996, "loss": 0.3755, "odds_ratio_loss": 0.06385881453752518, "rewards/accuracies": 1.0, "rewards/chosen": -0.036348894238471985, "rewards/margins": 0.31587597727775574, "rewards/rejected": -0.35222485661506653, "sft_loss": 0.36348897218704224, "step": 1500 }, { "epoch": 2.170643528561099, "grad_norm": 3.0269353766132534, "learning_rate": 5.857623284612616e-06, "logits/chosen": -0.3736283481121063, "logits/rejected": -0.3629686236381531, "logps/chosen": -0.3977740406990051, "logps/rejected": -3.777735710144043, "loss": 0.3153, "odds_ratio_loss": 0.112032949924469, "rewards/accuracies": 1.0, "rewards/chosen": -0.039777401834726334, "rewards/margins": 0.3379961848258972, "rewards/rejected": -0.37777355313301086, "sft_loss": 0.3977740406990051, "step": 1501 }, { "epoch": 2.1720896601590747, "grad_norm": 2.413695680791457, "learning_rate": 5.854872092006645e-06, "logits/chosen": -0.20464223623275757, "logits/rejected": -0.19365337491035461, "logps/chosen": -0.264778733253479, "logps/rejected": -2.7025864124298096, "loss": 0.2889, "odds_ratio_loss": 0.0847976952791214, "rewards/accuracies": 1.0, "rewards/chosen": -0.0264778733253479, "rewards/margins": 0.24378079175949097, "rewards/rejected": -0.27025866508483887, "sft_loss": 0.264778733253479, "step": 1502 }, { "epoch": 2.17353579175705, "grad_norm": 3.374590164095406, "learning_rate": 5.852119781091381e-06, "logits/chosen": -0.4547528922557831, "logits/rejected": -0.34812527894973755, "logps/chosen": -0.4338432252407074, "logps/rejected": -3.0329198837280273, "loss": 0.4761, "odds_ratio_loss": 0.12955176830291748, "rewards/accuracies": 1.0, "rewards/chosen": -0.04338432103395462, "rewards/margins": 0.25990769267082214, "rewards/rejected": -0.30329203605651855, "sft_loss": 0.4338432252407074, "step": 1503 }, { "epoch": 2.174981923355025, "grad_norm": 2.146790054606926, "learning_rate": 5.8493663535262045e-06, "logits/chosen": -0.18142646551132202, "logits/rejected": -0.1190670058131218, "logps/chosen": -0.23194721341133118, "logps/rejected": -3.571894645690918, "loss": 0.3787, "odds_ratio_loss": 0.09054180234670639, "rewards/accuracies": 1.0, "rewards/chosen": -0.023194722831249237, "rewards/margins": 0.3339947462081909, "rewards/rejected": -0.35718944668769836, "sft_loss": 0.23194721341133118, "step": 1504 }, { "epoch": 2.176428054953001, "grad_norm": 2.1354133121281076, "learning_rate": 5.846611810971166e-06, "logits/chosen": -0.32426026463508606, "logits/rejected": -0.24918986856937408, "logps/chosen": -0.3435249328613281, "logps/rejected": -3.3698172569274902, "loss": 0.3461, "odds_ratio_loss": 0.15128874778747559, "rewards/accuracies": 1.0, "rewards/chosen": -0.03435249626636505, "rewards/margins": 0.3026292324066162, "rewards/rejected": -0.33698174357414246, "sft_loss": 0.3435249328613281, "step": 1505 }, { "epoch": 2.177874186550976, "grad_norm": 2.7078861801848744, "learning_rate": 5.843856155086988e-06, "logits/chosen": 0.006320249754935503, "logits/rejected": -0.04288327321410179, "logps/chosen": -0.29730698466300964, "logps/rejected": -3.772261619567871, "loss": 0.2973, "odds_ratio_loss": 0.1492423266172409, "rewards/accuracies": 0.9375, "rewards/chosen": -0.029730698093771935, "rewards/margins": 0.347495436668396, "rewards/rejected": -0.3772261142730713, "sft_loss": 0.29730698466300964, "step": 1506 }, { "epoch": 2.1793203181489513, "grad_norm": 2.815754065971916, "learning_rate": 5.841099387535067e-06, "logits/chosen": -0.13005805015563965, "logits/rejected": -0.21556347608566284, "logps/chosen": -0.486311674118042, "logps/rejected": -3.861595869064331, "loss": 0.359, "odds_ratio_loss": 0.21694286167621613, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04863116517663002, "rewards/margins": 0.33752843737602234, "rewards/rejected": -0.38615962862968445, "sft_loss": 0.486311674118042, "step": 1507 }, { "epoch": 2.180766449746927, "grad_norm": 2.365644205544063, "learning_rate": 5.838341509977468e-06, "logits/chosen": -0.26615601778030396, "logits/rejected": -0.30764591693878174, "logps/chosen": -0.38783353567123413, "logps/rejected": -3.8921470642089844, "loss": 0.4536, "odds_ratio_loss": 0.13304699957370758, "rewards/accuracies": 1.0, "rewards/chosen": -0.03878335654735565, "rewards/margins": 0.3504313826560974, "rewards/rejected": -0.38921472430229187, "sft_loss": 0.38783353567123413, "step": 1508 }, { "epoch": 2.1822125813449023, "grad_norm": 2.3090179607820875, "learning_rate": 5.835582524076927e-06, "logits/chosen": -0.2245335578918457, "logits/rejected": -0.3578979969024658, "logps/chosen": -0.40855467319488525, "logps/rejected": -2.059465169906616, "loss": 0.4066, "odds_ratio_loss": 0.18744851648807526, "rewards/accuracies": 1.0, "rewards/chosen": -0.04085546359419823, "rewards/margins": 0.16509106755256653, "rewards/rejected": -0.20594651997089386, "sft_loss": 0.40855467319488525, "step": 1509 }, { "epoch": 2.183658712942878, "grad_norm": 2.794188680111159, "learning_rate": 5.832822431496845e-06, "logits/chosen": -0.265776127576828, "logits/rejected": -0.12581999599933624, "logps/chosen": -0.4239056706428528, "logps/rejected": -1.9903366565704346, "loss": 0.3896, "odds_ratio_loss": 0.1404733806848526, "rewards/accuracies": 1.0, "rewards/chosen": -0.042390573769807816, "rewards/margins": 0.1566431075334549, "rewards/rejected": -0.1990336775779724, "sft_loss": 0.4239056706428528, "step": 1510 }, { "epoch": 2.185104844540853, "grad_norm": 2.7143930939543486, "learning_rate": 5.830061233901293e-06, "logits/chosen": -0.29388314485549927, "logits/rejected": -0.31625548005104065, "logps/chosen": -0.39467012882232666, "logps/rejected": -2.1236424446105957, "loss": 0.3985, "odds_ratio_loss": 0.17664192616939545, "rewards/accuracies": 1.0, "rewards/chosen": -0.03946701064705849, "rewards/margins": 0.17289723455905914, "rewards/rejected": -0.21236424148082733, "sft_loss": 0.39467012882232666, "step": 1511 }, { "epoch": 2.1865509761388284, "grad_norm": 2.362745074271975, "learning_rate": 5.827298932955006e-06, "logits/chosen": -0.2649492025375366, "logits/rejected": -0.14799144864082336, "logps/chosen": -0.21747642755508423, "logps/rejected": -5.31502103805542, "loss": 0.3557, "odds_ratio_loss": 0.08317440748214722, "rewards/accuracies": 1.0, "rewards/chosen": -0.02174764685332775, "rewards/margins": 0.509754478931427, "rewards/rejected": -0.5315020680427551, "sft_loss": 0.21747642755508423, "step": 1512 }, { "epoch": 2.187997107736804, "grad_norm": 2.5515207334279997, "learning_rate": 5.8245355303233885e-06, "logits/chosen": -0.3026021718978882, "logits/rejected": -0.160127192735672, "logps/chosen": -0.29952093958854675, "logps/rejected": -3.455519199371338, "loss": 0.3361, "odds_ratio_loss": 0.07680322229862213, "rewards/accuracies": 1.0, "rewards/chosen": -0.029952093958854675, "rewards/margins": 0.31559985876083374, "rewards/rejected": -0.3455519378185272, "sft_loss": 0.29952093958854675, "step": 1513 }, { "epoch": 2.1894432393347794, "grad_norm": 2.312767228310274, "learning_rate": 5.8217710276725034e-06, "logits/chosen": -0.3605382740497589, "logits/rejected": -0.20827777683734894, "logps/chosen": -0.36337408423423767, "logps/rejected": -3.856226921081543, "loss": 0.3926, "odds_ratio_loss": 0.16154375672340393, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03633740544319153, "rewards/margins": 0.3492853045463562, "rewards/rejected": -0.38562270998954773, "sft_loss": 0.36337408423423767, "step": 1514 }, { "epoch": 2.190889370932755, "grad_norm": 2.5824996018885518, "learning_rate": 5.819005426669081e-06, "logits/chosen": -0.45703208446502686, "logits/rejected": -0.34461653232574463, "logps/chosen": -0.32233983278274536, "logps/rejected": -3.073678970336914, "loss": 0.3253, "odds_ratio_loss": 0.11809199303388596, "rewards/accuracies": 1.0, "rewards/chosen": -0.032233983278274536, "rewards/margins": 0.27513387799263, "rewards/rejected": -0.30736786127090454, "sft_loss": 0.32233983278274536, "step": 1515 }, { "epoch": 2.1923355025307303, "grad_norm": 2.8923982163571713, "learning_rate": 5.816238728980512e-06, "logits/chosen": -0.12480375170707703, "logits/rejected": -0.20347639918327332, "logps/chosen": -0.3857441842556, "logps/rejected": -4.365511894226074, "loss": 0.374, "odds_ratio_loss": 0.18619787693023682, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03857441619038582, "rewards/margins": 0.39797675609588623, "rewards/rejected": -0.43655121326446533, "sft_loss": 0.3857441842556, "step": 1516 }, { "epoch": 2.1937816341287055, "grad_norm": 2.3513904706926465, "learning_rate": 5.81347093627485e-06, "logits/chosen": -0.18540328741073608, "logits/rejected": -0.2682988941669464, "logps/chosen": -0.41352933645248413, "logps/rejected": -2.648174285888672, "loss": 0.3474, "odds_ratio_loss": 0.20547065138816833, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04135293513536453, "rewards/margins": 0.2234645038843155, "rewards/rejected": -0.26481741666793823, "sft_loss": 0.41352933645248413, "step": 1517 }, { "epoch": 2.195227765726681, "grad_norm": 2.3218155921073924, "learning_rate": 5.810702050220806e-06, "logits/chosen": -0.26080507040023804, "logits/rejected": -0.16710934042930603, "logps/chosen": -0.5044231414794922, "logps/rejected": -3.791574716567993, "loss": 0.3948, "odds_ratio_loss": 0.15905873477458954, "rewards/accuracies": 1.0, "rewards/chosen": -0.05044231563806534, "rewards/margins": 0.32871514558792114, "rewards/rejected": -0.3791574537754059, "sft_loss": 0.5044231414794922, "step": 1518 }, { "epoch": 2.1966738973246565, "grad_norm": 2.523759857827364, "learning_rate": 5.807932072487751e-06, "logits/chosen": -0.24406197667121887, "logits/rejected": -0.18871784210205078, "logps/chosen": -0.37185871601104736, "logps/rejected": -2.7142934799194336, "loss": 0.3622, "odds_ratio_loss": 0.21322374045848846, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03718587011098862, "rewards/margins": 0.2342434823513031, "rewards/rejected": -0.2714293599128723, "sft_loss": 0.37185871601104736, "step": 1519 }, { "epoch": 2.198120028922632, "grad_norm": 2.3677642125499387, "learning_rate": 5.805161004745716e-06, "logits/chosen": -0.29101336002349854, "logits/rejected": -0.3166297674179077, "logps/chosen": -0.4049059748649597, "logps/rejected": -2.3067679405212402, "loss": 0.4088, "odds_ratio_loss": 0.19290974736213684, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04049060121178627, "rewards/margins": 0.19018618762493134, "rewards/rejected": -0.2306768000125885, "sft_loss": 0.4049059748649597, "step": 1520 }, { "epoch": 2.1995661605206074, "grad_norm": 2.3802608136855268, "learning_rate": 5.802388848665391e-06, "logits/chosen": -0.236686110496521, "logits/rejected": -0.09095649421215057, "logps/chosen": -0.30331850051879883, "logps/rejected": -3.6234824657440186, "loss": 0.3045, "odds_ratio_loss": 0.0965602695941925, "rewards/accuracies": 1.0, "rewards/chosen": -0.030331851914525032, "rewards/margins": 0.33201637864112854, "rewards/rejected": -0.3623482584953308, "sft_loss": 0.30331850051879883, "step": 1521 }, { "epoch": 2.2010122921185826, "grad_norm": 9.350151858192348, "learning_rate": 5.7996156059181135e-06, "logits/chosen": -0.27046582102775574, "logits/rejected": -0.17398667335510254, "logps/chosen": -0.20965230464935303, "logps/rejected": -2.9031565189361572, "loss": 0.3707, "odds_ratio_loss": 0.08045334368944168, "rewards/accuracies": 1.0, "rewards/chosen": -0.020965231582522392, "rewards/margins": 0.26935043931007385, "rewards/rejected": -0.2903156578540802, "sft_loss": 0.20965230464935303, "step": 1522 }, { "epoch": 2.2024584237165583, "grad_norm": 2.9087098075463276, "learning_rate": 5.796841278175886e-06, "logits/chosen": -0.34366631507873535, "logits/rejected": -0.2516658306121826, "logps/chosen": -0.3939272165298462, "logps/rejected": -2.5516390800476074, "loss": 0.4125, "odds_ratio_loss": 0.11796994507312775, "rewards/accuracies": 1.0, "rewards/chosen": -0.03939272090792656, "rewards/margins": 0.21577118337154388, "rewards/rejected": -0.25516390800476074, "sft_loss": 0.3939272165298462, "step": 1523 }, { "epoch": 2.2039045553145336, "grad_norm": 2.3977389772644417, "learning_rate": 5.794065867111359e-06, "logits/chosen": -0.15821127593517303, "logits/rejected": -0.3337038457393646, "logps/chosen": -0.5078226923942566, "logps/rejected": -3.0043892860412598, "loss": 0.3828, "odds_ratio_loss": 0.2925236225128174, "rewards/accuracies": 0.875, "rewards/chosen": -0.05078226700425148, "rewards/margins": 0.24965664744377136, "rewards/rejected": -0.30043891072273254, "sft_loss": 0.5078226923942566, "step": 1524 }, { "epoch": 2.2053506869125092, "grad_norm": 2.468736682512821, "learning_rate": 5.791289374397839e-06, "logits/chosen": -0.31990671157836914, "logits/rejected": -0.4198131561279297, "logps/chosen": -0.3348398804664612, "logps/rejected": -2.396658420562744, "loss": 0.3388, "odds_ratio_loss": 0.14263346791267395, "rewards/accuracies": 1.0, "rewards/chosen": -0.03348398581147194, "rewards/margins": 0.20618188381195068, "rewards/rejected": -0.23966586589813232, "sft_loss": 0.3348398804664612, "step": 1525 }, { "epoch": 2.2067968185104845, "grad_norm": 2.172638209003934, "learning_rate": 5.788511801709283e-06, "logits/chosen": -0.30636391043663025, "logits/rejected": -0.2372589260339737, "logps/chosen": -0.26845329999923706, "logps/rejected": -4.8146281242370605, "loss": 0.3712, "odds_ratio_loss": 0.07393340766429901, "rewards/accuracies": 1.0, "rewards/chosen": -0.026845330372452736, "rewards/margins": 0.4546175003051758, "rewards/rejected": -0.48146283626556396, "sft_loss": 0.26845329999923706, "step": 1526 }, { "epoch": 2.2082429501084597, "grad_norm": 3.077739945365637, "learning_rate": 5.785733150720301e-06, "logits/chosen": -0.3096201717853546, "logits/rejected": -0.1331636607646942, "logps/chosen": -0.5265023708343506, "logps/rejected": -4.324217796325684, "loss": 0.358, "odds_ratio_loss": 0.12146250903606415, "rewards/accuracies": 1.0, "rewards/chosen": -0.05265023931860924, "rewards/margins": 0.3797715902328491, "rewards/rejected": -0.4324217736721039, "sft_loss": 0.5265023708343506, "step": 1527 }, { "epoch": 2.2096890817064354, "grad_norm": 2.6942329260961744, "learning_rate": 5.782953423106153e-06, "logits/chosen": -0.1731545627117157, "logits/rejected": -0.3327570855617523, "logps/chosen": -0.38150539994239807, "logps/rejected": -3.1940901279449463, "loss": 0.384, "odds_ratio_loss": 0.1315080225467682, "rewards/accuracies": 0.9375, "rewards/chosen": -0.038150541484355927, "rewards/margins": 0.2812584340572357, "rewards/rejected": -0.31940898299217224, "sft_loss": 0.38150539994239807, "step": 1528 }, { "epoch": 2.2111352133044107, "grad_norm": 2.569601628130879, "learning_rate": 5.780172620542744e-06, "logits/chosen": -0.3070398271083832, "logits/rejected": -0.2711654305458069, "logps/chosen": -0.35240018367767334, "logps/rejected": -3.4073891639709473, "loss": 0.3628, "odds_ratio_loss": 0.09907615929841995, "rewards/accuracies": 1.0, "rewards/chosen": -0.03524002060294151, "rewards/margins": 0.3054988980293274, "rewards/rejected": -0.3407389223575592, "sft_loss": 0.35240018367767334, "step": 1529 }, { "epoch": 2.212581344902386, "grad_norm": 2.322302360573357, "learning_rate": 5.777390744706633e-06, "logits/chosen": -0.11148097366094589, "logits/rejected": -0.10225170850753784, "logps/chosen": -0.37124699354171753, "logps/rejected": -1.8571745157241821, "loss": 0.3813, "odds_ratio_loss": 0.16256582736968994, "rewards/accuracies": 1.0, "rewards/chosen": -0.037124693393707275, "rewards/margins": 0.1485927402973175, "rewards/rejected": -0.18571743369102478, "sft_loss": 0.37124699354171753, "step": 1530 }, { "epoch": 2.2140274765003616, "grad_norm": 2.645782452973883, "learning_rate": 5.774607797275022e-06, "logits/chosen": -0.21809406578540802, "logits/rejected": -0.14942537248134613, "logps/chosen": -0.2832741439342499, "logps/rejected": -2.8527350425720215, "loss": 0.3689, "odds_ratio_loss": 0.14639288187026978, "rewards/accuracies": 0.9375, "rewards/chosen": -0.028327414765954018, "rewards/margins": 0.2569460868835449, "rewards/rejected": -0.2852734923362732, "sft_loss": 0.2832741439342499, "step": 1531 }, { "epoch": 2.215473608098337, "grad_norm": 2.793134220684291, "learning_rate": 5.7718237799257625e-06, "logits/chosen": -0.1597381979227066, "logits/rejected": -0.2850331962108612, "logps/chosen": -0.3894084692001343, "logps/rejected": -1.2152106761932373, "loss": 0.3134, "odds_ratio_loss": 0.23522335290908813, "rewards/accuracies": 1.0, "rewards/chosen": -0.03894084692001343, "rewards/margins": 0.08258021622896194, "rewards/rejected": -0.12152105569839478, "sft_loss": 0.3894084692001343, "step": 1532 }, { "epoch": 2.2169197396963125, "grad_norm": 2.5901822894457056, "learning_rate": 5.7690386943373446e-06, "logits/chosen": -0.3303926885128021, "logits/rejected": -0.15845176577568054, "logps/chosen": -0.23409810662269592, "logps/rejected": -4.917486190795898, "loss": 0.2976, "odds_ratio_loss": 0.05652255564928055, "rewards/accuracies": 1.0, "rewards/chosen": -0.023409809917211533, "rewards/margins": 0.46833881735801697, "rewards/rejected": -0.4917486011981964, "sft_loss": 0.23409810662269592, "step": 1533 }, { "epoch": 2.2183658712942878, "grad_norm": 2.844379078494416, "learning_rate": 5.76625254218891e-06, "logits/chosen": -0.30775171518325806, "logits/rejected": -0.3369847536087036, "logps/chosen": -0.3618554472923279, "logps/rejected": -2.403303861618042, "loss": 0.329, "odds_ratio_loss": 0.1528136283159256, "rewards/accuracies": 1.0, "rewards/chosen": -0.03618554398417473, "rewards/margins": 0.20414483547210693, "rewards/rejected": -0.24033036828041077, "sft_loss": 0.3618554472923279, "step": 1534 }, { "epoch": 2.219812002892263, "grad_norm": 2.324950377218177, "learning_rate": 5.76346532516024e-06, "logits/chosen": -0.3371676206588745, "logits/rejected": -0.32295674085617065, "logps/chosen": -0.3337056636810303, "logps/rejected": -3.918829917907715, "loss": 0.3407, "odds_ratio_loss": 0.09082160890102386, "rewards/accuracies": 1.0, "rewards/chosen": -0.03337056562304497, "rewards/margins": 0.3585124611854553, "rewards/rejected": -0.3918830156326294, "sft_loss": 0.3337056636810303, "step": 1535 }, { "epoch": 2.2212581344902387, "grad_norm": 2.290719343416125, "learning_rate": 5.760677044931757e-06, "logits/chosen": -0.22997716069221497, "logits/rejected": -0.30874577164649963, "logps/chosen": -0.43819090723991394, "logps/rejected": -2.9812252521514893, "loss": 0.3561, "odds_ratio_loss": 0.13137231767177582, "rewards/accuracies": 1.0, "rewards/chosen": -0.04381909221410751, "rewards/margins": 0.2543034553527832, "rewards/rejected": -0.2981225252151489, "sft_loss": 0.43819090723991394, "step": 1536 }, { "epoch": 2.222704266088214, "grad_norm": 2.2585019090751004, "learning_rate": 5.7578877031845265e-06, "logits/chosen": -0.20603443682193756, "logits/rejected": -0.17099282145500183, "logps/chosen": -0.3480113446712494, "logps/rejected": -4.4564995765686035, "loss": 0.3771, "odds_ratio_loss": 0.09290958940982819, "rewards/accuracies": 1.0, "rewards/chosen": -0.03480113670229912, "rewards/margins": 0.41084882616996765, "rewards/rejected": -0.44565001130104065, "sft_loss": 0.3480113446712494, "step": 1537 }, { "epoch": 2.2241503976861896, "grad_norm": 2.3447858727279933, "learning_rate": 5.755097301600253e-06, "logits/chosen": -0.3461535573005676, "logits/rejected": -0.2234947383403778, "logps/chosen": -0.3295784890651703, "logps/rejected": -2.077148199081421, "loss": 0.4611, "odds_ratio_loss": 0.13608911633491516, "rewards/accuracies": 1.0, "rewards/chosen": -0.03295784816145897, "rewards/margins": 0.1747569590806961, "rewards/rejected": -0.20771479606628418, "sft_loss": 0.3295784890651703, "step": 1538 }, { "epoch": 2.225596529284165, "grad_norm": 2.5895792850597226, "learning_rate": 5.752305841861279e-06, "logits/chosen": -0.2195424735546112, "logits/rejected": -0.1914370357990265, "logps/chosen": -0.34031060338020325, "logps/rejected": -3.349759578704834, "loss": 0.3642, "odds_ratio_loss": 0.1579233705997467, "rewards/accuracies": 0.9375, "rewards/chosen": -0.034031059592962265, "rewards/margins": 0.3009449243545532, "rewards/rejected": -0.3349759578704834, "sft_loss": 0.34031060338020325, "step": 1539 }, { "epoch": 2.22704266088214, "grad_norm": 2.363995097567075, "learning_rate": 5.749513325650586e-06, "logits/chosen": -0.09926959127187729, "logits/rejected": -0.14528873562812805, "logps/chosen": -0.20169880986213684, "logps/rejected": -4.050556659698486, "loss": 0.3357, "odds_ratio_loss": 0.07336930185556412, "rewards/accuracies": 1.0, "rewards/chosen": -0.020169882103800774, "rewards/margins": 0.3848857581615448, "rewards/rejected": -0.40505561232566833, "sft_loss": 0.20169880986213684, "step": 1540 }, { "epoch": 2.2284887924801158, "grad_norm": 2.356577808785414, "learning_rate": 5.746719754651795e-06, "logits/chosen": -0.32525795698165894, "logits/rejected": -0.2390633374452591, "logps/chosen": -0.47373491525650024, "logps/rejected": -2.445295572280884, "loss": 0.3442, "odds_ratio_loss": 0.17469418048858643, "rewards/accuracies": 1.0, "rewards/chosen": -0.047373492270708084, "rewards/margins": 0.19715607166290283, "rewards/rejected": -0.2445295751094818, "sft_loss": 0.47373491525650024, "step": 1541 }, { "epoch": 2.229934924078091, "grad_norm": 2.0394552640377652, "learning_rate": 5.743925130549157e-06, "logits/chosen": -0.2975271940231323, "logits/rejected": -0.12008170038461685, "logps/chosen": -0.38611143827438354, "logps/rejected": -3.99062442779541, "loss": 0.4142, "odds_ratio_loss": 0.08279373496770859, "rewards/accuracies": 1.0, "rewards/chosen": -0.038611143827438354, "rewards/margins": 0.3604513108730316, "rewards/rejected": -0.39906245470046997, "sft_loss": 0.38611143827438354, "step": 1542 }, { "epoch": 2.2313810556760667, "grad_norm": 2.5021991247752973, "learning_rate": 5.741129455027563e-06, "logits/chosen": -0.22896796464920044, "logits/rejected": -0.31885460019111633, "logps/chosen": -0.3669053018093109, "logps/rejected": -2.162534713745117, "loss": 0.4192, "odds_ratio_loss": 0.16942007839679718, "rewards/accuracies": 1.0, "rewards/chosen": -0.03669053316116333, "rewards/margins": 0.17956297099590302, "rewards/rejected": -0.21625348925590515, "sft_loss": 0.3669053018093109, "step": 1543 }, { "epoch": 2.232827187274042, "grad_norm": 2.4755706908681656, "learning_rate": 5.738332729772537e-06, "logits/chosen": -0.21915926039218903, "logits/rejected": -0.14056669175624847, "logps/chosen": -0.4281160533428192, "logps/rejected": -2.106233835220337, "loss": 0.3769, "odds_ratio_loss": 0.18937034904956818, "rewards/accuracies": 1.0, "rewards/chosen": -0.04281160607933998, "rewards/margins": 0.16781175136566162, "rewards/rejected": -0.2106233537197113, "sft_loss": 0.4281160533428192, "step": 1544 }, { "epoch": 2.234273318872017, "grad_norm": 2.233386700181181, "learning_rate": 5.735534956470232e-06, "logits/chosen": -0.3864569365978241, "logits/rejected": -0.3033338785171509, "logps/chosen": -0.4460701644420624, "logps/rejected": -2.0371532440185547, "loss": 0.29, "odds_ratio_loss": 0.15289722383022308, "rewards/accuracies": 1.0, "rewards/chosen": -0.044607020914554596, "rewards/margins": 0.15910829603672028, "rewards/rejected": -0.20371532440185547, "sft_loss": 0.4460701644420624, "step": 1545 }, { "epoch": 2.235719450469993, "grad_norm": 2.655882713565303, "learning_rate": 5.732736136807439e-06, "logits/chosen": -0.2490551769733429, "logits/rejected": -0.22575412690639496, "logps/chosen": -0.4899711608886719, "logps/rejected": -3.2424023151397705, "loss": 0.395, "odds_ratio_loss": 0.18628421425819397, "rewards/accuracies": 0.9375, "rewards/chosen": -0.048997119069099426, "rewards/margins": 0.2752431035041809, "rewards/rejected": -0.32424020767211914, "sft_loss": 0.4899711608886719, "step": 1546 }, { "epoch": 2.237165582067968, "grad_norm": 3.28371337385153, "learning_rate": 5.729936272471576e-06, "logits/chosen": -0.11610257625579834, "logits/rejected": -0.1111803650856018, "logps/chosen": -0.24118897318840027, "logps/rejected": -3.4827942848205566, "loss": 0.3084, "odds_ratio_loss": 0.11601855605840683, "rewards/accuracies": 1.0, "rewards/chosen": -0.024118896573781967, "rewards/margins": 0.32416054606437683, "rewards/rejected": -0.3482794463634491, "sft_loss": 0.24118897318840027, "step": 1547 }, { "epoch": 2.238611713665944, "grad_norm": 2.2346359384189554, "learning_rate": 5.7271353651506914e-06, "logits/chosen": -0.19794616103172302, "logits/rejected": -0.1796175241470337, "logps/chosen": -0.1995583176612854, "logps/rejected": -4.046960353851318, "loss": 0.2976, "odds_ratio_loss": 0.07498744130134583, "rewards/accuracies": 1.0, "rewards/chosen": -0.01995583064854145, "rewards/margins": 0.3847401738166809, "rewards/rejected": -0.4046960175037384, "sft_loss": 0.1995583176612854, "step": 1548 }, { "epoch": 2.240057845263919, "grad_norm": 2.356672314146801, "learning_rate": 5.724333416533462e-06, "logits/chosen": -0.30946430563926697, "logits/rejected": -0.3228055238723755, "logps/chosen": -0.39164984226226807, "logps/rejected": -4.4202165603637695, "loss": 0.3586, "odds_ratio_loss": 0.07636836171150208, "rewards/accuracies": 1.0, "rewards/chosen": -0.039164986461400986, "rewards/margins": 0.402856707572937, "rewards/rejected": -0.4420216679573059, "sft_loss": 0.39164984226226807, "step": 1549 }, { "epoch": 2.2415039768618943, "grad_norm": 1.9100552032789626, "learning_rate": 5.721530428309193e-06, "logits/chosen": -0.24844273924827576, "logits/rejected": -0.2502165734767914, "logps/chosen": -0.2769375443458557, "logps/rejected": -3.178349256515503, "loss": 0.3436, "odds_ratio_loss": 0.13039150834083557, "rewards/accuracies": 1.0, "rewards/chosen": -0.02769375592470169, "rewards/margins": 0.29014119505882263, "rewards/rejected": -0.3178349733352661, "sft_loss": 0.2769375443458557, "step": 1550 }, { "epoch": 2.24295010845987, "grad_norm": 2.268645975601301, "learning_rate": 5.718726402167819e-06, "logits/chosen": -0.38420581817626953, "logits/rejected": -0.2832087278366089, "logps/chosen": -0.3715498745441437, "logps/rejected": -2.8785829544067383, "loss": 0.4685, "odds_ratio_loss": 0.13134051859378815, "rewards/accuracies": 1.0, "rewards/chosen": -0.03715498745441437, "rewards/margins": 0.2507033348083496, "rewards/rejected": -0.2878583073616028, "sft_loss": 0.3715498745441437, "step": 1551 }, { "epoch": 2.244396240057845, "grad_norm": 2.161285650065301, "learning_rate": 5.715921339799895e-06, "logits/chosen": -0.18678617477416992, "logits/rejected": -0.24380554258823395, "logps/chosen": -0.3200456500053406, "logps/rejected": -4.384403228759766, "loss": 0.3791, "odds_ratio_loss": 0.12353577464818954, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03200456500053406, "rewards/margins": 0.4064357876777649, "rewards/rejected": -0.43844032287597656, "sft_loss": 0.3200456500053406, "step": 1552 }, { "epoch": 2.2458423716558205, "grad_norm": 2.117005371321443, "learning_rate": 5.713115242896604e-06, "logits/chosen": -0.37839236855506897, "logits/rejected": -0.3601863980293274, "logps/chosen": -0.2849191427230835, "logps/rejected": -4.144497394561768, "loss": 0.2864, "odds_ratio_loss": 0.09585070610046387, "rewards/accuracies": 1.0, "rewards/chosen": -0.02849191054701805, "rewards/margins": 0.385957807302475, "rewards/rejected": -0.4144497215747833, "sft_loss": 0.2849191427230835, "step": 1553 }, { "epoch": 2.247288503253796, "grad_norm": 4.326532462909167, "learning_rate": 5.710308113149753e-06, "logits/chosen": -0.37594687938690186, "logits/rejected": -0.4265070855617523, "logps/chosen": -0.4674017131328583, "logps/rejected": -3.600942850112915, "loss": 0.4116, "odds_ratio_loss": 0.2612759470939636, "rewards/accuracies": 0.875, "rewards/chosen": -0.046740174293518066, "rewards/margins": 0.31335410475730896, "rewards/rejected": -0.360094279050827, "sft_loss": 0.4674017131328583, "step": 1554 }, { "epoch": 2.2487346348517714, "grad_norm": 2.9543812647551495, "learning_rate": 5.707499952251771e-06, "logits/chosen": -0.2548982799053192, "logits/rejected": -0.18201807141304016, "logps/chosen": -0.340849369764328, "logps/rejected": -2.363661766052246, "loss": 0.3777, "odds_ratio_loss": 0.18882401287555695, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03408493846654892, "rewards/margins": 0.20228123664855957, "rewards/rejected": -0.2363661825656891, "sft_loss": 0.340849369764328, "step": 1555 }, { "epoch": 2.250180766449747, "grad_norm": 2.5022329892397814, "learning_rate": 5.704690761895708e-06, "logits/chosen": -0.2796105444431305, "logits/rejected": -0.20917150378227234, "logps/chosen": -0.3368090093135834, "logps/rejected": -4.430875301361084, "loss": 0.3872, "odds_ratio_loss": 0.05535319447517395, "rewards/accuracies": 1.0, "rewards/chosen": -0.03368090093135834, "rewards/margins": 0.4094066321849823, "rewards/rejected": -0.44308754801750183, "sft_loss": 0.3368090093135834, "step": 1556 }, { "epoch": 2.2516268980477223, "grad_norm": 2.354413200823632, "learning_rate": 5.7018805437752366e-06, "logits/chosen": -0.3531820476055145, "logits/rejected": -0.2590283751487732, "logps/chosen": -0.4060213267803192, "logps/rejected": -2.9722695350646973, "loss": 0.4336, "odds_ratio_loss": 0.18422815203666687, "rewards/accuracies": 0.875, "rewards/chosen": -0.04060213267803192, "rewards/margins": 0.25662487745285034, "rewards/rejected": -0.29722699522972107, "sft_loss": 0.4060213267803192, "step": 1557 }, { "epoch": 2.253073029645698, "grad_norm": 2.082596219774037, "learning_rate": 5.699069299584646e-06, "logits/chosen": -0.14018461108207703, "logits/rejected": -0.14732438325881958, "logps/chosen": -0.30800139904022217, "logps/rejected": -3.749959945678711, "loss": 0.403, "odds_ratio_loss": 0.10707899928092957, "rewards/accuracies": 1.0, "rewards/chosen": -0.030800141394138336, "rewards/margins": 0.3441958427429199, "rewards/rejected": -0.37499600648880005, "sft_loss": 0.30800139904022217, "step": 1558 }, { "epoch": 2.2545191612436732, "grad_norm": 2.631005758226159, "learning_rate": 5.696257031018847e-06, "logits/chosen": -0.20500501990318298, "logits/rejected": -0.15662819147109985, "logps/chosen": -0.3218449056148529, "logps/rejected": -4.851985454559326, "loss": 0.3504, "odds_ratio_loss": 0.05477791652083397, "rewards/accuracies": 1.0, "rewards/chosen": -0.03218448907136917, "rewards/margins": 0.4530141055583954, "rewards/rejected": -0.4851985573768616, "sft_loss": 0.3218449056148529, "step": 1559 }, { "epoch": 2.2559652928416485, "grad_norm": 2.153417090098337, "learning_rate": 5.6934437397733664e-06, "logits/chosen": -0.27325424551963806, "logits/rejected": -0.38711655139923096, "logps/chosen": -0.3427465856075287, "logps/rejected": -4.2784833908081055, "loss": 0.3354, "odds_ratio_loss": 0.19038382172584534, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03427466005086899, "rewards/margins": 0.39357367157936096, "rewards/rejected": -0.42784836888313293, "sft_loss": 0.3427465856075287, "step": 1560 }, { "epoch": 2.257411424439624, "grad_norm": 2.6827162453990163, "learning_rate": 5.690629427544348e-06, "logits/chosen": -0.3731663227081299, "logits/rejected": -0.41208338737487793, "logps/chosen": -0.34578463435173035, "logps/rejected": -3.8187103271484375, "loss": 0.3752, "odds_ratio_loss": 0.1189727634191513, "rewards/accuracies": 0.9375, "rewards/chosen": -0.034578464925289154, "rewards/margins": 0.34729254245758057, "rewards/rejected": -0.38187098503112793, "sft_loss": 0.34578463435173035, "step": 1561 }, { "epoch": 2.2588575560375994, "grad_norm": 2.321970247707492, "learning_rate": 5.68781409602855e-06, "logits/chosen": -0.13874304294586182, "logits/rejected": -0.1839933842420578, "logps/chosen": -0.35330072045326233, "logps/rejected": -3.258653163909912, "loss": 0.3224, "odds_ratio_loss": 0.14055302739143372, "rewards/accuracies": 1.0, "rewards/chosen": -0.035330068320035934, "rewards/margins": 0.29053524136543274, "rewards/rejected": -0.3258652985095978, "sft_loss": 0.35330072045326233, "step": 1562 }, { "epoch": 2.2603036876355747, "grad_norm": 2.4312378589175085, "learning_rate": 5.684997746923349e-06, "logits/chosen": -0.36342984437942505, "logits/rejected": -0.18977266550064087, "logps/chosen": -0.3192789554595947, "logps/rejected": -4.501648902893066, "loss": 0.3177, "odds_ratio_loss": 0.07834473252296448, "rewards/accuracies": 1.0, "rewards/chosen": -0.03192789852619171, "rewards/margins": 0.41823703050613403, "rewards/rejected": -0.45016491413116455, "sft_loss": 0.3192789554595947, "step": 1563 }, { "epoch": 2.2617498192335503, "grad_norm": 2.2726822259954726, "learning_rate": 5.6821803819267306e-06, "logits/chosen": -0.3450077474117279, "logits/rejected": -0.1680723875761032, "logps/chosen": -0.30977359414100647, "logps/rejected": -3.765933036804199, "loss": 0.3865, "odds_ratio_loss": 0.09045100957155228, "rewards/accuracies": 1.0, "rewards/chosen": -0.030977360904216766, "rewards/margins": 0.345615953207016, "rewards/rejected": -0.37659329175949097, "sft_loss": 0.30977359414100647, "step": 1564 }, { "epoch": 2.2631959508315256, "grad_norm": 3.067578422370726, "learning_rate": 5.679362002737295e-06, "logits/chosen": -0.32226288318634033, "logits/rejected": -0.35191404819488525, "logps/chosen": -0.4810445308685303, "logps/rejected": -3.7931578159332275, "loss": 0.3509, "odds_ratio_loss": 0.19164201617240906, "rewards/accuracies": 1.0, "rewards/chosen": -0.04810445010662079, "rewards/margins": 0.3312113285064697, "rewards/rejected": -0.3793157935142517, "sft_loss": 0.4810445308685303, "step": 1565 }, { "epoch": 2.2646420824295013, "grad_norm": 3.6725044603624832, "learning_rate": 5.676542611054253e-06, "logits/chosen": -0.5231328010559082, "logits/rejected": -0.33863845467567444, "logps/chosen": -0.3771399259567261, "logps/rejected": -2.39211368560791, "loss": 0.3874, "odds_ratio_loss": 0.1751105934381485, "rewards/accuracies": 1.0, "rewards/chosen": -0.03771399334073067, "rewards/margins": 0.20149734616279602, "rewards/rejected": -0.23921135067939758, "sft_loss": 0.3771399259567261, "step": 1566 }, { "epoch": 2.2660882140274765, "grad_norm": 2.3515505758843367, "learning_rate": 5.673722208577426e-06, "logits/chosen": -0.18822497129440308, "logits/rejected": -0.1882564127445221, "logps/chosen": -0.2656404376029968, "logps/rejected": -3.3323557376861572, "loss": 0.3407, "odds_ratio_loss": 0.08594746887683868, "rewards/accuracies": 1.0, "rewards/chosen": -0.026564043015241623, "rewards/margins": 0.30667150020599365, "rewards/rejected": -0.33323556184768677, "sft_loss": 0.2656404376029968, "step": 1567 }, { "epoch": 2.2675343456254518, "grad_norm": 2.6577713876240434, "learning_rate": 5.670900797007246e-06, "logits/chosen": -0.24050821363925934, "logits/rejected": -0.2003868669271469, "logps/chosen": -0.3192717432975769, "logps/rejected": -2.265965700149536, "loss": 0.411, "odds_ratio_loss": 0.08194440603256226, "rewards/accuracies": 1.0, "rewards/chosen": -0.03192717581987381, "rewards/margins": 0.19466939568519592, "rewards/rejected": -0.22659656405448914, "sft_loss": 0.3192717432975769, "step": 1568 }, { "epoch": 2.2689804772234274, "grad_norm": 2.2815630278049355, "learning_rate": 5.668078378044753e-06, "logits/chosen": -0.2181173712015152, "logits/rejected": -0.1383255273103714, "logps/chosen": -0.25415533781051636, "logps/rejected": -4.44294548034668, "loss": 0.2843, "odds_ratio_loss": 0.06722106039524078, "rewards/accuracies": 1.0, "rewards/chosen": -0.025415534153580666, "rewards/margins": 0.4188790023326874, "rewards/rejected": -0.4442945420742035, "sft_loss": 0.25415533781051636, "step": 1569 }, { "epoch": 2.2704266088214027, "grad_norm": 4.736083331714562, "learning_rate": 5.665254953391593e-06, "logits/chosen": -0.21430820226669312, "logits/rejected": -0.34477680921554565, "logps/chosen": -0.43496644496917725, "logps/rejected": -3.20230770111084, "loss": 0.3673, "odds_ratio_loss": 0.12124226987361908, "rewards/accuracies": 1.0, "rewards/chosen": -0.043496645987033844, "rewards/margins": 0.2767341434955597, "rewards/rejected": -0.32023078203201294, "sft_loss": 0.43496644496917725, "step": 1570 }, { "epoch": 2.2718727404193784, "grad_norm": 2.8236198696675725, "learning_rate": 5.662430524750021e-06, "logits/chosen": -0.15283435583114624, "logits/rejected": -0.19004985690116882, "logps/chosen": -0.5074213147163391, "logps/rejected": -4.028660297393799, "loss": 0.4107, "odds_ratio_loss": 0.1295696198940277, "rewards/accuracies": 1.0, "rewards/chosen": -0.05074213445186615, "rewards/margins": 0.3521239459514618, "rewards/rejected": -0.40286606550216675, "sft_loss": 0.5074213147163391, "step": 1571 }, { "epoch": 2.2733188720173536, "grad_norm": 2.3251385690876085, "learning_rate": 5.659605093822891e-06, "logits/chosen": -0.2327795922756195, "logits/rejected": -0.1698169857263565, "logps/chosen": -0.4730731248855591, "logps/rejected": -3.355661392211914, "loss": 0.4151, "odds_ratio_loss": 0.19861802458763123, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04730731621384621, "rewards/margins": 0.288258820772171, "rewards/rejected": -0.33556610345840454, "sft_loss": 0.4730731248855591, "step": 1572 }, { "epoch": 2.274765003615329, "grad_norm": 2.4179894674242255, "learning_rate": 5.656778662313671e-06, "logits/chosen": -0.05721982568502426, "logits/rejected": -0.03928186744451523, "logps/chosen": -0.36657410860061646, "logps/rejected": -2.0526092052459717, "loss": 0.3521, "odds_ratio_loss": 0.19514396786689758, "rewards/accuracies": 0.875, "rewards/chosen": -0.036657411605119705, "rewards/margins": 0.16860352456569672, "rewards/rejected": -0.20526093244552612, "sft_loss": 0.36657410860061646, "step": 1573 }, { "epoch": 2.2762111352133045, "grad_norm": 2.159865190586481, "learning_rate": 5.653951231926425e-06, "logits/chosen": -0.19648948311805725, "logits/rejected": -0.3218488097190857, "logps/chosen": -0.3456001877784729, "logps/rejected": -4.398506164550781, "loss": 0.3824, "odds_ratio_loss": 0.15548524260520935, "rewards/accuracies": 1.0, "rewards/chosen": -0.03456001728773117, "rewards/margins": 0.4052906334400177, "rewards/rejected": -0.4398505985736847, "sft_loss": 0.3456001877784729, "step": 1574 }, { "epoch": 2.27765726681128, "grad_norm": 2.2373719533444305, "learning_rate": 5.651122804365822e-06, "logits/chosen": -0.15806661546230316, "logits/rejected": -0.24433177709579468, "logps/chosen": -0.28150510787963867, "logps/rejected": -2.8424034118652344, "loss": 0.3845, "odds_ratio_loss": 0.10211706161499023, "rewards/accuracies": 1.0, "rewards/chosen": -0.028150510042905807, "rewards/margins": 0.25608983635902405, "rewards/rejected": -0.28424033522605896, "sft_loss": 0.28150510787963867, "step": 1575 }, { "epoch": 2.279103398409255, "grad_norm": 3.2054290088877346, "learning_rate": 5.6482933813371295e-06, "logits/chosen": -0.1453315168619156, "logits/rejected": -0.1947060525417328, "logps/chosen": -0.5173884630203247, "logps/rejected": -1.7582117319107056, "loss": 0.4024, "odds_ratio_loss": 0.24231034517288208, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05173884332180023, "rewards/margins": 0.12408233433961868, "rewards/rejected": -0.1758211851119995, "sft_loss": 0.5173884630203247, "step": 1576 }, { "epoch": 2.2805495300072307, "grad_norm": 2.8464475838685814, "learning_rate": 5.645462964546218e-06, "logits/chosen": -0.5189359784126282, "logits/rejected": -0.32251614332199097, "logps/chosen": -0.3037870228290558, "logps/rejected": -2.479979991912842, "loss": 0.3257, "odds_ratio_loss": 0.12056048214435577, "rewards/accuracies": 1.0, "rewards/chosen": -0.03037870302796364, "rewards/margins": 0.21761931478977203, "rewards/rejected": -0.24799801409244537, "sft_loss": 0.3037870228290558, "step": 1577 }, { "epoch": 2.281995661605206, "grad_norm": 2.6644325848886017, "learning_rate": 5.642631555699557e-06, "logits/chosen": -0.13242171704769135, "logits/rejected": -0.20441709458827972, "logps/chosen": -0.4285106956958771, "logps/rejected": -3.3875656127929688, "loss": 0.3443, "odds_ratio_loss": 0.12020072340965271, "rewards/accuracies": 1.0, "rewards/chosen": -0.04285107180476189, "rewards/margins": 0.29590553045272827, "rewards/rejected": -0.3387565612792969, "sft_loss": 0.4285106956958771, "step": 1578 }, { "epoch": 2.2834417932031816, "grad_norm": 2.2024351205491253, "learning_rate": 5.639799156504215e-06, "logits/chosen": -0.0775141790509224, "logits/rejected": -0.14957457780838013, "logps/chosen": -0.3367796838283539, "logps/rejected": -3.4249143600463867, "loss": 0.4105, "odds_ratio_loss": 0.09568314254283905, "rewards/accuracies": 1.0, "rewards/chosen": -0.03367796912789345, "rewards/margins": 0.3088134527206421, "rewards/rejected": -0.34249138832092285, "sft_loss": 0.3367796838283539, "step": 1579 }, { "epoch": 2.284887924801157, "grad_norm": 3.0406782139100006, "learning_rate": 5.636965768667852e-06, "logits/chosen": -0.1693412810564041, "logits/rejected": -0.20891311764717102, "logps/chosen": -0.3991576135158539, "logps/rejected": -3.2093253135681152, "loss": 0.3954, "odds_ratio_loss": 0.15770481526851654, "rewards/accuracies": 1.0, "rewards/chosen": -0.03991575911641121, "rewards/margins": 0.2810167670249939, "rewards/rejected": -0.3209325075149536, "sft_loss": 0.3991576135158539, "step": 1580 }, { "epoch": 2.2863340563991326, "grad_norm": 2.3637053224768176, "learning_rate": 5.6341313938987314e-06, "logits/chosen": -0.19435811042785645, "logits/rejected": -0.08956211805343628, "logps/chosen": -0.40447860956192017, "logps/rejected": -2.695289373397827, "loss": 0.4166, "odds_ratio_loss": 0.14535120129585266, "rewards/accuracies": 1.0, "rewards/chosen": -0.04044786095619202, "rewards/margins": 0.22908106446266174, "rewards/rejected": -0.26952892541885376, "sft_loss": 0.40447860956192017, "step": 1581 }, { "epoch": 2.287780187997108, "grad_norm": 2.3223751156245984, "learning_rate": 5.631296033905707e-06, "logits/chosen": -0.20546045899391174, "logits/rejected": -0.13516230881214142, "logps/chosen": -0.2861219346523285, "logps/rejected": -4.935697555541992, "loss": 0.2542, "odds_ratio_loss": 0.0564875528216362, "rewards/accuracies": 1.0, "rewards/chosen": -0.02861219458281994, "rewards/margins": 0.4649575650691986, "rewards/rejected": -0.4935697317123413, "sft_loss": 0.2861219346523285, "step": 1582 }, { "epoch": 2.289226319595083, "grad_norm": 2.4551762245248807, "learning_rate": 5.628459690398229e-06, "logits/chosen": -0.2333042323589325, "logits/rejected": -0.14890116453170776, "logps/chosen": -0.3023528456687927, "logps/rejected": -4.237617015838623, "loss": 0.3242, "odds_ratio_loss": 0.1010616272687912, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03023528680205345, "rewards/margins": 0.39352643489837646, "rewards/rejected": -0.4237616956233978, "sft_loss": 0.3023528456687927, "step": 1583 }, { "epoch": 2.2906724511930587, "grad_norm": 2.3849079762907697, "learning_rate": 5.625622365086338e-06, "logits/chosen": -0.2263105809688568, "logits/rejected": -0.14233849942684174, "logps/chosen": -0.4132815897464752, "logps/rejected": -1.7932350635528564, "loss": 0.394, "odds_ratio_loss": 0.14649933576583862, "rewards/accuracies": 1.0, "rewards/chosen": -0.04132816195487976, "rewards/margins": 0.13799536228179932, "rewards/rejected": -0.17932350933551788, "sft_loss": 0.4132815897464752, "step": 1584 }, { "epoch": 2.292118582791034, "grad_norm": 2.833070998521513, "learning_rate": 5.6227840596806685e-06, "logits/chosen": -0.23122721910476685, "logits/rejected": -0.17656515538692474, "logps/chosen": -0.37628495693206787, "logps/rejected": -2.2837910652160645, "loss": 0.3649, "odds_ratio_loss": 0.1681251972913742, "rewards/accuracies": 1.0, "rewards/chosen": -0.03762849420309067, "rewards/margins": 0.1907506287097931, "rewards/rejected": -0.22837910056114197, "sft_loss": 0.37628495693206787, "step": 1585 }, { "epoch": 2.293564714389009, "grad_norm": 2.1224928456477574, "learning_rate": 5.6199447758924454e-06, "logits/chosen": -0.1943625807762146, "logits/rejected": -0.15852569043636322, "logps/chosen": -0.31918591260910034, "logps/rejected": -2.6646080017089844, "loss": 0.3446, "odds_ratio_loss": 0.10638980567455292, "rewards/accuracies": 1.0, "rewards/chosen": -0.031918592751026154, "rewards/margins": 0.23454220592975616, "rewards/rejected": -0.2664608061313629, "sft_loss": 0.31918591260910034, "step": 1586 }, { "epoch": 2.295010845986985, "grad_norm": 2.630639755058868, "learning_rate": 5.617104515433485e-06, "logits/chosen": -0.36188462376594543, "logits/rejected": -0.27973437309265137, "logps/chosen": -0.36127543449401855, "logps/rejected": -4.726844787597656, "loss": 0.3132, "odds_ratio_loss": 0.20731763541698456, "rewards/accuracies": 0.875, "rewards/chosen": -0.03612754866480827, "rewards/margins": 0.43655693531036377, "rewards/rejected": -0.47268450260162354, "sft_loss": 0.36127543449401855, "step": 1587 }, { "epoch": 2.29645697758496, "grad_norm": 2.327846104888335, "learning_rate": 5.614263280016188e-06, "logits/chosen": -0.31197673082351685, "logits/rejected": -0.1848919838666916, "logps/chosen": -0.346306711435318, "logps/rejected": -2.4159696102142334, "loss": 0.3584, "odds_ratio_loss": 0.10952720046043396, "rewards/accuracies": 1.0, "rewards/chosen": -0.0346306711435318, "rewards/margins": 0.20696629583835602, "rewards/rejected": -0.24159696698188782, "sft_loss": 0.346306711435318, "step": 1588 }, { "epoch": 2.297903109182936, "grad_norm": 3.011830252732662, "learning_rate": 5.611421071353547e-06, "logits/chosen": -0.18929818272590637, "logits/rejected": -0.14918991923332214, "logps/chosen": -0.23382627964019775, "logps/rejected": -2.677581310272217, "loss": 0.2883, "odds_ratio_loss": 0.07694613188505173, "rewards/accuracies": 1.0, "rewards/chosen": -0.023382626473903656, "rewards/margins": 0.24437551200389862, "rewards/rejected": -0.2677581310272217, "sft_loss": 0.23382627964019775, "step": 1589 }, { "epoch": 2.299349240780911, "grad_norm": 2.482707891186509, "learning_rate": 5.608577891159141e-06, "logits/chosen": -0.3832859396934509, "logits/rejected": -0.18635818362236023, "logps/chosen": -0.3988485038280487, "logps/rejected": -2.1749815940856934, "loss": 0.3525, "odds_ratio_loss": 0.11210395395755768, "rewards/accuracies": 1.0, "rewards/chosen": -0.03988485038280487, "rewards/margins": 0.17761333286762238, "rewards/rejected": -0.21749816834926605, "sft_loss": 0.3988485038280487, "step": 1590 }, { "epoch": 2.3007953723788863, "grad_norm": 2.9702079948943774, "learning_rate": 5.605733741147135e-06, "logits/chosen": -0.1605539172887802, "logits/rejected": -0.23468124866485596, "logps/chosen": -0.46900007128715515, "logps/rejected": -1.4842274188995361, "loss": 0.4442, "odds_ratio_loss": 0.22805261611938477, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04690001159906387, "rewards/margins": 0.10152273625135422, "rewards/rejected": -0.1484227478504181, "sft_loss": 0.46900007128715515, "step": 1591 }, { "epoch": 2.302241503976862, "grad_norm": 2.438494937693134, "learning_rate": 5.602888623032275e-06, "logits/chosen": -0.2626468539237976, "logits/rejected": -0.18816189467906952, "logps/chosen": -0.30163171887397766, "logps/rejected": -3.5179507732391357, "loss": 0.3645, "odds_ratio_loss": 0.07172201573848724, "rewards/accuracies": 1.0, "rewards/chosen": -0.030163172632455826, "rewards/margins": 0.32163190841674805, "rewards/rejected": -0.3517950773239136, "sft_loss": 0.30163171887397766, "step": 1592 }, { "epoch": 2.3036876355748372, "grad_norm": 2.433363209494988, "learning_rate": 5.600042538529893e-06, "logits/chosen": -0.10985489189624786, "logits/rejected": -0.2541528344154358, "logps/chosen": -0.3402894139289856, "logps/rejected": -4.231769561767578, "loss": 0.3691, "odds_ratio_loss": 0.15273644030094147, "rewards/accuracies": 1.0, "rewards/chosen": -0.03402893990278244, "rewards/margins": 0.3891479969024658, "rewards/rejected": -0.42317694425582886, "sft_loss": 0.3402894139289856, "step": 1593 }, { "epoch": 2.305133767172813, "grad_norm": 2.1024763470220296, "learning_rate": 5.597195489355907e-06, "logits/chosen": -0.3650687336921692, "logits/rejected": -0.36112746596336365, "logps/chosen": -0.36865419149398804, "logps/rejected": -2.913234233856201, "loss": 0.3323, "odds_ratio_loss": 0.15798337757587433, "rewards/accuracies": 1.0, "rewards/chosen": -0.03686542063951492, "rewards/margins": 0.2544580399990082, "rewards/rejected": -0.2913234829902649, "sft_loss": 0.36865419149398804, "step": 1594 }, { "epoch": 2.306579898770788, "grad_norm": 2.426897816658698, "learning_rate": 5.594347477226811e-06, "logits/chosen": -0.19730783998966217, "logits/rejected": -0.31510406732559204, "logps/chosen": -0.39814573526382446, "logps/rejected": -3.4227256774902344, "loss": 0.3904, "odds_ratio_loss": 0.11480727046728134, "rewards/accuracies": 1.0, "rewards/chosen": -0.03981457278132439, "rewards/margins": 0.3024579882621765, "rewards/rejected": -0.3422725796699524, "sft_loss": 0.39814573526382446, "step": 1595 }, { "epoch": 2.3080260303687634, "grad_norm": 2.7323229958390973, "learning_rate": 5.591498503859683e-06, "logits/chosen": -0.12184923887252808, "logits/rejected": -0.12560710310935974, "logps/chosen": -0.3566288650035858, "logps/rejected": -2.558873176574707, "loss": 0.3593, "odds_ratio_loss": 0.12249592691659927, "rewards/accuracies": 1.0, "rewards/chosen": -0.03566288948059082, "rewards/margins": 0.22022442519664764, "rewards/rejected": -0.25588732957839966, "sft_loss": 0.3566288650035858, "step": 1596 }, { "epoch": 2.309472161966739, "grad_norm": 3.2383620194913374, "learning_rate": 5.58864857097218e-06, "logits/chosen": -0.4806884527206421, "logits/rejected": -0.3084767758846283, "logps/chosen": -0.3511160612106323, "logps/rejected": -2.599006175994873, "loss": 0.357, "odds_ratio_loss": 0.16197161376476288, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03511160612106323, "rewards/margins": 0.22478902339935303, "rewards/rejected": -0.25990062952041626, "sft_loss": 0.3511160612106323, "step": 1597 }, { "epoch": 2.3109182935647143, "grad_norm": 2.8203160493059882, "learning_rate": 5.585797680282537e-06, "logits/chosen": -0.17948085069656372, "logits/rejected": -0.16148388385772705, "logps/chosen": -0.2975139617919922, "logps/rejected": -4.275174140930176, "loss": 0.463, "odds_ratio_loss": 0.1559380292892456, "rewards/accuracies": 0.9375, "rewards/chosen": -0.029751399531960487, "rewards/margins": 0.3977660536766052, "rewards/rejected": -0.4275174140930176, "sft_loss": 0.2975139617919922, "step": 1598 }, { "epoch": 2.3123644251626896, "grad_norm": 2.553105162778212, "learning_rate": 5.582945833509567e-06, "logits/chosen": -0.13604110479354858, "logits/rejected": -0.11723095923662186, "logps/chosen": -0.3466201424598694, "logps/rejected": -2.9810094833374023, "loss": 0.3345, "odds_ratio_loss": 0.11769793182611465, "rewards/accuracies": 1.0, "rewards/chosen": -0.03466201201081276, "rewards/margins": 0.2634389400482178, "rewards/rejected": -0.29810094833374023, "sft_loss": 0.3466201424598694, "step": 1599 }, { "epoch": 2.3138105567606653, "grad_norm": 2.2628957428181042, "learning_rate": 5.580093032372657e-06, "logits/chosen": -0.28234773874282837, "logits/rejected": -0.3510010540485382, "logps/chosen": -0.26932573318481445, "logps/rejected": -5.100521564483643, "loss": 0.279, "odds_ratio_loss": 0.08029340207576752, "rewards/accuracies": 1.0, "rewards/chosen": -0.026932574808597565, "rewards/margins": 0.48311957716941833, "rewards/rejected": -0.5100522041320801, "sft_loss": 0.26932573318481445, "step": 1600 }, { "epoch": 2.3152566883586405, "grad_norm": 2.449902283988286, "learning_rate": 5.577239278591773e-06, "logits/chosen": -0.1463385969400406, "logits/rejected": -0.12493189424276352, "logps/chosen": -0.3543298840522766, "logps/rejected": -3.84521222114563, "loss": 0.3948, "odds_ratio_loss": 0.11065103113651276, "rewards/accuracies": 1.0, "rewards/chosen": -0.03543298691511154, "rewards/margins": 0.34908822178840637, "rewards/rejected": -0.3845212161540985, "sft_loss": 0.3543298840522766, "step": 1601 }, { "epoch": 2.316702819956616, "grad_norm": 2.2581803555255164, "learning_rate": 5.574384573887455e-06, "logits/chosen": -0.14903423190116882, "logits/rejected": -0.09336914867162704, "logps/chosen": -0.2692457437515259, "logps/rejected": -1.9810104370117188, "loss": 0.2907, "odds_ratio_loss": 0.14965462684631348, "rewards/accuracies": 1.0, "rewards/chosen": -0.026924576610326767, "rewards/margins": 0.1711764633655548, "rewards/rejected": -0.19810104370117188, "sft_loss": 0.2692457437515259, "step": 1602 }, { "epoch": 2.3181489515545914, "grad_norm": 2.3125551075396187, "learning_rate": 5.571528919980813e-06, "logits/chosen": -0.2181708812713623, "logits/rejected": -0.14055103063583374, "logps/chosen": -0.39502519369125366, "logps/rejected": -2.1328787803649902, "loss": 0.3481, "odds_ratio_loss": 0.17443543672561646, "rewards/accuracies": 0.9375, "rewards/chosen": -0.039502520114183426, "rewards/margins": 0.17378535866737366, "rewards/rejected": -0.2132878601551056, "sft_loss": 0.39502519369125366, "step": 1603 }, { "epoch": 2.319595083152567, "grad_norm": 2.4434556007442434, "learning_rate": 5.568672318593532e-06, "logits/chosen": -0.14153406023979187, "logits/rejected": -0.12052149325609207, "logps/chosen": -0.2865070104598999, "logps/rejected": -1.733748197555542, "loss": 0.365, "odds_ratio_loss": 0.09804748743772507, "rewards/accuracies": 1.0, "rewards/chosen": -0.02865069918334484, "rewards/margins": 0.14472413063049316, "rewards/rejected": -0.17337481677532196, "sft_loss": 0.2865070104598999, "step": 1604 }, { "epoch": 2.3210412147505424, "grad_norm": 2.6434099915570104, "learning_rate": 5.5658147714478674e-06, "logits/chosen": -0.2638757824897766, "logits/rejected": -0.15295341610908508, "logps/chosen": -0.3959035277366638, "logps/rejected": -2.8813705444335938, "loss": 0.3362, "odds_ratio_loss": 0.1301441639661789, "rewards/accuracies": 1.0, "rewards/chosen": -0.03959035128355026, "rewards/margins": 0.24854671955108643, "rewards/rejected": -0.2881370782852173, "sft_loss": 0.3959035277366638, "step": 1605 }, { "epoch": 2.3224873463485176, "grad_norm": 2.322832426601466, "learning_rate": 5.5629562802666466e-06, "logits/chosen": -0.08946573734283447, "logits/rejected": -0.08883717656135559, "logps/chosen": -0.22113816440105438, "logps/rejected": -5.388749599456787, "loss": 0.3381, "odds_ratio_loss": 0.07277220487594604, "rewards/accuracies": 1.0, "rewards/chosen": -0.022113818675279617, "rewards/margins": 0.5167611837387085, "rewards/rejected": -0.5388749837875366, "sft_loss": 0.22113816440105438, "step": 1606 }, { "epoch": 2.3239334779464933, "grad_norm": 2.072956257971344, "learning_rate": 5.5600968467732624e-06, "logits/chosen": -0.1779942810535431, "logits/rejected": -0.30677855014801025, "logps/chosen": -0.3421142101287842, "logps/rejected": -3.0264923572540283, "loss": 0.3417, "odds_ratio_loss": 0.1177440956234932, "rewards/accuracies": 1.0, "rewards/chosen": -0.0342114195227623, "rewards/margins": 0.26843780279159546, "rewards/rejected": -0.30264922976493835, "sft_loss": 0.3421142101287842, "step": 1607 }, { "epoch": 2.3253796095444685, "grad_norm": 2.1562983279773458, "learning_rate": 5.557236472691679e-06, "logits/chosen": -0.3682592511177063, "logits/rejected": -0.2512025237083435, "logps/chosen": -0.4082300066947937, "logps/rejected": -4.011061668395996, "loss": 0.3899, "odds_ratio_loss": 0.14864492416381836, "rewards/accuracies": 1.0, "rewards/chosen": -0.04082300141453743, "rewards/margins": 0.36028316617012024, "rewards/rejected": -0.40110617876052856, "sft_loss": 0.4082300066947937, "step": 1608 }, { "epoch": 2.326825741142444, "grad_norm": 2.3589668387098146, "learning_rate": 5.554375159746426e-06, "logits/chosen": -0.08552505075931549, "logits/rejected": -0.08895818889141083, "logps/chosen": -0.37686607241630554, "logps/rejected": -4.821796894073486, "loss": 0.3431, "odds_ratio_loss": 0.1013682559132576, "rewards/accuracies": 0.9375, "rewards/chosen": -0.037686608731746674, "rewards/margins": 0.4444930851459503, "rewards/rejected": -0.4821796715259552, "sft_loss": 0.37686607241630554, "step": 1609 }, { "epoch": 2.3282718727404195, "grad_norm": 2.573254384047697, "learning_rate": 5.551512909662601e-06, "logits/chosen": -0.2964475452899933, "logits/rejected": -0.2937246263027191, "logps/chosen": -0.36214813590049744, "logps/rejected": -4.1340742111206055, "loss": 0.3698, "odds_ratio_loss": 0.11668211221694946, "rewards/accuracies": 1.0, "rewards/chosen": -0.036214813590049744, "rewards/margins": 0.3771926462650299, "rewards/rejected": -0.41340741515159607, "sft_loss": 0.36214813590049744, "step": 1610 }, { "epoch": 2.3297180043383947, "grad_norm": 2.3020655070754286, "learning_rate": 5.548649724165864e-06, "logits/chosen": -0.2145930677652359, "logits/rejected": -0.21104787290096283, "logps/chosen": -0.43282127380371094, "logps/rejected": -2.7367076873779297, "loss": 0.404, "odds_ratio_loss": 0.1472308188676834, "rewards/accuracies": 1.0, "rewards/chosen": -0.04328212887048721, "rewards/margins": 0.23038864135742188, "rewards/rejected": -0.2736707925796509, "sft_loss": 0.43282127380371094, "step": 1611 }, { "epoch": 2.3311641359363704, "grad_norm": 2.51857465350935, "learning_rate": 5.545785604982441e-06, "logits/chosen": -0.2099495381116867, "logits/rejected": -0.15136927366256714, "logps/chosen": -0.34850409626960754, "logps/rejected": -3.427006244659424, "loss": 0.3156, "odds_ratio_loss": 0.13121187686920166, "rewards/accuracies": 1.0, "rewards/chosen": -0.034850407391786575, "rewards/margins": 0.3078502118587494, "rewards/rejected": -0.34270063042640686, "sft_loss": 0.34850409626960754, "step": 1612 }, { "epoch": 2.3326102675343456, "grad_norm": 2.1677477412206443, "learning_rate": 5.542920553839118e-06, "logits/chosen": -0.13443297147750854, "logits/rejected": -0.10968001186847687, "logps/chosen": -0.3220829367637634, "logps/rejected": -4.633582592010498, "loss": 0.3408, "odds_ratio_loss": 0.14093899726867676, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03220829367637634, "rewards/margins": 0.431149959564209, "rewards/rejected": -0.4633582532405853, "sft_loss": 0.3220829367637634, "step": 1613 }, { "epoch": 2.334056399132321, "grad_norm": 2.3359773826568775, "learning_rate": 5.540054572463249e-06, "logits/chosen": -0.17375683784484863, "logits/rejected": -0.05261297523975372, "logps/chosen": -0.3510321378707886, "logps/rejected": -3.398996353149414, "loss": 0.3801, "odds_ratio_loss": 0.11157586425542831, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0351032130420208, "rewards/margins": 0.3047964572906494, "rewards/rejected": -0.3398996591567993, "sft_loss": 0.3510321378707886, "step": 1614 }, { "epoch": 2.3355025307302966, "grad_norm": 2.4381191286475516, "learning_rate": 5.5371876625827405e-06, "logits/chosen": -0.00956578366458416, "logits/rejected": -0.046139203011989594, "logps/chosen": -0.3790590167045593, "logps/rejected": -3.913565158843994, "loss": 0.3705, "odds_ratio_loss": 0.16183798015117645, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03790590539574623, "rewards/margins": 0.35345059633255005, "rewards/rejected": -0.39135652780532837, "sft_loss": 0.3790590167045593, "step": 1615 }, { "epoch": 2.336948662328272, "grad_norm": 2.6120749263979466, "learning_rate": 5.534319825926066e-06, "logits/chosen": -0.1352277547121048, "logits/rejected": -0.2033660113811493, "logps/chosen": -0.46139898896217346, "logps/rejected": -2.7106308937072754, "loss": 0.4047, "odds_ratio_loss": 0.11662759631872177, "rewards/accuracies": 1.0, "rewards/chosen": -0.046139899641275406, "rewards/margins": 0.22492320835590363, "rewards/rejected": -0.27106308937072754, "sft_loss": 0.46139898896217346, "step": 1616 }, { "epoch": 2.3383947939262475, "grad_norm": 2.5091957811194825, "learning_rate": 5.531451064222254e-06, "logits/chosen": -0.4063303470611572, "logits/rejected": -0.35132384300231934, "logps/chosen": -0.30066221952438354, "logps/rejected": -2.8382208347320557, "loss": 0.3784, "odds_ratio_loss": 0.09488758444786072, "rewards/accuracies": 1.0, "rewards/chosen": -0.030066223815083504, "rewards/margins": 0.2537558674812317, "rewards/rejected": -0.28382205963134766, "sft_loss": 0.30066221952438354, "step": 1617 }, { "epoch": 2.3398409255242227, "grad_norm": 4.2557519976659055, "learning_rate": 5.528581379200892e-06, "logits/chosen": -0.1894337385892868, "logits/rejected": -0.3243355453014374, "logps/chosen": -0.5176692605018616, "logps/rejected": -2.7984094619750977, "loss": 0.3693, "odds_ratio_loss": 0.24235056340694427, "rewards/accuracies": 0.9375, "rewards/chosen": -0.051766932010650635, "rewards/margins": 0.22807404398918152, "rewards/rejected": -0.27984097599983215, "sft_loss": 0.5176692605018616, "step": 1618 }, { "epoch": 2.341287057122198, "grad_norm": 2.181643600483142, "learning_rate": 5.525710772592123e-06, "logits/chosen": -0.1662239134311676, "logits/rejected": -0.26300936937332153, "logps/chosen": -0.27158379554748535, "logps/rejected": -3.557821273803711, "loss": 0.3126, "odds_ratio_loss": 0.13274052739143372, "rewards/accuracies": 1.0, "rewards/chosen": -0.027158383280038834, "rewards/margins": 0.3286237418651581, "rewards/rejected": -0.3557821214199066, "sft_loss": 0.27158379554748535, "step": 1619 }, { "epoch": 2.3427331887201737, "grad_norm": 3.107249690135884, "learning_rate": 5.522839246126646e-06, "logits/chosen": -0.32011640071868896, "logits/rejected": -0.22637443244457245, "logps/chosen": -0.4642205238342285, "logps/rejected": -2.9727323055267334, "loss": 0.4463, "odds_ratio_loss": 0.13931193947792053, "rewards/accuracies": 1.0, "rewards/chosen": -0.04642205685377121, "rewards/margins": 0.25085121393203735, "rewards/rejected": -0.2972732186317444, "sft_loss": 0.4642205238342285, "step": 1620 }, { "epoch": 2.344179320318149, "grad_norm": 2.2401661446990166, "learning_rate": 5.519966801535716e-06, "logits/chosen": -0.3168110251426697, "logits/rejected": -0.1873689442873001, "logps/chosen": -0.36878371238708496, "logps/rejected": -3.3086607456207275, "loss": 0.4047, "odds_ratio_loss": 0.13117769360542297, "rewards/accuracies": 1.0, "rewards/chosen": -0.03687836974859238, "rewards/margins": 0.2939877212047577, "rewards/rejected": -0.33086609840393066, "sft_loss": 0.36878371238708496, "step": 1621 }, { "epoch": 2.345625451916124, "grad_norm": 2.477972330148887, "learning_rate": 5.5170934405511415e-06, "logits/chosen": -0.2881897985935211, "logits/rejected": -0.20093801617622375, "logps/chosen": -0.3505580425262451, "logps/rejected": -2.470184564590454, "loss": 0.3773, "odds_ratio_loss": 0.11334122717380524, "rewards/accuracies": 1.0, "rewards/chosen": -0.03505580127239227, "rewards/margins": 0.21196265518665314, "rewards/rejected": -0.2470184564590454, "sft_loss": 0.3505580425262451, "step": 1622 }, { "epoch": 2.3470715835141, "grad_norm": 2.620054583835237, "learning_rate": 5.514219164905281e-06, "logits/chosen": -0.19945791363716125, "logits/rejected": -0.13795584440231323, "logps/chosen": -0.34572529792785645, "logps/rejected": -3.0925025939941406, "loss": 0.3618, "odds_ratio_loss": 0.11854077130556107, "rewards/accuracies": 1.0, "rewards/chosen": -0.034572526812553406, "rewards/margins": 0.27467772364616394, "rewards/rejected": -0.30925023555755615, "sft_loss": 0.34572529792785645, "step": 1623 }, { "epoch": 2.348517715112075, "grad_norm": 2.6367388267693053, "learning_rate": 5.511343976331046e-06, "logits/chosen": -0.11850079149007797, "logits/rejected": -0.20706413686275482, "logps/chosen": -0.238744854927063, "logps/rejected": -4.866823196411133, "loss": 0.3102, "odds_ratio_loss": 0.05267883464694023, "rewards/accuracies": 1.0, "rewards/chosen": -0.02387448400259018, "rewards/margins": 0.462807834148407, "rewards/rejected": -0.48668232560157776, "sft_loss": 0.238744854927063, "step": 1624 }, { "epoch": 2.3499638467100508, "grad_norm": 2.4421056602770745, "learning_rate": 5.5084678765618994e-06, "logits/chosen": -0.037788279354572296, "logits/rejected": -0.1496773064136505, "logps/chosen": -0.2970682382583618, "logps/rejected": -5.067946434020996, "loss": 0.3392, "odds_ratio_loss": 0.13825398683547974, "rewards/accuracies": 1.0, "rewards/chosen": -0.02970682457089424, "rewards/margins": 0.4770878553390503, "rewards/rejected": -0.5067946314811707, "sft_loss": 0.2970682382583618, "step": 1625 }, { "epoch": 2.351409978308026, "grad_norm": 3.113000332950432, "learning_rate": 5.505590867331852e-06, "logits/chosen": -0.004002872854471207, "logits/rejected": 0.004178255796432495, "logps/chosen": -0.26595339179039, "logps/rejected": -3.0875930786132812, "loss": 0.3252, "odds_ratio_loss": 0.12349444627761841, "rewards/accuracies": 1.0, "rewards/chosen": -0.02659534104168415, "rewards/margins": 0.28216397762298584, "rewards/rejected": -0.30875933170318604, "sft_loss": 0.26595339179039, "step": 1626 }, { "epoch": 2.3528561099060017, "grad_norm": 3.2436647624885144, "learning_rate": 5.502712950375462e-06, "logits/chosen": -0.27105289697647095, "logits/rejected": -0.2777266204357147, "logps/chosen": -0.29128292202949524, "logps/rejected": -3.751927375793457, "loss": 0.356, "odds_ratio_loss": 0.08841782808303833, "rewards/accuracies": 1.0, "rewards/chosen": -0.029128294438123703, "rewards/margins": 0.3460644483566284, "rewards/rejected": -0.3751927316188812, "sft_loss": 0.29128292202949524, "step": 1627 }, { "epoch": 2.354302241503977, "grad_norm": 2.247157821160996, "learning_rate": 5.499834127427839e-06, "logits/chosen": -0.23087726533412933, "logits/rejected": -0.21010765433311462, "logps/chosen": -0.28846973180770874, "logps/rejected": -2.382047653198242, "loss": 0.3048, "odds_ratio_loss": 0.1609227955341339, "rewards/accuracies": 0.875, "rewards/chosen": -0.028846973553299904, "rewards/margins": 0.20935779809951782, "rewards/rejected": -0.23820477724075317, "sft_loss": 0.28846973180770874, "step": 1628 }, { "epoch": 2.355748373101952, "grad_norm": 2.3409386872383005, "learning_rate": 5.4969544002246355e-06, "logits/chosen": -0.13611923158168793, "logits/rejected": -0.17130246758460999, "logps/chosen": -0.38131436705589294, "logps/rejected": -2.489086389541626, "loss": 0.3294, "odds_ratio_loss": 0.10580040514469147, "rewards/accuracies": 1.0, "rewards/chosen": -0.038131434470415115, "rewards/margins": 0.21077720820903778, "rewards/rejected": -0.2489086389541626, "sft_loss": 0.38131436705589294, "step": 1629 }, { "epoch": 2.357194504699928, "grad_norm": 2.2600185027758273, "learning_rate": 5.494073770502046e-06, "logits/chosen": -0.17199571430683136, "logits/rejected": -0.29529863595962524, "logps/chosen": -0.3580894470214844, "logps/rejected": -1.801833152770996, "loss": 0.3892, "odds_ratio_loss": 0.19995692372322083, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03580894321203232, "rewards/margins": 0.14437437057495117, "rewards/rejected": -0.1801833212375641, "sft_loss": 0.3580894470214844, "step": 1630 }, { "epoch": 2.358640636297903, "grad_norm": 2.1930278604061755, "learning_rate": 5.4911922399968175e-06, "logits/chosen": -0.2684285640716553, "logits/rejected": -0.20364916324615479, "logps/chosen": -0.41383859515190125, "logps/rejected": -2.9595816135406494, "loss": 0.3904, "odds_ratio_loss": 0.12188836932182312, "rewards/accuracies": 1.0, "rewards/chosen": -0.04138386249542236, "rewards/margins": 0.2545742988586426, "rewards/rejected": -0.29595816135406494, "sft_loss": 0.41383859515190125, "step": 1631 }, { "epoch": 2.3600867678958783, "grad_norm": 2.1821309968908364, "learning_rate": 5.488309810446233e-06, "logits/chosen": -0.06782963871955872, "logits/rejected": -0.2602359652519226, "logps/chosen": -0.2693048417568207, "logps/rejected": -6.274295330047607, "loss": 0.3377, "odds_ratio_loss": 0.12266825139522552, "rewards/accuracies": 0.9375, "rewards/chosen": -0.026930484920740128, "rewards/margins": 0.6004990339279175, "rewards/rejected": -0.6274295449256897, "sft_loss": 0.2693048417568207, "step": 1632 }, { "epoch": 2.361532899493854, "grad_norm": 2.279263386951824, "learning_rate": 5.485426483588121e-06, "logits/chosen": -0.25602948665618896, "logits/rejected": -0.21806968748569489, "logps/chosen": -0.3889194428920746, "logps/rejected": -2.4275267124176025, "loss": 0.369, "odds_ratio_loss": 0.11265772581100464, "rewards/accuracies": 1.0, "rewards/chosen": -0.03889194503426552, "rewards/margins": 0.20386072993278503, "rewards/rejected": -0.24275267124176025, "sft_loss": 0.3889194428920746, "step": 1633 }, { "epoch": 2.3629790310918293, "grad_norm": 2.4595078931890475, "learning_rate": 5.482542261160849e-06, "logits/chosen": -0.17380627989768982, "logits/rejected": -0.06503782421350479, "logps/chosen": -0.33673372864723206, "logps/rejected": -2.4677717685699463, "loss": 0.3231, "odds_ratio_loss": 0.12649573385715485, "rewards/accuracies": 1.0, "rewards/chosen": -0.033673375844955444, "rewards/margins": 0.21310380101203918, "rewards/rejected": -0.24677720665931702, "sft_loss": 0.33673372864723206, "step": 1634 }, { "epoch": 2.364425162689805, "grad_norm": 2.1872419569650052, "learning_rate": 5.479657144903327e-06, "logits/chosen": -0.30153438448905945, "logits/rejected": -0.22050045430660248, "logps/chosen": -0.31074702739715576, "logps/rejected": -2.7804017066955566, "loss": 0.3438, "odds_ratio_loss": 0.09722352027893066, "rewards/accuracies": 1.0, "rewards/chosen": -0.031074702739715576, "rewards/margins": 0.24696548283100128, "rewards/rejected": -0.27804017066955566, "sft_loss": 0.31074702739715576, "step": 1635 }, { "epoch": 2.36587129428778, "grad_norm": 2.370571176585605, "learning_rate": 5.476771136555002e-06, "logits/chosen": -0.20978805422782898, "logits/rejected": -0.404296338558197, "logps/chosen": -0.3532348573207855, "logps/rejected": -2.493743658065796, "loss": 0.4171, "odds_ratio_loss": 0.16929349303245544, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03532348573207855, "rewards/margins": 0.21405087411403656, "rewards/rejected": -0.2493743747472763, "sft_loss": 0.3532348573207855, "step": 1636 }, { "epoch": 2.3673174258857554, "grad_norm": 2.0667254729152478, "learning_rate": 5.4738842378558596e-06, "logits/chosen": -0.304945170879364, "logits/rejected": -0.3427213430404663, "logps/chosen": -0.3743901550769806, "logps/rejected": -3.3770601749420166, "loss": 0.2937, "odds_ratio_loss": 0.16788852214813232, "rewards/accuracies": 1.0, "rewards/chosen": -0.03743901476264, "rewards/margins": 0.3002670109272003, "rewards/rejected": -0.3377059996128082, "sft_loss": 0.3743901550769806, "step": 1637 }, { "epoch": 2.368763557483731, "grad_norm": 2.425960860579774, "learning_rate": 5.470996450546419e-06, "logits/chosen": -0.11950647830963135, "logits/rejected": -0.08787774294614792, "logps/chosen": -0.1783597767353058, "logps/rejected": -3.924421787261963, "loss": 0.3464, "odds_ratio_loss": 0.088666170835495, "rewards/accuracies": 1.0, "rewards/chosen": -0.01783597841858864, "rewards/margins": 0.37460625171661377, "rewards/rejected": -0.3924421966075897, "sft_loss": 0.1783597767353058, "step": 1638 }, { "epoch": 2.3702096890817064, "grad_norm": 2.4264887671943516, "learning_rate": 5.46810777636774e-06, "logits/chosen": -0.16909067332744598, "logits/rejected": -0.26136305928230286, "logps/chosen": -0.4223901629447937, "logps/rejected": -3.608455181121826, "loss": 0.3747, "odds_ratio_loss": 0.06967879831790924, "rewards/accuracies": 1.0, "rewards/chosen": -0.04223902150988579, "rewards/margins": 0.31860652565956116, "rewards/rejected": -0.36084550619125366, "sft_loss": 0.4223901629447937, "step": 1639 }, { "epoch": 2.371655820679682, "grad_norm": 2.8905963601226983, "learning_rate": 5.465218217061415e-06, "logits/chosen": -0.25168269872665405, "logits/rejected": -0.36768415570259094, "logps/chosen": -0.29154396057128906, "logps/rejected": -4.554935455322266, "loss": 0.3803, "odds_ratio_loss": 0.09950277209281921, "rewards/accuracies": 1.0, "rewards/chosen": -0.029154395684599876, "rewards/margins": 0.42633917927742004, "rewards/rejected": -0.4554935693740845, "sft_loss": 0.29154396057128906, "step": 1640 }, { "epoch": 2.3731019522776573, "grad_norm": 2.3523138704511686, "learning_rate": 5.46232777436957e-06, "logits/chosen": -0.3319550156593323, "logits/rejected": -0.25737500190734863, "logps/chosen": -0.35822594165802, "logps/rejected": -4.630117893218994, "loss": 0.3781, "odds_ratio_loss": 0.08917144685983658, "rewards/accuracies": 1.0, "rewards/chosen": -0.03582259640097618, "rewards/margins": 0.4271891713142395, "rewards/rejected": -0.46301180124282837, "sft_loss": 0.35822594165802, "step": 1641 }, { "epoch": 2.3745480838756325, "grad_norm": 2.244224679372636, "learning_rate": 5.4594364500348635e-06, "logits/chosen": -0.2159147709608078, "logits/rejected": -0.1707444190979004, "logps/chosen": -0.1780027151107788, "logps/rejected": -5.523977279663086, "loss": 0.3182, "odds_ratio_loss": 0.062143001705408096, "rewards/accuracies": 1.0, "rewards/chosen": -0.01780027337372303, "rewards/margins": 0.5345974564552307, "rewards/rejected": -0.5523977279663086, "sft_loss": 0.1780027151107788, "step": 1642 }, { "epoch": 2.3759942154736082, "grad_norm": 2.97272951082299, "learning_rate": 5.456544245800486e-06, "logits/chosen": -0.43455255031585693, "logits/rejected": -0.26484546065330505, "logps/chosen": -0.2586551010608673, "logps/rejected": -4.144586086273193, "loss": 0.3175, "odds_ratio_loss": 0.0730627104640007, "rewards/accuracies": 1.0, "rewards/chosen": -0.02586551196873188, "rewards/margins": 0.38859307765960693, "rewards/rejected": -0.41445863246917725, "sft_loss": 0.2586551010608673, "step": 1643 }, { "epoch": 2.3774403470715835, "grad_norm": 3.159824614056135, "learning_rate": 5.453651163410157e-06, "logits/chosen": -0.2961457669734955, "logits/rejected": -0.3236675262451172, "logps/chosen": -0.4048958718776703, "logps/rejected": -5.435911178588867, "loss": 0.3044, "odds_ratio_loss": 0.07996393740177155, "rewards/accuracies": 1.0, "rewards/chosen": -0.04048958793282509, "rewards/margins": 0.5031015872955322, "rewards/rejected": -0.5435911417007446, "sft_loss": 0.4048958718776703, "step": 1644 }, { "epoch": 2.3788864786695587, "grad_norm": 2.4232305825130873, "learning_rate": 5.45075720460813e-06, "logits/chosen": -0.2177274227142334, "logits/rejected": -0.2560899257659912, "logps/chosen": -0.39400598406791687, "logps/rejected": -3.329862594604492, "loss": 0.3572, "odds_ratio_loss": 0.17939239740371704, "rewards/accuracies": 0.9375, "rewards/chosen": -0.039400599896907806, "rewards/margins": 0.2935856580734253, "rewards/rejected": -0.3329862654209137, "sft_loss": 0.39400598406791687, "step": 1645 }, { "epoch": 2.3803326102675344, "grad_norm": 2.3769012497590136, "learning_rate": 5.4478623711391785e-06, "logits/chosen": -0.41154927015304565, "logits/rejected": -0.1991625726222992, "logps/chosen": -0.26996538043022156, "logps/rejected": -3.1177456378936768, "loss": 0.3654, "odds_ratio_loss": 0.10832569748163223, "rewards/accuracies": 0.9375, "rewards/chosen": -0.026996538043022156, "rewards/margins": 0.28477805852890015, "rewards/rejected": -0.3117745816707611, "sft_loss": 0.26996538043022156, "step": 1646 }, { "epoch": 2.3817787418655096, "grad_norm": 2.9369047044268206, "learning_rate": 5.4449666647486125e-06, "logits/chosen": -0.13625237345695496, "logits/rejected": -0.08367015421390533, "logps/chosen": -0.43870049715042114, "logps/rejected": -2.0339059829711914, "loss": 0.4882, "odds_ratio_loss": 0.16639426350593567, "rewards/accuracies": 1.0, "rewards/chosen": -0.043870046734809875, "rewards/margins": 0.15952055156230927, "rewards/rejected": -0.20339059829711914, "sft_loss": 0.43870049715042114, "step": 1647 }, { "epoch": 2.3832248734634853, "grad_norm": 6.563565558670957, "learning_rate": 5.4420700871822616e-06, "logits/chosen": -0.25511327385902405, "logits/rejected": -0.1838105469942093, "logps/chosen": -0.46203696727752686, "logps/rejected": -4.043597221374512, "loss": 0.3845, "odds_ratio_loss": 0.19478821754455566, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04620370268821716, "rewards/margins": 0.3581559956073761, "rewards/rejected": -0.40435969829559326, "sft_loss": 0.46203696727752686, "step": 1648 }, { "epoch": 2.3846710050614606, "grad_norm": 2.4494751986770646, "learning_rate": 5.439172640186484e-06, "logits/chosen": -0.07603715360164642, "logits/rejected": -0.08391943573951721, "logps/chosen": -0.47998425364494324, "logps/rejected": -4.149824142456055, "loss": 0.3811, "odds_ratio_loss": 0.16126686334609985, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04799842834472656, "rewards/margins": 0.36698397994041443, "rewards/rejected": -0.4149824380874634, "sft_loss": 0.47998425364494324, "step": 1649 }, { "epoch": 2.3861171366594363, "grad_norm": 3.217436034110034, "learning_rate": 5.436274325508164e-06, "logits/chosen": -0.1692204475402832, "logits/rejected": -0.18024393916130066, "logps/chosen": -0.3294297754764557, "logps/rejected": -2.8294403553009033, "loss": 0.3719, "odds_ratio_loss": 0.11639772355556488, "rewards/accuracies": 1.0, "rewards/chosen": -0.03294298052787781, "rewards/margins": 0.25000107288360596, "rewards/rejected": -0.28294405341148376, "sft_loss": 0.3294297754764557, "step": 1650 }, { "epoch": 2.3875632682574115, "grad_norm": 2.43631615840935, "learning_rate": 5.433375144894701e-06, "logits/chosen": -0.21938952803611755, "logits/rejected": -0.1646469086408615, "logps/chosen": -0.2409743219614029, "logps/rejected": -4.187996864318848, "loss": 0.3565, "odds_ratio_loss": 0.03713899105787277, "rewards/accuracies": 1.0, "rewards/chosen": -0.02409743145108223, "rewards/margins": 0.3947022259235382, "rewards/rejected": -0.4187996983528137, "sft_loss": 0.2409743219614029, "step": 1651 }, { "epoch": 2.3890093998553867, "grad_norm": 2.6636340250404253, "learning_rate": 5.430475100094026e-06, "logits/chosen": -0.16366271674633026, "logits/rejected": -0.2531028091907501, "logps/chosen": -0.3762364983558655, "logps/rejected": -4.51442289352417, "loss": 0.3527, "odds_ratio_loss": 0.1351589560508728, "rewards/accuracies": 1.0, "rewards/chosen": -0.03762365132570267, "rewards/margins": 0.4138185977935791, "rewards/rejected": -0.45144224166870117, "sft_loss": 0.3762364983558655, "step": 1652 }, { "epoch": 2.3904555314533624, "grad_norm": 2.1635789447744815, "learning_rate": 5.427574192854586e-06, "logits/chosen": -0.18240058422088623, "logits/rejected": -0.12168880552053452, "logps/chosen": -0.3345116376876831, "logps/rejected": -3.058701753616333, "loss": 0.3597, "odds_ratio_loss": 0.09098277986049652, "rewards/accuracies": 1.0, "rewards/chosen": -0.03345116227865219, "rewards/margins": 0.2724190056324005, "rewards/rejected": -0.3058701753616333, "sft_loss": 0.3345116376876831, "step": 1653 }, { "epoch": 2.3919016630513377, "grad_norm": 2.0656474748307363, "learning_rate": 5.424672424925347e-06, "logits/chosen": -0.13181257247924805, "logits/rejected": -0.0836930125951767, "logps/chosen": -0.2531171441078186, "logps/rejected": -3.279249906539917, "loss": 0.3388, "odds_ratio_loss": 0.08992984890937805, "rewards/accuracies": 1.0, "rewards/chosen": -0.02531171217560768, "rewards/margins": 0.3026133179664612, "rewards/rejected": -0.32792502641677856, "sft_loss": 0.2531171441078186, "step": 1654 }, { "epoch": 2.393347794649313, "grad_norm": 2.2867887869624126, "learning_rate": 5.4217697980557986e-06, "logits/chosen": -0.18714286386966705, "logits/rejected": -0.2178667187690735, "logps/chosen": -0.456027090549469, "logps/rejected": -3.494032621383667, "loss": 0.3659, "odds_ratio_loss": 0.1177714616060257, "rewards/accuracies": 1.0, "rewards/chosen": -0.0456027090549469, "rewards/margins": 0.3038005530834198, "rewards/rejected": -0.3494032621383667, "sft_loss": 0.456027090549469, "step": 1655 }, { "epoch": 2.3947939262472886, "grad_norm": 2.385763157906236, "learning_rate": 5.418866313995942e-06, "logits/chosen": -0.14378896355628967, "logits/rejected": -0.02609366364777088, "logps/chosen": -0.33504900336265564, "logps/rejected": -2.4867379665374756, "loss": 0.3178, "odds_ratio_loss": 0.10914792120456696, "rewards/accuracies": 1.0, "rewards/chosen": -0.033504899591207504, "rewards/margins": 0.21516892313957214, "rewards/rejected": -0.24867381155490875, "sft_loss": 0.33504900336265564, "step": 1656 }, { "epoch": 2.396240057845264, "grad_norm": 2.5382250015062793, "learning_rate": 5.415961974496303e-06, "logits/chosen": -0.27873504161834717, "logits/rejected": -0.10101763904094696, "logps/chosen": -0.5347107648849487, "logps/rejected": -2.174900531768799, "loss": 0.4443, "odds_ratio_loss": 0.18945449590682983, "rewards/accuracies": 1.0, "rewards/chosen": -0.05347108095884323, "rewards/margins": 0.16401898860931396, "rewards/rejected": -0.2174900472164154, "sft_loss": 0.5347107648849487, "step": 1657 }, { "epoch": 2.3976861894432395, "grad_norm": 2.094994084526692, "learning_rate": 5.413056781307913e-06, "logits/chosen": -0.09330153465270996, "logits/rejected": -0.0472065694630146, "logps/chosen": -0.34694916009902954, "logps/rejected": -2.9058146476745605, "loss": 0.427, "odds_ratio_loss": 0.10710655152797699, "rewards/accuracies": 1.0, "rewards/chosen": -0.03469491004943848, "rewards/margins": 0.2558865547180176, "rewards/rejected": -0.29058146476745605, "sft_loss": 0.34694916009902954, "step": 1658 }, { "epoch": 2.3991323210412148, "grad_norm": 2.3038775902562425, "learning_rate": 5.4101507361823276e-06, "logits/chosen": -0.3557550609111786, "logits/rejected": -0.37921270728111267, "logps/chosen": -0.3219497799873352, "logps/rejected": -3.9622740745544434, "loss": 0.3468, "odds_ratio_loss": 0.13163802027702332, "rewards/accuracies": 1.0, "rewards/chosen": -0.03219497948884964, "rewards/margins": 0.36403244733810425, "rewards/rejected": -0.3962274193763733, "sft_loss": 0.3219497799873352, "step": 1659 }, { "epoch": 2.40057845263919, "grad_norm": 2.3419223245541727, "learning_rate": 5.407243840871612e-06, "logits/chosen": -0.20818328857421875, "logits/rejected": -0.16019529104232788, "logps/chosen": -0.39010053873062134, "logps/rejected": -2.511380672454834, "loss": 0.363, "odds_ratio_loss": 0.1622430980205536, "rewards/accuracies": 1.0, "rewards/chosen": -0.039010051637887955, "rewards/margins": 0.21212802827358246, "rewards/rejected": -0.2511380910873413, "sft_loss": 0.39010053873062134, "step": 1660 }, { "epoch": 2.4020245842371657, "grad_norm": 4.380688001278866, "learning_rate": 5.404336097128343e-06, "logits/chosen": -0.2879038155078888, "logits/rejected": -0.30223992466926575, "logps/chosen": -0.4455604553222656, "logps/rejected": -2.5138981342315674, "loss": 0.3703, "odds_ratio_loss": 0.15395966172218323, "rewards/accuracies": 1.0, "rewards/chosen": -0.04455604776740074, "rewards/margins": 0.20683377981185913, "rewards/rejected": -0.25138983130455017, "sft_loss": 0.4455604553222656, "step": 1661 }, { "epoch": 2.403470715835141, "grad_norm": 2.3187283533283938, "learning_rate": 5.401427506705611e-06, "logits/chosen": -0.08860334753990173, "logits/rejected": -0.08838482946157455, "logps/chosen": -0.2660841941833496, "logps/rejected": -3.3898189067840576, "loss": 0.4052, "odds_ratio_loss": 0.09372745454311371, "rewards/accuracies": 1.0, "rewards/chosen": -0.0266084186732769, "rewards/margins": 0.31237348914146423, "rewards/rejected": -0.33898189663887024, "sft_loss": 0.2660841941833496, "step": 1662 }, { "epoch": 2.4049168474331166, "grad_norm": 2.7227957969637595, "learning_rate": 5.398518071357015e-06, "logits/chosen": -0.1172524020075798, "logits/rejected": -0.33546555042266846, "logps/chosen": -0.37985527515411377, "logps/rejected": -2.7703769207000732, "loss": 0.4467, "odds_ratio_loss": 0.2563801407814026, "rewards/accuracies": 0.9375, "rewards/chosen": -0.037985529750585556, "rewards/margins": 0.2390521615743637, "rewards/rejected": -0.27703768014907837, "sft_loss": 0.37985527515411377, "step": 1663 }, { "epoch": 2.406362979031092, "grad_norm": 2.773528286720876, "learning_rate": 5.395607792836667e-06, "logits/chosen": -0.20116093754768372, "logits/rejected": -0.11985060572624207, "logps/chosen": -0.3083297610282898, "logps/rejected": -1.3712188005447388, "loss": 0.3717, "odds_ratio_loss": 0.17615623772144318, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03083297424018383, "rewards/margins": 0.10628890991210938, "rewards/rejected": -0.13712188601493835, "sft_loss": 0.3083297610282898, "step": 1664 }, { "epoch": 2.407809110629067, "grad_norm": 2.3754850371079197, "learning_rate": 5.392696672899181e-06, "logits/chosen": -0.3145734667778015, "logits/rejected": -0.23199528455734253, "logps/chosen": -0.26135680079460144, "logps/rejected": -3.4864580631256104, "loss": 0.3643, "odds_ratio_loss": 0.06875382363796234, "rewards/accuracies": 1.0, "rewards/chosen": -0.026135679334402084, "rewards/margins": 0.32251012325286865, "rewards/rejected": -0.34864580631256104, "sft_loss": 0.26135680079460144, "step": 1665 }, { "epoch": 2.409255242227043, "grad_norm": 2.497467322183816, "learning_rate": 5.389784713299686e-06, "logits/chosen": -0.07668744772672653, "logits/rejected": -0.007611650973558426, "logps/chosen": -0.21216994524002075, "logps/rejected": -4.282060146331787, "loss": 0.303, "odds_ratio_loss": 0.1205163523554802, "rewards/accuracies": 1.0, "rewards/chosen": -0.021216996014118195, "rewards/margins": 0.40698903799057007, "rewards/rejected": -0.4282059967517853, "sft_loss": 0.21216994524002075, "step": 1666 }, { "epoch": 2.410701373825018, "grad_norm": 2.1520905912288177, "learning_rate": 5.386871915793809e-06, "logits/chosen": -0.2075837105512619, "logits/rejected": -0.1876615285873413, "logps/chosen": -0.2702080309391022, "logps/rejected": -5.139091968536377, "loss": 0.3105, "odds_ratio_loss": 0.1293424367904663, "rewards/accuracies": 1.0, "rewards/chosen": -0.027020802721381187, "rewards/margins": 0.48688840866088867, "rewards/rejected": -0.5139092206954956, "sft_loss": 0.2702080309391022, "step": 1667 }, { "epoch": 2.4121475054229933, "grad_norm": 3.2146716999123193, "learning_rate": 5.383958282137691e-06, "logits/chosen": -0.12676213681697845, "logits/rejected": -0.1635175496339798, "logps/chosen": -0.2572956681251526, "logps/rejected": -3.4657866954803467, "loss": 0.39, "odds_ratio_loss": 0.10569733381271362, "rewards/accuracies": 1.0, "rewards/chosen": -0.025729568675160408, "rewards/margins": 0.32084912061691284, "rewards/rejected": -0.3465786874294281, "sft_loss": 0.2572956681251526, "step": 1668 }, { "epoch": 2.413593637020969, "grad_norm": 2.3616969258191065, "learning_rate": 5.381043814087968e-06, "logits/chosen": -0.38791531324386597, "logits/rejected": -0.27840742468833923, "logps/chosen": -0.3243800401687622, "logps/rejected": -3.7442002296447754, "loss": 0.3139, "odds_ratio_loss": 0.10507690906524658, "rewards/accuracies": 1.0, "rewards/chosen": -0.0324380025267601, "rewards/margins": 0.34198203682899475, "rewards/rejected": -0.37442004680633545, "sft_loss": 0.3243800401687622, "step": 1669 }, { "epoch": 2.415039768618944, "grad_norm": 2.2888964516476467, "learning_rate": 5.3781285134017865e-06, "logits/chosen": -0.25714176893234253, "logits/rejected": -0.1979777216911316, "logps/chosen": -0.3115207552909851, "logps/rejected": -3.852271556854248, "loss": 0.3226, "odds_ratio_loss": 0.09582686424255371, "rewards/accuracies": 1.0, "rewards/chosen": -0.03115207701921463, "rewards/margins": 0.3540751039981842, "rewards/rejected": -0.3852272033691406, "sft_loss": 0.3115207552909851, "step": 1670 }, { "epoch": 2.41648590021692, "grad_norm": 2.041066759979958, "learning_rate": 5.375212381836793e-06, "logits/chosen": -0.10201995074748993, "logits/rejected": -0.08090163767337799, "logps/chosen": -0.3471717834472656, "logps/rejected": -5.151330947875977, "loss": 0.3825, "odds_ratio_loss": 0.08015061914920807, "rewards/accuracies": 1.0, "rewards/chosen": -0.03471717610955238, "rewards/margins": 0.480415940284729, "rewards/rejected": -0.5151331424713135, "sft_loss": 0.3471717834472656, "step": 1671 }, { "epoch": 2.417932031814895, "grad_norm": 2.402521519996911, "learning_rate": 5.3722954211511314e-06, "logits/chosen": -0.20054322481155396, "logits/rejected": -0.3008997440338135, "logps/chosen": -0.19180569052696228, "logps/rejected": -4.623170375823975, "loss": 0.3376, "odds_ratio_loss": 0.032107461243867874, "rewards/accuracies": 1.0, "rewards/chosen": -0.019180569797754288, "rewards/margins": 0.4431364834308624, "rewards/rejected": -0.4623170495033264, "sft_loss": 0.19180569052696228, "step": 1672 }, { "epoch": 2.419378163412871, "grad_norm": 2.8755560314390736, "learning_rate": 5.369377633103449e-06, "logits/chosen": -0.24951305985450745, "logits/rejected": -0.16665250062942505, "logps/chosen": -0.21055331826210022, "logps/rejected": -2.9385476112365723, "loss": 0.3952, "odds_ratio_loss": 0.0641937106847763, "rewards/accuracies": 1.0, "rewards/chosen": -0.021055329591035843, "rewards/margins": 0.27279946208000183, "rewards/rejected": -0.2938547730445862, "sft_loss": 0.21055331826210022, "step": 1673 }, { "epoch": 2.420824295010846, "grad_norm": 2.2612662097686047, "learning_rate": 5.366459019452893e-06, "logits/chosen": -0.3434632122516632, "logits/rejected": -0.21899384260177612, "logps/chosen": -0.38703057169914246, "logps/rejected": -2.9113190174102783, "loss": 0.3515, "odds_ratio_loss": 0.15410572290420532, "rewards/accuracies": 1.0, "rewards/chosen": -0.038703057914972305, "rewards/margins": 0.2524288594722748, "rewards/rejected": -0.2911319136619568, "sft_loss": 0.38703057169914246, "step": 1674 }, { "epoch": 2.4222704266088213, "grad_norm": 2.1759070085509067, "learning_rate": 5.363539581959102e-06, "logits/chosen": -0.3367041349411011, "logits/rejected": -0.2844616770744324, "logps/chosen": -0.39401325583457947, "logps/rejected": -2.856097459793091, "loss": 0.406, "odds_ratio_loss": 0.16076377034187317, "rewards/accuracies": 1.0, "rewards/chosen": -0.03940132260322571, "rewards/margins": 0.24620842933654785, "rewards/rejected": -0.28560978174209595, "sft_loss": 0.39401325583457947, "step": 1675 }, { "epoch": 2.423716558206797, "grad_norm": 4.34686480463768, "learning_rate": 5.3606193223822215e-06, "logits/chosen": -0.28801190853118896, "logits/rejected": -0.31603163480758667, "logps/chosen": -0.3019119203090668, "logps/rejected": -3.840654134750366, "loss": 0.3827, "odds_ratio_loss": 0.07661937922239304, "rewards/accuracies": 1.0, "rewards/chosen": -0.030191190540790558, "rewards/margins": 0.35387420654296875, "rewards/rejected": -0.3840653896331787, "sft_loss": 0.3019119203090668, "step": 1676 }, { "epoch": 2.4251626898047722, "grad_norm": 2.2347029036364083, "learning_rate": 5.357698242482884e-06, "logits/chosen": -0.30417659878730774, "logits/rejected": -0.16208516061306, "logps/chosen": -0.3697463870048523, "logps/rejected": -5.748754978179932, "loss": 0.3974, "odds_ratio_loss": 0.1115613579750061, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03697463870048523, "rewards/margins": 0.5379008054733276, "rewards/rejected": -0.5748754739761353, "sft_loss": 0.3697463870048523, "step": 1677 }, { "epoch": 2.4266088214027475, "grad_norm": 2.746152164966934, "learning_rate": 5.354776344022219e-06, "logits/chosen": -0.2900420129299164, "logits/rejected": -0.23626813292503357, "logps/chosen": -0.2787986993789673, "logps/rejected": -2.1767630577087402, "loss": 0.3304, "odds_ratio_loss": 0.120830237865448, "rewards/accuracies": 1.0, "rewards/chosen": -0.027879871428012848, "rewards/margins": 0.18979641795158386, "rewards/rejected": -0.2176763117313385, "sft_loss": 0.2787986993789673, "step": 1678 }, { "epoch": 2.428054953000723, "grad_norm": 2.620208103065728, "learning_rate": 5.35185362876185e-06, "logits/chosen": -0.3657112121582031, "logits/rejected": -0.3040888011455536, "logps/chosen": -0.43680983781814575, "logps/rejected": -2.43369197845459, "loss": 0.3569, "odds_ratio_loss": 0.1762293428182602, "rewards/accuracies": 1.0, "rewards/chosen": -0.04368098825216293, "rewards/margins": 0.19968819618225098, "rewards/rejected": -0.2433691918849945, "sft_loss": 0.43680983781814575, "step": 1679 }, { "epoch": 2.4295010845986984, "grad_norm": 2.4352727537772316, "learning_rate": 5.348930098463894e-06, "logits/chosen": -0.6010982394218445, "logits/rejected": -0.30227330327033997, "logps/chosen": -0.3067004978656769, "logps/rejected": -3.6038007736206055, "loss": 0.3692, "odds_ratio_loss": 0.06560537964105606, "rewards/accuracies": 1.0, "rewards/chosen": -0.030670054256916046, "rewards/margins": 0.3297100067138672, "rewards/rejected": -0.36038005352020264, "sft_loss": 0.3067004978656769, "step": 1680 }, { "epoch": 2.430947216196674, "grad_norm": 2.4907614571390337, "learning_rate": 5.346005754890956e-06, "logits/chosen": -0.40090346336364746, "logits/rejected": -0.18178865313529968, "logps/chosen": -0.3909543752670288, "logps/rejected": -2.7736334800720215, "loss": 0.4267, "odds_ratio_loss": 0.09884399175643921, "rewards/accuracies": 1.0, "rewards/chosen": -0.039095439016819, "rewards/margins": 0.2382679283618927, "rewards/rejected": -0.2773633599281311, "sft_loss": 0.3909543752670288, "step": 1681 }, { "epoch": 2.4323933477946493, "grad_norm": 2.9164477390226398, "learning_rate": 5.3430805998061375e-06, "logits/chosen": -0.24474173784255981, "logits/rejected": -0.21296316385269165, "logps/chosen": -0.4344663918018341, "logps/rejected": -3.148064136505127, "loss": 0.3499, "odds_ratio_loss": 0.09581369161605835, "rewards/accuracies": 1.0, "rewards/chosen": -0.04344664514064789, "rewards/margins": 0.27135974168777466, "rewards/rejected": -0.31480640172958374, "sft_loss": 0.4344663918018341, "step": 1682 }, { "epoch": 2.4338394793926246, "grad_norm": 2.1284693896066735, "learning_rate": 5.340154634973023e-06, "logits/chosen": -0.18710467219352722, "logits/rejected": -0.18588191270828247, "logps/chosen": -0.3472820818424225, "logps/rejected": -2.692941427230835, "loss": 0.4106, "odds_ratio_loss": 0.12289082258939743, "rewards/accuracies": 1.0, "rewards/chosen": -0.03472820669412613, "rewards/margins": 0.23456594347953796, "rewards/rejected": -0.2692941427230835, "sft_loss": 0.3472820818424225, "step": 1683 }, { "epoch": 2.4352856109906003, "grad_norm": 2.467601888547069, "learning_rate": 5.337227862155687e-06, "logits/chosen": -0.13609479367733002, "logits/rejected": -0.135749951004982, "logps/chosen": -0.35460272431373596, "logps/rejected": -3.266855478286743, "loss": 0.3055, "odds_ratio_loss": 0.12023431807756424, "rewards/accuracies": 1.0, "rewards/chosen": -0.035460274666547775, "rewards/margins": 0.29122528433799744, "rewards/rejected": -0.3266855478286743, "sft_loss": 0.35460272431373596, "step": 1684 }, { "epoch": 2.4367317425885755, "grad_norm": 3.181573666104813, "learning_rate": 5.334300283118692e-06, "logits/chosen": -0.20146742463111877, "logits/rejected": -0.2168891727924347, "logps/chosen": -0.5368006825447083, "logps/rejected": -3.477538824081421, "loss": 0.3767, "odds_ratio_loss": 0.1835523396730423, "rewards/accuracies": 0.9375, "rewards/chosen": -0.053680069744586945, "rewards/margins": 0.29407382011413574, "rewards/rejected": -0.3477538824081421, "sft_loss": 0.5368006825447083, "step": 1685 }, { "epoch": 2.438177874186551, "grad_norm": 2.23714776177442, "learning_rate": 5.331371899627088e-06, "logits/chosen": -0.10427071899175644, "logits/rejected": -0.18009522557258606, "logps/chosen": -0.3216210603713989, "logps/rejected": -4.574368000030518, "loss": 0.3989, "odds_ratio_loss": 0.09731189906597137, "rewards/accuracies": 1.0, "rewards/chosen": -0.03216210752725601, "rewards/margins": 0.42527472972869873, "rewards/rejected": -0.45743680000305176, "sft_loss": 0.3216210603713989, "step": 1686 }, { "epoch": 2.4396240057845264, "grad_norm": 2.070128394841874, "learning_rate": 5.328442713446407e-06, "logits/chosen": -0.16747835278511047, "logits/rejected": -0.25303810834884644, "logps/chosen": -0.2828282415866852, "logps/rejected": -2.099355459213257, "loss": 0.3438, "odds_ratio_loss": 0.1440654695034027, "rewards/accuracies": 0.9375, "rewards/chosen": -0.028282826766371727, "rewards/margins": 0.1816527247428894, "rewards/rejected": -0.20993554592132568, "sft_loss": 0.2828282415866852, "step": 1687 }, { "epoch": 2.4410701373825017, "grad_norm": 2.458892699445312, "learning_rate": 5.325512726342665e-06, "logits/chosen": -0.28189414739608765, "logits/rejected": -0.383240282535553, "logps/chosen": -0.3891836404800415, "logps/rejected": -3.0360727310180664, "loss": 0.4436, "odds_ratio_loss": 0.16465787589550018, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03891836479306221, "rewards/margins": 0.2646889090538025, "rewards/rejected": -0.3036072850227356, "sft_loss": 0.3891836404800415, "step": 1688 }, { "epoch": 2.4425162689804774, "grad_norm": 2.337184676111808, "learning_rate": 5.322581940082365e-06, "logits/chosen": -0.13843964040279388, "logits/rejected": -0.15620598196983337, "logps/chosen": -0.20821207761764526, "logps/rejected": -3.2480251789093018, "loss": 0.3055, "odds_ratio_loss": 0.05318732559680939, "rewards/accuracies": 1.0, "rewards/chosen": -0.020821209996938705, "rewards/margins": 0.30398133397102356, "rewards/rejected": -0.3248025178909302, "sft_loss": 0.20821207761764526, "step": 1689 }, { "epoch": 2.4439624005784526, "grad_norm": 2.4611245315908197, "learning_rate": 5.319650356432487e-06, "logits/chosen": -0.38569334149360657, "logits/rejected": -0.34914782643318176, "logps/chosen": -0.2619364261627197, "logps/rejected": -4.187211036682129, "loss": 0.3669, "odds_ratio_loss": 0.11673162132501602, "rewards/accuracies": 1.0, "rewards/chosen": -0.026193641126155853, "rewards/margins": 0.39252740144729614, "rewards/rejected": -0.418721079826355, "sft_loss": 0.2619364261627197, "step": 1690 }, { "epoch": 2.445408532176428, "grad_norm": 2.3690510230543937, "learning_rate": 5.316717977160495e-06, "logits/chosen": -0.1654704362154007, "logits/rejected": -0.24338343739509583, "logps/chosen": -0.36776530742645264, "logps/rejected": -4.427489757537842, "loss": 0.3698, "odds_ratio_loss": 0.14131522178649902, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03677653148770332, "rewards/margins": 0.405972421169281, "rewards/rejected": -0.4427489638328552, "sft_loss": 0.36776530742645264, "step": 1691 }, { "epoch": 2.4468546637744035, "grad_norm": 2.7640415911692315, "learning_rate": 5.31378480403433e-06, "logits/chosen": -0.11378660053014755, "logits/rejected": -0.28160223364830017, "logps/chosen": -0.39058995246887207, "logps/rejected": -1.9533859491348267, "loss": 0.3786, "odds_ratio_loss": 0.24598926305770874, "rewards/accuracies": 0.8125, "rewards/chosen": -0.039058998227119446, "rewards/margins": 0.15627959370613098, "rewards/rejected": -0.19533859193325043, "sft_loss": 0.39058995246887207, "step": 1692 }, { "epoch": 2.4483007953723788, "grad_norm": 2.3969905094526816, "learning_rate": 5.310850838822413e-06, "logits/chosen": -0.25515154004096985, "logits/rejected": -0.3220319151878357, "logps/chosen": -0.2755758464336395, "logps/rejected": -4.758266925811768, "loss": 0.2722, "odds_ratio_loss": 0.08147071301937103, "rewards/accuracies": 1.0, "rewards/chosen": -0.027557585388422012, "rewards/margins": 0.4482691287994385, "rewards/rejected": -0.4758267104625702, "sft_loss": 0.2755758464336395, "step": 1693 }, { "epoch": 2.4497469269703545, "grad_norm": 2.7245877218943138, "learning_rate": 5.307916083293643e-06, "logits/chosen": -0.24902743101119995, "logits/rejected": -0.173957958817482, "logps/chosen": -0.2562694847583771, "logps/rejected": -2.591289758682251, "loss": 0.3452, "odds_ratio_loss": 0.12518920004367828, "rewards/accuracies": 1.0, "rewards/chosen": -0.025626949965953827, "rewards/margins": 0.23350203037261963, "rewards/rejected": -0.25912898778915405, "sft_loss": 0.2562694847583771, "step": 1694 }, { "epoch": 2.4511930585683297, "grad_norm": 2.525754725939901, "learning_rate": 5.304980539217397e-06, "logits/chosen": -0.255185067653656, "logits/rejected": -0.2326502501964569, "logps/chosen": -0.3928337097167969, "logps/rejected": -4.967213153839111, "loss": 0.3448, "odds_ratio_loss": 0.13604749739170074, "rewards/accuracies": 1.0, "rewards/chosen": -0.03928337246179581, "rewards/margins": 0.45743799209594727, "rewards/rejected": -0.49672138690948486, "sft_loss": 0.3928337097167969, "step": 1695 }, { "epoch": 2.4526391901663054, "grad_norm": 2.157462237065824, "learning_rate": 5.3020442083635225e-06, "logits/chosen": -0.33897683024406433, "logits/rejected": -0.36058297753334045, "logps/chosen": -0.3103875517845154, "logps/rejected": -4.488574981689453, "loss": 0.3695, "odds_ratio_loss": 0.07876772433519363, "rewards/accuracies": 1.0, "rewards/chosen": -0.031038757413625717, "rewards/margins": 0.41781869530677795, "rewards/rejected": -0.44885745644569397, "sft_loss": 0.3103875517845154, "step": 1696 }, { "epoch": 2.4540853217642806, "grad_norm": 3.264192289306274, "learning_rate": 5.299107092502345e-06, "logits/chosen": -0.14832520484924316, "logits/rejected": -0.16244558990001678, "logps/chosen": -0.4910193979740143, "logps/rejected": -3.7641544342041016, "loss": 0.4286, "odds_ratio_loss": 0.13385328650474548, "rewards/accuracies": 1.0, "rewards/chosen": -0.04910193756222725, "rewards/margins": 0.32731351256370544, "rewards/rejected": -0.376415491104126, "sft_loss": 0.4910193979740143, "step": 1697 }, { "epoch": 2.455531453362256, "grad_norm": 2.813819539819064, "learning_rate": 5.296169193404664e-06, "logits/chosen": -0.24914006888866425, "logits/rejected": -0.20338988304138184, "logps/chosen": -0.21061301231384277, "logps/rejected": -4.377030849456787, "loss": 0.3424, "odds_ratio_loss": 0.038216181099414825, "rewards/accuracies": 1.0, "rewards/chosen": -0.021061301231384277, "rewards/margins": 0.4166417717933655, "rewards/rejected": -0.43770307302474976, "sft_loss": 0.21061301231384277, "step": 1698 }, { "epoch": 2.4569775849602316, "grad_norm": 2.6327695248210983, "learning_rate": 5.2932305128417484e-06, "logits/chosen": -0.17436552047729492, "logits/rejected": -0.16264991462230682, "logps/chosen": -0.35267582535743713, "logps/rejected": -3.094590663909912, "loss": 0.3573, "odds_ratio_loss": 0.14291812479496002, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03526758402585983, "rewards/margins": 0.2741914987564087, "rewards/rejected": -0.30945906043052673, "sft_loss": 0.35267582535743713, "step": 1699 }, { "epoch": 2.458423716558207, "grad_norm": 2.7114975370715704, "learning_rate": 5.2902910525853406e-06, "logits/chosen": -0.16905909776687622, "logits/rejected": -0.12139132618904114, "logps/chosen": -0.518578290939331, "logps/rejected": -2.4249773025512695, "loss": 0.4167, "odds_ratio_loss": 0.183926060795784, "rewards/accuracies": 0.9375, "rewards/chosen": -0.051857829093933105, "rewards/margins": 0.19063988327980042, "rewards/rejected": -0.24249771237373352, "sft_loss": 0.518578290939331, "step": 1700 }, { "epoch": 2.459869848156182, "grad_norm": 2.209082477790294, "learning_rate": 5.28735081440765e-06, "logits/chosen": -0.14498230814933777, "logits/rejected": -0.25710102915763855, "logps/chosen": -0.37873467803001404, "logps/rejected": -3.46140193939209, "loss": 0.3614, "odds_ratio_loss": 0.17570456862449646, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03787346929311752, "rewards/margins": 0.30826669931411743, "rewards/rejected": -0.34614014625549316, "sft_loss": 0.37873467803001404, "step": 1701 }, { "epoch": 2.4613159797541577, "grad_norm": 2.563412683406822, "learning_rate": 5.284409800081359e-06, "logits/chosen": -0.05897979438304901, "logits/rejected": -0.1600523293018341, "logps/chosen": -0.3373136818408966, "logps/rejected": -2.6773574352264404, "loss": 0.3735, "odds_ratio_loss": 0.15935732424259186, "rewards/accuracies": 1.0, "rewards/chosen": -0.0337313674390316, "rewards/margins": 0.23400436341762543, "rewards/rejected": -0.2677357494831085, "sft_loss": 0.3373136818408966, "step": 1702 }, { "epoch": 2.462762111352133, "grad_norm": 2.8871356309464016, "learning_rate": 5.281468011379618e-06, "logits/chosen": -0.12770754098892212, "logits/rejected": -0.05635076016187668, "logps/chosen": -0.3339754045009613, "logps/rejected": -2.7018704414367676, "loss": 0.4228, "odds_ratio_loss": 0.09883897006511688, "rewards/accuracies": 1.0, "rewards/chosen": -0.03339754045009613, "rewards/margins": 0.2367894947528839, "rewards/rejected": -0.27018705010414124, "sft_loss": 0.3339754045009613, "step": 1703 }, { "epoch": 2.4642082429501087, "grad_norm": 2.5395367967099958, "learning_rate": 5.278525450076038e-06, "logits/chosen": -0.19829659163951874, "logits/rejected": -0.25783413648605347, "logps/chosen": -0.28885769844055176, "logps/rejected": -3.8891334533691406, "loss": 0.3464, "odds_ratio_loss": 0.12367962300777435, "rewards/accuracies": 1.0, "rewards/chosen": -0.028885772451758385, "rewards/margins": 0.36002761125564575, "rewards/rejected": -0.3889133930206299, "sft_loss": 0.28885769844055176, "step": 1704 }, { "epoch": 2.465654374548084, "grad_norm": 2.7686820913229866, "learning_rate": 5.275582117944704e-06, "logits/chosen": -0.20002993941307068, "logits/rejected": -0.23674233257770538, "logps/chosen": -0.26880428194999695, "logps/rejected": -4.4511919021606445, "loss": 0.382, "odds_ratio_loss": 0.07453987747430801, "rewards/accuracies": 1.0, "rewards/chosen": -0.026880430057644844, "rewards/margins": 0.4182387888431549, "rewards/rejected": -0.4451192021369934, "sft_loss": 0.26880428194999695, "step": 1705 }, { "epoch": 2.467100506146059, "grad_norm": 2.482392899389993, "learning_rate": 5.2726380167601595e-06, "logits/chosen": -0.03449631482362747, "logits/rejected": -0.16584265232086182, "logps/chosen": -0.40714091062545776, "logps/rejected": -2.2733700275421143, "loss": 0.383, "odds_ratio_loss": 0.18216630816459656, "rewards/accuracies": 1.0, "rewards/chosen": -0.040714096277952194, "rewards/margins": 0.18662291765213013, "rewards/rejected": -0.22733700275421143, "sft_loss": 0.40714091062545776, "step": 1706 }, { "epoch": 2.468546637744035, "grad_norm": 2.4272265459394418, "learning_rate": 5.269693148297415e-06, "logits/chosen": -0.18469105660915375, "logits/rejected": -0.18337099254131317, "logps/chosen": -0.276080459356308, "logps/rejected": -3.6516079902648926, "loss": 0.354, "odds_ratio_loss": 0.12775567173957825, "rewards/accuracies": 1.0, "rewards/chosen": -0.027608048170804977, "rewards/margins": 0.3375527262687683, "rewards/rejected": -0.3651607930660248, "sft_loss": 0.276080459356308, "step": 1707 }, { "epoch": 2.46999276934201, "grad_norm": 2.1862324548948315, "learning_rate": 5.266747514331943e-06, "logits/chosen": -0.09375106543302536, "logits/rejected": -0.06606069207191467, "logps/chosen": -0.3427000343799591, "logps/rejected": -4.577975273132324, "loss": 0.3287, "odds_ratio_loss": 0.11397892236709595, "rewards/accuracies": 1.0, "rewards/chosen": -0.03427000343799591, "rewards/margins": 0.4235275089740753, "rewards/rejected": -0.45779749751091003, "sft_loss": 0.3427000343799591, "step": 1708 }, { "epoch": 2.4714389009399857, "grad_norm": 3.017503213679481, "learning_rate": 5.2638011166396765e-06, "logits/chosen": -0.15729525685310364, "logits/rejected": -0.31160077452659607, "logps/chosen": -0.40494704246520996, "logps/rejected": -3.3157148361206055, "loss": 0.3849, "odds_ratio_loss": 0.1344398856163025, "rewards/accuracies": 1.0, "rewards/chosen": -0.04049470275640488, "rewards/margins": 0.29107674956321716, "rewards/rejected": -0.33157145977020264, "sft_loss": 0.40494704246520996, "step": 1709 }, { "epoch": 2.472885032537961, "grad_norm": 2.498208498215936, "learning_rate": 5.26085395699701e-06, "logits/chosen": -0.30746138095855713, "logits/rejected": -0.2394457757472992, "logps/chosen": -0.3937021493911743, "logps/rejected": -3.2749500274658203, "loss": 0.3325, "odds_ratio_loss": 0.13062110543251038, "rewards/accuracies": 1.0, "rewards/chosen": -0.03937021642923355, "rewards/margins": 0.28812479972839355, "rewards/rejected": -0.3274950087070465, "sft_loss": 0.3937021493911743, "step": 1710 }, { "epoch": 2.4743311641359362, "grad_norm": 2.347678712121523, "learning_rate": 5.257906037180797e-06, "logits/chosen": -0.2330562025308609, "logits/rejected": -0.30464982986450195, "logps/chosen": -0.3614528775215149, "logps/rejected": -4.782719135284424, "loss": 0.3511, "odds_ratio_loss": 0.13488833606243134, "rewards/accuracies": 1.0, "rewards/chosen": -0.03614528849720955, "rewards/margins": 0.44212663173675537, "rewards/rejected": -0.4782719016075134, "sft_loss": 0.3614528775215149, "step": 1711 }, { "epoch": 2.475777295733912, "grad_norm": 2.3030121406973025, "learning_rate": 5.2549573589683494e-06, "logits/chosen": -0.129247784614563, "logits/rejected": -0.21539118885993958, "logps/chosen": -0.23400619626045227, "logps/rejected": -3.61014986038208, "loss": 0.3575, "odds_ratio_loss": 0.0835333988070488, "rewards/accuracies": 1.0, "rewards/chosen": -0.023400619626045227, "rewards/margins": 0.33761438727378845, "rewards/rejected": -0.3610149919986725, "sft_loss": 0.23400619626045227, "step": 1712 }, { "epoch": 2.477223427331887, "grad_norm": 14.891392154897925, "learning_rate": 5.252007924137435e-06, "logits/chosen": -0.20700593292713165, "logits/rejected": -0.1795155555009842, "logps/chosen": -0.37232303619384766, "logps/rejected": -3.1692867279052734, "loss": 0.3599, "odds_ratio_loss": 0.08920063823461533, "rewards/accuracies": 1.0, "rewards/chosen": -0.03723230957984924, "rewards/margins": 0.27969637513160706, "rewards/rejected": -0.3169286847114563, "sft_loss": 0.37232303619384766, "step": 1713 }, { "epoch": 2.4786695589298624, "grad_norm": 2.223481410602812, "learning_rate": 5.24905773446628e-06, "logits/chosen": -0.13908199965953827, "logits/rejected": -0.13732969760894775, "logps/chosen": -0.38454872369766235, "logps/rejected": -4.367215156555176, "loss": 0.3717, "odds_ratio_loss": 0.12911780178546906, "rewards/accuracies": 1.0, "rewards/chosen": -0.038454875349998474, "rewards/margins": 0.39826661348342896, "rewards/rejected": -0.4367215037345886, "sft_loss": 0.38454872369766235, "step": 1714 }, { "epoch": 2.480115690527838, "grad_norm": 2.161222093495033, "learning_rate": 5.2461067917335655e-06, "logits/chosen": -0.18323522806167603, "logits/rejected": -0.262523353099823, "logps/chosen": -0.3305858373641968, "logps/rejected": -3.8417184352874756, "loss": 0.3449, "odds_ratio_loss": 0.11526430398225784, "rewards/accuracies": 1.0, "rewards/chosen": -0.03305858373641968, "rewards/margins": 0.3511132299900055, "rewards/rejected": -0.38417184352874756, "sft_loss": 0.3305858373641968, "step": 1715 }, { "epoch": 2.4815618221258133, "grad_norm": 2.1036749771049172, "learning_rate": 5.2431550977184255e-06, "logits/chosen": -0.11821121722459793, "logits/rejected": -0.08068836480379105, "logps/chosen": -0.327120840549469, "logps/rejected": -4.980037689208984, "loss": 0.3295, "odds_ratio_loss": 0.12885092198848724, "rewards/accuracies": 1.0, "rewards/chosen": -0.03271208703517914, "rewards/margins": 0.4652916491031647, "rewards/rejected": -0.4980037212371826, "sft_loss": 0.327120840549469, "step": 1716 }, { "epoch": 2.483007953723789, "grad_norm": 2.229147869090855, "learning_rate": 5.240202654200448e-06, "logits/chosen": -0.24735526740550995, "logits/rejected": -0.19834265112876892, "logps/chosen": -0.313556432723999, "logps/rejected": -2.68381404876709, "loss": 0.3639, "odds_ratio_loss": 0.12288917601108551, "rewards/accuracies": 1.0, "rewards/chosen": -0.03135564550757408, "rewards/margins": 0.23702575266361237, "rewards/rejected": -0.26838141679763794, "sft_loss": 0.313556432723999, "step": 1717 }, { "epoch": 2.4844540853217643, "grad_norm": 3.1343885146836885, "learning_rate": 5.237249462959671e-06, "logits/chosen": -0.03448060154914856, "logits/rejected": -0.14667919278144836, "logps/chosen": -0.4028518795967102, "logps/rejected": -3.0416674613952637, "loss": 0.3938, "odds_ratio_loss": 0.09556388854980469, "rewards/accuracies": 1.0, "rewards/chosen": -0.04028518870472908, "rewards/margins": 0.2638815641403198, "rewards/rejected": -0.3041667342185974, "sft_loss": 0.4028518795967102, "step": 1718 }, { "epoch": 2.4859002169197395, "grad_norm": 2.5207532915019506, "learning_rate": 5.234295525776583e-06, "logits/chosen": -0.3418402075767517, "logits/rejected": -0.3059270977973938, "logps/chosen": -0.42127281427383423, "logps/rejected": -4.643973350524902, "loss": 0.3958, "odds_ratio_loss": 0.11073525995016098, "rewards/accuracies": 1.0, "rewards/chosen": -0.04212728142738342, "rewards/margins": 0.4222700595855713, "rewards/rejected": -0.4643973410129547, "sft_loss": 0.42127281427383423, "step": 1719 }, { "epoch": 2.487346348517715, "grad_norm": 2.3498993433765683, "learning_rate": 5.231340844432127e-06, "logits/chosen": -0.07430073618888855, "logits/rejected": -0.014699429273605347, "logps/chosen": -0.2515304386615753, "logps/rejected": -4.094211101531982, "loss": 0.3621, "odds_ratio_loss": 0.13141734898090363, "rewards/accuracies": 1.0, "rewards/chosen": -0.025153042748570442, "rewards/margins": 0.3842681050300598, "rewards/rejected": -0.4094211161136627, "sft_loss": 0.2515304386615753, "step": 1720 }, { "epoch": 2.4887924801156904, "grad_norm": 2.257830950000833, "learning_rate": 5.228385420707688e-06, "logits/chosen": -0.2305629849433899, "logits/rejected": -0.22776669263839722, "logps/chosen": -0.2552865445613861, "logps/rejected": -4.56567907333374, "loss": 0.332, "odds_ratio_loss": 0.09708862006664276, "rewards/accuracies": 1.0, "rewards/chosen": -0.02552865631878376, "rewards/margins": 0.4310392737388611, "rewards/rejected": -0.4565679430961609, "sft_loss": 0.2552865445613861, "step": 1721 }, { "epoch": 2.490238611713666, "grad_norm": 3.0069655541822193, "learning_rate": 5.225429256385107e-06, "logits/chosen": -0.3482007384300232, "logits/rejected": -0.18393605947494507, "logps/chosen": -0.31930986046791077, "logps/rejected": -2.396862268447876, "loss": 0.337, "odds_ratio_loss": 0.13777418434619904, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03193098306655884, "rewards/margins": 0.20775523781776428, "rewards/rejected": -0.23968622088432312, "sft_loss": 0.31930986046791077, "step": 1722 }, { "epoch": 2.4916847433116414, "grad_norm": 2.5308497032909116, "learning_rate": 5.2224723532466615e-06, "logits/chosen": -0.1514590084552765, "logits/rejected": -0.20194968581199646, "logps/chosen": -0.4799078702926636, "logps/rejected": -3.5649938583374023, "loss": 0.3802, "odds_ratio_loss": 0.19344067573547363, "rewards/accuracies": 1.0, "rewards/chosen": -0.04799078777432442, "rewards/margins": 0.30850860476493835, "rewards/rejected": -0.3564993739128113, "sft_loss": 0.4799078702926636, "step": 1723 }, { "epoch": 2.4931308749096166, "grad_norm": 2.264852029050408, "learning_rate": 5.219514713075082e-06, "logits/chosen": -0.4040144681930542, "logits/rejected": -0.2476567029953003, "logps/chosen": -0.3959325850009918, "logps/rejected": -3.717017650604248, "loss": 0.3556, "odds_ratio_loss": 0.12017939984798431, "rewards/accuracies": 1.0, "rewards/chosen": -0.03959326446056366, "rewards/margins": 0.3321084976196289, "rewards/rejected": -0.37170177698135376, "sft_loss": 0.3959325850009918, "step": 1724 }, { "epoch": 2.4945770065075923, "grad_norm": 2.322670274244093, "learning_rate": 5.216556337653538e-06, "logits/chosen": -0.19745758175849915, "logits/rejected": -0.2630073130130768, "logps/chosen": -0.4968724846839905, "logps/rejected": -4.204809188842773, "loss": 0.4631, "odds_ratio_loss": 0.1597743183374405, "rewards/accuracies": 1.0, "rewards/chosen": -0.04968724772334099, "rewards/margins": 0.3707936704158783, "rewards/rejected": -0.4204809069633484, "sft_loss": 0.4968724846839905, "step": 1725 }, { "epoch": 2.4960231381055675, "grad_norm": 2.2728574888436226, "learning_rate": 5.213597228765649e-06, "logits/chosen": -0.25864142179489136, "logits/rejected": -0.29847434163093567, "logps/chosen": -0.3347530663013458, "logps/rejected": -2.365760087966919, "loss": 0.3878, "odds_ratio_loss": 0.11688689887523651, "rewards/accuracies": 1.0, "rewards/chosen": -0.03347530961036682, "rewards/margins": 0.20310071110725403, "rewards/rejected": -0.23657602071762085, "sft_loss": 0.3347530663013458, "step": 1726 }, { "epoch": 2.497469269703543, "grad_norm": 2.259706025611289, "learning_rate": 5.210637388195471e-06, "logits/chosen": -0.25694119930267334, "logits/rejected": -0.33037617802619934, "logps/chosen": -0.42037951946258545, "logps/rejected": -3.479623794555664, "loss": 0.4642, "odds_ratio_loss": 0.13307222723960876, "rewards/accuracies": 1.0, "rewards/chosen": -0.042037952691316605, "rewards/margins": 0.3059244155883789, "rewards/rejected": -0.347962349653244, "sft_loss": 0.42037951946258545, "step": 1727 }, { "epoch": 2.4989154013015185, "grad_norm": 3.456435966726747, "learning_rate": 5.207676817727501e-06, "logits/chosen": -0.19776073098182678, "logits/rejected": -0.1664964109659195, "logps/chosen": -0.46457743644714355, "logps/rejected": -3.276613473892212, "loss": 0.4425, "odds_ratio_loss": 0.41586270928382874, "rewards/accuracies": 0.875, "rewards/chosen": -0.046457745134830475, "rewards/margins": 0.28120362758636475, "rewards/rejected": -0.3276613652706146, "sft_loss": 0.46457743644714355, "step": 1728 }, { "epoch": 2.5003615328994937, "grad_norm": 2.383659929319483, "learning_rate": 5.204715519146681e-06, "logits/chosen": -0.3448188602924347, "logits/rejected": -0.32050418853759766, "logps/chosen": -0.4060935378074646, "logps/rejected": -2.0824034214019775, "loss": 0.3698, "odds_ratio_loss": 0.15973150730133057, "rewards/accuracies": 1.0, "rewards/chosen": -0.04060935229063034, "rewards/margins": 0.16763100028038025, "rewards/rejected": -0.2082403302192688, "sft_loss": 0.4060935378074646, "step": 1729 }, { "epoch": 2.5018076644974694, "grad_norm": 2.3443217772376737, "learning_rate": 5.201753494238388e-06, "logits/chosen": -0.24633188545703888, "logits/rejected": -0.14402739703655243, "logps/chosen": -0.38479936122894287, "logps/rejected": -2.347269058227539, "loss": 0.3658, "odds_ratio_loss": 0.15785683691501617, "rewards/accuracies": 1.0, "rewards/chosen": -0.03847993165254593, "rewards/margins": 0.1962469518184662, "rewards/rejected": -0.2347269058227539, "sft_loss": 0.38479936122894287, "step": 1730 }, { "epoch": 2.5032537960954446, "grad_norm": 2.37665885239867, "learning_rate": 5.198790744788437e-06, "logits/chosen": -0.21425655484199524, "logits/rejected": -0.22479024529457092, "logps/chosen": -0.2819811999797821, "logps/rejected": -2.8894686698913574, "loss": 0.2802, "odds_ratio_loss": 0.10602892190217972, "rewards/accuracies": 1.0, "rewards/chosen": -0.0281981211155653, "rewards/margins": 0.2607487440109253, "rewards/rejected": -0.28894686698913574, "sft_loss": 0.2819811999797821, "step": 1731 }, { "epoch": 2.5046999276934203, "grad_norm": 2.314013984407445, "learning_rate": 5.195827272583081e-06, "logits/chosen": -0.26546451449394226, "logits/rejected": -0.3177344799041748, "logps/chosen": -0.35778549313545227, "logps/rejected": -5.036369800567627, "loss": 0.4006, "odds_ratio_loss": 0.09799312800168991, "rewards/accuracies": 1.0, "rewards/chosen": -0.03577854856848717, "rewards/margins": 0.4678584933280945, "rewards/rejected": -0.5036370158195496, "sft_loss": 0.35778549313545227, "step": 1732 }, { "epoch": 2.5061460592913956, "grad_norm": 3.047246994936488, "learning_rate": 5.192863079409009e-06, "logits/chosen": -0.22313661873340607, "logits/rejected": -0.12890973687171936, "logps/chosen": -0.3412661850452423, "logps/rejected": -4.243041515350342, "loss": 0.3225, "odds_ratio_loss": 0.12478017061948776, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03412661701440811, "rewards/margins": 0.39017754793167114, "rewards/rejected": -0.42430418729782104, "sft_loss": 0.3412661850452423, "step": 1733 }, { "epoch": 2.507592190889371, "grad_norm": 2.490857880194928, "learning_rate": 5.189898167053344e-06, "logits/chosen": -0.1840282380580902, "logits/rejected": -0.06506327539682388, "logps/chosen": -0.26957932114601135, "logps/rejected": -4.575106143951416, "loss": 0.3101, "odds_ratio_loss": 0.15700577199459076, "rewards/accuracies": 0.875, "rewards/chosen": -0.026957932859659195, "rewards/margins": 0.4305526614189148, "rewards/rejected": -0.4575105905532837, "sft_loss": 0.26957932114601135, "step": 1734 }, { "epoch": 2.5090383224873465, "grad_norm": 2.9187090161830582, "learning_rate": 5.186932537303642e-06, "logits/chosen": -0.4572184383869171, "logits/rejected": -0.3056212067604065, "logps/chosen": -0.40126121044158936, "logps/rejected": -3.7890090942382812, "loss": 0.3877, "odds_ratio_loss": 0.1334933638572693, "rewards/accuracies": 1.0, "rewards/chosen": -0.04012611508369446, "rewards/margins": 0.33877480030059814, "rewards/rejected": -0.3789009153842926, "sft_loss": 0.40126121044158936, "step": 1735 }, { "epoch": 2.5104844540853217, "grad_norm": 2.377849930238879, "learning_rate": 5.183966191947893e-06, "logits/chosen": -0.2291964888572693, "logits/rejected": -0.21579010784626007, "logps/chosen": -0.360879123210907, "logps/rejected": -2.2859559059143066, "loss": 0.3647, "odds_ratio_loss": 0.16899463534355164, "rewards/accuracies": 1.0, "rewards/chosen": -0.03608791530132294, "rewards/margins": 0.19250769913196564, "rewards/rejected": -0.22859559953212738, "sft_loss": 0.360879123210907, "step": 1736 }, { "epoch": 2.511930585683297, "grad_norm": 2.6284904360262606, "learning_rate": 5.180999132774517e-06, "logits/chosen": -0.18004505336284637, "logits/rejected": -0.17705950140953064, "logps/chosen": -0.40871530771255493, "logps/rejected": -3.158888101577759, "loss": 0.3391, "odds_ratio_loss": 0.15241380035877228, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04087153077125549, "rewards/margins": 0.27501729130744934, "rewards/rejected": -0.31588882207870483, "sft_loss": 0.40871530771255493, "step": 1737 }, { "epoch": 2.5133767172812727, "grad_norm": 2.8519916155319454, "learning_rate": 5.1780313615723655e-06, "logits/chosen": -0.13925258815288544, "logits/rejected": -0.0671801045536995, "logps/chosen": -0.35058093070983887, "logps/rejected": -2.5616981983184814, "loss": 0.3681, "odds_ratio_loss": 0.16618746519088745, "rewards/accuracies": 1.0, "rewards/chosen": -0.03505809232592583, "rewards/margins": 0.2211117297410965, "rewards/rejected": -0.2561698257923126, "sft_loss": 0.35058093070983887, "step": 1738 }, { "epoch": 2.514822848879248, "grad_norm": 2.4797827030221598, "learning_rate": 5.175062880130719e-06, "logits/chosen": -0.11228927969932556, "logits/rejected": -0.11741993576288223, "logps/chosen": -0.4418836832046509, "logps/rejected": -3.7302236557006836, "loss": 0.3823, "odds_ratio_loss": 0.1561032235622406, "rewards/accuracies": 1.0, "rewards/chosen": -0.04418836906552315, "rewards/margins": 0.3288339674472809, "rewards/rejected": -0.3730223774909973, "sft_loss": 0.4418836832046509, "step": 1739 }, { "epoch": 2.5162689804772236, "grad_norm": 2.2932480834373874, "learning_rate": 5.172093690239284e-06, "logits/chosen": -0.031644146889448166, "logits/rejected": -0.13634979724884033, "logps/chosen": -0.25486522912979126, "logps/rejected": -4.379749298095703, "loss": 0.3217, "odds_ratio_loss": 0.09034018963575363, "rewards/accuracies": 1.0, "rewards/chosen": -0.025486523285508156, "rewards/margins": 0.4124884307384491, "rewards/rejected": -0.4379749298095703, "sft_loss": 0.25486522912979126, "step": 1740 }, { "epoch": 2.517715112075199, "grad_norm": 2.1654095204528847, "learning_rate": 5.1691237936881994e-06, "logits/chosen": -0.2752266526222229, "logits/rejected": -0.2713732421398163, "logps/chosen": -0.34550338983535767, "logps/rejected": -2.8938426971435547, "loss": 0.3392, "odds_ratio_loss": 0.15460900962352753, "rewards/accuracies": 1.0, "rewards/chosen": -0.03455033898353577, "rewards/margins": 0.2548339366912842, "rewards/rejected": -0.28938430547714233, "sft_loss": 0.34550338983535767, "step": 1741 }, { "epoch": 2.5191612436731745, "grad_norm": 2.6936346195386016, "learning_rate": 5.166153192268025e-06, "logits/chosen": -0.17216843366622925, "logits/rejected": -0.24632829427719116, "logps/chosen": -0.44458234310150146, "logps/rejected": -2.9181787967681885, "loss": 0.4975, "odds_ratio_loss": 0.2072901576757431, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04445823282003403, "rewards/margins": 0.24735963344573975, "rewards/rejected": -0.29181787371635437, "sft_loss": 0.44458234310150146, "step": 1742 }, { "epoch": 2.5206073752711498, "grad_norm": 2.3809133922547736, "learning_rate": 5.163181887769747e-06, "logits/chosen": -0.2375820130109787, "logits/rejected": -0.16164171695709229, "logps/chosen": -0.2919715642929077, "logps/rejected": -3.8255269527435303, "loss": 0.345, "odds_ratio_loss": 0.11046281456947327, "rewards/accuracies": 1.0, "rewards/chosen": -0.02919716015458107, "rewards/margins": 0.3533555567264557, "rewards/rejected": -0.3825526833534241, "sft_loss": 0.2919715642929077, "step": 1743 }, { "epoch": 2.522053506869125, "grad_norm": 1.961472431039239, "learning_rate": 5.160209881984777e-06, "logits/chosen": -0.13250018656253815, "logits/rejected": -0.08406895399093628, "logps/chosen": -0.3599141836166382, "logps/rejected": -3.0004963874816895, "loss": 0.3582, "odds_ratio_loss": 0.19820904731750488, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03599141910672188, "rewards/margins": 0.2640582323074341, "rewards/rejected": -0.30004966259002686, "sft_loss": 0.3599141836166382, "step": 1744 }, { "epoch": 2.5234996384671007, "grad_norm": 2.452443666828418, "learning_rate": 5.15723717670495e-06, "logits/chosen": -0.30242788791656494, "logits/rejected": -0.23525869846343994, "logps/chosen": -0.3961702883243561, "logps/rejected": -2.2211079597473145, "loss": 0.3799, "odds_ratio_loss": 0.16140399873256683, "rewards/accuracies": 1.0, "rewards/chosen": -0.03961702808737755, "rewards/margins": 0.18249374628067017, "rewards/rejected": -0.222110778093338, "sft_loss": 0.3961702883243561, "step": 1745 }, { "epoch": 2.524945770065076, "grad_norm": 2.28124974891748, "learning_rate": 5.154263773722517e-06, "logits/chosen": -0.32361429929733276, "logits/rejected": -0.21904835104942322, "logps/chosen": -0.40971463918685913, "logps/rejected": -3.0012941360473633, "loss": 0.3653, "odds_ratio_loss": 0.15136511623859406, "rewards/accuracies": 1.0, "rewards/chosen": -0.04097146540880203, "rewards/margins": 0.2591579556465149, "rewards/rejected": -0.30012941360473633, "sft_loss": 0.40971463918685913, "step": 1746 }, { "epoch": 2.526391901663051, "grad_norm": 4.146284761192451, "learning_rate": 5.151289674830156e-06, "logits/chosen": -0.2679465711116791, "logits/rejected": -0.2800706923007965, "logps/chosen": -0.33803799748420715, "logps/rejected": -4.649941921234131, "loss": 0.4659, "odds_ratio_loss": 0.09396877884864807, "rewards/accuracies": 1.0, "rewards/chosen": -0.033803801983594894, "rewards/margins": 0.4311903417110443, "rewards/rejected": -0.4649941623210907, "sft_loss": 0.33803799748420715, "step": 1747 }, { "epoch": 2.527838033261027, "grad_norm": 2.5255140596275494, "learning_rate": 5.1483148818209625e-06, "logits/chosen": -0.2974938750267029, "logits/rejected": -0.24165314435958862, "logps/chosen": -0.2745737135410309, "logps/rejected": -3.838913917541504, "loss": 0.3668, "odds_ratio_loss": 0.09007024019956589, "rewards/accuracies": 1.0, "rewards/chosen": -0.02745737135410309, "rewards/margins": 0.35643401741981506, "rewards/rejected": -0.38389140367507935, "sft_loss": 0.2745737135410309, "step": 1748 }, { "epoch": 2.529284164859002, "grad_norm": 2.338760253007698, "learning_rate": 5.145339396488451e-06, "logits/chosen": -0.24079769849777222, "logits/rejected": -0.19569242000579834, "logps/chosen": -0.3212064504623413, "logps/rejected": -3.878096580505371, "loss": 0.408, "odds_ratio_loss": 0.11682701855897903, "rewards/accuracies": 1.0, "rewards/chosen": -0.03212064504623413, "rewards/margins": 0.35568904876708984, "rewards/rejected": -0.387809693813324, "sft_loss": 0.3212064504623413, "step": 1749 }, { "epoch": 2.5307302964569773, "grad_norm": 2.828854798037771, "learning_rate": 5.142363220626551e-06, "logits/chosen": -0.08817961812019348, "logits/rejected": -0.052211862057447433, "logps/chosen": -0.31730931997299194, "logps/rejected": -1.8730766773223877, "loss": 0.3626, "odds_ratio_loss": 0.10939719527959824, "rewards/accuracies": 1.0, "rewards/chosen": -0.03173093497753143, "rewards/margins": 0.15557675063610077, "rewards/rejected": -0.187307670712471, "sft_loss": 0.31730931997299194, "step": 1750 }, { "epoch": 2.532176428054953, "grad_norm": 2.378524988191866, "learning_rate": 5.13938635602961e-06, "logits/chosen": -0.2921004891395569, "logits/rejected": -0.13167288899421692, "logps/chosen": -0.4142140746116638, "logps/rejected": -4.806788921356201, "loss": 0.3473, "odds_ratio_loss": 0.15716761350631714, "rewards/accuracies": 1.0, "rewards/chosen": -0.04142140597105026, "rewards/margins": 0.4392574727535248, "rewards/rejected": -0.480678915977478, "sft_loss": 0.4142140746116638, "step": 1751 }, { "epoch": 2.5336225596529283, "grad_norm": 2.206670746385489, "learning_rate": 5.136408804492392e-06, "logits/chosen": -0.16804078221321106, "logits/rejected": -0.12704414129257202, "logps/chosen": -0.4827273190021515, "logps/rejected": -3.0154800415039062, "loss": 0.415, "odds_ratio_loss": 0.19364948570728302, "rewards/accuracies": 1.0, "rewards/chosen": -0.04827273264527321, "rewards/margins": 0.2532752454280853, "rewards/rejected": -0.30154797434806824, "sft_loss": 0.4827273190021515, "step": 1752 }, { "epoch": 2.535068691250904, "grad_norm": 2.7522459127137004, "learning_rate": 5.133430567810073e-06, "logits/chosen": -0.4092782437801361, "logits/rejected": -0.4329729378223419, "logps/chosen": -0.40332627296447754, "logps/rejected": -2.8373968601226807, "loss": 0.3651, "odds_ratio_loss": 0.175297811627388, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04033263027667999, "rewards/margins": 0.24340707063674927, "rewards/rejected": -0.28373971581459045, "sft_loss": 0.40332627296447754, "step": 1753 }, { "epoch": 2.536514822848879, "grad_norm": 2.438351487855574, "learning_rate": 5.1304516477782444e-06, "logits/chosen": -0.32628947496414185, "logits/rejected": -0.30095720291137695, "logps/chosen": -0.3710482716560364, "logps/rejected": -4.039909362792969, "loss": 0.3771, "odds_ratio_loss": 0.1693974733352661, "rewards/accuracies": 1.0, "rewards/chosen": -0.03710482642054558, "rewards/margins": 0.3668861389160156, "rewards/rejected": -0.4039909541606903, "sft_loss": 0.3710482716560364, "step": 1754 }, { "epoch": 2.537960954446855, "grad_norm": 2.222626728874631, "learning_rate": 5.127472046192904e-06, "logits/chosen": -0.24637198448181152, "logits/rejected": -0.21760375797748566, "logps/chosen": -0.5183929204940796, "logps/rejected": -4.226579666137695, "loss": 0.3649, "odds_ratio_loss": 0.17385032773017883, "rewards/accuracies": 1.0, "rewards/chosen": -0.05183929204940796, "rewards/margins": 0.3708186745643616, "rewards/rejected": -0.4226579964160919, "sft_loss": 0.5183929204940796, "step": 1755 }, { "epoch": 2.53940708604483, "grad_norm": 2.3187889318353427, "learning_rate": 5.12449176485047e-06, "logits/chosen": -0.32632243633270264, "logits/rejected": -0.23128627240657806, "logps/chosen": -0.41758835315704346, "logps/rejected": -3.34269380569458, "loss": 0.3577, "odds_ratio_loss": 0.1132265031337738, "rewards/accuracies": 1.0, "rewards/chosen": -0.041758835315704346, "rewards/margins": 0.2925105690956116, "rewards/rejected": -0.3342694044113159, "sft_loss": 0.41758835315704346, "step": 1756 }, { "epoch": 2.5408532176428054, "grad_norm": 2.454164577145383, "learning_rate": 5.121510805547764e-06, "logits/chosen": -0.2939985692501068, "logits/rejected": -0.239321768283844, "logps/chosen": -0.34688299894332886, "logps/rejected": -2.9085657596588135, "loss": 0.3901, "odds_ratio_loss": 0.1403641402721405, "rewards/accuracies": 0.9375, "rewards/chosen": -0.034688301384449005, "rewards/margins": 0.25616827607154846, "rewards/rejected": -0.29085657000541687, "sft_loss": 0.34688299894332886, "step": 1757 }, { "epoch": 2.542299349240781, "grad_norm": 2.434087179736664, "learning_rate": 5.118529170082016e-06, "logits/chosen": -0.15723645687103271, "logits/rejected": -0.11477909237146378, "logps/chosen": -0.4263688325881958, "logps/rejected": -2.351132392883301, "loss": 0.3856, "odds_ratio_loss": 0.2027743011713028, "rewards/accuracies": 0.875, "rewards/chosen": -0.04263688623905182, "rewards/margins": 0.19247636198997498, "rewards/rejected": -0.2351132482290268, "sft_loss": 0.4263688325881958, "step": 1758 }, { "epoch": 2.5437454808387563, "grad_norm": 2.066090453677344, "learning_rate": 5.115546860250865e-06, "logits/chosen": -0.20648221671581268, "logits/rejected": -0.14154908061027527, "logps/chosen": -0.4318174123764038, "logps/rejected": -2.9517130851745605, "loss": 0.3699, "odds_ratio_loss": 0.22002653777599335, "rewards/accuracies": 0.875, "rewards/chosen": -0.04318173974752426, "rewards/margins": 0.25198957324028015, "rewards/rejected": -0.295171320438385, "sft_loss": 0.4318174123764038, "step": 1759 }, { "epoch": 2.5451916124367315, "grad_norm": 3.54077227747576, "learning_rate": 5.112563877852356e-06, "logits/chosen": -0.29718634486198425, "logits/rejected": -0.2714141607284546, "logps/chosen": -0.4731605350971222, "logps/rejected": -2.4305949211120605, "loss": 0.3938, "odds_ratio_loss": 0.15969420969486237, "rewards/accuracies": 1.0, "rewards/chosen": -0.0473160557448864, "rewards/margins": 0.19574342668056488, "rewards/rejected": -0.24305948615074158, "sft_loss": 0.4731605350971222, "step": 1760 }, { "epoch": 2.546637744034707, "grad_norm": 2.1618260984771016, "learning_rate": 5.1095802246849435e-06, "logits/chosen": -0.2292264997959137, "logits/rejected": -0.15521396696567535, "logps/chosen": -0.3275163173675537, "logps/rejected": -3.1341147422790527, "loss": 0.4518, "odds_ratio_loss": 0.1408037543296814, "rewards/accuracies": 1.0, "rewards/chosen": -0.03275163471698761, "rewards/margins": 0.28065982460975647, "rewards/rejected": -0.3134114444255829, "sft_loss": 0.3275163173675537, "step": 1761 }, { "epoch": 2.5480838756326825, "grad_norm": 2.568063340628784, "learning_rate": 5.10659590254748e-06, "logits/chosen": -0.2893018126487732, "logits/rejected": -0.28161394596099854, "logps/chosen": -0.3742448687553406, "logps/rejected": -2.4380226135253906, "loss": 0.3964, "odds_ratio_loss": 0.12025295197963715, "rewards/accuracies": 1.0, "rewards/chosen": -0.037424489855766296, "rewards/margins": 0.206377774477005, "rewards/rejected": -0.2438022643327713, "sft_loss": 0.3742448687553406, "step": 1762 }, { "epoch": 2.549530007230658, "grad_norm": 3.318379202971589, "learning_rate": 5.103610913239225e-06, "logits/chosen": -0.17492324113845825, "logits/rejected": -0.1730252504348755, "logps/chosen": -0.3307463526725769, "logps/rejected": -4.673501014709473, "loss": 0.401, "odds_ratio_loss": 0.11378967761993408, "rewards/accuracies": 1.0, "rewards/chosen": -0.03307463601231575, "rewards/margins": 0.43427544832229614, "rewards/rejected": -0.4673501253128052, "sft_loss": 0.3307463526725769, "step": 1763 }, { "epoch": 2.5509761388286334, "grad_norm": 2.2137214503057017, "learning_rate": 5.100625258559841e-06, "logits/chosen": -0.09644006192684174, "logits/rejected": -0.12039525806903839, "logps/chosen": -0.3201032280921936, "logps/rejected": -2.2959609031677246, "loss": 0.3397, "odds_ratio_loss": 0.1471930742263794, "rewards/accuracies": 1.0, "rewards/chosen": -0.03201032429933548, "rewards/margins": 0.19758576154708862, "rewards/rejected": -0.2295960783958435, "sft_loss": 0.3201032280921936, "step": 1764 }, { "epoch": 2.552422270426609, "grad_norm": 2.450947199520994, "learning_rate": 5.097638940309389e-06, "logits/chosen": -0.32353049516677856, "logits/rejected": -0.3000898063182831, "logps/chosen": -0.41360723972320557, "logps/rejected": -2.5127758979797363, "loss": 0.3678, "odds_ratio_loss": 0.15336181223392487, "rewards/accuracies": 1.0, "rewards/chosen": -0.041360728442668915, "rewards/margins": 0.2099168598651886, "rewards/rejected": -0.2512775957584381, "sft_loss": 0.41360723972320557, "step": 1765 }, { "epoch": 2.5538684020245843, "grad_norm": 2.4017350682281604, "learning_rate": 5.094651960288332e-06, "logits/chosen": -0.1385897994041443, "logits/rejected": -0.19996079802513123, "logps/chosen": -0.21565288305282593, "logps/rejected": -4.9466094970703125, "loss": 0.3049, "odds_ratio_loss": 0.032502688467502594, "rewards/accuracies": 1.0, "rewards/chosen": -0.02156529203057289, "rewards/margins": 0.4730956256389618, "rewards/rejected": -0.494660884141922, "sft_loss": 0.21565288305282593, "step": 1766 }, { "epoch": 2.5553145336225596, "grad_norm": 2.5067478353690413, "learning_rate": 5.0916643202975305e-06, "logits/chosen": -0.20666462182998657, "logits/rejected": -0.2589164972305298, "logps/chosen": -0.3715488612651825, "logps/rejected": -2.179995059967041, "loss": 0.441, "odds_ratio_loss": 0.16341277956962585, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03715488687157631, "rewards/margins": 0.18084463477134705, "rewards/rejected": -0.21799951791763306, "sft_loss": 0.3715488612651825, "step": 1767 }, { "epoch": 2.5567606652205352, "grad_norm": 2.0202885964031836, "learning_rate": 5.088676022138245e-06, "logits/chosen": -0.23196789622306824, "logits/rejected": -0.19383683800697327, "logps/chosen": -0.24499648809432983, "logps/rejected": -4.9586405754089355, "loss": 0.259, "odds_ratio_loss": 0.10917812585830688, "rewards/accuracies": 1.0, "rewards/chosen": -0.024499651044607162, "rewards/margins": 0.47136440873146057, "rewards/rejected": -0.49586403369903564, "sft_loss": 0.24499648809432983, "step": 1768 }, { "epoch": 2.5582067968185105, "grad_norm": 2.1272904237171466, "learning_rate": 5.0856870676121304e-06, "logits/chosen": -0.23364225029945374, "logits/rejected": -0.2448486089706421, "logps/chosen": -0.36238884925842285, "logps/rejected": -3.3999645709991455, "loss": 0.3577, "odds_ratio_loss": 0.18717606365680695, "rewards/accuracies": 0.9375, "rewards/chosen": -0.036238886415958405, "rewards/margins": 0.30375754833221436, "rewards/rejected": -0.33999642729759216, "sft_loss": 0.36238884925842285, "step": 1769 }, { "epoch": 2.5596529284164857, "grad_norm": 2.284375817275679, "learning_rate": 5.082697458521241e-06, "logits/chosen": -0.25109994411468506, "logits/rejected": -0.22882197797298431, "logps/chosen": -0.40674877166748047, "logps/rejected": -1.9313181638717651, "loss": 0.398, "odds_ratio_loss": 0.10357539355754852, "rewards/accuracies": 1.0, "rewards/chosen": -0.040674880146980286, "rewards/margins": 0.15245693922042847, "rewards/rejected": -0.19313181936740875, "sft_loss": 0.40674877166748047, "step": 1770 }, { "epoch": 2.5610990600144614, "grad_norm": 2.4387901998478596, "learning_rate": 5.079707196668019e-06, "logits/chosen": -0.26109209656715393, "logits/rejected": -0.16213780641555786, "logps/chosen": -0.3069310784339905, "logps/rejected": -5.687686920166016, "loss": 0.312, "odds_ratio_loss": 0.09986848384141922, "rewards/accuracies": 1.0, "rewards/chosen": -0.030693108215928078, "rewards/margins": 0.5380756258964539, "rewards/rejected": -0.5687687397003174, "sft_loss": 0.3069310784339905, "step": 1771 }, { "epoch": 2.5625451916124367, "grad_norm": 2.6215024856040445, "learning_rate": 5.076716283855309e-06, "logits/chosen": -0.1803428828716278, "logits/rejected": -0.22443482279777527, "logps/chosen": -0.3120989203453064, "logps/rejected": -2.8625402450561523, "loss": 0.3838, "odds_ratio_loss": 0.11180431395769119, "rewards/accuracies": 1.0, "rewards/chosen": -0.03120989166200161, "rewards/margins": 0.2550441324710846, "rewards/rejected": -0.28625404834747314, "sft_loss": 0.3120989203453064, "step": 1772 }, { "epoch": 2.563991323210412, "grad_norm": 4.162773344211327, "learning_rate": 5.073724721886341e-06, "logits/chosen": -0.2028559148311615, "logits/rejected": -0.3784460425376892, "logps/chosen": -0.37769874930381775, "logps/rejected": -3.5124316215515137, "loss": 0.3303, "odds_ratio_loss": 0.12656457722187042, "rewards/accuracies": 0.9375, "rewards/chosen": -0.037769876420497894, "rewards/margins": 0.31347325444221497, "rewards/rejected": -0.35124313831329346, "sft_loss": 0.37769874930381775, "step": 1773 }, { "epoch": 2.5654374548083876, "grad_norm": 3.3997712168625416, "learning_rate": 5.0707325125647395e-06, "logits/chosen": -0.26617467403411865, "logits/rejected": -0.3048926293849945, "logps/chosen": -0.2930275499820709, "logps/rejected": -4.605068683624268, "loss": 0.3076, "odds_ratio_loss": 0.08594189584255219, "rewards/accuracies": 1.0, "rewards/chosen": -0.02930275723338127, "rewards/margins": 0.4312041103839874, "rewards/rejected": -0.4605068266391754, "sft_loss": 0.2930275499820709, "step": 1774 }, { "epoch": 2.566883586406363, "grad_norm": 2.6963829649064133, "learning_rate": 5.067739657694517e-06, "logits/chosen": -0.2267257273197174, "logits/rejected": -0.24845337867736816, "logps/chosen": -0.40409570932388306, "logps/rejected": -3.9773378372192383, "loss": 0.368, "odds_ratio_loss": 0.11968840658664703, "rewards/accuracies": 1.0, "rewards/chosen": -0.040409572422504425, "rewards/margins": 0.3573242425918579, "rewards/rejected": -0.39773380756378174, "sft_loss": 0.40409570932388306, "step": 1775 }, { "epoch": 2.5683297180043385, "grad_norm": 2.727857096900295, "learning_rate": 5.064746159080079e-06, "logits/chosen": -0.2270076870918274, "logits/rejected": -0.22585347294807434, "logps/chosen": -0.37619680166244507, "logps/rejected": -3.0323472023010254, "loss": 0.4573, "odds_ratio_loss": 0.12163165211677551, "rewards/accuracies": 1.0, "rewards/chosen": -0.03761968016624451, "rewards/margins": 0.2656150460243225, "rewards/rejected": -0.30323469638824463, "sft_loss": 0.37619680166244507, "step": 1776 }, { "epoch": 2.5697758496023138, "grad_norm": 2.065413871507, "learning_rate": 5.061752018526217e-06, "logits/chosen": -0.2135777473449707, "logits/rejected": -0.1215197741985321, "logps/chosen": -0.18856766819953918, "logps/rejected": -5.205729007720947, "loss": 0.2787, "odds_ratio_loss": 0.03695458173751831, "rewards/accuracies": 1.0, "rewards/chosen": -0.018856767565011978, "rewards/margins": 0.501716136932373, "rewards/rejected": -0.5205729007720947, "sft_loss": 0.18856766819953918, "step": 1777 }, { "epoch": 2.5712219812002894, "grad_norm": 3.612771418317987, "learning_rate": 5.058757237838107e-06, "logits/chosen": -0.28231173753738403, "logits/rejected": -0.21534737944602966, "logps/chosen": -0.25970977544784546, "logps/rejected": -3.527226209640503, "loss": 0.3124, "odds_ratio_loss": 0.08613273501396179, "rewards/accuracies": 1.0, "rewards/chosen": -0.025970978662371635, "rewards/margins": 0.3267516493797302, "rewards/rejected": -0.3527226448059082, "sft_loss": 0.25970977544784546, "step": 1778 }, { "epoch": 2.5726681127982647, "grad_norm": 2.3176478326208265, "learning_rate": 5.0557618188213155e-06, "logits/chosen": -0.22027641534805298, "logits/rejected": -0.19152703881263733, "logps/chosen": -0.2974584400653839, "logps/rejected": -4.103922367095947, "loss": 0.3498, "odds_ratio_loss": 0.09893074631690979, "rewards/accuracies": 1.0, "rewards/chosen": -0.02974584326148033, "rewards/margins": 0.3806464374065399, "rewards/rejected": -0.41039225459098816, "sft_loss": 0.2974584400653839, "step": 1779 }, { "epoch": 2.57411424439624, "grad_norm": 2.6204193220471637, "learning_rate": 5.052765763281792e-06, "logits/chosen": -0.10732519626617432, "logits/rejected": -0.1166638433933258, "logps/chosen": -0.3432236313819885, "logps/rejected": -4.708727836608887, "loss": 0.3735, "odds_ratio_loss": 0.1429092288017273, "rewards/accuracies": 0.875, "rewards/chosen": -0.03432236239314079, "rewards/margins": 0.43655040860176086, "rewards/rejected": -0.47087281942367554, "sft_loss": 0.3432236313819885, "step": 1780 }, { "epoch": 2.5755603759942156, "grad_norm": 2.364122545033094, "learning_rate": 5.049769073025869e-06, "logits/chosen": -0.2854674458503723, "logits/rejected": -0.20616787672042847, "logps/chosen": -0.365227073431015, "logps/rejected": -3.223012924194336, "loss": 0.3999, "odds_ratio_loss": 0.14509882032871246, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03652270510792732, "rewards/margins": 0.28577861189842224, "rewards/rejected": -0.32230129837989807, "sft_loss": 0.365227073431015, "step": 1781 }, { "epoch": 2.577006507592191, "grad_norm": 2.3993429794021894, "learning_rate": 5.046771749860261e-06, "logits/chosen": -0.18015708029270172, "logits/rejected": -0.12893535196781158, "logps/chosen": -0.32794156670570374, "logps/rejected": -3.0708723068237305, "loss": 0.361, "odds_ratio_loss": 0.0962129533290863, "rewards/accuracies": 1.0, "rewards/chosen": -0.03279416263103485, "rewards/margins": 0.27429306507110596, "rewards/rejected": -0.307087242603302, "sft_loss": 0.32794156670570374, "step": 1782 }, { "epoch": 2.578452639190166, "grad_norm": 2.244853886626483, "learning_rate": 5.0437737955920665e-06, "logits/chosen": -0.13234691321849823, "logits/rejected": -0.19031468033790588, "logps/chosen": -0.4550319015979767, "logps/rejected": -2.549581527709961, "loss": 0.3881, "odds_ratio_loss": 0.12124250829219818, "rewards/accuracies": 1.0, "rewards/chosen": -0.04550319164991379, "rewards/margins": 0.2094549834728241, "rewards/rejected": -0.2549581825733185, "sft_loss": 0.4550319015979767, "step": 1783 }, { "epoch": 2.579898770788142, "grad_norm": 2.451189109939498, "learning_rate": 5.040775212028764e-06, "logits/chosen": -0.19473308324813843, "logits/rejected": -0.3043419122695923, "logps/chosen": -0.41355323791503906, "logps/rejected": -3.423119068145752, "loss": 0.4049, "odds_ratio_loss": 0.20922225713729858, "rewards/accuracies": 0.875, "rewards/chosen": -0.041355326771736145, "rewards/margins": 0.3009566068649292, "rewards/rejected": -0.34231194853782654, "sft_loss": 0.41355323791503906, "step": 1784 }, { "epoch": 2.581344902386117, "grad_norm": 2.176181910862176, "learning_rate": 5.03777600097821e-06, "logits/chosen": -0.19760388135910034, "logits/rejected": -0.17856121063232422, "logps/chosen": -0.3006751537322998, "logps/rejected": -3.3096654415130615, "loss": 0.2758, "odds_ratio_loss": 0.10000480711460114, "rewards/accuracies": 1.0, "rewards/chosen": -0.03006751835346222, "rewards/margins": 0.30089902877807617, "rewards/rejected": -0.330966591835022, "sft_loss": 0.3006751537322998, "step": 1785 }, { "epoch": 2.5827910339840927, "grad_norm": 2.485935727916885, "learning_rate": 5.034776164248639e-06, "logits/chosen": -0.07469113171100616, "logits/rejected": -0.1526949256658554, "logps/chosen": -0.39017635583877563, "logps/rejected": -3.454286575317383, "loss": 0.3686, "odds_ratio_loss": 0.14497698843479156, "rewards/accuracies": 0.9375, "rewards/chosen": -0.039017632603645325, "rewards/margins": 0.3064110279083252, "rewards/rejected": -0.3454286754131317, "sft_loss": 0.39017635583877563, "step": 1786 }, { "epoch": 2.584237165582068, "grad_norm": 7.36586680246059, "learning_rate": 5.031775703648665e-06, "logits/chosen": -0.15478000044822693, "logits/rejected": -0.13516290485858917, "logps/chosen": -0.469347208738327, "logps/rejected": -2.471134662628174, "loss": 0.35, "odds_ratio_loss": 0.15310752391815186, "rewards/accuracies": 1.0, "rewards/chosen": -0.04693472385406494, "rewards/margins": 0.20017877221107483, "rewards/rejected": -0.24711349606513977, "sft_loss": 0.469347208738327, "step": 1787 }, { "epoch": 2.5856832971800436, "grad_norm": 2.21099952495446, "learning_rate": 5.028774620987278e-06, "logits/chosen": -0.14791055023670197, "logits/rejected": -0.17355845868587494, "logps/chosen": -0.3638818860054016, "logps/rejected": -5.218241214752197, "loss": 0.4219, "odds_ratio_loss": 0.11315981298685074, "rewards/accuracies": 1.0, "rewards/chosen": -0.03638819232583046, "rewards/margins": 0.48543596267700195, "rewards/rejected": -0.5218241214752197, "sft_loss": 0.3638818860054016, "step": 1788 }, { "epoch": 2.587129428778019, "grad_norm": 2.2439650907311957, "learning_rate": 5.025772918073839e-06, "logits/chosen": -0.22446078062057495, "logits/rejected": -0.3045191168785095, "logps/chosen": -0.48371070623397827, "logps/rejected": -1.8402783870697021, "loss": 0.3619, "odds_ratio_loss": 0.21439674496650696, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04837106913328171, "rewards/margins": 0.13565678894519806, "rewards/rejected": -0.18402786552906036, "sft_loss": 0.48371070623397827, "step": 1789 }, { "epoch": 2.588575560375994, "grad_norm": 2.4998273129519264, "learning_rate": 5.0227705967180875e-06, "logits/chosen": -0.2764180302619934, "logits/rejected": -0.19125843048095703, "logps/chosen": -0.33420807123184204, "logps/rejected": -2.7277328968048096, "loss": 0.4544, "odds_ratio_loss": 0.10430102050304413, "rewards/accuracies": 1.0, "rewards/chosen": -0.033420804888010025, "rewards/margins": 0.2393524944782257, "rewards/rejected": -0.2727733254432678, "sft_loss": 0.33420807123184204, "step": 1790 }, { "epoch": 2.59002169197397, "grad_norm": 2.9213180750521617, "learning_rate": 5.019767658730133e-06, "logits/chosen": -0.166767418384552, "logits/rejected": -0.13819444179534912, "logps/chosen": -0.3064895272254944, "logps/rejected": -2.5118443965911865, "loss": 0.3143, "odds_ratio_loss": 0.14545565843582153, "rewards/accuracies": 0.9375, "rewards/chosen": -0.030648954212665558, "rewards/margins": 0.22053547203540802, "rewards/rejected": -0.2511844336986542, "sft_loss": 0.3064895272254944, "step": 1791 }, { "epoch": 2.591467823571945, "grad_norm": 2.3230275123604027, "learning_rate": 5.016764105920462e-06, "logits/chosen": -0.2184886634349823, "logits/rejected": -0.24857985973358154, "logps/chosen": -0.43511849641799927, "logps/rejected": -3.9661684036254883, "loss": 0.4028, "odds_ratio_loss": 0.19005386531352997, "rewards/accuracies": 0.875, "rewards/chosen": -0.04351184517145157, "rewards/margins": 0.35310500860214233, "rewards/rejected": -0.3966168761253357, "sft_loss": 0.43511849641799927, "step": 1792 }, { "epoch": 2.5929139551699203, "grad_norm": 2.3592440154154897, "learning_rate": 5.013759940099921e-06, "logits/chosen": -0.19317913055419922, "logits/rejected": -0.22358053922653198, "logps/chosen": -0.2777489125728607, "logps/rejected": -3.4764034748077393, "loss": 0.3779, "odds_ratio_loss": 0.07792896032333374, "rewards/accuracies": 1.0, "rewards/chosen": -0.02777489274740219, "rewards/margins": 0.3198654353618622, "rewards/rejected": -0.34764033555984497, "sft_loss": 0.2777489125728607, "step": 1793 }, { "epoch": 2.594360086767896, "grad_norm": 2.388230189802143, "learning_rate": 5.010755163079739e-06, "logits/chosen": -0.12977398931980133, "logits/rejected": -0.26473286747932434, "logps/chosen": -0.42577025294303894, "logps/rejected": -2.8500428199768066, "loss": 0.4189, "odds_ratio_loss": 0.19891750812530518, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04257702827453613, "rewards/margins": 0.242427259683609, "rewards/rejected": -0.28500428795814514, "sft_loss": 0.42577025294303894, "step": 1794 }, { "epoch": 2.595806218365871, "grad_norm": 2.622254082892315, "learning_rate": 5.007749776671503e-06, "logits/chosen": -0.282731294631958, "logits/rejected": -0.17344717681407928, "logps/chosen": -0.33832526206970215, "logps/rejected": -2.8043172359466553, "loss": 0.3371, "odds_ratio_loss": 0.10820700973272324, "rewards/accuracies": 1.0, "rewards/chosen": -0.033832527697086334, "rewards/margins": 0.2465991973876953, "rewards/rejected": -0.28043174743652344, "sft_loss": 0.33832526206970215, "step": 1795 }, { "epoch": 2.5972523499638465, "grad_norm": 2.535611097183941, "learning_rate": 5.0047437826871745e-06, "logits/chosen": -0.29366135597229004, "logits/rejected": -0.29001742601394653, "logps/chosen": -0.37644755840301514, "logps/rejected": -2.264528274536133, "loss": 0.3912, "odds_ratio_loss": 0.19424787163734436, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03764475882053375, "rewards/margins": 0.18880808353424072, "rewards/rejected": -0.22645285725593567, "sft_loss": 0.37644755840301514, "step": 1796 }, { "epoch": 2.598698481561822, "grad_norm": 2.46192301019977, "learning_rate": 5.001737182939077e-06, "logits/chosen": -0.32444244623184204, "logits/rejected": -0.18136954307556152, "logps/chosen": -0.4060314893722534, "logps/rejected": -2.0913939476013184, "loss": 0.3662, "odds_ratio_loss": 0.17886869609355927, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0406031496822834, "rewards/margins": 0.1685362458229065, "rewards/rejected": -0.2091393917798996, "sft_loss": 0.4060314893722534, "step": 1797 }, { "epoch": 2.6001446131597974, "grad_norm": 2.4189945586758403, "learning_rate": 4.9987299792399014e-06, "logits/chosen": -0.26979711651802063, "logits/rejected": -0.22070440649986267, "logps/chosen": -0.47620007395744324, "logps/rejected": -4.007668972015381, "loss": 0.4251, "odds_ratio_loss": 0.14132900536060333, "rewards/accuracies": 1.0, "rewards/chosen": -0.047620005905628204, "rewards/margins": 0.35314691066741943, "rewards/rejected": -0.40076693892478943, "sft_loss": 0.47620007395744324, "step": 1798 }, { "epoch": 2.601590744757773, "grad_norm": 2.296226400698763, "learning_rate": 4.995722173402702e-06, "logits/chosen": -0.3933625817298889, "logits/rejected": -0.24747397005558014, "logps/chosen": -0.3423520028591156, "logps/rejected": -2.0690927505493164, "loss": 0.4247, "odds_ratio_loss": 0.1446721851825714, "rewards/accuracies": 1.0, "rewards/chosen": -0.03423519805073738, "rewards/margins": 0.17267407476902008, "rewards/rejected": -0.20690926909446716, "sft_loss": 0.3423520028591156, "step": 1799 }, { "epoch": 2.6030368763557483, "grad_norm": 2.337562593004367, "learning_rate": 4.9927137672408955e-06, "logits/chosen": -0.24255606532096863, "logits/rejected": -0.38033175468444824, "logps/chosen": -0.4118923246860504, "logps/rejected": -3.219302177429199, "loss": 0.3441, "odds_ratio_loss": 0.22141076624393463, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04118923097848892, "rewards/margins": 0.28074100613594055, "rewards/rejected": -0.3219302296638489, "sft_loss": 0.4118923246860504, "step": 1800 }, { "epoch": 2.604483007953724, "grad_norm": 2.4080451586988003, "learning_rate": 4.989704762568262e-06, "logits/chosen": -0.2137508988380432, "logits/rejected": -0.2190382331609726, "logps/chosen": -0.29253143072128296, "logps/rejected": -5.632781982421875, "loss": 0.3804, "odds_ratio_loss": 0.06299417465925217, "rewards/accuracies": 1.0, "rewards/chosen": -0.029253143817186356, "rewards/margins": 0.5340250730514526, "rewards/rejected": -0.5632781982421875, "sft_loss": 0.29253143072128296, "step": 1801 }, { "epoch": 2.6059291395516992, "grad_norm": 2.640139404715311, "learning_rate": 4.986695161198939e-06, "logits/chosen": -0.3997032046318054, "logits/rejected": -0.3515118360519409, "logps/chosen": -0.3487919569015503, "logps/rejected": -3.5432519912719727, "loss": 0.3935, "odds_ratio_loss": 0.08523302525281906, "rewards/accuracies": 1.0, "rewards/chosen": -0.03487919270992279, "rewards/margins": 0.31944602727890015, "rewards/rejected": -0.35432523488998413, "sft_loss": 0.3487919569015503, "step": 1802 }, { "epoch": 2.6073752711496745, "grad_norm": 6.59977106509988, "learning_rate": 4.98368496494743e-06, "logits/chosen": -0.07293814420700073, "logits/rejected": -0.09770108759403229, "logps/chosen": -0.3182450234889984, "logps/rejected": -5.30604362487793, "loss": 0.4086, "odds_ratio_loss": 0.1032043844461441, "rewards/accuracies": 1.0, "rewards/chosen": -0.0318245030939579, "rewards/margins": 0.498779833316803, "rewards/rejected": -0.530604362487793, "sft_loss": 0.3182450234889984, "step": 1803 }, { "epoch": 2.60882140274765, "grad_norm": 2.1346639954836726, "learning_rate": 4.980674175628593e-06, "logits/chosen": -0.3480367660522461, "logits/rejected": -0.2504662573337555, "logps/chosen": -0.270319402217865, "logps/rejected": -4.932693958282471, "loss": 0.3592, "odds_ratio_loss": 0.10022924840450287, "rewards/accuracies": 1.0, "rewards/chosen": -0.02703193947672844, "rewards/margins": 0.46623745560646057, "rewards/rejected": -0.4932693839073181, "sft_loss": 0.270319402217865, "step": 1804 }, { "epoch": 2.6102675343456254, "grad_norm": 2.214914283556716, "learning_rate": 4.977662795057641e-06, "logits/chosen": -0.3243568539619446, "logits/rejected": -0.20184937119483948, "logps/chosen": -0.38584083318710327, "logps/rejected": -4.922272205352783, "loss": 0.4123, "odds_ratio_loss": 0.14743968844413757, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03858408331871033, "rewards/margins": 0.45364317297935486, "rewards/rejected": -0.4922272562980652, "sft_loss": 0.38584083318710327, "step": 1805 }, { "epoch": 2.6117136659436007, "grad_norm": 2.396234219132774, "learning_rate": 4.974650825050149e-06, "logits/chosen": -0.3889038860797882, "logits/rejected": -0.39398396015167236, "logps/chosen": -0.4082852900028229, "logps/rejected": -2.398747682571411, "loss": 0.3867, "odds_ratio_loss": 0.15101824700832367, "rewards/accuracies": 1.0, "rewards/chosen": -0.040828533470630646, "rewards/margins": 0.19904625415802002, "rewards/rejected": -0.23987479507923126, "sft_loss": 0.4082852900028229, "step": 1806 }, { "epoch": 2.6131597975415763, "grad_norm": 2.170326355834699, "learning_rate": 4.971638267422046e-06, "logits/chosen": -0.45325085520744324, "logits/rejected": -0.3461891710758209, "logps/chosen": -0.3370608687400818, "logps/rejected": -3.079336166381836, "loss": 0.3023, "odds_ratio_loss": 0.09138752520084381, "rewards/accuracies": 1.0, "rewards/chosen": -0.03370608389377594, "rewards/margins": 0.2742275297641754, "rewards/rejected": -0.30793359875679016, "sft_loss": 0.3370608687400818, "step": 1807 }, { "epoch": 2.6146059291395516, "grad_norm": 2.4688817277696695, "learning_rate": 4.968625123989612e-06, "logits/chosen": -0.28213053941726685, "logits/rejected": -0.28659752011299133, "logps/chosen": -0.29719579219818115, "logps/rejected": -3.136472225189209, "loss": 0.3744, "odds_ratio_loss": 0.08925367891788483, "rewards/accuracies": 1.0, "rewards/chosen": -0.029719576239585876, "rewards/margins": 0.28392767906188965, "rewards/rejected": -0.31364724040031433, "sft_loss": 0.29719579219818115, "step": 1808 }, { "epoch": 2.6160520607375273, "grad_norm": 2.7610315458080086, "learning_rate": 4.965611396569483e-06, "logits/chosen": -0.25537610054016113, "logits/rejected": -0.23609262704849243, "logps/chosen": -0.4210742712020874, "logps/rejected": -3.389645576477051, "loss": 0.4538, "odds_ratio_loss": 0.15438979864120483, "rewards/accuracies": 1.0, "rewards/chosen": -0.04210742563009262, "rewards/margins": 0.2968571186065674, "rewards/rejected": -0.3389645218849182, "sft_loss": 0.4210742712020874, "step": 1809 }, { "epoch": 2.6174981923355025, "grad_norm": 2.3541496517196685, "learning_rate": 4.962597086978646e-06, "logits/chosen": -0.16515159606933594, "logits/rejected": -0.14388681948184967, "logps/chosen": -0.48628005385398865, "logps/rejected": -4.436060905456543, "loss": 0.3932, "odds_ratio_loss": 0.19803836941719055, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04862800985574722, "rewards/margins": 0.39497804641723633, "rewards/rejected": -0.44360604882240295, "sft_loss": 0.48628005385398865, "step": 1810 }, { "epoch": 2.618944323933478, "grad_norm": 2.4441155908403838, "learning_rate": 4.959582197034442e-06, "logits/chosen": -0.2943183183670044, "logits/rejected": -0.2773338258266449, "logps/chosen": -0.35410210490226746, "logps/rejected": -4.187530994415283, "loss": 0.311, "odds_ratio_loss": 0.07550371438264847, "rewards/accuracies": 1.0, "rewards/chosen": -0.035410210490226746, "rewards/margins": 0.3833428919315338, "rewards/rejected": -0.41875308752059937, "sft_loss": 0.35410210490226746, "step": 1811 }, { "epoch": 2.6203904555314534, "grad_norm": 11.302558835724303, "learning_rate": 4.956566728554556e-06, "logits/chosen": -0.1322237253189087, "logits/rejected": -0.163202166557312, "logps/chosen": -0.34461265802383423, "logps/rejected": -4.301257133483887, "loss": 0.4088, "odds_ratio_loss": 0.13887152075767517, "rewards/accuracies": 1.0, "rewards/chosen": -0.03446126729249954, "rewards/margins": 0.3956645131111145, "rewards/rejected": -0.43012574315071106, "sft_loss": 0.34461265802383423, "step": 1812 }, { "epoch": 2.6218365871294287, "grad_norm": 2.472796208481264, "learning_rate": 4.953550683357027e-06, "logits/chosen": -0.27975982427597046, "logits/rejected": -0.222725510597229, "logps/chosen": -0.3341180086135864, "logps/rejected": -2.5925729274749756, "loss": 0.3509, "odds_ratio_loss": 0.14651554822921753, "rewards/accuracies": 1.0, "rewards/chosen": -0.03341180086135864, "rewards/margins": 0.22584550082683563, "rewards/rejected": -0.25925731658935547, "sft_loss": 0.3341180086135864, "step": 1813 }, { "epoch": 2.6232827187274044, "grad_norm": 2.7613022352029306, "learning_rate": 4.95053406326024e-06, "logits/chosen": -0.28327706456184387, "logits/rejected": -0.2542087435722351, "logps/chosen": -0.3347577452659607, "logps/rejected": -2.825500011444092, "loss": 0.3515, "odds_ratio_loss": 0.17398758232593536, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03347577154636383, "rewards/margins": 0.24907422065734863, "rewards/rejected": -0.28255000710487366, "sft_loss": 0.3347577452659607, "step": 1814 }, { "epoch": 2.6247288503253796, "grad_norm": 3.439891250550689, "learning_rate": 4.947516870082926e-06, "logits/chosen": -0.31636762619018555, "logits/rejected": -0.2951328456401825, "logps/chosen": -0.3941154479980469, "logps/rejected": -3.2448151111602783, "loss": 0.3978, "odds_ratio_loss": 0.15645679831504822, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03941154479980469, "rewards/margins": 0.28506994247436523, "rewards/rejected": -0.32448145747184753, "sft_loss": 0.3941154479980469, "step": 1815 }, { "epoch": 2.626174981923355, "grad_norm": 2.899928062378467, "learning_rate": 4.944499105644163e-06, "logits/chosen": -0.3948509991168976, "logits/rejected": -0.2616193890571594, "logps/chosen": -0.4997680187225342, "logps/rejected": -2.383761405944824, "loss": 0.3824, "odds_ratio_loss": 0.12019702047109604, "rewards/accuracies": 1.0, "rewards/chosen": -0.04997680336236954, "rewards/margins": 0.18839934468269348, "rewards/rejected": -0.23837614059448242, "sft_loss": 0.4997680187225342, "step": 1816 }, { "epoch": 2.6276211135213305, "grad_norm": 2.326957425313526, "learning_rate": 4.94148077176337e-06, "logits/chosen": -0.3444955348968506, "logits/rejected": -0.4002199172973633, "logps/chosen": -0.2849689722061157, "logps/rejected": -4.096587657928467, "loss": 0.2927, "odds_ratio_loss": 0.10544468462467194, "rewards/accuracies": 1.0, "rewards/chosen": -0.028496896848082542, "rewards/margins": 0.3811618983745575, "rewards/rejected": -0.4096587598323822, "sft_loss": 0.2849689722061157, "step": 1817 }, { "epoch": 2.629067245119306, "grad_norm": 3.583265711689172, "learning_rate": 4.938461870260314e-06, "logits/chosen": -0.31613588333129883, "logits/rejected": -0.24397434294223785, "logps/chosen": -0.352634072303772, "logps/rejected": -4.161520957946777, "loss": 0.3471, "odds_ratio_loss": 0.0841374322772026, "rewards/accuracies": 1.0, "rewards/chosen": -0.03526340797543526, "rewards/margins": 0.3808886408805847, "rewards/rejected": -0.41615208983421326, "sft_loss": 0.352634072303772, "step": 1818 }, { "epoch": 2.630513376717281, "grad_norm": 2.1560310075706046, "learning_rate": 4.9354424029551005e-06, "logits/chosen": -0.43916022777557373, "logits/rejected": -0.18733128905296326, "logps/chosen": -0.25644809007644653, "logps/rejected": -2.241384267807007, "loss": 0.3276, "odds_ratio_loss": 0.112449511885643, "rewards/accuracies": 1.0, "rewards/chosen": -0.025644807144999504, "rewards/margins": 0.1984936147928238, "rewards/rejected": -0.22413842380046844, "sft_loss": 0.25644809007644653, "step": 1819 }, { "epoch": 2.6319595083152567, "grad_norm": 2.445334001860914, "learning_rate": 4.93242237166818e-06, "logits/chosen": -0.257645845413208, "logits/rejected": -0.2424604594707489, "logps/chosen": -0.39798110723495483, "logps/rejected": -4.620807647705078, "loss": 0.3767, "odds_ratio_loss": 0.09533874690532684, "rewards/accuracies": 1.0, "rewards/chosen": -0.03979811072349548, "rewards/margins": 0.4222826063632965, "rewards/rejected": -0.462080717086792, "sft_loss": 0.39798110723495483, "step": 1820 }, { "epoch": 2.633405639913232, "grad_norm": 2.917112963148812, "learning_rate": 4.929401778220337e-06, "logits/chosen": -0.41579800844192505, "logits/rejected": -0.2872324585914612, "logps/chosen": -0.2373175173997879, "logps/rejected": -5.390769004821777, "loss": 0.3582, "odds_ratio_loss": 0.05972566083073616, "rewards/accuracies": 1.0, "rewards/chosen": -0.02373175323009491, "rewards/margins": 0.5153451561927795, "rewards/rejected": -0.5390769243240356, "sft_loss": 0.2373175173997879, "step": 1821 }, { "epoch": 2.6348517715112076, "grad_norm": 2.2456614622953714, "learning_rate": 4.926380624432701e-06, "logits/chosen": -0.19419153034687042, "logits/rejected": -0.18315161764621735, "logps/chosen": -0.46090608835220337, "logps/rejected": -3.4876163005828857, "loss": 0.3824, "odds_ratio_loss": 0.1783159077167511, "rewards/accuracies": 0.9375, "rewards/chosen": -0.046090610325336456, "rewards/margins": 0.30267101526260376, "rewards/rejected": -0.3487616181373596, "sft_loss": 0.46090608835220337, "step": 1822 }, { "epoch": 2.636297903109183, "grad_norm": 2.510652257091109, "learning_rate": 4.923358912126737e-06, "logits/chosen": -0.20722374320030212, "logits/rejected": -0.10644985735416412, "logps/chosen": -0.4585767388343811, "logps/rejected": -4.008731842041016, "loss": 0.4682, "odds_ratio_loss": 0.17287862300872803, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04585767537355423, "rewards/margins": 0.35501551628112793, "rewards/rejected": -0.40087318420410156, "sft_loss": 0.4585767388343811, "step": 1823 }, { "epoch": 2.6377440347071586, "grad_norm": 3.897456193625065, "learning_rate": 4.920336643124245e-06, "logits/chosen": -0.038326654583215714, "logits/rejected": -0.1093643456697464, "logps/chosen": -0.27784785628318787, "logps/rejected": -4.6210737228393555, "loss": 0.3785, "odds_ratio_loss": 0.08139775693416595, "rewards/accuracies": 0.9375, "rewards/chosen": -0.027784785255789757, "rewards/margins": 0.4343225955963135, "rewards/rejected": -0.4621073603630066, "sft_loss": 0.27784785628318787, "step": 1824 }, { "epoch": 2.639190166305134, "grad_norm": 2.560995261737236, "learning_rate": 4.917313819247363e-06, "logits/chosen": -0.1732892543077469, "logits/rejected": -0.25222983956336975, "logps/chosen": -0.3237385153770447, "logps/rejected": -2.947514295578003, "loss": 0.3709, "odds_ratio_loss": 0.16104400157928467, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03237384930253029, "rewards/margins": 0.2623775899410248, "rewards/rejected": -0.29475143551826477, "sft_loss": 0.3237385153770447, "step": 1825 }, { "epoch": 2.640636297903109, "grad_norm": 2.765067172884269, "learning_rate": 4.914290442318564e-06, "logits/chosen": -0.31066641211509705, "logits/rejected": -0.15391992032527924, "logps/chosen": -0.32306790351867676, "logps/rejected": -2.4956536293029785, "loss": 0.3796, "odds_ratio_loss": 0.11629980057477951, "rewards/accuracies": 1.0, "rewards/chosen": -0.032306790351867676, "rewards/margins": 0.21725855767726898, "rewards/rejected": -0.24956536293029785, "sft_loss": 0.32306790351867676, "step": 1826 }, { "epoch": 2.6420824295010847, "grad_norm": 2.2647017949805015, "learning_rate": 4.911266514160652e-06, "logits/chosen": -0.23672957718372345, "logits/rejected": -0.21628616750240326, "logps/chosen": -0.3888474106788635, "logps/rejected": -3.2323203086853027, "loss": 0.3573, "odds_ratio_loss": 0.22670409083366394, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03888474404811859, "rewards/margins": 0.2843472957611084, "rewards/rejected": -0.3232320547103882, "sft_loss": 0.3888474106788635, "step": 1827 }, { "epoch": 2.64352856109906, "grad_norm": 2.2611203101933013, "learning_rate": 4.908242036596764e-06, "logits/chosen": -0.1369129866361618, "logits/rejected": -0.2426975816488266, "logps/chosen": -0.4664827883243561, "logps/rejected": -2.282384157180786, "loss": 0.3863, "odds_ratio_loss": 0.2692616283893585, "rewards/accuracies": 1.0, "rewards/chosen": -0.04664827883243561, "rewards/margins": 0.18159013986587524, "rewards/rejected": -0.22823843359947205, "sft_loss": 0.4664827883243561, "step": 1828 }, { "epoch": 2.6449746926970352, "grad_norm": 3.4362278755855487, "learning_rate": 4.905217011450371e-06, "logits/chosen": -0.2658514976501465, "logits/rejected": -0.13316062092781067, "logps/chosen": -0.42111894488334656, "logps/rejected": -3.255000591278076, "loss": 0.3749, "odds_ratio_loss": 0.09841261804103851, "rewards/accuracies": 1.0, "rewards/chosen": -0.042111895978450775, "rewards/margins": 0.2833881676197052, "rewards/rejected": -0.3255000710487366, "sft_loss": 0.42111894488334656, "step": 1829 }, { "epoch": 2.646420824295011, "grad_norm": 2.5230284188198855, "learning_rate": 4.902191440545269e-06, "logits/chosen": -0.1730635017156601, "logits/rejected": -0.08948790282011032, "logps/chosen": -0.35513409972190857, "logps/rejected": -2.901704788208008, "loss": 0.428, "odds_ratio_loss": 0.09851216524839401, "rewards/accuracies": 1.0, "rewards/chosen": -0.03551340475678444, "rewards/margins": 0.2546570599079132, "rewards/rejected": -0.29017049074172974, "sft_loss": 0.35513409972190857, "step": 1830 }, { "epoch": 2.647866955892986, "grad_norm": 4.862374325559571, "learning_rate": 4.899165325705588e-06, "logits/chosen": -0.11161284148693085, "logits/rejected": -0.31778573989868164, "logps/chosen": -0.2354210615158081, "logps/rejected": -4.787618637084961, "loss": 0.3417, "odds_ratio_loss": 0.1100718304514885, "rewards/accuracies": 1.0, "rewards/chosen": -0.02354210615158081, "rewards/margins": 0.45521974563598633, "rewards/rejected": -0.4787618815898895, "sft_loss": 0.2354210615158081, "step": 1831 }, { "epoch": 2.649313087490962, "grad_norm": 2.260787046159271, "learning_rate": 4.896138668755783e-06, "logits/chosen": -0.09058425575494766, "logits/rejected": 0.016488205641508102, "logps/chosen": -0.2910672724246979, "logps/rejected": -2.503805637359619, "loss": 0.3102, "odds_ratio_loss": 0.15633143484592438, "rewards/accuracies": 1.0, "rewards/chosen": -0.029106728732585907, "rewards/margins": 0.22127383947372437, "rewards/rejected": -0.25038057565689087, "sft_loss": 0.2910672724246979, "step": 1832 }, { "epoch": 2.650759219088937, "grad_norm": 2.1646837648987343, "learning_rate": 4.893111471520637e-06, "logits/chosen": -0.2946851849555969, "logits/rejected": -0.3171956539154053, "logps/chosen": -0.4242430329322815, "logps/rejected": -4.413708209991455, "loss": 0.3374, "odds_ratio_loss": 0.13221263885498047, "rewards/accuracies": 1.0, "rewards/chosen": -0.04242429882287979, "rewards/margins": 0.39894652366638184, "rewards/rejected": -0.4413708448410034, "sft_loss": 0.4242430329322815, "step": 1833 }, { "epoch": 2.6522053506869128, "grad_norm": 2.3527904907517105, "learning_rate": 4.890083735825257e-06, "logits/chosen": -0.12634724378585815, "logits/rejected": -0.18269193172454834, "logps/chosen": -0.3488408625125885, "logps/rejected": -3.032761573791504, "loss": 0.3689, "odds_ratio_loss": 0.16602487862110138, "rewards/accuracies": 1.0, "rewards/chosen": -0.03488408774137497, "rewards/margins": 0.26839208602905273, "rewards/rejected": -0.3032761812210083, "sft_loss": 0.3488408625125885, "step": 1834 }, { "epoch": 2.653651482284888, "grad_norm": 2.332969487913798, "learning_rate": 4.88705546349508e-06, "logits/chosen": -0.41772475838661194, "logits/rejected": -0.2163325548171997, "logps/chosen": -0.31502100825309753, "logps/rejected": -3.173156976699829, "loss": 0.3645, "odds_ratio_loss": 0.10136070847511292, "rewards/accuracies": 1.0, "rewards/chosen": -0.03150210529565811, "rewards/margins": 0.2858135998249054, "rewards/rejected": -0.3173157274723053, "sft_loss": 0.31502100825309753, "step": 1835 }, { "epoch": 2.6550976138828633, "grad_norm": 2.353448236079133, "learning_rate": 4.884026656355859e-06, "logits/chosen": -0.16601859033107758, "logits/rejected": -0.1426706314086914, "logps/chosen": -0.29813799262046814, "logps/rejected": -3.2178196907043457, "loss": 0.371, "odds_ratio_loss": 0.07204075902700424, "rewards/accuracies": 1.0, "rewards/chosen": -0.029813801869750023, "rewards/margins": 0.2919681668281555, "rewards/rejected": -0.3217819929122925, "sft_loss": 0.29813799262046814, "step": 1836 }, { "epoch": 2.656543745480839, "grad_norm": 2.5198880328260858, "learning_rate": 4.880997316233675e-06, "logits/chosen": -0.08051356673240662, "logits/rejected": -0.03858156129717827, "logps/chosen": -0.27724963426589966, "logps/rejected": -2.3203272819519043, "loss": 0.3583, "odds_ratio_loss": 0.1639135181903839, "rewards/accuracies": 0.875, "rewards/chosen": -0.027724966406822205, "rewards/margins": 0.20430777966976166, "rewards/rejected": -0.23203276097774506, "sft_loss": 0.27724963426589966, "step": 1837 }, { "epoch": 2.657989877078814, "grad_norm": 2.2869035506328763, "learning_rate": 4.877967444954928e-06, "logits/chosen": -0.11426656693220139, "logits/rejected": -0.21190780401229858, "logps/chosen": -0.27625131607055664, "logps/rejected": -3.109833002090454, "loss": 0.3755, "odds_ratio_loss": 0.08180101215839386, "rewards/accuracies": 1.0, "rewards/chosen": -0.027625130489468575, "rewards/margins": 0.2833581864833832, "rewards/rejected": -0.3109833002090454, "sft_loss": 0.27625131607055664, "step": 1838 }, { "epoch": 2.6594360086767894, "grad_norm": 2.057596946172654, "learning_rate": 4.874937044346338e-06, "logits/chosen": -0.18351957201957703, "logits/rejected": -0.19910424947738647, "logps/chosen": -0.3876161575317383, "logps/rejected": -4.693860054016113, "loss": 0.3362, "odds_ratio_loss": 0.13495394587516785, "rewards/accuracies": 1.0, "rewards/chosen": -0.03876161575317383, "rewards/margins": 0.43062442541122437, "rewards/rejected": -0.4693860709667206, "sft_loss": 0.3876161575317383, "step": 1839 }, { "epoch": 2.660882140274765, "grad_norm": 2.296001222551746, "learning_rate": 4.871906116234946e-06, "logits/chosen": -0.23469124734401703, "logits/rejected": -0.22504733502864838, "logps/chosen": -0.2928099036216736, "logps/rejected": -5.243224143981934, "loss": 0.342, "odds_ratio_loss": 0.0769428163766861, "rewards/accuracies": 1.0, "rewards/chosen": -0.029280992224812508, "rewards/margins": 0.49504148960113525, "rewards/rejected": -0.524322509765625, "sft_loss": 0.2928099036216736, "step": 1840 }, { "epoch": 2.6623282718727403, "grad_norm": 2.686575716206401, "learning_rate": 4.868874662448108e-06, "logits/chosen": -0.14365555346012115, "logits/rejected": -0.13080452382564545, "logps/chosen": -0.29430562257766724, "logps/rejected": -4.659070014953613, "loss": 0.3909, "odds_ratio_loss": 0.09934721887111664, "rewards/accuracies": 1.0, "rewards/chosen": -0.029430562630295753, "rewards/margins": 0.4364764392375946, "rewards/rejected": -0.4659070074558258, "sft_loss": 0.29430562257766724, "step": 1841 }, { "epoch": 2.6637744034707156, "grad_norm": 2.3024896264526964, "learning_rate": 4.865842684813501e-06, "logits/chosen": -0.14090366661548615, "logits/rejected": -0.2288772016763687, "logps/chosen": -0.13625164330005646, "logps/rejected": -4.667929649353027, "loss": 0.2728, "odds_ratio_loss": 0.05977560207247734, "rewards/accuracies": 1.0, "rewards/chosen": -0.01362516451627016, "rewards/margins": 0.45316773653030396, "rewards/rejected": -0.46679291129112244, "sft_loss": 0.13625164330005646, "step": 1842 }, { "epoch": 2.6652205350686913, "grad_norm": 2.5026158538150822, "learning_rate": 4.862810185159115e-06, "logits/chosen": -0.12085071206092834, "logits/rejected": -0.1948763132095337, "logps/chosen": -0.3157857656478882, "logps/rejected": -4.3117899894714355, "loss": 0.312, "odds_ratio_loss": 0.12068923562765121, "rewards/accuracies": 1.0, "rewards/chosen": -0.03157857805490494, "rewards/margins": 0.39960044622421265, "rewards/rejected": -0.431179016828537, "sft_loss": 0.3157857656478882, "step": 1843 }, { "epoch": 2.6666666666666665, "grad_norm": 2.42599710631053, "learning_rate": 4.859777165313254e-06, "logits/chosen": -0.06984113156795502, "logits/rejected": -0.11556413769721985, "logps/chosen": -0.26944583654403687, "logps/rejected": -5.107959747314453, "loss": 0.3727, "odds_ratio_loss": 0.06308241933584213, "rewards/accuracies": 1.0, "rewards/chosen": -0.026944583281874657, "rewards/margins": 0.4838513433933258, "rewards/rejected": -0.5107959508895874, "sft_loss": 0.26944583654403687, "step": 1844 }, { "epoch": 2.668112798264642, "grad_norm": 2.362337955779663, "learning_rate": 4.856743627104538e-06, "logits/chosen": -0.13892969489097595, "logits/rejected": -0.2013864815235138, "logps/chosen": -0.4483214020729065, "logps/rejected": -4.599874496459961, "loss": 0.3419, "odds_ratio_loss": 0.1677556186914444, "rewards/accuracies": 1.0, "rewards/chosen": -0.04483213648200035, "rewards/margins": 0.4151553511619568, "rewards/rejected": -0.45998746156692505, "sft_loss": 0.4483214020729065, "step": 1845 }, { "epoch": 2.6695589298626174, "grad_norm": 2.174536905457461, "learning_rate": 4.8537095723618984e-06, "logits/chosen": -0.3669096529483795, "logits/rejected": -0.22529584169387817, "logps/chosen": -0.33727243542671204, "logps/rejected": -4.307671070098877, "loss": 0.3986, "odds_ratio_loss": 0.07581018656492233, "rewards/accuracies": 1.0, "rewards/chosen": -0.033727243542671204, "rewards/margins": 0.39703983068466187, "rewards/rejected": -0.43076711893081665, "sft_loss": 0.33727243542671204, "step": 1846 }, { "epoch": 2.671005061460593, "grad_norm": 2.2127792507009985, "learning_rate": 4.850675002914579e-06, "logits/chosen": -0.23622475564479828, "logits/rejected": -0.3096325397491455, "logps/chosen": -0.377718448638916, "logps/rejected": -2.8641698360443115, "loss": 0.372, "odds_ratio_loss": 0.1540893316268921, "rewards/accuracies": 1.0, "rewards/chosen": -0.03777184337377548, "rewards/margins": 0.24864515662193298, "rewards/rejected": -0.28641700744628906, "sft_loss": 0.377718448638916, "step": 1847 }, { "epoch": 2.6724511930585684, "grad_norm": 2.641085873459316, "learning_rate": 4.847639920592131e-06, "logits/chosen": -0.4159122705459595, "logits/rejected": -0.47509440779685974, "logps/chosen": -0.4206668734550476, "logps/rejected": -3.3233633041381836, "loss": 0.4468, "odds_ratio_loss": 0.1505662351846695, "rewards/accuracies": 1.0, "rewards/chosen": -0.04206668958067894, "rewards/margins": 0.290269672870636, "rewards/rejected": -0.3323363661766052, "sft_loss": 0.4206668734550476, "step": 1848 }, { "epoch": 2.6738973246565436, "grad_norm": 2.5318611255942773, "learning_rate": 4.8446043272244174e-06, "logits/chosen": -0.2710360586643219, "logits/rejected": -0.3191642165184021, "logps/chosen": -0.43725037574768066, "logps/rejected": -3.675550699234009, "loss": 0.3333, "odds_ratio_loss": 0.16352544724941254, "rewards/accuracies": 1.0, "rewards/chosen": -0.04372503608465195, "rewards/margins": 0.3238300681114197, "rewards/rejected": -0.36755508184432983, "sft_loss": 0.43725037574768066, "step": 1849 }, { "epoch": 2.6753434562545193, "grad_norm": 1.9803532713734116, "learning_rate": 4.841568224641611e-06, "logits/chosen": -0.2555657923221588, "logits/rejected": -0.21312148869037628, "logps/chosen": -0.3199189603328705, "logps/rejected": -4.294778823852539, "loss": 0.353, "odds_ratio_loss": 0.11003275960683823, "rewards/accuracies": 1.0, "rewards/chosen": -0.03199189528822899, "rewards/margins": 0.39748603105545044, "rewards/rejected": -0.4294779300689697, "sft_loss": 0.3199189603328705, "step": 1850 }, { "epoch": 2.6767895878524945, "grad_norm": 2.3395839403914476, "learning_rate": 4.838531614674187e-06, "logits/chosen": -0.12954504787921906, "logits/rejected": -0.13004763424396515, "logps/chosen": -0.3799174427986145, "logps/rejected": -2.644596576690674, "loss": 0.3671, "odds_ratio_loss": 0.11590912193059921, "rewards/accuracies": 1.0, "rewards/chosen": -0.03799174726009369, "rewards/margins": 0.22646790742874146, "rewards/rejected": -0.26445966958999634, "sft_loss": 0.3799174427986145, "step": 1851 }, { "epoch": 2.67823571945047, "grad_norm": 2.2326525435791713, "learning_rate": 4.835494499152929e-06, "logits/chosen": -0.1807328760623932, "logits/rejected": -0.1978977471590042, "logps/chosen": -0.5129043459892273, "logps/rejected": -4.094215393066406, "loss": 0.4231, "odds_ratio_loss": 0.16595354676246643, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05129043757915497, "rewards/margins": 0.3581311106681824, "rewards/rejected": -0.40942153334617615, "sft_loss": 0.5129043459892273, "step": 1852 }, { "epoch": 2.6796818510484455, "grad_norm": 2.5480902565729946, "learning_rate": 4.832456879908925e-06, "logits/chosen": -0.28215038776397705, "logits/rejected": -0.2248668372631073, "logps/chosen": -0.4055119752883911, "logps/rejected": -3.6705069541931152, "loss": 0.4347, "odds_ratio_loss": 0.10765072703361511, "rewards/accuracies": 1.0, "rewards/chosen": -0.04055120050907135, "rewards/margins": 0.3264995217323303, "rewards/rejected": -0.36705073714256287, "sft_loss": 0.4055119752883911, "step": 1853 }, { "epoch": 2.6811279826464207, "grad_norm": 2.0106016982008064, "learning_rate": 4.829418758773569e-06, "logits/chosen": -0.36731940507888794, "logits/rejected": -0.3151874244213104, "logps/chosen": -0.276762455701828, "logps/rejected": -5.286036968231201, "loss": 0.31, "odds_ratio_loss": 0.052199844270944595, "rewards/accuracies": 1.0, "rewards/chosen": -0.02767624519765377, "rewards/margins": 0.5009274482727051, "rewards/rejected": -0.528603732585907, "sft_loss": 0.276762455701828, "step": 1854 }, { "epoch": 2.6825741142443964, "grad_norm": 2.547022613146218, "learning_rate": 4.826380137578554e-06, "logits/chosen": -0.20882338285446167, "logits/rejected": -0.2723812162876129, "logps/chosen": -0.3922734260559082, "logps/rejected": -3.59426212310791, "loss": 0.3881, "odds_ratio_loss": 0.13410308957099915, "rewards/accuracies": 1.0, "rewards/chosen": -0.03922734037041664, "rewards/margins": 0.3201989233493805, "rewards/rejected": -0.35942625999450684, "sft_loss": 0.3922734260559082, "step": 1855 }, { "epoch": 2.6840202458423716, "grad_norm": 2.798533244170884, "learning_rate": 4.823341018155876e-06, "logits/chosen": -0.2806504964828491, "logits/rejected": -0.3115236163139343, "logps/chosen": -0.3371948301792145, "logps/rejected": -1.5915367603302002, "loss": 0.3913, "odds_ratio_loss": 0.17964649200439453, "rewards/accuracies": 1.0, "rewards/chosen": -0.03371948376297951, "rewards/margins": 0.12543419003486633, "rewards/rejected": -0.15915367007255554, "sft_loss": 0.3371948301792145, "step": 1856 }, { "epoch": 2.6854663774403473, "grad_norm": 2.0924046826364595, "learning_rate": 4.8203014023378315e-06, "logits/chosen": -0.34187567234039307, "logits/rejected": -0.31444215774536133, "logps/chosen": -0.4763234853744507, "logps/rejected": -3.6403157711029053, "loss": 0.4091, "odds_ratio_loss": 0.17209962010383606, "rewards/accuracies": 1.0, "rewards/chosen": -0.04763234779238701, "rewards/margins": 0.3163992464542389, "rewards/rejected": -0.3640316128730774, "sft_loss": 0.4763234853744507, "step": 1857 }, { "epoch": 2.6869125090383226, "grad_norm": 2.692447994435838, "learning_rate": 4.8172612919570175e-06, "logits/chosen": -0.2727445363998413, "logits/rejected": -0.3767099678516388, "logps/chosen": -0.5349310636520386, "logps/rejected": -4.6468892097473145, "loss": 0.4726, "odds_ratio_loss": 0.12882837653160095, "rewards/accuracies": 1.0, "rewards/chosen": -0.05349310487508774, "rewards/margins": 0.41119587421417236, "rewards/rejected": -0.4646889567375183, "sft_loss": 0.5349310636520386, "step": 1858 }, { "epoch": 2.688358640636298, "grad_norm": 2.336635461911087, "learning_rate": 4.814220688846326e-06, "logits/chosen": -0.30912578105926514, "logits/rejected": -0.11871747672557831, "logps/chosen": -0.2691580653190613, "logps/rejected": -2.848756790161133, "loss": 0.3284, "odds_ratio_loss": 0.08548923581838608, "rewards/accuracies": 1.0, "rewards/chosen": -0.026915807276964188, "rewards/margins": 0.25795987248420715, "rewards/rejected": -0.28487569093704224, "sft_loss": 0.2691580653190613, "step": 1859 }, { "epoch": 2.6898047722342735, "grad_norm": 2.6703621825691024, "learning_rate": 4.811179594838949e-06, "logits/chosen": -0.2815786898136139, "logits/rejected": -0.1816381812095642, "logps/chosen": -0.45205157995224, "logps/rejected": -3.6539969444274902, "loss": 0.4271, "odds_ratio_loss": 0.11903213709592819, "rewards/accuracies": 1.0, "rewards/chosen": -0.04520515725016594, "rewards/margins": 0.3201946020126343, "rewards/rejected": -0.3653997480869293, "sft_loss": 0.45205157995224, "step": 1860 }, { "epoch": 2.6912509038322487, "grad_norm": 2.3383290717041545, "learning_rate": 4.808138011768372e-06, "logits/chosen": -0.35975927114486694, "logits/rejected": -0.263312965631485, "logps/chosen": -0.37474626302719116, "logps/rejected": -3.156010150909424, "loss": 0.3931, "odds_ratio_loss": 0.09446077048778534, "rewards/accuracies": 0.9375, "rewards/chosen": -0.037474624812603, "rewards/margins": 0.27812641859054565, "rewards/rejected": -0.31560102105140686, "sft_loss": 0.37474626302719116, "step": 1861 }, { "epoch": 2.692697035430224, "grad_norm": 2.49353100276086, "learning_rate": 4.805095941468379e-06, "logits/chosen": -0.23262527585029602, "logits/rejected": -0.14135374128818512, "logps/chosen": -0.19227707386016846, "logps/rejected": -6.0062255859375, "loss": 0.3543, "odds_ratio_loss": 0.022998683154582977, "rewards/accuracies": 1.0, "rewards/chosen": -0.019227707758545876, "rewards/margins": 0.5813947916030884, "rewards/rejected": -0.6006225347518921, "sft_loss": 0.19227707386016846, "step": 1862 }, { "epoch": 2.6941431670281997, "grad_norm": 2.254707565413196, "learning_rate": 4.8020533857730446e-06, "logits/chosen": -0.3178359866142273, "logits/rejected": -0.42701420187950134, "logps/chosen": -0.33618977665901184, "logps/rejected": -5.497029781341553, "loss": 0.3486, "odds_ratio_loss": 0.11398158967494965, "rewards/accuracies": 1.0, "rewards/chosen": -0.0336189791560173, "rewards/margins": 0.5160840153694153, "rewards/rejected": -0.5497030019760132, "sft_loss": 0.33618977665901184, "step": 1863 }, { "epoch": 2.695589298626175, "grad_norm": 2.2948507334152732, "learning_rate": 4.799010346516736e-06, "logits/chosen": -0.199011892080307, "logits/rejected": -0.21153825521469116, "logps/chosen": -0.21537664532661438, "logps/rejected": -3.909799098968506, "loss": 0.2931, "odds_ratio_loss": 0.09779515117406845, "rewards/accuracies": 1.0, "rewards/chosen": -0.02153766341507435, "rewards/margins": 0.36944228410720825, "rewards/rejected": -0.39097991585731506, "sft_loss": 0.21537664532661438, "step": 1864 }, { "epoch": 2.69703543022415, "grad_norm": 2.355929773573819, "learning_rate": 4.795966825534113e-06, "logits/chosen": -0.2665369212627411, "logits/rejected": -0.1927603930234909, "logps/chosen": -0.2670694887638092, "logps/rejected": -3.359889030456543, "loss": 0.3798, "odds_ratio_loss": 0.09360802173614502, "rewards/accuracies": 1.0, "rewards/chosen": -0.02670694887638092, "rewards/margins": 0.30928194522857666, "rewards/rejected": -0.3359888792037964, "sft_loss": 0.2670694887638092, "step": 1865 }, { "epoch": 2.698481561822126, "grad_norm": 2.361748512537748, "learning_rate": 4.7929228246601284e-06, "logits/chosen": -0.14877094328403473, "logits/rejected": -0.20902572572231293, "logps/chosen": -0.2963876724243164, "logps/rejected": -3.6602306365966797, "loss": 0.3352, "odds_ratio_loss": 0.11799450218677521, "rewards/accuracies": 1.0, "rewards/chosen": -0.02963876724243164, "rewards/margins": 0.33638429641723633, "rewards/rejected": -0.36602309346199036, "sft_loss": 0.2963876724243164, "step": 1866 }, { "epoch": 2.699927693420101, "grad_norm": 2.690629222337708, "learning_rate": 4.789878345730018e-06, "logits/chosen": -0.3562997877597809, "logits/rejected": -0.26115548610687256, "logps/chosen": -0.44163084030151367, "logps/rejected": -2.134089469909668, "loss": 0.3545, "odds_ratio_loss": 0.1546330451965332, "rewards/accuracies": 1.0, "rewards/chosen": -0.04416308552026749, "rewards/margins": 0.1692458689212799, "rewards/rejected": -0.2134089469909668, "sft_loss": 0.44163084030151367, "step": 1867 }, { "epoch": 2.7013738250180768, "grad_norm": 3.981075125457187, "learning_rate": 4.786833390579312e-06, "logits/chosen": -0.20393416285514832, "logits/rejected": -0.25781792402267456, "logps/chosen": -0.30942434072494507, "logps/rejected": -5.1820878982543945, "loss": 0.3604, "odds_ratio_loss": 0.10879567265510559, "rewards/accuracies": 0.9375, "rewards/chosen": -0.030942432582378387, "rewards/margins": 0.4872663617134094, "rewards/rejected": -0.5182087421417236, "sft_loss": 0.30942434072494507, "step": 1868 }, { "epoch": 2.702819956616052, "grad_norm": 6.019392366774251, "learning_rate": 4.783787961043824e-06, "logits/chosen": -0.21762330830097198, "logits/rejected": -0.18777711689472198, "logps/chosen": -0.4841902256011963, "logps/rejected": -2.6938180923461914, "loss": 0.3912, "odds_ratio_loss": 0.23626786470413208, "rewards/accuracies": 0.875, "rewards/chosen": -0.04841902479529381, "rewards/margins": 0.220962792634964, "rewards/rejected": -0.2693817913532257, "sft_loss": 0.4841902256011963, "step": 1869 }, { "epoch": 2.7042660882140277, "grad_norm": 2.30269671228443, "learning_rate": 4.780742058959657e-06, "logits/chosen": -0.10956840962171555, "logits/rejected": -0.15537512302398682, "logps/chosen": -0.301375150680542, "logps/rejected": -3.672297239303589, "loss": 0.3748, "odds_ratio_loss": 0.0992063656449318, "rewards/accuracies": 1.0, "rewards/chosen": -0.03013751283288002, "rewards/margins": 0.33709222078323364, "rewards/rejected": -0.367229700088501, "sft_loss": 0.301375150680542, "step": 1870 }, { "epoch": 2.705712219812003, "grad_norm": 2.932343582129926, "learning_rate": 4.777695686163193e-06, "logits/chosen": -0.17159150540828705, "logits/rejected": -0.2002825140953064, "logps/chosen": -0.37887436151504517, "logps/rejected": -1.8474010229110718, "loss": 0.4221, "odds_ratio_loss": 0.13345777988433838, "rewards/accuracies": 1.0, "rewards/chosen": -0.037887439131736755, "rewards/margins": 0.14685267210006714, "rewards/rejected": -0.1847401261329651, "sft_loss": 0.37887436151504517, "step": 1871 }, { "epoch": 2.707158351409978, "grad_norm": 2.7981338785453183, "learning_rate": 4.774648844491103e-06, "logits/chosen": -0.15738992393016815, "logits/rejected": -0.13649892807006836, "logps/chosen": -0.3411653935909271, "logps/rejected": -3.9342665672302246, "loss": 0.3566, "odds_ratio_loss": 0.09266319870948792, "rewards/accuracies": 1.0, "rewards/chosen": -0.03411654382944107, "rewards/margins": 0.359310120344162, "rewards/rejected": -0.39342665672302246, "sft_loss": 0.3411653935909271, "step": 1872 }, { "epoch": 2.708604483007954, "grad_norm": 2.162648757347475, "learning_rate": 4.77160153578034e-06, "logits/chosen": -0.11060208082199097, "logits/rejected": -0.16701571643352509, "logps/chosen": -0.3412019610404968, "logps/rejected": -2.1302669048309326, "loss": 0.3996, "odds_ratio_loss": 0.18422572314739227, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03412019461393356, "rewards/margins": 0.17890648543834686, "rewards/rejected": -0.21302670240402222, "sft_loss": 0.3412019610404968, "step": 1873 }, { "epoch": 2.710050614605929, "grad_norm": 4.0674750995594, "learning_rate": 4.7685537618681375e-06, "logits/chosen": -0.19251331686973572, "logits/rejected": -0.21041332185268402, "logps/chosen": -0.431249737739563, "logps/rejected": -1.8905670642852783, "loss": 0.3992, "odds_ratio_loss": 0.1726667582988739, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0431249774992466, "rewards/margins": 0.14593173563480377, "rewards/rejected": -0.18905670940876007, "sft_loss": 0.431249737739563, "step": 1874 }, { "epoch": 2.7114967462039044, "grad_norm": 2.152647508251286, "learning_rate": 4.765505524592009e-06, "logits/chosen": -0.10581720620393753, "logits/rejected": -0.0880403071641922, "logps/chosen": -0.3255786895751953, "logps/rejected": -2.7813684940338135, "loss": 0.3069, "odds_ratio_loss": 0.11545327305793762, "rewards/accuracies": 1.0, "rewards/chosen": -0.03255787119269371, "rewards/margins": 0.24557897448539734, "rewards/rejected": -0.27813684940338135, "sft_loss": 0.3255786895751953, "step": 1875 }, { "epoch": 2.71294287780188, "grad_norm": 2.099315725130296, "learning_rate": 4.762456825789747e-06, "logits/chosen": -0.1821872740983963, "logits/rejected": -0.2587732672691345, "logps/chosen": -0.5633606910705566, "logps/rejected": -2.573119640350342, "loss": 0.4576, "odds_ratio_loss": 0.26334935426712036, "rewards/accuracies": 0.9375, "rewards/chosen": -0.056336067616939545, "rewards/margins": 0.2009759247303009, "rewards/rejected": -0.25731199979782104, "sft_loss": 0.5633606910705566, "step": 1876 }, { "epoch": 2.7143890093998553, "grad_norm": 2.197321214866364, "learning_rate": 4.759407667299429e-06, "logits/chosen": -0.22565627098083496, "logits/rejected": -0.2331491857767105, "logps/chosen": -0.46682557463645935, "logps/rejected": -3.439094066619873, "loss": 0.4119, "odds_ratio_loss": 0.24814468622207642, "rewards/accuracies": 0.875, "rewards/chosen": -0.046682558953762054, "rewards/margins": 0.29722684621810913, "rewards/rejected": -0.3439093828201294, "sft_loss": 0.46682557463645935, "step": 1877 }, { "epoch": 2.715835140997831, "grad_norm": 2.479655479780689, "learning_rate": 4.756358050959398e-06, "logits/chosen": -0.4306910037994385, "logits/rejected": -0.4077626168727875, "logps/chosen": -0.3527236878871918, "logps/rejected": -2.965698003768921, "loss": 0.4205, "odds_ratio_loss": 0.14371559023857117, "rewards/accuracies": 1.0, "rewards/chosen": -0.035272371023893356, "rewards/margins": 0.26129746437072754, "rewards/rejected": -0.29656982421875, "sft_loss": 0.3527236878871918, "step": 1878 }, { "epoch": 2.717281272595806, "grad_norm": 4.301904026116894, "learning_rate": 4.75330797860828e-06, "logits/chosen": -0.1542791873216629, "logits/rejected": -0.12551337480545044, "logps/chosen": -0.3861375153064728, "logps/rejected": -4.375819683074951, "loss": 0.3695, "odds_ratio_loss": 0.17965860664844513, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03861375153064728, "rewards/margins": 0.3989682197570801, "rewards/rejected": -0.43758198618888855, "sft_loss": 0.3861375153064728, "step": 1879 }, { "epoch": 2.718727404193782, "grad_norm": 2.494798707133846, "learning_rate": 4.750257452084979e-06, "logits/chosen": -0.2010928988456726, "logits/rejected": -0.2024509459733963, "logps/chosen": -0.3827640414237976, "logps/rejected": -3.6829090118408203, "loss": 0.305, "odds_ratio_loss": 0.15489722788333893, "rewards/accuracies": 1.0, "rewards/chosen": -0.03827640041708946, "rewards/margins": 0.33001452684402466, "rewards/rejected": -0.36829090118408203, "sft_loss": 0.3827640414237976, "step": 1880 }, { "epoch": 2.720173535791757, "grad_norm": 2.424215307007899, "learning_rate": 4.747206473228664e-06, "logits/chosen": -0.259044885635376, "logits/rejected": -0.16557075083255768, "logps/chosen": -0.3136051297187805, "logps/rejected": -3.817262649536133, "loss": 0.3868, "odds_ratio_loss": 0.24824589490890503, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03136051446199417, "rewards/margins": 0.3503657579421997, "rewards/rejected": -0.3817262649536133, "sft_loss": 0.3136051297187805, "step": 1881 }, { "epoch": 2.7216196673897324, "grad_norm": 2.205167059464972, "learning_rate": 4.744155043878784e-06, "logits/chosen": -0.23924411833286285, "logits/rejected": -0.16571259498596191, "logps/chosen": -0.35944584012031555, "logps/rejected": -4.56205940246582, "loss": 0.3075, "odds_ratio_loss": 0.15001921355724335, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03594458848237991, "rewards/margins": 0.4202613830566406, "rewards/rejected": -0.45620596408843994, "sft_loss": 0.35944584012031555, "step": 1882 }, { "epoch": 2.723065798987708, "grad_norm": 2.3130878554020646, "learning_rate": 4.741103165875056e-06, "logits/chosen": -0.23174218833446503, "logits/rejected": -0.13708661496639252, "logps/chosen": -0.3431485593318939, "logps/rejected": -2.549919843673706, "loss": 0.3049, "odds_ratio_loss": 0.10180535167455673, "rewards/accuracies": 1.0, "rewards/chosen": -0.03431485593318939, "rewards/margins": 0.22067713737487793, "rewards/rejected": -0.25499197840690613, "sft_loss": 0.3431485593318939, "step": 1883 }, { "epoch": 2.7245119305856833, "grad_norm": 2.3552092476829634, "learning_rate": 4.738050841057469e-06, "logits/chosen": -0.5356272459030151, "logits/rejected": -0.5339710116386414, "logps/chosen": -0.35015958547592163, "logps/rejected": -2.9828152656555176, "loss": 0.3505, "odds_ratio_loss": 0.09774233400821686, "rewards/accuracies": 1.0, "rewards/chosen": -0.03501596301794052, "rewards/margins": 0.26326555013656616, "rewards/rejected": -0.2982814908027649, "sft_loss": 0.35015958547592163, "step": 1884 }, { "epoch": 2.7259580621836585, "grad_norm": 2.4694330118824896, "learning_rate": 4.734998071266282e-06, "logits/chosen": -0.3343193233013153, "logits/rejected": -0.24137598276138306, "logps/chosen": -0.37988466024398804, "logps/rejected": -3.0774569511413574, "loss": 0.368, "odds_ratio_loss": 0.10103145241737366, "rewards/accuracies": 1.0, "rewards/chosen": -0.03798846900463104, "rewards/margins": 0.2697572708129883, "rewards/rejected": -0.3077457547187805, "sft_loss": 0.37988466024398804, "step": 1885 }, { "epoch": 2.7274041937816342, "grad_norm": 2.2085538092010206, "learning_rate": 4.7319448583420195e-06, "logits/chosen": -0.3982861042022705, "logits/rejected": -0.36158859729766846, "logps/chosen": -0.3293648958206177, "logps/rejected": -2.769756555557251, "loss": 0.3269, "odds_ratio_loss": 0.09307774901390076, "rewards/accuracies": 1.0, "rewards/chosen": -0.03293649107217789, "rewards/margins": 0.24403917789459229, "rewards/rejected": -0.2769756615161896, "sft_loss": 0.3293648958206177, "step": 1886 }, { "epoch": 2.7288503253796095, "grad_norm": 2.6220443532888407, "learning_rate": 4.7288912041254765e-06, "logits/chosen": -0.35197606682777405, "logits/rejected": -0.2917653024196625, "logps/chosen": -0.2370256781578064, "logps/rejected": -4.931891441345215, "loss": 0.3206, "odds_ratio_loss": 0.08049122244119644, "rewards/accuracies": 1.0, "rewards/chosen": -0.02370256744325161, "rewards/margins": 0.4694865643978119, "rewards/rejected": -0.49318912625312805, "sft_loss": 0.2370256781578064, "step": 1887 }, { "epoch": 2.7302964569775847, "grad_norm": 2.229589260053656, "learning_rate": 4.72583711045771e-06, "logits/chosen": -0.26306650042533875, "logits/rejected": -0.3761969208717346, "logps/chosen": -0.5922081470489502, "logps/rejected": -3.768022298812866, "loss": 0.4555, "odds_ratio_loss": 0.2729063630104065, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0592208169400692, "rewards/margins": 0.3175814151763916, "rewards/rejected": -0.3768022358417511, "sft_loss": 0.5922081470489502, "step": 1888 }, { "epoch": 2.7317425885755604, "grad_norm": 2.0352388645586124, "learning_rate": 4.722782579180048e-06, "logits/chosen": -0.23472218215465546, "logits/rejected": -0.15055781602859497, "logps/chosen": -0.23011812567710876, "logps/rejected": -3.8374698162078857, "loss": 0.302, "odds_ratio_loss": 0.06934744119644165, "rewards/accuracies": 1.0, "rewards/chosen": -0.023011812940239906, "rewards/margins": 0.3607351779937744, "rewards/rejected": -0.3837469816207886, "sft_loss": 0.23011812567710876, "step": 1889 }, { "epoch": 2.7331887201735356, "grad_norm": 2.1876288211484156, "learning_rate": 4.719727612134077e-06, "logits/chosen": -0.24740484356880188, "logits/rejected": -0.23094268143177032, "logps/chosen": -0.32862362265586853, "logps/rejected": -2.0781450271606445, "loss": 0.3565, "odds_ratio_loss": 0.11739077419042587, "rewards/accuracies": 1.0, "rewards/chosen": -0.03286236524581909, "rewards/margins": 0.17495213449001312, "rewards/rejected": -0.2078145146369934, "sft_loss": 0.32862362265586853, "step": 1890 }, { "epoch": 2.7346348517715113, "grad_norm": 2.539941122203935, "learning_rate": 4.716672211161648e-06, "logits/chosen": -0.2699774503707886, "logits/rejected": -0.3347630202770233, "logps/chosen": -0.38555604219436646, "logps/rejected": -3.047736644744873, "loss": 0.354, "odds_ratio_loss": 0.15888690948486328, "rewards/accuracies": 1.0, "rewards/chosen": -0.038555607199668884, "rewards/margins": 0.26621806621551514, "rewards/rejected": -0.3047736883163452, "sft_loss": 0.38555604219436646, "step": 1891 }, { "epoch": 2.7360809833694866, "grad_norm": 2.97857518962784, "learning_rate": 4.713616378104874e-06, "logits/chosen": -0.15838538110256195, "logits/rejected": -0.18540534377098083, "logps/chosen": -0.2908265292644501, "logps/rejected": -2.8895366191864014, "loss": 0.3958, "odds_ratio_loss": 0.11644550412893295, "rewards/accuracies": 1.0, "rewards/chosen": -0.029082655906677246, "rewards/margins": 0.2598710060119629, "rewards/rejected": -0.28895366191864014, "sft_loss": 0.2908265292644501, "step": 1892 }, { "epoch": 2.7375271149674623, "grad_norm": 3.137520567531617, "learning_rate": 4.710560114806128e-06, "logits/chosen": -0.2546279728412628, "logits/rejected": -0.22949475049972534, "logps/chosen": -0.5076591372489929, "logps/rejected": -3.7342796325683594, "loss": 0.5022, "odds_ratio_loss": 0.22245776653289795, "rewards/accuracies": 0.875, "rewards/chosen": -0.05076591670513153, "rewards/margins": 0.32266199588775635, "rewards/rejected": -0.3734279274940491, "sft_loss": 0.5076591372489929, "step": 1893 }, { "epoch": 2.7389732465654375, "grad_norm": 2.8291511708947965, "learning_rate": 4.707503423108042e-06, "logits/chosen": -0.24433070421218872, "logits/rejected": -0.24968703091144562, "logps/chosen": -0.48571792244911194, "logps/rejected": -1.9403795003890991, "loss": 0.4089, "odds_ratio_loss": 0.2183419167995453, "rewards/accuracies": 1.0, "rewards/chosen": -0.048571791499853134, "rewards/margins": 0.1454661637544632, "rewards/rejected": -0.19403795897960663, "sft_loss": 0.48571792244911194, "step": 1894 }, { "epoch": 2.7404193781634127, "grad_norm": 5.589426058096608, "learning_rate": 4.7044463048535065e-06, "logits/chosen": -0.21336159110069275, "logits/rejected": -0.15851540863513947, "logps/chosen": -0.3277932405471802, "logps/rejected": -3.657266139984131, "loss": 0.3647, "odds_ratio_loss": 0.0775853842496872, "rewards/accuracies": 1.0, "rewards/chosen": -0.03277932107448578, "rewards/margins": 0.33294734358787537, "rewards/rejected": -0.36572664976119995, "sft_loss": 0.3277932405471802, "step": 1895 }, { "epoch": 2.7418655097613884, "grad_norm": 2.559094190532531, "learning_rate": 4.70138876188567e-06, "logits/chosen": -0.1807275265455246, "logits/rejected": -0.25307416915893555, "logps/chosen": -0.5286034941673279, "logps/rejected": -4.811222553253174, "loss": 0.4474, "odds_ratio_loss": 0.15058596432209015, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05286034941673279, "rewards/margins": 0.428261935710907, "rewards/rejected": -0.4811222553253174, "sft_loss": 0.5286034941673279, "step": 1896 }, { "epoch": 2.7433116413593637, "grad_norm": 2.7848490528190295, "learning_rate": 4.6983307960479386e-06, "logits/chosen": -0.10986755788326263, "logits/rejected": -0.18220268189907074, "logps/chosen": -0.2580132782459259, "logps/rejected": -3.7098939418792725, "loss": 0.3357, "odds_ratio_loss": 0.11131416261196136, "rewards/accuracies": 1.0, "rewards/chosen": -0.02580132894217968, "rewards/margins": 0.34518808126449585, "rewards/rejected": -0.3709894120693207, "sft_loss": 0.2580132782459259, "step": 1897 }, { "epoch": 2.744757772957339, "grad_norm": 2.3933005373715575, "learning_rate": 4.695272409183969e-06, "logits/chosen": -0.36661919951438904, "logits/rejected": -0.2121463268995285, "logps/chosen": -0.45644479990005493, "logps/rejected": -4.979647636413574, "loss": 0.4361, "odds_ratio_loss": 0.1398359090089798, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04564448446035385, "rewards/margins": 0.45232027769088745, "rewards/rejected": -0.4979647099971771, "sft_loss": 0.45644479990005493, "step": 1898 }, { "epoch": 2.7462039045553146, "grad_norm": 2.6501699786450947, "learning_rate": 4.692213603137673e-06, "logits/chosen": -0.15103618800640106, "logits/rejected": -0.2331678569316864, "logps/chosen": -0.3875581622123718, "logps/rejected": -2.742100238800049, "loss": 0.369, "odds_ratio_loss": 0.20608189702033997, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03875581920146942, "rewards/margins": 0.23545420169830322, "rewards/rejected": -0.27421000599861145, "sft_loss": 0.3875581622123718, "step": 1899 }, { "epoch": 2.74765003615329, "grad_norm": 2.2818384142651658, "learning_rate": 4.689154379753219e-06, "logits/chosen": -0.2545371651649475, "logits/rejected": -0.3067525327205658, "logps/chosen": -0.3318255841732025, "logps/rejected": -4.289620876312256, "loss": 0.395, "odds_ratio_loss": 0.10435459017753601, "rewards/accuracies": 1.0, "rewards/chosen": -0.03318255394697189, "rewards/margins": 0.395779550075531, "rewards/rejected": -0.4289621114730835, "sft_loss": 0.3318255841732025, "step": 1900 }, { "epoch": 2.7490961677512655, "grad_norm": 2.5609234326640675, "learning_rate": 4.686094740875022e-06, "logits/chosen": -0.2666279673576355, "logits/rejected": -0.21327508985996246, "logps/chosen": -0.2115049809217453, "logps/rejected": -3.417975664138794, "loss": 0.3291, "odds_ratio_loss": 0.07255256921052933, "rewards/accuracies": 1.0, "rewards/chosen": -0.02115049958229065, "rewards/margins": 0.32064709067344666, "rewards/rejected": -0.3417975902557373, "sft_loss": 0.2115049809217453, "step": 1901 }, { "epoch": 2.7505422993492408, "grad_norm": 2.3829562380839913, "learning_rate": 4.68303468834775e-06, "logits/chosen": -0.12061008810997009, "logits/rejected": -0.12486258894205093, "logps/chosen": -0.3109768033027649, "logps/rejected": -3.7834866046905518, "loss": 0.4206, "odds_ratio_loss": 0.11438363790512085, "rewards/accuracies": 1.0, "rewards/chosen": -0.03109768033027649, "rewards/margins": 0.3472509980201721, "rewards/rejected": -0.3783486485481262, "sft_loss": 0.3109768033027649, "step": 1902 }, { "epoch": 2.7519884309472165, "grad_norm": 2.862427532503618, "learning_rate": 4.67997422401632e-06, "logits/chosen": -0.16161176562309265, "logits/rejected": -0.20153605937957764, "logps/chosen": -0.41737887263298035, "logps/rejected": -1.8366215229034424, "loss": 0.3795, "odds_ratio_loss": 0.2205577939748764, "rewards/accuracies": 0.9375, "rewards/chosen": -0.041737888008356094, "rewards/margins": 0.14192426204681396, "rewards/rejected": -0.18366214632987976, "sft_loss": 0.41737887263298035, "step": 1903 }, { "epoch": 2.7534345625451917, "grad_norm": 3.6690114295501624, "learning_rate": 4.6769133497259006e-06, "logits/chosen": -0.20664706826210022, "logits/rejected": -0.026020802557468414, "logps/chosen": -0.34037303924560547, "logps/rejected": -3.9743449687957764, "loss": 0.4032, "odds_ratio_loss": 0.08332730084657669, "rewards/accuracies": 1.0, "rewards/chosen": -0.034037306904792786, "rewards/margins": 0.3633972108364105, "rewards/rejected": -0.3974345326423645, "sft_loss": 0.34037303924560547, "step": 1904 }, { "epoch": 2.754880694143167, "grad_norm": 2.294070706599266, "learning_rate": 4.673852067321899e-06, "logits/chosen": -0.18250615894794464, "logits/rejected": -0.2798006534576416, "logps/chosen": -0.32629454135894775, "logps/rejected": -2.708524703979492, "loss": 0.3524, "odds_ratio_loss": 0.1289883852005005, "rewards/accuracies": 1.0, "rewards/chosen": -0.032629452645778656, "rewards/margins": 0.23822301626205444, "rewards/rejected": -0.2708524763584137, "sft_loss": 0.32629454135894775, "step": 1905 }, { "epoch": 2.7563268257411426, "grad_norm": 2.554522375851076, "learning_rate": 4.670790378649977e-06, "logits/chosen": -0.08530348539352417, "logits/rejected": -0.12577472627162933, "logps/chosen": -0.31157389283180237, "logps/rejected": -3.460144281387329, "loss": 0.3684, "odds_ratio_loss": 0.1313571035861969, "rewards/accuracies": 1.0, "rewards/chosen": -0.031157393008470535, "rewards/margins": 0.31485703587532043, "rewards/rejected": -0.34601444005966187, "sft_loss": 0.31157389283180237, "step": 1906 }, { "epoch": 2.757772957339118, "grad_norm": 2.6229933469821898, "learning_rate": 4.6677282855560375e-06, "logits/chosen": -0.3336745798587799, "logits/rejected": -0.16491839289665222, "logps/chosen": -0.2330479621887207, "logps/rejected": -3.2504565715789795, "loss": 0.374, "odds_ratio_loss": 0.054218702018260956, "rewards/accuracies": 1.0, "rewards/chosen": -0.02330479584634304, "rewards/margins": 0.3017408847808838, "rewards/rejected": -0.3250456750392914, "sft_loss": 0.2330479621887207, "step": 1907 }, { "epoch": 2.759219088937093, "grad_norm": 3.622563710757675, "learning_rate": 4.6646657898862284e-06, "logits/chosen": -0.3674685060977936, "logits/rejected": -0.3329426050186157, "logps/chosen": -0.499603271484375, "logps/rejected": -1.849241852760315, "loss": 0.4121, "odds_ratio_loss": 0.2355216145515442, "rewards/accuracies": 1.0, "rewards/chosen": -0.04996032640337944, "rewards/margins": 0.13496387004852295, "rewards/rejected": -0.1849241852760315, "sft_loss": 0.499603271484375, "step": 1908 }, { "epoch": 2.760665220535069, "grad_norm": 3.44369490784189, "learning_rate": 4.661602893486939e-06, "logits/chosen": -0.24251700937747955, "logits/rejected": -0.36239850521087646, "logps/chosen": -0.49035611748695374, "logps/rejected": -2.837653636932373, "loss": 0.329, "odds_ratio_loss": 0.20834286510944366, "rewards/accuracies": 1.0, "rewards/chosen": -0.04903561249375343, "rewards/margins": 0.23472976684570312, "rewards/rejected": -0.28376537561416626, "sft_loss": 0.49035611748695374, "step": 1909 }, { "epoch": 2.762111352133044, "grad_norm": 2.5770843438365243, "learning_rate": 4.6585395982048e-06, "logits/chosen": -0.2157149314880371, "logits/rejected": -0.2589437663555145, "logps/chosen": -0.34972837567329407, "logps/rejected": -2.4410877227783203, "loss": 0.4031, "odds_ratio_loss": 0.1218947246670723, "rewards/accuracies": 1.0, "rewards/chosen": -0.034972839057445526, "rewards/margins": 0.20913594961166382, "rewards/rejected": -0.24410878121852875, "sft_loss": 0.34972837567329407, "step": 1910 }, { "epoch": 2.7635574837310193, "grad_norm": 3.9997670265300695, "learning_rate": 4.655475905886685e-06, "logits/chosen": -0.31204700469970703, "logits/rejected": -0.2673895061016083, "logps/chosen": -0.3067542314529419, "logps/rejected": -2.5147337913513184, "loss": 0.3784, "odds_ratio_loss": 0.11027967929840088, "rewards/accuracies": 1.0, "rewards/chosen": -0.03067542240023613, "rewards/margins": 0.22079795598983765, "rewards/rejected": -0.2514733672142029, "sft_loss": 0.3067542314529419, "step": 1911 }, { "epoch": 2.765003615328995, "grad_norm": 2.6263995198618337, "learning_rate": 4.652411818379706e-06, "logits/chosen": -0.2939597964286804, "logits/rejected": -0.22305241227149963, "logps/chosen": -0.3409603238105774, "logps/rejected": -2.78116774559021, "loss": 0.3668, "odds_ratio_loss": 0.0981820672750473, "rewards/accuracies": 1.0, "rewards/chosen": -0.03409603238105774, "rewards/margins": 0.2440207302570343, "rewards/rejected": -0.27811676263809204, "sft_loss": 0.3409603238105774, "step": 1912 }, { "epoch": 2.76644974692697, "grad_norm": 3.1002078754602933, "learning_rate": 4.6493473375312106e-06, "logits/chosen": -0.1616482436656952, "logits/rejected": -0.18430642783641815, "logps/chosen": -0.296634316444397, "logps/rejected": -3.99161434173584, "loss": 0.3846, "odds_ratio_loss": 0.1349400132894516, "rewards/accuracies": 1.0, "rewards/chosen": -0.02966342866420746, "rewards/margins": 0.36949801445007324, "rewards/rejected": -0.3991614282131195, "sft_loss": 0.296634316444397, "step": 1913 }, { "epoch": 2.767895878524946, "grad_norm": 4.2146173773241244, "learning_rate": 4.646282465188788e-06, "logits/chosen": -0.3848811984062195, "logits/rejected": -0.3408425450325012, "logps/chosen": -0.36459803581237793, "logps/rejected": -2.6756057739257812, "loss": 0.3984, "odds_ratio_loss": 0.1446124166250229, "rewards/accuracies": 1.0, "rewards/chosen": -0.03645980730652809, "rewards/margins": 0.23110075294971466, "rewards/rejected": -0.26756054162979126, "sft_loss": 0.36459803581237793, "step": 1914 }, { "epoch": 2.769342010122921, "grad_norm": 2.4636205252773133, "learning_rate": 4.643217203200259e-06, "logits/chosen": -0.17520803213119507, "logits/rejected": -0.21425886452198029, "logps/chosen": -0.3626866638660431, "logps/rejected": -3.3118858337402344, "loss": 0.4663, "odds_ratio_loss": 0.11417470127344131, "rewards/accuracies": 1.0, "rewards/chosen": -0.036268673837184906, "rewards/margins": 0.2949199378490448, "rewards/rejected": -0.3311886191368103, "sft_loss": 0.3626866638660431, "step": 1915 }, { "epoch": 2.770788141720897, "grad_norm": 2.1402630431394782, "learning_rate": 4.640151553413683e-06, "logits/chosen": -0.2816272974014282, "logits/rejected": -0.3485042154788971, "logps/chosen": -0.4563095271587372, "logps/rejected": -3.226483106613159, "loss": 0.4248, "odds_ratio_loss": 0.1943037509918213, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04563095420598984, "rewards/margins": 0.27701735496520996, "rewards/rejected": -0.3226483166217804, "sft_loss": 0.4563095271587372, "step": 1916 }, { "epoch": 2.772234273318872, "grad_norm": 2.2310359352159654, "learning_rate": 4.637085517677351e-06, "logits/chosen": -0.12221245467662811, "logits/rejected": -0.11277172714471817, "logps/chosen": -0.4151383638381958, "logps/rejected": -2.5533032417297363, "loss": 0.3756, "odds_ratio_loss": 0.20337051153182983, "rewards/accuracies": 1.0, "rewards/chosen": -0.0415138378739357, "rewards/margins": 0.21381649374961853, "rewards/rejected": -0.25533032417297363, "sft_loss": 0.4151383638381958, "step": 1917 }, { "epoch": 2.7736804049168473, "grad_norm": 8.261017962903177, "learning_rate": 4.634019097839788e-06, "logits/chosen": -0.39285701513290405, "logits/rejected": -0.25874412059783936, "logps/chosen": -0.5222992300987244, "logps/rejected": -2.182068347930908, "loss": 0.4893, "odds_ratio_loss": 0.1882818341255188, "rewards/accuracies": 0.9375, "rewards/chosen": -0.052229925990104675, "rewards/margins": 0.16597692668437958, "rewards/rejected": -0.21820686757564545, "sft_loss": 0.5222992300987244, "step": 1918 }, { "epoch": 2.775126536514823, "grad_norm": 2.1730397586110053, "learning_rate": 4.630952295749749e-06, "logits/chosen": -0.1443309485912323, "logits/rejected": -0.4016629457473755, "logps/chosen": -0.384902685880661, "logps/rejected": -2.4064371585845947, "loss": 0.4804, "odds_ratio_loss": 0.18294137716293335, "rewards/accuracies": 1.0, "rewards/chosen": -0.03849026933312416, "rewards/margins": 0.20215344429016113, "rewards/rejected": -0.240643709897995, "sft_loss": 0.384902685880661, "step": 1919 }, { "epoch": 2.7765726681127982, "grad_norm": 2.675044128872941, "learning_rate": 4.627885113256223e-06, "logits/chosen": -0.06406591832637787, "logits/rejected": -0.044175997376441956, "logps/chosen": -0.3621594309806824, "logps/rejected": -2.851767063140869, "loss": 0.334, "odds_ratio_loss": 0.14854490756988525, "rewards/accuracies": 1.0, "rewards/chosen": -0.036215946078300476, "rewards/margins": 0.24896079301834106, "rewards/rejected": -0.28517672419548035, "sft_loss": 0.3621594309806824, "step": 1920 }, { "epoch": 2.7780187997107735, "grad_norm": 2.4626982788921623, "learning_rate": 4.624817552208422e-06, "logits/chosen": -0.1586943119764328, "logits/rejected": -0.1781804859638214, "logps/chosen": -0.2940862476825714, "logps/rejected": -2.0932531356811523, "loss": 0.3358, "odds_ratio_loss": 0.12081068754196167, "rewards/accuracies": 1.0, "rewards/chosen": -0.029408622533082962, "rewards/margins": 0.17991669476032257, "rewards/rejected": -0.20932531356811523, "sft_loss": 0.2940862476825714, "step": 1921 }, { "epoch": 2.779464931308749, "grad_norm": 3.056963625474084, "learning_rate": 4.621749614455792e-06, "logits/chosen": -0.21742503345012665, "logits/rejected": -0.2462393343448639, "logps/chosen": -0.4259098172187805, "logps/rejected": -2.3623046875, "loss": 0.3804, "odds_ratio_loss": 0.13163158297538757, "rewards/accuracies": 1.0, "rewards/chosen": -0.04259098693728447, "rewards/margins": 0.19363948702812195, "rewards/rejected": -0.23623046278953552, "sft_loss": 0.4259098172187805, "step": 1922 }, { "epoch": 2.7809110629067244, "grad_norm": 2.591769874435251, "learning_rate": 4.618681301848004e-06, "logits/chosen": -0.3165079951286316, "logits/rejected": -0.11529199779033661, "logps/chosen": -0.4419441819190979, "logps/rejected": -2.79586124420166, "loss": 0.3997, "odds_ratio_loss": 0.14373686909675598, "rewards/accuracies": 1.0, "rewards/chosen": -0.04419442266225815, "rewards/margins": 0.23539170622825623, "rewards/rejected": -0.2795861065387726, "sft_loss": 0.4419441819190979, "step": 1923 }, { "epoch": 2.7823571945047, "grad_norm": 2.5546096197432835, "learning_rate": 4.615612616234955e-06, "logits/chosen": -0.2990122139453888, "logits/rejected": -0.32261422276496887, "logps/chosen": -0.32381701469421387, "logps/rejected": -3.8825607299804688, "loss": 0.3826, "odds_ratio_loss": 0.09977390617132187, "rewards/accuracies": 1.0, "rewards/chosen": -0.032381702214479446, "rewards/margins": 0.3558743894100189, "rewards/rejected": -0.3882560729980469, "sft_loss": 0.32381701469421387, "step": 1924 }, { "epoch": 2.7838033261026753, "grad_norm": 2.607250283502564, "learning_rate": 4.6125435594667664e-06, "logits/chosen": -0.286873459815979, "logits/rejected": -0.3211660087108612, "logps/chosen": -0.37380242347717285, "logps/rejected": -3.475512981414795, "loss": 0.3714, "odds_ratio_loss": 0.1345575600862503, "rewards/accuracies": 1.0, "rewards/chosen": -0.037380240857601166, "rewards/margins": 0.31017106771469116, "rewards/rejected": -0.3475513160228729, "sft_loss": 0.37380242347717285, "step": 1925 }, { "epoch": 2.785249457700651, "grad_norm": 2.3706014238306015, "learning_rate": 4.609474133393785e-06, "logits/chosen": -0.3925056457519531, "logits/rejected": -0.4020850658416748, "logps/chosen": -0.2961789071559906, "logps/rejected": -2.771475315093994, "loss": 0.346, "odds_ratio_loss": 0.10398849099874496, "rewards/accuracies": 1.0, "rewards/chosen": -0.02961789071559906, "rewards/margins": 0.24752962589263916, "rewards/rejected": -0.2771475315093994, "sft_loss": 0.2961789071559906, "step": 1926 }, { "epoch": 2.7866955892986263, "grad_norm": 3.0033007962008327, "learning_rate": 4.606404339866578e-06, "logits/chosen": -0.2951948046684265, "logits/rejected": -0.23653864860534668, "logps/chosen": -0.43498289585113525, "logps/rejected": -3.2631759643554688, "loss": 0.3759, "odds_ratio_loss": 0.14501672983169556, "rewards/accuracies": 1.0, "rewards/chosen": -0.043498288840055466, "rewards/margins": 0.28281930088996887, "rewards/rejected": -0.32631760835647583, "sft_loss": 0.43498289585113525, "step": 1927 }, { "epoch": 2.7881417208966015, "grad_norm": 2.454319936149338, "learning_rate": 4.603334180735937e-06, "logits/chosen": -0.3240278661251068, "logits/rejected": -0.2735302448272705, "logps/chosen": -0.31935903429985046, "logps/rejected": -3.252980947494507, "loss": 0.3543, "odds_ratio_loss": 0.07885086536407471, "rewards/accuracies": 1.0, "rewards/chosen": -0.031935904175043106, "rewards/margins": 0.29336220026016235, "rewards/rejected": -0.32529810070991516, "sft_loss": 0.31935903429985046, "step": 1928 }, { "epoch": 2.789587852494577, "grad_norm": 2.3281215603843237, "learning_rate": 4.6002636578528694e-06, "logits/chosen": -0.18546296656131744, "logits/rejected": -0.17621514201164246, "logps/chosen": -0.2209104746580124, "logps/rejected": -4.707164287567139, "loss": 0.2778, "odds_ratio_loss": 0.054455384612083435, "rewards/accuracies": 1.0, "rewards/chosen": -0.022091049700975418, "rewards/margins": 0.448625385761261, "rewards/rejected": -0.4707164168357849, "sft_loss": 0.2209104746580124, "step": 1929 }, { "epoch": 2.7910339840925524, "grad_norm": 2.251641017951371, "learning_rate": 4.5971927730686086e-06, "logits/chosen": -0.29331666231155396, "logits/rejected": -0.29489755630493164, "logps/chosen": -0.4534047245979309, "logps/rejected": -1.960153341293335, "loss": 0.4003, "odds_ratio_loss": 0.15419454872608185, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04534047096967697, "rewards/margins": 0.15067486464977264, "rewards/rejected": -0.19601532816886902, "sft_loss": 0.4534047245979309, "step": 1930 }, { "epoch": 2.7924801156905277, "grad_norm": 2.304782801749755, "learning_rate": 4.594121528234601e-06, "logits/chosen": -0.1861201524734497, "logits/rejected": -0.22230559587478638, "logps/chosen": -0.39050740003585815, "logps/rejected": -4.507702350616455, "loss": 0.4072, "odds_ratio_loss": 0.13908474147319794, "rewards/accuracies": 1.0, "rewards/chosen": -0.039050742983818054, "rewards/margins": 0.41171950101852417, "rewards/rejected": -0.45077022910118103, "sft_loss": 0.39050740003585815, "step": 1931 }, { "epoch": 2.7939262472885034, "grad_norm": 2.5018670474835845, "learning_rate": 4.59104992520251e-06, "logits/chosen": -0.24248866736888885, "logits/rejected": -0.23236139118671417, "logps/chosen": -0.446467787027359, "logps/rejected": -2.6032891273498535, "loss": 0.38, "odds_ratio_loss": 0.20997995138168335, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04464678466320038, "rewards/margins": 0.21568216383457184, "rewards/rejected": -0.2603289484977722, "sft_loss": 0.446467787027359, "step": 1932 }, { "epoch": 2.7953723788864786, "grad_norm": 2.272702135116343, "learning_rate": 4.5879779658242185e-06, "logits/chosen": -0.22327488660812378, "logits/rejected": -0.2359078824520111, "logps/chosen": -0.27088087797164917, "logps/rejected": -2.6554477214813232, "loss": 0.3654, "odds_ratio_loss": 0.08074113726615906, "rewards/accuracies": 1.0, "rewards/chosen": -0.027088087052106857, "rewards/margins": 0.23845668137073517, "rewards/rejected": -0.2655448019504547, "sft_loss": 0.27088087797164917, "step": 1933 }, { "epoch": 2.796818510484454, "grad_norm": 2.714031627693277, "learning_rate": 4.584905651951821e-06, "logits/chosen": -0.31978341937065125, "logits/rejected": -0.19837269186973572, "logps/chosen": -0.3051181733608246, "logps/rejected": -3.7367353439331055, "loss": 0.4014, "odds_ratio_loss": 0.09203245490789413, "rewards/accuracies": 1.0, "rewards/chosen": -0.03051181510090828, "rewards/margins": 0.3431617021560669, "rewards/rejected": -0.3736734986305237, "sft_loss": 0.3051181733608246, "step": 1934 }, { "epoch": 2.7982646420824295, "grad_norm": 2.8248128614823362, "learning_rate": 4.581832985437628e-06, "logits/chosen": -0.2673256993293762, "logits/rejected": -0.33064621686935425, "logps/chosen": -0.3906742334365845, "logps/rejected": -2.7726762294769287, "loss": 0.3861, "odds_ratio_loss": 0.20273485779762268, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03906742483377457, "rewards/margins": 0.23820018768310547, "rewards/rejected": -0.2772676348686218, "sft_loss": 0.3906742334365845, "step": 1935 }, { "epoch": 2.7997107736804048, "grad_norm": 2.2782935918289287, "learning_rate": 4.578759968134162e-06, "logits/chosen": -0.4593513607978821, "logits/rejected": -0.26922720670700073, "logps/chosen": -0.4158691167831421, "logps/rejected": -2.6285738945007324, "loss": 0.3801, "odds_ratio_loss": 0.19207270443439484, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04158691689372063, "rewards/margins": 0.22127047181129456, "rewards/rejected": -0.2628573775291443, "sft_loss": 0.4158691167831421, "step": 1936 }, { "epoch": 2.8011569052783805, "grad_norm": 7.689527810087393, "learning_rate": 4.575686601894154e-06, "logits/chosen": -0.3853088617324829, "logits/rejected": -0.40306854248046875, "logps/chosen": -0.38764315843582153, "logps/rejected": -3.561250925064087, "loss": 0.4396, "odds_ratio_loss": 0.12997904419898987, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03876432031393051, "rewards/margins": 0.3173607885837555, "rewards/rejected": -0.3561251163482666, "sft_loss": 0.38764315843582153, "step": 1937 }, { "epoch": 2.8026030368763557, "grad_norm": 2.4015205603833865, "learning_rate": 4.572612888570551e-06, "logits/chosen": -0.15974129736423492, "logits/rejected": -0.201629638671875, "logps/chosen": -0.2965061664581299, "logps/rejected": -2.6370511054992676, "loss": 0.3518, "odds_ratio_loss": 0.14970335364341736, "rewards/accuracies": 1.0, "rewards/chosen": -0.029650619253516197, "rewards/margins": 0.23405447602272034, "rewards/rejected": -0.2637051045894623, "sft_loss": 0.2965061664581299, "step": 1938 }, { "epoch": 2.8040491684743314, "grad_norm": 2.3197713473946773, "learning_rate": 4.569538830016504e-06, "logits/chosen": -0.19120481610298157, "logits/rejected": -0.16116847097873688, "logps/chosen": -0.3215819001197815, "logps/rejected": -4.226866245269775, "loss": 0.3769, "odds_ratio_loss": 0.09225130826234818, "rewards/accuracies": 1.0, "rewards/chosen": -0.03215819224715233, "rewards/margins": 0.39052847027778625, "rewards/rejected": -0.4226866364479065, "sft_loss": 0.3215819001197815, "step": 1939 }, { "epoch": 2.8054953000723066, "grad_norm": 2.4727718660935407, "learning_rate": 4.566464428085375e-06, "logits/chosen": -0.1913873255252838, "logits/rejected": -0.27844709157943726, "logps/chosen": -0.20122195780277252, "logps/rejected": -3.5175328254699707, "loss": 0.3739, "odds_ratio_loss": 0.08544261008501053, "rewards/accuracies": 1.0, "rewards/chosen": -0.020122196525335312, "rewards/margins": 0.331631064414978, "rewards/rejected": -0.351753294467926, "sft_loss": 0.20122195780277252, "step": 1940 }, { "epoch": 2.806941431670282, "grad_norm": 2.946690687261266, "learning_rate": 4.563389684630733e-06, "logits/chosen": -0.130567729473114, "logits/rejected": -0.1708630472421646, "logps/chosen": -0.3934086561203003, "logps/rejected": -4.302565097808838, "loss": 0.3624, "odds_ratio_loss": 0.14866124093532562, "rewards/accuracies": 1.0, "rewards/chosen": -0.03934086486697197, "rewards/margins": 0.3909156620502472, "rewards/rejected": -0.43025651574134827, "sft_loss": 0.3934086561203003, "step": 1941 }, { "epoch": 2.8083875632682576, "grad_norm": 2.189219374296923, "learning_rate": 4.560314601506352e-06, "logits/chosen": -0.09481626749038696, "logits/rejected": -0.11865237355232239, "logps/chosen": -0.19691374897956848, "logps/rejected": -3.1772546768188477, "loss": 0.2849, "odds_ratio_loss": 0.09188981354236603, "rewards/accuracies": 1.0, "rewards/chosen": -0.019691377878189087, "rewards/margins": 0.29803407192230225, "rewards/rejected": -0.3177254796028137, "sft_loss": 0.19691374897956848, "step": 1942 }, { "epoch": 2.809833694866233, "grad_norm": 2.558939249575432, "learning_rate": 4.557239180566211e-06, "logits/chosen": -0.24929408729076385, "logits/rejected": -0.20789062976837158, "logps/chosen": -0.39418619871139526, "logps/rejected": -3.524662494659424, "loss": 0.3461, "odds_ratio_loss": 0.20861908793449402, "rewards/accuracies": 0.9375, "rewards/chosen": -0.039418622851371765, "rewards/margins": 0.3130476176738739, "rewards/rejected": -0.3524662256240845, "sft_loss": 0.39418619871139526, "step": 1943 }, { "epoch": 2.811279826464208, "grad_norm": 3.5546240440051413, "learning_rate": 4.554163423664492e-06, "logits/chosen": -0.2653443217277527, "logits/rejected": -0.19230744242668152, "logps/chosen": -0.3097788393497467, "logps/rejected": -2.285008192062378, "loss": 0.3426, "odds_ratio_loss": 0.15280936658382416, "rewards/accuracies": 1.0, "rewards/chosen": -0.03097788617014885, "rewards/margins": 0.19752293825149536, "rewards/rejected": -0.2285008281469345, "sft_loss": 0.3097788393497467, "step": 1944 }, { "epoch": 2.8127259580621837, "grad_norm": 2.464371112246266, "learning_rate": 4.551087332655581e-06, "logits/chosen": -0.19536343216896057, "logits/rejected": -0.21240335702896118, "logps/chosen": -0.3807547688484192, "logps/rejected": -4.58089542388916, "loss": 0.3613, "odds_ratio_loss": 0.12009456753730774, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03807547688484192, "rewards/margins": 0.42001405358314514, "rewards/rejected": -0.4580894708633423, "sft_loss": 0.3807547688484192, "step": 1945 }, { "epoch": 2.814172089660159, "grad_norm": 2.8583032297639694, "learning_rate": 4.548010909394065e-06, "logits/chosen": -0.25077760219573975, "logits/rejected": -0.22733455896377563, "logps/chosen": -0.2656141519546509, "logps/rejected": -2.585810422897339, "loss": 0.3708, "odds_ratio_loss": 0.07707735151052475, "rewards/accuracies": 1.0, "rewards/chosen": -0.026561414822936058, "rewards/margins": 0.23201963305473328, "rewards/rejected": -0.2585810422897339, "sft_loss": 0.2656141519546509, "step": 1946 }, { "epoch": 2.815618221258134, "grad_norm": 2.1958212700336777, "learning_rate": 4.5449341557347314e-06, "logits/chosen": -0.24419622123241425, "logits/rejected": -0.32629725337028503, "logps/chosen": -0.38885653018951416, "logps/rejected": -2.0550642013549805, "loss": 0.4423, "odds_ratio_loss": 0.1665182113647461, "rewards/accuracies": 0.9375, "rewards/chosen": -0.038885653018951416, "rewards/margins": 0.16662079095840454, "rewards/rejected": -0.20550641417503357, "sft_loss": 0.38885653018951416, "step": 1947 }, { "epoch": 2.81706435285611, "grad_norm": 2.827337130609505, "learning_rate": 4.541857073532565e-06, "logits/chosen": -0.29342615604400635, "logits/rejected": -0.37808701395988464, "logps/chosen": -0.4292759597301483, "logps/rejected": -3.3842148780822754, "loss": 0.4184, "odds_ratio_loss": 0.16178587079048157, "rewards/accuracies": 1.0, "rewards/chosen": -0.04292759299278259, "rewards/margins": 0.2954939007759094, "rewards/rejected": -0.338421493768692, "sft_loss": 0.4292759597301483, "step": 1948 }, { "epoch": 2.8185104844540856, "grad_norm": 2.9319747599147346, "learning_rate": 4.538779664642751e-06, "logits/chosen": -0.1982938051223755, "logits/rejected": -0.2894214987754822, "logps/chosen": -0.4327597916126251, "logps/rejected": -4.025196552276611, "loss": 0.38, "odds_ratio_loss": 0.1208379864692688, "rewards/accuracies": 1.0, "rewards/chosen": -0.04327598214149475, "rewards/margins": 0.3592436909675598, "rewards/rejected": -0.40251967310905457, "sft_loss": 0.4327597916126251, "step": 1949 }, { "epoch": 2.819956616052061, "grad_norm": 2.781771227196679, "learning_rate": 4.535701930920669e-06, "logits/chosen": -0.250921368598938, "logits/rejected": -0.21995939314365387, "logps/chosen": -0.4275512099266052, "logps/rejected": -2.362046003341675, "loss": 0.3795, "odds_ratio_loss": 0.11597280949354172, "rewards/accuracies": 1.0, "rewards/chosen": -0.042755126953125, "rewards/margins": 0.1934494823217392, "rewards/rejected": -0.2362046092748642, "sft_loss": 0.4275512099266052, "step": 1950 }, { "epoch": 2.821402747650036, "grad_norm": 2.3237978364107295, "learning_rate": 4.532623874221901e-06, "logits/chosen": -0.15754200518131256, "logits/rejected": -0.06704466044902802, "logps/chosen": -0.33436861634254456, "logps/rejected": -4.438542366027832, "loss": 0.3531, "odds_ratio_loss": 0.09081018716096878, "rewards/accuracies": 1.0, "rewards/chosen": -0.033436864614486694, "rewards/margins": 0.4104173481464386, "rewards/rejected": -0.4438541829586029, "sft_loss": 0.33436861634254456, "step": 1951 }, { "epoch": 2.8228488792480118, "grad_norm": 3.35312868677405, "learning_rate": 4.529545496402214e-06, "logits/chosen": -0.2972349226474762, "logits/rejected": -0.29672643542289734, "logps/chosen": -0.3766123652458191, "logps/rejected": -4.205931186676025, "loss": 0.3747, "odds_ratio_loss": 0.12844733893871307, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03766123950481415, "rewards/margins": 0.3829319477081299, "rewards/rejected": -0.42059317231178284, "sft_loss": 0.3766123652458191, "step": 1952 }, { "epoch": 2.824295010845987, "grad_norm": 2.0202609151492563, "learning_rate": 4.526466799317574e-06, "logits/chosen": -0.1269502341747284, "logits/rejected": -0.19187532365322113, "logps/chosen": -0.5017333030700684, "logps/rejected": -3.294124126434326, "loss": 0.4123, "odds_ratio_loss": 0.19588340818881989, "rewards/accuracies": 1.0, "rewards/chosen": -0.0501733273267746, "rewards/margins": 0.27923905849456787, "rewards/rejected": -0.32941240072250366, "sft_loss": 0.5017333030700684, "step": 1953 }, { "epoch": 2.8257411424439622, "grad_norm": 2.1424438067504608, "learning_rate": 4.5233877848241405e-06, "logits/chosen": -0.20550042390823364, "logits/rejected": -0.217835932970047, "logps/chosen": -0.4110490679740906, "logps/rejected": -5.2300262451171875, "loss": 0.3385, "odds_ratio_loss": 0.1557879000902176, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04110490530729294, "rewards/margins": 0.48189777135849, "rewards/rejected": -0.5230026841163635, "sft_loss": 0.4110490679740906, "step": 1954 }, { "epoch": 2.827187274041938, "grad_norm": 2.2898308985512434, "learning_rate": 4.5203084547782625e-06, "logits/chosen": -0.1175132691860199, "logits/rejected": -0.1555338203907013, "logps/chosen": -0.3281111717224121, "logps/rejected": -2.776350736618042, "loss": 0.3175, "odds_ratio_loss": 0.1153721809387207, "rewards/accuracies": 1.0, "rewards/chosen": -0.03281112015247345, "rewards/margins": 0.24482394754886627, "rewards/rejected": -0.2776350677013397, "sft_loss": 0.3281111717224121, "step": 1955 }, { "epoch": 2.828633405639913, "grad_norm": 2.556433357322931, "learning_rate": 4.517228811036479e-06, "logits/chosen": -0.12393073737621307, "logits/rejected": -0.058607179671525955, "logps/chosen": -0.3850470781326294, "logps/rejected": -2.437091827392578, "loss": 0.3945, "odds_ratio_loss": 0.11526143550872803, "rewards/accuracies": 0.9375, "rewards/chosen": -0.038504708558321, "rewards/margins": 0.20520448684692383, "rewards/rejected": -0.24370920658111572, "sft_loss": 0.3850470781326294, "step": 1956 }, { "epoch": 2.8300795372378884, "grad_norm": 2.3441089513748934, "learning_rate": 4.514148855455519e-06, "logits/chosen": -0.2920979857444763, "logits/rejected": -0.21535375714302063, "logps/chosen": -0.42927250266075134, "logps/rejected": -4.101245403289795, "loss": 0.4083, "odds_ratio_loss": 0.0865693911910057, "rewards/accuracies": 1.0, "rewards/chosen": -0.042927250266075134, "rewards/margins": 0.36719733476638794, "rewards/rejected": -0.4101245403289795, "sft_loss": 0.42927250266075134, "step": 1957 }, { "epoch": 2.831525668835864, "grad_norm": 2.1830249246574, "learning_rate": 4.511068589892299e-06, "logits/chosen": -0.2623273730278015, "logits/rejected": -0.28325706720352173, "logps/chosen": -0.3608659505844116, "logps/rejected": -4.1569623947143555, "loss": 0.3382, "odds_ratio_loss": 0.08394452929496765, "rewards/accuracies": 1.0, "rewards/chosen": -0.036086589097976685, "rewards/margins": 0.3796096444129944, "rewards/rejected": -0.4156962037086487, "sft_loss": 0.3608659505844116, "step": 1958 }, { "epoch": 2.8329718004338393, "grad_norm": 2.4965701484780682, "learning_rate": 4.507988016203924e-06, "logits/chosen": -0.12955503165721893, "logits/rejected": -0.13261455297470093, "logps/chosen": -0.45575547218322754, "logps/rejected": -5.616315841674805, "loss": 0.4245, "odds_ratio_loss": 0.15984290838241577, "rewards/accuracies": 1.0, "rewards/chosen": -0.045575544238090515, "rewards/margins": 0.5160560011863708, "rewards/rejected": -0.5616315603256226, "sft_loss": 0.45575547218322754, "step": 1959 }, { "epoch": 2.834417932031815, "grad_norm": 2.609261696479621, "learning_rate": 4.50490713624768e-06, "logits/chosen": -0.278617262840271, "logits/rejected": -0.3227754831314087, "logps/chosen": -0.44750720262527466, "logps/rejected": -3.244765043258667, "loss": 0.354, "odds_ratio_loss": 0.13986484706401825, "rewards/accuracies": 1.0, "rewards/chosen": -0.044750723987817764, "rewards/margins": 0.2797257900238037, "rewards/rejected": -0.32447654008865356, "sft_loss": 0.44750720262527466, "step": 1960 }, { "epoch": 2.8358640636297903, "grad_norm": 2.4790920191828034, "learning_rate": 4.501825951881044e-06, "logits/chosen": -0.2545323967933655, "logits/rejected": -0.23980608582496643, "logps/chosen": -0.21073514223098755, "logps/rejected": -3.4034600257873535, "loss": 0.3826, "odds_ratio_loss": 0.13137222826480865, "rewards/accuracies": 1.0, "rewards/chosen": -0.021073516458272934, "rewards/margins": 0.319272518157959, "rewards/rejected": -0.3403460383415222, "sft_loss": 0.21073514223098755, "step": 1961 }, { "epoch": 2.837310195227766, "grad_norm": 2.460263782780314, "learning_rate": 4.498744464961673e-06, "logits/chosen": -0.14830397069454193, "logits/rejected": -0.17653128504753113, "logps/chosen": -0.37625688314437866, "logps/rejected": -2.8310248851776123, "loss": 0.3778, "odds_ratio_loss": 0.2329844832420349, "rewards/accuracies": 0.9375, "rewards/chosen": -0.037625689059495926, "rewards/margins": 0.24547676742076874, "rewards/rejected": -0.28310248255729675, "sft_loss": 0.37625688314437866, "step": 1962 }, { "epoch": 2.838756326825741, "grad_norm": 2.41693580770391, "learning_rate": 4.495662677347406e-06, "logits/chosen": -0.2042870670557022, "logits/rejected": -0.2227935642004013, "logps/chosen": -0.30183953046798706, "logps/rejected": -3.360196590423584, "loss": 0.3708, "odds_ratio_loss": 0.11629791557788849, "rewards/accuracies": 1.0, "rewards/chosen": -0.030183954164385796, "rewards/margins": 0.30583569407463074, "rewards/rejected": -0.3360196352005005, "sft_loss": 0.30183953046798706, "step": 1963 }, { "epoch": 2.8402024584237164, "grad_norm": 2.2424319365202483, "learning_rate": 4.492580590896266e-06, "logits/chosen": -0.29322850704193115, "logits/rejected": -0.2505452334880829, "logps/chosen": -0.3406594395637512, "logps/rejected": -3.0602898597717285, "loss": 0.3607, "odds_ratio_loss": 0.14059670269489288, "rewards/accuracies": 1.0, "rewards/chosen": -0.03406594693660736, "rewards/margins": 0.27196305990219116, "rewards/rejected": -0.30602899193763733, "sft_loss": 0.3406594395637512, "step": 1964 }, { "epoch": 2.841648590021692, "grad_norm": 2.034324925578539, "learning_rate": 4.489498207466452e-06, "logits/chosen": -0.23154626786708832, "logits/rejected": -0.22844228148460388, "logps/chosen": -0.35256677865982056, "logps/rejected": -3.0019397735595703, "loss": 0.2994, "odds_ratio_loss": 0.14165949821472168, "rewards/accuracies": 1.0, "rewards/chosen": -0.035256676375865936, "rewards/margins": 0.26493731141090393, "rewards/rejected": -0.30019402503967285, "sft_loss": 0.35256677865982056, "step": 1965 }, { "epoch": 2.8430947216196674, "grad_norm": 2.297602410152931, "learning_rate": 4.486415528916345e-06, "logits/chosen": -0.26251840591430664, "logits/rejected": -0.21951915323734283, "logps/chosen": -0.4026089906692505, "logps/rejected": -2.375399589538574, "loss": 0.3456, "odds_ratio_loss": 0.12557853758335114, "rewards/accuracies": 1.0, "rewards/chosen": -0.04026089981198311, "rewards/margins": 0.19727906584739685, "rewards/rejected": -0.23753996193408966, "sft_loss": 0.4026089906692505, "step": 1966 }, { "epoch": 2.8445408532176426, "grad_norm": 2.3981821546026594, "learning_rate": 4.483332557104506e-06, "logits/chosen": -0.31481003761291504, "logits/rejected": -0.23861244320869446, "logps/chosen": -0.45590323209762573, "logps/rejected": -4.1064300537109375, "loss": 0.4072, "odds_ratio_loss": 0.1462942659854889, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04559032618999481, "rewards/margins": 0.3650527596473694, "rewards/rejected": -0.4106430411338806, "sft_loss": 0.45590323209762573, "step": 1967 }, { "epoch": 2.8459869848156183, "grad_norm": 2.5671049781697786, "learning_rate": 4.4802492938896665e-06, "logits/chosen": -0.28605127334594727, "logits/rejected": -0.3137693405151367, "logps/chosen": -0.44223883748054504, "logps/rejected": -3.326413631439209, "loss": 0.3795, "odds_ratio_loss": 0.149356871843338, "rewards/accuracies": 1.0, "rewards/chosen": -0.044223885983228683, "rewards/margins": 0.2884174585342407, "rewards/rejected": -0.3326413333415985, "sft_loss": 0.44223883748054504, "step": 1968 }, { "epoch": 2.8474331164135935, "grad_norm": 2.2327104314001893, "learning_rate": 4.477165741130739e-06, "logits/chosen": -0.25932520627975464, "logits/rejected": -0.2992691397666931, "logps/chosen": -0.4744134843349457, "logps/rejected": -2.5101125240325928, "loss": 0.4011, "odds_ratio_loss": 0.1707616001367569, "rewards/accuracies": 1.0, "rewards/chosen": -0.047441352158784866, "rewards/margins": 0.20356988906860352, "rewards/rejected": -0.2510112524032593, "sft_loss": 0.4744134843349457, "step": 1969 }, { "epoch": 2.8488792480115688, "grad_norm": 2.293971064562503, "learning_rate": 4.474081900686811e-06, "logits/chosen": -0.11275100708007812, "logits/rejected": -0.20851099491119385, "logps/chosen": -0.3515588641166687, "logps/rejected": -4.357938766479492, "loss": 0.3836, "odds_ratio_loss": 0.10340322554111481, "rewards/accuracies": 1.0, "rewards/chosen": -0.03515588864684105, "rewards/margins": 0.40063801407814026, "rewards/rejected": -0.4357939064502716, "sft_loss": 0.3515588641166687, "step": 1970 }, { "epoch": 2.8503253796095445, "grad_norm": 2.3802443419941293, "learning_rate": 4.470997774417138e-06, "logits/chosen": -0.23158510029315948, "logits/rejected": -0.12711213529109955, "logps/chosen": -0.18399205803871155, "logps/rejected": -4.86518669128418, "loss": 0.3133, "odds_ratio_loss": 0.026787567883729935, "rewards/accuracies": 1.0, "rewards/chosen": -0.018399206921458244, "rewards/margins": 0.46811944246292114, "rewards/rejected": -0.4865187108516693, "sft_loss": 0.18399205803871155, "step": 1971 }, { "epoch": 2.85177151120752, "grad_norm": 4.32614696988975, "learning_rate": 4.467913364181152e-06, "logits/chosen": -0.1700606644153595, "logits/rejected": -0.4262159466743469, "logps/chosen": -0.3667178153991699, "logps/rejected": -4.321340560913086, "loss": 0.3362, "odds_ratio_loss": 0.15199513733386993, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03667178750038147, "rewards/margins": 0.395462304353714, "rewards/rejected": -0.43213409185409546, "sft_loss": 0.3667178153991699, "step": 1972 }, { "epoch": 2.8532176428054954, "grad_norm": 2.4945795457234623, "learning_rate": 4.464828671838456e-06, "logits/chosen": -0.20741111040115356, "logits/rejected": -0.28869232535362244, "logps/chosen": -0.4533330798149109, "logps/rejected": -3.0039780139923096, "loss": 0.3851, "odds_ratio_loss": 0.14457382261753082, "rewards/accuracies": 1.0, "rewards/chosen": -0.04533331096172333, "rewards/margins": 0.2550644874572754, "rewards/rejected": -0.3003978133201599, "sft_loss": 0.4533330798149109, "step": 1973 }, { "epoch": 2.8546637744034706, "grad_norm": 2.2392165620325555, "learning_rate": 4.4617436992488255e-06, "logits/chosen": -0.21774300932884216, "logits/rejected": -0.22718608379364014, "logps/chosen": -0.26567813754081726, "logps/rejected": -3.7159323692321777, "loss": 0.3558, "odds_ratio_loss": 0.11599615216255188, "rewards/accuracies": 1.0, "rewards/chosen": -0.026567813009023666, "rewards/margins": 0.3450254201889038, "rewards/rejected": -0.3715932369232178, "sft_loss": 0.26567813754081726, "step": 1974 }, { "epoch": 2.8561099060014463, "grad_norm": 2.468051654089775, "learning_rate": 4.458658448272196e-06, "logits/chosen": -0.3251468241214752, "logits/rejected": -0.34824731945991516, "logps/chosen": -0.3763624429702759, "logps/rejected": -2.987534523010254, "loss": 0.3765, "odds_ratio_loss": 0.14948882162570953, "rewards/accuracies": 1.0, "rewards/chosen": -0.03763624280691147, "rewards/margins": 0.26111719012260437, "rewards/rejected": -0.29875344038009644, "sft_loss": 0.3763624429702759, "step": 1975 }, { "epoch": 2.8575560375994216, "grad_norm": 2.1974442142697272, "learning_rate": 4.455572920768681e-06, "logits/chosen": -0.2270897775888443, "logits/rejected": -0.25597840547561646, "logps/chosen": -0.4614500105381012, "logps/rejected": -3.404634714126587, "loss": 0.37, "odds_ratio_loss": 0.19110485911369324, "rewards/accuracies": 0.9375, "rewards/chosen": -0.046144999563694, "rewards/margins": 0.29431846737861633, "rewards/rejected": -0.34046345949172974, "sft_loss": 0.4614500105381012, "step": 1976 }, { "epoch": 2.859002169197397, "grad_norm": 2.0140127976514846, "learning_rate": 4.452487118598554e-06, "logits/chosen": -0.08335888385772705, "logits/rejected": -0.18578308820724487, "logps/chosen": -0.3250659108161926, "logps/rejected": -4.700540542602539, "loss": 0.3012, "odds_ratio_loss": 0.12990859150886536, "rewards/accuracies": 1.0, "rewards/chosen": -0.03250659257173538, "rewards/margins": 0.4375474452972412, "rewards/rejected": -0.470054030418396, "sft_loss": 0.3250659108161926, "step": 1977 }, { "epoch": 2.8604483007953725, "grad_norm": 2.4445686295120987, "learning_rate": 4.44940104362226e-06, "logits/chosen": -0.14989988505840302, "logits/rejected": -0.26366785168647766, "logps/chosen": -0.352664053440094, "logps/rejected": -2.437861680984497, "loss": 0.3218, "odds_ratio_loss": 0.1567659229040146, "rewards/accuracies": 1.0, "rewards/chosen": -0.03526640683412552, "rewards/margins": 0.20851975679397583, "rewards/rejected": -0.24378615617752075, "sft_loss": 0.352664053440094, "step": 1978 }, { "epoch": 2.8618944323933477, "grad_norm": 2.2321339276585483, "learning_rate": 4.446314697700402e-06, "logits/chosen": -0.2847731113433838, "logits/rejected": -0.1818363070487976, "logps/chosen": -0.2849189341068268, "logps/rejected": -3.662630558013916, "loss": 0.315, "odds_ratio_loss": 0.10266374051570892, "rewards/accuracies": 1.0, "rewards/chosen": -0.02849189192056656, "rewards/margins": 0.3377711772918701, "rewards/rejected": -0.3662630617618561, "sft_loss": 0.2849189341068268, "step": 1979 }, { "epoch": 2.863340563991323, "grad_norm": 3.2130933735507603, "learning_rate": 4.44322808269375e-06, "logits/chosen": -0.43049612641334534, "logits/rejected": -0.31409528851509094, "logps/chosen": -0.3502507507801056, "logps/rejected": -4.642218112945557, "loss": 0.3838, "odds_ratio_loss": 0.06136965751647949, "rewards/accuracies": 1.0, "rewards/chosen": -0.03502507507801056, "rewards/margins": 0.4291967749595642, "rewards/rejected": -0.46422186493873596, "sft_loss": 0.3502507507801056, "step": 1980 }, { "epoch": 2.8647866955892987, "grad_norm": 2.6696002700885826, "learning_rate": 4.440141200463237e-06, "logits/chosen": -0.20311379432678223, "logits/rejected": -0.19404862821102142, "logps/chosen": -0.3664669394493103, "logps/rejected": -4.6123809814453125, "loss": 0.342, "odds_ratio_loss": 0.18475694954395294, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03664669767022133, "rewards/margins": 0.42459142208099365, "rewards/rejected": -0.4612380862236023, "sft_loss": 0.3664669394493103, "step": 1981 }, { "epoch": 2.866232827187274, "grad_norm": 2.313763366163634, "learning_rate": 4.437054052869955e-06, "logits/chosen": -0.14165878295898438, "logits/rejected": -0.0675276443362236, "logps/chosen": -0.4247075319290161, "logps/rejected": -3.4205408096313477, "loss": 0.4258, "odds_ratio_loss": 0.1645819991827011, "rewards/accuracies": 1.0, "rewards/chosen": -0.04247075691819191, "rewards/margins": 0.2995833158493042, "rewards/rejected": -0.3420540690422058, "sft_loss": 0.4247075319290161, "step": 1982 }, { "epoch": 2.8676789587852496, "grad_norm": 2.4102794434344044, "learning_rate": 4.433966641775155e-06, "logits/chosen": -0.037582166492938995, "logits/rejected": -0.06151975318789482, "logps/chosen": -0.2677247226238251, "logps/rejected": -2.7316319942474365, "loss": 0.3653, "odds_ratio_loss": 0.1058277040719986, "rewards/accuracies": 1.0, "rewards/chosen": -0.026772471144795418, "rewards/margins": 0.24639073014259338, "rewards/rejected": -0.27316319942474365, "sft_loss": 0.2677247226238251, "step": 1983 }, { "epoch": 2.869125090383225, "grad_norm": 2.0562322116516647, "learning_rate": 4.430878969040252e-06, "logits/chosen": -0.16897395253181458, "logits/rejected": -0.08548015356063843, "logps/chosen": -0.531076192855835, "logps/rejected": -2.6939971446990967, "loss": 0.3929, "odds_ratio_loss": 0.24118700623512268, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0531076155602932, "rewards/margins": 0.21629208326339722, "rewards/rejected": -0.2693997025489807, "sft_loss": 0.531076192855835, "step": 1984 }, { "epoch": 2.8705712219812005, "grad_norm": 2.4932561485656004, "learning_rate": 4.427791036526813e-06, "logits/chosen": -0.35910582542419434, "logits/rejected": -0.25231948494911194, "logps/chosen": -0.41696059703826904, "logps/rejected": -3.8195204734802246, "loss": 0.3503, "odds_ratio_loss": 0.12001129239797592, "rewards/accuracies": 1.0, "rewards/chosen": -0.041696060448884964, "rewards/margins": 0.3402559757232666, "rewards/rejected": -0.3819520175457001, "sft_loss": 0.41696059703826904, "step": 1985 }, { "epoch": 2.8720173535791758, "grad_norm": 2.328030440907696, "learning_rate": 4.4247028460965665e-06, "logits/chosen": -0.11354707181453705, "logits/rejected": -0.058158066123723984, "logps/chosen": -0.2665402293205261, "logps/rejected": -4.639363765716553, "loss": 0.3374, "odds_ratio_loss": 0.049688465893268585, "rewards/accuracies": 1.0, "rewards/chosen": -0.026654021814465523, "rewards/margins": 0.43728238344192505, "rewards/rejected": -0.46393638849258423, "sft_loss": 0.2665402293205261, "step": 1986 }, { "epoch": 2.873463485177151, "grad_norm": 2.2398543080728786, "learning_rate": 4.4216143996113905e-06, "logits/chosen": -0.21972495317459106, "logits/rejected": -0.18882104754447937, "logps/chosen": -0.3515039384365082, "logps/rejected": -4.940927982330322, "loss": 0.3557, "odds_ratio_loss": 0.15298905968666077, "rewards/accuracies": 1.0, "rewards/chosen": -0.03515039384365082, "rewards/margins": 0.4589424729347229, "rewards/rejected": -0.4940928518772125, "sft_loss": 0.3515039384365082, "step": 1987 }, { "epoch": 2.8749096167751267, "grad_norm": 2.534407245251865, "learning_rate": 4.418525698933324e-06, "logits/chosen": -0.23043112456798553, "logits/rejected": -0.29163533449172974, "logps/chosen": -0.3577241897583008, "logps/rejected": -3.6975207328796387, "loss": 0.3254, "odds_ratio_loss": 0.16912941634655, "rewards/accuracies": 1.0, "rewards/chosen": -0.0357724204659462, "rewards/margins": 0.33397969603538513, "rewards/rejected": -0.36975207924842834, "sft_loss": 0.3577241897583008, "step": 1988 }, { "epoch": 2.876355748373102, "grad_norm": 3.1712296916429317, "learning_rate": 4.415436745924553e-06, "logits/chosen": -0.1479547768831253, "logits/rejected": -0.1481505185365677, "logps/chosen": -0.40298303961753845, "logps/rejected": -4.623858451843262, "loss": 0.3811, "odds_ratio_loss": 0.11026880890130997, "rewards/accuracies": 1.0, "rewards/chosen": -0.040298305451869965, "rewards/margins": 0.42208752036094666, "rewards/rejected": -0.4623858332633972, "sft_loss": 0.40298303961753845, "step": 1989 }, { "epoch": 2.877801879971077, "grad_norm": 2.256856567396973, "learning_rate": 4.412347542447423e-06, "logits/chosen": -0.2117566615343094, "logits/rejected": -0.15659171342849731, "logps/chosen": -0.31408044695854187, "logps/rejected": -4.856984615325928, "loss": 0.2812, "odds_ratio_loss": 0.05605805665254593, "rewards/accuracies": 1.0, "rewards/chosen": -0.03140804544091225, "rewards/margins": 0.4542904496192932, "rewards/rejected": -0.4856984615325928, "sft_loss": 0.31408044695854187, "step": 1990 }, { "epoch": 2.879248011569053, "grad_norm": 2.7358142066590374, "learning_rate": 4.409258090364424e-06, "logits/chosen": -0.33230820298194885, "logits/rejected": -0.2448243498802185, "logps/chosen": -0.5196665525436401, "logps/rejected": -3.3180594444274902, "loss": 0.4021, "odds_ratio_loss": 0.1978374421596527, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05196665599942207, "rewards/margins": 0.27983927726745605, "rewards/rejected": -0.331805944442749, "sft_loss": 0.5196665525436401, "step": 1991 }, { "epoch": 2.880694143167028, "grad_norm": 2.2547489447553715, "learning_rate": 4.406168391538197e-06, "logits/chosen": -0.2996472120285034, "logits/rejected": -0.25789931416511536, "logps/chosen": -0.28679195046424866, "logps/rejected": -4.5447258949279785, "loss": 0.3514, "odds_ratio_loss": 0.07733619213104248, "rewards/accuracies": 1.0, "rewards/chosen": -0.028679195791482925, "rewards/margins": 0.4257934093475342, "rewards/rejected": -0.4544726014137268, "sft_loss": 0.28679195046424866, "step": 1992 }, { "epoch": 2.8821402747650033, "grad_norm": 2.49254805386154, "learning_rate": 4.403078447831534e-06, "logits/chosen": -0.2353857308626175, "logits/rejected": -0.1543744057416916, "logps/chosen": -0.15707671642303467, "logps/rejected": -3.2030768394470215, "loss": 0.2922, "odds_ratio_loss": 0.05911577120423317, "rewards/accuracies": 1.0, "rewards/chosen": -0.015707671642303467, "rewards/margins": 0.3046000301837921, "rewards/rejected": -0.32030773162841797, "sft_loss": 0.15707671642303467, "step": 1993 }, { "epoch": 2.883586406362979, "grad_norm": 2.2256194334422967, "learning_rate": 4.399988261107373e-06, "logits/chosen": -0.13290566205978394, "logits/rejected": -0.31374818086624146, "logps/chosen": -0.5070281028747559, "logps/rejected": -2.8628692626953125, "loss": 0.3685, "odds_ratio_loss": 0.29547181725502014, "rewards/accuracies": 0.875, "rewards/chosen": -0.05070280656218529, "rewards/margins": 0.23558413982391357, "rewards/rejected": -0.2862869203090668, "sft_loss": 0.5070281028747559, "step": 1994 }, { "epoch": 2.8850325379609547, "grad_norm": 3.5396995388199994, "learning_rate": 4.396897833228801e-06, "logits/chosen": -0.09624442458152771, "logits/rejected": -0.20986582338809967, "logps/chosen": -0.3855375349521637, "logps/rejected": -4.744143486022949, "loss": 0.3784, "odds_ratio_loss": 0.0990835428237915, "rewards/accuracies": 1.0, "rewards/chosen": -0.03855375573039055, "rewards/margins": 0.43586063385009766, "rewards/rejected": -0.4744143784046173, "sft_loss": 0.3855375349521637, "step": 1995 }, { "epoch": 2.88647866955893, "grad_norm": 2.809101607987937, "learning_rate": 4.393807166059044e-06, "logits/chosen": -0.15723095834255219, "logits/rejected": -0.20489570498466492, "logps/chosen": -0.5089755058288574, "logps/rejected": -3.931450843811035, "loss": 0.407, "odds_ratio_loss": 0.22489072382450104, "rewards/accuracies": 0.875, "rewards/chosen": -0.05089755356311798, "rewards/margins": 0.34224751591682434, "rewards/rejected": -0.39314502477645874, "sft_loss": 0.5089755058288574, "step": 1996 }, { "epoch": 2.887924801156905, "grad_norm": 2.5416958031833077, "learning_rate": 4.390716261461484e-06, "logits/chosen": -0.25893235206604004, "logits/rejected": -0.2654384970664978, "logps/chosen": -0.4957669675350189, "logps/rejected": -3.1314098834991455, "loss": 0.4241, "odds_ratio_loss": 0.17412987351417542, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04957669600844383, "rewards/margins": 0.26356425881385803, "rewards/rejected": -0.31314098834991455, "sft_loss": 0.4957669675350189, "step": 1997 }, { "epoch": 2.889370932754881, "grad_norm": 2.293405340339323, "learning_rate": 4.387625121299632e-06, "logits/chosen": -0.23933672904968262, "logits/rejected": -0.2176375389099121, "logps/chosen": -0.3258305788040161, "logps/rejected": -3.0861976146698, "loss": 0.3925, "odds_ratio_loss": 0.14498911798000336, "rewards/accuracies": 1.0, "rewards/chosen": -0.03258305788040161, "rewards/margins": 0.27603673934936523, "rewards/rejected": -0.30861979722976685, "sft_loss": 0.3258305788040161, "step": 1998 }, { "epoch": 2.890817064352856, "grad_norm": 2.1941997131433344, "learning_rate": 4.384533747437151e-06, "logits/chosen": -0.33271104097366333, "logits/rejected": -0.28577131032943726, "logps/chosen": -0.3192511796951294, "logps/rejected": -3.704035758972168, "loss": 0.3826, "odds_ratio_loss": 0.11826330423355103, "rewards/accuracies": 1.0, "rewards/chosen": -0.03192511945962906, "rewards/margins": 0.3384784460067749, "rewards/rejected": -0.370403528213501, "sft_loss": 0.3192511796951294, "step": 1999 }, { "epoch": 2.8922631959508314, "grad_norm": 2.451640159463382, "learning_rate": 4.381442141737842e-06, "logits/chosen": -0.1317928582429886, "logits/rejected": -0.19109252095222473, "logps/chosen": -0.44239649176597595, "logps/rejected": -2.6792032718658447, "loss": 0.3352, "odds_ratio_loss": 0.19351676106452942, "rewards/accuracies": 1.0, "rewards/chosen": -0.044239647686481476, "rewards/margins": 0.22368068993091583, "rewards/rejected": -0.2679203450679779, "sft_loss": 0.44239649176597595, "step": 2000 }, { "epoch": 2.893709327548807, "grad_norm": 2.9987909450785004, "learning_rate": 4.378350306065647e-06, "logits/chosen": -0.11029291152954102, "logits/rejected": -0.20811080932617188, "logps/chosen": -0.5102984309196472, "logps/rejected": -2.461052894592285, "loss": 0.4728, "odds_ratio_loss": 0.26014572381973267, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05102984607219696, "rewards/margins": 0.19507546722888947, "rewards/rejected": -0.24610531330108643, "sft_loss": 0.5102984309196472, "step": 2001 }, { "epoch": 2.8951554591467823, "grad_norm": 4.577215598616385, "learning_rate": 4.375258242284641e-06, "logits/chosen": -0.125568687915802, "logits/rejected": -0.23617637157440186, "logps/chosen": -0.1757880002260208, "logps/rejected": -5.425671100616455, "loss": 0.3219, "odds_ratio_loss": 0.075681671500206, "rewards/accuracies": 1.0, "rewards/chosen": -0.01757879927754402, "rewards/margins": 0.5249882936477661, "rewards/rejected": -0.5425670742988586, "sft_loss": 0.1757880002260208, "step": 2002 }, { "epoch": 2.8966015907447575, "grad_norm": 2.5535034041144864, "learning_rate": 4.372165952259043e-06, "logits/chosen": -0.25127291679382324, "logits/rejected": -0.20186248421669006, "logps/chosen": -0.40638288855552673, "logps/rejected": -1.6983206272125244, "loss": 0.4103, "odds_ratio_loss": 0.17356377840042114, "rewards/accuracies": 1.0, "rewards/chosen": -0.04063829034566879, "rewards/margins": 0.12919378280639648, "rewards/rejected": -0.16983206570148468, "sft_loss": 0.40638288855552673, "step": 2003 }, { "epoch": 2.8980477223427332, "grad_norm": 2.415128934348208, "learning_rate": 4.369073437853208e-06, "logits/chosen": -0.20070333778858185, "logits/rejected": -0.18373045325279236, "logps/chosen": -0.3903118968009949, "logps/rejected": -5.177107810974121, "loss": 0.4006, "odds_ratio_loss": 0.12080082297325134, "rewards/accuracies": 1.0, "rewards/chosen": -0.03903118893504143, "rewards/margins": 0.4786795973777771, "rewards/rejected": -0.51771080493927, "sft_loss": 0.3903118968009949, "step": 2004 }, { "epoch": 2.8994938539407085, "grad_norm": 2.5665948351613945, "learning_rate": 4.365980700931622e-06, "logits/chosen": -0.21441921591758728, "logits/rejected": -0.16341614723205566, "logps/chosen": -0.3254525661468506, "logps/rejected": -4.25070333480835, "loss": 0.3888, "odds_ratio_loss": 0.13244378566741943, "rewards/accuracies": 1.0, "rewards/chosen": -0.03254526108503342, "rewards/margins": 0.3925250768661499, "rewards/rejected": -0.4250703454017639, "sft_loss": 0.3254525661468506, "step": 2005 }, { "epoch": 2.900939985538684, "grad_norm": 2.41284437035138, "learning_rate": 4.3628877433589085e-06, "logits/chosen": -0.216677725315094, "logits/rejected": -0.24935810267925262, "logps/chosen": -0.30690398812294006, "logps/rejected": -4.660876750946045, "loss": 0.3046, "odds_ratio_loss": 0.11162017285823822, "rewards/accuracies": 1.0, "rewards/chosen": -0.030690398067235947, "rewards/margins": 0.43539732694625854, "rewards/rejected": -0.4660876989364624, "sft_loss": 0.30690398812294006, "step": 2006 }, { "epoch": 2.9023861171366594, "grad_norm": 2.335670194118243, "learning_rate": 4.359794566999822e-06, "logits/chosen": -0.10944445431232452, "logits/rejected": -0.1079663336277008, "logps/chosen": -0.4069558382034302, "logps/rejected": -3.064016342163086, "loss": 0.3672, "odds_ratio_loss": 0.17633295059204102, "rewards/accuracies": 1.0, "rewards/chosen": -0.04069558531045914, "rewards/margins": 0.26570606231689453, "rewards/rejected": -0.30640166997909546, "sft_loss": 0.4069558382034302, "step": 2007 }, { "epoch": 2.903832248734635, "grad_norm": 2.4178856154509836, "learning_rate": 4.356701173719252e-06, "logits/chosen": -0.1341048777103424, "logits/rejected": -0.20276595652103424, "logps/chosen": -0.3767666220664978, "logps/rejected": -2.81579852104187, "loss": 0.3617, "odds_ratio_loss": 0.18513742089271545, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03767666220664978, "rewards/margins": 0.24390317499637604, "rewards/rejected": -0.281579852104187, "sft_loss": 0.3767666220664978, "step": 2008 }, { "epoch": 2.9052783803326103, "grad_norm": 2.132182042456953, "learning_rate": 4.3536075653822155e-06, "logits/chosen": -0.1696842908859253, "logits/rejected": -0.2770502269268036, "logps/chosen": -0.38094091415405273, "logps/rejected": -4.085338115692139, "loss": 0.3671, "odds_ratio_loss": 0.13694609701633453, "rewards/accuracies": 1.0, "rewards/chosen": -0.038094088435173035, "rewards/margins": 0.370439738035202, "rewards/rejected": -0.40853381156921387, "sft_loss": 0.38094091415405273, "step": 2009 }, { "epoch": 2.9067245119305856, "grad_norm": 2.4441830024202638, "learning_rate": 4.3505137438538605e-06, "logits/chosen": -0.30766594409942627, "logits/rejected": -0.17128098011016846, "logps/chosen": -0.31321054697036743, "logps/rejected": -3.57883620262146, "loss": 0.3863, "odds_ratio_loss": 0.09254439175128937, "rewards/accuracies": 1.0, "rewards/chosen": -0.03132105618715286, "rewards/margins": 0.3265625834465027, "rewards/rejected": -0.35788363218307495, "sft_loss": 0.31321054697036743, "step": 2010 }, { "epoch": 2.9081706435285612, "grad_norm": 2.474111533149485, "learning_rate": 4.347419710999464e-06, "logits/chosen": -0.198749840259552, "logits/rejected": -0.24889856576919556, "logps/chosen": -0.4312863349914551, "logps/rejected": -3.4733762741088867, "loss": 0.3641, "odds_ratio_loss": 0.15984997153282166, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04312863573431969, "rewards/margins": 0.3042089641094208, "rewards/rejected": -0.34733760356903076, "sft_loss": 0.4312863349914551, "step": 2011 }, { "epoch": 2.9096167751265365, "grad_norm": 2.2140379184527874, "learning_rate": 4.34432546868443e-06, "logits/chosen": -0.0032480377703905106, "logits/rejected": -0.05943544581532478, "logps/chosen": -0.28691381216049194, "logps/rejected": -2.6978836059570312, "loss": 0.3977, "odds_ratio_loss": 0.11956118047237396, "rewards/accuracies": 1.0, "rewards/chosen": -0.028691379353404045, "rewards/margins": 0.24109697341918945, "rewards/rejected": -0.26978832483291626, "sft_loss": 0.28691381216049194, "step": 2012 }, { "epoch": 2.9110629067245117, "grad_norm": 2.1933346188671705, "learning_rate": 4.3412310187742895e-06, "logits/chosen": -0.21547159552574158, "logits/rejected": -0.20167696475982666, "logps/chosen": -0.3793547451496124, "logps/rejected": -3.8487207889556885, "loss": 0.3633, "odds_ratio_loss": 0.11790119856595993, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03793547675013542, "rewards/margins": 0.3469366133213043, "rewards/rejected": -0.38487207889556885, "sft_loss": 0.3793547451496124, "step": 2013 }, { "epoch": 2.9125090383224874, "grad_norm": 2.2000607686897884, "learning_rate": 4.338136363134696e-06, "logits/chosen": -0.16653400659561157, "logits/rejected": -0.22715330123901367, "logps/chosen": -0.3354107737541199, "logps/rejected": -4.489915370941162, "loss": 0.3521, "odds_ratio_loss": 0.09342381358146667, "rewards/accuracies": 1.0, "rewards/chosen": -0.03354107588529587, "rewards/margins": 0.41545045375823975, "rewards/rejected": -0.4489915370941162, "sft_loss": 0.3354107737541199, "step": 2014 }, { "epoch": 2.9139551699204627, "grad_norm": 2.345249585264906, "learning_rate": 4.3350415036314295e-06, "logits/chosen": -0.1863386034965515, "logits/rejected": -0.17719031870365143, "logps/chosen": -0.35153529047966003, "logps/rejected": -3.003571033477783, "loss": 0.2938, "odds_ratio_loss": 0.20526739954948425, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03515353053808212, "rewards/margins": 0.265203595161438, "rewards/rejected": -0.3003571033477783, "sft_loss": 0.35153529047966003, "step": 2015 }, { "epoch": 2.915401301518438, "grad_norm": 2.664410560581058, "learning_rate": 4.331946442130393e-06, "logits/chosen": -0.32930123805999756, "logits/rejected": -0.31893402338027954, "logps/chosen": -0.4697510004043579, "logps/rejected": -2.2911057472229004, "loss": 0.4361, "odds_ratio_loss": 0.14694637060165405, "rewards/accuracies": 1.0, "rewards/chosen": -0.04697509855031967, "rewards/margins": 0.18213549256324768, "rewards/rejected": -0.22911058366298676, "sft_loss": 0.4697510004043579, "step": 2016 }, { "epoch": 2.9168474331164136, "grad_norm": 2.0455275341242096, "learning_rate": 4.32885118049761e-06, "logits/chosen": -0.24276413023471832, "logits/rejected": -0.3042674660682678, "logps/chosen": -0.31963613629341125, "logps/rejected": -4.9066643714904785, "loss": 0.3045, "odds_ratio_loss": 0.1002592146396637, "rewards/accuracies": 1.0, "rewards/chosen": -0.03196360915899277, "rewards/margins": 0.45870283246040344, "rewards/rejected": -0.4906664490699768, "sft_loss": 0.31963613629341125, "step": 2017 }, { "epoch": 2.9182935647143893, "grad_norm": 2.4816803752814014, "learning_rate": 4.325755720599226e-06, "logits/chosen": -0.17681774497032166, "logits/rejected": -0.13152150809764862, "logps/chosen": -0.39280036091804504, "logps/rejected": -2.7280354499816895, "loss": 0.3974, "odds_ratio_loss": 0.11691177636384964, "rewards/accuracies": 1.0, "rewards/chosen": -0.03928003087639809, "rewards/margins": 0.23352351784706116, "rewards/rejected": -0.27280354499816895, "sft_loss": 0.39280036091804504, "step": 2018 }, { "epoch": 2.9197396963123645, "grad_norm": 2.4828814390035374, "learning_rate": 4.322660064301504e-06, "logits/chosen": -0.20829474925994873, "logits/rejected": -0.2244415581226349, "logps/chosen": -0.38568395376205444, "logps/rejected": -3.2982382774353027, "loss": 0.3738, "odds_ratio_loss": 0.11840784549713135, "rewards/accuracies": 1.0, "rewards/chosen": -0.0385683998465538, "rewards/margins": 0.2912554442882538, "rewards/rejected": -0.3298238515853882, "sft_loss": 0.38568395376205444, "step": 2019 }, { "epoch": 2.9211858279103398, "grad_norm": 2.6918576722836085, "learning_rate": 4.319564213470828e-06, "logits/chosen": -0.20029829442501068, "logits/rejected": -0.15000326931476593, "logps/chosen": -0.5417320132255554, "logps/rejected": -4.520184516906738, "loss": 0.3793, "odds_ratio_loss": 0.19173075258731842, "rewards/accuracies": 0.875, "rewards/chosen": -0.05417320504784584, "rewards/margins": 0.3978452682495117, "rewards/rejected": -0.45201849937438965, "sft_loss": 0.5417320132255554, "step": 2020 }, { "epoch": 2.9226319595083154, "grad_norm": 2.6470788518613233, "learning_rate": 4.316468169973698e-06, "logits/chosen": -0.2127470076084137, "logits/rejected": -0.30946892499923706, "logps/chosen": -0.3537566363811493, "logps/rejected": -4.601029396057129, "loss": 0.4096, "odds_ratio_loss": 0.06098828837275505, "rewards/accuracies": 1.0, "rewards/chosen": -0.03537566214799881, "rewards/margins": 0.42472726106643677, "rewards/rejected": -0.460102915763855, "sft_loss": 0.3537566363811493, "step": 2021 }, { "epoch": 2.9240780911062907, "grad_norm": 2.31032069822132, "learning_rate": 4.31337193567673e-06, "logits/chosen": -0.27767282724380493, "logits/rejected": -0.2333839237689972, "logps/chosen": -0.4112637937068939, "logps/rejected": -5.107657432556152, "loss": 0.3246, "odds_ratio_loss": 0.13032792508602142, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04112637788057327, "rewards/margins": 0.4696393609046936, "rewards/rejected": -0.5107657313346863, "sft_loss": 0.4112637937068939, "step": 2022 }, { "epoch": 2.925524222704266, "grad_norm": 2.155257212541574, "learning_rate": 4.3102755124466525e-06, "logits/chosen": -0.2570434510707855, "logits/rejected": -0.20935188233852386, "logps/chosen": -0.40464794635772705, "logps/rejected": -2.422607183456421, "loss": 0.3659, "odds_ratio_loss": 0.14167605340480804, "rewards/accuracies": 0.9375, "rewards/chosen": -0.040464796125888824, "rewards/margins": 0.20179593563079834, "rewards/rejected": -0.24226072430610657, "sft_loss": 0.40464794635772705, "step": 2023 }, { "epoch": 2.9269703543022416, "grad_norm": 2.2816190632498343, "learning_rate": 4.307178902150315e-06, "logits/chosen": -0.18600818514823914, "logits/rejected": -0.24983981251716614, "logps/chosen": -0.3823423385620117, "logps/rejected": -2.81748104095459, "loss": 0.405, "odds_ratio_loss": 0.1572282761335373, "rewards/accuracies": 1.0, "rewards/chosen": -0.03823423013091087, "rewards/margins": 0.24351388216018677, "rewards/rejected": -0.28174811601638794, "sft_loss": 0.3823423385620117, "step": 2024 }, { "epoch": 2.928416485900217, "grad_norm": 2.2882575706198125, "learning_rate": 4.3040821066546736e-06, "logits/chosen": -0.22499872744083405, "logits/rejected": -0.17465078830718994, "logps/chosen": -0.38680657744407654, "logps/rejected": -3.9832472801208496, "loss": 0.3367, "odds_ratio_loss": 0.10981922596693039, "rewards/accuracies": 1.0, "rewards/chosen": -0.038680657744407654, "rewards/margins": 0.3596440851688385, "rewards/rejected": -0.39832472801208496, "sft_loss": 0.38680657744407654, "step": 2025 }, { "epoch": 2.929862617498192, "grad_norm": 2.3630936993834335, "learning_rate": 4.300985127826796e-06, "logits/chosen": -0.3212575912475586, "logits/rejected": -0.3458347022533417, "logps/chosen": -0.5177043676376343, "logps/rejected": -3.096360206604004, "loss": 0.4595, "odds_ratio_loss": 0.16100654006004333, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05177043378353119, "rewards/margins": 0.2578656077384949, "rewards/rejected": -0.30963602662086487, "sft_loss": 0.5177043676376343, "step": 2026 }, { "epoch": 2.931308749096168, "grad_norm": 2.3846314203711674, "learning_rate": 4.297887967533865e-06, "logits/chosen": -0.17958812415599823, "logits/rejected": -0.10923945903778076, "logps/chosen": -0.3733343780040741, "logps/rejected": -4.5489301681518555, "loss": 0.3881, "odds_ratio_loss": 0.18937966227531433, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03733343631029129, "rewards/margins": 0.4175596237182617, "rewards/rejected": -0.4548930525779724, "sft_loss": 0.3733343780040741, "step": 2027 }, { "epoch": 2.932754880694143, "grad_norm": 2.586612143421851, "learning_rate": 4.294790627643169e-06, "logits/chosen": -0.3360060453414917, "logits/rejected": -0.3124268651008606, "logps/chosen": -0.2640987038612366, "logps/rejected": -3.922750234603882, "loss": 0.4057, "odds_ratio_loss": 0.06727544218301773, "rewards/accuracies": 1.0, "rewards/chosen": -0.026409868150949478, "rewards/margins": 0.36586514115333557, "rewards/rejected": -0.39227503538131714, "sft_loss": 0.2640987038612366, "step": 2028 }, { "epoch": 2.9342010122921187, "grad_norm": 3.000278375564883, "learning_rate": 4.2916931100221056e-06, "logits/chosen": -0.16415224969387054, "logits/rejected": -0.2963314652442932, "logps/chosen": -0.4116109013557434, "logps/rejected": -2.8866724967956543, "loss": 0.4228, "odds_ratio_loss": 0.20921087265014648, "rewards/accuracies": 1.0, "rewards/chosen": -0.04116109386086464, "rewards/margins": 0.24750614166259766, "rewards/rejected": -0.288667231798172, "sft_loss": 0.4116109013557434, "step": 2029 }, { "epoch": 2.935647143890094, "grad_norm": 2.5339856332023936, "learning_rate": 4.288595416538179e-06, "logits/chosen": -0.233162060379982, "logits/rejected": -0.23949375748634338, "logps/chosen": -0.3766535222530365, "logps/rejected": -4.669894695281982, "loss": 0.4059, "odds_ratio_loss": 0.08383668214082718, "rewards/accuracies": 1.0, "rewards/chosen": -0.03766535222530365, "rewards/margins": 0.4293241500854492, "rewards/rejected": -0.46698951721191406, "sft_loss": 0.3766535222530365, "step": 2030 }, { "epoch": 2.9370932754880696, "grad_norm": 2.5983661776860796, "learning_rate": 4.285497549059001e-06, "logits/chosen": -0.16616466641426086, "logits/rejected": -0.13526281714439392, "logps/chosen": -0.3628811240196228, "logps/rejected": -3.515805721282959, "loss": 0.3736, "odds_ratio_loss": 0.07886926084756851, "rewards/accuracies": 1.0, "rewards/chosen": -0.03628811240196228, "rewards/margins": 0.31529247760772705, "rewards/rejected": -0.35158056020736694, "sft_loss": 0.3628811240196228, "step": 2031 }, { "epoch": 2.938539407086045, "grad_norm": 1.9948751384440309, "learning_rate": 4.282399509452288e-06, "logits/chosen": -0.29387766122817993, "logits/rejected": -0.3654487729072571, "logps/chosen": -0.32119542360305786, "logps/rejected": -3.8875999450683594, "loss": 0.3627, "odds_ratio_loss": 0.07366838306188583, "rewards/accuracies": 1.0, "rewards/chosen": -0.032119542360305786, "rewards/margins": 0.35664045810699463, "rewards/rejected": -0.388759970664978, "sft_loss": 0.32119542360305786, "step": 2032 }, { "epoch": 2.93998553868402, "grad_norm": 2.0374482986686786, "learning_rate": 4.279301299585859e-06, "logits/chosen": -0.23302549123764038, "logits/rejected": -0.16700975596904755, "logps/chosen": -0.2750675678253174, "logps/rejected": -2.0055830478668213, "loss": 0.3007, "odds_ratio_loss": 0.11454141139984131, "rewards/accuracies": 1.0, "rewards/chosen": -0.027506759390234947, "rewards/margins": 0.17305153608322144, "rewards/rejected": -0.20055830478668213, "sft_loss": 0.2750675678253174, "step": 2033 }, { "epoch": 2.941431670281996, "grad_norm": 2.848613244485053, "learning_rate": 4.276202921327636e-06, "logits/chosen": -0.11311466991901398, "logits/rejected": -0.18168656527996063, "logps/chosen": -0.3654848337173462, "logps/rejected": -2.3824963569641113, "loss": 0.4009, "odds_ratio_loss": 0.19991439580917358, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03654848411679268, "rewards/margins": 0.20170114934444427, "rewards/rejected": -0.23824964463710785, "sft_loss": 0.3654848337173462, "step": 2034 }, { "epoch": 2.942877801879971, "grad_norm": 2.173231264643902, "learning_rate": 4.273104376545643e-06, "logits/chosen": -0.307564914226532, "logits/rejected": -0.2892274558544159, "logps/chosen": -0.3027850389480591, "logps/rejected": -2.450563669204712, "loss": 0.3599, "odds_ratio_loss": 0.16147339344024658, "rewards/accuracies": 1.0, "rewards/chosen": -0.030278503894805908, "rewards/margins": 0.214777871966362, "rewards/rejected": -0.2450563758611679, "sft_loss": 0.3027850389480591, "step": 2035 }, { "epoch": 2.9443239334779463, "grad_norm": 2.1288421307945478, "learning_rate": 4.2700056671080044e-06, "logits/chosen": -0.15985527634620667, "logits/rejected": -0.30066192150115967, "logps/chosen": -0.4228432774543762, "logps/rejected": -3.4859423637390137, "loss": 0.3645, "odds_ratio_loss": 0.12999072670936584, "rewards/accuracies": 1.0, "rewards/chosen": -0.04228432849049568, "rewards/margins": 0.30630987882614136, "rewards/rejected": -0.34859421849250793, "sft_loss": 0.4228432774543762, "step": 2036 }, { "epoch": 2.945770065075922, "grad_norm": 2.4343642295209422, "learning_rate": 4.2669067948829425e-06, "logits/chosen": -0.25953909754753113, "logits/rejected": -0.1952180564403534, "logps/chosen": -0.41905689239501953, "logps/rejected": -5.190460681915283, "loss": 0.3077, "odds_ratio_loss": 0.2337718904018402, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04190569370985031, "rewards/margins": 0.4771403968334198, "rewards/rejected": -0.5190460681915283, "sft_loss": 0.41905689239501953, "step": 2037 }, { "epoch": 2.9472161966738972, "grad_norm": 2.44183904840763, "learning_rate": 4.263807761738781e-06, "logits/chosen": -0.254044771194458, "logits/rejected": -0.20028023421764374, "logps/chosen": -0.24801340699195862, "logps/rejected": -4.495428085327148, "loss": 0.3435, "odds_ratio_loss": 0.04579523205757141, "rewards/accuracies": 1.0, "rewards/chosen": -0.024801339954137802, "rewards/margins": 0.4247414469718933, "rewards/rejected": -0.4495427906513214, "sft_loss": 0.24801340699195862, "step": 2038 }, { "epoch": 2.9486623282718725, "grad_norm": 3.1947725674504537, "learning_rate": 4.260708569543937e-06, "logits/chosen": -0.19664132595062256, "logits/rejected": -0.29793834686279297, "logps/chosen": -0.4847376346588135, "logps/rejected": -3.5855674743652344, "loss": 0.4131, "odds_ratio_loss": 0.1429980844259262, "rewards/accuracies": 1.0, "rewards/chosen": -0.04847376421093941, "rewards/margins": 0.31008297204971313, "rewards/rejected": -0.35855674743652344, "sft_loss": 0.4847376346588135, "step": 2039 }, { "epoch": 2.950108459869848, "grad_norm": 2.3990419415007818, "learning_rate": 4.257609220166927e-06, "logits/chosen": -0.18066376447677612, "logits/rejected": -0.18795566260814667, "logps/chosen": -0.3433865010738373, "logps/rejected": -2.787734270095825, "loss": 0.3489, "odds_ratio_loss": 0.13024930655956268, "rewards/accuracies": 1.0, "rewards/chosen": -0.03433865308761597, "rewards/margins": 0.24443475902080536, "rewards/rejected": -0.27877339720726013, "sft_loss": 0.3433865010738373, "step": 2040 }, { "epoch": 2.951554591467824, "grad_norm": 2.7949894636338395, "learning_rate": 4.254509715476356e-06, "logits/chosen": -0.1268310844898224, "logits/rejected": -0.2035345733165741, "logps/chosen": -0.38025563955307007, "logps/rejected": -3.1927952766418457, "loss": 0.3854, "odds_ratio_loss": 0.11911129951477051, "rewards/accuracies": 1.0, "rewards/chosen": -0.038025565445423126, "rewards/margins": 0.28125396370887756, "rewards/rejected": -0.3192794919013977, "sft_loss": 0.38025563955307007, "step": 2041 }, { "epoch": 2.953000723065799, "grad_norm": 2.822207275782586, "learning_rate": 4.251410057340932e-06, "logits/chosen": -0.2712603211402893, "logits/rejected": -0.32916659116744995, "logps/chosen": -0.39672163128852844, "logps/rejected": -4.876379013061523, "loss": 0.3427, "odds_ratio_loss": 0.06600239872932434, "rewards/accuracies": 1.0, "rewards/chosen": -0.039672162383794785, "rewards/margins": 0.44796574115753174, "rewards/rejected": -0.4876379072666168, "sft_loss": 0.39672163128852844, "step": 2042 }, { "epoch": 2.9544468546637743, "grad_norm": 2.205048962845581, "learning_rate": 4.248310247629446e-06, "logits/chosen": -0.2773968577384949, "logits/rejected": -0.20153555274009705, "logps/chosen": -0.18046045303344727, "logps/rejected": -4.676190376281738, "loss": 0.3392, "odds_ratio_loss": 0.06551147252321243, "rewards/accuracies": 1.0, "rewards/chosen": -0.018046045675873756, "rewards/margins": 0.4495730400085449, "rewards/rejected": -0.46761903166770935, "sft_loss": 0.18046045303344727, "step": 2043 }, { "epoch": 2.95589298626175, "grad_norm": 2.3273549681064325, "learning_rate": 4.24521028821079e-06, "logits/chosen": -0.16675907373428345, "logits/rejected": -0.09869304299354553, "logps/chosen": -0.2557486593723297, "logps/rejected": -4.221222877502441, "loss": 0.2917, "odds_ratio_loss": 0.05778995156288147, "rewards/accuracies": 1.0, "rewards/chosen": -0.02557486668229103, "rewards/margins": 0.39654743671417236, "rewards/rejected": -0.4221222698688507, "sft_loss": 0.2557486593723297, "step": 2044 }, { "epoch": 2.9573391178597253, "grad_norm": 9.342734141369453, "learning_rate": 4.242110180953935e-06, "logits/chosen": -0.16678950190544128, "logits/rejected": -0.07353587448596954, "logps/chosen": -0.35120508074760437, "logps/rejected": -2.6658384799957275, "loss": 0.384, "odds_ratio_loss": 0.16317623853683472, "rewards/accuracies": 0.9375, "rewards/chosen": -0.035120509564876556, "rewards/margins": 0.23146334290504456, "rewards/rejected": -0.2665838599205017, "sft_loss": 0.35120508074760437, "step": 2045 }, { "epoch": 2.9587852494577005, "grad_norm": 1.98977459889491, "learning_rate": 4.239009927727952e-06, "logits/chosen": -0.18185609579086304, "logits/rejected": -0.1600634753704071, "logps/chosen": -0.22608599066734314, "logps/rejected": -5.732004642486572, "loss": 0.3355, "odds_ratio_loss": 0.08565568923950195, "rewards/accuracies": 1.0, "rewards/chosen": -0.022608600556850433, "rewards/margins": 0.5505918860435486, "rewards/rejected": -0.5732004642486572, "sft_loss": 0.22608599066734314, "step": 2046 }, { "epoch": 2.960231381055676, "grad_norm": 2.6070638936508534, "learning_rate": 4.235909530401992e-06, "logits/chosen": -0.1590268611907959, "logits/rejected": -0.1363297998905182, "logps/chosen": -0.3237609267234802, "logps/rejected": -2.4293899536132812, "loss": 0.3868, "odds_ratio_loss": 0.11542137712240219, "rewards/accuracies": 1.0, "rewards/chosen": -0.03237609565258026, "rewards/margins": 0.21056291460990906, "rewards/rejected": -0.24293901026248932, "sft_loss": 0.3237609267234802, "step": 2047 }, { "epoch": 2.9616775126536514, "grad_norm": 2.629598255334325, "learning_rate": 4.232808990845298e-06, "logits/chosen": -0.1289118379354477, "logits/rejected": -0.19492867588996887, "logps/chosen": -0.22533206641674042, "logps/rejected": -3.8823606967926025, "loss": 0.3449, "odds_ratio_loss": 0.06637558341026306, "rewards/accuracies": 1.0, "rewards/chosen": -0.022533204406499863, "rewards/margins": 0.36570286750793457, "rewards/rejected": -0.38823604583740234, "sft_loss": 0.22533206641674042, "step": 2048 }, { "epoch": 2.9631236442516267, "grad_norm": 2.149957958947979, "learning_rate": 4.229708310927196e-06, "logits/chosen": -0.16187317669391632, "logits/rejected": -0.31687965989112854, "logps/chosen": -0.41082707047462463, "logps/rejected": -4.500897407531738, "loss": 0.4035, "odds_ratio_loss": 0.17749832570552826, "rewards/accuracies": 0.9375, "rewards/chosen": -0.041082706302404404, "rewards/margins": 0.4090070128440857, "rewards/rejected": -0.4500897526741028, "sft_loss": 0.41082707047462463, "step": 2049 }, { "epoch": 2.9645697758496024, "grad_norm": 2.570665705617286, "learning_rate": 4.2266074925170975e-06, "logits/chosen": -0.26426705718040466, "logits/rejected": -0.18332192301750183, "logps/chosen": -0.33762437105178833, "logps/rejected": -3.996952533721924, "loss": 0.3992, "odds_ratio_loss": 0.09728610515594482, "rewards/accuracies": 1.0, "rewards/chosen": -0.03376243636012077, "rewards/margins": 0.365932822227478, "rewards/rejected": -0.3996952474117279, "sft_loss": 0.33762437105178833, "step": 2050 }, { "epoch": 2.9660159074475776, "grad_norm": 2.17098598945688, "learning_rate": 4.223506537484499e-06, "logits/chosen": -0.1906561255455017, "logits/rejected": -0.12105484306812286, "logps/chosen": -0.3024832606315613, "logps/rejected": -1.6752251386642456, "loss": 0.3359, "odds_ratio_loss": 0.13633744418621063, "rewards/accuracies": 1.0, "rewards/chosen": -0.030248325318098068, "rewards/margins": 0.13727419078350067, "rewards/rejected": -0.16752250492572784, "sft_loss": 0.3024832606315613, "step": 2051 }, { "epoch": 2.9674620390455533, "grad_norm": 2.2566433623049336, "learning_rate": 4.220405447698976e-06, "logits/chosen": -0.10293443500995636, "logits/rejected": -0.14721611142158508, "logps/chosen": -0.33052927255630493, "logps/rejected": -3.822780132293701, "loss": 0.3424, "odds_ratio_loss": 0.09570840746164322, "rewards/accuracies": 1.0, "rewards/chosen": -0.03305293247103691, "rewards/margins": 0.34922507405281067, "rewards/rejected": -0.3822779953479767, "sft_loss": 0.33052927255630493, "step": 2052 }, { "epoch": 2.9689081706435285, "grad_norm": 2.3375168447838615, "learning_rate": 4.217304225030187e-06, "logits/chosen": -0.19826990365982056, "logits/rejected": -0.19559445977210999, "logps/chosen": -0.2711048722267151, "logps/rejected": -5.2059326171875, "loss": 0.3442, "odds_ratio_loss": 0.09908849745988846, "rewards/accuracies": 1.0, "rewards/chosen": -0.027110489085316658, "rewards/margins": 0.4934827387332916, "rewards/rejected": -0.5205932259559631, "sft_loss": 0.2711048722267151, "step": 2053 }, { "epoch": 2.970354302241504, "grad_norm": 2.1135846620709655, "learning_rate": 4.214202871347873e-06, "logits/chosen": -0.11324124038219452, "logits/rejected": -0.1152898520231247, "logps/chosen": -0.3063899874687195, "logps/rejected": -2.742185354232788, "loss": 0.3318, "odds_ratio_loss": 0.18975186347961426, "rewards/accuracies": 1.0, "rewards/chosen": -0.030639000236988068, "rewards/margins": 0.24357952177524567, "rewards/rejected": -0.27421852946281433, "sft_loss": 0.3063899874687195, "step": 2054 }, { "epoch": 2.9718004338394794, "grad_norm": 2.649346066064892, "learning_rate": 4.211101388521849e-06, "logits/chosen": -0.20979903638362885, "logits/rejected": -0.24693188071250916, "logps/chosen": -0.2766479551792145, "logps/rejected": -3.680441379547119, "loss": 0.3301, "odds_ratio_loss": 0.059294842183589935, "rewards/accuracies": 1.0, "rewards/chosen": -0.027664795517921448, "rewards/margins": 0.34037935733795166, "rewards/rejected": -0.3680441379547119, "sft_loss": 0.2766479551792145, "step": 2055 }, { "epoch": 2.9732465654374547, "grad_norm": 2.1537269933729584, "learning_rate": 4.207999778422013e-06, "logits/chosen": -0.19660750031471252, "logits/rejected": -0.1521485149860382, "logps/chosen": -0.3724362850189209, "logps/rejected": -2.149691343307495, "loss": 0.366, "odds_ratio_loss": 0.10635429620742798, "rewards/accuracies": 1.0, "rewards/chosen": -0.03724363446235657, "rewards/margins": 0.17772549390792847, "rewards/rejected": -0.21496912837028503, "sft_loss": 0.3724362850189209, "step": 2056 }, { "epoch": 2.9746926970354304, "grad_norm": 3.7741465300804333, "learning_rate": 4.204898042918334e-06, "logits/chosen": -0.1858070194721222, "logits/rejected": -0.15695783495903015, "logps/chosen": -0.29552727937698364, "logps/rejected": -3.873058319091797, "loss": 0.3326, "odds_ratio_loss": 0.1055610179901123, "rewards/accuracies": 1.0, "rewards/chosen": -0.029552727937698364, "rewards/margins": 0.35775309801101685, "rewards/rejected": -0.3873058557510376, "sft_loss": 0.29552727937698364, "step": 2057 }, { "epoch": 2.9761388286334056, "grad_norm": 2.0496999134672187, "learning_rate": 4.201796183880863e-06, "logits/chosen": -0.1244896799325943, "logits/rejected": -0.21163567900657654, "logps/chosen": -0.4614596962928772, "logps/rejected": -3.733137845993042, "loss": 0.3574, "odds_ratio_loss": 0.17765578627586365, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0461459681391716, "rewards/margins": 0.3271678388118744, "rewards/rejected": -0.3733137845993042, "sft_loss": 0.4614596962928772, "step": 2058 }, { "epoch": 2.977584960231381, "grad_norm": 2.359398391670168, "learning_rate": 4.1986942031797205e-06, "logits/chosen": -0.11397155374288559, "logits/rejected": -0.11231968551874161, "logps/chosen": -0.28144025802612305, "logps/rejected": -2.4738903045654297, "loss": 0.3272, "odds_ratio_loss": 0.12439928948879242, "rewards/accuracies": 1.0, "rewards/chosen": -0.028144024312496185, "rewards/margins": 0.21924501657485962, "rewards/rejected": -0.2473890334367752, "sft_loss": 0.28144025802612305, "step": 2059 }, { "epoch": 2.9790310918293565, "grad_norm": 2.387322205273284, "learning_rate": 4.1955921026851044e-06, "logits/chosen": -0.20542100071907043, "logits/rejected": -0.14749495685100555, "logps/chosen": -0.4900125563144684, "logps/rejected": -2.852177619934082, "loss": 0.3538, "odds_ratio_loss": 0.15802116692066193, "rewards/accuracies": 1.0, "rewards/chosen": -0.04900125786662102, "rewards/margins": 0.23621651530265808, "rewards/rejected": -0.2852177619934082, "sft_loss": 0.4900125563144684, "step": 2060 }, { "epoch": 2.980477223427332, "grad_norm": 2.570217232894552, "learning_rate": 4.19248988426728e-06, "logits/chosen": -0.22486189007759094, "logits/rejected": -0.16402465105056763, "logps/chosen": -0.2641967535018921, "logps/rejected": -3.638213872909546, "loss": 0.3508, "odds_ratio_loss": 0.06803110986948013, "rewards/accuracies": 1.0, "rewards/chosen": -0.02641967684030533, "rewards/margins": 0.33740171790122986, "rewards/rejected": -0.3638213574886322, "sft_loss": 0.2641967535018921, "step": 2061 }, { "epoch": 2.981923355025307, "grad_norm": 2.2112546704563183, "learning_rate": 4.189387549796587e-06, "logits/chosen": -0.0817125216126442, "logits/rejected": -0.12825915217399597, "logps/chosen": -0.38398486375808716, "logps/rejected": -3.2384722232818604, "loss": 0.3811, "odds_ratio_loss": 0.17186611890792847, "rewards/accuracies": 1.0, "rewards/chosen": -0.038398489356040955, "rewards/margins": 0.28544872999191284, "rewards/rejected": -0.323847234249115, "sft_loss": 0.38398486375808716, "step": 2062 }, { "epoch": 2.9833694866232827, "grad_norm": 2.8149652071190667, "learning_rate": 4.186285101143435e-06, "logits/chosen": -0.25716516375541687, "logits/rejected": -0.21721546351909637, "logps/chosen": -0.42149853706359863, "logps/rejected": -2.934656858444214, "loss": 0.3683, "odds_ratio_loss": 0.1831168532371521, "rewards/accuracies": 0.9375, "rewards/chosen": -0.042149852961301804, "rewards/margins": 0.2513158321380615, "rewards/rejected": -0.29346567392349243, "sft_loss": 0.42149853706359863, "step": 2063 }, { "epoch": 2.9848156182212584, "grad_norm": 2.4516629385806734, "learning_rate": 4.183182540178301e-06, "logits/chosen": -0.3914540112018585, "logits/rejected": -0.30311813950538635, "logps/chosen": -0.25005167722702026, "logps/rejected": -3.276885747909546, "loss": 0.3934, "odds_ratio_loss": 0.07259271293878555, "rewards/accuracies": 1.0, "rewards/chosen": -0.025005165487527847, "rewards/margins": 0.30268341302871704, "rewards/rejected": -0.3276885747909546, "sft_loss": 0.25005167722702026, "step": 2064 }, { "epoch": 2.9862617498192336, "grad_norm": 2.5350627953826477, "learning_rate": 4.180079868771733e-06, "logits/chosen": -0.1459140181541443, "logits/rejected": -0.21189197897911072, "logps/chosen": -0.3703756332397461, "logps/rejected": -2.440211296081543, "loss": 0.3872, "odds_ratio_loss": 0.15449750423431396, "rewards/accuracies": 0.875, "rewards/chosen": -0.03703756630420685, "rewards/margins": 0.20698359608650208, "rewards/rejected": -0.24402114748954773, "sft_loss": 0.3703756332397461, "step": 2065 }, { "epoch": 2.987707881417209, "grad_norm": 2.185668703924067, "learning_rate": 4.176977088794341e-06, "logits/chosen": -0.30598506331443787, "logits/rejected": -0.2191198468208313, "logps/chosen": -0.40353310108184814, "logps/rejected": -2.7489523887634277, "loss": 0.4252, "odds_ratio_loss": 0.14380109310150146, "rewards/accuracies": 1.0, "rewards/chosen": -0.04035331308841705, "rewards/margins": 0.2345419079065323, "rewards/rejected": -0.27489522099494934, "sft_loss": 0.40353310108184814, "step": 2066 }, { "epoch": 2.9891540130151846, "grad_norm": 2.9826272049580513, "learning_rate": 4.173874202116803e-06, "logits/chosen": -0.37615519762039185, "logits/rejected": -0.38028109073638916, "logps/chosen": -0.4126752018928528, "logps/rejected": -2.373816967010498, "loss": 0.3951, "odds_ratio_loss": 0.17959663271903992, "rewards/accuracies": 0.9375, "rewards/chosen": -0.041267525404691696, "rewards/margins": 0.1961141675710678, "rewards/rejected": -0.2373816967010498, "sft_loss": 0.4126752018928528, "step": 2067 }, { "epoch": 2.99060014461316, "grad_norm": 2.341677277000319, "learning_rate": 4.170771210609861e-06, "logits/chosen": -0.2782983183860779, "logits/rejected": -0.193710595369339, "logps/chosen": -0.44476866722106934, "logps/rejected": -2.913386821746826, "loss": 0.3934, "odds_ratio_loss": 0.14662766456604004, "rewards/accuracies": 0.9375, "rewards/chosen": -0.044476866722106934, "rewards/margins": 0.24686183035373688, "rewards/rejected": -0.2913386821746826, "sft_loss": 0.44476866722106934, "step": 2068 }, { "epoch": 2.992046276211135, "grad_norm": 2.129466110457359, "learning_rate": 4.167668116144319e-06, "logits/chosen": -0.09359033405780792, "logits/rejected": -0.26171213388442993, "logps/chosen": -0.3228522539138794, "logps/rejected": -2.8332600593566895, "loss": 0.3746, "odds_ratio_loss": 0.11024969816207886, "rewards/accuracies": 1.0, "rewards/chosen": -0.03228522837162018, "rewards/margins": 0.25104081630706787, "rewards/rejected": -0.28332602977752686, "sft_loss": 0.3228522539138794, "step": 2069 }, { "epoch": 2.9934924078091107, "grad_norm": 2.18740864051717, "learning_rate": 4.164564920591047e-06, "logits/chosen": -0.08463630080223083, "logits/rejected": -0.15710577368736267, "logps/chosen": -0.3528903126716614, "logps/rejected": -4.306077003479004, "loss": 0.3843, "odds_ratio_loss": 0.15712261199951172, "rewards/accuracies": 1.0, "rewards/chosen": -0.035289034247398376, "rewards/margins": 0.39531874656677246, "rewards/rejected": -0.43060773611068726, "sft_loss": 0.3528903126716614, "step": 2070 }, { "epoch": 2.994938539407086, "grad_norm": 2.988838631607076, "learning_rate": 4.16146162582097e-06, "logits/chosen": -0.21836420893669128, "logits/rejected": -0.1393698751926422, "logps/chosen": -0.26207345724105835, "logps/rejected": -4.11907434463501, "loss": 0.3516, "odds_ratio_loss": 0.125029519200325, "rewards/accuracies": 1.0, "rewards/chosen": -0.026207346469163895, "rewards/margins": 0.3857001066207886, "rewards/rejected": -0.411907434463501, "sft_loss": 0.26207345724105835, "step": 2071 }, { "epoch": 2.9963846710050612, "grad_norm": 2.5582808538128026, "learning_rate": 4.158358233705078e-06, "logits/chosen": -0.16028903424739838, "logits/rejected": -0.13912662863731384, "logps/chosen": -0.3448236286640167, "logps/rejected": -2.6296613216400146, "loss": 0.4081, "odds_ratio_loss": 0.19457879662513733, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03448236733675003, "rewards/margins": 0.22848376631736755, "rewards/rejected": -0.2629661560058594, "sft_loss": 0.3448236286640167, "step": 2072 }, { "epoch": 2.997830802603037, "grad_norm": 3.5422999299640074, "learning_rate": 4.155254746114417e-06, "logits/chosen": -0.28058838844299316, "logits/rejected": -0.27185720205307007, "logps/chosen": -0.4271693825721741, "logps/rejected": -2.3834800720214844, "loss": 0.4268, "odds_ratio_loss": 0.12995006144046783, "rewards/accuracies": 1.0, "rewards/chosen": -0.04271693527698517, "rewards/margins": 0.19563104212284088, "rewards/rejected": -0.23834799230098724, "sft_loss": 0.4271693825721741, "step": 2073 }, { "epoch": 2.999276934201012, "grad_norm": 2.339601181160581, "learning_rate": 4.152151164920091e-06, "logits/chosen": -0.2662460505962372, "logits/rejected": -0.24672895669937134, "logps/chosen": -0.25342679023742676, "logps/rejected": -4.663143634796143, "loss": 0.3761, "odds_ratio_loss": 0.05503750592470169, "rewards/accuracies": 1.0, "rewards/chosen": -0.025342680513858795, "rewards/margins": 0.4409716725349426, "rewards/rejected": -0.4663143754005432, "sft_loss": 0.25342679023742676, "step": 2074 }, { "epoch": 3.000723065798988, "grad_norm": 2.4154432516967472, "learning_rate": 4.149047491993262e-06, "logits/chosen": -0.3026701509952545, "logits/rejected": -0.3210662305355072, "logps/chosen": -0.21438798308372498, "logps/rejected": -4.023871898651123, "loss": 0.2418, "odds_ratio_loss": 0.06626778841018677, "rewards/accuracies": 1.0, "rewards/chosen": -0.021438799798488617, "rewards/margins": 0.38094842433929443, "rewards/rejected": -0.40238723158836365, "sft_loss": 0.21438798308372498, "step": 2075 }, { "epoch": 3.002169197396963, "grad_norm": 4.450487484527005, "learning_rate": 4.145943729205145e-06, "logits/chosen": -0.28832417726516724, "logits/rejected": -0.24835100769996643, "logps/chosen": -0.32508355379104614, "logps/rejected": -2.2887487411499023, "loss": 0.2607, "odds_ratio_loss": 0.09311854094266891, "rewards/accuracies": 1.0, "rewards/chosen": -0.03250835835933685, "rewards/margins": 0.19636651873588562, "rewards/rejected": -0.22887487709522247, "sft_loss": 0.32508355379104614, "step": 2076 }, { "epoch": 3.0036153289949383, "grad_norm": 2.782558761277381, "learning_rate": 4.142839878427008e-06, "logits/chosen": -0.28335806727409363, "logits/rejected": -0.2334311306476593, "logps/chosen": -0.18162278831005096, "logps/rejected": -2.974604606628418, "loss": 0.2096, "odds_ratio_loss": 0.06165589019656181, "rewards/accuracies": 1.0, "rewards/chosen": -0.018162280321121216, "rewards/margins": 0.27929815649986267, "rewards/rejected": -0.2974604368209839, "sft_loss": 0.18162278831005096, "step": 2077 }, { "epoch": 3.005061460592914, "grad_norm": 2.239035652889864, "learning_rate": 4.1397359415301784e-06, "logits/chosen": -0.3854452669620514, "logits/rejected": -0.3029145896434784, "logps/chosen": -0.2096811830997467, "logps/rejected": -4.488315582275391, "loss": 0.1834, "odds_ratio_loss": 0.053315550088882446, "rewards/accuracies": 1.0, "rewards/chosen": -0.0209681186825037, "rewards/margins": 0.4278634786605835, "rewards/rejected": -0.44883155822753906, "sft_loss": 0.2096811830997467, "step": 2078 }, { "epoch": 3.0065075921908893, "grad_norm": 2.2164457668179978, "learning_rate": 4.1366319203860286e-06, "logits/chosen": -0.610029399394989, "logits/rejected": -0.41548243165016174, "logps/chosen": -0.1624118685722351, "logps/rejected": -2.981743574142456, "loss": 0.1688, "odds_ratio_loss": 0.04206952825188637, "rewards/accuracies": 1.0, "rewards/chosen": -0.01624118722975254, "rewards/margins": 0.28193315863609314, "rewards/rejected": -0.29817435145378113, "sft_loss": 0.1624118685722351, "step": 2079 }, { "epoch": 3.007953723788865, "grad_norm": 2.4066543302384833, "learning_rate": 4.133527816865985e-06, "logits/chosen": -0.6042571663856506, "logits/rejected": -0.37458497285842896, "logps/chosen": -0.16903214156627655, "logps/rejected": -3.6454310417175293, "loss": 0.2058, "odds_ratio_loss": 0.020001225173473358, "rewards/accuracies": 1.0, "rewards/chosen": -0.016903216019272804, "rewards/margins": 0.34763991832733154, "rewards/rejected": -0.3645431399345398, "sft_loss": 0.16903214156627655, "step": 2080 }, { "epoch": 3.00939985538684, "grad_norm": 2.312606834909949, "learning_rate": 4.130423632841524e-06, "logits/chosen": -0.628654956817627, "logits/rejected": -0.531693160533905, "logps/chosen": -0.14114609360694885, "logps/rejected": -3.86907958984375, "loss": 0.2121, "odds_ratio_loss": 0.02958705648779869, "rewards/accuracies": 1.0, "rewards/chosen": -0.014114608988165855, "rewards/margins": 0.37279337644577026, "rewards/rejected": -0.3869079649448395, "sft_loss": 0.14114609360694885, "step": 2081 }, { "epoch": 3.0108459869848154, "grad_norm": 2.267565799953285, "learning_rate": 4.127319370184169e-06, "logits/chosen": -0.6523191928863525, "logits/rejected": -0.3899100720882416, "logps/chosen": -0.09384972602128983, "logps/rejected": -3.3656177520751953, "loss": 0.1418, "odds_ratio_loss": 0.024596964940428734, "rewards/accuracies": 1.0, "rewards/chosen": -0.009384972974658012, "rewards/margins": 0.3271768093109131, "rewards/rejected": -0.33656179904937744, "sft_loss": 0.09384972602128983, "step": 2082 }, { "epoch": 3.012292118582791, "grad_norm": 3.145990187795483, "learning_rate": 4.124215030765491e-06, "logits/chosen": -1.0443816184997559, "logits/rejected": -0.7100991010665894, "logps/chosen": -0.16050492227077484, "logps/rejected": -3.2707293033599854, "loss": 0.1905, "odds_ratio_loss": 0.03136509656906128, "rewards/accuracies": 1.0, "rewards/chosen": -0.016050491482019424, "rewards/margins": 0.31102246046066284, "rewards/rejected": -0.32707294821739197, "sft_loss": 0.16050492227077484, "step": 2083 }, { "epoch": 3.0137382501807664, "grad_norm": 4.579359916220718, "learning_rate": 4.121110616457108e-06, "logits/chosen": -0.5784963369369507, "logits/rejected": -0.47556227445602417, "logps/chosen": -0.2588617205619812, "logps/rejected": -3.156449794769287, "loss": 0.2127, "odds_ratio_loss": 0.05530041828751564, "rewards/accuracies": 1.0, "rewards/chosen": -0.0258861742913723, "rewards/margins": 0.2897588014602661, "rewards/rejected": -0.3156449794769287, "sft_loss": 0.2588617205619812, "step": 2084 }, { "epoch": 3.015184381778742, "grad_norm": 2.7301283495678605, "learning_rate": 4.118006129130684e-06, "logits/chosen": -0.6547715663909912, "logits/rejected": -0.40283170342445374, "logps/chosen": -0.1553865224123001, "logps/rejected": -4.589504718780518, "loss": 0.1887, "odds_ratio_loss": 0.028455141931772232, "rewards/accuracies": 1.0, "rewards/chosen": -0.015538652427494526, "rewards/margins": 0.44341176748275757, "rewards/rejected": -0.4589504599571228, "sft_loss": 0.1553865224123001, "step": 2085 }, { "epoch": 3.0166305133767173, "grad_norm": 2.181836161479129, "learning_rate": 4.114901570657925e-06, "logits/chosen": -0.3926331698894501, "logits/rejected": -0.35337767004966736, "logps/chosen": -0.1808508038520813, "logps/rejected": -4.288332939147949, "loss": 0.2029, "odds_ratio_loss": 0.057720035314559937, "rewards/accuracies": 1.0, "rewards/chosen": -0.01808508113026619, "rewards/margins": 0.4107481837272644, "rewards/rejected": -0.4288333058357239, "sft_loss": 0.1808508038520813, "step": 2086 }, { "epoch": 3.0180766449746925, "grad_norm": 2.479142035501784, "learning_rate": 4.111796942910581e-06, "logits/chosen": -0.5397032499313354, "logits/rejected": -0.5243762731552124, "logps/chosen": -0.15296411514282227, "logps/rejected": -3.3156747817993164, "loss": 0.1995, "odds_ratio_loss": 0.052010953426361084, "rewards/accuracies": 1.0, "rewards/chosen": -0.01529641356319189, "rewards/margins": 0.316271036863327, "rewards/rejected": -0.3315674662590027, "sft_loss": 0.15296411514282227, "step": 2087 }, { "epoch": 3.019522776572668, "grad_norm": 1.8418463482304441, "learning_rate": 4.108692247760445e-06, "logits/chosen": -0.42499226331710815, "logits/rejected": -0.3503390848636627, "logps/chosen": -0.1449154168367386, "logps/rejected": -5.186098575592041, "loss": 0.1672, "odds_ratio_loss": 0.03054206445813179, "rewards/accuracies": 1.0, "rewards/chosen": -0.014491541311144829, "rewards/margins": 0.5041183233261108, "rewards/rejected": -0.5186098217964172, "sft_loss": 0.1449154168367386, "step": 2088 }, { "epoch": 3.0209689081706435, "grad_norm": 2.29448458956409, "learning_rate": 4.105587487079345e-06, "logits/chosen": -0.6999224424362183, "logits/rejected": -0.48990195989608765, "logps/chosen": -0.13518781960010529, "logps/rejected": -2.764622211456299, "loss": 0.1565, "odds_ratio_loss": 0.042032863944768906, "rewards/accuracies": 1.0, "rewards/chosen": -0.013518782332539558, "rewards/margins": 0.26294347643852234, "rewards/rejected": -0.27646225690841675, "sft_loss": 0.13518781960010529, "step": 2089 }, { "epoch": 3.022415039768619, "grad_norm": 1.8885209324093233, "learning_rate": 4.1024826627391575e-06, "logits/chosen": -0.5850200653076172, "logits/rejected": -0.41026294231414795, "logps/chosen": -0.2129233032464981, "logps/rejected": -3.2739462852478027, "loss": 0.1821, "odds_ratio_loss": 0.05676015093922615, "rewards/accuracies": 1.0, "rewards/chosen": -0.02129232883453369, "rewards/margins": 0.30610227584838867, "rewards/rejected": -0.32739460468292236, "sft_loss": 0.2129233032464981, "step": 2090 }, { "epoch": 3.0238611713665944, "grad_norm": 2.618426799746971, "learning_rate": 4.0993777766117915e-06, "logits/chosen": -0.6758778095245361, "logits/rejected": -0.47175687551498413, "logps/chosen": -0.27825915813446045, "logps/rejected": -4.689516067504883, "loss": 0.1887, "odds_ratio_loss": 0.041840679943561554, "rewards/accuracies": 1.0, "rewards/chosen": -0.027825918048620224, "rewards/margins": 0.4411257207393646, "rewards/rejected": -0.46895164251327515, "sft_loss": 0.27825915813446045, "step": 2091 }, { "epoch": 3.0253073029645696, "grad_norm": 2.0671758104441094, "learning_rate": 4.0962728305691926e-06, "logits/chosen": -0.495349258184433, "logits/rejected": -0.41494473814964294, "logps/chosen": -0.1947062909603119, "logps/rejected": -3.9549150466918945, "loss": 0.1946, "odds_ratio_loss": 0.05212587118148804, "rewards/accuracies": 1.0, "rewards/chosen": -0.01947062835097313, "rewards/margins": 0.3760209083557129, "rewards/rejected": -0.3954915404319763, "sft_loss": 0.1947062909603119, "step": 2092 }, { "epoch": 3.0267534345625453, "grad_norm": 2.043595727301972, "learning_rate": 4.093167826483347e-06, "logits/chosen": -0.4349784851074219, "logits/rejected": -0.39165636897087097, "logps/chosen": -0.16431699693202972, "logps/rejected": -3.6643869876861572, "loss": 0.225, "odds_ratio_loss": 0.035537637770175934, "rewards/accuracies": 1.0, "rewards/chosen": -0.016431700438261032, "rewards/margins": 0.350007027387619, "rewards/rejected": -0.36643874645233154, "sft_loss": 0.16431699693202972, "step": 2093 }, { "epoch": 3.0281995661605206, "grad_norm": 2.232927910314228, "learning_rate": 4.090062766226271e-06, "logits/chosen": -0.7041189670562744, "logits/rejected": -0.47187769412994385, "logps/chosen": -0.1355535387992859, "logps/rejected": -5.294040203094482, "loss": 0.2344, "odds_ratio_loss": 0.026054540649056435, "rewards/accuracies": 1.0, "rewards/chosen": -0.013555353507399559, "rewards/margins": 0.515848696231842, "rewards/rejected": -0.5294040441513062, "sft_loss": 0.1355535387992859, "step": 2094 }, { "epoch": 3.0296456977584962, "grad_norm": 2.5169435782219667, "learning_rate": 4.086957651670018e-06, "logits/chosen": -0.6608462333679199, "logits/rejected": -0.48246246576309204, "logps/chosen": -0.29138410091400146, "logps/rejected": -2.3921921253204346, "loss": 0.2552, "odds_ratio_loss": 0.06351328641176224, "rewards/accuracies": 1.0, "rewards/chosen": -0.029138410463929176, "rewards/margins": 0.2100808173418045, "rewards/rejected": -0.23921923339366913, "sft_loss": 0.29138410091400146, "step": 2095 }, { "epoch": 3.0310918293564715, "grad_norm": 1.9132864556802796, "learning_rate": 4.0838524846866735e-06, "logits/chosen": -0.7670720219612122, "logits/rejected": -0.7422049641609192, "logps/chosen": -0.18818186223506927, "logps/rejected": -3.1613006591796875, "loss": 0.1341, "odds_ratio_loss": 0.04930785298347473, "rewards/accuracies": 1.0, "rewards/chosen": -0.018818188458681107, "rewards/margins": 0.2973118722438812, "rewards/rejected": -0.31613004207611084, "sft_loss": 0.18818186223506927, "step": 2096 }, { "epoch": 3.0325379609544467, "grad_norm": 1.9483355773154174, "learning_rate": 4.080747267148353e-06, "logits/chosen": -0.4968271553516388, "logits/rejected": -0.4267679452896118, "logps/chosen": -0.15192314982414246, "logps/rejected": -3.1880059242248535, "loss": 0.1494, "odds_ratio_loss": 0.05188725143671036, "rewards/accuracies": 1.0, "rewards/chosen": -0.01519231591373682, "rewards/margins": 0.3036082684993744, "rewards/rejected": -0.31880059838294983, "sft_loss": 0.15192314982414246, "step": 2097 }, { "epoch": 3.0339840925524224, "grad_norm": 2.402577443846496, "learning_rate": 4.077642000927205e-06, "logits/chosen": -0.6681464314460754, "logits/rejected": -0.41508573293685913, "logps/chosen": -0.1843072772026062, "logps/rejected": -5.525094985961914, "loss": 0.1644, "odds_ratio_loss": 0.02610270492732525, "rewards/accuracies": 1.0, "rewards/chosen": -0.01843072660267353, "rewards/margins": 0.5340787768363953, "rewards/rejected": -0.5525094270706177, "sft_loss": 0.1843072772026062, "step": 2098 }, { "epoch": 3.0354302241503976, "grad_norm": 2.4018786992472054, "learning_rate": 4.074536687895405e-06, "logits/chosen": -0.7136832475662231, "logits/rejected": -0.47490206360816956, "logps/chosen": -0.15297845005989075, "logps/rejected": -4.706912994384766, "loss": 0.2074, "odds_ratio_loss": 0.02270958572626114, "rewards/accuracies": 1.0, "rewards/chosen": -0.015297845005989075, "rewards/margins": 0.4553934931755066, "rewards/rejected": -0.47069135308265686, "sft_loss": 0.15297845005989075, "step": 2099 }, { "epoch": 3.036876355748373, "grad_norm": 2.043474240086887, "learning_rate": 4.0714313299251575e-06, "logits/chosen": -0.7795498967170715, "logits/rejected": -0.5213183164596558, "logps/chosen": -0.11158294975757599, "logps/rejected": -4.195063591003418, "loss": 0.1316, "odds_ratio_loss": 0.019902389496564865, "rewards/accuracies": 1.0, "rewards/chosen": -0.011158295907080173, "rewards/margins": 0.40834811329841614, "rewards/rejected": -0.41950640082359314, "sft_loss": 0.11158294975757599, "step": 2100 }, { "epoch": 3.0383224873463486, "grad_norm": 2.1009794089099647, "learning_rate": 4.0683259288886965e-06, "logits/chosen": -0.7349028587341309, "logits/rejected": -0.5432249307632446, "logps/chosen": -0.10936813056468964, "logps/rejected": -5.250774383544922, "loss": 0.175, "odds_ratio_loss": 0.019732775166630745, "rewards/accuracies": 1.0, "rewards/chosen": -0.010936813428997993, "rewards/margins": 0.5141406059265137, "rewards/rejected": -0.5250774025917053, "sft_loss": 0.10936813056468964, "step": 2101 }, { "epoch": 3.039768618944324, "grad_norm": 1.9958955503465796, "learning_rate": 4.065220486658277e-06, "logits/chosen": -0.5262503623962402, "logits/rejected": -0.4228152632713318, "logps/chosen": -0.16352926194667816, "logps/rejected": -3.2567734718322754, "loss": 0.1686, "odds_ratio_loss": 0.037220489233732224, "rewards/accuracies": 1.0, "rewards/chosen": -0.016352925449609756, "rewards/margins": 0.3093244433403015, "rewards/rejected": -0.32567736506462097, "sft_loss": 0.16352926194667816, "step": 2102 }, { "epoch": 3.0412147505422995, "grad_norm": 1.8636021031520353, "learning_rate": 4.062115005106184e-06, "logits/chosen": -0.42543336749076843, "logits/rejected": -0.4913654923439026, "logps/chosen": -0.30701306462287903, "logps/rejected": -2.3156869411468506, "loss": 0.2009, "odds_ratio_loss": 0.07792215794324875, "rewards/accuracies": 1.0, "rewards/chosen": -0.03070130944252014, "rewards/margins": 0.20086738467216492, "rewards/rejected": -0.23156869411468506, "sft_loss": 0.30701306462287903, "step": 2103 }, { "epoch": 3.0426608821402747, "grad_norm": 2.429431192098418, "learning_rate": 4.059009486104723e-06, "logits/chosen": -0.578881025314331, "logits/rejected": -0.4205232262611389, "logps/chosen": -0.13701388239860535, "logps/rejected": -4.750458717346191, "loss": 0.1819, "odds_ratio_loss": 0.01851857826113701, "rewards/accuracies": 1.0, "rewards/chosen": -0.01370138768106699, "rewards/margins": 0.46134448051452637, "rewards/rejected": -0.4750458896160126, "sft_loss": 0.13701388239860535, "step": 2104 }, { "epoch": 3.04410701373825, "grad_norm": 2.293787093452759, "learning_rate": 4.055903931526223e-06, "logits/chosen": -0.7719517350196838, "logits/rejected": -0.6306342482566833, "logps/chosen": -0.18231219053268433, "logps/rejected": -3.1785566806793213, "loss": 0.201, "odds_ratio_loss": 0.04030875861644745, "rewards/accuracies": 1.0, "rewards/chosen": -0.018231220543384552, "rewards/margins": 0.2996244430541992, "rewards/rejected": -0.3178556561470032, "sft_loss": 0.18231219053268433, "step": 2105 }, { "epoch": 3.0455531453362257, "grad_norm": 2.5028298804086235, "learning_rate": 4.052798343243036e-06, "logits/chosen": -0.7744817733764648, "logits/rejected": -0.5614781379699707, "logps/chosen": -0.22691814601421356, "logps/rejected": -4.627237796783447, "loss": 0.2317, "odds_ratio_loss": 0.05098215490579605, "rewards/accuracies": 1.0, "rewards/chosen": -0.022691816091537476, "rewards/margins": 0.4400319755077362, "rewards/rejected": -0.4627237915992737, "sft_loss": 0.22691814601421356, "step": 2106 }, { "epoch": 3.046999276934201, "grad_norm": 2.295477572904047, "learning_rate": 4.04969272312753e-06, "logits/chosen": -0.8552494049072266, "logits/rejected": -0.6359434723854065, "logps/chosen": -0.20083674788475037, "logps/rejected": -3.457988977432251, "loss": 0.2102, "odds_ratio_loss": 0.029120953753590584, "rewards/accuracies": 1.0, "rewards/chosen": -0.020083673298358917, "rewards/margins": 0.32571524381637573, "rewards/rejected": -0.34579890966415405, "sft_loss": 0.20083674788475037, "step": 2107 }, { "epoch": 3.0484454085321766, "grad_norm": 2.4749090053073233, "learning_rate": 4.0465870730520954e-06, "logits/chosen": -0.7466475963592529, "logits/rejected": -0.5847068428993225, "logps/chosen": -0.05967186018824577, "logps/rejected": -3.782038688659668, "loss": 0.1441, "odds_ratio_loss": 0.009426586329936981, "rewards/accuracies": 1.0, "rewards/chosen": -0.005967185832560062, "rewards/margins": 0.3722366690635681, "rewards/rejected": -0.3782038688659668, "sft_loss": 0.05967186018824577, "step": 2108 }, { "epoch": 3.049891540130152, "grad_norm": 2.0123379540693334, "learning_rate": 4.043481394889142e-06, "logits/chosen": -0.678367555141449, "logits/rejected": -0.6387525200843811, "logps/chosen": -0.2518230974674225, "logps/rejected": -3.079854726791382, "loss": 0.2228, "odds_ratio_loss": 0.05264444649219513, "rewards/accuracies": 1.0, "rewards/chosen": -0.025182312354445457, "rewards/margins": 0.28280317783355713, "rewards/rejected": -0.30798548460006714, "sft_loss": 0.2518230974674225, "step": 2109 }, { "epoch": 3.051337671728127, "grad_norm": 2.5649086388818767, "learning_rate": 4.040375690511094e-06, "logits/chosen": -0.733020544052124, "logits/rejected": -0.4613838493824005, "logps/chosen": -0.10593655705451965, "logps/rejected": -5.208601951599121, "loss": 0.1497, "odds_ratio_loss": 0.018377486616373062, "rewards/accuracies": 1.0, "rewards/chosen": -0.010593656450510025, "rewards/margins": 0.5102665424346924, "rewards/rejected": -0.5208601951599121, "sft_loss": 0.10593655705451965, "step": 2110 }, { "epoch": 3.0527838033261028, "grad_norm": 1.996597938866012, "learning_rate": 4.03726996179039e-06, "logits/chosen": -0.7812892198562622, "logits/rejected": -0.6449815034866333, "logps/chosen": -0.26513880491256714, "logps/rejected": -3.381945848464966, "loss": 0.1614, "odds_ratio_loss": 0.049505963921546936, "rewards/accuracies": 1.0, "rewards/chosen": -0.026513883844017982, "rewards/margins": 0.31168070435523987, "rewards/rejected": -0.3381945788860321, "sft_loss": 0.26513880491256714, "step": 2111 }, { "epoch": 3.054229934924078, "grad_norm": 1.8174594922591507, "learning_rate": 4.034164210599488e-06, "logits/chosen": -0.5324065685272217, "logits/rejected": -0.4299301505088806, "logps/chosen": -0.11089219897985458, "logps/rejected": -5.629248142242432, "loss": 0.1388, "odds_ratio_loss": 0.01814519427716732, "rewards/accuracies": 1.0, "rewards/chosen": -0.011089220643043518, "rewards/margins": 0.5518355965614319, "rewards/rejected": -0.5629248023033142, "sft_loss": 0.11089219897985458, "step": 2112 }, { "epoch": 3.0556760665220537, "grad_norm": 2.122834905775403, "learning_rate": 4.031058438810857e-06, "logits/chosen": -0.7841843366622925, "logits/rejected": -0.5668798685073853, "logps/chosen": -0.1282779723405838, "logps/rejected": -3.247554302215576, "loss": 0.1335, "odds_ratio_loss": 0.03160501644015312, "rewards/accuracies": 1.0, "rewards/chosen": -0.0128277987241745, "rewards/margins": 0.3119276165962219, "rewards/rejected": -0.3247554302215576, "sft_loss": 0.1282779723405838, "step": 2113 }, { "epoch": 3.057122198120029, "grad_norm": 2.405269498818451, "learning_rate": 4.027952648296978e-06, "logits/chosen": -0.6955947875976562, "logits/rejected": -0.5394856929779053, "logps/chosen": -0.16242378950119019, "logps/rejected": -5.063565731048584, "loss": 0.2003, "odds_ratio_loss": 0.027161482721567154, "rewards/accuracies": 1.0, "rewards/chosen": -0.01624237932264805, "rewards/margins": 0.4901142120361328, "rewards/rejected": -0.5063565373420715, "sft_loss": 0.16242378950119019, "step": 2114 }, { "epoch": 3.058568329718004, "grad_norm": 1.9522892318832643, "learning_rate": 4.0248468409303425e-06, "logits/chosen": -0.7737332582473755, "logits/rejected": -0.4954363703727722, "logps/chosen": -0.1599772721529007, "logps/rejected": -3.514385223388672, "loss": 0.1724, "odds_ratio_loss": 0.022483911365270615, "rewards/accuracies": 1.0, "rewards/chosen": -0.01599772833287716, "rewards/margins": 0.33544081449508667, "rewards/rejected": -0.3514385223388672, "sft_loss": 0.1599772721529007, "step": 2115 }, { "epoch": 3.06001446131598, "grad_norm": 2.153304126645383, "learning_rate": 4.0217410185834536e-06, "logits/chosen": -0.7023472785949707, "logits/rejected": -0.533017635345459, "logps/chosen": -0.16424649953842163, "logps/rejected": -5.224195957183838, "loss": 0.1543, "odds_ratio_loss": 0.043630871921777725, "rewards/accuracies": 1.0, "rewards/chosen": -0.016424652189016342, "rewards/margins": 0.5059949159622192, "rewards/rejected": -0.5224196314811707, "sft_loss": 0.16424649953842163, "step": 2116 }, { "epoch": 3.061460592913955, "grad_norm": 2.0513351448836104, "learning_rate": 4.018635183128823e-06, "logits/chosen": -0.6236501932144165, "logits/rejected": -0.5361102819442749, "logps/chosen": -0.20569291710853577, "logps/rejected": -2.541423797607422, "loss": 0.1819, "odds_ratio_loss": 0.05294905602931976, "rewards/accuracies": 1.0, "rewards/chosen": -0.020569290965795517, "rewards/margins": 0.23357310891151428, "rewards/rejected": -0.2541424036026001, "sft_loss": 0.20569291710853577, "step": 2117 }, { "epoch": 3.062906724511931, "grad_norm": 2.1448700342273597, "learning_rate": 4.015529336438973e-06, "logits/chosen": -0.7315176129341125, "logits/rejected": -0.540245532989502, "logps/chosen": -0.14228178560733795, "logps/rejected": -4.650500774383545, "loss": 0.1486, "odds_ratio_loss": 0.026424670591950417, "rewards/accuracies": 1.0, "rewards/chosen": -0.014228180050849915, "rewards/margins": 0.4508219063282013, "rewards/rejected": -0.4650501012802124, "sft_loss": 0.14228178560733795, "step": 2118 }, { "epoch": 3.064352856109906, "grad_norm": 2.341561331050371, "learning_rate": 4.012423480386426e-06, "logits/chosen": -0.5403401851654053, "logits/rejected": -0.4582579731941223, "logps/chosen": -0.14413967728614807, "logps/rejected": -4.726695537567139, "loss": 0.1921, "odds_ratio_loss": 0.02182953804731369, "rewards/accuracies": 1.0, "rewards/chosen": -0.014413966797292233, "rewards/margins": 0.4582555890083313, "rewards/rejected": -0.4726695418357849, "sft_loss": 0.14413967728614807, "step": 2119 }, { "epoch": 3.0657989877078813, "grad_norm": 2.386821389201234, "learning_rate": 4.009317616843718e-06, "logits/chosen": -0.44133293628692627, "logits/rejected": -0.4722680449485779, "logps/chosen": -0.22620916366577148, "logps/rejected": -2.8983755111694336, "loss": 0.1771, "odds_ratio_loss": 0.05633849278092384, "rewards/accuracies": 1.0, "rewards/chosen": -0.02262091636657715, "rewards/margins": 0.26721662282943726, "rewards/rejected": -0.2898375391960144, "sft_loss": 0.22620916366577148, "step": 2120 }, { "epoch": 3.067245119305857, "grad_norm": 1.7305002096138642, "learning_rate": 4.006211747683384e-06, "logits/chosen": -0.6039251685142517, "logits/rejected": -0.5303505063056946, "logps/chosen": -0.13674521446228027, "logps/rejected": -4.530514240264893, "loss": 0.1803, "odds_ratio_loss": 0.035152267664670944, "rewards/accuracies": 1.0, "rewards/chosen": -0.013674520887434483, "rewards/margins": 0.43937692046165466, "rewards/rejected": -0.45305144786834717, "sft_loss": 0.13674521446228027, "step": 2121 }, { "epoch": 3.068691250903832, "grad_norm": 2.1291486853323405, "learning_rate": 4.003105874777963e-06, "logits/chosen": -0.7058529853820801, "logits/rejected": -0.5664353966712952, "logps/chosen": -0.09501180052757263, "logps/rejected": -3.9581353664398193, "loss": 0.1877, "odds_ratio_loss": 0.022701166570186615, "rewards/accuracies": 1.0, "rewards/chosen": -0.009501179680228233, "rewards/margins": 0.386312335729599, "rewards/rejected": -0.395813524723053, "sft_loss": 0.09501180052757263, "step": 2122 }, { "epoch": 3.0701373825018075, "grad_norm": 2.7097953576820624, "learning_rate": 4e-06, "logits/chosen": -0.7147266864776611, "logits/rejected": -0.4768485426902771, "logps/chosen": -0.1762448400259018, "logps/rejected": -4.018145561218262, "loss": 0.2214, "odds_ratio_loss": 0.027249373495578766, "rewards/accuracies": 1.0, "rewards/chosen": -0.01762448623776436, "rewards/margins": 0.38419008255004883, "rewards/rejected": -0.4018145799636841, "sft_loss": 0.1762448400259018, "step": 2123 }, { "epoch": 3.071583514099783, "grad_norm": 1.8998144556018384, "learning_rate": 3.996894125222036e-06, "logits/chosen": -0.5158587694168091, "logits/rejected": -0.4066459536552429, "logps/chosen": -0.16753427684307098, "logps/rejected": -4.010112762451172, "loss": 0.1828, "odds_ratio_loss": 0.04549555480480194, "rewards/accuracies": 1.0, "rewards/chosen": -0.016753429546952248, "rewards/margins": 0.38425785303115845, "rewards/rejected": -0.40101125836372375, "sft_loss": 0.16753427684307098, "step": 2124 }, { "epoch": 3.0730296456977584, "grad_norm": 2.1176577092878133, "learning_rate": 3.993788252316617e-06, "logits/chosen": -0.6375981569290161, "logits/rejected": -0.47454291582107544, "logps/chosen": -0.1505729854106903, "logps/rejected": -8.243040084838867, "loss": 0.203, "odds_ratio_loss": 0.012890107929706573, "rewards/accuracies": 1.0, "rewards/chosen": -0.01505729928612709, "rewards/margins": 0.809246838092804, "rewards/rejected": -0.8243041634559631, "sft_loss": 0.1505729854106903, "step": 2125 }, { "epoch": 3.074475777295734, "grad_norm": 2.222962148586192, "learning_rate": 3.990682383156282e-06, "logits/chosen": -0.620424211025238, "logits/rejected": -0.4490058720111847, "logps/chosen": -0.22270803153514862, "logps/rejected": -4.201064586639404, "loss": 0.2145, "odds_ratio_loss": 0.05165166035294533, "rewards/accuracies": 1.0, "rewards/chosen": -0.022270802408456802, "rewards/margins": 0.3978356719017029, "rewards/rejected": -0.4201064705848694, "sft_loss": 0.22270803153514862, "step": 2126 }, { "epoch": 3.0759219088937093, "grad_norm": 2.4804554039420577, "learning_rate": 3.987576519613574e-06, "logits/chosen": -0.4530171751976013, "logits/rejected": -0.412492036819458, "logps/chosen": -0.2665051519870758, "logps/rejected": -3.0253868103027344, "loss": 0.2164, "odds_ratio_loss": 0.061206281185150146, "rewards/accuracies": 1.0, "rewards/chosen": -0.02665051445364952, "rewards/margins": 0.2758881747722626, "rewards/rejected": -0.3025386929512024, "sft_loss": 0.2665051519870758, "step": 2127 }, { "epoch": 3.0773680404916846, "grad_norm": 1.7493140822868425, "learning_rate": 3.984470663561027e-06, "logits/chosen": -0.6655663847923279, "logits/rejected": -0.7546771764755249, "logps/chosen": -0.2823176681995392, "logps/rejected": -3.803299903869629, "loss": 0.1604, "odds_ratio_loss": 0.06763014197349548, "rewards/accuracies": 1.0, "rewards/chosen": -0.02823176607489586, "rewards/margins": 0.3520982563495636, "rewards/rejected": -0.38033002614974976, "sft_loss": 0.2823176681995392, "step": 2128 }, { "epoch": 3.0788141720896602, "grad_norm": 2.5403797596003166, "learning_rate": 3.981364816871177e-06, "logits/chosen": -0.7357568740844727, "logits/rejected": -0.583640456199646, "logps/chosen": -0.17165610194206238, "logps/rejected": -3.8525633811950684, "loss": 0.1617, "odds_ratio_loss": 0.024731557816267014, "rewards/accuracies": 1.0, "rewards/chosen": -0.017165610566735268, "rewards/margins": 0.36809074878692627, "rewards/rejected": -0.3852563500404358, "sft_loss": 0.17165610194206238, "step": 2129 }, { "epoch": 3.0802603036876355, "grad_norm": 2.054847911596039, "learning_rate": 3.978258981416547e-06, "logits/chosen": -0.7385162711143494, "logits/rejected": -0.6317088603973389, "logps/chosen": -0.14730429649353027, "logps/rejected": -2.464529037475586, "loss": 0.1315, "odds_ratio_loss": 0.03632424771785736, "rewards/accuracies": 1.0, "rewards/chosen": -0.014730430208146572, "rewards/margins": 0.23172245919704437, "rewards/rejected": -0.24645289778709412, "sft_loss": 0.14730429649353027, "step": 2130 }, { "epoch": 3.081706435285611, "grad_norm": 2.177460049188998, "learning_rate": 3.975153159069659e-06, "logits/chosen": -0.4692709147930145, "logits/rejected": -0.4788125157356262, "logps/chosen": -0.13799342513084412, "logps/rejected": -3.8743791580200195, "loss": 0.1705, "odds_ratio_loss": 0.03219764679670334, "rewards/accuracies": 1.0, "rewards/chosen": -0.013799343258142471, "rewards/margins": 0.3736385703086853, "rewards/rejected": -0.3874379098415375, "sft_loss": 0.13799342513084412, "step": 2131 }, { "epoch": 3.0831525668835864, "grad_norm": 2.2308389604931995, "learning_rate": 3.972047351703023e-06, "logits/chosen": -0.5637999773025513, "logits/rejected": -0.46241044998168945, "logps/chosen": -0.28073132038116455, "logps/rejected": -5.222455978393555, "loss": 0.1847, "odds_ratio_loss": 0.04090370982885361, "rewards/accuracies": 1.0, "rewards/chosen": -0.028073132038116455, "rewards/margins": 0.49417245388031006, "rewards/rejected": -0.5222455859184265, "sft_loss": 0.28073132038116455, "step": 2132 }, { "epoch": 3.0845986984815617, "grad_norm": 2.065829840448526, "learning_rate": 3.968941561189144e-06, "logits/chosen": -0.6958283185958862, "logits/rejected": -0.4899245500564575, "logps/chosen": -0.1636473834514618, "logps/rejected": -3.7653756141662598, "loss": 0.2069, "odds_ratio_loss": 0.033067747950553894, "rewards/accuracies": 1.0, "rewards/chosen": -0.01636473834514618, "rewards/margins": 0.3601728081703186, "rewards/rejected": -0.3765375316143036, "sft_loss": 0.1636473834514618, "step": 2133 }, { "epoch": 3.0860448300795373, "grad_norm": 1.876329075053545, "learning_rate": 3.965835789400511e-06, "logits/chosen": -0.563957691192627, "logits/rejected": -0.5317133665084839, "logps/chosen": -0.12031100690364838, "logps/rejected": -3.236659526824951, "loss": 0.1561, "odds_ratio_loss": 0.027744002640247345, "rewards/accuracies": 1.0, "rewards/chosen": -0.012031100690364838, "rewards/margins": 0.3116348683834076, "rewards/rejected": -0.323665976524353, "sft_loss": 0.12031100690364838, "step": 2134 }, { "epoch": 3.0874909616775126, "grad_norm": 2.0304470488543616, "learning_rate": 3.96273003820961e-06, "logits/chosen": -0.7444326877593994, "logits/rejected": -0.5694934129714966, "logps/chosen": -0.22145669162273407, "logps/rejected": -3.7066285610198975, "loss": 0.1865, "odds_ratio_loss": 0.028277050703763962, "rewards/accuracies": 1.0, "rewards/chosen": -0.022145669907331467, "rewards/margins": 0.34851717948913574, "rewards/rejected": -0.3706628680229187, "sft_loss": 0.22145669162273407, "step": 2135 }, { "epoch": 3.0889370932754883, "grad_norm": 1.8981778171798918, "learning_rate": 3.959624309488907e-06, "logits/chosen": -0.614467978477478, "logits/rejected": -0.3776948153972626, "logps/chosen": -0.11771661043167114, "logps/rejected": -6.75969123840332, "loss": 0.1404, "odds_ratio_loss": 0.01951497234404087, "rewards/accuracies": 1.0, "rewards/chosen": -0.01177166122943163, "rewards/margins": 0.6641974449157715, "rewards/rejected": -0.675969123840332, "sft_loss": 0.11771661043167114, "step": 2136 }, { "epoch": 3.0903832248734635, "grad_norm": 2.1400342160750054, "learning_rate": 3.956518605110858e-06, "logits/chosen": -0.8642411828041077, "logits/rejected": -0.6858229637145996, "logps/chosen": -0.2012554109096527, "logps/rejected": -2.901653528213501, "loss": 0.1956, "odds_ratio_loss": 0.03699169307947159, "rewards/accuracies": 1.0, "rewards/chosen": -0.02012554183602333, "rewards/margins": 0.27003979682922363, "rewards/rejected": -0.29016533493995667, "sft_loss": 0.2012554109096527, "step": 2137 }, { "epoch": 3.0918293564714388, "grad_norm": 2.0394103728196638, "learning_rate": 3.953412926947904e-06, "logits/chosen": -0.5077266097068787, "logits/rejected": -0.42272666096687317, "logps/chosen": -0.10724125802516937, "logps/rejected": -7.563508033752441, "loss": 0.1693, "odds_ratio_loss": 0.020378630608320236, "rewards/accuracies": 1.0, "rewards/chosen": -0.010724126361310482, "rewards/margins": 0.74562668800354, "rewards/rejected": -0.7563507556915283, "sft_loss": 0.10724125802516937, "step": 2138 }, { "epoch": 3.0932754880694144, "grad_norm": 2.233590697823899, "learning_rate": 3.95030727687247e-06, "logits/chosen": -0.6356875896453857, "logits/rejected": -0.4583815932273865, "logps/chosen": -0.13725396990776062, "logps/rejected": -3.551173686981201, "loss": 0.1757, "odds_ratio_loss": 0.030577151104807854, "rewards/accuracies": 1.0, "rewards/chosen": -0.013725396245718002, "rewards/margins": 0.34139198064804077, "rewards/rejected": -0.3551173806190491, "sft_loss": 0.13725396990776062, "step": 2139 }, { "epoch": 3.0947216196673897, "grad_norm": 1.998594502803306, "learning_rate": 3.947201656756965e-06, "logits/chosen": -0.5484251976013184, "logits/rejected": -0.43344441056251526, "logps/chosen": -0.20057189464569092, "logps/rejected": -3.6383776664733887, "loss": 0.1883, "odds_ratio_loss": 0.04176495224237442, "rewards/accuracies": 1.0, "rewards/chosen": -0.02005719020962715, "rewards/margins": 0.3437805771827698, "rewards/rejected": -0.3638377785682678, "sft_loss": 0.20057189464569092, "step": 2140 }, { "epoch": 3.096167751265365, "grad_norm": 2.0891032642185365, "learning_rate": 3.944096068473776e-06, "logits/chosen": -0.48677608370780945, "logits/rejected": -0.405622273683548, "logps/chosen": -0.3364259600639343, "logps/rejected": -4.87321662902832, "loss": 0.1989, "odds_ratio_loss": 0.0456356480717659, "rewards/accuracies": 1.0, "rewards/chosen": -0.03364259749650955, "rewards/margins": 0.4536791145801544, "rewards/rejected": -0.487321674823761, "sft_loss": 0.3364259600639343, "step": 2141 }, { "epoch": 3.0976138828633406, "grad_norm": 2.3986916761530397, "learning_rate": 3.940990513895277e-06, "logits/chosen": -0.49167507886886597, "logits/rejected": -0.2597613036632538, "logps/chosen": -0.14950230717658997, "logps/rejected": -4.943644046783447, "loss": 0.1609, "odds_ratio_loss": 0.02151937410235405, "rewards/accuracies": 1.0, "rewards/chosen": -0.014950230717658997, "rewards/margins": 0.47941410541534424, "rewards/rejected": -0.4943643808364868, "sft_loss": 0.14950230717658997, "step": 2142 }, { "epoch": 3.099060014461316, "grad_norm": 2.3771417631729204, "learning_rate": 3.937884994893815e-06, "logits/chosen": -0.6202237606048584, "logits/rejected": -0.34670570492744446, "logps/chosen": -0.16555556654930115, "logps/rejected": -3.282248020172119, "loss": 0.1799, "odds_ratio_loss": 0.0847080647945404, "rewards/accuracies": 0.9375, "rewards/chosen": -0.016555557027459145, "rewards/margins": 0.3116692304611206, "rewards/rejected": -0.328224778175354, "sft_loss": 0.16555556654930115, "step": 2143 }, { "epoch": 3.1005061460592915, "grad_norm": 2.044758701059538, "learning_rate": 3.934779513341723e-06, "logits/chosen": -0.5131956338882446, "logits/rejected": -0.3095894753932953, "logps/chosen": -0.2654573917388916, "logps/rejected": -4.961881160736084, "loss": 0.2241, "odds_ratio_loss": 0.04515118524432182, "rewards/accuracies": 1.0, "rewards/chosen": -0.02654574252665043, "rewards/margins": 0.46964243054389954, "rewards/rejected": -0.49618813395500183, "sft_loss": 0.2654573917388916, "step": 2144 }, { "epoch": 3.1019522776572668, "grad_norm": 1.7210001373799835, "learning_rate": 3.931674071111304e-06, "logits/chosen": -0.45441409945487976, "logits/rejected": -0.40361326932907104, "logps/chosen": -0.23514500260353088, "logps/rejected": -3.1339406967163086, "loss": 0.1595, "odds_ratio_loss": 0.06016864255070686, "rewards/accuracies": 1.0, "rewards/chosen": -0.02351449988782406, "rewards/margins": 0.28987962007522583, "rewards/rejected": -0.31339409947395325, "sft_loss": 0.23514500260353088, "step": 2145 }, { "epoch": 3.103398409255242, "grad_norm": 2.024843444988982, "learning_rate": 3.928568670074843e-06, "logits/chosen": -0.4629301428794861, "logits/rejected": -0.34551891684532166, "logps/chosen": -0.13455036282539368, "logps/rejected": -2.923142194747925, "loss": 0.1627, "odds_ratio_loss": 0.022695183753967285, "rewards/accuracies": 1.0, "rewards/chosen": -0.013455037027597427, "rewards/margins": 0.2788591980934143, "rewards/rejected": -0.29231423139572144, "sft_loss": 0.13455036282539368, "step": 2146 }, { "epoch": 3.1048445408532177, "grad_norm": 2.2333254041840034, "learning_rate": 3.925463312104596e-06, "logits/chosen": -0.7243767380714417, "logits/rejected": -0.3842761516571045, "logps/chosen": -0.21467825770378113, "logps/rejected": -4.240947723388672, "loss": 0.2452, "odds_ratio_loss": 0.04264179617166519, "rewards/accuracies": 1.0, "rewards/chosen": -0.021467823535203934, "rewards/margins": 0.4026269316673279, "rewards/rejected": -0.4240947365760803, "sft_loss": 0.21467825770378113, "step": 2147 }, { "epoch": 3.106290672451193, "grad_norm": 2.188334438515691, "learning_rate": 3.922357999072796e-06, "logits/chosen": -0.8195107579231262, "logits/rejected": -0.521625816822052, "logps/chosen": -0.24980032444000244, "logps/rejected": -3.1828725337982178, "loss": 0.2113, "odds_ratio_loss": 0.021983999758958817, "rewards/accuracies": 1.0, "rewards/chosen": -0.024980032816529274, "rewards/margins": 0.29330721497535706, "rewards/rejected": -0.3182872533798218, "sft_loss": 0.24980032444000244, "step": 2148 }, { "epoch": 3.1077368040491686, "grad_norm": 2.3063303796163637, "learning_rate": 3.9192527328516475e-06, "logits/chosen": -0.4675880968570709, "logits/rejected": -0.32635053992271423, "logps/chosen": -0.28106504678726196, "logps/rejected": -3.9863810539245605, "loss": 0.2463, "odds_ratio_loss": 0.06380581855773926, "rewards/accuracies": 1.0, "rewards/chosen": -0.028106503188610077, "rewards/margins": 0.3705316185951233, "rewards/rejected": -0.39863812923431396, "sft_loss": 0.28106504678726196, "step": 2149 }, { "epoch": 3.109182935647144, "grad_norm": 3.585902690061833, "learning_rate": 3.916147515313326e-06, "logits/chosen": -0.7061706781387329, "logits/rejected": -0.5766094326972961, "logps/chosen": -0.2508338689804077, "logps/rejected": -3.741215229034424, "loss": 0.2621, "odds_ratio_loss": 0.05887717008590698, "rewards/accuracies": 1.0, "rewards/chosen": -0.025083385407924652, "rewards/margins": 0.34903812408447266, "rewards/rejected": -0.3741214871406555, "sft_loss": 0.2508338689804077, "step": 2150 }, { "epoch": 3.110629067245119, "grad_norm": 2.042487934584991, "learning_rate": 3.9130423483299815e-06, "logits/chosen": -0.5965969562530518, "logits/rejected": -0.5227322578430176, "logps/chosen": -0.1644129902124405, "logps/rejected": -3.138468027114868, "loss": 0.1707, "odds_ratio_loss": 0.04252278432250023, "rewards/accuracies": 1.0, "rewards/chosen": -0.01644129864871502, "rewards/margins": 0.29740551114082336, "rewards/rejected": -0.31384679675102234, "sft_loss": 0.1644129902124405, "step": 2151 }, { "epoch": 3.112075198843095, "grad_norm": 2.1188301952171513, "learning_rate": 3.9099372337737285e-06, "logits/chosen": -0.6012832522392273, "logits/rejected": -0.49223455786705017, "logps/chosen": -0.17560946941375732, "logps/rejected": -4.076479434967041, "loss": 0.1919, "odds_ratio_loss": 0.03781703859567642, "rewards/accuracies": 1.0, "rewards/chosen": -0.017560947686433792, "rewards/margins": 0.39008697867393494, "rewards/rejected": -0.407647967338562, "sft_loss": 0.17560946941375732, "step": 2152 }, { "epoch": 3.11352133044107, "grad_norm": 2.6657369394288564, "learning_rate": 3.906832173516653e-06, "logits/chosen": -0.4916571378707886, "logits/rejected": -0.41273894906044006, "logps/chosen": -0.07565654069185257, "logps/rejected": -5.079089641571045, "loss": 0.1556, "odds_ratio_loss": 0.011491803452372551, "rewards/accuracies": 1.0, "rewards/chosen": -0.007565653882920742, "rewards/margins": 0.5003433227539062, "rewards/rejected": -0.5079089403152466, "sft_loss": 0.07565654069185257, "step": 2153 }, { "epoch": 3.1149674620390457, "grad_norm": 2.238773018686905, "learning_rate": 3.903727169430806e-06, "logits/chosen": -0.5660860538482666, "logits/rejected": -0.35988539457321167, "logps/chosen": -0.11718136072158813, "logps/rejected": -3.5155582427978516, "loss": 0.1476, "odds_ratio_loss": 0.03395921364426613, "rewards/accuracies": 1.0, "rewards/chosen": -0.011718135327100754, "rewards/margins": 0.3398377001285553, "rewards/rejected": -0.35155582427978516, "sft_loss": 0.11718136072158813, "step": 2154 }, { "epoch": 3.116413593637021, "grad_norm": 2.006913186733354, "learning_rate": 3.900622223388209e-06, "logits/chosen": -0.5774864554405212, "logits/rejected": -0.4433661997318268, "logps/chosen": -0.14877638220787048, "logps/rejected": -5.834985256195068, "loss": 0.2236, "odds_ratio_loss": 0.0120998565107584, "rewards/accuracies": 1.0, "rewards/chosen": -0.014877637848258018, "rewards/margins": 0.5686209201812744, "rewards/rejected": -0.583498477935791, "sft_loss": 0.14877638220787048, "step": 2155 }, { "epoch": 3.117859725234996, "grad_norm": 4.473257009520163, "learning_rate": 3.897517337260842e-06, "logits/chosen": -0.580504834651947, "logits/rejected": -0.3841407299041748, "logps/chosen": -0.2404867708683014, "logps/rejected": -3.8186872005462646, "loss": 0.2396, "odds_ratio_loss": 0.03407532721757889, "rewards/accuracies": 1.0, "rewards/chosen": -0.02404867857694626, "rewards/margins": 0.3578200340270996, "rewards/rejected": -0.38186874985694885, "sft_loss": 0.2404867708683014, "step": 2156 }, { "epoch": 3.119305856832972, "grad_norm": 2.1972664434284366, "learning_rate": 3.894412512920655e-06, "logits/chosen": -0.575932502746582, "logits/rejected": -0.593704104423523, "logps/chosen": -0.20279648900032043, "logps/rejected": -4.450626373291016, "loss": 0.1834, "odds_ratio_loss": 0.06688258051872253, "rewards/accuracies": 0.9375, "rewards/chosen": -0.020279649645090103, "rewards/margins": 0.42478299140930176, "rewards/rejected": -0.44506266713142395, "sft_loss": 0.20279648900032043, "step": 2157 }, { "epoch": 3.120751988430947, "grad_norm": 2.7066013861559677, "learning_rate": 3.891307752239556e-06, "logits/chosen": -0.5452953577041626, "logits/rejected": -0.4585101008415222, "logps/chosen": -0.1771378219127655, "logps/rejected": -2.6734554767608643, "loss": 0.2138, "odds_ratio_loss": 0.050962068140506744, "rewards/accuracies": 1.0, "rewards/chosen": -0.01771378144621849, "rewards/margins": 0.24963179230690002, "rewards/rejected": -0.2673455774784088, "sft_loss": 0.1771378219127655, "step": 2158 }, { "epoch": 3.122198120028923, "grad_norm": 2.211874916353963, "learning_rate": 3.8882030570894194e-06, "logits/chosen": -0.5663500428199768, "logits/rejected": -0.4593850076198578, "logps/chosen": -0.09784838557243347, "logps/rejected": -2.9985287189483643, "loss": 0.1453, "odds_ratio_loss": 0.026956774294376373, "rewards/accuracies": 1.0, "rewards/chosen": -0.009784838184714317, "rewards/margins": 0.29006803035736084, "rewards/rejected": -0.2998528778553009, "sft_loss": 0.09784838557243347, "step": 2159 }, { "epoch": 3.123644251626898, "grad_norm": 1.8720211085043381, "learning_rate": 3.8850984293420755e-06, "logits/chosen": -0.8111594319343567, "logits/rejected": -0.42723047733306885, "logps/chosen": -0.1713923215866089, "logps/rejected": -2.9202356338500977, "loss": 0.1496, "odds_ratio_loss": 0.0293788630515337, "rewards/accuracies": 1.0, "rewards/chosen": -0.017139233648777008, "rewards/margins": 0.27488431334495544, "rewards/rejected": -0.29202353954315186, "sft_loss": 0.1713923215866089, "step": 2160 }, { "epoch": 3.1250903832248733, "grad_norm": 2.1561396202024423, "learning_rate": 3.881993870869317e-06, "logits/chosen": -0.4672960638999939, "logits/rejected": -0.23019523918628693, "logps/chosen": -0.2502592206001282, "logps/rejected": -3.436143398284912, "loss": 0.2187, "odds_ratio_loss": 0.04205578565597534, "rewards/accuracies": 1.0, "rewards/chosen": -0.025025920942425728, "rewards/margins": 0.31858840584754944, "rewards/rejected": -0.3436143398284912, "sft_loss": 0.2502592206001282, "step": 2161 }, { "epoch": 3.126536514822849, "grad_norm": 1.7497644379347141, "learning_rate": 3.878889383542892e-06, "logits/chosen": -0.5571482181549072, "logits/rejected": -0.41588735580444336, "logps/chosen": -0.18334656953811646, "logps/rejected": -3.0952415466308594, "loss": 0.1189, "odds_ratio_loss": 0.03541785106062889, "rewards/accuracies": 1.0, "rewards/chosen": -0.018334658816456795, "rewards/margins": 0.2911895215511322, "rewards/rejected": -0.30952417850494385, "sft_loss": 0.18334656953811646, "step": 2162 }, { "epoch": 3.1279826464208242, "grad_norm": 2.2392104542666784, "learning_rate": 3.87578496923451e-06, "logits/chosen": -0.4766693413257599, "logits/rejected": -0.40024662017822266, "logps/chosen": -0.2564614415168762, "logps/rejected": -3.4361166954040527, "loss": 0.2093, "odds_ratio_loss": 0.05638567730784416, "rewards/accuracies": 1.0, "rewards/chosen": -0.025646142661571503, "rewards/margins": 0.3179655075073242, "rewards/rejected": -0.3436116576194763, "sft_loss": 0.2564614415168762, "step": 2163 }, { "epoch": 3.1294287780187995, "grad_norm": 2.1439404626315546, "learning_rate": 3.872680629815832e-06, "logits/chosen": -0.5200873613357544, "logits/rejected": -0.4677450358867645, "logps/chosen": -0.2646474242210388, "logps/rejected": -4.130258560180664, "loss": 0.1668, "odds_ratio_loss": 0.07118930667638779, "rewards/accuracies": 1.0, "rewards/chosen": -0.026464741677045822, "rewards/margins": 0.3865610957145691, "rewards/rejected": -0.4130258858203888, "sft_loss": 0.2646474242210388, "step": 2164 }, { "epoch": 3.130874909616775, "grad_norm": 2.3317828336888975, "learning_rate": 3.869576367158475e-06, "logits/chosen": -0.4354207515716553, "logits/rejected": -0.33270853757858276, "logps/chosen": -0.1295112818479538, "logps/rejected": -5.896690368652344, "loss": 0.1458, "odds_ratio_loss": 0.016541190445423126, "rewards/accuracies": 1.0, "rewards/chosen": -0.01295112818479538, "rewards/margins": 0.5767179131507874, "rewards/rejected": -0.5896689891815186, "sft_loss": 0.1295112818479538, "step": 2165 }, { "epoch": 3.1323210412147504, "grad_norm": 2.0183818962715514, "learning_rate": 3.866472183134015e-06, "logits/chosen": -0.314644455909729, "logits/rejected": -0.21877217292785645, "logps/chosen": -0.2322487086057663, "logps/rejected": -4.360037326812744, "loss": 0.1972, "odds_ratio_loss": 0.0481003001332283, "rewards/accuracies": 1.0, "rewards/chosen": -0.02322487160563469, "rewards/margins": 0.4127788841724396, "rewards/rejected": -0.43600374460220337, "sft_loss": 0.2322487086057663, "step": 2166 }, { "epoch": 3.133767172812726, "grad_norm": 2.219897757568939, "learning_rate": 3.863368079613971e-06, "logits/chosen": -0.5187494158744812, "logits/rejected": -0.37773463129997253, "logps/chosen": -0.12917321920394897, "logps/rejected": -3.245424270629883, "loss": 0.201, "odds_ratio_loss": 0.03227677941322327, "rewards/accuracies": 1.0, "rewards/chosen": -0.012917323037981987, "rewards/margins": 0.3116251230239868, "rewards/rejected": -0.32454243302345276, "sft_loss": 0.12917321920394897, "step": 2167 }, { "epoch": 3.1352133044107013, "grad_norm": 2.1556321676285393, "learning_rate": 3.860264058469822e-06, "logits/chosen": -0.6454533934593201, "logits/rejected": -0.40207356214523315, "logps/chosen": -0.19393956661224365, "logps/rejected": -2.3810601234436035, "loss": 0.1878, "odds_ratio_loss": 0.057800304144620895, "rewards/accuracies": 1.0, "rewards/chosen": -0.019393956288695335, "rewards/margins": 0.21871204674243927, "rewards/rejected": -0.23810601234436035, "sft_loss": 0.19393956661224365, "step": 2168 }, { "epoch": 3.1366594360086766, "grad_norm": 2.465289843971513, "learning_rate": 3.8571601215729904e-06, "logits/chosen": -0.41532689332962036, "logits/rejected": -0.3971560299396515, "logps/chosen": -0.14556801319122314, "logps/rejected": -3.225522994995117, "loss": 0.1605, "odds_ratio_loss": 0.03414017707109451, "rewards/accuracies": 1.0, "rewards/chosen": -0.01455680187791586, "rewards/margins": 0.3079955279827118, "rewards/rejected": -0.32255232334136963, "sft_loss": 0.14556801319122314, "step": 2169 }, { "epoch": 3.1381055676066523, "grad_norm": 2.213239859679258, "learning_rate": 3.854056270794856e-06, "logits/chosen": -0.7330466508865356, "logits/rejected": -0.3163624405860901, "logps/chosen": -0.13772007822990417, "logps/rejected": -6.967245578765869, "loss": 0.1757, "odds_ratio_loss": 0.014283371157944202, "rewards/accuracies": 1.0, "rewards/chosen": -0.013772009871900082, "rewards/margins": 0.6829525232315063, "rewards/rejected": -0.6967245936393738, "sft_loss": 0.13772007822990417, "step": 2170 }, { "epoch": 3.1395516992046275, "grad_norm": 2.1787848582083895, "learning_rate": 3.8509525080067375e-06, "logits/chosen": -0.6489847898483276, "logits/rejected": -0.36521393060684204, "logps/chosen": -0.20706214010715485, "logps/rejected": -3.5721209049224854, "loss": 0.2048, "odds_ratio_loss": 0.017135875299572945, "rewards/accuracies": 1.0, "rewards/chosen": -0.020706214010715485, "rewards/margins": 0.3365058898925781, "rewards/rejected": -0.3572120666503906, "sft_loss": 0.20706214010715485, "step": 2171 }, { "epoch": 3.140997830802603, "grad_norm": 1.98445645583722, "learning_rate": 3.847848835079909e-06, "logits/chosen": -0.770344614982605, "logits/rejected": -0.5676401257514954, "logps/chosen": -0.182399719953537, "logps/rejected": -3.97198224067688, "loss": 0.1593, "odds_ratio_loss": 0.0314740389585495, "rewards/accuracies": 1.0, "rewards/chosen": -0.01823997311294079, "rewards/margins": 0.3789582848548889, "rewards/rejected": -0.39719823002815247, "sft_loss": 0.182399719953537, "step": 2172 }, { "epoch": 3.1424439624005784, "grad_norm": 1.985445789793339, "learning_rate": 3.8447452538855835e-06, "logits/chosen": -0.46158552169799805, "logits/rejected": -0.4430912435054779, "logps/chosen": -0.2738022804260254, "logps/rejected": -4.37603235244751, "loss": 0.2017, "odds_ratio_loss": 0.10020264238119125, "rewards/accuracies": 1.0, "rewards/chosen": -0.02738022990524769, "rewards/margins": 0.41022300720214844, "rewards/rejected": -0.437603235244751, "sft_loss": 0.2738022804260254, "step": 2173 }, { "epoch": 3.1438900939985537, "grad_norm": 2.371867630391579, "learning_rate": 3.841641766294923e-06, "logits/chosen": -0.42799311876296997, "logits/rejected": -0.388339102268219, "logps/chosen": -0.14372651278972626, "logps/rejected": -3.0899109840393066, "loss": 0.1701, "odds_ratio_loss": 0.03828097879886627, "rewards/accuracies": 1.0, "rewards/chosen": -0.01437265146523714, "rewards/margins": 0.2946184277534485, "rewards/rejected": -0.30899107456207275, "sft_loss": 0.14372651278972626, "step": 2174 }, { "epoch": 3.1453362255965294, "grad_norm": 2.0472933581207897, "learning_rate": 3.83853837417903e-06, "logits/chosen": -0.6643333435058594, "logits/rejected": -0.5412406921386719, "logps/chosen": -0.2567393481731415, "logps/rejected": -3.749028205871582, "loss": 0.2037, "odds_ratio_loss": 0.0705362930893898, "rewards/accuracies": 1.0, "rewards/chosen": -0.02567393332719803, "rewards/margins": 0.3492288887500763, "rewards/rejected": -0.3749028444290161, "sft_loss": 0.2567393481731415, "step": 2175 }, { "epoch": 3.1467823571945046, "grad_norm": 2.147926415455977, "learning_rate": 3.835435079408954e-06, "logits/chosen": -0.6184093952178955, "logits/rejected": -0.4594188630580902, "logps/chosen": -0.24748066067695618, "logps/rejected": -2.9573397636413574, "loss": 0.1725, "odds_ratio_loss": 0.03922613710165024, "rewards/accuracies": 1.0, "rewards/chosen": -0.024748066440224648, "rewards/margins": 0.2709859311580658, "rewards/rejected": -0.2957339882850647, "sft_loss": 0.24748066067695618, "step": 2176 }, { "epoch": 3.1482284887924803, "grad_norm": 2.176903500984134, "learning_rate": 3.8323318838556814e-06, "logits/chosen": -0.8568693995475769, "logits/rejected": -0.6487101316452026, "logps/chosen": -0.1801314353942871, "logps/rejected": -4.282716274261475, "loss": 0.1795, "odds_ratio_loss": 0.044610489159822464, "rewards/accuracies": 1.0, "rewards/chosen": -0.01801314391195774, "rewards/margins": 0.4102584719657898, "rewards/rejected": -0.4282715916633606, "sft_loss": 0.1801314353942871, "step": 2177 }, { "epoch": 3.1496746203904555, "grad_norm": 2.036253753343565, "learning_rate": 3.829228789390139e-06, "logits/chosen": -0.558239758014679, "logits/rejected": -0.45034515857696533, "logps/chosen": -0.2566141188144684, "logps/rejected": -3.356292724609375, "loss": 0.2143, "odds_ratio_loss": 0.047206830233335495, "rewards/accuracies": 1.0, "rewards/chosen": -0.025661412626504898, "rewards/margins": 0.30996787548065186, "rewards/rejected": -0.33562928438186646, "sft_loss": 0.2566141188144684, "step": 2178 }, { "epoch": 3.151120751988431, "grad_norm": 2.1664134746621246, "learning_rate": 3.826125797883197e-06, "logits/chosen": -0.43020087480545044, "logits/rejected": -0.43662816286087036, "logps/chosen": -0.2651152014732361, "logps/rejected": -3.1269278526306152, "loss": 0.2053, "odds_ratio_loss": 0.06396672129631042, "rewards/accuracies": 1.0, "rewards/chosen": -0.02651152014732361, "rewards/margins": 0.28618124127388, "rewards/rejected": -0.312692791223526, "sft_loss": 0.2651152014732361, "step": 2179 }, { "epoch": 3.1525668835864065, "grad_norm": 1.9887411533498236, "learning_rate": 3.823022911205659e-06, "logits/chosen": -0.6099244356155396, "logits/rejected": -0.35586732625961304, "logps/chosen": -0.11266939342021942, "logps/rejected": -4.009099006652832, "loss": 0.2009, "odds_ratio_loss": 0.02088777907192707, "rewards/accuracies": 1.0, "rewards/chosen": -0.011266940273344517, "rewards/margins": 0.38964295387268066, "rewards/rejected": -0.4009098708629608, "sft_loss": 0.11266939342021942, "step": 2180 }, { "epoch": 3.1540130151843817, "grad_norm": 2.186223351643586, "learning_rate": 3.819920131228268e-06, "logits/chosen": -0.6756264567375183, "logits/rejected": -0.5853908061981201, "logps/chosen": -0.15891686081886292, "logps/rejected": -4.220728874206543, "loss": 0.1641, "odds_ratio_loss": 0.027598343789577484, "rewards/accuracies": 1.0, "rewards/chosen": -0.01589168794453144, "rewards/margins": 0.4061812162399292, "rewards/rejected": -0.4220728874206543, "sft_loss": 0.15891686081886292, "step": 2181 }, { "epoch": 3.1554591467823574, "grad_norm": 2.2133671954217213, "learning_rate": 3.816817459821698e-06, "logits/chosen": -0.46562516689300537, "logits/rejected": -0.4322385787963867, "logps/chosen": -0.13615188002586365, "logps/rejected": -3.9099514484405518, "loss": 0.1749, "odds_ratio_loss": 0.03285899758338928, "rewards/accuracies": 1.0, "rewards/chosen": -0.01361518818885088, "rewards/margins": 0.3773799538612366, "rewards/rejected": -0.3909951448440552, "sft_loss": 0.13615188002586365, "step": 2182 }, { "epoch": 3.1569052783803326, "grad_norm": 2.176409976005723, "learning_rate": 3.8137148988565655e-06, "logits/chosen": -0.7380187511444092, "logits/rejected": -0.6416623592376709, "logps/chosen": -0.27821779251098633, "logps/rejected": -3.296513557434082, "loss": 0.2334, "odds_ratio_loss": 0.02916991338133812, "rewards/accuracies": 1.0, "rewards/chosen": -0.027821781113743782, "rewards/margins": 0.30182960629463196, "rewards/rejected": -0.329651415348053, "sft_loss": 0.27821779251098633, "step": 2183 }, { "epoch": 3.158351409978308, "grad_norm": 2.0522969283962413, "learning_rate": 3.8106124502034133e-06, "logits/chosen": -0.8222931027412415, "logits/rejected": -0.7903980016708374, "logps/chosen": -0.1292247623205185, "logps/rejected": -3.884793758392334, "loss": 0.1517, "odds_ratio_loss": 0.024996642023324966, "rewards/accuracies": 1.0, "rewards/chosen": -0.012922476977109909, "rewards/margins": 0.3755568861961365, "rewards/rejected": -0.3884793519973755, "sft_loss": 0.1292247623205185, "step": 2184 }, { "epoch": 3.1597975415762836, "grad_norm": 2.261195748283917, "learning_rate": 3.8075101157327215e-06, "logits/chosen": -0.5183153748512268, "logits/rejected": -0.4637899398803711, "logps/chosen": -0.1162676066160202, "logps/rejected": -3.6541671752929688, "loss": 0.1853, "odds_ratio_loss": 0.023313432931900024, "rewards/accuracies": 1.0, "rewards/chosen": -0.01162676140666008, "rewards/margins": 0.35378995537757874, "rewards/rejected": -0.3654167056083679, "sft_loss": 0.1162676066160202, "step": 2185 }, { "epoch": 3.161243673174259, "grad_norm": 2.2397538577382146, "learning_rate": 3.8044078973148965e-06, "logits/chosen": -0.6720691919326782, "logits/rejected": -0.5420901775360107, "logps/chosen": -0.22410547733306885, "logps/rejected": -2.626776695251465, "loss": 0.2293, "odds_ratio_loss": 0.04131586477160454, "rewards/accuracies": 1.0, "rewards/chosen": -0.022410545498132706, "rewards/margins": 0.2402670979499817, "rewards/rejected": -0.2626776695251465, "sft_loss": 0.22410547733306885, "step": 2186 }, { "epoch": 3.162689804772234, "grad_norm": 1.9970829869530207, "learning_rate": 3.8013057968202796e-06, "logits/chosen": -0.7500214576721191, "logits/rejected": -0.617428183555603, "logps/chosen": -0.17513474822044373, "logps/rejected": -3.5836827754974365, "loss": 0.2036, "odds_ratio_loss": 0.04269365221261978, "rewards/accuracies": 1.0, "rewards/chosen": -0.017513476312160492, "rewards/margins": 0.34085479378700256, "rewards/rejected": -0.35836827754974365, "sft_loss": 0.17513474822044373, "step": 2187 }, { "epoch": 3.1641359363702097, "grad_norm": 1.9591010980917978, "learning_rate": 3.7982038161191375e-06, "logits/chosen": -0.6849007606506348, "logits/rejected": -0.5874512195587158, "logps/chosen": -0.17351099848747253, "logps/rejected": -3.3261361122131348, "loss": 0.1952, "odds_ratio_loss": 0.05133747309446335, "rewards/accuracies": 1.0, "rewards/chosen": -0.017351102083921432, "rewards/margins": 0.3152625262737274, "rewards/rejected": -0.33261361718177795, "sft_loss": 0.17351099848747253, "step": 2188 }, { "epoch": 3.165582067968185, "grad_norm": 2.1623811095435883, "learning_rate": 3.7951019570816664e-06, "logits/chosen": -0.6917041540145874, "logits/rejected": -0.6428658962249756, "logps/chosen": -0.1369011104106903, "logps/rejected": -4.486429691314697, "loss": 0.1815, "odds_ratio_loss": 0.02341514080762863, "rewards/accuracies": 1.0, "rewards/chosen": -0.013690110296010971, "rewards/margins": 0.43495285511016846, "rewards/rejected": -0.4486429989337921, "sft_loss": 0.1369011104106903, "step": 2189 }, { "epoch": 3.1670281995661607, "grad_norm": 2.203850860590357, "learning_rate": 3.7920002215779875e-06, "logits/chosen": -0.7099331617355347, "logits/rejected": -0.63243567943573, "logps/chosen": -0.1877567023038864, "logps/rejected": -3.5332705974578857, "loss": 0.1762, "odds_ratio_loss": 0.04335636645555496, "rewards/accuracies": 1.0, "rewards/chosen": -0.01877567172050476, "rewards/margins": 0.3345514237880707, "rewards/rejected": -0.3533271253108978, "sft_loss": 0.1877567023038864, "step": 2190 }, { "epoch": 3.168474331164136, "grad_norm": 2.0629683680334083, "learning_rate": 3.78889861147815e-06, "logits/chosen": -0.5730685591697693, "logits/rejected": -0.31942451000213623, "logps/chosen": -0.11757153272628784, "logps/rejected": -4.100497722625732, "loss": 0.206, "odds_ratio_loss": 0.009712357074022293, "rewards/accuracies": 1.0, "rewards/chosen": -0.011757154017686844, "rewards/margins": 0.398292601108551, "rewards/rejected": -0.41004979610443115, "sft_loss": 0.11757153272628784, "step": 2191 }, { "epoch": 3.169920462762111, "grad_norm": 6.148173254503388, "learning_rate": 3.7857971286521273e-06, "logits/chosen": -0.6861517429351807, "logits/rejected": -0.5459161996841431, "logps/chosen": -0.1798088550567627, "logps/rejected": -4.008049964904785, "loss": 0.2586, "odds_ratio_loss": 0.03384054824709892, "rewards/accuracies": 1.0, "rewards/chosen": -0.01798088476061821, "rewards/margins": 0.3828241229057312, "rewards/rejected": -0.4008050262928009, "sft_loss": 0.1798088550567627, "step": 2192 }, { "epoch": 3.171366594360087, "grad_norm": 1.981916691678493, "learning_rate": 3.782695774969811e-06, "logits/chosen": -0.47907698154449463, "logits/rejected": -0.45603060722351074, "logps/chosen": -0.2008480578660965, "logps/rejected": -2.513578414916992, "loss": 0.191, "odds_ratio_loss": 0.07965853065252304, "rewards/accuracies": 1.0, "rewards/chosen": -0.0200848076492548, "rewards/margins": 0.23127305507659912, "rewards/rejected": -0.2513578534126282, "sft_loss": 0.2008480578660965, "step": 2193 }, { "epoch": 3.172812725958062, "grad_norm": 2.209839340924135, "learning_rate": 3.7795945523010236e-06, "logits/chosen": -0.4900778532028198, "logits/rejected": -0.45295250415802, "logps/chosen": -0.09929634630680084, "logps/rejected": -4.984482765197754, "loss": 0.1897, "odds_ratio_loss": 0.025094415992498398, "rewards/accuracies": 1.0, "rewards/chosen": -0.009929634630680084, "rewards/margins": 0.48851868510246277, "rewards/rejected": -0.49844831228256226, "sft_loss": 0.09929634630680084, "step": 2194 }, { "epoch": 3.1742588575560378, "grad_norm": 2.304743853675743, "learning_rate": 3.776493462515501e-06, "logits/chosen": -0.6225374341011047, "logits/rejected": -0.44118446111679077, "logps/chosen": -0.171739399433136, "logps/rejected": -3.954901695251465, "loss": 0.1846, "odds_ratio_loss": 0.03998706489801407, "rewards/accuracies": 1.0, "rewards/chosen": -0.01717394031584263, "rewards/margins": 0.3783162534236908, "rewards/rejected": -0.3954901695251465, "sft_loss": 0.171739399433136, "step": 2195 }, { "epoch": 3.175704989154013, "grad_norm": 1.7701915623752982, "learning_rate": 3.7733925074829026e-06, "logits/chosen": -0.7270638942718506, "logits/rejected": -0.5552908182144165, "logps/chosen": -0.18401703238487244, "logps/rejected": -3.406104564666748, "loss": 0.2344, "odds_ratio_loss": 0.031469061970710754, "rewards/accuracies": 1.0, "rewards/chosen": -0.018401702865958214, "rewards/margins": 0.3222087323665619, "rewards/rejected": -0.34061044454574585, "sft_loss": 0.18401703238487244, "step": 2196 }, { "epoch": 3.1771511207519882, "grad_norm": 2.059233864845376, "learning_rate": 3.7702916890728037e-06, "logits/chosen": -0.5447147488594055, "logits/rejected": -0.45716631412506104, "logps/chosen": -0.18555516004562378, "logps/rejected": -4.597956657409668, "loss": 0.1853, "odds_ratio_loss": 0.026829298585653305, "rewards/accuracies": 1.0, "rewards/chosen": -0.01855551451444626, "rewards/margins": 0.44124019145965576, "rewards/rejected": -0.45979568362236023, "sft_loss": 0.18555516004562378, "step": 2197 }, { "epoch": 3.178597252349964, "grad_norm": 1.807317879130678, "learning_rate": 3.767191009154703e-06, "logits/chosen": -0.5492076873779297, "logits/rejected": -0.32834699749946594, "logps/chosen": -0.24237895011901855, "logps/rejected": -4.1069793701171875, "loss": 0.1525, "odds_ratio_loss": 0.025217795744538307, "rewards/accuracies": 1.0, "rewards/chosen": -0.024237895384430885, "rewards/margins": 0.3864600956439972, "rewards/rejected": -0.41069793701171875, "sft_loss": 0.24237895011901855, "step": 2198 }, { "epoch": 3.180043383947939, "grad_norm": 1.9245969065946, "learning_rate": 3.764090469598009e-06, "logits/chosen": -0.6140974164009094, "logits/rejected": -0.5183427333831787, "logps/chosen": -0.12481319904327393, "logps/rejected": -4.698176383972168, "loss": 0.1857, "odds_ratio_loss": 0.03451571986079216, "rewards/accuracies": 1.0, "rewards/chosen": -0.012481319718062878, "rewards/margins": 0.45733633637428284, "rewards/rejected": -0.4698176681995392, "sft_loss": 0.12481319904327393, "step": 2199 }, { "epoch": 3.181489515545915, "grad_norm": 2.191252165583729, "learning_rate": 3.76099007227205e-06, "logits/chosen": -0.791283130645752, "logits/rejected": -0.5891169309616089, "logps/chosen": -0.11535245925188065, "logps/rejected": -3.5455451011657715, "loss": 0.1683, "odds_ratio_loss": 0.01490130927413702, "rewards/accuracies": 1.0, "rewards/chosen": -0.01153524499386549, "rewards/margins": 0.3430192470550537, "rewards/rejected": -0.35455450415611267, "sft_loss": 0.11535245925188065, "step": 2200 }, { "epoch": 3.18293564714389, "grad_norm": 2.268876201951002, "learning_rate": 3.757889819046065e-06, "logits/chosen": -0.6616383790969849, "logits/rejected": -0.5038686990737915, "logps/chosen": -0.21815867722034454, "logps/rejected": -3.9103035926818848, "loss": 0.1883, "odds_ratio_loss": 0.03401227667927742, "rewards/accuracies": 1.0, "rewards/chosen": -0.021815868094563484, "rewards/margins": 0.3692144751548767, "rewards/rejected": -0.39103034138679504, "sft_loss": 0.21815867722034454, "step": 2201 }, { "epoch": 3.1843817787418653, "grad_norm": 2.4282555689676, "learning_rate": 3.754789711789212e-06, "logits/chosen": -0.5337816476821899, "logits/rejected": -0.46093809604644775, "logps/chosen": -0.1983119249343872, "logps/rejected": -4.407778739929199, "loss": 0.245, "odds_ratio_loss": 0.02094094827771187, "rewards/accuracies": 1.0, "rewards/chosen": -0.01983119361102581, "rewards/margins": 0.4209466576576233, "rewards/rejected": -0.44077789783477783, "sft_loss": 0.1983119249343872, "step": 2202 }, { "epoch": 3.185827910339841, "grad_norm": 2.326999903416427, "learning_rate": 3.7516897523705537e-06, "logits/chosen": -0.5480571985244751, "logits/rejected": -0.4346959590911865, "logps/chosen": -0.14086556434631348, "logps/rejected": -2.17596435546875, "loss": 0.1529, "odds_ratio_loss": 0.029783004894852638, "rewards/accuracies": 1.0, "rewards/chosen": -0.014086555689573288, "rewards/margins": 0.2035098671913147, "rewards/rejected": -0.2175964117050171, "sft_loss": 0.14086556434631348, "step": 2203 }, { "epoch": 3.1872740419378163, "grad_norm": 2.949948093264566, "learning_rate": 3.7485899426590676e-06, "logits/chosen": -0.604040265083313, "logits/rejected": -0.5441180467605591, "logps/chosen": -0.2571631669998169, "logps/rejected": -4.050754070281982, "loss": 0.2018, "odds_ratio_loss": 0.07504919171333313, "rewards/accuracies": 1.0, "rewards/chosen": -0.025716319680213928, "rewards/margins": 0.37935909628868103, "rewards/rejected": -0.4050753712654114, "sft_loss": 0.2571631669998169, "step": 2204 }, { "epoch": 3.188720173535792, "grad_norm": 2.1085036743628938, "learning_rate": 3.7454902845236433e-06, "logits/chosen": -0.8290140628814697, "logits/rejected": -0.529233992099762, "logps/chosen": -0.16382518410682678, "logps/rejected": -4.446649551391602, "loss": 0.1828, "odds_ratio_loss": 0.025827059522271156, "rewards/accuracies": 1.0, "rewards/chosen": -0.016382519155740738, "rewards/margins": 0.4282824993133545, "rewards/rejected": -0.44466501474380493, "sft_loss": 0.16382518410682678, "step": 2205 }, { "epoch": 3.190166305133767, "grad_norm": 5.203311282071658, "learning_rate": 3.7423907798330735e-06, "logits/chosen": -0.5849148631095886, "logits/rejected": -0.44293212890625, "logps/chosen": -0.2151346355676651, "logps/rejected": -4.317756652832031, "loss": 0.1996, "odds_ratio_loss": 0.04586172476410866, "rewards/accuracies": 1.0, "rewards/chosen": -0.02151346392929554, "rewards/margins": 0.41026225686073303, "rewards/rejected": -0.43177568912506104, "sft_loss": 0.2151346355676651, "step": 2206 }, { "epoch": 3.1916124367317424, "grad_norm": 2.3996402747625014, "learning_rate": 3.739291430456063e-06, "logits/chosen": -0.5927102565765381, "logits/rejected": -0.48668432235717773, "logps/chosen": -0.16127264499664307, "logps/rejected": -4.136725902557373, "loss": 0.1826, "odds_ratio_loss": 0.050145432353019714, "rewards/accuracies": 1.0, "rewards/chosen": -0.016127265989780426, "rewards/margins": 0.39754533767700195, "rewards/rejected": -0.4136725664138794, "sft_loss": 0.16127264499664307, "step": 2207 }, { "epoch": 3.193058568329718, "grad_norm": 2.331118390165532, "learning_rate": 3.736192238261218e-06, "logits/chosen": -0.4932352602481842, "logits/rejected": -0.30938223004341125, "logps/chosen": -0.11432990431785583, "logps/rejected": -3.635563850402832, "loss": 0.2564, "odds_ratio_loss": 0.034632958471775055, "rewards/accuracies": 1.0, "rewards/chosen": -0.011432991363108158, "rewards/margins": 0.3521234095096588, "rewards/rejected": -0.3635564148426056, "sft_loss": 0.11432990431785583, "step": 2208 }, { "epoch": 3.1945046999276934, "grad_norm": 2.0566819085700123, "learning_rate": 3.733093205117057e-06, "logits/chosen": -0.5601751208305359, "logits/rejected": -0.3214784562587738, "logps/chosen": -0.11301355063915253, "logps/rejected": -5.065844535827637, "loss": 0.1695, "odds_ratio_loss": 0.02004491351544857, "rewards/accuracies": 1.0, "rewards/chosen": -0.011301355436444283, "rewards/margins": 0.4952830970287323, "rewards/rejected": -0.5065844655036926, "sft_loss": 0.11301355063915253, "step": 2209 }, { "epoch": 3.1959508315256686, "grad_norm": 1.9736197886022742, "learning_rate": 3.7299943328919956e-06, "logits/chosen": -0.5280442833900452, "logits/rejected": -0.4607413411140442, "logps/chosen": -0.18449607491493225, "logps/rejected": -3.659106969833374, "loss": 0.1599, "odds_ratio_loss": 0.03631690517067909, "rewards/accuracies": 1.0, "rewards/chosen": -0.018449608236551285, "rewards/margins": 0.34746110439300537, "rewards/rejected": -0.36591070890426636, "sft_loss": 0.18449607491493225, "step": 2210 }, { "epoch": 3.1973969631236443, "grad_norm": 3.0301528491892826, "learning_rate": 3.726895623454358e-06, "logits/chosen": -0.5963754057884216, "logits/rejected": -0.6541630625724792, "logps/chosen": -0.40213772654533386, "logps/rejected": -2.991434335708618, "loss": 0.2434, "odds_ratio_loss": 0.10465790331363678, "rewards/accuracies": 1.0, "rewards/chosen": -0.04021377116441727, "rewards/margins": 0.25892966985702515, "rewards/rejected": -0.2991434335708618, "sft_loss": 0.40213772654533386, "step": 2211 }, { "epoch": 3.1988430947216195, "grad_norm": 1.8336318495290496, "learning_rate": 3.7237970786723638e-06, "logits/chosen": -0.6186486482620239, "logits/rejected": -0.44536644220352173, "logps/chosen": -0.26954495906829834, "logps/rejected": -4.847094535827637, "loss": 0.1987, "odds_ratio_loss": 0.06486377865076065, "rewards/accuracies": 1.0, "rewards/chosen": -0.026954498142004013, "rewards/margins": 0.4577549695968628, "rewards/rejected": -0.4847094416618347, "sft_loss": 0.26954495906829834, "step": 2212 }, { "epoch": 3.2002892263195952, "grad_norm": 1.9924435818068609, "learning_rate": 3.7206987004141417e-06, "logits/chosen": -0.6402283906936646, "logits/rejected": -0.5902421474456787, "logps/chosen": -0.22126686573028564, "logps/rejected": -3.0365238189697266, "loss": 0.1705, "odds_ratio_loss": 0.04855208098888397, "rewards/accuracies": 1.0, "rewards/chosen": -0.022126685827970505, "rewards/margins": 0.28152570128440857, "rewards/rejected": -0.3036523759365082, "sft_loss": 0.22126686573028564, "step": 2213 }, { "epoch": 3.2017353579175705, "grad_norm": 2.3205251480643208, "learning_rate": 3.717600490547712e-06, "logits/chosen": -0.5460687875747681, "logits/rejected": -0.42219042778015137, "logps/chosen": -0.24014821648597717, "logps/rejected": -2.918666124343872, "loss": 0.1981, "odds_ratio_loss": 0.0672679990530014, "rewards/accuracies": 1.0, "rewards/chosen": -0.024014823138713837, "rewards/margins": 0.2678517699241638, "rewards/rejected": -0.29186660051345825, "sft_loss": 0.24014821648597717, "step": 2214 }, { "epoch": 3.2031814895155457, "grad_norm": 2.0440762308575042, "learning_rate": 3.7145024509409994e-06, "logits/chosen": -0.7297794818878174, "logits/rejected": -0.5262342691421509, "logps/chosen": -0.21348436176776886, "logps/rejected": -2.7271389961242676, "loss": 0.2102, "odds_ratio_loss": 0.04441622644662857, "rewards/accuracies": 1.0, "rewards/chosen": -0.021348439157009125, "rewards/margins": 0.2513654828071594, "rewards/rejected": -0.27271392941474915, "sft_loss": 0.21348436176776886, "step": 2215 }, { "epoch": 3.2046276211135214, "grad_norm": 2.7250778588447107, "learning_rate": 3.711404583461821e-06, "logits/chosen": -0.5237282514572144, "logits/rejected": -0.45042431354522705, "logps/chosen": -0.14554253220558167, "logps/rejected": -3.1719679832458496, "loss": 0.2359, "odds_ratio_loss": 0.054552722722291946, "rewards/accuracies": 1.0, "rewards/chosen": -0.014554252848029137, "rewards/margins": 0.3026425540447235, "rewards/rejected": -0.3171968460083008, "sft_loss": 0.14554253220558167, "step": 2216 }, { "epoch": 3.2060737527114966, "grad_norm": 2.002981078083241, "learning_rate": 3.7083068899778936e-06, "logits/chosen": -0.5125364661216736, "logits/rejected": -0.5378755927085876, "logps/chosen": -0.1441064178943634, "logps/rejected": -3.8481955528259277, "loss": 0.1459, "odds_ratio_loss": 0.0454072542488575, "rewards/accuracies": 1.0, "rewards/chosen": -0.01441064290702343, "rewards/margins": 0.37040889263153076, "rewards/rejected": -0.38481953740119934, "sft_loss": 0.1441064178943634, "step": 2217 }, { "epoch": 3.2075198843094723, "grad_norm": 1.9684062919816647, "learning_rate": 3.705209372356831e-06, "logits/chosen": -0.462546706199646, "logits/rejected": -0.371360719203949, "logps/chosen": -0.14098519086837769, "logps/rejected": -5.05507755279541, "loss": 0.1494, "odds_ratio_loss": 0.022105194628238678, "rewards/accuracies": 1.0, "rewards/chosen": -0.014098519459366798, "rewards/margins": 0.49140921235084534, "rewards/rejected": -0.50550776720047, "sft_loss": 0.14098519086837769, "step": 2218 }, { "epoch": 3.2089660159074476, "grad_norm": 1.8324522959136393, "learning_rate": 3.702112032466134e-06, "logits/chosen": -0.5853912234306335, "logits/rejected": -0.5114383101463318, "logps/chosen": -0.15347421169281006, "logps/rejected": -3.8486804962158203, "loss": 0.144, "odds_ratio_loss": 0.03078891895711422, "rewards/accuracies": 1.0, "rewards/chosen": -0.01534742210060358, "rewards/margins": 0.3695206642150879, "rewards/rejected": -0.3848680555820465, "sft_loss": 0.15347421169281006, "step": 2219 }, { "epoch": 3.210412147505423, "grad_norm": 2.908343261824301, "learning_rate": 3.6990148721732037e-06, "logits/chosen": -0.6292286515235901, "logits/rejected": -0.6121467351913452, "logps/chosen": -0.12733693420886993, "logps/rejected": -4.009809970855713, "loss": 0.1963, "odds_ratio_loss": 0.03087882697582245, "rewards/accuracies": 1.0, "rewards/chosen": -0.012733692303299904, "rewards/margins": 0.3882473409175873, "rewards/rejected": -0.40098103880882263, "sft_loss": 0.12733693420886993, "step": 2220 }, { "epoch": 3.2118582791033985, "grad_norm": 1.9318114387309266, "learning_rate": 3.695917893345326e-06, "logits/chosen": -0.4239646792411804, "logits/rejected": -0.35339295864105225, "logps/chosen": -0.10661177337169647, "logps/rejected": -4.832416534423828, "loss": 0.1664, "odds_ratio_loss": 0.026308678090572357, "rewards/accuracies": 1.0, "rewards/chosen": -0.010661177337169647, "rewards/margins": 0.4725804626941681, "rewards/rejected": -0.48324161767959595, "sft_loss": 0.10661177337169647, "step": 2221 }, { "epoch": 3.2133044107013737, "grad_norm": 1.9552969597340941, "learning_rate": 3.6928210978496844e-06, "logits/chosen": -0.5122228264808655, "logits/rejected": -0.4361344575881958, "logps/chosen": -0.2315160632133484, "logps/rejected": -2.3202056884765625, "loss": 0.2312, "odds_ratio_loss": 0.07692821323871613, "rewards/accuracies": 1.0, "rewards/chosen": -0.02315160632133484, "rewards/margins": 0.20886898040771484, "rewards/rejected": -0.2320205718278885, "sft_loss": 0.2315160632133484, "step": 2222 }, { "epoch": 3.2147505422993494, "grad_norm": 3.9027921143976867, "learning_rate": 3.6897244875533463e-06, "logits/chosen": -0.4394450783729553, "logits/rejected": -0.4192020893096924, "logps/chosen": -0.25058937072753906, "logps/rejected": -5.037656784057617, "loss": 0.1914, "odds_ratio_loss": 0.04292678460478783, "rewards/accuracies": 1.0, "rewards/chosen": -0.025058934465050697, "rewards/margins": 0.4787067770957947, "rewards/rejected": -0.5037657022476196, "sft_loss": 0.25058937072753906, "step": 2223 }, { "epoch": 3.2161966738973247, "grad_norm": 1.9675271422773393, "learning_rate": 3.686628064323271e-06, "logits/chosen": -0.4829399585723877, "logits/rejected": -0.46176087856292725, "logps/chosen": -0.22450929880142212, "logps/rejected": -4.800108432769775, "loss": 0.2024, "odds_ratio_loss": 0.05624048411846161, "rewards/accuracies": 1.0, "rewards/chosen": -0.02245093137025833, "rewards/margins": 0.4575599431991577, "rewards/rejected": -0.48001086711883545, "sft_loss": 0.22450929880142212, "step": 2224 }, { "epoch": 3.2176428054953, "grad_norm": 2.1076011418204033, "learning_rate": 3.6835318300263012e-06, "logits/chosen": -0.4614330232143402, "logits/rejected": -0.4945675730705261, "logps/chosen": -0.2269093543291092, "logps/rejected": -3.4398250579833984, "loss": 0.2459, "odds_ratio_loss": 0.0666380450129509, "rewards/accuracies": 1.0, "rewards/chosen": -0.02269093692302704, "rewards/margins": 0.32129159569740295, "rewards/rejected": -0.3439825475215912, "sft_loss": 0.2269093543291092, "step": 2225 }, { "epoch": 3.2190889370932756, "grad_norm": 1.9003759460409313, "learning_rate": 3.6804357865291715e-06, "logits/chosen": -0.38930198550224304, "logits/rejected": -0.32057294249534607, "logps/chosen": -0.15316364169120789, "logps/rejected": -4.212235927581787, "loss": 0.167, "odds_ratio_loss": 0.017919572070240974, "rewards/accuracies": 1.0, "rewards/chosen": -0.015316363424062729, "rewards/margins": 0.40590721368789673, "rewards/rejected": -0.42122358083724976, "sft_loss": 0.15316364169120789, "step": 2226 }, { "epoch": 3.220535068691251, "grad_norm": 1.9152259661536026, "learning_rate": 3.677339935698495e-06, "logits/chosen": -0.45317479968070984, "logits/rejected": -0.4471889138221741, "logps/chosen": -0.19737450778484344, "logps/rejected": -3.9483349323272705, "loss": 0.1805, "odds_ratio_loss": 0.0657714456319809, "rewards/accuracies": 1.0, "rewards/chosen": -0.019737452268600464, "rewards/margins": 0.37509608268737793, "rewards/rejected": -0.394833505153656, "sft_loss": 0.19737450778484344, "step": 2227 }, { "epoch": 3.2219812002892265, "grad_norm": 2.2018903514064276, "learning_rate": 3.6742442794007746e-06, "logits/chosen": -0.47369611263275146, "logits/rejected": -0.38501498103141785, "logps/chosen": -0.24870990216732025, "logps/rejected": -2.4713776111602783, "loss": 0.2353, "odds_ratio_loss": 0.0954592227935791, "rewards/accuracies": 1.0, "rewards/chosen": -0.024870987981557846, "rewards/margins": 0.2222667783498764, "rewards/rejected": -0.24713777005672455, "sft_loss": 0.24870990216732025, "step": 2228 }, { "epoch": 3.2234273318872018, "grad_norm": 2.4456927538611613, "learning_rate": 3.6711488195023893e-06, "logits/chosen": -0.455442875623703, "logits/rejected": -0.37904882431030273, "logps/chosen": -0.231689915060997, "logps/rejected": -3.092597246170044, "loss": 0.1983, "odds_ratio_loss": 0.06161237508058548, "rewards/accuracies": 1.0, "rewards/chosen": -0.02316899225115776, "rewards/margins": 0.28609076142311096, "rewards/rejected": -0.3092597424983978, "sft_loss": 0.231689915060997, "step": 2229 }, { "epoch": 3.224873463485177, "grad_norm": 2.2262037548504745, "learning_rate": 3.6680535578696073e-06, "logits/chosen": -0.5955873727798462, "logits/rejected": -0.48187780380249023, "logps/chosen": -0.2466651201248169, "logps/rejected": -3.732814311981201, "loss": 0.237, "odds_ratio_loss": 0.0446498841047287, "rewards/accuracies": 1.0, "rewards/chosen": -0.02466651052236557, "rewards/margins": 0.3486149311065674, "rewards/rejected": -0.37328144907951355, "sft_loss": 0.2466651201248169, "step": 2230 }, { "epoch": 3.2263195950831527, "grad_norm": 2.1717346267474147, "learning_rate": 3.6649584963685706e-06, "logits/chosen": -0.6487745642662048, "logits/rejected": -0.554900050163269, "logps/chosen": -0.2109365165233612, "logps/rejected": -2.185053586959839, "loss": 0.2322, "odds_ratio_loss": 0.048502735793590546, "rewards/accuracies": 1.0, "rewards/chosen": -0.02109365165233612, "rewards/margins": 0.1974116861820221, "rewards/rejected": -0.2185053676366806, "sft_loss": 0.2109365165233612, "step": 2231 }, { "epoch": 3.227765726681128, "grad_norm": 2.2488395772011742, "learning_rate": 3.6618636368653033e-06, "logits/chosen": -0.46767958998680115, "logits/rejected": -0.32060617208480835, "logps/chosen": -0.11053460091352463, "logps/rejected": -6.905789375305176, "loss": 0.1755, "odds_ratio_loss": 0.02767387591302395, "rewards/accuracies": 1.0, "rewards/chosen": -0.011053459718823433, "rewards/margins": 0.6795254945755005, "rewards/rejected": -0.6905789375305176, "sft_loss": 0.11053460091352463, "step": 2232 }, { "epoch": 3.229211858279103, "grad_norm": 2.4396261057570015, "learning_rate": 3.6587689812257106e-06, "logits/chosen": -0.45963820815086365, "logits/rejected": -0.4651893675327301, "logps/chosen": -0.10936955362558365, "logps/rejected": -5.02745246887207, "loss": 0.1596, "odds_ratio_loss": 0.010914292186498642, "rewards/accuracies": 1.0, "rewards/chosen": -0.010936954990029335, "rewards/margins": 0.49180832505226135, "rewards/rejected": -0.5027452707290649, "sft_loss": 0.10936955362558365, "step": 2233 }, { "epoch": 3.230657989877079, "grad_norm": 2.280698528298338, "learning_rate": 3.655674531315569e-06, "logits/chosen": -0.5203260183334351, "logits/rejected": -0.35767683386802673, "logps/chosen": -0.22178664803504944, "logps/rejected": -2.980849027633667, "loss": 0.2226, "odds_ratio_loss": 0.045914776623249054, "rewards/accuracies": 1.0, "rewards/chosen": -0.022178664803504944, "rewards/margins": 0.2759062647819519, "rewards/rejected": -0.29808491468429565, "sft_loss": 0.22178664803504944, "step": 2234 }, { "epoch": 3.232104121475054, "grad_norm": 1.7926615791207154, "learning_rate": 3.6525802890005357e-06, "logits/chosen": -0.36176133155822754, "logits/rejected": -0.32122957706451416, "logps/chosen": -0.0988263189792633, "logps/rejected": -5.585865497589111, "loss": 0.1614, "odds_ratio_loss": 0.0176323801279068, "rewards/accuracies": 1.0, "rewards/chosen": -0.00988263264298439, "rewards/margins": 0.5487039089202881, "rewards/rejected": -0.5585865378379822, "sft_loss": 0.0988263189792633, "step": 2235 }, { "epoch": 3.23355025307303, "grad_norm": 1.93765750188442, "learning_rate": 3.6494862561461387e-06, "logits/chosen": -0.5547440648078918, "logits/rejected": -0.4816645681858063, "logps/chosen": -0.21579933166503906, "logps/rejected": -5.2509565353393555, "loss": 0.1766, "odds_ratio_loss": 0.028800413012504578, "rewards/accuracies": 1.0, "rewards/chosen": -0.021579932421445847, "rewards/margins": 0.5035157203674316, "rewards/rejected": -0.5250956416130066, "sft_loss": 0.21579933166503906, "step": 2236 }, { "epoch": 3.234996384671005, "grad_norm": 2.1995050438159076, "learning_rate": 3.646392434617785e-06, "logits/chosen": -0.6402944326400757, "logits/rejected": -0.421334445476532, "logps/chosen": -0.19929829239845276, "logps/rejected": -5.820633888244629, "loss": 0.1767, "odds_ratio_loss": 0.048856209963560104, "rewards/accuracies": 1.0, "rewards/chosen": -0.019929828122258186, "rewards/margins": 0.5621335506439209, "rewards/rejected": -0.5820633172988892, "sft_loss": 0.19929829239845276, "step": 2237 }, { "epoch": 3.2364425162689803, "grad_norm": 1.8119486632998885, "learning_rate": 3.6432988262807483e-06, "logits/chosen": -0.5280413627624512, "logits/rejected": -0.42112961411476135, "logps/chosen": -0.15436744689941406, "logps/rejected": -5.035494327545166, "loss": 0.1644, "odds_ratio_loss": 0.030892375856637955, "rewards/accuracies": 1.0, "rewards/chosen": -0.015436742454767227, "rewards/margins": 0.4881126880645752, "rewards/rejected": -0.5035493969917297, "sft_loss": 0.15436744689941406, "step": 2238 }, { "epoch": 3.237888647866956, "grad_norm": 2.2278105118916995, "learning_rate": 3.6402054330001787e-06, "logits/chosen": -0.5598574876785278, "logits/rejected": -0.29619458317756653, "logps/chosen": -0.14671733975410461, "logps/rejected": -6.336015224456787, "loss": 0.1365, "odds_ratio_loss": 0.02809251844882965, "rewards/accuracies": 1.0, "rewards/chosen": -0.014671734534204006, "rewards/margins": 0.6189297437667847, "rewards/rejected": -0.6336015462875366, "sft_loss": 0.14671733975410461, "step": 2239 }, { "epoch": 3.239334779464931, "grad_norm": 3.86220961291117, "learning_rate": 3.637112256641092e-06, "logits/chosen": -0.5911388993263245, "logits/rejected": -0.3398038148880005, "logps/chosen": -0.16455015540122986, "logps/rejected": -5.836204528808594, "loss": 0.246, "odds_ratio_loss": 0.0325884185731411, "rewards/accuracies": 1.0, "rewards/chosen": -0.016455017030239105, "rewards/margins": 0.5671654343605042, "rewards/rejected": -0.5836204290390015, "sft_loss": 0.16455015540122986, "step": 2240 }, { "epoch": 3.240780911062907, "grad_norm": 1.9752946959873765, "learning_rate": 3.6340192990683785e-06, "logits/chosen": -0.5412940979003906, "logits/rejected": -0.40820765495300293, "logps/chosen": -0.14699213206768036, "logps/rejected": -4.479633331298828, "loss": 0.174, "odds_ratio_loss": 0.023793520405888557, "rewards/accuracies": 1.0, "rewards/chosen": -0.01469921413809061, "rewards/margins": 0.4332641363143921, "rewards/rejected": -0.44796332716941833, "sft_loss": 0.14699213206768036, "step": 2241 }, { "epoch": 3.242227042660882, "grad_norm": 2.2022818942213864, "learning_rate": 3.6309265621467923e-06, "logits/chosen": -0.4983745217323303, "logits/rejected": -0.5105041265487671, "logps/chosen": -0.28136447072029114, "logps/rejected": -3.8148107528686523, "loss": 0.2074, "odds_ratio_loss": 0.05450871214270592, "rewards/accuracies": 1.0, "rewards/chosen": -0.028136447072029114, "rewards/margins": 0.3533446788787842, "rewards/rejected": -0.3814811110496521, "sft_loss": 0.28136447072029114, "step": 2242 }, { "epoch": 3.2436731742588574, "grad_norm": 2.3767532424730455, "learning_rate": 3.627834047740957e-06, "logits/chosen": -0.5548741221427917, "logits/rejected": -0.44200655817985535, "logps/chosen": -0.16056154668331146, "logps/rejected": -3.1678218841552734, "loss": 0.1749, "odds_ratio_loss": 0.023494180291891098, "rewards/accuracies": 1.0, "rewards/chosen": -0.016056153923273087, "rewards/margins": 0.300726056098938, "rewards/rejected": -0.3167821764945984, "sft_loss": 0.16056154668331146, "step": 2243 }, { "epoch": 3.245119305856833, "grad_norm": 1.8650655116730133, "learning_rate": 3.624741757715359e-06, "logits/chosen": -0.6051979660987854, "logits/rejected": -0.4340522587299347, "logps/chosen": -0.19842886924743652, "logps/rejected": -3.8932723999023438, "loss": 0.1699, "odds_ratio_loss": 0.035827018320560455, "rewards/accuracies": 1.0, "rewards/chosen": -0.019842887297272682, "rewards/margins": 0.3694843649864197, "rewards/rejected": -0.3893272876739502, "sft_loss": 0.19842886924743652, "step": 2244 }, { "epoch": 3.2465654374548083, "grad_norm": 1.8596281529466636, "learning_rate": 3.621649693934353e-06, "logits/chosen": -0.5226513147354126, "logits/rejected": -0.49524396657943726, "logps/chosen": -0.14361608028411865, "logps/rejected": -4.346930980682373, "loss": 0.177, "odds_ratio_loss": 0.02752107009291649, "rewards/accuracies": 1.0, "rewards/chosen": -0.014361606910824776, "rewards/margins": 0.4203314781188965, "rewards/rejected": -0.4346930682659149, "sft_loss": 0.14361608028411865, "step": 2245 }, { "epoch": 3.248011569052784, "grad_norm": 2.174693078196969, "learning_rate": 3.6185578582621573e-06, "logits/chosen": -0.5399285554885864, "logits/rejected": -0.4264034628868103, "logps/chosen": -0.2987480163574219, "logps/rejected": -4.435233116149902, "loss": 0.1745, "odds_ratio_loss": 0.05470084026455879, "rewards/accuracies": 1.0, "rewards/chosen": -0.02987479791045189, "rewards/margins": 0.41364848613739014, "rewards/rejected": -0.4435232877731323, "sft_loss": 0.2987480163574219, "step": 2246 }, { "epoch": 3.2494577006507592, "grad_norm": 1.9731539471015809, "learning_rate": 3.6154662525628474e-06, "logits/chosen": -0.6945093274116516, "logits/rejected": -0.5601460337638855, "logps/chosen": -0.15286661684513092, "logps/rejected": -3.5629913806915283, "loss": 0.1972, "odds_ratio_loss": 0.05991650000214577, "rewards/accuracies": 1.0, "rewards/chosen": -0.015286662615835667, "rewards/margins": 0.34101247787475586, "rewards/rejected": -0.35629913210868835, "sft_loss": 0.15286661684513092, "step": 2247 }, { "epoch": 3.2509038322487345, "grad_norm": 1.9180427426151503, "learning_rate": 3.612374878700368e-06, "logits/chosen": -0.4100998640060425, "logits/rejected": -0.5385414958000183, "logps/chosen": -0.22914831340312958, "logps/rejected": -4.232925891876221, "loss": 0.1886, "odds_ratio_loss": 0.07522237300872803, "rewards/accuracies": 1.0, "rewards/chosen": -0.022914830595254898, "rewards/margins": 0.4003777801990509, "rewards/rejected": -0.4232926070690155, "sft_loss": 0.22914831340312958, "step": 2248 }, { "epoch": 3.25234996384671, "grad_norm": 2.1307859276198164, "learning_rate": 3.609283738538517e-06, "logits/chosen": -0.673978328704834, "logits/rejected": -0.42554956674575806, "logps/chosen": -0.24228356778621674, "logps/rejected": -5.043076992034912, "loss": 0.2614, "odds_ratio_loss": 0.04325840622186661, "rewards/accuracies": 1.0, "rewards/chosen": -0.024228356778621674, "rewards/margins": 0.4800793528556824, "rewards/rejected": -0.5043076872825623, "sft_loss": 0.24228356778621674, "step": 2249 }, { "epoch": 3.2537960954446854, "grad_norm": 2.377612293864264, "learning_rate": 3.606192833940956e-06, "logits/chosen": -0.5075133442878723, "logits/rejected": -0.3776240646839142, "logps/chosen": -0.139189213514328, "logps/rejected": -1.9026098251342773, "loss": 0.1694, "odds_ratio_loss": 0.06290970742702484, "rewards/accuracies": 1.0, "rewards/chosen": -0.01391892321407795, "rewards/margins": 0.17634207010269165, "rewards/rejected": -0.19026097655296326, "sft_loss": 0.139189213514328, "step": 2250 }, { "epoch": 3.255242227042661, "grad_norm": 1.94278364933995, "learning_rate": 3.6031021667712e-06, "logits/chosen": -0.5642733573913574, "logits/rejected": -0.38058435916900635, "logps/chosen": -0.15122462809085846, "logps/rejected": -5.234777450561523, "loss": 0.183, "odds_ratio_loss": 0.027925534173846245, "rewards/accuracies": 1.0, "rewards/chosen": -0.015122463926672935, "rewards/margins": 0.5083552598953247, "rewards/rejected": -0.5234777927398682, "sft_loss": 0.15122462809085846, "step": 2251 }, { "epoch": 3.2566883586406363, "grad_norm": 1.7092281341499098, "learning_rate": 3.600011738892628e-06, "logits/chosen": -0.4542877674102783, "logits/rejected": -0.4579513370990753, "logps/chosen": -0.08828302472829819, "logps/rejected": -4.268548488616943, "loss": 0.1504, "odds_ratio_loss": 0.020018287003040314, "rewards/accuracies": 1.0, "rewards/chosen": -0.008828302845358849, "rewards/margins": 0.41802653670310974, "rewards/rejected": -0.4268548786640167, "sft_loss": 0.08828302472829819, "step": 2252 }, { "epoch": 3.2581344902386116, "grad_norm": 2.4301885734804634, "learning_rate": 3.5969215521684673e-06, "logits/chosen": -0.5045682191848755, "logits/rejected": -0.3198281228542328, "logps/chosen": -0.17714548110961914, "logps/rejected": -2.6474297046661377, "loss": 0.1836, "odds_ratio_loss": 0.044076722115278244, "rewards/accuracies": 1.0, "rewards/chosen": -0.017714550718665123, "rewards/margins": 0.2470283955335617, "rewards/rejected": -0.26474297046661377, "sft_loss": 0.17714548110961914, "step": 2253 }, { "epoch": 3.2595806218365873, "grad_norm": 2.274847989564652, "learning_rate": 3.593831608461805e-06, "logits/chosen": -0.7203308939933777, "logits/rejected": -0.5131949782371521, "logps/chosen": -0.17507284879684448, "logps/rejected": -5.116349220275879, "loss": 0.201, "odds_ratio_loss": 0.03700259327888489, "rewards/accuracies": 1.0, "rewards/chosen": -0.017507284879684448, "rewards/margins": 0.49412769079208374, "rewards/rejected": -0.5116349458694458, "sft_loss": 0.17507284879684448, "step": 2254 }, { "epoch": 3.2610267534345625, "grad_norm": 2.2182355679207766, "learning_rate": 3.5907419096355768e-06, "logits/chosen": -0.40628182888031006, "logits/rejected": -0.2812096178531647, "logps/chosen": -0.15741974115371704, "logps/rejected": -4.353770732879639, "loss": 0.1389, "odds_ratio_loss": 0.03497250750660896, "rewards/accuracies": 1.0, "rewards/chosen": -0.015741974115371704, "rewards/margins": 0.4196351170539856, "rewards/rejected": -0.4353770911693573, "sft_loss": 0.15741974115371704, "step": 2255 }, { "epoch": 3.2624728850325377, "grad_norm": 2.6818134401064424, "learning_rate": 3.5876524575525774e-06, "logits/chosen": -0.6847485303878784, "logits/rejected": -0.6329970955848694, "logps/chosen": -0.1577267199754715, "logps/rejected": -3.3478763103485107, "loss": 0.222, "odds_ratio_loss": 0.05068032443523407, "rewards/accuracies": 1.0, "rewards/chosen": -0.01577267237007618, "rewards/margins": 0.3190149664878845, "rewards/rejected": -0.33478766679763794, "sft_loss": 0.1577267199754715, "step": 2256 }, { "epoch": 3.2639190166305134, "grad_norm": 2.3388227044279497, "learning_rate": 3.584563254075446e-06, "logits/chosen": -0.476225346326828, "logits/rejected": -0.3441811501979828, "logps/chosen": -0.17915165424346924, "logps/rejected": -3.2945075035095215, "loss": 0.1806, "odds_ratio_loss": 0.05011521279811859, "rewards/accuracies": 1.0, "rewards/chosen": -0.017915166914463043, "rewards/margins": 0.3115355968475342, "rewards/rejected": -0.329450786113739, "sft_loss": 0.17915165424346924, "step": 2257 }, { "epoch": 3.2653651482284887, "grad_norm": 1.9674045184480031, "learning_rate": 3.5814743010666757e-06, "logits/chosen": -0.5762167572975159, "logits/rejected": -0.3875581622123718, "logps/chosen": -0.1739146113395691, "logps/rejected": -3.7504653930664062, "loss": 0.159, "odds_ratio_loss": 0.044472791254520416, "rewards/accuracies": 1.0, "rewards/chosen": -0.01739146001636982, "rewards/margins": 0.3576550781726837, "rewards/rejected": -0.3750465214252472, "sft_loss": 0.1739146113395691, "step": 2258 }, { "epoch": 3.2668112798264644, "grad_norm": 2.8547990030832735, "learning_rate": 3.578385600388609e-06, "logits/chosen": -0.4150635600090027, "logits/rejected": -0.3737567067146301, "logps/chosen": -0.19305887818336487, "logps/rejected": -5.184538841247559, "loss": 0.1916, "odds_ratio_loss": 0.02490687370300293, "rewards/accuracies": 1.0, "rewards/chosen": -0.019305888563394547, "rewards/margins": 0.49914807081222534, "rewards/rejected": -0.5184538960456848, "sft_loss": 0.19305887818336487, "step": 2259 }, { "epoch": 3.2682574114244396, "grad_norm": 1.9041525202532068, "learning_rate": 3.575297153903434e-06, "logits/chosen": -0.6946017742156982, "logits/rejected": -0.5355392694473267, "logps/chosen": -0.13331067562103271, "logps/rejected": -5.4609198570251465, "loss": 0.1833, "odds_ratio_loss": 0.014529845677316189, "rewards/accuracies": 1.0, "rewards/chosen": -0.013331068679690361, "rewards/margins": 0.5327609181404114, "rewards/rejected": -0.5460919737815857, "sft_loss": 0.13331067562103271, "step": 2260 }, { "epoch": 3.2697035430224153, "grad_norm": 1.914055513543421, "learning_rate": 3.5722089634731868e-06, "logits/chosen": -0.4900306463241577, "logits/rejected": -0.3406396210193634, "logps/chosen": -0.11849421262741089, "logps/rejected": -5.536858081817627, "loss": 0.195, "odds_ratio_loss": 0.007503737695515156, "rewards/accuracies": 1.0, "rewards/chosen": -0.011849422007799149, "rewards/margins": 0.5418363809585571, "rewards/rejected": -0.5536858439445496, "sft_loss": 0.11849421262741089, "step": 2261 }, { "epoch": 3.2711496746203905, "grad_norm": 2.1438021427198737, "learning_rate": 3.5691210309597473e-06, "logits/chosen": -0.6778137683868408, "logits/rejected": -0.5302555561065674, "logps/chosen": -0.10795378684997559, "logps/rejected": -4.853133201599121, "loss": 0.1451, "odds_ratio_loss": 0.02070033550262451, "rewards/accuracies": 1.0, "rewards/chosen": -0.010795379988849163, "rewards/margins": 0.47451794147491455, "rewards/rejected": -0.485313355922699, "sft_loss": 0.10795378684997559, "step": 2262 }, { "epoch": 3.2725958062183658, "grad_norm": 1.9860009136951866, "learning_rate": 3.566033358224845e-06, "logits/chosen": -0.799131453037262, "logits/rejected": -0.49861404299736023, "logps/chosen": -0.12334741652011871, "logps/rejected": -4.193481922149658, "loss": 0.1469, "odds_ratio_loss": 0.028686096891760826, "rewards/accuracies": 1.0, "rewards/chosen": -0.012334741652011871, "rewards/margins": 0.4070134162902832, "rewards/rejected": -0.41934818029403687, "sft_loss": 0.12334741652011871, "step": 2263 }, { "epoch": 3.2740419378163415, "grad_norm": 1.7714968208013866, "learning_rate": 3.5629459471300455e-06, "logits/chosen": -0.5801678895950317, "logits/rejected": -0.38160601258277893, "logps/chosen": -0.12812554836273193, "logps/rejected": -5.513360977172852, "loss": 0.1586, "odds_ratio_loss": 0.022318800911307335, "rewards/accuracies": 1.0, "rewards/chosen": -0.012812554836273193, "rewards/margins": 0.5385234951972961, "rewards/rejected": -0.5513360500335693, "sft_loss": 0.12812554836273193, "step": 2264 }, { "epoch": 3.2754880694143167, "grad_norm": 1.781393491270812, "learning_rate": 3.5598587995367645e-06, "logits/chosen": -0.48313915729522705, "logits/rejected": -0.34745123982429504, "logps/chosen": -0.12040194869041443, "logps/rejected": -2.534296989440918, "loss": 0.1496, "odds_ratio_loss": 0.041830502450466156, "rewards/accuracies": 1.0, "rewards/chosen": -0.012040195055305958, "rewards/margins": 0.24138952791690826, "rewards/rejected": -0.25342971086502075, "sft_loss": 0.12040194869041443, "step": 2265 }, { "epoch": 3.276934201012292, "grad_norm": 2.249398935208769, "learning_rate": 3.5567719173062503e-06, "logits/chosen": -0.7603170871734619, "logits/rejected": -0.4521884322166443, "logps/chosen": -0.11175563186407089, "logps/rejected": -4.254090785980225, "loss": 0.1363, "odds_ratio_loss": 0.010375426150858402, "rewards/accuracies": 1.0, "rewards/chosen": -0.011175563558936119, "rewards/margins": 0.4142335057258606, "rewards/rejected": -0.42540907859802246, "sft_loss": 0.11175563186407089, "step": 2266 }, { "epoch": 3.2783803326102676, "grad_norm": 1.9991795543356456, "learning_rate": 3.553685302299599e-06, "logits/chosen": -0.3749878406524658, "logits/rejected": -0.27797868847846985, "logps/chosen": -0.10831344127655029, "logps/rejected": -4.4174909591674805, "loss": 0.1216, "odds_ratio_loss": 0.02435128577053547, "rewards/accuracies": 1.0, "rewards/chosen": -0.010831343941390514, "rewards/margins": 0.43091776967048645, "rewards/rejected": -0.4417491555213928, "sft_loss": 0.10831344127655029, "step": 2267 }, { "epoch": 3.279826464208243, "grad_norm": 2.1564064092160726, "learning_rate": 3.5505989563777402e-06, "logits/chosen": -0.5478772521018982, "logits/rejected": -0.35927248001098633, "logps/chosen": -0.12505735456943512, "logps/rejected": -4.873144149780273, "loss": 0.1495, "odds_ratio_loss": 0.01490817777812481, "rewards/accuracies": 1.0, "rewards/chosen": -0.012505735270678997, "rewards/margins": 0.4748087227344513, "rewards/rejected": -0.48731446266174316, "sft_loss": 0.12505735456943512, "step": 2268 }, { "epoch": 3.2812725958062186, "grad_norm": 1.8682733125562931, "learning_rate": 3.5475128814014457e-06, "logits/chosen": -0.6678739190101624, "logits/rejected": -0.5783143043518066, "logps/chosen": -0.32308146357536316, "logps/rejected": -3.722277879714966, "loss": 0.1831, "odds_ratio_loss": 0.08787070214748383, "rewards/accuracies": 0.9375, "rewards/chosen": -0.032308146357536316, "rewards/margins": 0.3399196267127991, "rewards/rejected": -0.3722277879714966, "sft_loss": 0.32308146357536316, "step": 2269 }, { "epoch": 3.282718727404194, "grad_norm": 2.2558187195382517, "learning_rate": 3.5444270792313196e-06, "logits/chosen": -0.6178706884384155, "logits/rejected": -0.4912964105606079, "logps/chosen": -0.18187105655670166, "logps/rejected": -6.225649833679199, "loss": 0.1925, "odds_ratio_loss": 0.014290587976574898, "rewards/accuracies": 1.0, "rewards/chosen": -0.018187105655670166, "rewards/margins": 0.6043779253959656, "rewards/rejected": -0.6225650310516357, "sft_loss": 0.18187105655670166, "step": 2270 }, { "epoch": 3.284164859002169, "grad_norm": 2.0678723535377905, "learning_rate": 3.5413415517278033e-06, "logits/chosen": -0.5242627859115601, "logits/rejected": -0.46460655331611633, "logps/chosen": -0.26317670941352844, "logps/rejected": -4.386211395263672, "loss": 0.2566, "odds_ratio_loss": 0.05664074793457985, "rewards/accuracies": 1.0, "rewards/chosen": -0.026317669078707695, "rewards/margins": 0.41230350732803345, "rewards/rejected": -0.4386211633682251, "sft_loss": 0.26317670941352844, "step": 2271 }, { "epoch": 3.2856109906001447, "grad_norm": 2.296493485099005, "learning_rate": 3.5382563007511754e-06, "logits/chosen": -0.601801872253418, "logits/rejected": -0.3562720715999603, "logps/chosen": -0.25675979256629944, "logps/rejected": -3.735896110534668, "loss": 0.2085, "odds_ratio_loss": 0.04126711189746857, "rewards/accuracies": 1.0, "rewards/chosen": -0.025675982236862183, "rewards/margins": 0.3479136526584625, "rewards/rejected": -0.3735896348953247, "sft_loss": 0.25675979256629944, "step": 2272 }, { "epoch": 3.28705712219812, "grad_norm": 2.251694032374471, "learning_rate": 3.535171328161542e-06, "logits/chosen": -0.5955110788345337, "logits/rejected": -0.4887436032295227, "logps/chosen": -0.3352237939834595, "logps/rejected": -4.323999404907227, "loss": 0.2193, "odds_ratio_loss": 0.017490914091467857, "rewards/accuracies": 1.0, "rewards/chosen": -0.03352237865328789, "rewards/margins": 0.3988775610923767, "rewards/rejected": -0.4323999583721161, "sft_loss": 0.3352237939834595, "step": 2273 }, { "epoch": 3.2885032537960956, "grad_norm": 2.075285424913544, "learning_rate": 3.532086635818848e-06, "logits/chosen": -0.42095768451690674, "logits/rejected": -0.31674113869667053, "logps/chosen": -0.2579844892024994, "logps/rejected": -4.457534313201904, "loss": 0.2311, "odds_ratio_loss": 0.040766119956970215, "rewards/accuracies": 1.0, "rewards/chosen": -0.025798451155424118, "rewards/margins": 0.4199550449848175, "rewards/rejected": -0.4457534849643707, "sft_loss": 0.2579844892024994, "step": 2274 }, { "epoch": 3.289949385394071, "grad_norm": 2.193805076834671, "learning_rate": 3.529002225582862e-06, "logits/chosen": -0.6587631106376648, "logits/rejected": -0.4716310501098633, "logps/chosen": -0.1755092442035675, "logps/rejected": -3.7191784381866455, "loss": 0.1745, "odds_ratio_loss": 0.04562509059906006, "rewards/accuracies": 1.0, "rewards/chosen": -0.01755092665553093, "rewards/margins": 0.3543669283390045, "rewards/rejected": -0.37191784381866455, "sft_loss": 0.1755092442035675, "step": 2275 }, { "epoch": 3.291395516992046, "grad_norm": 2.0765831775562518, "learning_rate": 3.5259180993131893e-06, "logits/chosen": -0.5486062169075012, "logits/rejected": -0.45100995898246765, "logps/chosen": -0.17713306844234467, "logps/rejected": -2.9269280433654785, "loss": 0.2267, "odds_ratio_loss": 0.04936151206493378, "rewards/accuracies": 1.0, "rewards/chosen": -0.017713308334350586, "rewards/margins": 0.27497947216033936, "rewards/rejected": -0.29269278049468994, "sft_loss": 0.17713306844234467, "step": 2276 }, { "epoch": 3.292841648590022, "grad_norm": 2.1171590132243105, "learning_rate": 3.5228342588692603e-06, "logits/chosen": -0.36625778675079346, "logits/rejected": -0.26877880096435547, "logps/chosen": -0.22854745388031006, "logps/rejected": -4.398021221160889, "loss": 0.2072, "odds_ratio_loss": 0.021160855889320374, "rewards/accuracies": 1.0, "rewards/chosen": -0.022854745388031006, "rewards/margins": 0.4169473350048065, "rewards/rejected": -0.4398021101951599, "sft_loss": 0.22854745388031006, "step": 2277 }, { "epoch": 3.294287780187997, "grad_norm": 3.1829007081069176, "learning_rate": 3.519750706110334e-06, "logits/chosen": -0.5905520915985107, "logits/rejected": -0.4183228611946106, "logps/chosen": -0.12734369933605194, "logps/rejected": -4.160774230957031, "loss": 0.1687, "odds_ratio_loss": 0.03372897580265999, "rewards/accuracies": 1.0, "rewards/chosen": -0.012734370306134224, "rewards/margins": 0.4033430218696594, "rewards/rejected": -0.4160774350166321, "sft_loss": 0.12734369933605194, "step": 2278 }, { "epoch": 3.2957339117859723, "grad_norm": 2.1113324149581594, "learning_rate": 3.516667442895494e-06, "logits/chosen": -0.6072694659233093, "logits/rejected": -0.3589474558830261, "logps/chosen": -0.279439240694046, "logps/rejected": -3.625382423400879, "loss": 0.2092, "odds_ratio_loss": 0.06730879098176956, "rewards/accuracies": 1.0, "rewards/chosen": -0.027943922206759453, "rewards/margins": 0.3345942795276642, "rewards/rejected": -0.36253821849823, "sft_loss": 0.279439240694046, "step": 2279 }, { "epoch": 3.297180043383948, "grad_norm": 3.3151954739010767, "learning_rate": 3.5135844710836545e-06, "logits/chosen": -0.40881648659706116, "logits/rejected": -0.3644575774669647, "logps/chosen": -0.3337644338607788, "logps/rejected": -5.119471549987793, "loss": 0.2165, "odds_ratio_loss": 0.09850729256868362, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03337644785642624, "rewards/margins": 0.478570818901062, "rewards/rejected": -0.5119472742080688, "sft_loss": 0.3337644338607788, "step": 2280 }, { "epoch": 3.2986261749819232, "grad_norm": 2.007323942505847, "learning_rate": 3.510501792533548e-06, "logits/chosen": -0.6884647607803345, "logits/rejected": -0.6225319504737854, "logps/chosen": -0.23053047060966492, "logps/rejected": -3.772772789001465, "loss": 0.1856, "odds_ratio_loss": 0.058163873851299286, "rewards/accuracies": 1.0, "rewards/chosen": -0.02305304817855358, "rewards/margins": 0.35422423481941223, "rewards/rejected": -0.37727731466293335, "sft_loss": 0.23053047060966492, "step": 2281 }, { "epoch": 3.300072306579899, "grad_norm": 2.1743690077166784, "learning_rate": 3.5074194091037354e-06, "logits/chosen": -0.9183458089828491, "logits/rejected": -0.6526041626930237, "logps/chosen": -0.22743211686611176, "logps/rejected": -2.442390203475952, "loss": 0.2149, "odds_ratio_loss": 0.05146101117134094, "rewards/accuracies": 1.0, "rewards/chosen": -0.022743212059140205, "rewards/margins": 0.2214958220720291, "rewards/rejected": -0.24423903226852417, "sft_loss": 0.22743211686611176, "step": 2282 }, { "epoch": 3.301518438177874, "grad_norm": 2.643722453059619, "learning_rate": 3.5043373226525933e-06, "logits/chosen": -0.6142512559890747, "logits/rejected": -0.4808671474456787, "logps/chosen": -0.1182275265455246, "logps/rejected": -4.5574140548706055, "loss": 0.1724, "odds_ratio_loss": 0.021381031721830368, "rewards/accuracies": 1.0, "rewards/chosen": -0.011822751723229885, "rewards/margins": 0.4439186751842499, "rewards/rejected": -0.4557414650917053, "sft_loss": 0.1182275265455246, "step": 2283 }, { "epoch": 3.30296456977585, "grad_norm": 2.7578652269026644, "learning_rate": 3.5012555350383265e-06, "logits/chosen": -0.46577584743499756, "logits/rejected": -0.3899204134941101, "logps/chosen": -0.1679561287164688, "logps/rejected": -3.679994583129883, "loss": 0.1715, "odds_ratio_loss": 0.012146007269620895, "rewards/accuracies": 1.0, "rewards/chosen": -0.016795611009001732, "rewards/margins": 0.3512038588523865, "rewards/rejected": -0.36799943447113037, "sft_loss": 0.1679561287164688, "step": 2284 }, { "epoch": 3.304410701373825, "grad_norm": 1.983775119024542, "learning_rate": 3.4981740481189557e-06, "logits/chosen": -0.6946292519569397, "logits/rejected": -0.606383204460144, "logps/chosen": -0.24009007215499878, "logps/rejected": -2.6545491218566895, "loss": 0.1572, "odds_ratio_loss": 0.039150357246398926, "rewards/accuracies": 1.0, "rewards/chosen": -0.02400900609791279, "rewards/margins": 0.2414458990097046, "rewards/rejected": -0.26545488834381104, "sft_loss": 0.24009007215499878, "step": 2285 }, { "epoch": 3.3058568329718003, "grad_norm": 2.1786049400838388, "learning_rate": 3.495092863752319e-06, "logits/chosen": -0.6590663194656372, "logits/rejected": -0.5998579263687134, "logps/chosen": -0.1764366626739502, "logps/rejected": -3.5466344356536865, "loss": 0.2583, "odds_ratio_loss": 0.03589484095573425, "rewards/accuracies": 1.0, "rewards/chosen": -0.01764366589486599, "rewards/margins": 0.33701977133750916, "rewards/rejected": -0.3546634614467621, "sft_loss": 0.1764366626739502, "step": 2286 }, { "epoch": 3.307302964569776, "grad_norm": 2.1572867888890936, "learning_rate": 3.4920119837960764e-06, "logits/chosen": -0.6842783093452454, "logits/rejected": -0.6382707357406616, "logps/chosen": -0.09783312678337097, "logps/rejected": -5.192519187927246, "loss": 0.1635, "odds_ratio_loss": 0.010324829258024693, "rewards/accuracies": 1.0, "rewards/chosen": -0.009783312678337097, "rewards/margins": 0.5094686150550842, "rewards/rejected": -0.5192519426345825, "sft_loss": 0.09783312678337097, "step": 2287 }, { "epoch": 3.3087490961677513, "grad_norm": 2.5485270310682706, "learning_rate": 3.4889314101077006e-06, "logits/chosen": -0.7338652610778809, "logits/rejected": -0.5132858157157898, "logps/chosen": -0.18935944139957428, "logps/rejected": -3.8639068603515625, "loss": 0.2015, "odds_ratio_loss": 0.028721699491143227, "rewards/accuracies": 1.0, "rewards/chosen": -0.018935944885015488, "rewards/margins": 0.36745476722717285, "rewards/rejected": -0.38639071583747864, "sft_loss": 0.18935944139957428, "step": 2288 }, { "epoch": 3.3101952277657265, "grad_norm": 2.185540071670579, "learning_rate": 3.4858511445444814e-06, "logits/chosen": -0.5446025729179382, "logits/rejected": -0.48353853821754456, "logps/chosen": -0.18259915709495544, "logps/rejected": -2.570195198059082, "loss": 0.1704, "odds_ratio_loss": 0.05168168246746063, "rewards/accuracies": 1.0, "rewards/chosen": -0.018259914591908455, "rewards/margins": 0.2387596219778061, "rewards/rejected": -0.2570195496082306, "sft_loss": 0.18259915709495544, "step": 2289 }, { "epoch": 3.311641359363702, "grad_norm": 2.1793350901491615, "learning_rate": 3.4827711889635207e-06, "logits/chosen": -0.6570934057235718, "logits/rejected": -0.5295805335044861, "logps/chosen": -0.12861356139183044, "logps/rejected": -5.604830265045166, "loss": 0.1796, "odds_ratio_loss": 0.017030876129865646, "rewards/accuracies": 1.0, "rewards/chosen": -0.012861356139183044, "rewards/margins": 0.5476217269897461, "rewards/rejected": -0.5604830980300903, "sft_loss": 0.12861356139183044, "step": 2290 }, { "epoch": 3.3130874909616774, "grad_norm": 2.0587136754455586, "learning_rate": 3.4796915452217376e-06, "logits/chosen": -0.5374751687049866, "logits/rejected": -0.5378148555755615, "logps/chosen": -0.25962379574775696, "logps/rejected": -2.395240068435669, "loss": 0.2276, "odds_ratio_loss": 0.06650760024785995, "rewards/accuracies": 1.0, "rewards/chosen": -0.025962380692362785, "rewards/margins": 0.21356163918972015, "rewards/rejected": -0.2395240217447281, "sft_loss": 0.25962379574775696, "step": 2291 }, { "epoch": 3.314533622559653, "grad_norm": 1.944851752443858, "learning_rate": 3.4766122151758595e-06, "logits/chosen": -0.5675870180130005, "logits/rejected": -0.3958700895309448, "logps/chosen": -0.25842157006263733, "logps/rejected": -4.663134574890137, "loss": 0.2255, "odds_ratio_loss": 0.04698663204908371, "rewards/accuracies": 1.0, "rewards/chosen": -0.025842156261205673, "rewards/margins": 0.4404713213443756, "rewards/rejected": -0.46631351113319397, "sft_loss": 0.25842157006263733, "step": 2292 }, { "epoch": 3.3159797541576284, "grad_norm": 2.7882638415952177, "learning_rate": 3.473533200682427e-06, "logits/chosen": -0.4775904715061188, "logits/rejected": -0.5372059345245361, "logps/chosen": -0.15239141881465912, "logps/rejected": -2.723569869995117, "loss": 0.2259, "odds_ratio_loss": 0.04135645925998688, "rewards/accuracies": 1.0, "rewards/chosen": -0.015239142812788486, "rewards/margins": 0.2571178376674652, "rewards/rejected": -0.2723569869995117, "sft_loss": 0.15239141881465912, "step": 2293 }, { "epoch": 3.3174258857556036, "grad_norm": 2.3481373712095963, "learning_rate": 3.4704545035977866e-06, "logits/chosen": -0.7102236151695251, "logits/rejected": -0.5002355575561523, "logps/chosen": -0.1865234225988388, "logps/rejected": -2.455569267272949, "loss": 0.2064, "odds_ratio_loss": 0.05270793288946152, "rewards/accuracies": 1.0, "rewards/chosen": -0.01865234225988388, "rewards/margins": 0.22690460085868835, "rewards/rejected": -0.24555695056915283, "sft_loss": 0.1865234225988388, "step": 2294 }, { "epoch": 3.3188720173535793, "grad_norm": 2.0824459255780283, "learning_rate": 3.4673761257781e-06, "logits/chosen": -0.6822003126144409, "logits/rejected": -0.4748939871788025, "logps/chosen": -0.13890133798122406, "logps/rejected": -6.211273670196533, "loss": 0.205, "odds_ratio_loss": 0.009621636010706425, "rewards/accuracies": 1.0, "rewards/chosen": -0.013890134170651436, "rewards/margins": 0.6072372794151306, "rewards/rejected": -0.6211273670196533, "sft_loss": 0.13890133798122406, "step": 2295 }, { "epoch": 3.3203181489515545, "grad_norm": 2.061127162028327, "learning_rate": 3.46429806907933e-06, "logits/chosen": -0.7757879495620728, "logits/rejected": -0.5989354848861694, "logps/chosen": -0.1916319727897644, "logps/rejected": -3.734245777130127, "loss": 0.1461, "odds_ratio_loss": 0.027212627232074738, "rewards/accuracies": 1.0, "rewards/chosen": -0.01916319690644741, "rewards/margins": 0.3542613983154297, "rewards/rejected": -0.37342458963394165, "sft_loss": 0.1916319727897644, "step": 2296 }, { "epoch": 3.32176428054953, "grad_norm": 2.114351797322761, "learning_rate": 3.4612203353572503e-06, "logits/chosen": -0.6353336572647095, "logits/rejected": -0.5170816779136658, "logps/chosen": -0.1459745466709137, "logps/rejected": -4.982494831085205, "loss": 0.2044, "odds_ratio_loss": 0.020803630352020264, "rewards/accuracies": 1.0, "rewards/chosen": -0.014597454108297825, "rewards/margins": 0.4836519956588745, "rewards/rejected": -0.49824947118759155, "sft_loss": 0.1459745466709137, "step": 2297 }, { "epoch": 3.3232104121475055, "grad_norm": 1.946211322014238, "learning_rate": 3.458142926467435e-06, "logits/chosen": -0.3869672119617462, "logits/rejected": -0.4443845748901367, "logps/chosen": -0.12994037568569183, "logps/rejected": -5.26711893081665, "loss": 0.1448, "odds_ratio_loss": 0.0468423031270504, "rewards/accuracies": 1.0, "rewards/chosen": -0.012994037009775639, "rewards/margins": 0.5137178897857666, "rewards/rejected": -0.5267119407653809, "sft_loss": 0.12994037568569183, "step": 2298 }, { "epoch": 3.3246565437454807, "grad_norm": 1.9993034368183027, "learning_rate": 3.4550658442652686e-06, "logits/chosen": -0.5554428100585938, "logits/rejected": -0.40344059467315674, "logps/chosen": -0.1770596206188202, "logps/rejected": -5.824186325073242, "loss": 0.2275, "odds_ratio_loss": 0.009020314551889896, "rewards/accuracies": 1.0, "rewards/chosen": -0.01770596206188202, "rewards/margins": 0.564712643623352, "rewards/rejected": -0.5824186205863953, "sft_loss": 0.1770596206188202, "step": 2299 }, { "epoch": 3.3261026753434564, "grad_norm": 1.9294061959051394, "learning_rate": 3.4519890906059354e-06, "logits/chosen": -0.6760809421539307, "logits/rejected": -0.5845804214477539, "logps/chosen": -0.14634880423545837, "logps/rejected": -3.8098344802856445, "loss": 0.1445, "odds_ratio_loss": 0.03921994939446449, "rewards/accuracies": 1.0, "rewards/chosen": -0.014634879305958748, "rewards/margins": 0.3663485646247864, "rewards/rejected": -0.3809834420681, "sft_loss": 0.14634880423545837, "step": 2300 }, { "epoch": 3.3275488069414316, "grad_norm": 7.102938265333184, "learning_rate": 3.448912667344418e-06, "logits/chosen": -0.7923925518989563, "logits/rejected": -0.5611757040023804, "logps/chosen": -0.190931037068367, "logps/rejected": -3.764523506164551, "loss": 0.1962, "odds_ratio_loss": 0.035255275666713715, "rewards/accuracies": 1.0, "rewards/chosen": -0.0190931037068367, "rewards/margins": 0.35735929012298584, "rewards/rejected": -0.37645238637924194, "sft_loss": 0.190931037068367, "step": 2301 }, { "epoch": 3.328994938539407, "grad_norm": 1.9369947329468233, "learning_rate": 3.445836576335508e-06, "logits/chosen": -0.6175248622894287, "logits/rejected": -0.5267677903175354, "logps/chosen": -0.2175239473581314, "logps/rejected": -3.4381308555603027, "loss": 0.1882, "odds_ratio_loss": 0.041964493691921234, "rewards/accuracies": 1.0, "rewards/chosen": -0.02175239287316799, "rewards/margins": 0.3220607042312622, "rewards/rejected": -0.34381306171417236, "sft_loss": 0.2175239473581314, "step": 2302 }, { "epoch": 3.3304410701373826, "grad_norm": 2.1746368999366212, "learning_rate": 3.4427608194337895e-06, "logits/chosen": -0.8088559508323669, "logits/rejected": -0.731338381767273, "logps/chosen": -0.23601864278316498, "logps/rejected": -2.960899829864502, "loss": 0.1881, "odds_ratio_loss": 0.05574827641248703, "rewards/accuracies": 1.0, "rewards/chosen": -0.023601865395903587, "rewards/margins": 0.2724881172180176, "rewards/rejected": -0.2960899770259857, "sft_loss": 0.23601864278316498, "step": 2303 }, { "epoch": 3.331887201735358, "grad_norm": 2.3661530179952908, "learning_rate": 3.4396853984936487e-06, "logits/chosen": -0.4649229049682617, "logits/rejected": -0.3578875660896301, "logps/chosen": -0.09250445663928986, "logps/rejected": -3.7127480506896973, "loss": 0.1542, "odds_ratio_loss": 0.017382729798555374, "rewards/accuracies": 1.0, "rewards/chosen": -0.00925044622272253, "rewards/margins": 0.36202433705329895, "rewards/rejected": -0.37127479910850525, "sft_loss": 0.09250445663928986, "step": 2304 }, { "epoch": 3.3333333333333335, "grad_norm": 3.0663201292058044, "learning_rate": 3.4366103153692667e-06, "logits/chosen": -0.5506351590156555, "logits/rejected": -0.4261362850666046, "logps/chosen": -0.35145315527915955, "logps/rejected": -2.7159597873687744, "loss": 0.244, "odds_ratio_loss": 0.06705104559659958, "rewards/accuracies": 1.0, "rewards/chosen": -0.03514531999826431, "rewards/margins": 0.2364506721496582, "rewards/rejected": -0.27159595489501953, "sft_loss": 0.35145315527915955, "step": 2305 }, { "epoch": 3.3347794649313087, "grad_norm": 2.187765405868488, "learning_rate": 3.433535571914625e-06, "logits/chosen": -0.6699624061584473, "logits/rejected": -0.516162633895874, "logps/chosen": -0.11950968205928802, "logps/rejected": -4.081395626068115, "loss": 0.186, "odds_ratio_loss": 0.01648012362420559, "rewards/accuracies": 1.0, "rewards/chosen": -0.011950968764722347, "rewards/margins": 0.3961886167526245, "rewards/rejected": -0.40813958644866943, "sft_loss": 0.11950968205928802, "step": 2306 }, { "epoch": 3.3362255965292844, "grad_norm": 3.9448298831896684, "learning_rate": 3.430461169983497e-06, "logits/chosen": -0.5674360394477844, "logits/rejected": -0.48793601989746094, "logps/chosen": -0.17865288257598877, "logps/rejected": -4.214674949645996, "loss": 0.1906, "odds_ratio_loss": 0.07441666722297668, "rewards/accuracies": 0.9375, "rewards/chosen": -0.017865289002656937, "rewards/margins": 0.4036021828651428, "rewards/rejected": -0.42146748304367065, "sft_loss": 0.17865288257598877, "step": 2307 }, { "epoch": 3.3376717281272597, "grad_norm": 1.970348244123892, "learning_rate": 3.4273871114294503e-06, "logits/chosen": -0.7424345016479492, "logits/rejected": -0.7071555852890015, "logps/chosen": -0.13843649625778198, "logps/rejected": -4.173454284667969, "loss": 0.1585, "odds_ratio_loss": 0.021304195746779442, "rewards/accuracies": 1.0, "rewards/chosen": -0.013843650929629803, "rewards/margins": 0.40350183844566345, "rewards/rejected": -0.41734546422958374, "sft_loss": 0.13843649625778198, "step": 2308 }, { "epoch": 3.339117859725235, "grad_norm": 1.9857192130170105, "learning_rate": 3.4243133981058457e-06, "logits/chosen": -0.5343315005302429, "logits/rejected": -0.3975837826728821, "logps/chosen": -0.11095140129327774, "logps/rejected": -4.798469543457031, "loss": 0.1805, "odds_ratio_loss": 0.024698395282030106, "rewards/accuracies": 1.0, "rewards/chosen": -0.011095140129327774, "rewards/margins": 0.46875184774398804, "rewards/rejected": -0.4798470139503479, "sft_loss": 0.11095140129327774, "step": 2309 }, { "epoch": 3.3405639913232106, "grad_norm": 1.9290741417311212, "learning_rate": 3.421240031865839e-06, "logits/chosen": -0.5730533599853516, "logits/rejected": -0.5497214794158936, "logps/chosen": -0.11085940897464752, "logps/rejected": -3.2291131019592285, "loss": 0.1509, "odds_ratio_loss": 0.031010687351226807, "rewards/accuracies": 1.0, "rewards/chosen": -0.011085940524935722, "rewards/margins": 0.31182539463043213, "rewards/rejected": -0.3229113221168518, "sft_loss": 0.11085940897464752, "step": 2310 }, { "epoch": 3.342010122921186, "grad_norm": 2.2141056969800883, "learning_rate": 3.418167014562372e-06, "logits/chosen": -0.6565252542495728, "logits/rejected": -0.6016082763671875, "logps/chosen": -0.12213000655174255, "logps/rejected": -2.5358057022094727, "loss": 0.2017, "odds_ratio_loss": 0.03299042582511902, "rewards/accuracies": 1.0, "rewards/chosen": -0.012213001027703285, "rewards/margins": 0.24136759340763092, "rewards/rejected": -0.25358060002326965, "sft_loss": 0.12213000655174255, "step": 2311 }, { "epoch": 3.343456254519161, "grad_norm": 3.54538224416602, "learning_rate": 3.415094348048178e-06, "logits/chosen": -0.6507242918014526, "logits/rejected": -0.5217114090919495, "logps/chosen": -0.25277209281921387, "logps/rejected": -3.5952529907226562, "loss": 0.2842, "odds_ratio_loss": 0.048409249633550644, "rewards/accuracies": 1.0, "rewards/chosen": -0.025277208536863327, "rewards/margins": 0.3342480957508087, "rewards/rejected": -0.35952532291412354, "sft_loss": 0.25277209281921387, "step": 2312 }, { "epoch": 3.3449023861171367, "grad_norm": 2.029733725748979, "learning_rate": 3.4120220341757816e-06, "logits/chosen": -0.6231029033660889, "logits/rejected": -0.5002346634864807, "logps/chosen": -0.13461542129516602, "logps/rejected": -3.9896812438964844, "loss": 0.1734, "odds_ratio_loss": 0.025905992835760117, "rewards/accuracies": 1.0, "rewards/chosen": -0.013461543247103691, "rewards/margins": 0.3855065703392029, "rewards/rejected": -0.3989681005477905, "sft_loss": 0.13461542129516602, "step": 2313 }, { "epoch": 3.346348517715112, "grad_norm": 2.0223739152146694, "learning_rate": 3.408950074797489e-06, "logits/chosen": -0.47878187894821167, "logits/rejected": -0.321734219789505, "logps/chosen": -0.11608566343784332, "logps/rejected": -4.263401031494141, "loss": 0.1635, "odds_ratio_loss": 0.028983620926737785, "rewards/accuracies": 1.0, "rewards/chosen": -0.011608565226197243, "rewards/margins": 0.41473156213760376, "rewards/rejected": -0.42634010314941406, "sft_loss": 0.11608566343784332, "step": 2314 }, { "epoch": 3.3477946493130877, "grad_norm": 1.9365576676145997, "learning_rate": 3.4058784717653995e-06, "logits/chosen": -0.4275784492492676, "logits/rejected": -0.36998388171195984, "logps/chosen": -0.1506321281194687, "logps/rejected": -3.957401752471924, "loss": 0.1715, "odds_ratio_loss": 0.023368481546640396, "rewards/accuracies": 1.0, "rewards/chosen": -0.015063212253153324, "rewards/margins": 0.3806769549846649, "rewards/rejected": -0.3957401514053345, "sft_loss": 0.1506321281194687, "step": 2315 }, { "epoch": 3.349240780911063, "grad_norm": 2.305557489402209, "learning_rate": 3.402807226931391e-06, "logits/chosen": -0.3678243160247803, "logits/rejected": -0.3632332682609558, "logps/chosen": -0.1722518503665924, "logps/rejected": -4.876021862030029, "loss": 0.1641, "odds_ratio_loss": 0.0586848147213459, "rewards/accuracies": 1.0, "rewards/chosen": -0.01722518540918827, "rewards/margins": 0.4703770577907562, "rewards/rejected": -0.48760223388671875, "sft_loss": 0.1722518503665924, "step": 2316 }, { "epoch": 3.350686912509038, "grad_norm": 1.809648668266014, "learning_rate": 3.39973634214713e-06, "logits/chosen": -0.4578765332698822, "logits/rejected": -0.307986319065094, "logps/chosen": -0.16912201046943665, "logps/rejected": -2.635956048965454, "loss": 0.2058, "odds_ratio_loss": 0.04555736109614372, "rewards/accuracies": 1.0, "rewards/chosen": -0.016912199556827545, "rewards/margins": 0.24668340384960175, "rewards/rejected": -0.2635956108570099, "sft_loss": 0.16912201046943665, "step": 2317 }, { "epoch": 3.352133044107014, "grad_norm": 2.0091293833397312, "learning_rate": 3.396665819264063e-06, "logits/chosen": -0.564257800579071, "logits/rejected": -0.5668538808822632, "logps/chosen": -0.23128202557563782, "logps/rejected": -3.673203229904175, "loss": 0.1731, "odds_ratio_loss": 0.07565176486968994, "rewards/accuracies": 1.0, "rewards/chosen": -0.023128200322389603, "rewards/margins": 0.34419214725494385, "rewards/rejected": -0.36732035875320435, "sft_loss": 0.23128202557563782, "step": 2318 }, { "epoch": 3.353579175704989, "grad_norm": 1.9041789941699363, "learning_rate": 3.393595660133422e-06, "logits/chosen": -0.6914946436882019, "logits/rejected": -0.45303675532341003, "logps/chosen": -0.15402580797672272, "logps/rejected": -3.851901054382324, "loss": 0.1957, "odds_ratio_loss": 0.0196257084608078, "rewards/accuracies": 1.0, "rewards/chosen": -0.015402580611407757, "rewards/margins": 0.3697875142097473, "rewards/rejected": -0.38519006967544556, "sft_loss": 0.15402580797672272, "step": 2319 }, { "epoch": 3.3550253073029648, "grad_norm": 2.4464738329993594, "learning_rate": 3.390525866606215e-06, "logits/chosen": -0.5936788320541382, "logits/rejected": -0.3911159634590149, "logps/chosen": -0.16877484321594238, "logps/rejected": -5.249103546142578, "loss": 0.185, "odds_ratio_loss": 0.02867019921541214, "rewards/accuracies": 1.0, "rewards/chosen": -0.01687748357653618, "rewards/margins": 0.5080328583717346, "rewards/rejected": -0.5249103307723999, "sft_loss": 0.16877484321594238, "step": 2320 }, { "epoch": 3.35647143890094, "grad_norm": 2.306906410554413, "learning_rate": 3.3874564405332345e-06, "logits/chosen": -0.42751482129096985, "logits/rejected": -0.3295228183269501, "logps/chosen": -0.108555868268013, "logps/rejected": -3.7481698989868164, "loss": 0.1913, "odds_ratio_loss": 0.015943093225359917, "rewards/accuracies": 1.0, "rewards/chosen": -0.010855588130652905, "rewards/margins": 0.363961398601532, "rewards/rejected": -0.37481701374053955, "sft_loss": 0.108555868268013, "step": 2321 }, { "epoch": 3.3579175704989153, "grad_norm": 2.511012207408778, "learning_rate": 3.3843873837650446e-06, "logits/chosen": -0.5912790298461914, "logits/rejected": -0.43329575657844543, "logps/chosen": -0.15693248808383942, "logps/rejected": -6.197193145751953, "loss": 0.2058, "odds_ratio_loss": 0.0240620244294405, "rewards/accuracies": 1.0, "rewards/chosen": -0.01569324918091297, "rewards/margins": 0.6040260791778564, "rewards/rejected": -0.6197193264961243, "sft_loss": 0.15693248808383942, "step": 2322 }, { "epoch": 3.359363702096891, "grad_norm": 2.3284028366780136, "learning_rate": 3.3813186981519962e-06, "logits/chosen": -0.5598545074462891, "logits/rejected": -0.2795000672340393, "logps/chosen": -0.2694113254547119, "logps/rejected": -5.610904693603516, "loss": 0.1817, "odds_ratio_loss": 0.05200380086898804, "rewards/accuracies": 1.0, "rewards/chosen": -0.02694113366305828, "rewards/margins": 0.5341493487358093, "rewards/rejected": -0.5610904693603516, "sft_loss": 0.2694113254547119, "step": 2323 }, { "epoch": 3.360809833694866, "grad_norm": 2.197930880971812, "learning_rate": 3.378250385544208e-06, "logits/chosen": -0.5665892958641052, "logits/rejected": -0.42673903703689575, "logps/chosen": -0.1817660629749298, "logps/rejected": -4.3229498863220215, "loss": 0.1871, "odds_ratio_loss": 0.03276637941598892, "rewards/accuracies": 1.0, "rewards/chosen": -0.0181766077876091, "rewards/margins": 0.41411834955215454, "rewards/rejected": -0.43229496479034424, "sft_loss": 0.1817660629749298, "step": 2324 }, { "epoch": 3.3622559652928414, "grad_norm": 2.067932219260601, "learning_rate": 3.375182447791577e-06, "logits/chosen": -0.6327534914016724, "logits/rejected": -0.36072319746017456, "logps/chosen": -0.16353721916675568, "logps/rejected": -3.5113089084625244, "loss": 0.1897, "odds_ratio_loss": 0.027788694947957993, "rewards/accuracies": 1.0, "rewards/chosen": -0.016353722661733627, "rewards/margins": 0.33477717638015747, "rewards/rejected": -0.351130872964859, "sft_loss": 0.16353721916675568, "step": 2325 }, { "epoch": 3.363702096890817, "grad_norm": 2.0455089321641178, "learning_rate": 3.3721148867437774e-06, "logits/chosen": -0.5868119597434998, "logits/rejected": -0.396449476480484, "logps/chosen": -0.12406010925769806, "logps/rejected": -5.176513195037842, "loss": 0.1933, "odds_ratio_loss": 0.026875488460063934, "rewards/accuracies": 1.0, "rewards/chosen": -0.012406010180711746, "rewards/margins": 0.5052453875541687, "rewards/rejected": -0.5176513195037842, "sft_loss": 0.12406010925769806, "step": 2326 }, { "epoch": 3.3651482284887924, "grad_norm": 1.8271184123517485, "learning_rate": 3.3690477042502496e-06, "logits/chosen": -0.5884243249893188, "logits/rejected": -0.48027777671813965, "logps/chosen": -0.27801549434661865, "logps/rejected": -4.671180725097656, "loss": 0.1827, "odds_ratio_loss": 0.05004490539431572, "rewards/accuracies": 1.0, "rewards/chosen": -0.027801549062132835, "rewards/margins": 0.4393165707588196, "rewards/rejected": -0.46711811423301697, "sft_loss": 0.27801549434661865, "step": 2327 }, { "epoch": 3.366594360086768, "grad_norm": 1.8966435232463192, "learning_rate": 3.365980902160212e-06, "logits/chosen": -0.40547722578048706, "logits/rejected": -0.36879974603652954, "logps/chosen": -0.21847718954086304, "logps/rejected": -4.9782304763793945, "loss": 0.197, "odds_ratio_loss": 0.0362633541226387, "rewards/accuracies": 1.0, "rewards/chosen": -0.021847719326615334, "rewards/margins": 0.4759753346443176, "rewards/rejected": -0.4978230595588684, "sft_loss": 0.21847718954086304, "step": 2328 }, { "epoch": 3.3680404916847433, "grad_norm": 2.0848613162126814, "learning_rate": 3.3629144823226482e-06, "logits/chosen": -0.7243247032165527, "logits/rejected": -0.5537648797035217, "logps/chosen": -0.2133502960205078, "logps/rejected": -3.376030921936035, "loss": 0.1792, "odds_ratio_loss": 0.041394349187612534, "rewards/accuracies": 1.0, "rewards/chosen": -0.021335028111934662, "rewards/margins": 0.31626808643341064, "rewards/rejected": -0.3376030921936035, "sft_loss": 0.2133502960205078, "step": 2329 }, { "epoch": 3.369486623282719, "grad_norm": 1.9065026429617424, "learning_rate": 3.3598484465863172e-06, "logits/chosen": -0.4827233552932739, "logits/rejected": -0.3291730284690857, "logps/chosen": -0.13992281258106232, "logps/rejected": -4.810104846954346, "loss": 0.1576, "odds_ratio_loss": 0.028171217069029808, "rewards/accuracies": 1.0, "rewards/chosen": -0.013992281630635262, "rewards/margins": 0.4670182168483734, "rewards/rejected": -0.4810104966163635, "sft_loss": 0.13992281258106232, "step": 2330 }, { "epoch": 3.370932754880694, "grad_norm": 2.093063883047962, "learning_rate": 3.356782796799741e-06, "logits/chosen": -0.7263155579566956, "logits/rejected": -0.5995932817459106, "logps/chosen": -0.2737896740436554, "logps/rejected": -2.341442823410034, "loss": 0.1972, "odds_ratio_loss": 0.07859160006046295, "rewards/accuracies": 1.0, "rewards/chosen": -0.02737896703183651, "rewards/margins": 0.2067653238773346, "rewards/rejected": -0.23414428532123566, "sft_loss": 0.2737896740436554, "step": 2331 }, { "epoch": 3.3723788864786695, "grad_norm": 2.280755776822081, "learning_rate": 3.3537175348112132e-06, "logits/chosen": -0.5803489089012146, "logits/rejected": -0.4628605246543884, "logps/chosen": -0.2696508467197418, "logps/rejected": -4.5527024269104, "loss": 0.217, "odds_ratio_loss": 0.03504379093647003, "rewards/accuracies": 1.0, "rewards/chosen": -0.026965085417032242, "rewards/margins": 0.42830514907836914, "rewards/rejected": -0.4552702307701111, "sft_loss": 0.2696508467197418, "step": 2332 }, { "epoch": 3.373825018076645, "grad_norm": 2.4387381727232342, "learning_rate": 3.350652662468789e-06, "logits/chosen": -0.5568262338638306, "logits/rejected": -0.5146419405937195, "logps/chosen": -0.27246224880218506, "logps/rejected": -4.7472405433654785, "loss": 0.2329, "odds_ratio_loss": 0.06966537982225418, "rewards/accuracies": 1.0, "rewards/chosen": -0.027246225625276566, "rewards/margins": 0.4474778175354004, "rewards/rejected": -0.47472405433654785, "sft_loss": 0.27246224880218506, "step": 2333 }, { "epoch": 3.3752711496746204, "grad_norm": 2.089432631192305, "learning_rate": 3.347588181620295e-06, "logits/chosen": -0.6027547121047974, "logits/rejected": -0.45868149399757385, "logps/chosen": -0.22351467609405518, "logps/rejected": -3.78956937789917, "loss": 0.2044, "odds_ratio_loss": 0.028093697503209114, "rewards/accuracies": 1.0, "rewards/chosen": -0.022351469844579697, "rewards/margins": 0.3566054701805115, "rewards/rejected": -0.37895697355270386, "sft_loss": 0.22351467609405518, "step": 2334 }, { "epoch": 3.3767172812725956, "grad_norm": 2.017534606069084, "learning_rate": 3.344524094113315e-06, "logits/chosen": -0.5132440328598022, "logits/rejected": -0.4382811188697815, "logps/chosen": -0.24050694704055786, "logps/rejected": -4.633388042449951, "loss": 0.2197, "odds_ratio_loss": 0.06342487037181854, "rewards/accuracies": 1.0, "rewards/chosen": -0.024050697684288025, "rewards/margins": 0.4392881393432617, "rewards/rejected": -0.46333885192871094, "sft_loss": 0.24050694704055786, "step": 2335 }, { "epoch": 3.3781634128705713, "grad_norm": 2.5427888945503003, "learning_rate": 3.3414604017952012e-06, "logits/chosen": -0.43957871198654175, "logits/rejected": -0.32247668504714966, "logps/chosen": -0.16919280588626862, "logps/rejected": -4.3719401359558105, "loss": 0.2299, "odds_ratio_loss": 0.015139477327466011, "rewards/accuracies": 1.0, "rewards/chosen": -0.01691928133368492, "rewards/margins": 0.4202747344970703, "rewards/rejected": -0.43719398975372314, "sft_loss": 0.16919280588626862, "step": 2336 }, { "epoch": 3.3796095444685466, "grad_norm": 1.907668530921736, "learning_rate": 3.338397106513062e-06, "logits/chosen": -0.5221062898635864, "logits/rejected": -0.44908884167671204, "logps/chosen": -0.12111540138721466, "logps/rejected": -5.3049821853637695, "loss": 0.1675, "odds_ratio_loss": 0.013860628008842468, "rewards/accuracies": 1.0, "rewards/chosen": -0.012111540883779526, "rewards/margins": 0.5183866620063782, "rewards/rejected": -0.530498206615448, "sft_loss": 0.12111540138721466, "step": 2337 }, { "epoch": 3.3810556760665222, "grad_norm": 2.7346827804749414, "learning_rate": 3.3353342101137716e-06, "logits/chosen": -0.7345597743988037, "logits/rejected": -0.5234204530715942, "logps/chosen": -0.24222394824028015, "logps/rejected": -3.3377528190612793, "loss": 0.2078, "odds_ratio_loss": 0.034483764320611954, "rewards/accuracies": 1.0, "rewards/chosen": -0.024222396314144135, "rewards/margins": 0.3095529079437256, "rewards/rejected": -0.3337753117084503, "sft_loss": 0.24222394824028015, "step": 2338 }, { "epoch": 3.3825018076644975, "grad_norm": 1.917048235526739, "learning_rate": 3.3322717144439625e-06, "logits/chosen": -0.46222031116485596, "logits/rejected": -0.27375099062919617, "logps/chosen": -0.1882801055908203, "logps/rejected": -2.4854445457458496, "loss": 0.2039, "odds_ratio_loss": 0.03459760174155235, "rewards/accuracies": 1.0, "rewards/chosen": -0.018828010186553, "rewards/margins": 0.2297164499759674, "rewards/rejected": -0.24854443967342377, "sft_loss": 0.1882801055908203, "step": 2339 }, { "epoch": 3.3839479392624727, "grad_norm": 2.0568650037186837, "learning_rate": 3.329209621350022e-06, "logits/chosen": -0.40930983424186707, "logits/rejected": -0.33890777826309204, "logps/chosen": -0.12745355069637299, "logps/rejected": -3.447296619415283, "loss": 0.1956, "odds_ratio_loss": 0.023260656744241714, "rewards/accuracies": 1.0, "rewards/chosen": -0.012745355255901814, "rewards/margins": 0.33198434114456177, "rewards/rejected": -0.3447296917438507, "sft_loss": 0.12745355069637299, "step": 2340 }, { "epoch": 3.3853940708604484, "grad_norm": 1.8812355165375803, "learning_rate": 3.326147932678101e-06, "logits/chosen": -0.5835216045379639, "logits/rejected": -0.3774033486843109, "logps/chosen": -0.1862981766462326, "logps/rejected": -2.6372690200805664, "loss": 0.1454, "odds_ratio_loss": 0.04647889733314514, "rewards/accuracies": 1.0, "rewards/chosen": -0.01862981729209423, "rewards/margins": 0.2450971007347107, "rewards/rejected": -0.2637269198894501, "sft_loss": 0.1862981766462326, "step": 2341 }, { "epoch": 3.3868402024584237, "grad_norm": 2.0368914180952524, "learning_rate": 3.3230866502741003e-06, "logits/chosen": -0.5929782390594482, "logits/rejected": -0.4657699167728424, "logps/chosen": -0.15571850538253784, "logps/rejected": -4.687864303588867, "loss": 0.1233, "odds_ratio_loss": 0.03532061725854874, "rewards/accuracies": 1.0, "rewards/chosen": -0.01557185035198927, "rewards/margins": 0.453214555978775, "rewards/rejected": -0.4687863886356354, "sft_loss": 0.15571850538253784, "step": 2342 }, { "epoch": 3.3882863340563993, "grad_norm": 2.5322463156632584, "learning_rate": 3.3200257759836797e-06, "logits/chosen": -0.5486747026443481, "logits/rejected": -0.4624249041080475, "logps/chosen": -0.15167993307113647, "logps/rejected": -4.17061185836792, "loss": 0.2099, "odds_ratio_loss": 0.028253663331270218, "rewards/accuracies": 1.0, "rewards/chosen": -0.015167992562055588, "rewards/margins": 0.4018931984901428, "rewards/rejected": -0.4170612096786499, "sft_loss": 0.15167993307113647, "step": 2343 }, { "epoch": 3.3897324656543746, "grad_norm": 3.862720927006293, "learning_rate": 3.3169653116522495e-06, "logits/chosen": -0.585822343826294, "logits/rejected": -0.43602919578552246, "logps/chosen": -0.18509134650230408, "logps/rejected": -2.327944755554199, "loss": 0.2604, "odds_ratio_loss": 0.04921817034482956, "rewards/accuracies": 1.0, "rewards/chosen": -0.018509134650230408, "rewards/margins": 0.21428535878658295, "rewards/rejected": -0.23279447853565216, "sft_loss": 0.18509134650230408, "step": 2344 }, { "epoch": 3.39117859725235, "grad_norm": 1.950000962651423, "learning_rate": 3.3139052591249787e-06, "logits/chosen": -0.6751699447631836, "logits/rejected": -0.5654871463775635, "logps/chosen": -0.15269403159618378, "logps/rejected": -3.7506747245788574, "loss": 0.1931, "odds_ratio_loss": 0.016780618578195572, "rewards/accuracies": 1.0, "rewards/chosen": -0.015269402414560318, "rewards/margins": 0.35979804396629333, "rewards/rejected": -0.37506741285324097, "sft_loss": 0.15269403159618378, "step": 2345 }, { "epoch": 3.3926247288503255, "grad_norm": 2.4756529994631578, "learning_rate": 3.310845620246782e-06, "logits/chosen": -0.5545775294303894, "logits/rejected": -0.5234758257865906, "logps/chosen": -0.3893795907497406, "logps/rejected": -3.45979642868042, "loss": 0.2417, "odds_ratio_loss": 0.06643575429916382, "rewards/accuracies": 1.0, "rewards/chosen": -0.03893795982003212, "rewards/margins": 0.3070417046546936, "rewards/rejected": -0.34597963094711304, "sft_loss": 0.3893795907497406, "step": 2346 }, { "epoch": 3.3940708604483008, "grad_norm": 1.9452862350833446, "learning_rate": 3.307786396862328e-06, "logits/chosen": -0.514398992061615, "logits/rejected": -0.418234258890152, "logps/chosen": -0.1784936636686325, "logps/rejected": -3.257981777191162, "loss": 0.169, "odds_ratio_loss": 0.029191169887781143, "rewards/accuracies": 1.0, "rewards/chosen": -0.01784936711192131, "rewards/margins": 0.3079487979412079, "rewards/rejected": -0.3257981538772583, "sft_loss": 0.1784936636686325, "step": 2347 }, { "epoch": 3.395516992046276, "grad_norm": 1.8849207427683208, "learning_rate": 3.3047275908160313e-06, "logits/chosen": -0.6669695377349854, "logits/rejected": -0.5456492304801941, "logps/chosen": -0.14028765261173248, "logps/rejected": -3.805546522140503, "loss": 0.1403, "odds_ratio_loss": 0.0347730778157711, "rewards/accuracies": 1.0, "rewards/chosen": -0.014028767123818398, "rewards/margins": 0.36652588844299316, "rewards/rejected": -0.3805546462535858, "sft_loss": 0.14028765261173248, "step": 2348 }, { "epoch": 3.3969631236442517, "grad_norm": 3.2445046573108662, "learning_rate": 3.301669203952062e-06, "logits/chosen": -0.6601239442825317, "logits/rejected": -0.5460609793663025, "logps/chosen": -0.33426791429519653, "logps/rejected": -3.9542243480682373, "loss": 0.2175, "odds_ratio_loss": 0.059068068861961365, "rewards/accuracies": 1.0, "rewards/chosen": -0.03342679515480995, "rewards/margins": 0.361995667219162, "rewards/rejected": -0.39542245864868164, "sft_loss": 0.33426791429519653, "step": 2349 }, { "epoch": 3.398409255242227, "grad_norm": 2.158850405411536, "learning_rate": 3.298611238114329e-06, "logits/chosen": -0.6491446495056152, "logits/rejected": -0.5393118858337402, "logps/chosen": -0.1076105535030365, "logps/rejected": -4.072206497192383, "loss": 0.1556, "odds_ratio_loss": 0.056976694613695145, "rewards/accuracies": 1.0, "rewards/chosen": -0.01076105609536171, "rewards/margins": 0.3964596390724182, "rewards/rejected": -0.4072206914424896, "sft_loss": 0.1076105535030365, "step": 2350 }, { "epoch": 3.3998553868402026, "grad_norm": 1.9828791161868011, "learning_rate": 3.2955536951464928e-06, "logits/chosen": -0.5805346965789795, "logits/rejected": -0.43692758679389954, "logps/chosen": -0.24223265051841736, "logps/rejected": -3.075357437133789, "loss": 0.1726, "odds_ratio_loss": 0.08130443841218948, "rewards/accuracies": 1.0, "rewards/chosen": -0.024223264306783676, "rewards/margins": 0.28331252932548523, "rewards/rejected": -0.3075357675552368, "sft_loss": 0.24223265051841736, "step": 2351 }, { "epoch": 3.401301518438178, "grad_norm": 2.6132585882729185, "learning_rate": 3.2924965768919584e-06, "logits/chosen": -0.6605554819107056, "logits/rejected": -0.5415310263633728, "logps/chosen": -0.20664909482002258, "logps/rejected": -4.503979682922363, "loss": 0.1837, "odds_ratio_loss": 0.026226144284009933, "rewards/accuracies": 1.0, "rewards/chosen": -0.020664909854531288, "rewards/margins": 0.4297330677509308, "rewards/rejected": -0.4503979980945587, "sft_loss": 0.20664909482002258, "step": 2352 }, { "epoch": 3.4027476500361535, "grad_norm": 2.0914371446634217, "learning_rate": 3.2894398851938722e-06, "logits/chosen": -0.5870836973190308, "logits/rejected": -0.5486207008361816, "logps/chosen": -0.3236868679523468, "logps/rejected": -3.2036993503570557, "loss": 0.2116, "odds_ratio_loss": 0.10235699266195297, "rewards/accuracies": 1.0, "rewards/chosen": -0.03236868977546692, "rewards/margins": 0.28800126910209656, "rewards/rejected": -0.3203699588775635, "sft_loss": 0.3236868679523468, "step": 2353 }, { "epoch": 3.404193781634129, "grad_norm": 1.9042342118993496, "learning_rate": 3.2863836218951264e-06, "logits/chosen": -0.5434327721595764, "logits/rejected": -0.6129238605499268, "logps/chosen": -0.15127164125442505, "logps/rejected": -3.548457145690918, "loss": 0.1374, "odds_ratio_loss": 0.0960196778178215, "rewards/accuracies": 0.9375, "rewards/chosen": -0.01512716431170702, "rewards/margins": 0.3397185206413269, "rewards/rejected": -0.35484570264816284, "sft_loss": 0.15127164125442505, "step": 2354 }, { "epoch": 3.405639913232104, "grad_norm": 2.0168858297493384, "learning_rate": 3.283327788838351e-06, "logits/chosen": -0.46761107444763184, "logits/rejected": -0.40209662914276123, "logps/chosen": -0.16615912318229675, "logps/rejected": -4.629472732543945, "loss": 0.1965, "odds_ratio_loss": 0.07964546978473663, "rewards/accuracies": 0.9375, "rewards/chosen": -0.016615912318229675, "rewards/margins": 0.44633132219314575, "rewards/rejected": -0.462947279214859, "sft_loss": 0.16615912318229675, "step": 2355 }, { "epoch": 3.4070860448300797, "grad_norm": 1.9900945386518782, "learning_rate": 3.2802723878659227e-06, "logits/chosen": -0.7723032236099243, "logits/rejected": -0.5101234912872314, "logps/chosen": -0.18992879986763, "logps/rejected": -3.797363758087158, "loss": 0.162, "odds_ratio_loss": 0.04275386407971382, "rewards/accuracies": 1.0, "rewards/chosen": -0.01899288222193718, "rewards/margins": 0.3607434928417206, "rewards/rejected": -0.3797363340854645, "sft_loss": 0.18992879986763, "step": 2356 }, { "epoch": 3.408532176428055, "grad_norm": 2.0183495710777297, "learning_rate": 3.2772174208199506e-06, "logits/chosen": -0.6566188335418701, "logits/rejected": -0.3303099572658539, "logps/chosen": -0.11480744928121567, "logps/rejected": -6.169617652893066, "loss": 0.1865, "odds_ratio_loss": 0.017061393707990646, "rewards/accuracies": 1.0, "rewards/chosen": -0.011480744928121567, "rewards/margins": 0.6054810285568237, "rewards/rejected": -0.6169617772102356, "sft_loss": 0.11480744928121567, "step": 2357 }, { "epoch": 3.40997830802603, "grad_norm": 1.8891380224756906, "learning_rate": 3.27416288954229e-06, "logits/chosen": -0.44351673126220703, "logits/rejected": -0.43019381165504456, "logps/chosen": -0.10614179819822311, "logps/rejected": -6.608994007110596, "loss": 0.1517, "odds_ratio_loss": 0.028982926160097122, "rewards/accuracies": 1.0, "rewards/chosen": -0.010614179074764252, "rewards/margins": 0.6502852439880371, "rewards/rejected": -0.6608994603157043, "sft_loss": 0.10614179819822311, "step": 2358 }, { "epoch": 3.411424439624006, "grad_norm": 2.5923012440968547, "learning_rate": 3.2711087958745244e-06, "logits/chosen": -0.5696052312850952, "logits/rejected": -0.41010528802871704, "logps/chosen": -0.26490670442581177, "logps/rejected": -4.231900215148926, "loss": 0.2317, "odds_ratio_loss": 0.07147705554962158, "rewards/accuracies": 1.0, "rewards/chosen": -0.026490669697523117, "rewards/margins": 0.39669930934906006, "rewards/rejected": -0.42318999767303467, "sft_loss": 0.26490670442581177, "step": 2359 }, { "epoch": 3.412870571221981, "grad_norm": 2.0508780602004104, "learning_rate": 3.2680551416579814e-06, "logits/chosen": -0.680136501789093, "logits/rejected": -0.4244930148124695, "logps/chosen": -0.16198989748954773, "logps/rejected": -3.4155640602111816, "loss": 0.1938, "odds_ratio_loss": 0.025843270123004913, "rewards/accuracies": 1.0, "rewards/chosen": -0.016198990866541862, "rewards/margins": 0.32535743713378906, "rewards/rejected": -0.3415564000606537, "sft_loss": 0.16198989748954773, "step": 2360 }, { "epoch": 3.414316702819957, "grad_norm": 1.887673138028965, "learning_rate": 3.265001928733718e-06, "logits/chosen": -0.6401176452636719, "logits/rejected": -0.5871616005897522, "logps/chosen": -0.22450858354568481, "logps/rejected": -3.574300765991211, "loss": 0.2106, "odds_ratio_loss": 0.04913847893476486, "rewards/accuracies": 1.0, "rewards/chosen": -0.02245086058974266, "rewards/margins": 0.33497920632362366, "rewards/rejected": -0.357430100440979, "sft_loss": 0.22450858354568481, "step": 2361 }, { "epoch": 3.415762834417932, "grad_norm": 2.082031818490767, "learning_rate": 3.2619491589425315e-06, "logits/chosen": -0.6551008820533752, "logits/rejected": -0.37433722615242004, "logps/chosen": -0.19321411848068237, "logps/rejected": -3.2051291465759277, "loss": 0.2201, "odds_ratio_loss": 0.06106024235486984, "rewards/accuracies": 1.0, "rewards/chosen": -0.019321411848068237, "rewards/margins": 0.301191508769989, "rewards/rejected": -0.32051289081573486, "sft_loss": 0.19321411848068237, "step": 2362 }, { "epoch": 3.4172089660159073, "grad_norm": 2.178896734407967, "learning_rate": 3.2588968341249446e-06, "logits/chosen": -0.49421876668930054, "logits/rejected": -0.4174592196941376, "logps/chosen": -0.15940704941749573, "logps/rejected": -4.831748008728027, "loss": 0.2088, "odds_ratio_loss": 0.02657831273972988, "rewards/accuracies": 1.0, "rewards/chosen": -0.015940703451633453, "rewards/margins": 0.4672340750694275, "rewards/rejected": -0.48317480087280273, "sft_loss": 0.15940704941749573, "step": 2363 }, { "epoch": 3.418655097613883, "grad_norm": 2.3614175718808053, "learning_rate": 3.2558449561212175e-06, "logits/chosen": -0.7048352360725403, "logits/rejected": -0.3603106439113617, "logps/chosen": -0.20679709315299988, "logps/rejected": -4.15094518661499, "loss": 0.2183, "odds_ratio_loss": 0.0959421694278717, "rewards/accuracies": 0.9375, "rewards/chosen": -0.020679708570241928, "rewards/margins": 0.3944148123264313, "rewards/rejected": -0.4150944948196411, "sft_loss": 0.20679709315299988, "step": 2364 }, { "epoch": 3.420101229211858, "grad_norm": 1.8595835773087739, "learning_rate": 3.2527935267713358e-06, "logits/chosen": -0.45306140184402466, "logits/rejected": -0.45435231924057007, "logps/chosen": -0.1741219162940979, "logps/rejected": -2.359769821166992, "loss": 0.1421, "odds_ratio_loss": 0.04474630579352379, "rewards/accuracies": 1.0, "rewards/chosen": -0.01741219311952591, "rewards/margins": 0.21856479346752167, "rewards/rejected": -0.23597699403762817, "sft_loss": 0.1741219162940979, "step": 2365 }, { "epoch": 3.421547360809834, "grad_norm": 2.0437400502249163, "learning_rate": 3.249742547915021e-06, "logits/chosen": -0.6979148983955383, "logits/rejected": -0.5617083311080933, "logps/chosen": -0.15010838210582733, "logps/rejected": -4.167182445526123, "loss": 0.2088, "odds_ratio_loss": 0.030087631195783615, "rewards/accuracies": 1.0, "rewards/chosen": -0.015010838396847248, "rewards/margins": 0.40170741081237793, "rewards/rejected": -0.4167182743549347, "sft_loss": 0.15010838210582733, "step": 2366 }, { "epoch": 3.422993492407809, "grad_norm": 2.2100265623341286, "learning_rate": 3.246692021391719e-06, "logits/chosen": -0.5132609605789185, "logits/rejected": -0.3893182575702667, "logps/chosen": -0.2116222381591797, "logps/rejected": -7.529613494873047, "loss": 0.2881, "odds_ratio_loss": 0.03504924103617668, "rewards/accuracies": 1.0, "rewards/chosen": -0.02116222307085991, "rewards/margins": 0.7317991256713867, "rewards/rejected": -0.7529613375663757, "sft_loss": 0.2116222381591797, "step": 2367 }, { "epoch": 3.4244396240057844, "grad_norm": 1.9627649713389184, "learning_rate": 3.2436419490406014e-06, "logits/chosen": -0.5135637521743774, "logits/rejected": -0.4365648925304413, "logps/chosen": -0.18647979199886322, "logps/rejected": -4.965529441833496, "loss": 0.165, "odds_ratio_loss": 0.05243527889251709, "rewards/accuracies": 1.0, "rewards/chosen": -0.01864797994494438, "rewards/margins": 0.4779050052165985, "rewards/rejected": -0.496552973985672, "sft_loss": 0.18647979199886322, "step": 2368 }, { "epoch": 3.42588575560376, "grad_norm": 1.9653134748886898, "learning_rate": 3.2405923327005713e-06, "logits/chosen": -0.6600465774536133, "logits/rejected": -0.44938400387763977, "logps/chosen": -0.14724966883659363, "logps/rejected": -3.737379550933838, "loss": 0.1758, "odds_ratio_loss": 0.029483633115887642, "rewards/accuracies": 1.0, "rewards/chosen": -0.014724968001246452, "rewards/margins": 0.35901302099227905, "rewards/rejected": -0.37373796105384827, "sft_loss": 0.14724966883659363, "step": 2369 }, { "epoch": 3.4273318872017353, "grad_norm": 2.2160538606815017, "learning_rate": 3.237543174210251e-06, "logits/chosen": -0.8269349336624146, "logits/rejected": -0.6046656370162964, "logps/chosen": -0.09066790342330933, "logps/rejected": -5.1507792472839355, "loss": 0.1569, "odds_ratio_loss": 0.013543096370995045, "rewards/accuracies": 1.0, "rewards/chosen": -0.009066790342330933, "rewards/margins": 0.5060111880302429, "rewards/rejected": -0.5150779485702515, "sft_loss": 0.09066790342330933, "step": 2370 }, { "epoch": 3.4287780187997106, "grad_norm": 1.899550802406212, "learning_rate": 3.234494475407992e-06, "logits/chosen": -0.6040807962417603, "logits/rejected": -0.6489887237548828, "logps/chosen": -0.12181614339351654, "logps/rejected": -3.8885912895202637, "loss": 0.1457, "odds_ratio_loss": 0.024596858769655228, "rewards/accuracies": 1.0, "rewards/chosen": -0.012181615456938744, "rewards/margins": 0.3766775131225586, "rewards/rejected": -0.3888591527938843, "sft_loss": 0.12181614339351654, "step": 2371 }, { "epoch": 3.4302241503976862, "grad_norm": 2.099727938709925, "learning_rate": 3.231446238131863e-06, "logits/chosen": -0.6808019876480103, "logits/rejected": -0.5160682797431946, "logps/chosen": -0.1048995703458786, "logps/rejected": -5.250744819641113, "loss": 0.1281, "odds_ratio_loss": 0.008181117475032806, "rewards/accuracies": 1.0, "rewards/chosen": -0.01048995740711689, "rewards/margins": 0.514584481716156, "rewards/rejected": -0.5250744223594666, "sft_loss": 0.1048995703458786, "step": 2372 }, { "epoch": 3.4316702819956615, "grad_norm": 2.706733983494289, "learning_rate": 3.2283984642196613e-06, "logits/chosen": -0.6812127828598022, "logits/rejected": -0.5712729692459106, "logps/chosen": -0.11651255190372467, "logps/rejected": -4.5421271324157715, "loss": 0.1814, "odds_ratio_loss": 0.023081321269273758, "rewards/accuracies": 1.0, "rewards/chosen": -0.011651256121695042, "rewards/margins": 0.44256147742271423, "rewards/rejected": -0.4542126953601837, "sft_loss": 0.11651255190372467, "step": 2373 }, { "epoch": 3.433116413593637, "grad_norm": 5.394973473846446, "learning_rate": 3.225351155508898e-06, "logits/chosen": -0.6133010387420654, "logits/rejected": -0.45091527700424194, "logps/chosen": -0.22377179563045502, "logps/rejected": -3.0302748680114746, "loss": 0.1914, "odds_ratio_loss": 0.0691990777850151, "rewards/accuracies": 1.0, "rewards/chosen": -0.02237718179821968, "rewards/margins": 0.2806503176689148, "rewards/rejected": -0.303027480840683, "sft_loss": 0.22377179563045502, "step": 2374 }, { "epoch": 3.4345625451916124, "grad_norm": 2.1457624970133327, "learning_rate": 3.222304313836809e-06, "logits/chosen": -0.556597888469696, "logits/rejected": -0.49058791995048523, "logps/chosen": -0.2874477505683899, "logps/rejected": -4.049435615539551, "loss": 0.2413, "odds_ratio_loss": 0.06138356029987335, "rewards/accuracies": 1.0, "rewards/chosen": -0.02874477580189705, "rewards/margins": 0.37619873881340027, "rewards/rejected": -0.4049435555934906, "sft_loss": 0.2874477505683899, "step": 2375 }, { "epoch": 3.436008676789588, "grad_norm": 1.9218950138125275, "learning_rate": 3.219257941040344e-06, "logits/chosen": -0.5005204677581787, "logits/rejected": -0.28589102625846863, "logps/chosen": -0.17385639250278473, "logps/rejected": -4.714556694030762, "loss": 0.1819, "odds_ratio_loss": 0.03751099109649658, "rewards/accuracies": 1.0, "rewards/chosen": -0.017385639250278473, "rewards/margins": 0.4540700316429138, "rewards/rejected": -0.4714556634426117, "sft_loss": 0.17385639250278473, "step": 2376 }, { "epoch": 3.4374548083875633, "grad_norm": 1.8161697265415382, "learning_rate": 3.216212038956176e-06, "logits/chosen": -0.5983232855796814, "logits/rejected": -0.5303330421447754, "logps/chosen": -0.16004885733127594, "logps/rejected": -3.1815803050994873, "loss": 0.154, "odds_ratio_loss": 0.04018660634756088, "rewards/accuracies": 1.0, "rewards/chosen": -0.016004884615540504, "rewards/margins": 0.30215317010879517, "rewards/rejected": -0.31815803050994873, "sft_loss": 0.16004885733127594, "step": 2377 }, { "epoch": 3.4389009399855386, "grad_norm": 2.316256513557429, "learning_rate": 3.2131666094206877e-06, "logits/chosen": -0.6681080460548401, "logits/rejected": -0.4476345479488373, "logps/chosen": -0.1448899507522583, "logps/rejected": -3.2004542350769043, "loss": 0.2104, "odds_ratio_loss": 0.038483526557683945, "rewards/accuracies": 1.0, "rewards/chosen": -0.01448899321258068, "rewards/margins": 0.30555644631385803, "rewards/rejected": -0.32004544138908386, "sft_loss": 0.1448899507522583, "step": 2378 }, { "epoch": 3.4403470715835143, "grad_norm": 2.075563631432093, "learning_rate": 3.2101216542699807e-06, "logits/chosen": -0.5723260641098022, "logits/rejected": -0.46136629581451416, "logps/chosen": -0.21146410703659058, "logps/rejected": -2.679683208465576, "loss": 0.2148, "odds_ratio_loss": 0.048095718026161194, "rewards/accuracies": 1.0, "rewards/chosen": -0.021146409213542938, "rewards/margins": 0.24682191014289856, "rewards/rejected": -0.2679683268070221, "sft_loss": 0.21146410703659058, "step": 2379 }, { "epoch": 3.4417932031814895, "grad_norm": 2.2145156618535804, "learning_rate": 3.207077175339871e-06, "logits/chosen": -0.327717661857605, "logits/rejected": -0.3595542311668396, "logps/chosen": -0.1934828907251358, "logps/rejected": -4.13493537902832, "loss": 0.1963, "odds_ratio_loss": 0.04170215129852295, "rewards/accuracies": 1.0, "rewards/chosen": -0.01934828981757164, "rewards/margins": 0.39414525032043457, "rewards/rejected": -0.4134935438632965, "sft_loss": 0.1934828907251358, "step": 2380 }, { "epoch": 3.4432393347794648, "grad_norm": 2.227105578312563, "learning_rate": 3.204033174465886e-06, "logits/chosen": -0.5712621808052063, "logits/rejected": -0.5687679648399353, "logps/chosen": -0.20861420035362244, "logps/rejected": -3.32293701171875, "loss": 0.2707, "odds_ratio_loss": 0.044927455484867096, "rewards/accuracies": 1.0, "rewards/chosen": -0.020861420780420303, "rewards/margins": 0.3114323019981384, "rewards/rejected": -0.33229371905326843, "sft_loss": 0.20861420035362244, "step": 2381 }, { "epoch": 3.4446854663774404, "grad_norm": 2.090363545406971, "learning_rate": 3.2009896534832645e-06, "logits/chosen": -0.6760439872741699, "logits/rejected": -0.4026757478713989, "logps/chosen": -0.1740947961807251, "logps/rejected": -5.411316871643066, "loss": 0.1854, "odds_ratio_loss": 0.029533682391047478, "rewards/accuracies": 1.0, "rewards/chosen": -0.01740947738289833, "rewards/margins": 0.523722231388092, "rewards/rejected": -0.5411317348480225, "sft_loss": 0.1740947961807251, "step": 2382 }, { "epoch": 3.4461315979754157, "grad_norm": 2.029695406988353, "learning_rate": 3.1979466142269555e-06, "logits/chosen": -0.6552931666374207, "logits/rejected": -0.5883285403251648, "logps/chosen": -0.17759792506694794, "logps/rejected": -2.6016619205474854, "loss": 0.1752, "odds_ratio_loss": 0.04593576118350029, "rewards/accuracies": 1.0, "rewards/chosen": -0.017759792506694794, "rewards/margins": 0.24240639805793762, "rewards/rejected": -0.260166198015213, "sft_loss": 0.17759792506694794, "step": 2383 }, { "epoch": 3.4475777295733914, "grad_norm": 1.9069898883594973, "learning_rate": 3.194904058531621e-06, "logits/chosen": -0.6044560670852661, "logits/rejected": -0.3972908854484558, "logps/chosen": -0.12178179621696472, "logps/rejected": -4.539610862731934, "loss": 0.1617, "odds_ratio_loss": 0.01849059760570526, "rewards/accuracies": 1.0, "rewards/chosen": -0.012178179807960987, "rewards/margins": 0.4417828321456909, "rewards/rejected": -0.453961044549942, "sft_loss": 0.12178179621696472, "step": 2384 }, { "epoch": 3.4490238611713666, "grad_norm": 2.249715426855833, "learning_rate": 3.191861988231627e-06, "logits/chosen": -0.6379084587097168, "logits/rejected": -0.44113999605178833, "logps/chosen": -0.10592366009950638, "logps/rejected": -2.7317471504211426, "loss": 0.1578, "odds_ratio_loss": 0.05817069858312607, "rewards/accuracies": 0.9375, "rewards/chosen": -0.010592365637421608, "rewards/margins": 0.26258236169815063, "rewards/rejected": -0.2731747329235077, "sft_loss": 0.10592366009950638, "step": 2385 }, { "epoch": 3.450469992769342, "grad_norm": 2.263875089312316, "learning_rate": 3.1888204051610524e-06, "logits/chosen": -0.6763030886650085, "logits/rejected": -0.482852578163147, "logps/chosen": -0.11899633705615997, "logps/rejected": -2.220947265625, "loss": 0.1291, "odds_ratio_loss": 0.032654620707035065, "rewards/accuracies": 1.0, "rewards/chosen": -0.011899634264409542, "rewards/margins": 0.21019507944583893, "rewards/rejected": -0.22209471464157104, "sft_loss": 0.11899633705615997, "step": 2386 }, { "epoch": 3.4519161243673175, "grad_norm": 2.2015403837200953, "learning_rate": 3.185779311153674e-06, "logits/chosen": -0.6529958248138428, "logits/rejected": -0.5249685645103455, "logps/chosen": -0.10539045929908752, "logps/rejected": -3.266904354095459, "loss": 0.167, "odds_ratio_loss": 0.023367218673229218, "rewards/accuracies": 1.0, "rewards/chosen": -0.010539045557379723, "rewards/margins": 0.3161514401435852, "rewards/rejected": -0.3266904950141907, "sft_loss": 0.10539045929908752, "step": 2387 }, { "epoch": 3.453362255965293, "grad_norm": 2.1645835829709745, "learning_rate": 3.1827387080429834e-06, "logits/chosen": -0.5144082307815552, "logits/rejected": -0.5050871968269348, "logps/chosen": -0.1488667130470276, "logps/rejected": -3.8072664737701416, "loss": 0.171, "odds_ratio_loss": 0.03980445861816406, "rewards/accuracies": 1.0, "rewards/chosen": -0.014886670745909214, "rewards/margins": 0.36583998799324036, "rewards/rejected": -0.3807266354560852, "sft_loss": 0.1488667130470276, "step": 2388 }, { "epoch": 3.4548083875632685, "grad_norm": 2.092793971447257, "learning_rate": 3.179698597662168e-06, "logits/chosen": -0.6013069748878479, "logits/rejected": -0.5046336054801941, "logps/chosen": -0.19006365537643433, "logps/rejected": -3.2266502380371094, "loss": 0.1904, "odds_ratio_loss": 0.03395771235227585, "rewards/accuracies": 1.0, "rewards/chosen": -0.019006364047527313, "rewards/margins": 0.303658664226532, "rewards/rejected": -0.3226650655269623, "sft_loss": 0.19006365537643433, "step": 2389 }, { "epoch": 3.4562545191612437, "grad_norm": 2.248101499587932, "learning_rate": 3.176658981844125e-06, "logits/chosen": -0.6274727582931519, "logits/rejected": -0.6035176515579224, "logps/chosen": -0.16629016399383545, "logps/rejected": -3.1788456439971924, "loss": 0.1437, "odds_ratio_loss": 0.036419257521629333, "rewards/accuracies": 1.0, "rewards/chosen": -0.016629017889499664, "rewards/margins": 0.30125558376312256, "rewards/rejected": -0.3178845942020416, "sft_loss": 0.16629016399383545, "step": 2390 }, { "epoch": 3.457700650759219, "grad_norm": 2.5398307774402333, "learning_rate": 3.173619862421446e-06, "logits/chosen": -0.6537259817123413, "logits/rejected": -0.4555646777153015, "logps/chosen": -0.13223238289356232, "logps/rejected": -5.075956344604492, "loss": 0.1914, "odds_ratio_loss": 0.02143080160021782, "rewards/accuracies": 1.0, "rewards/chosen": -0.013223239220678806, "rewards/margins": 0.4943723678588867, "rewards/rejected": -0.5075955986976624, "sft_loss": 0.13223238289356232, "step": 2391 }, { "epoch": 3.4591467823571946, "grad_norm": 1.7598998790393996, "learning_rate": 3.170581241226431e-06, "logits/chosen": -0.5485961437225342, "logits/rejected": -0.46813613176345825, "logps/chosen": -0.08493223786354065, "logps/rejected": -3.2580509185791016, "loss": 0.156, "odds_ratio_loss": 0.01832718588411808, "rewards/accuracies": 1.0, "rewards/chosen": -0.008493224158883095, "rewards/margins": 0.3173118829727173, "rewards/rejected": -0.32580509781837463, "sft_loss": 0.08493223786354065, "step": 2392 }, { "epoch": 3.46059291395517, "grad_norm": 2.1489156210409828, "learning_rate": 3.167543120091075e-06, "logits/chosen": -0.6511685848236084, "logits/rejected": -0.5171000957489014, "logps/chosen": -0.2797868847846985, "logps/rejected": -3.9867255687713623, "loss": 0.2727, "odds_ratio_loss": 0.0753171294927597, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0279786866158247, "rewards/margins": 0.3706938624382019, "rewards/rejected": -0.39867255091667175, "sft_loss": 0.2797868847846985, "step": 2393 }, { "epoch": 3.462039045553145, "grad_norm": 3.179471585649838, "learning_rate": 3.1645055008470715e-06, "logits/chosen": -0.6657485365867615, "logits/rejected": -0.5495181083679199, "logps/chosen": -0.16760435700416565, "logps/rejected": -3.56003475189209, "loss": 0.1801, "odds_ratio_loss": 0.024452198296785355, "rewards/accuracies": 1.0, "rewards/chosen": -0.016760436818003654, "rewards/margins": 0.3392430543899536, "rewards/rejected": -0.3560035228729248, "sft_loss": 0.16760435700416565, "step": 2394 }, { "epoch": 3.463485177151121, "grad_norm": 1.8815520518710367, "learning_rate": 3.161468385325814e-06, "logits/chosen": -0.5054148435592651, "logits/rejected": -0.47322767972946167, "logps/chosen": -0.11986206471920013, "logps/rejected": -3.1187729835510254, "loss": 0.1612, "odds_ratio_loss": 0.02132941596210003, "rewards/accuracies": 1.0, "rewards/chosen": -0.011986206285655499, "rewards/margins": 0.2998911142349243, "rewards/rejected": -0.3118773102760315, "sft_loss": 0.11986206471920013, "step": 2395 }, { "epoch": 3.464931308749096, "grad_norm": 2.122953789567727, "learning_rate": 3.1584317753583897e-06, "logits/chosen": -0.45948153734207153, "logits/rejected": -0.4089983105659485, "logps/chosen": -0.15057674050331116, "logps/rejected": -3.30064058303833, "loss": 0.1828, "odds_ratio_loss": 0.038241539150476456, "rewards/accuracies": 1.0, "rewards/chosen": -0.015057675540447235, "rewards/margins": 0.3150063753128052, "rewards/rejected": -0.330064058303833, "sft_loss": 0.15057674050331116, "step": 2396 }, { "epoch": 3.4663774403470717, "grad_norm": 2.609325702886058, "learning_rate": 3.155395672775583e-06, "logits/chosen": -0.6183381080627441, "logits/rejected": -0.5282341241836548, "logps/chosen": -0.2384868562221527, "logps/rejected": -4.012352466583252, "loss": 0.1958, "odds_ratio_loss": 0.06628884375095367, "rewards/accuracies": 1.0, "rewards/chosen": -0.02384868450462818, "rewards/margins": 0.37738654017448425, "rewards/rejected": -0.4012352526187897, "sft_loss": 0.2384868562221527, "step": 2397 }, { "epoch": 3.467823571945047, "grad_norm": 1.963380190689956, "learning_rate": 3.1523600794078695e-06, "logits/chosen": -0.5454755425453186, "logits/rejected": -0.46037548780441284, "logps/chosen": -0.102660171687603, "logps/rejected": -2.684774160385132, "loss": 0.1444, "odds_ratio_loss": 0.021067647263407707, "rewards/accuracies": 1.0, "rewards/chosen": -0.0102660171687603, "rewards/margins": 0.2582114040851593, "rewards/rejected": -0.2684774398803711, "sft_loss": 0.102660171687603, "step": 2398 }, { "epoch": 3.469269703543022, "grad_norm": 2.2003218735262213, "learning_rate": 3.149324997085422e-06, "logits/chosen": -0.5759695768356323, "logits/rejected": -0.5235334634780884, "logps/chosen": -0.1454126387834549, "logps/rejected": -3.1648459434509277, "loss": 0.1878, "odds_ratio_loss": 0.03307610750198364, "rewards/accuracies": 1.0, "rewards/chosen": -0.0145412627607584, "rewards/margins": 0.3019433915615082, "rewards/rejected": -0.31648463010787964, "sft_loss": 0.1454126387834549, "step": 2399 }, { "epoch": 3.470715835140998, "grad_norm": 2.221086291687407, "learning_rate": 3.1462904276381016e-06, "logits/chosen": -0.6801586151123047, "logits/rejected": -0.5571650266647339, "logps/chosen": -0.2424691617488861, "logps/rejected": -3.4627394676208496, "loss": 0.1726, "odds_ratio_loss": 0.06682531535625458, "rewards/accuracies": 0.9375, "rewards/chosen": -0.02424691990017891, "rewards/margins": 0.3220270276069641, "rewards/rejected": -0.3462739586830139, "sft_loss": 0.2424691617488861, "step": 2400 }, { "epoch": 3.472161966738973, "grad_norm": 2.2423475367785213, "learning_rate": 3.1432563728954627e-06, "logits/chosen": -0.7703630924224854, "logits/rejected": -0.5540469288825989, "logps/chosen": -0.18336373567581177, "logps/rejected": -3.5720396041870117, "loss": 0.1913, "odds_ratio_loss": 0.015588531270623207, "rewards/accuracies": 1.0, "rewards/chosen": -0.018336374312639236, "rewards/margins": 0.33886754512786865, "rewards/rejected": -0.35720396041870117, "sft_loss": 0.18336373567581177, "step": 2401 }, { "epoch": 3.473608098336949, "grad_norm": 2.075655681568371, "learning_rate": 3.1402228346867464e-06, "logits/chosen": -0.6194726824760437, "logits/rejected": -0.4660893380641937, "logps/chosen": -0.16618576645851135, "logps/rejected": -3.4085822105407715, "loss": 0.1808, "odds_ratio_loss": 0.03004610911011696, "rewards/accuracies": 1.0, "rewards/chosen": -0.016618575900793076, "rewards/margins": 0.3242396116256714, "rewards/rejected": -0.34085819125175476, "sft_loss": 0.16618576645851135, "step": 2402 }, { "epoch": 3.475054229934924, "grad_norm": 2.176656937387457, "learning_rate": 3.1371898148408864e-06, "logits/chosen": -0.6529266238212585, "logits/rejected": -0.46994972229003906, "logps/chosen": -0.23654180765151978, "logps/rejected": -4.497025012969971, "loss": 0.1887, "odds_ratio_loss": 0.031046872958540916, "rewards/accuracies": 1.0, "rewards/chosen": -0.023654181510210037, "rewards/margins": 0.4260483384132385, "rewards/rejected": -0.44970253109931946, "sft_loss": 0.23654180765151978, "step": 2403 }, { "epoch": 3.4765003615328993, "grad_norm": 1.9864857166981567, "learning_rate": 3.1341573151864996e-06, "logits/chosen": -0.6524367332458496, "logits/rejected": -0.390330970287323, "logps/chosen": -0.18505634367465973, "logps/rejected": -4.1696391105651855, "loss": 0.1636, "odds_ratio_loss": 0.02654869109392166, "rewards/accuracies": 1.0, "rewards/chosen": -0.018505636602640152, "rewards/margins": 0.3984583020210266, "rewards/rejected": -0.4169639050960541, "sft_loss": 0.18505634367465973, "step": 2404 }, { "epoch": 3.477946493130875, "grad_norm": 2.024665467230913, "learning_rate": 3.131125337551891e-06, "logits/chosen": -0.7065335512161255, "logits/rejected": -0.5359545946121216, "logps/chosen": -0.14678552746772766, "logps/rejected": -3.0860984325408936, "loss": 0.2154, "odds_ratio_loss": 0.026049984619021416, "rewards/accuracies": 1.0, "rewards/chosen": -0.014678554609417915, "rewards/margins": 0.2939313054084778, "rewards/rejected": -0.30860984325408936, "sft_loss": 0.14678552746772766, "step": 2405 }, { "epoch": 3.4793926247288502, "grad_norm": 2.0090458837282785, "learning_rate": 3.1280938837650547e-06, "logits/chosen": -0.7596422433853149, "logits/rejected": -0.526679515838623, "logps/chosen": -0.13010278344154358, "logps/rejected": -4.164193153381348, "loss": 0.2014, "odds_ratio_loss": 0.026350578293204308, "rewards/accuracies": 1.0, "rewards/chosen": -0.013010278344154358, "rewards/margins": 0.40340906381607056, "rewards/rejected": -0.4164193272590637, "sft_loss": 0.13010278344154358, "step": 2406 }, { "epoch": 3.480838756326826, "grad_norm": 2.372630511041672, "learning_rate": 3.125062955653661e-06, "logits/chosen": -0.5935577750205994, "logits/rejected": -0.5829852223396301, "logps/chosen": -0.18501299619674683, "logps/rejected": -4.798090934753418, "loss": 0.2498, "odds_ratio_loss": 0.04796503111720085, "rewards/accuracies": 1.0, "rewards/chosen": -0.018501300364732742, "rewards/margins": 0.4613078236579895, "rewards/rejected": -0.47980910539627075, "sft_loss": 0.18501299619674683, "step": 2407 }, { "epoch": 3.482284887924801, "grad_norm": 1.973003502635966, "learning_rate": 3.122032555045072e-06, "logits/chosen": -0.5334011316299438, "logits/rejected": -0.46345335245132446, "logps/chosen": -0.14981776475906372, "logps/rejected": -3.4753215312957764, "loss": 0.1573, "odds_ratio_loss": 0.04779369384050369, "rewards/accuracies": 1.0, "rewards/chosen": -0.014981777407228947, "rewards/margins": 0.33255037665367126, "rewards/rejected": -0.34753215312957764, "sft_loss": 0.14981776475906372, "step": 2408 }, { "epoch": 3.4837310195227764, "grad_norm": 2.084221610637667, "learning_rate": 3.119002683766325e-06, "logits/chosen": -0.5933706760406494, "logits/rejected": -0.4296287000179291, "logps/chosen": -0.12299705296754837, "logps/rejected": -6.163819313049316, "loss": 0.1744, "odds_ratio_loss": 0.02097168192267418, "rewards/accuracies": 1.0, "rewards/chosen": -0.012299705296754837, "rewards/margins": 0.6040822267532349, "rewards/rejected": -0.6163819432258606, "sft_loss": 0.12299705296754837, "step": 2409 }, { "epoch": 3.485177151120752, "grad_norm": 2.2991481985822664, "learning_rate": 3.1159733436441413e-06, "logits/chosen": -0.6943192481994629, "logits/rejected": -0.3896532654762268, "logps/chosen": -0.2796631157398224, "logps/rejected": -3.7519736289978027, "loss": 0.1965, "odds_ratio_loss": 0.04865211993455887, "rewards/accuracies": 1.0, "rewards/chosen": -0.02796631120145321, "rewards/margins": 0.34723103046417236, "rewards/rejected": -0.3751973509788513, "sft_loss": 0.2796631157398224, "step": 2410 }, { "epoch": 3.4866232827187273, "grad_norm": 2.363386916775901, "learning_rate": 3.11294453650492e-06, "logits/chosen": -0.5733893513679504, "logits/rejected": -0.5263524055480957, "logps/chosen": -0.2019123136997223, "logps/rejected": -3.117097854614258, "loss": 0.2121, "odds_ratio_loss": 0.029736708849668503, "rewards/accuracies": 1.0, "rewards/chosen": -0.02019122987985611, "rewards/margins": 0.29151856899261475, "rewards/rejected": -0.31170982122421265, "sft_loss": 0.2019123136997223, "step": 2411 }, { "epoch": 3.488069414316703, "grad_norm": 2.1846083513594934, "learning_rate": 3.1099162641747427e-06, "logits/chosen": -0.5293310284614563, "logits/rejected": -0.3837476372718811, "logps/chosen": -0.10314946621656418, "logps/rejected": -5.216273307800293, "loss": 0.1856, "odds_ratio_loss": 0.052306585013866425, "rewards/accuracies": 0.9375, "rewards/chosen": -0.010314945131540298, "rewards/margins": 0.5113124251365662, "rewards/rejected": -0.5216273069381714, "sft_loss": 0.10314946621656418, "step": 2412 }, { "epoch": 3.4895155459146783, "grad_norm": 15.463632104482794, "learning_rate": 3.1068885284793636e-06, "logits/chosen": -0.620168149471283, "logits/rejected": -0.503106951713562, "logps/chosen": -0.3016592264175415, "logps/rejected": -3.0068225860595703, "loss": 0.2447, "odds_ratio_loss": 0.04610760882496834, "rewards/accuracies": 1.0, "rewards/chosen": -0.03016592189669609, "rewards/margins": 0.27051636576652527, "rewards/rejected": -0.30068230628967285, "sft_loss": 0.3016592264175415, "step": 2413 }, { "epoch": 3.4909616775126535, "grad_norm": 2.085084360099831, "learning_rate": 3.1038613312442187e-06, "logits/chosen": -0.6817932724952698, "logits/rejected": -0.5196681618690491, "logps/chosen": -0.18397408723831177, "logps/rejected": -4.176403999328613, "loss": 0.1861, "odds_ratio_loss": 0.032095346599817276, "rewards/accuracies": 1.0, "rewards/chosen": -0.018397409468889236, "rewards/margins": 0.39924299716949463, "rewards/rejected": -0.4176403880119324, "sft_loss": 0.18397408723831177, "step": 2414 }, { "epoch": 3.492407809110629, "grad_norm": 2.1247525539099206, "learning_rate": 3.1008346742944124e-06, "logits/chosen": -0.6427405476570129, "logits/rejected": -0.4603559076786041, "logps/chosen": -0.19035601615905762, "logps/rejected": -3.170347213745117, "loss": 0.2023, "odds_ratio_loss": 0.03771872818470001, "rewards/accuracies": 1.0, "rewards/chosen": -0.01903560385107994, "rewards/margins": 0.2979991137981415, "rewards/rejected": -0.3170347213745117, "sft_loss": 0.19035601615905762, "step": 2415 }, { "epoch": 3.4938539407086044, "grad_norm": 2.1004484725383876, "learning_rate": 3.097808559454732e-06, "logits/chosen": -0.5600918531417847, "logits/rejected": -0.45386743545532227, "logps/chosen": -0.16461656987667084, "logps/rejected": -4.282623767852783, "loss": 0.2029, "odds_ratio_loss": 0.035014864057302475, "rewards/accuracies": 1.0, "rewards/chosen": -0.016461655497550964, "rewards/margins": 0.41180071234703064, "rewards/rejected": -0.4282623827457428, "sft_loss": 0.16461656987667084, "step": 2416 }, { "epoch": 3.4953000723065797, "grad_norm": 1.9634213959408762, "learning_rate": 3.09478298854963e-06, "logits/chosen": -0.4763942062854767, "logits/rejected": -0.27630501985549927, "logps/chosen": -0.14406144618988037, "logps/rejected": -4.638453006744385, "loss": 0.1381, "odds_ratio_loss": 0.03147125244140625, "rewards/accuracies": 1.0, "rewards/chosen": -0.014406142756342888, "rewards/margins": 0.4494391083717346, "rewards/rejected": -0.46384525299072266, "sft_loss": 0.14406144618988037, "step": 2417 }, { "epoch": 3.4967462039045554, "grad_norm": 2.0293001482316146, "learning_rate": 3.0917579634032345e-06, "logits/chosen": -0.6355884075164795, "logits/rejected": -0.4838637709617615, "logps/chosen": -0.31256037950515747, "logps/rejected": -5.803687572479248, "loss": 0.2496, "odds_ratio_loss": 0.10423914343118668, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03125603869557381, "rewards/margins": 0.5491127371788025, "rewards/rejected": -0.5803688168525696, "sft_loss": 0.31256037950515747, "step": 2418 }, { "epoch": 3.4981923355025306, "grad_norm": 1.9185361869559496, "learning_rate": 3.088733485839348e-06, "logits/chosen": -0.6340678334236145, "logits/rejected": -0.540869951248169, "logps/chosen": -0.21716678142547607, "logps/rejected": -4.758569240570068, "loss": 0.1763, "odds_ratio_loss": 0.050362855195999146, "rewards/accuracies": 1.0, "rewards/chosen": -0.021716676652431488, "rewards/margins": 0.45414021611213684, "rewards/rejected": -0.4758569002151489, "sft_loss": 0.21716678142547607, "step": 2419 }, { "epoch": 3.4996384671005063, "grad_norm": 2.019327134335387, "learning_rate": 3.0857095576814357e-06, "logits/chosen": -0.38161930441856384, "logits/rejected": -0.34976017475128174, "logps/chosen": -0.18557745218276978, "logps/rejected": -6.113600730895996, "loss": 0.2017, "odds_ratio_loss": 0.04104998707771301, "rewards/accuracies": 1.0, "rewards/chosen": -0.018557745963335037, "rewards/margins": 0.5928024053573608, "rewards/rejected": -0.6113600730895996, "sft_loss": 0.18557745218276978, "step": 2420 }, { "epoch": 3.5010845986984815, "grad_norm": 2.0202611999068663, "learning_rate": 3.0826861807526366e-06, "logits/chosen": -0.63789963722229, "logits/rejected": -0.7829943895339966, "logps/chosen": -0.2620221674442291, "logps/rejected": -5.192530632019043, "loss": 0.2215, "odds_ratio_loss": 0.06331058591604233, "rewards/accuracies": 1.0, "rewards/chosen": -0.026202216744422913, "rewards/margins": 0.4930509030818939, "rewards/rejected": -0.519253134727478, "sft_loss": 0.2620221674442291, "step": 2421 }, { "epoch": 3.5025307302964572, "grad_norm": 2.127595750553593, "learning_rate": 3.079663356875754e-06, "logits/chosen": -0.5893542766571045, "logits/rejected": -0.4019384980201721, "logps/chosen": -0.16648858785629272, "logps/rejected": -4.031942844390869, "loss": 0.2227, "odds_ratio_loss": 0.04801696911454201, "rewards/accuracies": 1.0, "rewards/chosen": -0.016648858785629272, "rewards/margins": 0.38654541969299316, "rewards/rejected": -0.40319427847862244, "sft_loss": 0.16648858785629272, "step": 2422 }, { "epoch": 3.5039768618944325, "grad_norm": 1.996498648451224, "learning_rate": 3.0766410878732634e-06, "logits/chosen": -0.47269999980926514, "logits/rejected": -0.32516586780548096, "logps/chosen": -0.20525558292865753, "logps/rejected": -3.3645095825195312, "loss": 0.1638, "odds_ratio_loss": 0.021875783801078796, "rewards/accuracies": 1.0, "rewards/chosen": -0.020525559782981873, "rewards/margins": 0.31592538952827454, "rewards/rejected": -0.3364509344100952, "sft_loss": 0.20525558292865753, "step": 2423 }, { "epoch": 3.5054229934924077, "grad_norm": 2.2579256443891875, "learning_rate": 3.073619375567299e-06, "logits/chosen": -0.4492790699005127, "logits/rejected": -0.4535216689109802, "logps/chosen": -0.16690582036972046, "logps/rejected": -4.513905048370361, "loss": 0.1916, "odds_ratio_loss": 0.04860381782054901, "rewards/accuracies": 1.0, "rewards/chosen": -0.016690582036972046, "rewards/margins": 0.4346998929977417, "rewards/rejected": -0.45139047503471375, "sft_loss": 0.16690582036972046, "step": 2424 }, { "epoch": 3.5068691250903834, "grad_norm": 2.1783585530456357, "learning_rate": 3.070598221779664e-06, "logits/chosen": -0.6967248916625977, "logits/rejected": -0.4836665093898773, "logps/chosen": -0.20443633198738098, "logps/rejected": -3.487060785293579, "loss": 0.2119, "odds_ratio_loss": 0.029861953109502792, "rewards/accuracies": 1.0, "rewards/chosen": -0.020443633198738098, "rewards/margins": 0.32826241850852966, "rewards/rejected": -0.34870606660842896, "sft_loss": 0.20443633198738098, "step": 2425 }, { "epoch": 3.5083152566883586, "grad_norm": 2.463389036122334, "learning_rate": 3.0675776283318203e-06, "logits/chosen": -0.5934221744537354, "logits/rejected": -0.5299607515335083, "logps/chosen": -0.2866644263267517, "logps/rejected": -4.078819274902344, "loss": 0.1839, "odds_ratio_loss": 0.025208022445440292, "rewards/accuracies": 1.0, "rewards/chosen": -0.02866644226014614, "rewards/margins": 0.3792154788970947, "rewards/rejected": -0.4078819155693054, "sft_loss": 0.2866644263267517, "step": 2426 }, { "epoch": 3.509761388286334, "grad_norm": 2.3513293545188634, "learning_rate": 3.064557597044899e-06, "logits/chosen": -0.5402525663375854, "logits/rejected": -0.4047253727912903, "logps/chosen": -0.27317333221435547, "logps/rejected": -4.133315086364746, "loss": 0.2127, "odds_ratio_loss": 0.05065507814288139, "rewards/accuracies": 1.0, "rewards/chosen": -0.027317333966493607, "rewards/margins": 0.3860141634941101, "rewards/rejected": -0.4133315086364746, "sft_loss": 0.27317333221435547, "step": 2427 }, { "epoch": 3.5112075198843096, "grad_norm": 2.629368897025621, "learning_rate": 3.0615381297396863e-06, "logits/chosen": -0.6587547063827515, "logits/rejected": -0.5991069674491882, "logps/chosen": -0.18863442540168762, "logps/rejected": -3.0743231773376465, "loss": 0.1699, "odds_ratio_loss": 0.033928271383047104, "rewards/accuracies": 1.0, "rewards/chosen": -0.018863443285226822, "rewards/margins": 0.2885688841342926, "rewards/rejected": -0.3074323236942291, "sft_loss": 0.18863442540168762, "step": 2428 }, { "epoch": 3.512653651482285, "grad_norm": 1.9268262456854648, "learning_rate": 3.058519228236631e-06, "logits/chosen": -0.6309177279472351, "logits/rejected": -0.5707622766494751, "logps/chosen": -0.15104332566261292, "logps/rejected": -2.467463254928589, "loss": 0.2081, "odds_ratio_loss": 0.030597880482673645, "rewards/accuracies": 1.0, "rewards/chosen": -0.015104331076145172, "rewards/margins": 0.2316419780254364, "rewards/rejected": -0.24674633145332336, "sft_loss": 0.15104332566261292, "step": 2429 }, { "epoch": 3.51409978308026, "grad_norm": 2.187578199208661, "learning_rate": 3.0555008943558376e-06, "logits/chosen": -0.43284595012664795, "logits/rejected": -0.3950757086277008, "logps/chosen": -0.3491520285606384, "logps/rejected": -5.344951629638672, "loss": 0.2358, "odds_ratio_loss": 0.07832454144954681, "rewards/accuracies": 1.0, "rewards/chosen": -0.03491520509123802, "rewards/margins": 0.4995799958705902, "rewards/rejected": -0.5344952344894409, "sft_loss": 0.3491520285606384, "step": 2430 }, { "epoch": 3.5155459146782357, "grad_norm": 2.0356756659249515, "learning_rate": 3.052483129917074e-06, "logits/chosen": -0.527158260345459, "logits/rejected": -0.459917277097702, "logps/chosen": -0.1386464685201645, "logps/rejected": -6.810248374938965, "loss": 0.1815, "odds_ratio_loss": 0.02819202095270157, "rewards/accuracies": 1.0, "rewards/chosen": -0.013864649459719658, "rewards/margins": 0.6671602129936218, "rewards/rejected": -0.6810248494148254, "sft_loss": 0.1386464685201645, "step": 2431 }, { "epoch": 3.516992046276211, "grad_norm": 1.9468625134569972, "learning_rate": 3.04946593673976e-06, "logits/chosen": -0.37922120094299316, "logits/rejected": -0.36168068647384644, "logps/chosen": -0.20144081115722656, "logps/rejected": -4.170196533203125, "loss": 0.1787, "odds_ratio_loss": 0.031825270503759384, "rewards/accuracies": 1.0, "rewards/chosen": -0.020144082605838776, "rewards/margins": 0.3968755900859833, "rewards/rejected": -0.41701966524124146, "sft_loss": 0.20144081115722656, "step": 2432 }, { "epoch": 3.5184381778741867, "grad_norm": 1.9008651504383323, "learning_rate": 3.046449316642972e-06, "logits/chosen": -0.53741455078125, "logits/rejected": -0.49799400568008423, "logps/chosen": -0.26500892639160156, "logps/rejected": -3.6741976737976074, "loss": 0.2029, "odds_ratio_loss": 0.09227637946605682, "rewards/accuracies": 1.0, "rewards/chosen": -0.026500891894102097, "rewards/margins": 0.3409188985824585, "rewards/rejected": -0.3674197793006897, "sft_loss": 0.26500892639160156, "step": 2433 }, { "epoch": 3.519884309472162, "grad_norm": 1.795668496423871, "learning_rate": 3.043433271445444e-06, "logits/chosen": -0.5739824175834656, "logits/rejected": -0.5970664024353027, "logps/chosen": -0.06262612342834473, "logps/rejected": -4.009900093078613, "loss": 0.1221, "odds_ratio_loss": 0.02301051840186119, "rewards/accuracies": 1.0, "rewards/chosen": -0.0062626120634377, "rewards/margins": 0.3947274088859558, "rewards/rejected": -0.4009900391101837, "sft_loss": 0.06262612342834473, "step": 2434 }, { "epoch": 3.5213304410701376, "grad_norm": 2.3034738221241087, "learning_rate": 3.0404178029655584e-06, "logits/chosen": -0.631015419960022, "logits/rejected": -0.41128766536712646, "logps/chosen": -0.20884862542152405, "logps/rejected": -3.4437193870544434, "loss": 0.1965, "odds_ratio_loss": 0.034937452524900436, "rewards/accuracies": 1.0, "rewards/chosen": -0.020884864032268524, "rewards/margins": 0.3234870433807373, "rewards/rejected": -0.3443719148635864, "sft_loss": 0.20884862542152405, "step": 2435 }, { "epoch": 3.522776572668113, "grad_norm": 1.945593311128753, "learning_rate": 3.037402913021354e-06, "logits/chosen": -0.4494878649711609, "logits/rejected": -0.449461966753006, "logps/chosen": -0.1753808557987213, "logps/rejected": -4.066275596618652, "loss": 0.1666, "odds_ratio_loss": 0.035513922572135925, "rewards/accuracies": 1.0, "rewards/chosen": -0.01753808557987213, "rewards/margins": 0.3890894651412964, "rewards/rejected": -0.4066275656223297, "sft_loss": 0.1753808557987213, "step": 2436 }, { "epoch": 3.524222704266088, "grad_norm": 2.079948813762282, "learning_rate": 3.0343886034305167e-06, "logits/chosen": -0.6514714956283569, "logits/rejected": -0.5961179733276367, "logps/chosen": -0.2650774121284485, "logps/rejected": -5.5663652420043945, "loss": 0.2188, "odds_ratio_loss": 0.07149592787027359, "rewards/accuracies": 1.0, "rewards/chosen": -0.02650773897767067, "rewards/margins": 0.5301287770271301, "rewards/rejected": -0.5566365122795105, "sft_loss": 0.2650774121284485, "step": 2437 }, { "epoch": 3.5256688358640638, "grad_norm": 2.1260641283263273, "learning_rate": 3.0313748760103887e-06, "logits/chosen": -0.49076318740844727, "logits/rejected": -0.34688353538513184, "logps/chosen": -0.12424877285957336, "logps/rejected": -5.382345676422119, "loss": 0.1499, "odds_ratio_loss": 0.030808860436081886, "rewards/accuracies": 1.0, "rewards/chosen": -0.012424877844750881, "rewards/margins": 0.5258097052574158, "rewards/rejected": -0.5382345914840698, "sft_loss": 0.12424877285957336, "step": 2438 }, { "epoch": 3.527114967462039, "grad_norm": 2.371176330762152, "learning_rate": 3.0283617325779545e-06, "logits/chosen": -0.4812876880168915, "logits/rejected": -0.34987127780914307, "logps/chosen": -0.2775692641735077, "logps/rejected": -4.12560510635376, "loss": 0.2054, "odds_ratio_loss": 0.04205989092588425, "rewards/accuracies": 1.0, "rewards/chosen": -0.02775692753493786, "rewards/margins": 0.3848035931587219, "rewards/rejected": -0.41256052255630493, "sft_loss": 0.2775692641735077, "step": 2439 }, { "epoch": 3.5285610990600143, "grad_norm": 2.0813750370837423, "learning_rate": 3.0253491749498512e-06, "logits/chosen": -0.4459601938724518, "logits/rejected": -0.328517884016037, "logps/chosen": -0.3284389078617096, "logps/rejected": -3.3992013931274414, "loss": 0.2288, "odds_ratio_loss": 0.058917153626680374, "rewards/accuracies": 1.0, "rewards/chosen": -0.03284389153122902, "rewards/margins": 0.30707627534866333, "rewards/rejected": -0.33992013335227966, "sft_loss": 0.3284389078617096, "step": 2440 }, { "epoch": 3.53000723065799, "grad_norm": 2.003917998097915, "learning_rate": 3.0223372049423586e-06, "logits/chosen": -0.7053133249282837, "logits/rejected": -0.4800989329814911, "logps/chosen": -0.14180231094360352, "logps/rejected": -3.8691177368164062, "loss": 0.1728, "odds_ratio_loss": 0.039757855236530304, "rewards/accuracies": 1.0, "rewards/chosen": -0.014180229976773262, "rewards/margins": 0.3727315664291382, "rewards/rejected": -0.3869118094444275, "sft_loss": 0.14180231094360352, "step": 2441 }, { "epoch": 3.531453362255965, "grad_norm": 2.0461480696169603, "learning_rate": 3.0193258243714084e-06, "logits/chosen": -0.5701842308044434, "logits/rejected": -0.3964661657810211, "logps/chosen": -0.25734081864356995, "logps/rejected": -2.8863861560821533, "loss": 0.1949, "odds_ratio_loss": 0.08616319298744202, "rewards/accuracies": 0.9375, "rewards/chosen": -0.025734081864356995, "rewards/margins": 0.26290449500083923, "rewards/rejected": -0.2886385917663574, "sft_loss": 0.25734081864356995, "step": 2442 }, { "epoch": 3.532899493853941, "grad_norm": 2.255625866443697, "learning_rate": 3.01631503505257e-06, "logits/chosen": -0.7258257865905762, "logits/rejected": -0.4492414593696594, "logps/chosen": -0.21355527639389038, "logps/rejected": -3.853524684906006, "loss": 0.2263, "odds_ratio_loss": 0.030937321484088898, "rewards/accuracies": 1.0, "rewards/chosen": -0.021355528384447098, "rewards/margins": 0.3639969527721405, "rewards/rejected": -0.3853524923324585, "sft_loss": 0.21355527639389038, "step": 2443 }, { "epoch": 3.534345625451916, "grad_norm": 2.1382791727428034, "learning_rate": 3.0133048388010615e-06, "logits/chosen": -0.48679161071777344, "logits/rejected": -0.4358132779598236, "logps/chosen": -0.17743311822414398, "logps/rejected": -3.7108383178710938, "loss": 0.1898, "odds_ratio_loss": 0.029122265055775642, "rewards/accuracies": 1.0, "rewards/chosen": -0.017743311822414398, "rewards/margins": 0.3533405065536499, "rewards/rejected": -0.3710837960243225, "sft_loss": 0.17743311822414398, "step": 2444 }, { "epoch": 3.535791757049892, "grad_norm": 1.9527830299379936, "learning_rate": 3.0102952374317392e-06, "logits/chosen": -0.6457036137580872, "logits/rejected": -0.5125142335891724, "logps/chosen": -0.1727660447359085, "logps/rejected": -4.623327255249023, "loss": 0.2268, "odds_ratio_loss": 0.026115503162145615, "rewards/accuracies": 1.0, "rewards/chosen": -0.01727660559117794, "rewards/margins": 0.44505611062049866, "rewards/rejected": -0.46233272552490234, "sft_loss": 0.1727660447359085, "step": 2445 }, { "epoch": 3.537237888647867, "grad_norm": 2.080234692907973, "learning_rate": 3.007286232759105e-06, "logits/chosen": -0.6250134110450745, "logits/rejected": -0.49293088912963867, "logps/chosen": -0.2399110198020935, "logps/rejected": -3.053586721420288, "loss": 0.1964, "odds_ratio_loss": 0.04962414503097534, "rewards/accuracies": 1.0, "rewards/chosen": -0.02399110235273838, "rewards/margins": 0.28136757016181946, "rewards/rejected": -0.3053586781024933, "sft_loss": 0.2399110198020935, "step": 2446 }, { "epoch": 3.5386840202458423, "grad_norm": 2.317314461800139, "learning_rate": 3.0042778265972984e-06, "logits/chosen": -0.47505897283554077, "logits/rejected": -0.39482036232948303, "logps/chosen": -0.35046637058258057, "logps/rejected": -3.926889419555664, "loss": 0.2291, "odds_ratio_loss": 0.0780850425362587, "rewards/accuracies": 1.0, "rewards/chosen": -0.03504663333296776, "rewards/margins": 0.3576422929763794, "rewards/rejected": -0.39268895983695984, "sft_loss": 0.35046637058258057, "step": 2447 }, { "epoch": 3.540130151843818, "grad_norm": 2.058424958820097, "learning_rate": 3.0012700207600974e-06, "logits/chosen": -0.6533050537109375, "logits/rejected": -0.45995908975601196, "logps/chosen": -0.1486455202102661, "logps/rejected": -4.053260803222656, "loss": 0.2256, "odds_ratio_loss": 0.022504646331071854, "rewards/accuracies": 1.0, "rewards/chosen": -0.014864552766084671, "rewards/margins": 0.3904615640640259, "rewards/rejected": -0.40532612800598145, "sft_loss": 0.1486455202102661, "step": 2448 }, { "epoch": 3.541576283441793, "grad_norm": 2.267037713125268, "learning_rate": 2.9982628170609223e-06, "logits/chosen": -0.6241118907928467, "logits/rejected": -0.5395658612251282, "logps/chosen": -0.16177615523338318, "logps/rejected": -3.594677448272705, "loss": 0.1644, "odds_ratio_loss": 0.03948385640978813, "rewards/accuracies": 1.0, "rewards/chosen": -0.016177615150809288, "rewards/margins": 0.34329015016555786, "rewards/rejected": -0.3594677448272705, "sft_loss": 0.16177615523338318, "step": 2449 }, { "epoch": 3.5430224150397684, "grad_norm": 2.168659789575291, "learning_rate": 2.9952562173128248e-06, "logits/chosen": -0.47931620478630066, "logits/rejected": -0.3874140977859497, "logps/chosen": -0.21014590561389923, "logps/rejected": -5.508059501647949, "loss": 0.1959, "odds_ratio_loss": 0.0441533587872982, "rewards/accuracies": 1.0, "rewards/chosen": -0.021014589816331863, "rewards/margins": 0.5297913551330566, "rewards/rejected": -0.550805926322937, "sft_loss": 0.21014590561389923, "step": 2450 }, { "epoch": 3.544468546637744, "grad_norm": 2.0176231969282497, "learning_rate": 2.9922502233284973e-06, "logits/chosen": -0.5037535429000854, "logits/rejected": -0.46424558758735657, "logps/chosen": -0.30051517486572266, "logps/rejected": -5.129392147064209, "loss": 0.2652, "odds_ratio_loss": 0.08848903328180313, "rewards/accuracies": 1.0, "rewards/chosen": -0.030051520094275475, "rewards/margins": 0.4828876852989197, "rewards/rejected": -0.5129392147064209, "sft_loss": 0.30051517486572266, "step": 2451 }, { "epoch": 3.5459146782357194, "grad_norm": 2.3169499110429617, "learning_rate": 2.989244836920261e-06, "logits/chosen": -0.33989861607551575, "logits/rejected": -0.3616371154785156, "logps/chosen": -0.19239774346351624, "logps/rejected": -3.9730749130249023, "loss": 0.2066, "odds_ratio_loss": 0.06537087261676788, "rewards/accuracies": 1.0, "rewards/chosen": -0.019239773973822594, "rewards/margins": 0.3780677318572998, "rewards/rejected": -0.39730751514434814, "sft_loss": 0.19239774346351624, "step": 2452 }, { "epoch": 3.5473608098336946, "grad_norm": 2.357450845950666, "learning_rate": 2.986240059900079e-06, "logits/chosen": -0.5030441284179688, "logits/rejected": -0.43161094188690186, "logps/chosen": -0.34148740768432617, "logps/rejected": -4.043811798095703, "loss": 0.2018, "odds_ratio_loss": 0.04510558396577835, "rewards/accuracies": 1.0, "rewards/chosen": -0.03414874151349068, "rewards/margins": 0.3702324628829956, "rewards/rejected": -0.4043811559677124, "sft_loss": 0.34148740768432617, "step": 2453 }, { "epoch": 3.5488069414316703, "grad_norm": 2.1646070564772795, "learning_rate": 2.983235894079539e-06, "logits/chosen": -0.7625619769096375, "logits/rejected": -0.6689075231552124, "logps/chosen": -0.19497406482696533, "logps/rejected": -4.010917663574219, "loss": 0.1559, "odds_ratio_loss": 0.09022661298513412, "rewards/accuracies": 0.9375, "rewards/chosen": -0.019497405737638474, "rewards/margins": 0.3815944194793701, "rewards/rejected": -0.4010918140411377, "sft_loss": 0.19497406482696533, "step": 2454 }, { "epoch": 3.5502530730296455, "grad_norm": 2.153672375639417, "learning_rate": 2.9802323412698666e-06, "logits/chosen": -0.5097517371177673, "logits/rejected": -0.5301701426506042, "logps/chosen": -0.18443165719509125, "logps/rejected": -2.901488780975342, "loss": 0.1948, "odds_ratio_loss": 0.056402601301670074, "rewards/accuracies": 1.0, "rewards/chosen": -0.018443167209625244, "rewards/margins": 0.2717057466506958, "rewards/rejected": -0.29014888405799866, "sft_loss": 0.18443165719509125, "step": 2455 }, { "epoch": 3.5516992046276212, "grad_norm": 2.4930393761949268, "learning_rate": 2.977229403281913e-06, "logits/chosen": -0.5628433227539062, "logits/rejected": -0.46921300888061523, "logps/chosen": -0.2880474030971527, "logps/rejected": -3.0459322929382324, "loss": 0.2311, "odds_ratio_loss": 0.05179532617330551, "rewards/accuracies": 1.0, "rewards/chosen": -0.02880474366247654, "rewards/margins": 0.2757885158061981, "rewards/rejected": -0.3045932650566101, "sft_loss": 0.2880474030971527, "step": 2456 }, { "epoch": 3.5531453362255965, "grad_norm": 1.8099839799147603, "learning_rate": 2.974227081926162e-06, "logits/chosen": -0.6780756711959839, "logits/rejected": -0.5614203810691833, "logps/chosen": -0.22212156653404236, "logps/rejected": -4.356184959411621, "loss": 0.1565, "odds_ratio_loss": 0.056687142699956894, "rewards/accuracies": 1.0, "rewards/chosen": -0.022212158888578415, "rewards/margins": 0.4134063422679901, "rewards/rejected": -0.43561851978302, "sft_loss": 0.22212156653404236, "step": 2457 }, { "epoch": 3.554591467823572, "grad_norm": 2.092408680060082, "learning_rate": 2.9712253790127223e-06, "logits/chosen": -0.5250091552734375, "logits/rejected": -0.37066176533699036, "logps/chosen": -0.15992625057697296, "logps/rejected": -2.9919040203094482, "loss": 0.1848, "odds_ratio_loss": 0.022480811923742294, "rewards/accuracies": 1.0, "rewards/chosen": -0.015992626547813416, "rewards/margins": 0.2831977903842926, "rewards/rejected": -0.2991904020309448, "sft_loss": 0.15992625057697296, "step": 2458 }, { "epoch": 3.5560375994215474, "grad_norm": 2.399018956229861, "learning_rate": 2.968224296351334e-06, "logits/chosen": -0.617534875869751, "logits/rejected": -0.5581540465354919, "logps/chosen": -0.177803635597229, "logps/rejected": -3.2628211975097656, "loss": 0.205, "odds_ratio_loss": 0.039328742772340775, "rewards/accuracies": 1.0, "rewards/chosen": -0.01778036542236805, "rewards/margins": 0.3085017800331116, "rewards/rejected": -0.3262821435928345, "sft_loss": 0.177803635597229, "step": 2459 }, { "epoch": 3.5574837310195226, "grad_norm": 2.1254520024891432, "learning_rate": 2.965223835751361e-06, "logits/chosen": -0.7163950204849243, "logits/rejected": -0.5951937437057495, "logps/chosen": -0.2112230509519577, "logps/rejected": -3.38348388671875, "loss": 0.2085, "odds_ratio_loss": 0.07596226781606674, "rewards/accuracies": 0.9375, "rewards/chosen": -0.02112230658531189, "rewards/margins": 0.3172260820865631, "rewards/rejected": -0.338348388671875, "sft_loss": 0.2112230509519577, "step": 2460 }, { "epoch": 3.5589298626174983, "grad_norm": 2.0093893471895967, "learning_rate": 2.9622239990217896e-06, "logits/chosen": -0.6013926267623901, "logits/rejected": -0.5151457190513611, "logps/chosen": -0.15072676539421082, "logps/rejected": -2.643908739089966, "loss": 0.1547, "odds_ratio_loss": 0.022211087867617607, "rewards/accuracies": 1.0, "rewards/chosen": -0.015072677284479141, "rewards/margins": 0.2493181973695755, "rewards/rejected": -0.26439088582992554, "sft_loss": 0.15072676539421082, "step": 2461 }, { "epoch": 3.5603759942154736, "grad_norm": 2.2035437107137046, "learning_rate": 2.9592247879712357e-06, "logits/chosen": -0.3561607599258423, "logits/rejected": -0.36101406812667847, "logps/chosen": -0.2586825489997864, "logps/rejected": -3.189708709716797, "loss": 0.2256, "odds_ratio_loss": 0.0867890790104866, "rewards/accuracies": 1.0, "rewards/chosen": -0.025868259370326996, "rewards/margins": 0.2931026220321655, "rewards/rejected": -0.31897085905075073, "sft_loss": 0.2586825489997864, "step": 2462 }, { "epoch": 3.561822125813449, "grad_norm": 2.054047426793152, "learning_rate": 2.956226204407933e-06, "logits/chosen": -0.5213802456855774, "logits/rejected": -0.47224000096321106, "logps/chosen": -0.14102500677108765, "logps/rejected": -4.169181823730469, "loss": 0.1955, "odds_ratio_loss": 0.0326719805598259, "rewards/accuracies": 1.0, "rewards/chosen": -0.014102501794695854, "rewards/margins": 0.40281563997268677, "rewards/rejected": -0.41691815853118896, "sft_loss": 0.14102500677108765, "step": 2463 }, { "epoch": 3.5632682574114245, "grad_norm": 2.0041196821551353, "learning_rate": 2.95322825013974e-06, "logits/chosen": -0.5516745448112488, "logits/rejected": -0.41874760389328003, "logps/chosen": -0.1619918942451477, "logps/rejected": -3.6532042026519775, "loss": 0.1731, "odds_ratio_loss": 0.028985023498535156, "rewards/accuracies": 1.0, "rewards/chosen": -0.01619919016957283, "rewards/margins": 0.34912124276161194, "rewards/rejected": -0.3653204143047333, "sft_loss": 0.1619918942451477, "step": 2464 }, { "epoch": 3.5647143890093997, "grad_norm": 1.9715716167615243, "learning_rate": 2.9502309269741314e-06, "logits/chosen": -0.4302927255630493, "logits/rejected": -0.3220667541027069, "logps/chosen": -0.12735146284103394, "logps/rejected": -3.6478960514068604, "loss": 0.199, "odds_ratio_loss": 0.020095938816666603, "rewards/accuracies": 1.0, "rewards/chosen": -0.012735145166516304, "rewards/margins": 0.3520544767379761, "rewards/rejected": -0.3647896349430084, "sft_loss": 0.12735146284103394, "step": 2465 }, { "epoch": 3.5661605206073754, "grad_norm": 2.1032834327668506, "learning_rate": 2.9472342367182086e-06, "logits/chosen": -0.5896605253219604, "logits/rejected": -0.5629295706748962, "logps/chosen": -0.331041157245636, "logps/rejected": -3.1009109020233154, "loss": 0.2487, "odds_ratio_loss": 0.05582534521818161, "rewards/accuracies": 1.0, "rewards/chosen": -0.03310411795973778, "rewards/margins": 0.2769869863986969, "rewards/rejected": -0.3100910782814026, "sft_loss": 0.331041157245636, "step": 2466 }, { "epoch": 3.5676066522053507, "grad_norm": 2.2201609739812924, "learning_rate": 2.9442381811786846e-06, "logits/chosen": -0.5560648441314697, "logits/rejected": -0.46408846974372864, "logps/chosen": -0.1172012984752655, "logps/rejected": -3.779165267944336, "loss": 0.167, "odds_ratio_loss": 0.026231221854686737, "rewards/accuracies": 1.0, "rewards/chosen": -0.011720129288733006, "rewards/margins": 0.3661963939666748, "rewards/rejected": -0.37791651487350464, "sft_loss": 0.1172012984752655, "step": 2467 }, { "epoch": 3.5690527838033264, "grad_norm": 2.125829017437066, "learning_rate": 2.9412427621618936e-06, "logits/chosen": -0.569636344909668, "logits/rejected": -0.482186883687973, "logps/chosen": -0.18977811932563782, "logps/rejected": -6.578741550445557, "loss": 0.1958, "odds_ratio_loss": 0.03151582553982735, "rewards/accuracies": 1.0, "rewards/chosen": -0.0189778134226799, "rewards/margins": 0.6388963460922241, "rewards/rejected": -0.6578741669654846, "sft_loss": 0.18977811932563782, "step": 2468 }, { "epoch": 3.5704989154013016, "grad_norm": 2.2044030860237367, "learning_rate": 2.9382479814737836e-06, "logits/chosen": -0.41000592708587646, "logits/rejected": -0.410857617855072, "logps/chosen": -0.3284101188182831, "logps/rejected": -3.962172031402588, "loss": 0.2546, "odds_ratio_loss": 0.05446305871009827, "rewards/accuracies": 1.0, "rewards/chosen": -0.03284101188182831, "rewards/margins": 0.3633762001991272, "rewards/rejected": -0.3962172269821167, "sft_loss": 0.3284101188182831, "step": 2469 }, { "epoch": 3.571945046999277, "grad_norm": 1.9361635852730106, "learning_rate": 2.9352538409199213e-06, "logits/chosen": -0.4168761670589447, "logits/rejected": -0.44787657260894775, "logps/chosen": -0.22248664498329163, "logps/rejected": -3.7818639278411865, "loss": 0.1994, "odds_ratio_loss": 0.03490440174937248, "rewards/accuracies": 1.0, "rewards/chosen": -0.02224866673350334, "rewards/margins": 0.3559377193450928, "rewards/rejected": -0.3781863749027252, "sft_loss": 0.22248664498329163, "step": 2470 }, { "epoch": 3.5733911785972525, "grad_norm": 2.078403301829724, "learning_rate": 2.9322603423054826e-06, "logits/chosen": -0.7230794429779053, "logits/rejected": -0.5729874968528748, "logps/chosen": -0.22554948925971985, "logps/rejected": -3.4284701347351074, "loss": 0.1535, "odds_ratio_loss": 0.04831676930189133, "rewards/accuracies": 1.0, "rewards/chosen": -0.022554948925971985, "rewards/margins": 0.32029205560684204, "rewards/rejected": -0.34284698963165283, "sft_loss": 0.22554948925971985, "step": 2471 }, { "epoch": 3.5748373101952278, "grad_norm": 1.9481832175668146, "learning_rate": 2.92926748743526e-06, "logits/chosen": -0.5367501378059387, "logits/rejected": -0.5176258683204651, "logps/chosen": -0.2013862282037735, "logps/rejected": -3.327510118484497, "loss": 0.1885, "odds_ratio_loss": 0.028387073427438736, "rewards/accuracies": 1.0, "rewards/chosen": -0.02013862319290638, "rewards/margins": 0.3126124143600464, "rewards/rejected": -0.3327510356903076, "sft_loss": 0.2013862282037735, "step": 2472 }, { "epoch": 3.576283441793203, "grad_norm": 2.298558661220886, "learning_rate": 2.9262752781136584e-06, "logits/chosen": -0.5828816890716553, "logits/rejected": -0.4730074405670166, "logps/chosen": -0.14373230934143066, "logps/rejected": -2.983417510986328, "loss": 0.1655, "odds_ratio_loss": 0.020958803594112396, "rewards/accuracies": 1.0, "rewards/chosen": -0.014373233541846275, "rewards/margins": 0.2839685082435608, "rewards/rejected": -0.2983417510986328, "sft_loss": 0.14373230934143066, "step": 2473 }, { "epoch": 3.5777295733911787, "grad_norm": 2.2309893965679914, "learning_rate": 2.92328371614469e-06, "logits/chosen": -0.4653478264808655, "logits/rejected": -0.4630056619644165, "logps/chosen": -0.1631046086549759, "logps/rejected": -4.455272674560547, "loss": 0.165, "odds_ratio_loss": 0.04333974048495293, "rewards/accuracies": 1.0, "rewards/chosen": -0.01631046086549759, "rewards/margins": 0.42921683192253113, "rewards/rejected": -0.44552725553512573, "sft_loss": 0.1631046086549759, "step": 2474 }, { "epoch": 3.579175704989154, "grad_norm": 1.9881278080353881, "learning_rate": 2.9202928033319802e-06, "logits/chosen": -0.486683189868927, "logits/rejected": -0.4148131012916565, "logps/chosen": -0.23954859375953674, "logps/rejected": -4.019528388977051, "loss": 0.2333, "odds_ratio_loss": 0.03935012221336365, "rewards/accuracies": 1.0, "rewards/chosen": -0.023954860866069794, "rewards/margins": 0.3779979944229126, "rewards/rejected": -0.4019528329372406, "sft_loss": 0.23954859375953674, "step": 2475 }, { "epoch": 3.580621836587129, "grad_norm": 1.8702974706037494, "learning_rate": 2.917302541478759e-06, "logits/chosen": -0.6687430143356323, "logits/rejected": -0.4045840799808502, "logps/chosen": -0.10321912914514542, "logps/rejected": -4.325726509094238, "loss": 0.1571, "odds_ratio_loss": 0.013897450640797615, "rewards/accuracies": 1.0, "rewards/chosen": -0.010321913287043571, "rewards/margins": 0.42225074768066406, "rewards/rejected": -0.4325726628303528, "sft_loss": 0.10321912914514542, "step": 2476 }, { "epoch": 3.582067968185105, "grad_norm": 2.2813330190561665, "learning_rate": 2.9143129323878688e-06, "logits/chosen": -0.36619487404823303, "logits/rejected": -0.3331270217895508, "logps/chosen": -0.19207827746868134, "logps/rejected": -4.304910659790039, "loss": 0.1878, "odds_ratio_loss": 0.022445132955908775, "rewards/accuracies": 1.0, "rewards/chosen": -0.019207827746868134, "rewards/margins": 0.4112831950187683, "rewards/rejected": -0.4304910898208618, "sft_loss": 0.19207827746868134, "step": 2477 }, { "epoch": 3.58351409978308, "grad_norm": 2.1589894888217187, "learning_rate": 2.911323977861755e-06, "logits/chosen": -0.5841609239578247, "logits/rejected": -0.5109463334083557, "logps/chosen": -0.11602534353733063, "logps/rejected": -3.624415874481201, "loss": 0.1854, "odds_ratio_loss": 0.032501354813575745, "rewards/accuracies": 1.0, "rewards/chosen": -0.011602534912526608, "rewards/margins": 0.3508390784263611, "rewards/rejected": -0.3624415993690491, "sft_loss": 0.11602534353733063, "step": 2478 }, { "epoch": 3.584960231381056, "grad_norm": 1.8654817230355145, "learning_rate": 2.9083356797024704e-06, "logits/chosen": -0.652426540851593, "logits/rejected": -0.609657347202301, "logps/chosen": -0.3062911033630371, "logps/rejected": -3.556048631668091, "loss": 0.1912, "odds_ratio_loss": 0.04985497146844864, "rewards/accuracies": 1.0, "rewards/chosen": -0.03062911331653595, "rewards/margins": 0.32497575879096985, "rewards/rejected": -0.3556048572063446, "sft_loss": 0.3062911033630371, "step": 2479 }, { "epoch": 3.586406362979031, "grad_norm": 2.3939535042376523, "learning_rate": 2.9053480397116684e-06, "logits/chosen": -0.4469086229801178, "logits/rejected": -0.3033689856529236, "logps/chosen": -0.14823874831199646, "logps/rejected": -3.7784619331359863, "loss": 0.1764, "odds_ratio_loss": 0.04784733057022095, "rewards/accuracies": 1.0, "rewards/chosen": -0.01482387538999319, "rewards/margins": 0.3630222976207733, "rewards/rejected": -0.3778461813926697, "sft_loss": 0.14823874831199646, "step": 2480 }, { "epoch": 3.5878524945770067, "grad_norm": 3.091688427595171, "learning_rate": 2.902361059690612e-06, "logits/chosen": -0.5378003716468811, "logits/rejected": -0.3790894150733948, "logps/chosen": -0.21176090836524963, "logps/rejected": -4.0171709060668945, "loss": 0.2286, "odds_ratio_loss": 0.05296362191438675, "rewards/accuracies": 1.0, "rewards/chosen": -0.021176090463995934, "rewards/margins": 0.3805410861968994, "rewards/rejected": -0.4017171561717987, "sft_loss": 0.21176090836524963, "step": 2481 }, { "epoch": 3.589298626174982, "grad_norm": 1.936186621813689, "learning_rate": 2.8993747414401597e-06, "logits/chosen": -0.487335741519928, "logits/rejected": -0.40547001361846924, "logps/chosen": -0.30852556228637695, "logps/rejected": -3.569243907928467, "loss": 0.1835, "odds_ratio_loss": 0.05021437630057335, "rewards/accuracies": 1.0, "rewards/chosen": -0.030852556228637695, "rewards/margins": 0.3260718584060669, "rewards/rejected": -0.3569244146347046, "sft_loss": 0.30852556228637695, "step": 2482 }, { "epoch": 3.590744757772957, "grad_norm": 1.8409295208559, "learning_rate": 2.8963890867607757e-06, "logits/chosen": -0.6406811475753784, "logits/rejected": -0.5087481737136841, "logps/chosen": -0.23279646039009094, "logps/rejected": -3.4700019359588623, "loss": 0.1946, "odds_ratio_loss": 0.03444098308682442, "rewards/accuracies": 1.0, "rewards/chosen": -0.023279648274183273, "rewards/margins": 0.3237205743789673, "rewards/rejected": -0.3470001816749573, "sft_loss": 0.23279646039009094, "step": 2483 }, { "epoch": 3.592190889370933, "grad_norm": 1.8898837799392554, "learning_rate": 2.8934040974525206e-06, "logits/chosen": -0.5216646790504456, "logits/rejected": -0.5978440046310425, "logps/chosen": -0.17544230818748474, "logps/rejected": -4.052137851715088, "loss": 0.1655, "odds_ratio_loss": 0.03247702866792679, "rewards/accuracies": 1.0, "rewards/chosen": -0.017544232308864594, "rewards/margins": 0.38766956329345703, "rewards/rejected": -0.4052138030529022, "sft_loss": 0.17544230818748474, "step": 2484 }, { "epoch": 3.593637020968908, "grad_norm": 2.187248762834126, "learning_rate": 2.890419775315057e-06, "logits/chosen": -0.6240389347076416, "logits/rejected": -0.5031851530075073, "logps/chosen": -0.13512706756591797, "logps/rejected": -3.0444791316986084, "loss": 0.2024, "odds_ratio_loss": 0.0328708216547966, "rewards/accuracies": 1.0, "rewards/chosen": -0.013512706384062767, "rewards/margins": 0.2909351885318756, "rewards/rejected": -0.30444788932800293, "sft_loss": 0.13512706756591797, "step": 2485 }, { "epoch": 3.5950831525668834, "grad_norm": 2.204519299865527, "learning_rate": 2.887436122147644e-06, "logits/chosen": -0.4692801535129547, "logits/rejected": -0.44944295287132263, "logps/chosen": -0.33308011293411255, "logps/rejected": -3.2959773540496826, "loss": 0.2273, "odds_ratio_loss": 0.08923923969268799, "rewards/accuracies": 1.0, "rewards/chosen": -0.033308010548353195, "rewards/margins": 0.29628971219062805, "rewards/rejected": -0.32959771156311035, "sft_loss": 0.33308011293411255, "step": 2486 }, { "epoch": 3.596529284164859, "grad_norm": 2.228837722865733, "learning_rate": 2.884453139749135e-06, "logits/chosen": -0.5567612051963806, "logits/rejected": -0.4138484597206116, "logps/chosen": -0.15191403031349182, "logps/rejected": -5.067747116088867, "loss": 0.1797, "odds_ratio_loss": 0.020546402782201767, "rewards/accuracies": 1.0, "rewards/chosen": -0.015191404148936272, "rewards/margins": 0.49158334732055664, "rewards/rejected": -0.5067747831344604, "sft_loss": 0.15191403031349182, "step": 2487 }, { "epoch": 3.5979754157628343, "grad_norm": 2.245458739312704, "learning_rate": 2.881470829917984e-06, "logits/chosen": -0.5933226346969604, "logits/rejected": -0.567808985710144, "logps/chosen": -0.255230188369751, "logps/rejected": -3.186595916748047, "loss": 0.2138, "odds_ratio_loss": 0.07271232455968857, "rewards/accuracies": 1.0, "rewards/chosen": -0.02552301622927189, "rewards/margins": 0.2931365668773651, "rewards/rejected": -0.31865960359573364, "sft_loss": 0.255230188369751, "step": 2488 }, { "epoch": 3.59942154736081, "grad_norm": 1.9277082887715968, "learning_rate": 2.8784891944522356e-06, "logits/chosen": -0.5661904215812683, "logits/rejected": -0.4054012596607208, "logps/chosen": -0.2036726176738739, "logps/rejected": -4.150453567504883, "loss": 0.1813, "odds_ratio_loss": 0.03672172129154205, "rewards/accuracies": 1.0, "rewards/chosen": -0.02036726474761963, "rewards/margins": 0.3946780562400818, "rewards/rejected": -0.4150453805923462, "sft_loss": 0.2036726176738739, "step": 2489 }, { "epoch": 3.6008676789587852, "grad_norm": 2.086005099661083, "learning_rate": 2.875508235149529e-06, "logits/chosen": -0.3694448471069336, "logits/rejected": -0.33086222410202026, "logps/chosen": -0.23983649909496307, "logps/rejected": -4.911922931671143, "loss": 0.1822, "odds_ratio_loss": 0.07498527318239212, "rewards/accuracies": 0.9375, "rewards/chosen": -0.023983649909496307, "rewards/margins": 0.4672086834907532, "rewards/rejected": -0.4911923110485077, "sft_loss": 0.23983649909496307, "step": 2490 }, { "epoch": 3.602313810556761, "grad_norm": 2.3194389065513, "learning_rate": 2.872527953807094e-06, "logits/chosen": -0.6120354533195496, "logits/rejected": -0.3980521261692047, "logps/chosen": -0.21247637271881104, "logps/rejected": -3.412698268890381, "loss": 0.1507, "odds_ratio_loss": 0.04428691789507866, "rewards/accuracies": 1.0, "rewards/chosen": -0.021247640252113342, "rewards/margins": 0.3200221061706543, "rewards/rejected": -0.34126976132392883, "sft_loss": 0.21247637271881104, "step": 2491 }, { "epoch": 3.603759942154736, "grad_norm": 2.2912130950801246, "learning_rate": 2.869548352221757e-06, "logits/chosen": -0.5996018648147583, "logits/rejected": -0.38249221444129944, "logps/chosen": -0.14581993222236633, "logps/rejected": -4.9018731117248535, "loss": 0.1737, "odds_ratio_loss": 0.023255767300724983, "rewards/accuracies": 1.0, "rewards/chosen": -0.014581995084881783, "rewards/margins": 0.4756053388118744, "rewards/rejected": -0.4901873469352722, "sft_loss": 0.14581993222236633, "step": 2492 }, { "epoch": 3.6052060737527114, "grad_norm": 2.3023457380266303, "learning_rate": 2.8665694321899275e-06, "logits/chosen": -0.5616737604141235, "logits/rejected": -0.36620962619781494, "logps/chosen": -0.09285837411880493, "logps/rejected": -4.530645847320557, "loss": 0.2018, "odds_ratio_loss": 0.016189442947506905, "rewards/accuracies": 1.0, "rewards/chosen": -0.009285837411880493, "rewards/margins": 0.44377875328063965, "rewards/rejected": -0.45306459069252014, "sft_loss": 0.09285837411880493, "step": 2493 }, { "epoch": 3.606652205350687, "grad_norm": 1.8296179592059094, "learning_rate": 2.863591195507609e-06, "logits/chosen": -0.43233737349510193, "logits/rejected": -0.42928576469421387, "logps/chosen": -0.1452055722475052, "logps/rejected": -4.242135524749756, "loss": 0.1593, "odds_ratio_loss": 0.04257887601852417, "rewards/accuracies": 1.0, "rewards/chosen": -0.014520557597279549, "rewards/margins": 0.40969300270080566, "rewards/rejected": -0.4242135286331177, "sft_loss": 0.1452055722475052, "step": 2494 }, { "epoch": 3.6080983369486623, "grad_norm": 1.8562477051616662, "learning_rate": 2.86061364397039e-06, "logits/chosen": -0.468068391084671, "logits/rejected": -0.40226829051971436, "logps/chosen": -0.14415565133094788, "logps/rejected": -3.4453213214874268, "loss": 0.1654, "odds_ratio_loss": 0.024850863963365555, "rewards/accuracies": 1.0, "rewards/chosen": -0.014415565878152847, "rewards/margins": 0.3301165699958801, "rewards/rejected": -0.34453216195106506, "sft_loss": 0.14415565133094788, "step": 2495 }, { "epoch": 3.6095444685466376, "grad_norm": 2.0000474708923717, "learning_rate": 2.8576367793734506e-06, "logits/chosen": -0.297725111246109, "logits/rejected": -0.36155325174331665, "logps/chosen": -0.20408833026885986, "logps/rejected": -3.36604905128479, "loss": 0.2305, "odds_ratio_loss": 0.037189140915870667, "rewards/accuracies": 1.0, "rewards/chosen": -0.020408835262060165, "rewards/margins": 0.316196084022522, "rewards/rejected": -0.33660489320755005, "sft_loss": 0.20408833026885986, "step": 2496 }, { "epoch": 3.6109906001446133, "grad_norm": 1.9603399607152372, "learning_rate": 2.8546606035115498e-06, "logits/chosen": -0.6297708749771118, "logits/rejected": -0.5421985387802124, "logps/chosen": -0.3003847301006317, "logps/rejected": -2.903427839279175, "loss": 0.2134, "odds_ratio_loss": 0.09296062588691711, "rewards/accuracies": 1.0, "rewards/chosen": -0.03003847599029541, "rewards/margins": 0.26030433177948, "rewards/rejected": -0.2903428077697754, "sft_loss": 0.3003847301006317, "step": 2497 }, { "epoch": 3.6124367317425885, "grad_norm": 1.8908173660949101, "learning_rate": 2.8516851181790384e-06, "logits/chosen": -0.4586806297302246, "logits/rejected": -0.3453059494495392, "logps/chosen": -0.07760489732027054, "logps/rejected": -4.848832607269287, "loss": 0.1691, "odds_ratio_loss": 0.03154202550649643, "rewards/accuracies": 1.0, "rewards/chosen": -0.007760489825159311, "rewards/margins": 0.4771227538585663, "rewards/rejected": -0.48488324880599976, "sft_loss": 0.07760489732027054, "step": 2498 }, { "epoch": 3.6138828633405637, "grad_norm": 1.9129309499246139, "learning_rate": 2.848710325169844e-06, "logits/chosen": -0.5357916951179504, "logits/rejected": -0.47928518056869507, "logps/chosen": -0.09794984012842178, "logps/rejected": -5.796481609344482, "loss": 0.1743, "odds_ratio_loss": 0.015042275190353394, "rewards/accuracies": 1.0, "rewards/chosen": -0.009794985875487328, "rewards/margins": 0.5698531866073608, "rewards/rejected": -0.5796481966972351, "sft_loss": 0.09794984012842178, "step": 2499 }, { "epoch": 3.6153289949385394, "grad_norm": 3.275069979062189, "learning_rate": 2.8457362262774825e-06, "logits/chosen": -0.3791963458061218, "logits/rejected": -0.27261611819267273, "logps/chosen": -0.1412392556667328, "logps/rejected": -6.682195663452148, "loss": 0.1799, "odds_ratio_loss": 0.036368854343891144, "rewards/accuracies": 1.0, "rewards/chosen": -0.014123925939202309, "rewards/margins": 0.6540957093238831, "rewards/rejected": -0.6682195663452148, "sft_loss": 0.1412392556667328, "step": 2500 }, { "epoch": 3.6167751265365147, "grad_norm": 1.9873717801084936, "learning_rate": 2.8427628232950504e-06, "logits/chosen": -0.5378691554069519, "logits/rejected": -0.48928403854370117, "logps/chosen": -0.1797875463962555, "logps/rejected": -3.5523314476013184, "loss": 0.1742, "odds_ratio_loss": 0.04132210463285446, "rewards/accuracies": 1.0, "rewards/chosen": -0.01797875389456749, "rewards/margins": 0.3372544050216675, "rewards/rejected": -0.35523316264152527, "sft_loss": 0.1797875463962555, "step": 2501 }, { "epoch": 3.6182212581344904, "grad_norm": 1.8980601051441697, "learning_rate": 2.8397901180152223e-06, "logits/chosen": -0.5706363916397095, "logits/rejected": -0.407425194978714, "logps/chosen": -0.15050289034843445, "logps/rejected": -3.6332499980926514, "loss": 0.1993, "odds_ratio_loss": 0.042995937168598175, "rewards/accuracies": 1.0, "rewards/chosen": -0.01505028922110796, "rewards/margins": 0.34827476739883423, "rewards/rejected": -0.36332499980926514, "sft_loss": 0.15050289034843445, "step": 2502 }, { "epoch": 3.6196673897324656, "grad_norm": 2.118084164458217, "learning_rate": 2.8368181122302525e-06, "logits/chosen": -0.3179455101490021, "logits/rejected": -0.25522419810295105, "logps/chosen": -0.1424923986196518, "logps/rejected": -5.086899280548096, "loss": 0.1693, "odds_ratio_loss": 0.03211604803800583, "rewards/accuracies": 1.0, "rewards/chosen": -0.014249240048229694, "rewards/margins": 0.4944407045841217, "rewards/rejected": -0.5086899399757385, "sft_loss": 0.1424923986196518, "step": 2503 }, { "epoch": 3.6211135213304413, "grad_norm": 2.6548176127386793, "learning_rate": 2.833846807731975e-06, "logits/chosen": -0.4786378741264343, "logits/rejected": -0.43536630272865295, "logps/chosen": -0.0729706808924675, "logps/rejected": -3.080990791320801, "loss": 0.1645, "odds_ratio_loss": 0.021972548216581345, "rewards/accuracies": 1.0, "rewards/chosen": -0.00729706883430481, "rewards/margins": 0.30080199241638184, "rewards/rejected": -0.30809906125068665, "sft_loss": 0.0729706808924675, "step": 2504 }, { "epoch": 3.6225596529284165, "grad_norm": 2.5135474712614045, "learning_rate": 2.8308762063118006e-06, "logits/chosen": -0.4966127574443817, "logits/rejected": -0.5303236842155457, "logps/chosen": -0.17419995367527008, "logps/rejected": -3.1192662715911865, "loss": 0.1724, "odds_ratio_loss": 0.04412810876965523, "rewards/accuracies": 1.0, "rewards/chosen": -0.01741999387741089, "rewards/margins": 0.29450663924217224, "rewards/rejected": -0.31192663311958313, "sft_loss": 0.17419995367527008, "step": 2505 }, { "epoch": 3.6240057845263918, "grad_norm": 2.1262943897743747, "learning_rate": 2.8279063097607156e-06, "logits/chosen": -0.6062009334564209, "logits/rejected": -0.5596913695335388, "logps/chosen": -0.1444343477487564, "logps/rejected": -2.005614757537842, "loss": 0.1655, "odds_ratio_loss": 0.03216197341680527, "rewards/accuracies": 1.0, "rewards/chosen": -0.014443434774875641, "rewards/margins": 0.18611805140972137, "rewards/rejected": -0.20056146383285522, "sft_loss": 0.1444343477487564, "step": 2506 }, { "epoch": 3.6254519161243675, "grad_norm": 2.086089991067271, "learning_rate": 2.8249371198692827e-06, "logits/chosen": -0.6401376724243164, "logits/rejected": -0.5771905183792114, "logps/chosen": -0.13659507036209106, "logps/rejected": -4.549966812133789, "loss": 0.1906, "odds_ratio_loss": 0.015896422788500786, "rewards/accuracies": 1.0, "rewards/chosen": -0.013659507036209106, "rewards/margins": 0.4413371682167053, "rewards/rejected": -0.45499664545059204, "sft_loss": 0.13659507036209106, "step": 2507 }, { "epoch": 3.6268980477223427, "grad_norm": 2.169001436836437, "learning_rate": 2.821968638427634e-06, "logits/chosen": -0.8902711868286133, "logits/rejected": -0.500067412853241, "logps/chosen": -0.11250185966491699, "logps/rejected": -3.8250503540039062, "loss": 0.162, "odds_ratio_loss": 0.036406371742486954, "rewards/accuracies": 1.0, "rewards/chosen": -0.011250186711549759, "rewards/margins": 0.3712548613548279, "rewards/rejected": -0.38250505924224854, "sft_loss": 0.11250185966491699, "step": 2508 }, { "epoch": 3.628344179320318, "grad_norm": 13.832662348150325, "learning_rate": 2.8190008672254835e-06, "logits/chosen": -0.7249534726142883, "logits/rejected": -0.6150829792022705, "logps/chosen": -0.13645420968532562, "logps/rejected": -2.8476240634918213, "loss": 0.1885, "odds_ratio_loss": 0.03549912944436073, "rewards/accuracies": 1.0, "rewards/chosen": -0.013645419850945473, "rewards/margins": 0.2711169719696045, "rewards/rejected": -0.2847624123096466, "sft_loss": 0.13645420968532562, "step": 2509 }, { "epoch": 3.6297903109182936, "grad_norm": 2.2782044948197258, "learning_rate": 2.8160338080521074e-06, "logits/chosen": -0.5967862606048584, "logits/rejected": -0.6156792640686035, "logps/chosen": -0.15163278579711914, "logps/rejected": -3.7114038467407227, "loss": 0.177, "odds_ratio_loss": 0.03112233430147171, "rewards/accuracies": 1.0, "rewards/chosen": -0.015163278207182884, "rewards/margins": 0.3559771180152893, "rewards/rejected": -0.37114039063453674, "sft_loss": 0.15163278579711914, "step": 2510 }, { "epoch": 3.631236442516269, "grad_norm": 2.1207273904138946, "learning_rate": 2.8130674626963586e-06, "logits/chosen": -0.5536920428276062, "logits/rejected": -0.49420982599258423, "logps/chosen": -0.18125204741954803, "logps/rejected": -2.170628070831299, "loss": 0.1629, "odds_ratio_loss": 0.03468827158212662, "rewards/accuracies": 1.0, "rewards/chosen": -0.018125206232070923, "rewards/margins": 0.19893759489059448, "rewards/rejected": -0.2170628309249878, "sft_loss": 0.18125204741954803, "step": 2511 }, { "epoch": 3.6326825741142446, "grad_norm": 2.087165380567483, "learning_rate": 2.8101018329466557e-06, "logits/chosen": -0.529472827911377, "logits/rejected": -0.3276049494743347, "logps/chosen": -0.2205849587917328, "logps/rejected": -5.405893802642822, "loss": 0.1696, "odds_ratio_loss": 0.04581043869256973, "rewards/accuracies": 1.0, "rewards/chosen": -0.02205849625170231, "rewards/margins": 0.5185309052467346, "rewards/rejected": -0.5405893921852112, "sft_loss": 0.2205849587917328, "step": 2512 }, { "epoch": 3.63412870571222, "grad_norm": 4.15655797636779, "learning_rate": 2.8071369205909904e-06, "logits/chosen": -0.5813817381858826, "logits/rejected": -0.5162385702133179, "logps/chosen": -0.16202031075954437, "logps/rejected": -4.27128791809082, "loss": 0.1493, "odds_ratio_loss": 0.04542159289121628, "rewards/accuracies": 1.0, "rewards/chosen": -0.016202032566070557, "rewards/margins": 0.4109267294406891, "rewards/rejected": -0.42712879180908203, "sft_loss": 0.16202031075954437, "step": 2513 }, { "epoch": 3.6355748373101955, "grad_norm": 2.1316330685140072, "learning_rate": 2.804172727416919e-06, "logits/chosen": -0.566758394241333, "logits/rejected": -0.4026070237159729, "logps/chosen": -0.2781253159046173, "logps/rejected": -4.5679779052734375, "loss": 0.2684, "odds_ratio_loss": 0.0528629794716835, "rewards/accuracies": 1.0, "rewards/chosen": -0.02781253308057785, "rewards/margins": 0.42898523807525635, "rewards/rejected": -0.4567977786064148, "sft_loss": 0.2781253159046173, "step": 2514 }, { "epoch": 3.6370209689081707, "grad_norm": 2.2518564789198323, "learning_rate": 2.801209255211562e-06, "logits/chosen": -0.5290085077285767, "logits/rejected": -0.4111337661743164, "logps/chosen": -0.15597251057624817, "logps/rejected": -3.711620807647705, "loss": 0.2034, "odds_ratio_loss": 0.033812057226896286, "rewards/accuracies": 1.0, "rewards/chosen": -0.015597251243889332, "rewards/margins": 0.35556483268737793, "rewards/rejected": -0.3711620569229126, "sft_loss": 0.15597251057624817, "step": 2515 }, { "epoch": 3.638467100506146, "grad_norm": 1.9427894630685738, "learning_rate": 2.798246505761612e-06, "logits/chosen": -0.580420732498169, "logits/rejected": -0.45197558403015137, "logps/chosen": -0.1896083801984787, "logps/rejected": -5.463057518005371, "loss": 0.1589, "odds_ratio_loss": 0.029428288340568542, "rewards/accuracies": 1.0, "rewards/chosen": -0.01896083913743496, "rewards/margins": 0.5273449420928955, "rewards/rejected": -0.546305775642395, "sft_loss": 0.1896083801984787, "step": 2516 }, { "epoch": 3.6399132321041217, "grad_norm": 1.704300930390487, "learning_rate": 2.7952844808533185e-06, "logits/chosen": -0.5066642165184021, "logits/rejected": -0.4148600101470947, "logps/chosen": -0.13526996970176697, "logps/rejected": -5.089852333068848, "loss": 0.1713, "odds_ratio_loss": 0.030814705416560173, "rewards/accuracies": 1.0, "rewards/chosen": -0.013526996597647667, "rewards/margins": 0.4954582452774048, "rewards/rejected": -0.5089852213859558, "sft_loss": 0.13526996970176697, "step": 2517 }, { "epoch": 3.641359363702097, "grad_norm": 1.8499325363659644, "learning_rate": 2.792323182272499e-06, "logits/chosen": -0.46863168478012085, "logits/rejected": -0.4521971642971039, "logps/chosen": -0.13161855936050415, "logps/rejected": -3.217064380645752, "loss": 0.1664, "odds_ratio_loss": 0.03563810884952545, "rewards/accuracies": 1.0, "rewards/chosen": -0.0131618557497859, "rewards/margins": 0.3085446059703827, "rewards/rejected": -0.32170647382736206, "sft_loss": 0.13161855936050415, "step": 2518 }, { "epoch": 3.642805495300072, "grad_norm": 2.025316530793158, "learning_rate": 2.789362611804529e-06, "logits/chosen": -0.49984610080718994, "logits/rejected": -0.515692949295044, "logps/chosen": -0.22088411450386047, "logps/rejected": -4.2113823890686035, "loss": 0.2018, "odds_ratio_loss": 0.0324944369494915, "rewards/accuracies": 1.0, "rewards/chosen": -0.022088412195444107, "rewards/margins": 0.3990498185157776, "rewards/rejected": -0.4211382269859314, "sft_loss": 0.22088411450386047, "step": 2519 }, { "epoch": 3.644251626898048, "grad_norm": 1.976339663757813, "learning_rate": 2.7864027712343513e-06, "logits/chosen": -0.45987632870674133, "logits/rejected": -0.3473128378391266, "logps/chosen": -0.10946309566497803, "logps/rejected": -4.030787944793701, "loss": 0.1518, "odds_ratio_loss": 0.04007424786686897, "rewards/accuracies": 1.0, "rewards/chosen": -0.010946309193968773, "rewards/margins": 0.3921324908733368, "rewards/rejected": -0.4030787944793701, "sft_loss": 0.10946309566497803, "step": 2520 }, { "epoch": 3.645697758496023, "grad_norm": 2.0676256585095576, "learning_rate": 2.7834436623464616e-06, "logits/chosen": -0.6962200403213501, "logits/rejected": -0.5652716755867004, "logps/chosen": -0.20560969412326813, "logps/rejected": -4.473116397857666, "loss": 0.2235, "odds_ratio_loss": 0.03834813833236694, "rewards/accuracies": 1.0, "rewards/chosen": -0.020560970529913902, "rewards/margins": 0.42675071954727173, "rewards/rejected": -0.447311669588089, "sft_loss": 0.20560969412326813, "step": 2521 }, { "epoch": 3.6471438900939983, "grad_norm": 1.9786619388491158, "learning_rate": 2.78048528692492e-06, "logits/chosen": -0.4101904332637787, "logits/rejected": -0.3749925494194031, "logps/chosen": -0.2814168632030487, "logps/rejected": -2.196606159210205, "loss": 0.1929, "odds_ratio_loss": 0.06532847881317139, "rewards/accuracies": 1.0, "rewards/chosen": -0.0281416866928339, "rewards/margins": 0.19151893258094788, "rewards/rejected": -0.21966060996055603, "sft_loss": 0.2814168632030487, "step": 2522 }, { "epoch": 3.648590021691974, "grad_norm": 1.8923559183457217, "learning_rate": 2.777527646753339e-06, "logits/chosen": -0.533107340335846, "logits/rejected": -0.37154343724250793, "logps/chosen": -0.18305464088916779, "logps/rejected": -3.3793654441833496, "loss": 0.1757, "odds_ratio_loss": 0.03871382027864456, "rewards/accuracies": 1.0, "rewards/chosen": -0.018305467441678047, "rewards/margins": 0.31963109970092773, "rewards/rejected": -0.3379365801811218, "sft_loss": 0.18305464088916779, "step": 2523 }, { "epoch": 3.6500361532899492, "grad_norm": 2.4175380591242486, "learning_rate": 2.774570743614894e-06, "logits/chosen": -0.6140806078910828, "logits/rejected": -0.43565231561660767, "logps/chosen": -0.2680893838405609, "logps/rejected": -3.991342306137085, "loss": 0.2151, "odds_ratio_loss": 0.14778588712215424, "rewards/accuracies": 0.9375, "rewards/chosen": -0.02680893987417221, "rewards/margins": 0.3723253309726715, "rewards/rejected": -0.39913424849510193, "sft_loss": 0.2680893838405609, "step": 2524 }, { "epoch": 3.651482284887925, "grad_norm": 2.0920122323863817, "learning_rate": 2.7716145792923114e-06, "logits/chosen": -0.3725340962409973, "logits/rejected": -0.392286479473114, "logps/chosen": -0.19242461025714874, "logps/rejected": -2.6325159072875977, "loss": 0.2413, "odds_ratio_loss": 0.041729189455509186, "rewards/accuracies": 1.0, "rewards/chosen": -0.019242461770772934, "rewards/margins": 0.2440091371536255, "rewards/rejected": -0.2632516026496887, "sft_loss": 0.19242461025714874, "step": 2525 }, { "epoch": 3.6529284164859, "grad_norm": 1.7977951192621633, "learning_rate": 2.7686591555678725e-06, "logits/chosen": -0.3152431845664978, "logits/rejected": -0.36898574233055115, "logps/chosen": -0.09872251749038696, "logps/rejected": -5.19644021987915, "loss": 0.1324, "odds_ratio_loss": 0.01986844837665558, "rewards/accuracies": 1.0, "rewards/chosen": -0.009872252121567726, "rewards/margins": 0.5097717642784119, "rewards/rejected": -0.519644021987915, "sft_loss": 0.09872251749038696, "step": 2526 }, { "epoch": 3.654374548083876, "grad_norm": 2.1765409277692904, "learning_rate": 2.765704474223417e-06, "logits/chosen": -0.5735154151916504, "logits/rejected": -0.43699342012405396, "logps/chosen": -0.194193035364151, "logps/rejected": -4.180932521820068, "loss": 0.2277, "odds_ratio_loss": 0.043235015124082565, "rewards/accuracies": 1.0, "rewards/chosen": -0.01941930502653122, "rewards/margins": 0.3986739218235016, "rewards/rejected": -0.4180932641029358, "sft_loss": 0.194193035364151, "step": 2527 }, { "epoch": 3.655820679681851, "grad_norm": 1.986485149439459, "learning_rate": 2.76275053704033e-06, "logits/chosen": -0.6245859265327454, "logits/rejected": -0.5641262531280518, "logps/chosen": -0.1828930377960205, "logps/rejected": -3.177208185195923, "loss": 0.1233, "odds_ratio_loss": 0.0734827071428299, "rewards/accuracies": 1.0, "rewards/chosen": -0.01828930340707302, "rewards/margins": 0.2994315028190613, "rewards/rejected": -0.31772083044052124, "sft_loss": 0.1828930377960205, "step": 2528 }, { "epoch": 3.6572668112798263, "grad_norm": 2.2808936138834874, "learning_rate": 2.759797345799553e-06, "logits/chosen": -0.5670177936553955, "logits/rejected": -0.3730791509151459, "logps/chosen": -0.17481276392936707, "logps/rejected": -4.819494247436523, "loss": 0.2586, "odds_ratio_loss": 0.023719413205981255, "rewards/accuracies": 1.0, "rewards/chosen": -0.017481276765465736, "rewards/margins": 0.4644681513309479, "rewards/rejected": -0.48194941878318787, "sft_loss": 0.17481276392936707, "step": 2529 }, { "epoch": 3.658712942877802, "grad_norm": 2.5614184083505975, "learning_rate": 2.7568449022815737e-06, "logits/chosen": -0.6097587943077087, "logits/rejected": -0.42381176352500916, "logps/chosen": -0.11319475620985031, "logps/rejected": -5.370057106018066, "loss": 0.1716, "odds_ratio_loss": 0.013680643402040005, "rewards/accuracies": 1.0, "rewards/chosen": -0.011319476179778576, "rewards/margins": 0.5256862640380859, "rewards/rejected": -0.5370057821273804, "sft_loss": 0.11319475620985031, "step": 2530 }, { "epoch": 3.6601590744757773, "grad_norm": 2.1000222835332836, "learning_rate": 2.7538932082664337e-06, "logits/chosen": -0.6866092085838318, "logits/rejected": -0.6213411092758179, "logps/chosen": -0.1459658145904541, "logps/rejected": -2.8110921382904053, "loss": 0.1863, "odds_ratio_loss": 0.04226803034543991, "rewards/accuracies": 1.0, "rewards/chosen": -0.01459658145904541, "rewards/margins": 0.2665126323699951, "rewards/rejected": -0.2811092138290405, "sft_loss": 0.1459658145904541, "step": 2531 }, { "epoch": 3.6616052060737525, "grad_norm": 2.349668505867712, "learning_rate": 2.7509422655337194e-06, "logits/chosen": -0.748806893825531, "logits/rejected": -0.5211739540100098, "logps/chosen": -0.11192212998867035, "logps/rejected": -5.20810604095459, "loss": 0.1486, "odds_ratio_loss": 0.027064472436904907, "rewards/accuracies": 1.0, "rewards/chosen": -0.011192213743925095, "rewards/margins": 0.5096184015274048, "rewards/rejected": -0.520810604095459, "sft_loss": 0.11192212998867035, "step": 2532 }, { "epoch": 3.663051337671728, "grad_norm": 1.9229196234297072, "learning_rate": 2.747992075862566e-06, "logits/chosen": -0.5800803899765015, "logits/rejected": -0.5336440801620483, "logps/chosen": -0.21885840594768524, "logps/rejected": -3.05794620513916, "loss": 0.1674, "odds_ratio_loss": 0.033811405301094055, "rewards/accuracies": 1.0, "rewards/chosen": -0.021885842084884644, "rewards/margins": 0.28390878438949585, "rewards/rejected": -0.3057945966720581, "sft_loss": 0.21885840594768524, "step": 2533 }, { "epoch": 3.6644974692697034, "grad_norm": 1.9210763154156887, "learning_rate": 2.7450426410316515e-06, "logits/chosen": -0.6399502754211426, "logits/rejected": -0.5182642340660095, "logps/chosen": -0.2076047956943512, "logps/rejected": -4.193682670593262, "loss": 0.1769, "odds_ratio_loss": 0.045763999223709106, "rewards/accuracies": 1.0, "rewards/chosen": -0.02076047845184803, "rewards/margins": 0.3986077904701233, "rewards/rejected": -0.41936826705932617, "sft_loss": 0.2076047956943512, "step": 2534 }, { "epoch": 3.665943600867679, "grad_norm": 2.011546723686748, "learning_rate": 2.7420939628192044e-06, "logits/chosen": -0.8048521280288696, "logits/rejected": -0.5235615968704224, "logps/chosen": -0.17272844910621643, "logps/rejected": -4.574443817138672, "loss": 0.1633, "odds_ratio_loss": 0.0383070707321167, "rewards/accuracies": 1.0, "rewards/chosen": -0.017272844910621643, "rewards/margins": 0.44017156958580017, "rewards/rejected": -0.457444429397583, "sft_loss": 0.17272844910621643, "step": 2535 }, { "epoch": 3.6673897324656544, "grad_norm": 1.9680305398978295, "learning_rate": 2.739146043002991e-06, "logits/chosen": -0.6656097173690796, "logits/rejected": -0.5541112422943115, "logps/chosen": -0.1563243865966797, "logps/rejected": -3.3513598442077637, "loss": 0.171, "odds_ratio_loss": 0.022492770105600357, "rewards/accuracies": 1.0, "rewards/chosen": -0.01563243940472603, "rewards/margins": 0.3195035457611084, "rewards/rejected": -0.3351359963417053, "sft_loss": 0.1563243865966797, "step": 2536 }, { "epoch": 3.66883586406363, "grad_norm": 1.9549831394889021, "learning_rate": 2.736198883360324e-06, "logits/chosen": -0.5706369876861572, "logits/rejected": -0.43308964371681213, "logps/chosen": -0.24978458881378174, "logps/rejected": -3.1274073123931885, "loss": 0.2132, "odds_ratio_loss": 0.05592186748981476, "rewards/accuracies": 1.0, "rewards/chosen": -0.024978458881378174, "rewards/margins": 0.28776225447654724, "rewards/rejected": -0.3127407133579254, "sft_loss": 0.24978458881378174, "step": 2537 }, { "epoch": 3.6702819956616053, "grad_norm": 2.8400570515450774, "learning_rate": 2.733252485668057e-06, "logits/chosen": -0.6471999883651733, "logits/rejected": -0.5394845008850098, "logps/chosen": -0.189593106508255, "logps/rejected": -2.7674801349639893, "loss": 0.2171, "odds_ratio_loss": 0.03962300345301628, "rewards/accuracies": 1.0, "rewards/chosen": -0.01895931176841259, "rewards/margins": 0.2577887177467346, "rewards/rejected": -0.27674800157546997, "sft_loss": 0.189593106508255, "step": 2538 }, { "epoch": 3.6717281272595805, "grad_norm": 2.4733180825080403, "learning_rate": 2.7303068517025845e-06, "logits/chosen": -0.5572927594184875, "logits/rejected": -0.4921252727508545, "logps/chosen": -0.1227627843618393, "logps/rejected": -4.144178867340088, "loss": 0.1404, "odds_ratio_loss": 0.012044022791087627, "rewards/accuracies": 1.0, "rewards/chosen": -0.01227627694606781, "rewards/margins": 0.40214163064956665, "rewards/rejected": -0.41441792249679565, "sft_loss": 0.1227627843618393, "step": 2539 }, { "epoch": 3.673174258857556, "grad_norm": 1.9355292256824457, "learning_rate": 2.7273619832398405e-06, "logits/chosen": -0.5252768993377686, "logits/rejected": -0.4472333788871765, "logps/chosen": -0.15913286805152893, "logps/rejected": -4.911133766174316, "loss": 0.1602, "odds_ratio_loss": 0.02607126533985138, "rewards/accuracies": 1.0, "rewards/chosen": -0.015913287177681923, "rewards/margins": 0.4752000570297241, "rewards/rejected": -0.4911133646965027, "sft_loss": 0.15913286805152893, "step": 2540 }, { "epoch": 3.6746203904555315, "grad_norm": 2.381886666609673, "learning_rate": 2.724417882055295e-06, "logits/chosen": -0.5004345774650574, "logits/rejected": -0.457826167345047, "logps/chosen": -0.16766560077667236, "logps/rejected": -2.8187742233276367, "loss": 0.2033, "odds_ratio_loss": 0.049446333199739456, "rewards/accuracies": 1.0, "rewards/chosen": -0.016766561195254326, "rewards/margins": 0.26511088013648987, "rewards/rejected": -0.28187742829322815, "sft_loss": 0.16766560077667236, "step": 2541 }, { "epoch": 3.6760665220535067, "grad_norm": 2.292433055047217, "learning_rate": 2.7214745499239613e-06, "logits/chosen": -0.7124675512313843, "logits/rejected": -0.5647624135017395, "logps/chosen": -0.13279548287391663, "logps/rejected": -2.9666402339935303, "loss": 0.1804, "odds_ratio_loss": 0.022926434874534607, "rewards/accuracies": 1.0, "rewards/chosen": -0.013279548846185207, "rewards/margins": 0.2833844721317291, "rewards/rejected": -0.2966639995574951, "sft_loss": 0.13279548287391663, "step": 2542 }, { "epoch": 3.6775126536514824, "grad_norm": 2.181098941599247, "learning_rate": 2.7185319886203825e-06, "logits/chosen": -0.5184475183486938, "logits/rejected": -0.35520482063293457, "logps/chosen": -0.11584815382957458, "logps/rejected": -3.8595778942108154, "loss": 0.2275, "odds_ratio_loss": 0.012421883642673492, "rewards/accuracies": 1.0, "rewards/chosen": -0.011584816500544548, "rewards/margins": 0.3743729889392853, "rewards/rejected": -0.3859577775001526, "sft_loss": 0.11584815382957458, "step": 2543 }, { "epoch": 3.6789587852494576, "grad_norm": 1.947842308476255, "learning_rate": 2.7155901999186407e-06, "logits/chosen": -0.6304459571838379, "logits/rejected": -0.3842719793319702, "logps/chosen": -0.1558356136083603, "logps/rejected": -4.710226535797119, "loss": 0.175, "odds_ratio_loss": 0.029195290058851242, "rewards/accuracies": 1.0, "rewards/chosen": -0.015583561733365059, "rewards/margins": 0.4554390609264374, "rewards/rejected": -0.4710226356983185, "sft_loss": 0.1558356136083603, "step": 2544 }, { "epoch": 3.680404916847433, "grad_norm": 2.107354009715289, "learning_rate": 2.7126491855923497e-06, "logits/chosen": -0.4662625193595886, "logits/rejected": -0.3562268316745758, "logps/chosen": -0.18191561102867126, "logps/rejected": -5.232119083404541, "loss": 0.2264, "odds_ratio_loss": 0.027083907276391983, "rewards/accuracies": 1.0, "rewards/chosen": -0.018191561102867126, "rewards/margins": 0.5050203800201416, "rewards/rejected": -0.5232118964195251, "sft_loss": 0.18191561102867126, "step": 2545 }, { "epoch": 3.6818510484454086, "grad_norm": 2.1162458781646594, "learning_rate": 2.7097089474146608e-06, "logits/chosen": -0.6267588138580322, "logits/rejected": -0.47810542583465576, "logps/chosen": -0.17624418437480927, "logps/rejected": -2.7139992713928223, "loss": 0.1855, "odds_ratio_loss": 0.01949489489197731, "rewards/accuracies": 1.0, "rewards/chosen": -0.017624419182538986, "rewards/margins": 0.25377553701400757, "rewards/rejected": -0.27139994502067566, "sft_loss": 0.17624418437480927, "step": 2546 }, { "epoch": 3.683297180043384, "grad_norm": 1.8981881245598302, "learning_rate": 2.706769487158251e-06, "logits/chosen": -0.4763748347759247, "logits/rejected": -0.4929497241973877, "logps/chosen": -0.10615742951631546, "logps/rejected": -2.481844425201416, "loss": 0.1587, "odds_ratio_loss": 0.022733446210622787, "rewards/accuracies": 1.0, "rewards/chosen": -0.010615743696689606, "rewards/margins": 0.2375687062740326, "rewards/rejected": -0.2481844425201416, "sft_loss": 0.10615742951631546, "step": 2547 }, { "epoch": 3.6847433116413595, "grad_norm": 1.9813324367877208, "learning_rate": 2.703830806595337e-06, "logits/chosen": -0.5492920279502869, "logits/rejected": -0.4389027953147888, "logps/chosen": -0.20252877473831177, "logps/rejected": -5.312198638916016, "loss": 0.1906, "odds_ratio_loss": 0.031157786026597023, "rewards/accuracies": 1.0, "rewards/chosen": -0.020252875983715057, "rewards/margins": 0.5109670162200928, "rewards/rejected": -0.5312198400497437, "sft_loss": 0.20252877473831177, "step": 2548 }, { "epoch": 3.6861894432393347, "grad_norm": 2.584069937077018, "learning_rate": 2.7008929074976548e-06, "logits/chosen": -0.43435296416282654, "logits/rejected": -0.5349177122116089, "logps/chosen": -0.19991475343704224, "logps/rejected": -2.423389434814453, "loss": 0.1821, "odds_ratio_loss": 0.1047024056315422, "rewards/accuracies": 0.9375, "rewards/chosen": -0.019991476088762283, "rewards/margins": 0.2223474681377411, "rewards/rejected": -0.24233895540237427, "sft_loss": 0.19991475343704224, "step": 2549 }, { "epoch": 3.6876355748373104, "grad_norm": 2.117888291413296, "learning_rate": 2.6979557916364784e-06, "logits/chosen": -0.48327285051345825, "logits/rejected": -0.43461471796035767, "logps/chosen": -0.2285316288471222, "logps/rejected": -4.459638595581055, "loss": 0.2034, "odds_ratio_loss": 0.04889984801411629, "rewards/accuracies": 1.0, "rewards/chosen": -0.02285316213965416, "rewards/margins": 0.4231106638908386, "rewards/rejected": -0.4459638297557831, "sft_loss": 0.2285316288471222, "step": 2550 }, { "epoch": 3.6890817064352857, "grad_norm": 2.312483892866644, "learning_rate": 2.695019460782603e-06, "logits/chosen": -0.465861052274704, "logits/rejected": -0.4098866581916809, "logps/chosen": -0.2447035163640976, "logps/rejected": -2.7150635719299316, "loss": 0.1766, "odds_ratio_loss": 0.11905589699745178, "rewards/accuracies": 0.9375, "rewards/chosen": -0.02447035349905491, "rewards/margins": 0.24703601002693176, "rewards/rejected": -0.27150633931159973, "sft_loss": 0.2447035163640976, "step": 2551 }, { "epoch": 3.690527838033261, "grad_norm": 1.9446876065784444, "learning_rate": 2.6920839167063553e-06, "logits/chosen": -0.6311659812927246, "logits/rejected": -0.5097159147262573, "logps/chosen": -0.16333486139774323, "logps/rejected": -5.158690452575684, "loss": 0.1929, "odds_ratio_loss": 0.02804923988878727, "rewards/accuracies": 1.0, "rewards/chosen": -0.016333486884832382, "rewards/margins": 0.49953556060791016, "rewards/rejected": -0.5158690214157104, "sft_loss": 0.16333486139774323, "step": 2552 }, { "epoch": 3.6919739696312366, "grad_norm": 1.9728394412534829, "learning_rate": 2.689149161177587e-06, "logits/chosen": -0.5078314542770386, "logits/rejected": -0.5038515329360962, "logps/chosen": -0.27855753898620605, "logps/rejected": -3.2389707565307617, "loss": 0.2245, "odds_ratio_loss": 0.06192722171545029, "rewards/accuracies": 1.0, "rewards/chosen": -0.027855753898620605, "rewards/margins": 0.296041339635849, "rewards/rejected": -0.3238970935344696, "sft_loss": 0.27855753898620605, "step": 2553 }, { "epoch": 3.693420101229212, "grad_norm": 2.189323575896016, "learning_rate": 2.6862151959656696e-06, "logits/chosen": -0.735476016998291, "logits/rejected": -0.4959479868412018, "logps/chosen": -0.16790777444839478, "logps/rejected": -4.156628608703613, "loss": 0.2023, "odds_ratio_loss": 0.011840347200632095, "rewards/accuracies": 1.0, "rewards/chosen": -0.016790777444839478, "rewards/margins": 0.3988720774650574, "rewards/rejected": -0.41566285490989685, "sft_loss": 0.16790777444839478, "step": 2554 }, { "epoch": 3.694866232827187, "grad_norm": 1.8570832866807923, "learning_rate": 2.6832820228395054e-06, "logits/chosen": -0.603441596031189, "logits/rejected": -0.4443323612213135, "logps/chosen": -0.15063583850860596, "logps/rejected": -4.947831630706787, "loss": 0.1428, "odds_ratio_loss": 0.022614125162363052, "rewards/accuracies": 1.0, "rewards/chosen": -0.015063582919538021, "rewards/margins": 0.4797195792198181, "rewards/rejected": -0.4947831928730011, "sft_loss": 0.15063583850860596, "step": 2555 }, { "epoch": 3.6963123644251628, "grad_norm": 1.9088578967408238, "learning_rate": 2.6803496435675127e-06, "logits/chosen": -0.6813327074050903, "logits/rejected": -0.5897226333618164, "logps/chosen": -0.23708531260490417, "logps/rejected": -3.692704200744629, "loss": 0.1918, "odds_ratio_loss": 0.04931206628680229, "rewards/accuracies": 1.0, "rewards/chosen": -0.023708531633019447, "rewards/margins": 0.3455618917942047, "rewards/rejected": -0.3692704439163208, "sft_loss": 0.23708531260490417, "step": 2556 }, { "epoch": 3.697758496023138, "grad_norm": 2.3605727774829557, "learning_rate": 2.6774180599176356e-06, "logits/chosen": -0.7046637535095215, "logits/rejected": -0.6025267839431763, "logps/chosen": -0.19786414504051208, "logps/rejected": -3.777595281600952, "loss": 0.161, "odds_ratio_loss": 0.03681756556034088, "rewards/accuracies": 1.0, "rewards/chosen": -0.019786417484283447, "rewards/margins": 0.3579730987548828, "rewards/rejected": -0.37775954604148865, "sft_loss": 0.19786414504051208, "step": 2557 }, { "epoch": 3.6992046276211137, "grad_norm": 2.125397589144874, "learning_rate": 2.674487273657334e-06, "logits/chosen": -0.3669889569282532, "logits/rejected": -0.3705841302871704, "logps/chosen": -0.1971624493598938, "logps/rejected": -5.799873352050781, "loss": 0.151, "odds_ratio_loss": 0.07610459625720978, "rewards/accuracies": 0.9375, "rewards/chosen": -0.01971624605357647, "rewards/margins": 0.5602710843086243, "rewards/rejected": -0.5799873471260071, "sft_loss": 0.1971624493598938, "step": 2558 }, { "epoch": 3.700650759219089, "grad_norm": 1.9753024574913587, "learning_rate": 2.671557286553594e-06, "logits/chosen": -0.5993480682373047, "logits/rejected": -0.564248263835907, "logps/chosen": -0.29278743267059326, "logps/rejected": -3.0654852390289307, "loss": 0.1871, "odds_ratio_loss": 0.06417261809110641, "rewards/accuracies": 1.0, "rewards/chosen": -0.029278744012117386, "rewards/margins": 0.27726981043815613, "rewards/rejected": -0.306548535823822, "sft_loss": 0.29278743267059326, "step": 2559 }, { "epoch": 3.7020968908170646, "grad_norm": 2.347577217983018, "learning_rate": 2.6686281003729126e-06, "logits/chosen": -0.8407468795776367, "logits/rejected": -0.6318446397781372, "logps/chosen": -0.1679374873638153, "logps/rejected": -4.08281135559082, "loss": 0.1522, "odds_ratio_loss": 0.029092181473970413, "rewards/accuracies": 1.0, "rewards/chosen": -0.0167937483638525, "rewards/margins": 0.39148736000061035, "rewards/rejected": -0.408281147480011, "sft_loss": 0.1679374873638153, "step": 2560 }, { "epoch": 3.70354302241504, "grad_norm": 1.9624703787237274, "learning_rate": 2.6656997168813085e-06, "logits/chosen": -0.46395984292030334, "logits/rejected": -0.49898120760917664, "logps/chosen": -0.29700911045074463, "logps/rejected": -5.381232261657715, "loss": 0.198, "odds_ratio_loss": 0.07508499175310135, "rewards/accuracies": 0.9375, "rewards/chosen": -0.029700906947255135, "rewards/margins": 0.508422315120697, "rewards/rejected": -0.5381232500076294, "sft_loss": 0.29700911045074463, "step": 2561 }, { "epoch": 3.704989154013015, "grad_norm": 2.0815175213453068, "learning_rate": 2.662772137844313e-06, "logits/chosen": -0.5803831815719604, "logits/rejected": -0.5036250948905945, "logps/chosen": -0.23945432901382446, "logps/rejected": -4.763128280639648, "loss": 0.1932, "odds_ratio_loss": 0.025707893073558807, "rewards/accuracies": 1.0, "rewards/chosen": -0.023945434018969536, "rewards/margins": 0.45236736536026, "rewards/rejected": -0.4763127863407135, "sft_loss": 0.23945432901382446, "step": 2562 }, { "epoch": 3.706435285610991, "grad_norm": 2.1229476330536188, "learning_rate": 2.659845365026978e-06, "logits/chosen": -0.615597665309906, "logits/rejected": -0.3729110658168793, "logps/chosen": -0.1586671620607376, "logps/rejected": -6.872448921203613, "loss": 0.225, "odds_ratio_loss": 0.0378350093960762, "rewards/accuracies": 1.0, "rewards/chosen": -0.01586671732366085, "rewards/margins": 0.6713781952857971, "rewards/rejected": -0.6872448921203613, "sft_loss": 0.1586671620607376, "step": 2563 }, { "epoch": 3.707881417208966, "grad_norm": 2.06415325654969, "learning_rate": 2.6569194001938625e-06, "logits/chosen": -0.548956036567688, "logits/rejected": -0.44726645946502686, "logps/chosen": -0.1950879991054535, "logps/rejected": -3.3179187774658203, "loss": 0.2203, "odds_ratio_loss": 0.032068926841020584, "rewards/accuracies": 1.0, "rewards/chosen": -0.01950879767537117, "rewards/margins": 0.31228309869766235, "rewards/rejected": -0.33179187774658203, "sft_loss": 0.1950879991054535, "step": 2564 }, { "epoch": 3.7093275488069413, "grad_norm": 2.0330127883250677, "learning_rate": 2.653994245109044e-06, "logits/chosen": -0.7457367777824402, "logits/rejected": -0.4856140613555908, "logps/chosen": -0.1529059112071991, "logps/rejected": -3.516286611557007, "loss": 0.1702, "odds_ratio_loss": 0.017367858439683914, "rewards/accuracies": 1.0, "rewards/chosen": -0.015290590934455395, "rewards/margins": 0.3363381028175354, "rewards/rejected": -0.3516286611557007, "sft_loss": 0.1529059112071991, "step": 2565 }, { "epoch": 3.710773680404917, "grad_norm": 2.4457696068112478, "learning_rate": 2.651069901536106e-06, "logits/chosen": -0.7771862745285034, "logits/rejected": -0.3238375186920166, "logps/chosen": -0.135412335395813, "logps/rejected": -5.311068058013916, "loss": 0.1521, "odds_ratio_loss": 0.013591814786195755, "rewards/accuracies": 1.0, "rewards/chosen": -0.01354123279452324, "rewards/margins": 0.5175656080245972, "rewards/rejected": -0.5311068296432495, "sft_loss": 0.135412335395813, "step": 2566 }, { "epoch": 3.712219812002892, "grad_norm": 1.7528436649250814, "learning_rate": 2.64814637123815e-06, "logits/chosen": -0.696196436882019, "logits/rejected": -0.45283639430999756, "logps/chosen": -0.11723405867815018, "logps/rejected": -5.629380226135254, "loss": 0.14, "odds_ratio_loss": 0.019943315535783768, "rewards/accuracies": 1.0, "rewards/chosen": -0.011723405681550503, "rewards/margins": 0.5512145757675171, "rewards/rejected": -0.5629379749298096, "sft_loss": 0.11723405867815018, "step": 2567 }, { "epoch": 3.7136659436008674, "grad_norm": 1.9528661831415446, "learning_rate": 2.645223655977782e-06, "logits/chosen": -0.6582965850830078, "logits/rejected": -0.3814026117324829, "logps/chosen": -0.07659213244915009, "logps/rejected": -7.244695663452148, "loss": 0.1345, "odds_ratio_loss": 0.008341525681316853, "rewards/accuracies": 1.0, "rewards/chosen": -0.007659214083105326, "rewards/margins": 0.716810405254364, "rewards/rejected": -0.7244695425033569, "sft_loss": 0.07659213244915009, "step": 2568 }, { "epoch": 3.715112075198843, "grad_norm": 1.994869520742531, "learning_rate": 2.6423017575171153e-06, "logits/chosen": -0.6350436806678772, "logits/rejected": -0.6293706893920898, "logps/chosen": -0.13775679469108582, "logps/rejected": -5.226606845855713, "loss": 0.1814, "odds_ratio_loss": 0.032059669494628906, "rewards/accuracies": 1.0, "rewards/chosen": -0.013775679282844067, "rewards/margins": 0.5088850259780884, "rewards/rejected": -0.5226607322692871, "sft_loss": 0.13775679469108582, "step": 2569 }, { "epoch": 3.7165582067968184, "grad_norm": 2.0319895917615303, "learning_rate": 2.6393806776177777e-06, "logits/chosen": -0.7844041585922241, "logits/rejected": -0.5700384378433228, "logps/chosen": -0.1305319368839264, "logps/rejected": -3.323162078857422, "loss": 0.2278, "odds_ratio_loss": 0.02434907667338848, "rewards/accuracies": 1.0, "rewards/chosen": -0.01305319368839264, "rewards/margins": 0.3192630112171173, "rewards/rejected": -0.33231621980667114, "sft_loss": 0.1305319368839264, "step": 2570 }, { "epoch": 3.718004338394794, "grad_norm": 2.674733943727509, "learning_rate": 2.6364604180408963e-06, "logits/chosen": -0.7198705673217773, "logits/rejected": -0.4059660732746124, "logps/chosen": -0.13282117247581482, "logps/rejected": -4.611578464508057, "loss": 0.1716, "odds_ratio_loss": 0.016370346769690514, "rewards/accuracies": 1.0, "rewards/chosen": -0.013282117433845997, "rewards/margins": 0.4478756785392761, "rewards/rejected": -0.46115779876708984, "sft_loss": 0.13282117247581482, "step": 2571 }, { "epoch": 3.7194504699927693, "grad_norm": 2.274559735184135, "learning_rate": 2.633540980547108e-06, "logits/chosen": -0.8815616369247437, "logits/rejected": -0.6955693960189819, "logps/chosen": -0.25010132789611816, "logps/rejected": -3.4041104316711426, "loss": 0.2208, "odds_ratio_loss": 0.056482359766960144, "rewards/accuracies": 1.0, "rewards/chosen": -0.025010133162140846, "rewards/margins": 0.31540095806121826, "rewards/rejected": -0.34041109681129456, "sft_loss": 0.25010132789611816, "step": 2572 }, { "epoch": 3.720896601590745, "grad_norm": 3.822609490949455, "learning_rate": 2.63062236689655e-06, "logits/chosen": -0.5537604093551636, "logits/rejected": -0.3567535877227783, "logps/chosen": -0.27658021450042725, "logps/rejected": -3.7039966583251953, "loss": 0.2414, "odds_ratio_loss": 0.04956976696848869, "rewards/accuracies": 1.0, "rewards/chosen": -0.027658019214868546, "rewards/margins": 0.34274163842201233, "rewards/rejected": -0.3703996539115906, "sft_loss": 0.27658021450042725, "step": 2573 }, { "epoch": 3.72234273318872, "grad_norm": 2.0402329713850835, "learning_rate": 2.6277045788488695e-06, "logits/chosen": -0.5506142973899841, "logits/rejected": -0.40666961669921875, "logps/chosen": -0.17523783445358276, "logps/rejected": -4.3214945793151855, "loss": 0.1537, "odds_ratio_loss": 0.025635970756411552, "rewards/accuracies": 1.0, "rewards/chosen": -0.017523784190416336, "rewards/margins": 0.4146256744861603, "rewards/rejected": -0.4321494698524475, "sft_loss": 0.17523783445358276, "step": 2574 }, { "epoch": 3.7237888647866955, "grad_norm": 2.370602746264014, "learning_rate": 2.624787618163208e-06, "logits/chosen": -0.5524752140045166, "logits/rejected": -0.48569953441619873, "logps/chosen": -0.09435204416513443, "logps/rejected": -3.0093679428100586, "loss": 0.201, "odds_ratio_loss": 0.02923463098704815, "rewards/accuracies": 1.0, "rewards/chosen": -0.009435204789042473, "rewards/margins": 0.29150158166885376, "rewards/rejected": -0.3009367883205414, "sft_loss": 0.09435204416513443, "step": 2575 }, { "epoch": 3.725234996384671, "grad_norm": 1.9717764414884915, "learning_rate": 2.621871486598214e-06, "logits/chosen": -0.510238766670227, "logits/rejected": -0.4656646251678467, "logps/chosen": -0.10176987946033478, "logps/rejected": -3.364330768585205, "loss": 0.1647, "odds_ratio_loss": 0.06915295869112015, "rewards/accuracies": 0.9375, "rewards/chosen": -0.010176989249885082, "rewards/margins": 0.3262560963630676, "rewards/rejected": -0.3364330530166626, "sft_loss": 0.10176987946033478, "step": 2576 }, { "epoch": 3.7266811279826464, "grad_norm": 3.64987792030796, "learning_rate": 2.618956185912032e-06, "logits/chosen": -0.47754430770874023, "logits/rejected": -0.38157540559768677, "logps/chosen": -0.23310229182243347, "logps/rejected": -3.7075998783111572, "loss": 0.2027, "odds_ratio_loss": 0.04826747253537178, "rewards/accuracies": 1.0, "rewards/chosen": -0.023310229182243347, "rewards/margins": 0.34744977951049805, "rewards/rejected": -0.3707600235939026, "sft_loss": 0.23310229182243347, "step": 2577 }, { "epoch": 3.7281272595806216, "grad_norm": 2.439119648965075, "learning_rate": 2.616041717862311e-06, "logits/chosen": -0.6305740475654602, "logits/rejected": -0.48160648345947266, "logps/chosen": -0.2966257929801941, "logps/rejected": -5.1441545486450195, "loss": 0.2318, "odds_ratio_loss": 0.06576695293188095, "rewards/accuracies": 0.9375, "rewards/chosen": -0.02966257929801941, "rewards/margins": 0.484752893447876, "rewards/rejected": -0.5144155025482178, "sft_loss": 0.2966257929801941, "step": 2578 }, { "epoch": 3.7295733911785973, "grad_norm": 2.259488348219623, "learning_rate": 2.613128084206191e-06, "logits/chosen": -0.4879527688026428, "logits/rejected": -0.5105048418045044, "logps/chosen": -0.22474190592765808, "logps/rejected": -3.7948379516601562, "loss": 0.1725, "odds_ratio_loss": 0.07505577802658081, "rewards/accuracies": 1.0, "rewards/chosen": -0.022474190220236778, "rewards/margins": 0.3570095896720886, "rewards/rejected": -0.37948381900787354, "sft_loss": 0.22474190592765808, "step": 2579 }, { "epoch": 3.7310195227765726, "grad_norm": 2.0511491756501825, "learning_rate": 2.6102152867003143e-06, "logits/chosen": -0.6580703854560852, "logits/rejected": -0.4834393262863159, "logps/chosen": -0.1688551902770996, "logps/rejected": -3.4576525688171387, "loss": 0.1761, "odds_ratio_loss": 0.028964854776859283, "rewards/accuracies": 1.0, "rewards/chosen": -0.01688551902770996, "rewards/margins": 0.32887977361679077, "rewards/rejected": -0.34576526284217834, "sft_loss": 0.1688551902770996, "step": 2580 }, { "epoch": 3.7324656543745482, "grad_norm": 2.1539385559635664, "learning_rate": 2.6073033271008184e-06, "logits/chosen": -0.5000095963478088, "logits/rejected": -0.5185361504554749, "logps/chosen": -0.16783685982227325, "logps/rejected": -3.7964539527893066, "loss": 0.2066, "odds_ratio_loss": 0.02780839428305626, "rewards/accuracies": 1.0, "rewards/chosen": -0.016783684492111206, "rewards/margins": 0.36286166310310364, "rewards/rejected": -0.37964534759521484, "sft_loss": 0.16783685982227325, "step": 2581 }, { "epoch": 3.7339117859725235, "grad_norm": 2.452678357903579, "learning_rate": 2.604392207163333e-06, "logits/chosen": -0.7065168023109436, "logits/rejected": -0.6411018967628479, "logps/chosen": -0.2850452661514282, "logps/rejected": -2.233325481414795, "loss": 0.1963, "odds_ratio_loss": 0.09458545595407486, "rewards/accuracies": 1.0, "rewards/chosen": -0.02850452810525894, "rewards/margins": 0.19482800364494324, "rewards/rejected": -0.22333255410194397, "sft_loss": 0.2850452661514282, "step": 2582 }, { "epoch": 3.735357917570499, "grad_norm": 3.5577667700650695, "learning_rate": 2.601481928642985e-06, "logits/chosen": -0.5432594418525696, "logits/rejected": -0.4507932662963867, "logps/chosen": -0.14750340580940247, "logps/rejected": -5.576773166656494, "loss": 0.2091, "odds_ratio_loss": 0.012168655171990395, "rewards/accuracies": 1.0, "rewards/chosen": -0.014750340022146702, "rewards/margins": 0.5429270267486572, "rewards/rejected": -0.5576773285865784, "sft_loss": 0.14750340580940247, "step": 2583 }, { "epoch": 3.7368040491684744, "grad_norm": 3.1603618668755367, "learning_rate": 2.598572493294388e-06, "logits/chosen": -0.8299934267997742, "logits/rejected": -0.4090917706489563, "logps/chosen": -0.15736770629882812, "logps/rejected": -4.021340370178223, "loss": 0.18, "odds_ratio_loss": 0.014862221665680408, "rewards/accuracies": 1.0, "rewards/chosen": -0.015736771747469902, "rewards/margins": 0.38639724254608154, "rewards/rejected": -0.4021340012550354, "sft_loss": 0.15736770629882812, "step": 2584 }, { "epoch": 3.7382501807664497, "grad_norm": 2.2400557928515883, "learning_rate": 2.5956639028716576e-06, "logits/chosen": -0.4494069814682007, "logits/rejected": -0.47250938415527344, "logps/chosen": -0.19860422611236572, "logps/rejected": -2.777355194091797, "loss": 0.2257, "odds_ratio_loss": 0.06139344349503517, "rewards/accuracies": 1.0, "rewards/chosen": -0.01986042410135269, "rewards/margins": 0.25787508487701416, "rewards/rejected": -0.27773553133010864, "sft_loss": 0.19860422611236572, "step": 2585 }, { "epoch": 3.7396963123644253, "grad_norm": 1.983706842271957, "learning_rate": 2.592756159128388e-06, "logits/chosen": -0.5673388838768005, "logits/rejected": -0.5991949439048767, "logps/chosen": -0.11181987822055817, "logps/rejected": -4.574343204498291, "loss": 0.1371, "odds_ratio_loss": 0.022676371037960052, "rewards/accuracies": 1.0, "rewards/chosen": -0.011181988753378391, "rewards/margins": 0.44625231623649597, "rewards/rejected": -0.4574342966079712, "sft_loss": 0.11181987822055817, "step": 2586 }, { "epoch": 3.7411424439624006, "grad_norm": 2.137993317413108, "learning_rate": 2.589849263817673e-06, "logits/chosen": -0.4015089273452759, "logits/rejected": -0.35641565918922424, "logps/chosen": -0.23663023114204407, "logps/rejected": -5.35009765625, "loss": 0.2093, "odds_ratio_loss": 0.022359168156981468, "rewards/accuracies": 1.0, "rewards/chosen": -0.023663025349378586, "rewards/margins": 0.5113467574119568, "rewards/rejected": -0.5350098013877869, "sft_loss": 0.23663023114204407, "step": 2587 }, { "epoch": 3.742588575560376, "grad_norm": 1.846983258310368, "learning_rate": 2.586943218692087e-06, "logits/chosen": -0.4792490601539612, "logits/rejected": -0.38760459423065186, "logps/chosen": -0.1646152138710022, "logps/rejected": -3.257059097290039, "loss": 0.1606, "odds_ratio_loss": 0.023885680362582207, "rewards/accuracies": 1.0, "rewards/chosen": -0.01646151952445507, "rewards/margins": 0.30924439430236816, "rewards/rejected": -0.3257059156894684, "sft_loss": 0.1646152138710022, "step": 2588 }, { "epoch": 3.7440347071583515, "grad_norm": 2.2043603682689885, "learning_rate": 2.5840380255036987e-06, "logits/chosen": -0.5829678773880005, "logits/rejected": -0.47153595089912415, "logps/chosen": -0.23809784650802612, "logps/rejected": -4.241508483886719, "loss": 0.2287, "odds_ratio_loss": 0.05533334240317345, "rewards/accuracies": 1.0, "rewards/chosen": -0.023809785023331642, "rewards/margins": 0.40034106373786926, "rewards/rejected": -0.42415088415145874, "sft_loss": 0.23809784650802612, "step": 2589 }, { "epoch": 3.7454808387563268, "grad_norm": 2.374432356246243, "learning_rate": 2.5811336860040575e-06, "logits/chosen": -0.6532708406448364, "logits/rejected": -0.6016970872879028, "logps/chosen": -0.16082583367824554, "logps/rejected": -4.622746467590332, "loss": 0.1728, "odds_ratio_loss": 0.07337768375873566, "rewards/accuracies": 0.9375, "rewards/chosen": -0.016082582995295525, "rewards/margins": 0.44619205594062805, "rewards/rejected": -0.4622746706008911, "sft_loss": 0.16082583367824554, "step": 2590 }, { "epoch": 3.746926970354302, "grad_norm": 2.4503692546178084, "learning_rate": 2.5782302019442028e-06, "logits/chosen": -0.7345556020736694, "logits/rejected": -0.5952568054199219, "logps/chosen": -0.2881607413291931, "logps/rejected": -3.4280734062194824, "loss": 0.2145, "odds_ratio_loss": 0.07098816335201263, "rewards/accuracies": 1.0, "rewards/chosen": -0.02881607599556446, "rewards/margins": 0.3139912486076355, "rewards/rejected": -0.3428073227405548, "sft_loss": 0.2881607413291931, "step": 2591 }, { "epoch": 3.7483731019522777, "grad_norm": 2.0016155900455703, "learning_rate": 2.5753275750746523e-06, "logits/chosen": -0.5780439376831055, "logits/rejected": -0.4470243453979492, "logps/chosen": -0.21503284573554993, "logps/rejected": -3.0827932357788086, "loss": 0.1664, "odds_ratio_loss": 0.060324862599372864, "rewards/accuracies": 1.0, "rewards/chosen": -0.02150328829884529, "rewards/margins": 0.286776065826416, "rewards/rejected": -0.3082793653011322, "sft_loss": 0.21503284573554993, "step": 2592 }, { "epoch": 3.749819233550253, "grad_norm": 1.9390808109588353, "learning_rate": 2.5724258071454134e-06, "logits/chosen": -0.49952131509780884, "logits/rejected": -0.4054247736930847, "logps/chosen": -0.23884204030036926, "logps/rejected": -4.414041519165039, "loss": 0.1973, "odds_ratio_loss": 0.04530846327543259, "rewards/accuracies": 1.0, "rewards/chosen": -0.023884203284978867, "rewards/margins": 0.4175199866294861, "rewards/rejected": -0.44140419363975525, "sft_loss": 0.23884204030036926, "step": 2593 }, { "epoch": 3.7512653651482286, "grad_norm": 1.9932521409433872, "learning_rate": 2.5695248999059732e-06, "logits/chosen": -0.5046989321708679, "logits/rejected": -0.54093998670578, "logps/chosen": -0.19573456048965454, "logps/rejected": -3.6174211502075195, "loss": 0.1782, "odds_ratio_loss": 0.051149386912584305, "rewards/accuracies": 1.0, "rewards/chosen": -0.019573457539081573, "rewards/margins": 0.3421686589717865, "rewards/rejected": -0.3617421090602875, "sft_loss": 0.19573456048965454, "step": 2594 }, { "epoch": 3.752711496746204, "grad_norm": 3.2670889758539667, "learning_rate": 2.5666248551052987e-06, "logits/chosen": -0.5050589442253113, "logits/rejected": -0.4645492434501648, "logps/chosen": -0.24979722499847412, "logps/rejected": -3.0747222900390625, "loss": 0.1936, "odds_ratio_loss": 0.05264318734407425, "rewards/accuracies": 1.0, "rewards/chosen": -0.024979721754789352, "rewards/margins": 0.2824925184249878, "rewards/rejected": -0.30747222900390625, "sft_loss": 0.24979722499847412, "step": 2595 }, { "epoch": 3.7541576283441795, "grad_norm": 2.0890743012945867, "learning_rate": 2.563725674491837e-06, "logits/chosen": -0.5555975437164307, "logits/rejected": -0.40666472911834717, "logps/chosen": -0.1283828616142273, "logps/rejected": -3.9973580837249756, "loss": 0.1481, "odds_ratio_loss": 0.027073953300714493, "rewards/accuracies": 1.0, "rewards/chosen": -0.012838287279009819, "rewards/margins": 0.3868975341320038, "rewards/rejected": -0.39973583817481995, "sft_loss": 0.1283828616142273, "step": 2596 }, { "epoch": 3.755603759942155, "grad_norm": 2.0155989414546616, "learning_rate": 2.5608273598135145e-06, "logits/chosen": -0.6884058117866516, "logits/rejected": -0.6368868350982666, "logps/chosen": -0.25396209955215454, "logps/rejected": -2.0925629138946533, "loss": 0.2361, "odds_ratio_loss": 0.07850989699363708, "rewards/accuracies": 1.0, "rewards/chosen": -0.025396209210157394, "rewards/margins": 0.18386009335517883, "rewards/rejected": -0.20925629138946533, "sft_loss": 0.25396209955215454, "step": 2597 }, { "epoch": 3.75704989154013, "grad_norm": 1.9793552722188195, "learning_rate": 2.557929912817738e-06, "logits/chosen": -0.49515867233276367, "logits/rejected": -0.24635782837867737, "logps/chosen": -0.20986327528953552, "logps/rejected": -4.535175323486328, "loss": 0.1979, "odds_ratio_loss": 0.04149693623185158, "rewards/accuracies": 1.0, "rewards/chosen": -0.020986327901482582, "rewards/margins": 0.43253129720687866, "rewards/rejected": -0.4535176157951355, "sft_loss": 0.20986327528953552, "step": 2598 }, { "epoch": 3.7584960231381057, "grad_norm": 1.8678317296039257, "learning_rate": 2.5550333352513884e-06, "logits/chosen": -0.7255061268806458, "logits/rejected": -0.794285237789154, "logps/chosen": -0.1850017011165619, "logps/rejected": -3.257514238357544, "loss": 0.1506, "odds_ratio_loss": 0.0496574267745018, "rewards/accuracies": 1.0, "rewards/chosen": -0.01850016973912716, "rewards/margins": 0.3072512745857239, "rewards/rejected": -0.3257514238357544, "sft_loss": 0.1850017011165619, "step": 2599 }, { "epoch": 3.759942154736081, "grad_norm": 2.025715101180212, "learning_rate": 2.552137628860822e-06, "logits/chosen": -0.5476210117340088, "logits/rejected": -0.4572288990020752, "logps/chosen": -0.19937428832054138, "logps/rejected": -4.361888885498047, "loss": 0.2122, "odds_ratio_loss": 0.03671419993042946, "rewards/accuracies": 1.0, "rewards/chosen": -0.019937429577112198, "rewards/margins": 0.41625142097473145, "rewards/rejected": -0.43618887662887573, "sft_loss": 0.19937428832054138, "step": 2600 }, { "epoch": 3.761388286334056, "grad_norm": 1.8713940948947654, "learning_rate": 2.549242795391871e-06, "logits/chosen": -0.5071431398391724, "logits/rejected": -0.30187955498695374, "logps/chosen": -0.05415284261107445, "logps/rejected": -6.518517971038818, "loss": 0.1636, "odds_ratio_loss": 0.005559473764151335, "rewards/accuracies": 1.0, "rewards/chosen": -0.00541528407484293, "rewards/margins": 0.6464364528656006, "rewards/rejected": -0.6518517732620239, "sft_loss": 0.05415284261107445, "step": 2601 }, { "epoch": 3.762834417932032, "grad_norm": 3.2637222304935616, "learning_rate": 2.5463488365898426e-06, "logits/chosen": -0.5941678285598755, "logits/rejected": -0.43715280294418335, "logps/chosen": -0.20915353298187256, "logps/rejected": -3.662285804748535, "loss": 0.2696, "odds_ratio_loss": 0.03305754438042641, "rewards/accuracies": 1.0, "rewards/chosen": -0.020915353670716286, "rewards/margins": 0.3453132212162018, "rewards/rejected": -0.36622855067253113, "sft_loss": 0.20915353298187256, "step": 2602 }, { "epoch": 3.764280549530007, "grad_norm": 2.007573145262109, "learning_rate": 2.543455754199514e-06, "logits/chosen": -0.5387523770332336, "logits/rejected": -0.4196021854877472, "logps/chosen": -0.1982327699661255, "logps/rejected": -4.7550201416015625, "loss": 0.1634, "odds_ratio_loss": 0.03372848033905029, "rewards/accuracies": 1.0, "rewards/chosen": -0.01982327736914158, "rewards/margins": 0.4556787312030792, "rewards/rejected": -0.47550201416015625, "sft_loss": 0.1982327699661255, "step": 2603 }, { "epoch": 3.765726681127983, "grad_norm": 1.9541545437185883, "learning_rate": 2.540563549965137e-06, "logits/chosen": -0.5714979767799377, "logits/rejected": -0.43261682987213135, "logps/chosen": -0.19826523959636688, "logps/rejected": -4.462432861328125, "loss": 0.2072, "odds_ratio_loss": 0.06122538074851036, "rewards/accuracies": 1.0, "rewards/chosen": -0.019826525822281837, "rewards/margins": 0.4264167845249176, "rewards/rejected": -0.4462433159351349, "sft_loss": 0.19826523959636688, "step": 2604 }, { "epoch": 3.767172812725958, "grad_norm": 1.9328215931836676, "learning_rate": 2.5376722256304295e-06, "logits/chosen": -0.5500741004943848, "logits/rejected": -0.4248887002468109, "logps/chosen": -0.23179960250854492, "logps/rejected": -3.2190053462982178, "loss": 0.1968, "odds_ratio_loss": 0.07697392255067825, "rewards/accuracies": 1.0, "rewards/chosen": -0.02317996136844158, "rewards/margins": 0.2987205982208252, "rewards/rejected": -0.32190054655075073, "sft_loss": 0.23179960250854492, "step": 2605 }, { "epoch": 3.7686189443239337, "grad_norm": 2.203707685539388, "learning_rate": 2.5347817829385846e-06, "logits/chosen": -0.3630380630493164, "logits/rejected": -0.3876681327819824, "logps/chosen": -0.16202141344547272, "logps/rejected": -3.187636613845825, "loss": 0.1938, "odds_ratio_loss": 0.0400000736117363, "rewards/accuracies": 1.0, "rewards/chosen": -0.01620214246213436, "rewards/margins": 0.30256152153015137, "rewards/rejected": -0.3187636733055115, "sft_loss": 0.16202141344547272, "step": 2606 }, { "epoch": 3.770065075921909, "grad_norm": 2.04049312263035, "learning_rate": 2.5318922236322602e-06, "logits/chosen": -0.5945833921432495, "logits/rejected": -0.4727177917957306, "logps/chosen": -0.26018601655960083, "logps/rejected": -5.333613395690918, "loss": 0.1814, "odds_ratio_loss": 0.0550345852971077, "rewards/accuracies": 1.0, "rewards/chosen": -0.026018597185611725, "rewards/margins": 0.5073426961898804, "rewards/rejected": -0.5333613157272339, "sft_loss": 0.26018601655960083, "step": 2607 }, { "epoch": 3.7715112075198842, "grad_norm": 2.270452537470267, "learning_rate": 2.5290035494535805e-06, "logits/chosen": -0.4364027678966522, "logits/rejected": -0.444566547870636, "logps/chosen": -0.15006223320960999, "logps/rejected": -3.9100732803344727, "loss": 0.1582, "odds_ratio_loss": 0.06002519652247429, "rewards/accuracies": 1.0, "rewards/chosen": -0.015006224624812603, "rewards/margins": 0.3760010898113251, "rewards/rejected": -0.39100736379623413, "sft_loss": 0.15006223320960999, "step": 2608 }, { "epoch": 3.77295733911786, "grad_norm": 2.292197095198817, "learning_rate": 2.5261157621441413e-06, "logits/chosen": -0.6184677481651306, "logits/rejected": -0.6291539072990417, "logps/chosen": -0.33772191405296326, "logps/rejected": -3.0420265197753906, "loss": 0.24, "odds_ratio_loss": 0.052166953682899475, "rewards/accuracies": 1.0, "rewards/chosen": -0.033772192895412445, "rewards/margins": 0.27043047547340393, "rewards/rejected": -0.3042026460170746, "sft_loss": 0.33772191405296326, "step": 2609 }, { "epoch": 3.774403470715835, "grad_norm": 2.099941808987336, "learning_rate": 2.523228863444997e-06, "logits/chosen": -0.6739153861999512, "logits/rejected": -0.4860963225364685, "logps/chosen": -0.16617199778556824, "logps/rejected": -6.441198348999023, "loss": 0.1615, "odds_ratio_loss": 0.03473407030105591, "rewards/accuracies": 1.0, "rewards/chosen": -0.016617199406027794, "rewards/margins": 0.6275026798248291, "rewards/rejected": -0.6441198587417603, "sft_loss": 0.16617199778556824, "step": 2610 }, { "epoch": 3.7758496023138104, "grad_norm": 1.982794854155467, "learning_rate": 2.5203428550966722e-06, "logits/chosen": -0.5175485014915466, "logits/rejected": -0.48581239581108093, "logps/chosen": -0.22742906212806702, "logps/rejected": -3.3520054817199707, "loss": 0.2163, "odds_ratio_loss": 0.05771571770310402, "rewards/accuracies": 1.0, "rewards/chosen": -0.02274290844798088, "rewards/margins": 0.3124576807022095, "rewards/rejected": -0.33520054817199707, "sft_loss": 0.22742906212806702, "step": 2611 }, { "epoch": 3.777295733911786, "grad_norm": 2.47202283857541, "learning_rate": 2.517457738839149e-06, "logits/chosen": -0.7268478870391846, "logits/rejected": -0.6017354726791382, "logps/chosen": -0.32217803597450256, "logps/rejected": -2.822502613067627, "loss": 0.2546, "odds_ratio_loss": 0.078008271753788, "rewards/accuracies": 1.0, "rewards/chosen": -0.032217804342508316, "rewards/margins": 0.2500324845314026, "rewards/rejected": -0.2822502851486206, "sft_loss": 0.32217803597450256, "step": 2612 }, { "epoch": 3.7787418655097613, "grad_norm": 1.9410029740314656, "learning_rate": 2.5145735164118788e-06, "logits/chosen": -0.6474170088768005, "logits/rejected": -0.48739415407180786, "logps/chosen": -0.08966898918151855, "logps/rejected": -6.39056396484375, "loss": 0.1556, "odds_ratio_loss": 0.01307761948555708, "rewards/accuracies": 1.0, "rewards/chosen": -0.008966898545622826, "rewards/margins": 0.6300894618034363, "rewards/rejected": -0.639056384563446, "sft_loss": 0.08966898918151855, "step": 2613 }, { "epoch": 3.7801879971077366, "grad_norm": 2.0303896879223085, "learning_rate": 2.511690189553767e-06, "logits/chosen": -0.5155009031295776, "logits/rejected": -0.47514843940734863, "logps/chosen": -0.2928372025489807, "logps/rejected": -5.307400703430176, "loss": 0.2232, "odds_ratio_loss": 0.056762486696243286, "rewards/accuracies": 1.0, "rewards/chosen": -0.029283719137310982, "rewards/margins": 0.5014563202857971, "rewards/rejected": -0.5307400822639465, "sft_loss": 0.2928372025489807, "step": 2614 }, { "epoch": 3.7816341287057122, "grad_norm": 2.168404520852216, "learning_rate": 2.5088077600031834e-06, "logits/chosen": -0.7363658547401428, "logits/rejected": -0.6525477170944214, "logps/chosen": -0.12692435085773468, "logps/rejected": -4.636532306671143, "loss": 0.1842, "odds_ratio_loss": 0.01741865649819374, "rewards/accuracies": 1.0, "rewards/chosen": -0.012692435644567013, "rewards/margins": 0.45096075534820557, "rewards/rejected": -0.46365320682525635, "sft_loss": 0.12692435085773468, "step": 2615 }, { "epoch": 3.7830802603036875, "grad_norm": 2.0652992396748875, "learning_rate": 2.5059262294979535e-06, "logits/chosen": -0.5252244472503662, "logits/rejected": -0.4214191436767578, "logps/chosen": -0.24405062198638916, "logps/rejected": -4.097309112548828, "loss": 0.2465, "odds_ratio_loss": 0.04359886795282364, "rewards/accuracies": 1.0, "rewards/chosen": -0.024405062198638916, "rewards/margins": 0.3853258788585663, "rewards/rejected": -0.4097309112548828, "sft_loss": 0.24405062198638916, "step": 2616 }, { "epoch": 3.784526391901663, "grad_norm": 1.9517111344022926, "learning_rate": 2.5030455997753663e-06, "logits/chosen": -0.6113194227218628, "logits/rejected": -0.6022163033485413, "logps/chosen": -0.31509339809417725, "logps/rejected": -2.91943359375, "loss": 0.265, "odds_ratio_loss": 0.059509724378585815, "rewards/accuracies": 1.0, "rewards/chosen": -0.031509339809417725, "rewards/margins": 0.26043403148651123, "rewards/rejected": -0.29194337129592896, "sft_loss": 0.31509339809417725, "step": 2617 }, { "epoch": 3.7859725234996384, "grad_norm": 2.092277943733906, "learning_rate": 2.500165872572161e-06, "logits/chosen": -0.6757873296737671, "logits/rejected": -0.5339130759239197, "logps/chosen": -0.189954936504364, "logps/rejected": -4.904602527618408, "loss": 0.2004, "odds_ratio_loss": 0.05145931616425514, "rewards/accuracies": 1.0, "rewards/chosen": -0.01899549551308155, "rewards/margins": 0.47146478295326233, "rewards/rejected": -0.49046024680137634, "sft_loss": 0.189954936504364, "step": 2618 }, { "epoch": 3.787418655097614, "grad_norm": 2.152313378101085, "learning_rate": 2.4972870496245366e-06, "logits/chosen": -0.6356850266456604, "logits/rejected": -0.20262449979782104, "logps/chosen": -0.25436562299728394, "logps/rejected": -4.084608554840088, "loss": 0.2114, "odds_ratio_loss": 0.158283069729805, "rewards/accuracies": 0.9375, "rewards/chosen": -0.025436561554670334, "rewards/margins": 0.38302430510520935, "rewards/rejected": -0.4084608554840088, "sft_loss": 0.25436562299728394, "step": 2619 }, { "epoch": 3.7888647866955893, "grad_norm": 1.8461811123197192, "learning_rate": 2.4944091326681484e-06, "logits/chosen": -0.5899071097373962, "logits/rejected": -0.6064687967300415, "logps/chosen": -0.13420352339744568, "logps/rejected": -5.148534297943115, "loss": 0.1548, "odds_ratio_loss": 0.033398158848285675, "rewards/accuracies": 1.0, "rewards/chosen": -0.013420352712273598, "rewards/margins": 0.5014330148696899, "rewards/rejected": -0.5148534178733826, "sft_loss": 0.13420352339744568, "step": 2620 }, { "epoch": 3.7903109182935646, "grad_norm": 2.5677808116805085, "learning_rate": 2.4915321234381e-06, "logits/chosen": -0.6439390182495117, "logits/rejected": -0.6536998748779297, "logps/chosen": -0.21709826588630676, "logps/rejected": -2.651989698410034, "loss": 0.2035, "odds_ratio_loss": 0.05655750632286072, "rewards/accuracies": 1.0, "rewards/chosen": -0.021709825843572617, "rewards/margins": 0.24348914623260498, "rewards/rejected": -0.2651989758014679, "sft_loss": 0.21709826588630676, "step": 2621 }, { "epoch": 3.7917570498915403, "grad_norm": 1.7240970042044195, "learning_rate": 2.4886560236689542e-06, "logits/chosen": -0.4303063154220581, "logits/rejected": -0.2958157956600189, "logps/chosen": -0.10966433584690094, "logps/rejected": -5.254568099975586, "loss": 0.1329, "odds_ratio_loss": 0.017621489241719246, "rewards/accuracies": 1.0, "rewards/chosen": -0.010966433212161064, "rewards/margins": 0.5144904255867004, "rewards/rejected": -0.5254568457603455, "sft_loss": 0.10966433584690094, "step": 2622 }, { "epoch": 3.7932031814895155, "grad_norm": 2.2505913691166635, "learning_rate": 2.4857808350947186e-06, "logits/chosen": -0.5765487551689148, "logits/rejected": -0.26596999168395996, "logps/chosen": -0.19715330004692078, "logps/rejected": -3.5105814933776855, "loss": 0.1702, "odds_ratio_loss": 0.039164457470178604, "rewards/accuracies": 1.0, "rewards/chosen": -0.019715333357453346, "rewards/margins": 0.33134281635284424, "rewards/rejected": -0.35105812549591064, "sft_loss": 0.19715330004692078, "step": 2623 }, { "epoch": 3.7946493130874908, "grad_norm": 1.9007264625255884, "learning_rate": 2.4829065594488586e-06, "logits/chosen": -0.6615445613861084, "logits/rejected": -0.485071063041687, "logps/chosen": -0.16579408943653107, "logps/rejected": -5.319438457489014, "loss": 0.1492, "odds_ratio_loss": 0.04737501218914986, "rewards/accuracies": 1.0, "rewards/chosen": -0.016579408198595047, "rewards/margins": 0.515364408493042, "rewards/rejected": -0.5319438576698303, "sft_loss": 0.16579408943653107, "step": 2624 }, { "epoch": 3.7960954446854664, "grad_norm": 1.9969119227602157, "learning_rate": 2.4800331984642837e-06, "logits/chosen": -0.5275101661682129, "logits/rejected": -0.48966342210769653, "logps/chosen": -0.21786557137966156, "logps/rejected": -2.759644031524658, "loss": 0.1551, "odds_ratio_loss": 0.054346635937690735, "rewards/accuracies": 1.0, "rewards/chosen": -0.021786555647850037, "rewards/margins": 0.25417786836624146, "rewards/rejected": -0.2759644091129303, "sft_loss": 0.21786557137966156, "step": 2625 }, { "epoch": 3.7975415762834417, "grad_norm": 3.3918947625961184, "learning_rate": 2.4771607538733554e-06, "logits/chosen": -0.4635317325592041, "logits/rejected": -0.377239465713501, "logps/chosen": -0.12900260090827942, "logps/rejected": -5.37921142578125, "loss": 0.1137, "odds_ratio_loss": 0.009468241594731808, "rewards/accuracies": 1.0, "rewards/chosen": -0.012900261208415031, "rewards/margins": 0.5250208377838135, "rewards/rejected": -0.537921130657196, "sft_loss": 0.12900260090827942, "step": 2626 }, { "epoch": 3.7989877078814174, "grad_norm": 2.052320299573357, "learning_rate": 2.474289227407878e-06, "logits/chosen": -0.6370794773101807, "logits/rejected": -0.4276534914970398, "logps/chosen": -0.18756850063800812, "logps/rejected": -3.975283622741699, "loss": 0.1866, "odds_ratio_loss": 0.0332857221364975, "rewards/accuracies": 1.0, "rewards/chosen": -0.01875685155391693, "rewards/margins": 0.37877151370048523, "rewards/rejected": -0.39752835035324097, "sft_loss": 0.18756850063800812, "step": 2627 }, { "epoch": 3.8004338394793926, "grad_norm": 2.194141426402103, "learning_rate": 2.4714186207991095e-06, "logits/chosen": -0.5361537337303162, "logits/rejected": -0.5779704451560974, "logps/chosen": -0.19404172897338867, "logps/rejected": -2.87283992767334, "loss": 0.2097, "odds_ratio_loss": 0.04218186438083649, "rewards/accuracies": 1.0, "rewards/chosen": -0.019404174759984016, "rewards/margins": 0.26787981390953064, "rewards/rejected": -0.2872839570045471, "sft_loss": 0.19404172897338867, "step": 2628 }, { "epoch": 3.8018799710773683, "grad_norm": 2.0262944409100654, "learning_rate": 2.468548935777747e-06, "logits/chosen": -0.660569429397583, "logits/rejected": -0.5347737073898315, "logps/chosen": -0.16592204570770264, "logps/rejected": -4.6749138832092285, "loss": 0.2168, "odds_ratio_loss": 0.030461864545941353, "rewards/accuracies": 1.0, "rewards/chosen": -0.016592204570770264, "rewards/margins": 0.4508991837501526, "rewards/rejected": -0.46749138832092285, "sft_loss": 0.16592204570770264, "step": 2629 }, { "epoch": 3.8033261026753435, "grad_norm": 2.0284521617244002, "learning_rate": 2.4656801740739356e-06, "logits/chosen": -0.5391656756401062, "logits/rejected": -0.42175936698913574, "logps/chosen": -0.1921410858631134, "logps/rejected": -3.636936902999878, "loss": 0.194, "odds_ratio_loss": 0.053551651537418365, "rewards/accuracies": 1.0, "rewards/chosen": -0.01921410858631134, "rewards/margins": 0.34447959065437317, "rewards/rejected": -0.3636936843395233, "sft_loss": 0.1921410858631134, "step": 2630 }, { "epoch": 3.804772234273319, "grad_norm": 2.0716752248142467, "learning_rate": 2.46281233741726e-06, "logits/chosen": -0.6886047124862671, "logits/rejected": -0.5288622379302979, "logps/chosen": -0.23665514588356018, "logps/rejected": -2.459178924560547, "loss": 0.2018, "odds_ratio_loss": 0.06963891535997391, "rewards/accuracies": 1.0, "rewards/chosen": -0.023665515705943108, "rewards/margins": 0.22225239872932434, "rewards/rejected": -0.2459179013967514, "sft_loss": 0.23665514588356018, "step": 2631 }, { "epoch": 3.8062183658712945, "grad_norm": 2.0485758856786034, "learning_rate": 2.4599454275367526e-06, "logits/chosen": -0.6599227786064148, "logits/rejected": -0.459836483001709, "logps/chosen": -0.18189409375190735, "logps/rejected": -2.9734206199645996, "loss": 0.2247, "odds_ratio_loss": 0.02089795470237732, "rewards/accuracies": 1.0, "rewards/chosen": -0.018189409747719765, "rewards/margins": 0.27915263175964355, "rewards/rejected": -0.2973420321941376, "sft_loss": 0.18189409375190735, "step": 2632 }, { "epoch": 3.8076644974692697, "grad_norm": 2.057869407827372, "learning_rate": 2.4570794461608816e-06, "logits/chosen": -0.6338626146316528, "logits/rejected": -0.7052443027496338, "logps/chosen": -0.16895508766174316, "logps/rejected": -1.9207273721694946, "loss": 0.2421, "odds_ratio_loss": 0.05102720484137535, "rewards/accuracies": 1.0, "rewards/chosen": -0.016895508393645287, "rewards/margins": 0.17517723143100739, "rewards/rejected": -0.19207274913787842, "sft_loss": 0.16895508766174316, "step": 2633 }, { "epoch": 3.809110629067245, "grad_norm": 1.9324861662836847, "learning_rate": 2.4542143950175594e-06, "logits/chosen": -0.5443887710571289, "logits/rejected": -0.3198566436767578, "logps/chosen": -0.16024640202522278, "logps/rejected": -5.304378509521484, "loss": 0.2111, "odds_ratio_loss": 0.018941111862659454, "rewards/accuracies": 1.0, "rewards/chosen": -0.016024641692638397, "rewards/margins": 0.5144132375717163, "rewards/rejected": -0.5304378867149353, "sft_loss": 0.16024640202522278, "step": 2634 }, { "epoch": 3.8105567606652206, "grad_norm": 2.0345855814772307, "learning_rate": 2.4513502758341365e-06, "logits/chosen": -0.6028193235397339, "logits/rejected": -0.6171325445175171, "logps/chosen": -0.3160252571105957, "logps/rejected": -3.1160173416137695, "loss": 0.1974, "odds_ratio_loss": 0.12013718485832214, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03160252422094345, "rewards/margins": 0.2799992263317108, "rewards/rejected": -0.31160175800323486, "sft_loss": 0.3160252571105957, "step": 2635 }, { "epoch": 3.812002892263196, "grad_norm": 1.8867295971525198, "learning_rate": 2.448487090337399e-06, "logits/chosen": -0.3207634687423706, "logits/rejected": -0.26631879806518555, "logps/chosen": -0.11553123593330383, "logps/rejected": -4.209815979003906, "loss": 0.1515, "odds_ratio_loss": 0.0758066475391388, "rewards/accuracies": 0.9375, "rewards/chosen": -0.011553123593330383, "rewards/margins": 0.4094284772872925, "rewards/rejected": -0.42098158597946167, "sft_loss": 0.11553123593330383, "step": 2636 }, { "epoch": 3.813449023861171, "grad_norm": 2.002878147880314, "learning_rate": 2.4456248402535744e-06, "logits/chosen": -0.4317575991153717, "logits/rejected": -0.36689284443855286, "logps/chosen": -0.1188906878232956, "logps/rejected": -5.702237129211426, "loss": 0.1365, "odds_ratio_loss": 0.030404847115278244, "rewards/accuracies": 1.0, "rewards/chosen": -0.011889069341123104, "rewards/margins": 0.5583345890045166, "rewards/rejected": -0.5702236890792847, "sft_loss": 0.1188906878232956, "step": 2637 }, { "epoch": 3.814895155459147, "grad_norm": 2.0028531170479185, "learning_rate": 2.4427635273083205e-06, "logits/chosen": -0.5605095624923706, "logits/rejected": -0.6367613077163696, "logps/chosen": -0.16176557540893555, "logps/rejected": -2.33298397064209, "loss": 0.1762, "odds_ratio_loss": 0.03323551267385483, "rewards/accuracies": 1.0, "rewards/chosen": -0.016176559031009674, "rewards/margins": 0.21712183952331543, "rewards/rejected": -0.2332983911037445, "sft_loss": 0.16176557540893555, "step": 2638 }, { "epoch": 3.816341287057122, "grad_norm": 1.9136356302803936, "learning_rate": 2.439903153226738e-06, "logits/chosen": -0.4376313388347626, "logits/rejected": -0.34835922718048096, "logps/chosen": -0.11063480377197266, "logps/rejected": -6.628140926361084, "loss": 0.1722, "odds_ratio_loss": 0.02789875864982605, "rewards/accuracies": 1.0, "rewards/chosen": -0.011063480749726295, "rewards/margins": 0.6517506837844849, "rewards/rejected": -0.6628141403198242, "sft_loss": 0.11063480377197266, "step": 2639 }, { "epoch": 3.8177874186550977, "grad_norm": 1.9359572286512576, "learning_rate": 2.4370437197333535e-06, "logits/chosen": -0.7120791077613831, "logits/rejected": -0.4205125868320465, "logps/chosen": -0.23005536198616028, "logps/rejected": -4.266036033630371, "loss": 0.1677, "odds_ratio_loss": 0.02433464676141739, "rewards/accuracies": 1.0, "rewards/chosen": -0.023005535826086998, "rewards/margins": 0.4035980999469757, "rewards/rejected": -0.42660361528396606, "sft_loss": 0.23005536198616028, "step": 2640 }, { "epoch": 3.819233550253073, "grad_norm": 2.2466722500017187, "learning_rate": 2.434185228552133e-06, "logits/chosen": -0.5734530091285706, "logits/rejected": -0.5702399015426636, "logps/chosen": -0.2448917031288147, "logps/rejected": -3.1988368034362793, "loss": 0.1682, "odds_ratio_loss": 0.03184647485613823, "rewards/accuracies": 1.0, "rewards/chosen": -0.02448917180299759, "rewards/margins": 0.29539453983306885, "rewards/rejected": -0.31988370418548584, "sft_loss": 0.2448917031288147, "step": 2641 }, { "epoch": 3.8206796818510487, "grad_norm": 2.5007274744169905, "learning_rate": 2.431327681406468e-06, "logits/chosen": -0.4124388098716736, "logits/rejected": -0.30019381642341614, "logps/chosen": -0.220967099070549, "logps/rejected": -4.135493278503418, "loss": 0.2378, "odds_ratio_loss": 0.04712362587451935, "rewards/accuracies": 1.0, "rewards/chosen": -0.022096708416938782, "rewards/margins": 0.3914526104927063, "rewards/rejected": -0.4135493040084839, "sft_loss": 0.220967099070549, "step": 2642 }, { "epoch": 3.822125813449024, "grad_norm": 2.2516952824095715, "learning_rate": 2.4284710800191877e-06, "logits/chosen": -0.6341081857681274, "logits/rejected": -0.4541645646095276, "logps/chosen": -0.218563973903656, "logps/rejected": -3.7575972080230713, "loss": 0.1739, "odds_ratio_loss": 0.023864325135946274, "rewards/accuracies": 1.0, "rewards/chosen": -0.0218563973903656, "rewards/margins": 0.3539033532142639, "rewards/rejected": -0.37575972080230713, "sft_loss": 0.218563973903656, "step": 2643 }, { "epoch": 3.823571945046999, "grad_norm": 1.8444139630057388, "learning_rate": 2.425615426112545e-06, "logits/chosen": -0.6047503352165222, "logits/rejected": -0.41343581676483154, "logps/chosen": -0.1336621344089508, "logps/rejected": -4.135256290435791, "loss": 0.1668, "odds_ratio_loss": 0.011073876172304153, "rewards/accuracies": 1.0, "rewards/chosen": -0.01336621306836605, "rewards/margins": 0.40015938878059387, "rewards/rejected": -0.41352561116218567, "sft_loss": 0.1336621344089508, "step": 2644 }, { "epoch": 3.825018076644975, "grad_norm": 3.148403921694642, "learning_rate": 2.4227607214082267e-06, "logits/chosen": -0.3584001958370209, "logits/rejected": -0.3328199088573456, "logps/chosen": -0.12119731307029724, "logps/rejected": -3.8874454498291016, "loss": 0.1786, "odds_ratio_loss": 0.0209796205163002, "rewards/accuracies": 1.0, "rewards/chosen": -0.012119731865823269, "rewards/margins": 0.37662479281425476, "rewards/rejected": -0.3887445628643036, "sft_loss": 0.12119731307029724, "step": 2645 }, { "epoch": 3.82646420824295, "grad_norm": 2.0977720178713906, "learning_rate": 2.419906967627343e-06, "logits/chosen": -0.6201465129852295, "logits/rejected": -0.49884653091430664, "logps/chosen": -0.18073919415473938, "logps/rejected": -5.1349968910217285, "loss": 0.2091, "odds_ratio_loss": 0.04040105640888214, "rewards/accuracies": 1.0, "rewards/chosen": -0.018073920160531998, "rewards/margins": 0.4954257607460022, "rewards/rejected": -0.5134996771812439, "sft_loss": 0.18073919415473938, "step": 2646 }, { "epoch": 3.8279103398409253, "grad_norm": 2.253181881937432, "learning_rate": 2.417054166490433e-06, "logits/chosen": -0.6608943939208984, "logits/rejected": -0.5251725316047668, "logps/chosen": -0.10730911791324615, "logps/rejected": -3.7329394817352295, "loss": 0.1553, "odds_ratio_loss": 0.030384372919797897, "rewards/accuracies": 1.0, "rewards/chosen": -0.01073091197758913, "rewards/margins": 0.36256301403045654, "rewards/rejected": -0.3732939660549164, "sft_loss": 0.10730911791324615, "step": 2647 }, { "epoch": 3.829356471438901, "grad_norm": 3.032198838691612, "learning_rate": 2.4142023197174625e-06, "logits/chosen": -0.6617286205291748, "logits/rejected": -0.5526999235153198, "logps/chosen": -0.14543002843856812, "logps/rejected": -4.451727390289307, "loss": 0.1426, "odds_ratio_loss": 0.017422260716557503, "rewards/accuracies": 1.0, "rewards/chosen": -0.014543002471327782, "rewards/margins": 0.4306297302246094, "rewards/rejected": -0.4451727569103241, "sft_loss": 0.14543002843856812, "step": 2648 }, { "epoch": 3.8308026030368763, "grad_norm": 2.104212176731941, "learning_rate": 2.4113514290278193e-06, "logits/chosen": -0.49113255739212036, "logits/rejected": -0.36438047885894775, "logps/chosen": -0.12778490781784058, "logps/rejected": -3.6236534118652344, "loss": 0.1499, "odds_ratio_loss": 0.01595112681388855, "rewards/accuracies": 1.0, "rewards/chosen": -0.012778490781784058, "rewards/margins": 0.3495868742465973, "rewards/rejected": -0.36236536502838135, "sft_loss": 0.12778490781784058, "step": 2649 }, { "epoch": 3.8322487346348515, "grad_norm": 1.9491076169596921, "learning_rate": 2.4085014961403168e-06, "logits/chosen": -0.6785717010498047, "logits/rejected": -0.5729687809944153, "logps/chosen": -0.20131143927574158, "logps/rejected": -4.839371681213379, "loss": 0.171, "odds_ratio_loss": 0.04480065777897835, "rewards/accuracies": 1.0, "rewards/chosen": -0.020131144672632217, "rewards/margins": 0.46380603313446045, "rewards/rejected": -0.48393720388412476, "sft_loss": 0.20131143927574158, "step": 2650 }, { "epoch": 3.833694866232827, "grad_norm": 2.714118679981264, "learning_rate": 2.4056525227731882e-06, "logits/chosen": -0.4560430943965912, "logits/rejected": -0.3898116648197174, "logps/chosen": -0.2648117244243622, "logps/rejected": -3.906280517578125, "loss": 0.1648, "odds_ratio_loss": 0.062475770711898804, "rewards/accuracies": 1.0, "rewards/chosen": -0.02648117206990719, "rewards/margins": 0.3641469180583954, "rewards/rejected": -0.39062806963920593, "sft_loss": 0.2648117244243622, "step": 2651 }, { "epoch": 3.835140997830803, "grad_norm": 2.2461660391212943, "learning_rate": 2.4028045106440933e-06, "logits/chosen": -0.590645432472229, "logits/rejected": -0.4207696318626404, "logps/chosen": -0.21039243042469025, "logps/rejected": -4.4112677574157715, "loss": 0.1987, "odds_ratio_loss": 0.035821445286273956, "rewards/accuracies": 1.0, "rewards/chosen": -0.021039243787527084, "rewards/margins": 0.4200875461101532, "rewards/rejected": -0.4411267936229706, "sft_loss": 0.21039243042469025, "step": 2652 }, { "epoch": 3.836587129428778, "grad_norm": 2.166266828237972, "learning_rate": 2.3999574614701067e-06, "logits/chosen": -0.669924259185791, "logits/rejected": -0.48964861035346985, "logps/chosen": -0.15689903497695923, "logps/rejected": -3.4739127159118652, "loss": 0.1679, "odds_ratio_loss": 0.04638451337814331, "rewards/accuracies": 1.0, "rewards/chosen": -0.015689902007579803, "rewards/margins": 0.3317013680934906, "rewards/rejected": -0.347391277551651, "sft_loss": 0.15689903497695923, "step": 2653 }, { "epoch": 3.8380332610267534, "grad_norm": 2.0754765994658, "learning_rate": 2.3971113769677263e-06, "logits/chosen": -0.4671492278575897, "logits/rejected": -0.4130011796951294, "logps/chosen": -0.1233329176902771, "logps/rejected": -4.802967071533203, "loss": 0.1486, "odds_ratio_loss": 0.016072288155555725, "rewards/accuracies": 1.0, "rewards/chosen": -0.012333293445408344, "rewards/margins": 0.46796342730522156, "rewards/rejected": -0.48029670119285583, "sft_loss": 0.1233329176902771, "step": 2654 }, { "epoch": 3.839479392624729, "grad_norm": 2.198700366164306, "learning_rate": 2.394266258852865e-06, "logits/chosen": -0.6466750502586365, "logits/rejected": -0.579832136631012, "logps/chosen": -0.2253737449645996, "logps/rejected": -3.258246660232544, "loss": 0.197, "odds_ratio_loss": 0.03207365423440933, "rewards/accuracies": 1.0, "rewards/chosen": -0.02253737300634384, "rewards/margins": 0.3032872974872589, "rewards/rejected": -0.32582464814186096, "sft_loss": 0.2253737449645996, "step": 2655 }, { "epoch": 3.8409255242227043, "grad_norm": 2.806483633942708, "learning_rate": 2.3914221088408583e-06, "logits/chosen": -0.4588702619075775, "logits/rejected": -0.34531545639038086, "logps/chosen": -0.2463608831167221, "logps/rejected": -4.684918403625488, "loss": 0.1539, "odds_ratio_loss": 0.06339805573225021, "rewards/accuracies": 1.0, "rewards/chosen": -0.02463608607649803, "rewards/margins": 0.4438557028770447, "rewards/rejected": -0.4684918522834778, "sft_loss": 0.2463608831167221, "step": 2656 }, { "epoch": 3.8423716558206795, "grad_norm": 2.1236348351774277, "learning_rate": 2.3885789286464527e-06, "logits/chosen": -0.4032669961452484, "logits/rejected": -0.27119266986846924, "logps/chosen": -0.12063460052013397, "logps/rejected": -4.50440788269043, "loss": 0.166, "odds_ratio_loss": 0.03388247638940811, "rewards/accuracies": 1.0, "rewards/chosen": -0.012063460424542427, "rewards/margins": 0.43837735056877136, "rewards/rejected": -0.45044082403182983, "sft_loss": 0.12063460052013397, "step": 2657 }, { "epoch": 3.843817787418655, "grad_norm": 1.9236357962541986, "learning_rate": 2.385736719983813e-06, "logits/chosen": -0.5088585615158081, "logits/rejected": -0.4260765314102173, "logps/chosen": -0.21049919724464417, "logps/rejected": -4.297446250915527, "loss": 0.1824, "odds_ratio_loss": 0.03804740309715271, "rewards/accuracies": 1.0, "rewards/chosen": -0.021049920469522476, "rewards/margins": 0.4086947441101074, "rewards/rejected": -0.4297446608543396, "sft_loss": 0.21049919724464417, "step": 2658 }, { "epoch": 3.8452639190166304, "grad_norm": 2.1599104065431614, "learning_rate": 2.3828954845665153e-06, "logits/chosen": -0.5122015476226807, "logits/rejected": -0.5679474472999573, "logps/chosen": -0.2392263263463974, "logps/rejected": -5.5167236328125, "loss": 0.2194, "odds_ratio_loss": 0.03686853125691414, "rewards/accuracies": 1.0, "rewards/chosen": -0.02392263151705265, "rewards/margins": 0.52774977684021, "rewards/rejected": -0.5516723990440369, "sft_loss": 0.2392263263463974, "step": 2659 }, { "epoch": 3.8467100506146057, "grad_norm": 2.251439001850937, "learning_rate": 2.3800552241075538e-06, "logits/chosen": -0.5718013048171997, "logits/rejected": -0.6475985050201416, "logps/chosen": -0.15388913452625275, "logps/rejected": -3.5337791442871094, "loss": 0.2118, "odds_ratio_loss": 0.04061558097600937, "rewards/accuracies": 1.0, "rewards/chosen": -0.01538891438394785, "rewards/margins": 0.33798903226852417, "rewards/rejected": -0.35337793827056885, "sft_loss": 0.15388913452625275, "step": 2660 }, { "epoch": 3.8481561822125814, "grad_norm": 2.3274237399359605, "learning_rate": 2.3772159403193315e-06, "logits/chosen": -0.5476040840148926, "logits/rejected": -0.3083384931087494, "logps/chosen": -0.14341911673545837, "logps/rejected": -5.004617691040039, "loss": 0.1523, "odds_ratio_loss": 0.00789717212319374, "rewards/accuracies": 1.0, "rewards/chosen": -0.014341913163661957, "rewards/margins": 0.4861198663711548, "rewards/rejected": -0.500461757183075, "sft_loss": 0.14341911673545837, "step": 2661 }, { "epoch": 3.8496023138105566, "grad_norm": 1.853220411413604, "learning_rate": 2.3743776349136615e-06, "logits/chosen": -0.6236512660980225, "logits/rejected": -0.5720344185829163, "logps/chosen": -0.27546441555023193, "logps/rejected": -3.9568581581115723, "loss": 0.1901, "odds_ratio_loss": 0.06295789033174515, "rewards/accuracies": 1.0, "rewards/chosen": -0.027546444907784462, "rewards/margins": 0.368139386177063, "rewards/rejected": -0.3956858515739441, "sft_loss": 0.27546441555023193, "step": 2662 }, { "epoch": 3.8510484454085323, "grad_norm": 2.382636427805891, "learning_rate": 2.3715403096017713e-06, "logits/chosen": -0.5481364727020264, "logits/rejected": -0.35319069027900696, "logps/chosen": -0.13607822358608246, "logps/rejected": -6.205620288848877, "loss": 0.1311, "odds_ratio_loss": 0.012123924680054188, "rewards/accuracies": 1.0, "rewards/chosen": -0.013607822358608246, "rewards/margins": 0.6069542169570923, "rewards/rejected": -0.6205620765686035, "sft_loss": 0.13607822358608246, "step": 2663 }, { "epoch": 3.8524945770065075, "grad_norm": 2.1180282826968915, "learning_rate": 2.3687039660942926e-06, "logits/chosen": -0.611968994140625, "logits/rejected": -0.44643181562423706, "logps/chosen": -0.14699847996234894, "logps/rejected": -5.006779670715332, "loss": 0.1661, "odds_ratio_loss": 0.04196387901902199, "rewards/accuracies": 1.0, "rewards/chosen": -0.014699846506118774, "rewards/margins": 0.4859781861305237, "rewards/rejected": -0.5006780624389648, "sft_loss": 0.14699847996234894, "step": 2664 }, { "epoch": 3.8539407086044832, "grad_norm": 2.104062036320424, "learning_rate": 2.365868606101269e-06, "logits/chosen": -0.7090559601783752, "logits/rejected": -0.5487937927246094, "logps/chosen": -0.11773136258125305, "logps/rejected": -4.815327167510986, "loss": 0.1767, "odds_ratio_loss": 0.018653199076652527, "rewards/accuracies": 1.0, "rewards/chosen": -0.011773135513067245, "rewards/margins": 0.46975958347320557, "rewards/rejected": -0.4815327525138855, "sft_loss": 0.11773136258125305, "step": 2665 }, { "epoch": 3.8553868402024585, "grad_norm": 2.0999659236456902, "learning_rate": 2.3630342313321473e-06, "logits/chosen": -0.4976791441440582, "logits/rejected": -0.646413266658783, "logps/chosen": -0.21249869465827942, "logps/rejected": -4.945645809173584, "loss": 0.1906, "odds_ratio_loss": 0.06591396033763885, "rewards/accuracies": 1.0, "rewards/chosen": -0.021249869838356972, "rewards/margins": 0.4733147621154785, "rewards/rejected": -0.49456462264060974, "sft_loss": 0.21249869465827942, "step": 2666 }, { "epoch": 3.8568329718004337, "grad_norm": 2.1533503427856537, "learning_rate": 2.360200843495786e-06, "logits/chosen": -0.4372127056121826, "logits/rejected": -0.4424366354942322, "logps/chosen": -0.24950441718101501, "logps/rejected": -2.736983299255371, "loss": 0.1789, "odds_ratio_loss": 0.06281781941652298, "rewards/accuracies": 1.0, "rewards/chosen": -0.02495044469833374, "rewards/margins": 0.24874788522720337, "rewards/rejected": -0.2736983299255371, "sft_loss": 0.24950441718101501, "step": 2667 }, { "epoch": 3.8582791033984094, "grad_norm": 3.0188891770587762, "learning_rate": 2.3573684443004425e-06, "logits/chosen": -0.46939700841903687, "logits/rejected": -0.5516136884689331, "logps/chosen": -0.27107489109039307, "logps/rejected": -3.729104518890381, "loss": 0.1659, "odds_ratio_loss": 0.055554620921611786, "rewards/accuracies": 1.0, "rewards/chosen": -0.027107488363981247, "rewards/margins": 0.3458029627799988, "rewards/rejected": -0.3729104697704315, "sft_loss": 0.27107489109039307, "step": 2668 }, { "epoch": 3.8597252349963846, "grad_norm": 2.31441815819728, "learning_rate": 2.354537035453783e-06, "logits/chosen": -0.8334481120109558, "logits/rejected": -0.6476742625236511, "logps/chosen": -0.08477871119976044, "logps/rejected": -4.919661521911621, "loss": 0.1985, "odds_ratio_loss": 0.008285166695713997, "rewards/accuracies": 1.0, "rewards/chosen": -0.008477871306240559, "rewards/margins": 0.4834883213043213, "rewards/rejected": -0.491966187953949, "sft_loss": 0.08477871119976044, "step": 2669 }, { "epoch": 3.86117136659436, "grad_norm": 1.8740259808678605, "learning_rate": 2.351706618662871e-06, "logits/chosen": -0.6517231464385986, "logits/rejected": -0.40706366300582886, "logps/chosen": -0.22961880266666412, "logps/rejected": -5.185815811157227, "loss": 0.1517, "odds_ratio_loss": 0.056328706443309784, "rewards/accuracies": 1.0, "rewards/chosen": -0.022961881011724472, "rewards/margins": 0.4956197142601013, "rewards/rejected": -0.5185816287994385, "sft_loss": 0.22961880266666412, "step": 2670 }, { "epoch": 3.8626174981923356, "grad_norm": 2.2545041214525696, "learning_rate": 2.3488771956341795e-06, "logits/chosen": -0.5102351903915405, "logits/rejected": -0.4168573319911957, "logps/chosen": -0.19025494158267975, "logps/rejected": -5.506153106689453, "loss": 0.1912, "odds_ratio_loss": 0.05001336708664894, "rewards/accuracies": 1.0, "rewards/chosen": -0.019025495275855064, "rewards/margins": 0.5315897464752197, "rewards/rejected": -0.5506153106689453, "sft_loss": 0.19025494158267975, "step": 2671 }, { "epoch": 3.864063629790311, "grad_norm": 2.267911264770996, "learning_rate": 2.346048768073575e-06, "logits/chosen": -0.5635769963264465, "logits/rejected": -0.511735737323761, "logps/chosen": -0.12450949102640152, "logps/rejected": -5.101263523101807, "loss": 0.127, "odds_ratio_loss": 0.04796475172042847, "rewards/accuracies": 1.0, "rewards/chosen": -0.012450949288904667, "rewards/margins": 0.49767541885375977, "rewards/rejected": -0.5101263523101807, "sft_loss": 0.12450949102640152, "step": 2672 }, { "epoch": 3.865509761388286, "grad_norm": 2.1719020920397387, "learning_rate": 2.343221337686328e-06, "logits/chosen": -0.6359919309616089, "logits/rejected": -0.5677777528762817, "logps/chosen": -0.18134115636348724, "logps/rejected": -3.594672679901123, "loss": 0.1779, "odds_ratio_loss": 0.038205526769161224, "rewards/accuracies": 1.0, "rewards/chosen": -0.018134113401174545, "rewards/margins": 0.34133318066596985, "rewards/rejected": -0.3594672977924347, "sft_loss": 0.18134115636348724, "step": 2673 }, { "epoch": 3.8669558929862617, "grad_norm": 2.1634017159786483, "learning_rate": 2.3403949061771083e-06, "logits/chosen": -0.7519722580909729, "logits/rejected": -0.4830787181854248, "logps/chosen": -0.19885629415512085, "logps/rejected": -6.522568225860596, "loss": 0.1848, "odds_ratio_loss": 0.01121095847338438, "rewards/accuracies": 1.0, "rewards/chosen": -0.019885629415512085, "rewards/margins": 0.6323711276054382, "rewards/rejected": -0.6522568464279175, "sft_loss": 0.19885629415512085, "step": 2674 }, { "epoch": 3.8684020245842374, "grad_norm": 1.9462707784219915, "learning_rate": 2.33756947524998e-06, "logits/chosen": -0.5072588920593262, "logits/rejected": -0.3704627752304077, "logps/chosen": -0.09452974051237106, "logps/rejected": -4.214332103729248, "loss": 0.1956, "odds_ratio_loss": 0.014662055298686028, "rewards/accuracies": 1.0, "rewards/chosen": -0.009452973492443562, "rewards/margins": 0.4119802713394165, "rewards/rejected": -0.4214332699775696, "sft_loss": 0.09452974051237106, "step": 2675 }, { "epoch": 3.8698481561822127, "grad_norm": 1.883670642460947, "learning_rate": 2.3347450466084064e-06, "logits/chosen": -0.4906489849090576, "logits/rejected": -0.5329009890556335, "logps/chosen": -0.13472123444080353, "logps/rejected": -3.3913538455963135, "loss": 0.1755, "odds_ratio_loss": 0.026549160480499268, "rewards/accuracies": 1.0, "rewards/chosen": -0.013472123071551323, "rewards/margins": 0.3256632685661316, "rewards/rejected": -0.33913540840148926, "sft_loss": 0.13472123444080353, "step": 2676 }, { "epoch": 3.871294287780188, "grad_norm": 1.7645055199572455, "learning_rate": 2.3319216219552465e-06, "logits/chosen": -0.7514166831970215, "logits/rejected": -0.4781545102596283, "logps/chosen": -0.19920554757118225, "logps/rejected": -4.202726364135742, "loss": 0.2171, "odds_ratio_loss": 0.050058625638484955, "rewards/accuracies": 1.0, "rewards/chosen": -0.019920554012060165, "rewards/margins": 0.4003520607948303, "rewards/rejected": -0.4202726483345032, "sft_loss": 0.19920554757118225, "step": 2677 }, { "epoch": 3.8727404193781636, "grad_norm": 2.272822367567261, "learning_rate": 2.3290992029927545e-06, "logits/chosen": -0.5296982526779175, "logits/rejected": -0.42527490854263306, "logps/chosen": -0.1885211169719696, "logps/rejected": -3.3843655586242676, "loss": 0.1662, "odds_ratio_loss": 0.03290311619639397, "rewards/accuracies": 1.0, "rewards/chosen": -0.0188521146774292, "rewards/margins": 0.319584459066391, "rewards/rejected": -0.3384365737438202, "sft_loss": 0.1885211169719696, "step": 2678 }, { "epoch": 3.874186550976139, "grad_norm": 2.1408885936856636, "learning_rate": 2.326277791422574e-06, "logits/chosen": -0.6290498971939087, "logits/rejected": -0.4600859582424164, "logps/chosen": -0.1297677755355835, "logps/rejected": -4.369756698608398, "loss": 0.1729, "odds_ratio_loss": 0.016120584681630135, "rewards/accuracies": 1.0, "rewards/chosen": -0.01297677680850029, "rewards/margins": 0.4239988625049591, "rewards/rejected": -0.4369756579399109, "sft_loss": 0.1297677755355835, "step": 2679 }, { "epoch": 3.875632682574114, "grad_norm": 2.392102570924333, "learning_rate": 2.3234573889457477e-06, "logits/chosen": -0.6287966370582581, "logits/rejected": -0.5920383930206299, "logps/chosen": -0.1912301480770111, "logps/rejected": -3.680026054382324, "loss": 0.2083, "odds_ratio_loss": 0.06031038612127304, "rewards/accuracies": 1.0, "rewards/chosen": -0.01912301406264305, "rewards/margins": 0.3488796353340149, "rewards/rejected": -0.36800259351730347, "sft_loss": 0.1912301480770111, "step": 2680 }, { "epoch": 3.8770788141720898, "grad_norm": 2.307929371288277, "learning_rate": 2.3206379972627047e-06, "logits/chosen": -0.5087276697158813, "logits/rejected": -0.34346815943717957, "logps/chosen": -0.22030162811279297, "logps/rejected": -2.734839916229248, "loss": 0.2434, "odds_ratio_loss": 0.03812616690993309, "rewards/accuracies": 1.0, "rewards/chosen": -0.022030163556337357, "rewards/margins": 0.25145381689071655, "rewards/rejected": -0.2734839916229248, "sft_loss": 0.22030162811279297, "step": 2681 }, { "epoch": 3.878524945770065, "grad_norm": 2.261436653549, "learning_rate": 2.31781961807327e-06, "logits/chosen": -0.6415372490882874, "logits/rejected": -0.5570952892303467, "logps/chosen": -0.1425553858280182, "logps/rejected": -4.366990566253662, "loss": 0.1465, "odds_ratio_loss": 0.023208629339933395, "rewards/accuracies": 1.0, "rewards/chosen": -0.014255540445446968, "rewards/margins": 0.4224435091018677, "rewards/rejected": -0.4366990923881531, "sft_loss": 0.1425553858280182, "step": 2682 }, { "epoch": 3.8799710773680403, "grad_norm": 1.8952349868348815, "learning_rate": 2.31500225307665e-06, "logits/chosen": -0.508310079574585, "logits/rejected": -0.42341750860214233, "logps/chosen": -0.09663991630077362, "logps/rejected": -4.333970069885254, "loss": 0.1367, "odds_ratio_loss": 0.005665269680321217, "rewards/accuracies": 1.0, "rewards/chosen": -0.009663991630077362, "rewards/margins": 0.4237329959869385, "rewards/rejected": -0.43339696526527405, "sft_loss": 0.09663991630077362, "step": 2683 }, { "epoch": 3.881417208966016, "grad_norm": 2.4567404402425863, "learning_rate": 2.3121859039714492e-06, "logits/chosen": -0.5046891570091248, "logits/rejected": -0.38773348927497864, "logps/chosen": -0.169320210814476, "logps/rejected": -3.1759390830993652, "loss": 0.1662, "odds_ratio_loss": 0.05408705398440361, "rewards/accuracies": 1.0, "rewards/chosen": -0.01693202182650566, "rewards/margins": 0.3006618618965149, "rewards/rejected": -0.31759390234947205, "sft_loss": 0.169320210814476, "step": 2684 }, { "epoch": 3.882863340563991, "grad_norm": 3.069883800674717, "learning_rate": 2.3093705724556527e-06, "logits/chosen": -0.47510790824890137, "logits/rejected": -0.37609606981277466, "logps/chosen": -0.21013259887695312, "logps/rejected": -2.8292295932769775, "loss": 0.2096, "odds_ratio_loss": 0.046919770538806915, "rewards/accuracies": 1.0, "rewards/chosen": -0.02101326175034046, "rewards/margins": 0.26190969347953796, "rewards/rejected": -0.2829229533672333, "sft_loss": 0.21013259887695312, "step": 2685 }, { "epoch": 3.884309472161967, "grad_norm": 2.517335439887365, "learning_rate": 2.3065562602266336e-06, "logits/chosen": -0.5777808427810669, "logits/rejected": -0.6327955722808838, "logps/chosen": -0.2458532601594925, "logps/rejected": -4.5382609367370605, "loss": 0.1886, "odds_ratio_loss": 0.05085287243127823, "rewards/accuracies": 1.0, "rewards/chosen": -0.02458532713353634, "rewards/margins": 0.42924079298973083, "rewards/rejected": -0.4538261294364929, "sft_loss": 0.2458532601594925, "step": 2686 }, { "epoch": 3.885755603759942, "grad_norm": 2.2236099260861284, "learning_rate": 2.3037429689811535e-06, "logits/chosen": -0.49882829189300537, "logits/rejected": -0.46719181537628174, "logps/chosen": -0.30639082193374634, "logps/rejected": -3.102865219116211, "loss": 0.2481, "odds_ratio_loss": 0.07744499295949936, "rewards/accuracies": 1.0, "rewards/chosen": -0.030639082193374634, "rewards/margins": 0.2796474099159241, "rewards/rejected": -0.3102865219116211, "sft_loss": 0.30639082193374634, "step": 2687 }, { "epoch": 3.887201735357918, "grad_norm": 1.9450700368153286, "learning_rate": 2.3009307004153535e-06, "logits/chosen": -0.6472720503807068, "logits/rejected": -0.49304014444351196, "logps/chosen": -0.2393447756767273, "logps/rejected": -4.900020599365234, "loss": 0.2072, "odds_ratio_loss": 0.02204173430800438, "rewards/accuracies": 1.0, "rewards/chosen": -0.02393447607755661, "rewards/margins": 0.4660675823688507, "rewards/rejected": -0.4900020360946655, "sft_loss": 0.2393447756767273, "step": 2688 }, { "epoch": 3.888647866955893, "grad_norm": 2.2910900345787035, "learning_rate": 2.298119456224764e-06, "logits/chosen": -0.6662863492965698, "logits/rejected": -0.6600769758224487, "logps/chosen": -0.13599048554897308, "logps/rejected": -2.522681713104248, "loss": 0.2028, "odds_ratio_loss": 0.07435938715934753, "rewards/accuracies": 0.9375, "rewards/chosen": -0.013599049299955368, "rewards/margins": 0.23866915702819824, "rewards/rejected": -0.2522681951522827, "sft_loss": 0.13599048554897308, "step": 2689 }, { "epoch": 3.8900939985538683, "grad_norm": 3.7471824607256785, "learning_rate": 2.295309238104291e-06, "logits/chosen": -0.7487191557884216, "logits/rejected": -0.44234901666641235, "logps/chosen": -0.11293800920248032, "logps/rejected": -5.536605358123779, "loss": 0.1694, "odds_ratio_loss": 0.017550412565469742, "rewards/accuracies": 1.0, "rewards/chosen": -0.011293800547719002, "rewards/margins": 0.5423667430877686, "rewards/rejected": -0.55366051197052, "sft_loss": 0.11293800920248032, "step": 2690 }, { "epoch": 3.891540130151844, "grad_norm": 1.9729125724864918, "learning_rate": 2.2925000477482286e-06, "logits/chosen": -0.5971082448959351, "logits/rejected": -0.6901511549949646, "logps/chosen": -0.1475466787815094, "logps/rejected": -4.186668395996094, "loss": 0.1915, "odds_ratio_loss": 0.03244779258966446, "rewards/accuracies": 1.0, "rewards/chosen": -0.014754666946828365, "rewards/margins": 0.40391218662261963, "rewards/rejected": -0.4186668395996094, "sft_loss": 0.1475466787815094, "step": 2691 }, { "epoch": 3.892986261749819, "grad_norm": 2.286839337755713, "learning_rate": 2.289691886850246e-06, "logits/chosen": -0.7391765117645264, "logits/rejected": -0.47775739431381226, "logps/chosen": -0.23687493801116943, "logps/rejected": -3.8079967498779297, "loss": 0.1872, "odds_ratio_loss": 0.0462396964430809, "rewards/accuracies": 1.0, "rewards/chosen": -0.023687491193413734, "rewards/margins": 0.35711222887039185, "rewards/rejected": -0.38079971075057983, "sft_loss": 0.23687493801116943, "step": 2692 }, { "epoch": 3.8944323933477945, "grad_norm": 2.16708601581374, "learning_rate": 2.2868847571033958e-06, "logits/chosen": -0.65777987241745, "logits/rejected": -0.4829615652561188, "logps/chosen": -0.11277547478675842, "logps/rejected": -4.166236400604248, "loss": 0.1679, "odds_ratio_loss": 0.024801742285490036, "rewards/accuracies": 1.0, "rewards/chosen": -0.011277547106146812, "rewards/margins": 0.4053461253643036, "rewards/rejected": -0.41662368178367615, "sft_loss": 0.11277547478675842, "step": 2693 }, { "epoch": 3.89587852494577, "grad_norm": 1.9916804771142396, "learning_rate": 2.284078660200105e-06, "logits/chosen": -0.5471053123474121, "logits/rejected": -0.4118785262107849, "logps/chosen": -0.1762133240699768, "logps/rejected": -5.470975875854492, "loss": 0.1503, "odds_ratio_loss": 0.019666478037834167, "rewards/accuracies": 1.0, "rewards/chosen": -0.01762133277952671, "rewards/margins": 0.5294762849807739, "rewards/rejected": -0.5470975637435913, "sft_loss": 0.1762133240699768, "step": 2694 }, { "epoch": 3.8973246565437454, "grad_norm": 2.0384475244157105, "learning_rate": 2.2812735978321823e-06, "logits/chosen": -0.5914080142974854, "logits/rejected": -0.5802508592605591, "logps/chosen": -0.12550309300422668, "logps/rejected": -3.805398941040039, "loss": 0.1454, "odds_ratio_loss": 0.025427283719182014, "rewards/accuracies": 1.0, "rewards/chosen": -0.012550310231745243, "rewards/margins": 0.36798956990242004, "rewards/rejected": -0.3805398941040039, "sft_loss": 0.12550309300422668, "step": 2695 }, { "epoch": 3.8987707881417206, "grad_norm": 2.0039485066790914, "learning_rate": 2.278469571690806e-06, "logits/chosen": -0.6192625761032104, "logits/rejected": -0.5282604694366455, "logps/chosen": -0.15284359455108643, "logps/rejected": -4.06088924407959, "loss": 0.1777, "odds_ratio_loss": 0.018614958971738815, "rewards/accuracies": 1.0, "rewards/chosen": -0.015284359455108643, "rewards/margins": 0.39080455899238586, "rewards/rejected": -0.4060888886451721, "sft_loss": 0.15284359455108643, "step": 2696 }, { "epoch": 3.9002169197396963, "grad_norm": 1.9348476449421625, "learning_rate": 2.2756665834665386e-06, "logits/chosen": -0.5989269018173218, "logits/rejected": -0.46438270807266235, "logps/chosen": -0.204659104347229, "logps/rejected": -5.059053897857666, "loss": 0.2055, "odds_ratio_loss": 0.015010501258075237, "rewards/accuracies": 1.0, "rewards/chosen": -0.0204659104347229, "rewards/margins": 0.4854394793510437, "rewards/rejected": -0.5059053897857666, "sft_loss": 0.204659104347229, "step": 2697 }, { "epoch": 3.901663051337672, "grad_norm": 2.091035380201727, "learning_rate": 2.272864634849308e-06, "logits/chosen": -0.47287124395370483, "logits/rejected": -0.3687141537666321, "logps/chosen": -0.23247979581356049, "logps/rejected": -4.085878849029541, "loss": 0.2085, "odds_ratio_loss": 0.05591895431280136, "rewards/accuracies": 1.0, "rewards/chosen": -0.023247981444001198, "rewards/margins": 0.3853399157524109, "rewards/rejected": -0.40858790278434753, "sft_loss": 0.23247979581356049, "step": 2698 }, { "epoch": 3.9031091829356472, "grad_norm": 1.9688702293823612, "learning_rate": 2.2700637275284244e-06, "logits/chosen": -0.4580070972442627, "logits/rejected": -0.39984434843063354, "logps/chosen": -0.2849455773830414, "logps/rejected": -2.5171000957489014, "loss": 0.1823, "odds_ratio_loss": 0.06607170403003693, "rewards/accuracies": 1.0, "rewards/chosen": -0.02849455736577511, "rewards/margins": 0.22321546077728271, "rewards/rejected": -0.2517099976539612, "sft_loss": 0.2849455773830414, "step": 2699 }, { "epoch": 3.9045553145336225, "grad_norm": 2.3070770756670913, "learning_rate": 2.26726386319256e-06, "logits/chosen": -0.5406573414802551, "logits/rejected": -0.26209941506385803, "logps/chosen": -0.1298496127128601, "logps/rejected": -3.572732925415039, "loss": 0.1791, "odds_ratio_loss": 0.017180941998958588, "rewards/accuracies": 1.0, "rewards/chosen": -0.01298496127128601, "rewards/margins": 0.34428831934928894, "rewards/rejected": -0.35727331042289734, "sft_loss": 0.1298496127128601, "step": 2700 }, { "epoch": 3.906001446131598, "grad_norm": 1.915036217109782, "learning_rate": 2.2644650435297675e-06, "logits/chosen": -0.5574444532394409, "logits/rejected": -0.5001909136772156, "logps/chosen": -0.28338301181793213, "logps/rejected": -3.268771171569824, "loss": 0.2028, "odds_ratio_loss": 0.07406371831893921, "rewards/accuracies": 1.0, "rewards/chosen": -0.028338300064206123, "rewards/margins": 0.29853883385658264, "rewards/rejected": -0.3268771469593048, "sft_loss": 0.28338301181793213, "step": 2701 }, { "epoch": 3.9074475777295734, "grad_norm": 3.4689329886595868, "learning_rate": 2.2616672702274643e-06, "logits/chosen": -0.562351405620575, "logits/rejected": -0.4643508791923523, "logps/chosen": -0.1464012861251831, "logps/rejected": -3.3796513080596924, "loss": 0.1936, "odds_ratio_loss": 0.03205585852265358, "rewards/accuracies": 1.0, "rewards/chosen": -0.01464013010263443, "rewards/margins": 0.3233250081539154, "rewards/rejected": -0.33796513080596924, "sft_loss": 0.1464012861251831, "step": 2702 }, { "epoch": 3.9088937093275486, "grad_norm": 2.3207929462257146, "learning_rate": 2.258870544972437e-06, "logits/chosen": -0.5727015733718872, "logits/rejected": -0.4699394106864929, "logps/chosen": -0.17665952444076538, "logps/rejected": -3.051046848297119, "loss": 0.2095, "odds_ratio_loss": 0.060139112174510956, "rewards/accuracies": 1.0, "rewards/chosen": -0.017665952444076538, "rewards/margins": 0.2874387204647064, "rewards/rejected": -0.30510464310646057, "sft_loss": 0.17665952444076538, "step": 2703 }, { "epoch": 3.9103398409255243, "grad_norm": 2.4119382863806433, "learning_rate": 2.2560748694508435e-06, "logits/chosen": -0.5257418155670166, "logits/rejected": -0.5052908658981323, "logps/chosen": -0.1041015014052391, "logps/rejected": -2.8004159927368164, "loss": 0.2212, "odds_ratio_loss": 0.02311025746166706, "rewards/accuracies": 1.0, "rewards/chosen": -0.01041015051305294, "rewards/margins": 0.2696314752101898, "rewards/rejected": -0.2800416052341461, "sft_loss": 0.1041015014052391, "step": 2704 }, { "epoch": 3.9117859725234996, "grad_norm": 2.2430778101447055, "learning_rate": 2.253280245348205e-06, "logits/chosen": -0.5410814881324768, "logits/rejected": -0.4534750282764435, "logps/chosen": -0.2101866453886032, "logps/rejected": -3.4188551902770996, "loss": 0.2123, "odds_ratio_loss": 0.05106037110090256, "rewards/accuracies": 1.0, "rewards/chosen": -0.02101866528391838, "rewards/margins": 0.3208668529987335, "rewards/rejected": -0.341885507106781, "sft_loss": 0.2101866453886032, "step": 2705 }, { "epoch": 3.913232104121475, "grad_norm": 2.414189461889776, "learning_rate": 2.2504866743494134e-06, "logits/chosen": -0.6277978420257568, "logits/rejected": -0.39811521768569946, "logps/chosen": -0.15590061247348785, "logps/rejected": -5.489923477172852, "loss": 0.2404, "odds_ratio_loss": 0.012680593878030777, "rewards/accuracies": 1.0, "rewards/chosen": -0.01559006329625845, "rewards/margins": 0.5334023237228394, "rewards/rejected": -0.5489923357963562, "sft_loss": 0.15590061247348785, "step": 2706 }, { "epoch": 3.9146782357194505, "grad_norm": 2.1925742092247456, "learning_rate": 2.24769415813872e-06, "logits/chosen": -0.6148355007171631, "logits/rejected": -0.5587329864501953, "logps/chosen": -0.18322691321372986, "logps/rejected": -3.181863784790039, "loss": 0.1937, "odds_ratio_loss": 0.049598000943660736, "rewards/accuracies": 1.0, "rewards/chosen": -0.018322691321372986, "rewards/margins": 0.29986369609832764, "rewards/rejected": -0.3181864023208618, "sft_loss": 0.18322691321372986, "step": 2707 }, { "epoch": 3.9161243673174257, "grad_norm": 3.1679965919517916, "learning_rate": 2.2449026983997476e-06, "logits/chosen": -0.7403485774993896, "logits/rejected": -0.6171454191207886, "logps/chosen": -0.2301938235759735, "logps/rejected": -3.761472225189209, "loss": 0.1659, "odds_ratio_loss": 0.04906386882066727, "rewards/accuracies": 1.0, "rewards/chosen": -0.02301938459277153, "rewards/margins": 0.3531278371810913, "rewards/rejected": -0.37614724040031433, "sft_loss": 0.2301938235759735, "step": 2708 }, { "epoch": 3.9175704989154014, "grad_norm": 2.1201844249727153, "learning_rate": 2.242112296815474e-06, "logits/chosen": -0.6007668972015381, "logits/rejected": -0.4732978045940399, "logps/chosen": -0.2650083899497986, "logps/rejected": -4.493795394897461, "loss": 0.2163, "odds_ratio_loss": 0.03872312977910042, "rewards/accuracies": 1.0, "rewards/chosen": -0.02650083787739277, "rewards/margins": 0.4228787422180176, "rewards/rejected": -0.449379563331604, "sft_loss": 0.2650083899497986, "step": 2709 }, { "epoch": 3.9190166305133767, "grad_norm": 1.8862309748937722, "learning_rate": 2.239322955068244e-06, "logits/chosen": -0.4192129075527191, "logits/rejected": -0.39664286375045776, "logps/chosen": -0.25290870666503906, "logps/rejected": -5.286991119384766, "loss": 0.2261, "odds_ratio_loss": 0.05502773076295853, "rewards/accuracies": 1.0, "rewards/chosen": -0.025290867313742638, "rewards/margins": 0.5034083127975464, "rewards/rejected": -0.5286991596221924, "sft_loss": 0.25290870666503906, "step": 2710 }, { "epoch": 3.9204627621113524, "grad_norm": 1.824106972408141, "learning_rate": 2.2365346748397606e-06, "logits/chosen": -0.4954592287540436, "logits/rejected": -0.42046910524368286, "logps/chosen": -0.11221830546855927, "logps/rejected": -6.584062576293945, "loss": 0.1433, "odds_ratio_loss": 0.022290529683232307, "rewards/accuracies": 1.0, "rewards/chosen": -0.011221831664443016, "rewards/margins": 0.6471844911575317, "rewards/rejected": -0.6584063172340393, "sft_loss": 0.11221830546855927, "step": 2711 }, { "epoch": 3.9219088937093276, "grad_norm": 2.073955700687443, "learning_rate": 2.2337474578110904e-06, "logits/chosen": -0.4804004430770874, "logits/rejected": -0.32498830556869507, "logps/chosen": -0.10834227502346039, "logps/rejected": -4.200552940368652, "loss": 0.1867, "odds_ratio_loss": 0.015155954286456108, "rewards/accuracies": 1.0, "rewards/chosen": -0.01083422638475895, "rewards/margins": 0.4092210531234741, "rewards/rejected": -0.4200552701950073, "sft_loss": 0.10834227502346039, "step": 2712 }, { "epoch": 3.923355025307303, "grad_norm": 3.7115801213716497, "learning_rate": 2.230961305662655e-06, "logits/chosen": -0.49971461296081543, "logits/rejected": -0.3881816565990448, "logps/chosen": -0.27020180225372314, "logps/rejected": -4.6016716957092285, "loss": 0.2186, "odds_ratio_loss": 0.05096177011728287, "rewards/accuracies": 1.0, "rewards/chosen": -0.027020180597901344, "rewards/margins": 0.43314701318740845, "rewards/rejected": -0.46016716957092285, "sft_loss": 0.27020180225372314, "step": 2713 }, { "epoch": 3.9248011569052785, "grad_norm": 2.308041112836885, "learning_rate": 2.228176220074237e-06, "logits/chosen": -0.8088836669921875, "logits/rejected": -0.6313307285308838, "logps/chosen": -0.21790727972984314, "logps/rejected": -3.6561279296875, "loss": 0.1934, "odds_ratio_loss": 0.04986013099551201, "rewards/accuracies": 1.0, "rewards/chosen": -0.021790727972984314, "rewards/margins": 0.34382206201553345, "rewards/rejected": -0.36561277508735657, "sft_loss": 0.21790727972984314, "step": 2714 }, { "epoch": 3.9262472885032538, "grad_norm": 1.9474719292583453, "learning_rate": 2.2253922027249765e-06, "logits/chosen": -0.4550485610961914, "logits/rejected": -0.35971906781196594, "logps/chosen": -0.22380274534225464, "logps/rejected": -3.577040672302246, "loss": 0.229, "odds_ratio_loss": 0.03350451588630676, "rewards/accuracies": 1.0, "rewards/chosen": -0.022380275651812553, "rewards/margins": 0.3353238105773926, "rewards/rejected": -0.3577041029930115, "sft_loss": 0.22380274534225464, "step": 2715 }, { "epoch": 3.927693420101229, "grad_norm": 1.9833196622850737, "learning_rate": 2.222609255293367e-06, "logits/chosen": -0.5128300189971924, "logits/rejected": -0.3967776894569397, "logps/chosen": -0.12300994247198105, "logps/rejected": -3.8472933769226074, "loss": 0.1866, "odds_ratio_loss": 0.033817462623119354, "rewards/accuracies": 1.0, "rewards/chosen": -0.01230099517852068, "rewards/margins": 0.37242835760116577, "rewards/rejected": -0.3847293257713318, "sft_loss": 0.12300994247198105, "step": 2716 }, { "epoch": 3.9291395516992047, "grad_norm": 1.8220600153997757, "learning_rate": 2.219827379457256e-06, "logits/chosen": -0.47464361786842346, "logits/rejected": -0.4734431505203247, "logps/chosen": -0.1375093311071396, "logps/rejected": -4.749077796936035, "loss": 0.1515, "odds_ratio_loss": 0.03777370974421501, "rewards/accuracies": 1.0, "rewards/chosen": -0.013750933110713959, "rewards/margins": 0.46115684509277344, "rewards/rejected": -0.4749077558517456, "sft_loss": 0.1375093311071396, "step": 2717 }, { "epoch": 3.93058568329718, "grad_norm": 2.051028908996472, "learning_rate": 2.2170465768938473e-06, "logits/chosen": -0.6458155512809753, "logits/rejected": -0.5764715671539307, "logps/chosen": -0.08788472414016724, "logps/rejected": -3.526029109954834, "loss": 0.2008, "odds_ratio_loss": 0.020078768953680992, "rewards/accuracies": 1.0, "rewards/chosen": -0.008788472041487694, "rewards/margins": 0.3438144326210022, "rewards/rejected": -0.35260289907455444, "sft_loss": 0.08788472414016724, "step": 2718 }, { "epoch": 3.932031814895155, "grad_norm": 1.7076574111730232, "learning_rate": 2.214266849279699e-06, "logits/chosen": -0.4735933542251587, "logits/rejected": -0.44786524772644043, "logps/chosen": -0.10404633730649948, "logps/rejected": -4.46885347366333, "loss": 0.1045, "odds_ratio_loss": 0.02645310014486313, "rewards/accuracies": 1.0, "rewards/chosen": -0.010404633358120918, "rewards/margins": 0.43648070096969604, "rewards/rejected": -0.446885347366333, "sft_loss": 0.10404633730649948, "step": 2719 }, { "epoch": 3.933477946493131, "grad_norm": 2.0280602741537046, "learning_rate": 2.211488198290716e-06, "logits/chosen": -0.6490156054496765, "logits/rejected": -0.5675171613693237, "logps/chosen": -0.23582687973976135, "logps/rejected": -2.961477756500244, "loss": 0.2049, "odds_ratio_loss": 0.06842078268527985, "rewards/accuracies": 1.0, "rewards/chosen": -0.023582689464092255, "rewards/margins": 0.2725651264190674, "rewards/rejected": -0.29614779353141785, "sft_loss": 0.23582687973976135, "step": 2720 }, { "epoch": 3.9349240780911066, "grad_norm": 2.0813505754869426, "learning_rate": 2.208710625602162e-06, "logits/chosen": -0.5769675970077515, "logits/rejected": -0.48996660113334656, "logps/chosen": -0.20689250528812408, "logps/rejected": -4.864871025085449, "loss": 0.1726, "odds_ratio_loss": 0.029630932956933975, "rewards/accuracies": 1.0, "rewards/chosen": -0.02068925090134144, "rewards/margins": 0.4657978415489197, "rewards/rejected": -0.48648712038993835, "sft_loss": 0.20689250528812408, "step": 2721 }, { "epoch": 3.936370209689082, "grad_norm": 2.1687754965363952, "learning_rate": 2.205934132888641e-06, "logits/chosen": -0.46100273728370667, "logits/rejected": -0.36753734946250916, "logps/chosen": -0.11003377288579941, "logps/rejected": -4.685311794281006, "loss": 0.1729, "odds_ratio_loss": 0.017081379890441895, "rewards/accuracies": 1.0, "rewards/chosen": -0.011003376916050911, "rewards/margins": 0.4575278162956238, "rewards/rejected": -0.46853119134902954, "sft_loss": 0.11003377288579941, "step": 2722 }, { "epoch": 3.937816341287057, "grad_norm": 2.0296629482018655, "learning_rate": 2.2031587218241148e-06, "logits/chosen": -0.4646747410297394, "logits/rejected": -0.39276355504989624, "logps/chosen": -0.10434699803590775, "logps/rejected": -4.183708190917969, "loss": 0.175, "odds_ratio_loss": 0.02051355130970478, "rewards/accuracies": 1.0, "rewards/chosen": -0.01043469924479723, "rewards/margins": 0.4079361855983734, "rewards/rejected": -0.41837090253829956, "sft_loss": 0.10434699803590775, "step": 2723 }, { "epoch": 3.9392624728850327, "grad_norm": 2.049891085377211, "learning_rate": 2.2003843940818874e-06, "logits/chosen": -0.6782933473587036, "logits/rejected": -0.5610325932502747, "logps/chosen": -0.20635099709033966, "logps/rejected": -4.527532577514648, "loss": 0.2131, "odds_ratio_loss": 0.03238668665289879, "rewards/accuracies": 1.0, "rewards/chosen": -0.020635100081562996, "rewards/margins": 0.4321182072162628, "rewards/rejected": -0.4527532458305359, "sft_loss": 0.20635099709033966, "step": 2724 }, { "epoch": 3.940708604483008, "grad_norm": 1.86509035176643, "learning_rate": 2.1976111513346113e-06, "logits/chosen": -0.6634677648544312, "logits/rejected": -0.5007971525192261, "logps/chosen": -0.09252119809389114, "logps/rejected": -4.644352436065674, "loss": 0.1433, "odds_ratio_loss": 0.020731642842292786, "rewards/accuracies": 1.0, "rewards/chosen": -0.00925211887806654, "rewards/margins": 0.45518314838409424, "rewards/rejected": -0.46443524956703186, "sft_loss": 0.09252119809389114, "step": 2725 }, { "epoch": 3.942154736080983, "grad_norm": 2.0492445038755394, "learning_rate": 2.1948389952542834e-06, "logits/chosen": -0.5211946964263916, "logits/rejected": -0.4892616271972656, "logps/chosen": -0.10040014237165451, "logps/rejected": -2.7622721195220947, "loss": 0.2018, "odds_ratio_loss": 0.021230706945061684, "rewards/accuracies": 1.0, "rewards/chosen": -0.010040014050900936, "rewards/margins": 0.2661871910095215, "rewards/rejected": -0.2762272357940674, "sft_loss": 0.10040014237165451, "step": 2726 }, { "epoch": 3.943600867678959, "grad_norm": 1.9556796476216858, "learning_rate": 2.1920679275122482e-06, "logits/chosen": -0.7522842884063721, "logits/rejected": -0.5683616995811462, "logps/chosen": -0.12370572984218597, "logps/rejected": -2.923466920852661, "loss": 0.1666, "odds_ratio_loss": 0.01687091588973999, "rewards/accuracies": 1.0, "rewards/chosen": -0.012370571494102478, "rewards/margins": 0.27997615933418274, "rewards/rejected": -0.292346715927124, "sft_loss": 0.12370572984218597, "step": 2727 }, { "epoch": 3.945046999276934, "grad_norm": 2.555360496366615, "learning_rate": 2.1892979497791945e-06, "logits/chosen": -0.7111763954162598, "logits/rejected": -0.534297525882721, "logps/chosen": -0.14246775209903717, "logps/rejected": -3.895209550857544, "loss": 0.1889, "odds_ratio_loss": 0.01873032934963703, "rewards/accuracies": 1.0, "rewards/chosen": -0.014246775768697262, "rewards/margins": 0.3752741813659668, "rewards/rejected": -0.38952094316482544, "sft_loss": 0.14246775209903717, "step": 2728 }, { "epoch": 3.9464931308749094, "grad_norm": 2.69918533275395, "learning_rate": 2.1865290637251494e-06, "logits/chosen": -0.6954668760299683, "logits/rejected": -0.4582352638244629, "logps/chosen": -0.16472528874874115, "logps/rejected": -2.790771245956421, "loss": 0.189, "odds_ratio_loss": 0.0377388596534729, "rewards/accuracies": 1.0, "rewards/chosen": -0.016472529619932175, "rewards/margins": 0.26260459423065186, "rewards/rejected": -0.2790771424770355, "sft_loss": 0.16472528874874115, "step": 2729 }, { "epoch": 3.947939262472885, "grad_norm": 2.1214673089172487, "learning_rate": 2.1837612710194872e-06, "logits/chosen": -0.5668919086456299, "logits/rejected": -0.4885096549987793, "logps/chosen": -0.3104791045188904, "logps/rejected": -4.2551960945129395, "loss": 0.1956, "odds_ratio_loss": 0.07189791649580002, "rewards/accuracies": 1.0, "rewards/chosen": -0.031047910451889038, "rewards/margins": 0.3944717049598694, "rewards/rejected": -0.4255196154117584, "sft_loss": 0.3104791045188904, "step": 2730 }, { "epoch": 3.9493853940708603, "grad_norm": 2.1663124391142325, "learning_rate": 2.1809945733309193e-06, "logits/chosen": -0.6335052847862244, "logits/rejected": -0.5434231162071228, "logps/chosen": -0.13023731112480164, "logps/rejected": -3.2621257305145264, "loss": 0.1589, "odds_ratio_loss": 0.0158349871635437, "rewards/accuracies": 1.0, "rewards/chosen": -0.01302372943609953, "rewards/margins": 0.3131888508796692, "rewards/rejected": -0.3262125551700592, "sft_loss": 0.13023731112480164, "step": 2731 }, { "epoch": 3.950831525668836, "grad_norm": 2.2296369918556573, "learning_rate": 2.1782289723274975e-06, "logits/chosen": -0.5075691342353821, "logits/rejected": -0.40280091762542725, "logps/chosen": -0.21550577878952026, "logps/rejected": -2.2407689094543457, "loss": 0.1781, "odds_ratio_loss": 0.041584450751543045, "rewards/accuracies": 1.0, "rewards/chosen": -0.021550578996539116, "rewards/margins": 0.2025263011455536, "rewards/rejected": -0.22407689690589905, "sft_loss": 0.21550577878952026, "step": 2732 }, { "epoch": 3.9522776572668112, "grad_norm": 2.1585834508959945, "learning_rate": 2.175464469676612e-06, "logits/chosen": -0.6296189427375793, "logits/rejected": -0.6328527331352234, "logps/chosen": -0.20828096568584442, "logps/rejected": -5.044098377227783, "loss": 0.163, "odds_ratio_loss": 0.06003636494278908, "rewards/accuracies": 1.0, "rewards/chosen": -0.02082809805870056, "rewards/margins": 0.4835817217826843, "rewards/rejected": -0.5044097900390625, "sft_loss": 0.20828096568584442, "step": 2733 }, { "epoch": 3.953723788864787, "grad_norm": 1.8902474968189271, "learning_rate": 2.1727010670449945e-06, "logits/chosen": -0.5789146423339844, "logits/rejected": -0.4928153157234192, "logps/chosen": -0.10002440959215164, "logps/rejected": -4.361918926239014, "loss": 0.1677, "odds_ratio_loss": 0.03862610086798668, "rewards/accuracies": 1.0, "rewards/chosen": -0.01000244077295065, "rewards/margins": 0.42618948221206665, "rewards/rejected": -0.4361919164657593, "sft_loss": 0.10002440959215164, "step": 2734 }, { "epoch": 3.955169920462762, "grad_norm": 3.1751381210221767, "learning_rate": 2.1699387660987077e-06, "logits/chosen": -0.5375577211380005, "logits/rejected": -0.5367023944854736, "logps/chosen": -0.24724946916103363, "logps/rejected": -3.8961119651794434, "loss": 0.2169, "odds_ratio_loss": 0.03406355902552605, "rewards/accuracies": 1.0, "rewards/chosen": -0.024724945425987244, "rewards/margins": 0.36488622426986694, "rewards/rejected": -0.3896111845970154, "sft_loss": 0.24724946916103363, "step": 2735 }, { "epoch": 3.9566160520607374, "grad_norm": 1.9066520362152397, "learning_rate": 2.1671775685031563e-06, "logits/chosen": -0.6246693134307861, "logits/rejected": -0.5068244934082031, "logps/chosen": -0.17706000804901123, "logps/rejected": -3.941403865814209, "loss": 0.1729, "odds_ratio_loss": 0.031446393579244614, "rewards/accuracies": 1.0, "rewards/chosen": -0.017706003040075302, "rewards/margins": 0.37643444538116455, "rewards/rejected": -0.39414042234420776, "sft_loss": 0.17706000804901123, "step": 2736 }, { "epoch": 3.958062183658713, "grad_norm": 1.9930585146889106, "learning_rate": 2.1644174759230736e-06, "logits/chosen": -0.5262311100959778, "logits/rejected": -0.4606435298919678, "logps/chosen": -0.21938474476337433, "logps/rejected": -4.327494144439697, "loss": 0.1949, "odds_ratio_loss": 0.045247986912727356, "rewards/accuracies": 1.0, "rewards/chosen": -0.021938476711511612, "rewards/margins": 0.4108109772205353, "rewards/rejected": -0.4327494502067566, "sft_loss": 0.21938474476337433, "step": 2737 }, { "epoch": 3.9595083152566883, "grad_norm": 1.8421809853500997, "learning_rate": 2.161658490022532e-06, "logits/chosen": -0.6197947859764099, "logits/rejected": -0.4551239311695099, "logps/chosen": -0.22861793637275696, "logps/rejected": -3.364131450653076, "loss": 0.1774, "odds_ratio_loss": 0.047167375683784485, "rewards/accuracies": 1.0, "rewards/chosen": -0.022861795499920845, "rewards/margins": 0.3135513663291931, "rewards/rejected": -0.33641317486763, "sft_loss": 0.22861793637275696, "step": 2738 }, { "epoch": 3.9609544468546636, "grad_norm": 2.0397606314124266, "learning_rate": 2.1589006124649325e-06, "logits/chosen": -0.436599999666214, "logits/rejected": -0.3888368010520935, "logps/chosen": -0.12135307490825653, "logps/rejected": -4.132548809051514, "loss": 0.1574, "odds_ratio_loss": 0.016933148726820946, "rewards/accuracies": 1.0, "rewards/chosen": -0.012135308235883713, "rewards/margins": 0.4011196196079254, "rewards/rejected": -0.41325491666793823, "sft_loss": 0.12135307490825653, "step": 2739 }, { "epoch": 3.9624005784526393, "grad_norm": 1.75327087377883, "learning_rate": 2.1561438449130124e-06, "logits/chosen": -0.5723130106925964, "logits/rejected": -0.5248115062713623, "logps/chosen": -0.14575621485710144, "logps/rejected": -5.364757537841797, "loss": 0.1623, "odds_ratio_loss": 0.03158080577850342, "rewards/accuracies": 1.0, "rewards/chosen": -0.014575622044503689, "rewards/margins": 0.5219001770019531, "rewards/rejected": -0.5364757776260376, "sft_loss": 0.14575621485710144, "step": 2740 }, { "epoch": 3.9638467100506145, "grad_norm": 2.0814838546242704, "learning_rate": 2.153388189028835e-06, "logits/chosen": -0.5976966619491577, "logits/rejected": -0.47446054220199585, "logps/chosen": -0.15319940447807312, "logps/rejected": -4.2068257331848145, "loss": 0.1719, "odds_ratio_loss": 0.02311069332063198, "rewards/accuracies": 1.0, "rewards/chosen": -0.015319941565394402, "rewards/margins": 0.40536269545555115, "rewards/rejected": -0.4206825792789459, "sft_loss": 0.15319940447807312, "step": 2741 }, { "epoch": 3.9652928416485898, "grad_norm": 5.440242071490554, "learning_rate": 2.1506336464737943e-06, "logits/chosen": -0.5652493834495544, "logits/rejected": -0.49528682231903076, "logps/chosen": -0.2625923752784729, "logps/rejected": -4.457971572875977, "loss": 0.2107, "odds_ratio_loss": 0.0539512000977993, "rewards/accuracies": 1.0, "rewards/chosen": -0.02625923603773117, "rewards/margins": 0.4195379316806793, "rewards/rejected": -0.4457972049713135, "sft_loss": 0.2625923752784729, "step": 2742 }, { "epoch": 3.9667389732465654, "grad_norm": 3.215284049864351, "learning_rate": 2.147880218908618e-06, "logits/chosen": -0.7117863893508911, "logits/rejected": -0.4697888195514679, "logps/chosen": -0.0863754153251648, "logps/rejected": -4.418948173522949, "loss": 0.1422, "odds_ratio_loss": 0.011987053789198399, "rewards/accuracies": 1.0, "rewards/chosen": -0.00863754190504551, "rewards/margins": 0.4332572817802429, "rewards/rejected": -0.4418948292732239, "sft_loss": 0.0863754153251648, "step": 2743 }, { "epoch": 3.968185104844541, "grad_norm": 2.0169688500160268, "learning_rate": 2.145127907993354e-06, "logits/chosen": -0.6409890651702881, "logits/rejected": -0.6879695057868958, "logps/chosen": -0.153423473238945, "logps/rejected": -4.237060546875, "loss": 0.2012, "odds_ratio_loss": 0.030342232435941696, "rewards/accuracies": 1.0, "rewards/chosen": -0.0153423473238945, "rewards/margins": 0.4083637297153473, "rewards/rejected": -0.4237060844898224, "sft_loss": 0.153423473238945, "step": 2744 }, { "epoch": 3.9696312364425164, "grad_norm": 2.266381185127771, "learning_rate": 2.1423767153873845e-06, "logits/chosen": -0.5684958100318909, "logits/rejected": -0.49004459381103516, "logps/chosen": -0.2129286527633667, "logps/rejected": -3.743337869644165, "loss": 0.207, "odds_ratio_loss": 0.025867803022265434, "rewards/accuracies": 1.0, "rewards/chosen": -0.02129286341369152, "rewards/margins": 0.3530409336090088, "rewards/rejected": -0.37433379888534546, "sft_loss": 0.2129286527633667, "step": 2745 }, { "epoch": 3.9710773680404916, "grad_norm": 1.92859371400278, "learning_rate": 2.13962664274941e-06, "logits/chosen": -0.5195826292037964, "logits/rejected": -0.5544252395629883, "logps/chosen": -0.23682743310928345, "logps/rejected": -3.3036885261535645, "loss": 0.1898, "odds_ratio_loss": 0.08838633447885513, "rewards/accuracies": 1.0, "rewards/chosen": -0.023682741448283195, "rewards/margins": 0.3066861033439636, "rewards/rejected": -0.3303688168525696, "sft_loss": 0.23682743310928345, "step": 2746 }, { "epoch": 3.9725234996384673, "grad_norm": 1.8564797001366269, "learning_rate": 2.1368776917374623e-06, "logits/chosen": -0.5486931204795837, "logits/rejected": -0.48903441429138184, "logps/chosen": -0.19572268426418304, "logps/rejected": -5.466530799865723, "loss": 0.1608, "odds_ratio_loss": 0.022454869002103806, "rewards/accuracies": 1.0, "rewards/chosen": -0.019572269171476364, "rewards/margins": 0.527080774307251, "rewards/rejected": -0.5466530919075012, "sft_loss": 0.19572268426418304, "step": 2747 }, { "epoch": 3.9739696312364425, "grad_norm": 2.2883512574071077, "learning_rate": 2.134129864008894e-06, "logits/chosen": -0.6515862345695496, "logits/rejected": -0.4136117398738861, "logps/chosen": -0.2721066474914551, "logps/rejected": -4.8516316413879395, "loss": 0.2038, "odds_ratio_loss": 0.0357467383146286, "rewards/accuracies": 1.0, "rewards/chosen": -0.027210667729377747, "rewards/margins": 0.4579525291919708, "rewards/rejected": -0.48516321182250977, "sft_loss": 0.2721066474914551, "step": 2748 }, { "epoch": 3.9754157628344178, "grad_norm": 2.3359102349897753, "learning_rate": 2.1313831612203796e-06, "logits/chosen": -0.7950201034545898, "logits/rejected": -0.706078290939331, "logps/chosen": -0.1814843863248825, "logps/rejected": -3.375770092010498, "loss": 0.2099, "odds_ratio_loss": 0.035325754433870316, "rewards/accuracies": 1.0, "rewards/chosen": -0.01814843900501728, "rewards/margins": 0.31942859292030334, "rewards/rejected": -0.3375770151615143, "sft_loss": 0.1814843863248825, "step": 2749 }, { "epoch": 3.9768618944323935, "grad_norm": 2.019886300027027, "learning_rate": 2.1286375850279154e-06, "logits/chosen": -0.7477739453315735, "logits/rejected": -0.6434758901596069, "logps/chosen": -0.1750190109014511, "logps/rejected": -4.2602996826171875, "loss": 0.1752, "odds_ratio_loss": 0.042220983654260635, "rewards/accuracies": 1.0, "rewards/chosen": -0.01750190183520317, "rewards/margins": 0.40852802991867065, "rewards/rejected": -0.42602992057800293, "sft_loss": 0.1750190109014511, "step": 2750 }, { "epoch": 3.9783080260303687, "grad_norm": 1.6827104281078238, "learning_rate": 2.1258931370868224e-06, "logits/chosen": -0.6645622253417969, "logits/rejected": -0.5144686698913574, "logps/chosen": -0.09467566013336182, "logps/rejected": -4.5071258544921875, "loss": 0.15, "odds_ratio_loss": 0.017522595822811127, "rewards/accuracies": 1.0, "rewards/chosen": -0.009467566385865211, "rewards/margins": 0.44124501943588257, "rewards/rejected": -0.4507125914096832, "sft_loss": 0.09467566013336182, "step": 2751 }, { "epoch": 3.979754157628344, "grad_norm": 2.795885951210386, "learning_rate": 2.1231498190517355e-06, "logits/chosen": -0.43349555134773254, "logits/rejected": -0.4150978624820709, "logps/chosen": -0.1894543319940567, "logps/rejected": -6.349620819091797, "loss": 0.1768, "odds_ratio_loss": 0.03964434191584587, "rewards/accuracies": 1.0, "rewards/chosen": -0.01894543319940567, "rewards/margins": 0.6160166263580322, "rewards/rejected": -0.6349620819091797, "sft_loss": 0.1894543319940567, "step": 2752 }, { "epoch": 3.9812002892263196, "grad_norm": 1.8162552078369851, "learning_rate": 2.1204076325766124e-06, "logits/chosen": -0.5967478156089783, "logits/rejected": -0.4460592269897461, "logps/chosen": -0.17963387072086334, "logps/rejected": -3.825800895690918, "loss": 0.1539, "odds_ratio_loss": 0.030072186142206192, "rewards/accuracies": 1.0, "rewards/chosen": -0.017963387072086334, "rewards/margins": 0.3646166920661926, "rewards/rejected": -0.38258010149002075, "sft_loss": 0.17963387072086334, "step": 2753 }, { "epoch": 3.982646420824295, "grad_norm": 2.151401772662356, "learning_rate": 2.1176665793147296e-06, "logits/chosen": -0.721175491809845, "logits/rejected": -0.4136351943016052, "logps/chosen": -0.27765095233917236, "logps/rejected": -2.8129587173461914, "loss": 0.2211, "odds_ratio_loss": 0.053692497313022614, "rewards/accuracies": 1.0, "rewards/chosen": -0.027765095233917236, "rewards/margins": 0.2535307705402374, "rewards/rejected": -0.28129586577415466, "sft_loss": 0.27765095233917236, "step": 2754 }, { "epoch": 3.9840925524222706, "grad_norm": 1.8567141033496175, "learning_rate": 2.1149266609186767e-06, "logits/chosen": -0.5536510944366455, "logits/rejected": -0.5295378565788269, "logps/chosen": -0.14767736196517944, "logps/rejected": -5.057055473327637, "loss": 0.195, "odds_ratio_loss": 0.041959308087825775, "rewards/accuracies": 1.0, "rewards/chosen": -0.01476773526519537, "rewards/margins": 0.4909377992153168, "rewards/rejected": -0.5057055950164795, "sft_loss": 0.14767736196517944, "step": 2755 }, { "epoch": 3.985538684020246, "grad_norm": 2.0893400371062523, "learning_rate": 2.1121878790403607e-06, "logits/chosen": -0.6446054577827454, "logits/rejected": -0.49732160568237305, "logps/chosen": -0.1490086168050766, "logps/rejected": -4.0585784912109375, "loss": 0.2023, "odds_ratio_loss": 0.03372291848063469, "rewards/accuracies": 1.0, "rewards/chosen": -0.01490086317062378, "rewards/margins": 0.39095696806907654, "rewards/rejected": -0.4058578610420227, "sft_loss": 0.1490086168050766, "step": 2756 }, { "epoch": 3.9869848156182215, "grad_norm": 2.408244901425215, "learning_rate": 2.1094502353310026e-06, "logits/chosen": -0.656318187713623, "logits/rejected": -0.4849543273448944, "logps/chosen": -0.21277806162834167, "logps/rejected": -5.278512477874756, "loss": 0.1842, "odds_ratio_loss": 0.037755995988845825, "rewards/accuracies": 1.0, "rewards/chosen": -0.021277807652950287, "rewards/margins": 0.5065734386444092, "rewards/rejected": -0.5278512239456177, "sft_loss": 0.21277806162834167, "step": 2757 }, { "epoch": 3.9884309472161967, "grad_norm": 2.1234436431541797, "learning_rate": 2.1067137314411394e-06, "logits/chosen": -0.5390723347663879, "logits/rejected": -0.4621487855911255, "logps/chosen": -0.28014615178108215, "logps/rejected": -3.016232490539551, "loss": 0.2125, "odds_ratio_loss": 0.05021138861775398, "rewards/accuracies": 1.0, "rewards/chosen": -0.028014618903398514, "rewards/margins": 0.27360865473747253, "rewards/rejected": -0.30162325501441956, "sft_loss": 0.28014615178108215, "step": 2758 }, { "epoch": 3.989877078814172, "grad_norm": 2.0758920916498522, "learning_rate": 2.103978369020618e-06, "logits/chosen": -0.756662130355835, "logits/rejected": -0.6477712392807007, "logps/chosen": -0.22675055265426636, "logps/rejected": -2.2667582035064697, "loss": 0.1676, "odds_ratio_loss": 0.05150124430656433, "rewards/accuracies": 1.0, "rewards/chosen": -0.022675054147839546, "rewards/margins": 0.20400077104568481, "rewards/rejected": -0.2266758382320404, "sft_loss": 0.22675055265426636, "step": 2759 }, { "epoch": 3.9913232104121477, "grad_norm": 2.966514602001194, "learning_rate": 2.1012441497186006e-06, "logits/chosen": -0.4243288040161133, "logits/rejected": -0.2724151611328125, "logps/chosen": -0.08447768539190292, "logps/rejected": -5.80491828918457, "loss": 0.1971, "odds_ratio_loss": 0.013431225903332233, "rewards/accuracies": 1.0, "rewards/chosen": -0.008447768166661263, "rewards/margins": 0.5720440745353699, "rewards/rejected": -0.580491840839386, "sft_loss": 0.08447768539190292, "step": 2760 }, { "epoch": 3.992769342010123, "grad_norm": 2.0690569247897566, "learning_rate": 2.0985110751835554e-06, "logits/chosen": -0.5802749991416931, "logits/rejected": -0.4594845473766327, "logps/chosen": -0.21454155445098877, "logps/rejected": -3.6675314903259277, "loss": 0.1895, "odds_ratio_loss": 0.054889433085918427, "rewards/accuracies": 1.0, "rewards/chosen": -0.021454155445098877, "rewards/margins": 0.34529903531074524, "rewards/rejected": -0.36675316095352173, "sft_loss": 0.21454155445098877, "step": 2761 }, { "epoch": 3.994215473608098, "grad_norm": 2.130452441813903, "learning_rate": 2.0957791470632668e-06, "logits/chosen": -0.3942503035068512, "logits/rejected": -0.28423938155174255, "logps/chosen": -0.1051185354590416, "logps/rejected": -5.772364616394043, "loss": 0.1506, "odds_ratio_loss": 0.02300701104104519, "rewards/accuracies": 1.0, "rewards/chosen": -0.010511854663491249, "rewards/margins": 0.5667246580123901, "rewards/rejected": -0.5772364735603333, "sft_loss": 0.1051185354590416, "step": 2762 }, { "epoch": 3.995661605206074, "grad_norm": 2.273248642486303, "learning_rate": 2.0930483670048225e-06, "logits/chosen": -0.6677214503288269, "logits/rejected": -0.47213077545166016, "logps/chosen": -0.10058383643627167, "logps/rejected": -5.465931415557861, "loss": 0.1752, "odds_ratio_loss": 0.018684882670640945, "rewards/accuracies": 1.0, "rewards/chosen": -0.010058384388685226, "rewards/margins": 0.5365347862243652, "rewards/rejected": -0.546593189239502, "sft_loss": 0.10058383643627167, "step": 2763 }, { "epoch": 3.997107736804049, "grad_norm": 1.873279739010511, "learning_rate": 2.0903187366546196e-06, "logits/chosen": -0.7388254404067993, "logits/rejected": -0.7730780839920044, "logps/chosen": -0.23930414021015167, "logps/rejected": -4.100466728210449, "loss": 0.202, "odds_ratio_loss": 0.048742517828941345, "rewards/accuracies": 1.0, "rewards/chosen": -0.023930413648486137, "rewards/margins": 0.38611626625061035, "rewards/rejected": -0.41004669666290283, "sft_loss": 0.23930414021015167, "step": 2764 }, { "epoch": 3.9985538684020243, "grad_norm": 2.039276418562883, "learning_rate": 2.0875902576583613e-06, "logits/chosen": -0.6181426048278809, "logits/rejected": -0.440829873085022, "logps/chosen": -0.12418884038925171, "logps/rejected": -3.6102936267852783, "loss": 0.1564, "odds_ratio_loss": 0.015171993523836136, "rewards/accuracies": 1.0, "rewards/chosen": -0.012418882921338081, "rewards/margins": 0.34861046075820923, "rewards/rejected": -0.36102938652038574, "sft_loss": 0.12418884038925171, "step": 2765 }, { "epoch": 4.0, "grad_norm": 2.2517115073863265, "learning_rate": 2.084862931661061e-06, "logits/chosen": -0.5372803211212158, "logits/rejected": -0.3119491636753082, "logps/chosen": -0.1669417768716812, "logps/rejected": -4.001537799835205, "loss": 0.1403, "odds_ratio_loss": 0.028092026710510254, "rewards/accuracies": 1.0, "rewards/chosen": -0.01669418066740036, "rewards/margins": 0.3834596276283264, "rewards/rejected": -0.400153785943985, "sft_loss": 0.1669417768716812, "step": 2766 }, { "epoch": 4.001446131597976, "grad_norm": 1.9342629816067212, "learning_rate": 2.08213676030703e-06, "logits/chosen": -0.6452394127845764, "logits/rejected": -0.4118797481060028, "logps/chosen": -0.08616970479488373, "logps/rejected": -3.6615328788757324, "loss": 0.0679, "odds_ratio_loss": 0.016679398715496063, "rewards/accuracies": 1.0, "rewards/chosen": -0.008616969920694828, "rewards/margins": 0.35753631591796875, "rewards/rejected": -0.3661532998085022, "sft_loss": 0.08616970479488373, "step": 2767 }, { "epoch": 4.0028922631959505, "grad_norm": 1.8309926478353908, "learning_rate": 2.0794117452398896e-06, "logits/chosen": -0.7527530789375305, "logits/rejected": -0.5017178058624268, "logps/chosen": -0.0876193642616272, "logps/rejected": -2.9753198623657227, "loss": 0.0645, "odds_ratio_loss": 0.01615132950246334, "rewards/accuracies": 1.0, "rewards/chosen": -0.00876193679869175, "rewards/margins": 0.28877004981040955, "rewards/rejected": -0.29753196239471436, "sft_loss": 0.0876193642616272, "step": 2768 }, { "epoch": 4.004338394793926, "grad_norm": 2.0775734258132093, "learning_rate": 2.0766878881025626e-06, "logits/chosen": -0.6768192052841187, "logits/rejected": -0.5599380135536194, "logps/chosen": -0.11010854691267014, "logps/rejected": -4.613247871398926, "loss": 0.0943, "odds_ratio_loss": 0.01849268190562725, "rewards/accuracies": 1.0, "rewards/chosen": -0.011010855436325073, "rewards/margins": 0.450313925743103, "rewards/rejected": -0.4613247513771057, "sft_loss": 0.11010854691267014, "step": 2769 }, { "epoch": 4.005784526391902, "grad_norm": 1.598856868986213, "learning_rate": 2.0739651905372706e-06, "logits/chosen": -0.4616875946521759, "logits/rejected": -0.34646421670913696, "logps/chosen": -0.07392052561044693, "logps/rejected": -4.814556121826172, "loss": 0.0724, "odds_ratio_loss": 0.00886174663901329, "rewards/accuracies": 1.0, "rewards/chosen": -0.007392052561044693, "rewards/margins": 0.47406357526779175, "rewards/rejected": -0.48145565390586853, "sft_loss": 0.07392052561044693, "step": 2770 }, { "epoch": 4.007230657989877, "grad_norm": 2.0273765071761405, "learning_rate": 2.0712436541855387e-06, "logits/chosen": -0.6958746314048767, "logits/rejected": -0.42994117736816406, "logps/chosen": -0.14068825542926788, "logps/rejected": -4.38588809967041, "loss": 0.1447, "odds_ratio_loss": 0.008284801617264748, "rewards/accuracies": 1.0, "rewards/chosen": -0.014068827033042908, "rewards/margins": 0.42451998591423035, "rewards/rejected": -0.43858882784843445, "sft_loss": 0.14068825542926788, "step": 2771 }, { "epoch": 4.008676789587852, "grad_norm": 1.409673623556823, "learning_rate": 2.0685232806881896e-06, "logits/chosen": -0.6632593870162964, "logits/rejected": -0.5344269275665283, "logps/chosen": -0.03422532230615616, "logps/rejected": -4.707058906555176, "loss": 0.0583, "odds_ratio_loss": 0.004966989159584045, "rewards/accuracies": 1.0, "rewards/chosen": -0.003422531997784972, "rewards/margins": 0.46728330850601196, "rewards/rejected": -0.47070592641830444, "sft_loss": 0.03422532230615616, "step": 2772 }, { "epoch": 4.010122921185828, "grad_norm": 1.5195324066013671, "learning_rate": 2.06580407168535e-06, "logits/chosen": -0.744748055934906, "logits/rejected": -0.6185660362243652, "logps/chosen": -0.03680463880300522, "logps/rejected": -5.538668632507324, "loss": 0.0651, "odds_ratio_loss": 0.004509086720645428, "rewards/accuracies": 1.0, "rewards/chosen": -0.0036804641131311655, "rewards/margins": 0.5501863360404968, "rewards/rejected": -0.5538668632507324, "sft_loss": 0.03680463880300522, "step": 2773 }, { "epoch": 4.011569052783804, "grad_norm": 2.124422736113147, "learning_rate": 2.063086028816437e-06, "logits/chosen": -0.6991645693778992, "logits/rejected": -0.5606277585029602, "logps/chosen": -0.1650342047214508, "logps/rejected": -3.594874382019043, "loss": 0.0973, "odds_ratio_loss": 0.01484596822410822, "rewards/accuracies": 1.0, "rewards/chosen": -0.01650342158973217, "rewards/margins": 0.34298405051231384, "rewards/rejected": -0.3594874441623688, "sft_loss": 0.1650342047214508, "step": 2774 }, { "epoch": 4.0130151843817785, "grad_norm": 2.395553922175963, "learning_rate": 2.0603691537201737e-06, "logits/chosen": -0.8897903561592102, "logits/rejected": -0.5353154540061951, "logps/chosen": -0.0864485427737236, "logps/rejected": -5.473728656768799, "loss": 0.0823, "odds_ratio_loss": 0.005243867635726929, "rewards/accuracies": 1.0, "rewards/chosen": -0.008644853718578815, "rewards/margins": 0.5387280583381653, "rewards/rejected": -0.5473728775978088, "sft_loss": 0.0864485427737236, "step": 2775 }, { "epoch": 4.014461315979754, "grad_norm": 2.6106970833473215, "learning_rate": 2.057653448034569e-06, "logits/chosen": -0.7756266593933105, "logits/rejected": -0.49392417073249817, "logps/chosen": -0.17186208069324493, "logps/rejected": -5.397519588470459, "loss": 0.1117, "odds_ratio_loss": 0.0077825989574193954, "rewards/accuracies": 1.0, "rewards/chosen": -0.017186209559440613, "rewards/margins": 0.5225657820701599, "rewards/rejected": -0.5397520065307617, "sft_loss": 0.17186208069324493, "step": 2776 }, { "epoch": 4.01590744757773, "grad_norm": 3.5803120207407293, "learning_rate": 2.0549389133969366e-06, "logits/chosen": -0.7232028245925903, "logits/rejected": -0.5350125432014465, "logps/chosen": -0.08477292954921722, "logps/rejected": -6.003397464752197, "loss": 0.0854, "odds_ratio_loss": 0.011778164654970169, "rewards/accuracies": 1.0, "rewards/chosen": -0.008477292954921722, "rewards/margins": 0.5918624997138977, "rewards/rejected": -0.6003397703170776, "sft_loss": 0.08477292954921722, "step": 2777 }, { "epoch": 4.017353579175705, "grad_norm": 2.2746691520235833, "learning_rate": 2.0522255514438775e-06, "logits/chosen": -0.7820395231246948, "logits/rejected": -0.539219856262207, "logps/chosen": -0.10004076361656189, "logps/rejected": -6.174839496612549, "loss": 0.0983, "odds_ratio_loss": 0.006524843629449606, "rewards/accuracies": 1.0, "rewards/chosen": -0.010004077106714249, "rewards/margins": 0.6074798703193665, "rewards/rejected": -0.6174839735031128, "sft_loss": 0.10004076361656189, "step": 2778 }, { "epoch": 4.01879971077368, "grad_norm": 2.8298145277154583, "learning_rate": 2.0495133638112876e-06, "logits/chosen": -0.8696098327636719, "logits/rejected": -0.6498009562492371, "logps/chosen": -0.07407721132040024, "logps/rejected": -5.003327369689941, "loss": 0.0689, "odds_ratio_loss": 0.005369211081415415, "rewards/accuracies": 1.0, "rewards/chosen": -0.007407721597701311, "rewards/margins": 0.4929249882698059, "rewards/rejected": -0.5003327131271362, "sft_loss": 0.07407721132040024, "step": 2779 }, { "epoch": 4.020245842371656, "grad_norm": 1.9598064141005225, "learning_rate": 2.0468023521343535e-06, "logits/chosen": -0.7513258457183838, "logits/rejected": -0.6734272241592407, "logps/chosen": -0.16317948698997498, "logps/rejected": -3.4593756198883057, "loss": 0.1124, "odds_ratio_loss": 0.021505819633603096, "rewards/accuracies": 1.0, "rewards/chosen": -0.016317948698997498, "rewards/margins": 0.3296195864677429, "rewards/rejected": -0.3459375500679016, "sft_loss": 0.16317948698997498, "step": 2780 }, { "epoch": 4.021691973969631, "grad_norm": 1.7165495887574607, "learning_rate": 2.044092518047556e-06, "logits/chosen": -0.9572099447250366, "logits/rejected": -0.7057690620422363, "logps/chosen": -0.05946547910571098, "logps/rejected": -3.578216552734375, "loss": 0.092, "odds_ratio_loss": 0.010709418915212154, "rewards/accuracies": 1.0, "rewards/chosen": -0.005946548189967871, "rewards/margins": 0.3518751263618469, "rewards/rejected": -0.35782164335250854, "sft_loss": 0.05946547910571098, "step": 2781 }, { "epoch": 4.0231381055676065, "grad_norm": 1.5373982677542486, "learning_rate": 2.0413838631846655e-06, "logits/chosen": -0.9308152794837952, "logits/rejected": -0.6596737504005432, "logps/chosen": -0.0882471352815628, "logps/rejected": -3.0405995845794678, "loss": 0.0911, "odds_ratio_loss": 0.009932536631822586, "rewards/accuracies": 1.0, "rewards/chosen": -0.00882471352815628, "rewards/margins": 0.295235276222229, "rewards/rejected": -0.3040599822998047, "sft_loss": 0.0882471352815628, "step": 2782 }, { "epoch": 4.024584237165582, "grad_norm": 1.6064838122981433, "learning_rate": 2.038676389178737e-06, "logits/chosen": -0.912672758102417, "logits/rejected": -0.6855254769325256, "logps/chosen": -0.10411486029624939, "logps/rejected": -5.8816142082214355, "loss": 0.0905, "odds_ratio_loss": 0.012567874044179916, "rewards/accuracies": 1.0, "rewards/chosen": -0.010411485098302364, "rewards/margins": 0.5777499675750732, "rewards/rejected": -0.5881614089012146, "sft_loss": 0.10411486029624939, "step": 2783 }, { "epoch": 4.026030368763558, "grad_norm": 1.533022173291985, "learning_rate": 2.0359700976621192e-06, "logits/chosen": -0.7604274153709412, "logits/rejected": -0.6002693176269531, "logps/chosen": -0.09835337847471237, "logps/rejected": -4.979158401489258, "loss": 0.0734, "odds_ratio_loss": 0.00728287547826767, "rewards/accuracies": 1.0, "rewards/chosen": -0.009835338220000267, "rewards/margins": 0.4880805015563965, "rewards/rejected": -0.4979158341884613, "sft_loss": 0.09835337847471237, "step": 2784 }, { "epoch": 4.027476500361533, "grad_norm": 1.8914491919734018, "learning_rate": 2.0332649902664435e-06, "logits/chosen": -0.734957218170166, "logits/rejected": -0.5849780440330505, "logps/chosen": -0.07099728286266327, "logps/rejected": -5.679014205932617, "loss": 0.0875, "odds_ratio_loss": 0.010899766348302364, "rewards/accuracies": 1.0, "rewards/chosen": -0.007099728100001812, "rewards/margins": 0.5608016848564148, "rewards/rejected": -0.5679014325141907, "sft_loss": 0.07099728286266327, "step": 2785 }, { "epoch": 4.028922631959508, "grad_norm": 1.4716068999371834, "learning_rate": 2.030561068622631e-06, "logits/chosen": -0.641997754573822, "logits/rejected": -0.6025215983390808, "logps/chosen": -0.11185550689697266, "logps/rejected": -3.5561747550964355, "loss": 0.0741, "odds_ratio_loss": 0.01804935932159424, "rewards/accuracies": 1.0, "rewards/chosen": -0.011185551062226295, "rewards/margins": 0.34443193674087524, "rewards/rejected": -0.355617493391037, "sft_loss": 0.11185550689697266, "step": 2786 }, { "epoch": 4.030368763557484, "grad_norm": 1.45988684450543, "learning_rate": 2.0278583343608855e-06, "logits/chosen": -0.6629144549369812, "logits/rejected": -0.45586591958999634, "logps/chosen": -0.14175164699554443, "logps/rejected": -5.194118499755859, "loss": 0.1214, "odds_ratio_loss": 0.014719752594828606, "rewards/accuracies": 1.0, "rewards/chosen": -0.014175164513289928, "rewards/margins": 0.5052367448806763, "rewards/rejected": -0.5194118618965149, "sft_loss": 0.14175164699554443, "step": 2787 }, { "epoch": 4.031814895155459, "grad_norm": 1.2245497852271376, "learning_rate": 2.0251567891106953e-06, "logits/chosen": -0.6399087905883789, "logits/rejected": -0.47924333810806274, "logps/chosen": -0.05142543837428093, "logps/rejected": -4.47532844543457, "loss": 0.0544, "odds_ratio_loss": 0.004084172658622265, "rewards/accuracies": 1.0, "rewards/chosen": -0.005142544396221638, "rewards/margins": 0.4423903226852417, "rewards/rejected": -0.44753289222717285, "sft_loss": 0.05142543837428093, "step": 2788 }, { "epoch": 4.033261026753435, "grad_norm": 2.06127829155153, "learning_rate": 2.022456434500831e-06, "logits/chosen": -0.8269731998443604, "logits/rejected": -0.5844072103500366, "logps/chosen": -0.0957033634185791, "logps/rejected": -4.2631611824035645, "loss": 0.11, "odds_ratio_loss": 0.006857863627374172, "rewards/accuracies": 1.0, "rewards/chosen": -0.009570336900651455, "rewards/margins": 0.41674575209617615, "rewards/rejected": -0.42631611227989197, "sft_loss": 0.0957033634185791, "step": 2789 }, { "epoch": 4.03470715835141, "grad_norm": 1.687212211680361, "learning_rate": 2.019757272159348e-06, "logits/chosen": -0.903799295425415, "logits/rejected": -0.6986560821533203, "logps/chosen": -0.09332535415887833, "logps/rejected": -3.8312535285949707, "loss": 0.0843, "odds_ratio_loss": 0.012101933360099792, "rewards/accuracies": 1.0, "rewards/chosen": -0.009332534857094288, "rewards/margins": 0.3737927973270416, "rewards/rejected": -0.383125364780426, "sft_loss": 0.09332535415887833, "step": 2790 }, { "epoch": 4.036153289949385, "grad_norm": 1.5214741277296162, "learning_rate": 2.01705930371358e-06, "logits/chosen": -0.8658531308174133, "logits/rejected": -0.5357121825218201, "logps/chosen": -0.03709006682038307, "logps/rejected": -4.968294143676758, "loss": 0.0464, "odds_ratio_loss": 0.004839232191443443, "rewards/accuracies": 1.0, "rewards/chosen": -0.0037090065889060497, "rewards/margins": 0.49312037229537964, "rewards/rejected": -0.49682939052581787, "sft_loss": 0.03709006682038307, "step": 2791 }, { "epoch": 4.037599421547361, "grad_norm": 1.8959372352641202, "learning_rate": 2.0143625307901445e-06, "logits/chosen": -0.7801449298858643, "logits/rejected": -0.6802493929862976, "logps/chosen": -0.04702619090676308, "logps/rejected": -3.90498423576355, "loss": 0.0703, "odds_ratio_loss": 0.0023806714452803135, "rewards/accuracies": 1.0, "rewards/chosen": -0.004702619276940823, "rewards/margins": 0.38579580187797546, "rewards/rejected": -0.39049839973449707, "sft_loss": 0.04702619090676308, "step": 2792 }, { "epoch": 4.039045553145336, "grad_norm": 2.415858200709156, "learning_rate": 2.0116669550149326e-06, "logits/chosen": -0.6672720909118652, "logits/rejected": -0.6657229661941528, "logps/chosen": -0.08343811333179474, "logps/rejected": -3.9901604652404785, "loss": 0.0964, "odds_ratio_loss": 0.0074556972831487656, "rewards/accuracies": 1.0, "rewards/chosen": -0.008343812078237534, "rewards/margins": 0.3906722366809845, "rewards/rejected": -0.3990160822868347, "sft_loss": 0.08343811333179474, "step": 2793 }, { "epoch": 4.040491684743311, "grad_norm": 1.7239051085866066, "learning_rate": 2.008972578013121e-06, "logits/chosen": -0.8345197439193726, "logits/rejected": -0.62155681848526, "logps/chosen": -0.0501299649477005, "logps/rejected": -6.423168182373047, "loss": 0.0885, "odds_ratio_loss": 0.012613892555236816, "rewards/accuracies": 1.0, "rewards/chosen": -0.00501299649477005, "rewards/margins": 0.6373038291931152, "rewards/rejected": -0.6423167586326599, "sft_loss": 0.0501299649477005, "step": 2794 }, { "epoch": 4.041937816341287, "grad_norm": 2.2864873355325317, "learning_rate": 2.0062794014091566e-06, "logits/chosen": -0.7575594186782837, "logits/rejected": -0.5298593640327454, "logps/chosen": -0.08866608142852783, "logps/rejected": -3.8082284927368164, "loss": 0.0898, "odds_ratio_loss": 0.02208029478788376, "rewards/accuracies": 1.0, "rewards/chosen": -0.008866608142852783, "rewards/margins": 0.3719562888145447, "rewards/rejected": -0.38082289695739746, "sft_loss": 0.08866608142852783, "step": 2795 }, { "epoch": 4.043383947939263, "grad_norm": 2.0832074173183006, "learning_rate": 2.0035874268267652e-06, "logits/chosen": -0.9552007913589478, "logits/rejected": -0.6479728817939758, "logps/chosen": -0.19561168551445007, "logps/rejected": -4.241243839263916, "loss": 0.1215, "odds_ratio_loss": 0.02996988594532013, "rewards/accuracies": 1.0, "rewards/chosen": -0.019561167806386948, "rewards/margins": 0.4045632481575012, "rewards/rejected": -0.4241243898868561, "sft_loss": 0.19561168551445007, "step": 2796 }, { "epoch": 4.044830079537238, "grad_norm": 1.376822495445301, "learning_rate": 2.0008966558889518e-06, "logits/chosen": -0.8547458648681641, "logits/rejected": -0.6141563653945923, "logps/chosen": -0.02995988354086876, "logps/rejected": -5.119405746459961, "loss": 0.0598, "odds_ratio_loss": 0.001512437593191862, "rewards/accuracies": 1.0, "rewards/chosen": -0.0029959888197481632, "rewards/margins": 0.508944571018219, "rewards/rejected": -0.511940598487854, "sft_loss": 0.02995988354086876, "step": 2797 }, { "epoch": 4.046276211135213, "grad_norm": 1.7513413495327788, "learning_rate": 1.9982070902179885e-06, "logits/chosen": -0.876068651676178, "logits/rejected": -0.6037436723709106, "logps/chosen": -0.031491801142692566, "logps/rejected": -4.855276584625244, "loss": 0.0784, "odds_ratio_loss": 0.0038668958004564047, "rewards/accuracies": 1.0, "rewards/chosen": -0.0031491799745708704, "rewards/margins": 0.4823784828186035, "rewards/rejected": -0.4855276346206665, "sft_loss": 0.031491801142692566, "step": 2798 }, { "epoch": 4.047722342733189, "grad_norm": 1.607844504092351, "learning_rate": 1.995518731435427e-06, "logits/chosen": -0.69386887550354, "logits/rejected": -0.537121593952179, "logps/chosen": -0.10619297623634338, "logps/rejected": -5.933511734008789, "loss": 0.0766, "odds_ratio_loss": 0.012091580778360367, "rewards/accuracies": 1.0, "rewards/chosen": -0.010619297623634338, "rewards/margins": 0.5827318429946899, "rewards/rejected": -0.5933511257171631, "sft_loss": 0.10619297623634338, "step": 2799 }, { "epoch": 4.0491684743311644, "grad_norm": 2.041163818914151, "learning_rate": 1.992831581162086e-06, "logits/chosen": -0.8246333599090576, "logits/rejected": -0.7129614353179932, "logps/chosen": -0.10542631149291992, "logps/rejected": -4.061657905578613, "loss": 0.0915, "odds_ratio_loss": 0.011720533482730389, "rewards/accuracies": 1.0, "rewards/chosen": -0.010542631149291992, "rewards/margins": 0.39562320709228516, "rewards/rejected": -0.4061657786369324, "sft_loss": 0.10542631149291992, "step": 2800 }, { "epoch": 4.050614605929139, "grad_norm": 1.5198161109339767, "learning_rate": 1.9901456410180626e-06, "logits/chosen": -0.8532893657684326, "logits/rejected": -0.9210831522941589, "logps/chosen": -0.10792700946331024, "logps/rejected": -3.295602560043335, "loss": 0.0825, "odds_ratio_loss": 0.009402144700288773, "rewards/accuracies": 1.0, "rewards/chosen": -0.010792701505124569, "rewards/margins": 0.3187675476074219, "rewards/rejected": -0.3295602798461914, "sft_loss": 0.10792700946331024, "step": 2801 }, { "epoch": 4.052060737527115, "grad_norm": 1.6587673101071005, "learning_rate": 1.987460912622717e-06, "logits/chosen": -0.7868859171867371, "logits/rejected": -0.6162922382354736, "logps/chosen": -0.08886329084634781, "logps/rejected": -4.913092613220215, "loss": 0.0714, "odds_ratio_loss": 0.011058435775339603, "rewards/accuracies": 1.0, "rewards/chosen": -0.008886328898370266, "rewards/margins": 0.48242297768592834, "rewards/rejected": -0.4913092851638794, "sft_loss": 0.08886329084634781, "step": 2802 }, { "epoch": 4.053506869125091, "grad_norm": 1.4545531255335953, "learning_rate": 1.9847773975946833e-06, "logits/chosen": -0.3921070694923401, "logits/rejected": -0.3834128975868225, "logps/chosen": -0.046835917979478836, "logps/rejected": -6.639843940734863, "loss": 0.1102, "odds_ratio_loss": 0.010030764155089855, "rewards/accuracies": 1.0, "rewards/chosen": -0.004683591891080141, "rewards/margins": 0.6593008041381836, "rewards/rejected": -0.6639844179153442, "sft_loss": 0.046835917979478836, "step": 2803 }, { "epoch": 4.054953000723065, "grad_norm": 1.603097030697306, "learning_rate": 1.98209509755186e-06, "logits/chosen": -0.8556368947029114, "logits/rejected": -0.5742295980453491, "logps/chosen": -0.12034314125776291, "logps/rejected": -3.737192392349243, "loss": 0.096, "odds_ratio_loss": 0.020050739869475365, "rewards/accuracies": 1.0, "rewards/chosen": -0.012034314684569836, "rewards/margins": 0.3616849184036255, "rewards/rejected": -0.3737192451953888, "sft_loss": 0.12034314125776291, "step": 2804 }, { "epoch": 4.056399132321041, "grad_norm": 1.5499176234984071, "learning_rate": 1.97941401411142e-06, "logits/chosen": -0.8785386085510254, "logits/rejected": -0.736790657043457, "logps/chosen": -0.09029129147529602, "logps/rejected": -3.283236503601074, "loss": 0.0651, "odds_ratio_loss": 0.010353603400290012, "rewards/accuracies": 1.0, "rewards/chosen": -0.009029129520058632, "rewards/margins": 0.3192945122718811, "rewards/rejected": -0.328323632478714, "sft_loss": 0.09029129147529602, "step": 2805 }, { "epoch": 4.057845263919017, "grad_norm": 1.497023625579859, "learning_rate": 1.976734148889794e-06, "logits/chosen": -0.9096969366073608, "logits/rejected": -0.563471794128418, "logps/chosen": -0.09378468990325928, "logps/rejected": -4.701470851898193, "loss": 0.0833, "odds_ratio_loss": 0.01099737174808979, "rewards/accuracies": 1.0, "rewards/chosen": -0.009378468617796898, "rewards/margins": 0.46076861023902893, "rewards/rejected": -0.4701470732688904, "sft_loss": 0.09378468990325928, "step": 2806 }, { "epoch": 4.0592913955169925, "grad_norm": 1.299958296890451, "learning_rate": 1.9740555035026856e-06, "logits/chosen": -0.9414892792701721, "logits/rejected": -0.8532160520553589, "logps/chosen": -0.09001899510622025, "logps/rejected": -4.150521278381348, "loss": 0.0668, "odds_ratio_loss": 0.01099309604614973, "rewards/accuracies": 1.0, "rewards/chosen": -0.0090019004419446, "rewards/margins": 0.40605026483535767, "rewards/rejected": -0.4150521159172058, "sft_loss": 0.09001899510622025, "step": 2807 }, { "epoch": 4.060737527114967, "grad_norm": 1.4888656980066968, "learning_rate": 1.971378079565061e-06, "logits/chosen": -0.7717404961585999, "logits/rejected": -0.703227162361145, "logps/chosen": -0.06621512025594711, "logps/rejected": -3.4783830642700195, "loss": 0.0637, "odds_ratio_loss": 0.006453595124185085, "rewards/accuracies": 1.0, "rewards/chosen": -0.006621511187404394, "rewards/margins": 0.3412168025970459, "rewards/rejected": -0.34783831238746643, "sft_loss": 0.06621512025594711, "step": 2808 }, { "epoch": 4.062183658712943, "grad_norm": 1.4317984222561302, "learning_rate": 1.9687018786911477e-06, "logits/chosen": -0.8258988857269287, "logits/rejected": -0.6008090376853943, "logps/chosen": -0.04819875583052635, "logps/rejected": -5.592133522033691, "loss": 0.0731, "odds_ratio_loss": 0.0023457964416593313, "rewards/accuracies": 1.0, "rewards/chosen": -0.004819875583052635, "rewards/margins": 0.554393470287323, "rewards/rejected": -0.5592133402824402, "sft_loss": 0.04819875583052635, "step": 2809 }, { "epoch": 4.063629790310919, "grad_norm": 1.5889562813928515, "learning_rate": 1.9660269024944367e-06, "logits/chosen": -0.857683539390564, "logits/rejected": -0.8138494491577148, "logps/chosen": -0.13486728072166443, "logps/rejected": -2.3455417156219482, "loss": 0.086, "odds_ratio_loss": 0.014339055866003036, "rewards/accuracies": 1.0, "rewards/chosen": -0.013486729003489017, "rewards/margins": 0.22106747329235077, "rewards/rejected": -0.2345542013645172, "sft_loss": 0.13486728072166443, "step": 2810 }, { "epoch": 4.065075921908893, "grad_norm": 1.7040159776179822, "learning_rate": 1.9633531525876804e-06, "logits/chosen": -0.9003788232803345, "logits/rejected": -0.46565985679626465, "logps/chosen": -0.08736556768417358, "logps/rejected": -5.143779754638672, "loss": 0.084, "odds_ratio_loss": 0.006077864672988653, "rewards/accuracies": 1.0, "rewards/chosen": -0.008736557327210903, "rewards/margins": 0.5056413412094116, "rewards/rejected": -0.5143779516220093, "sft_loss": 0.08736556768417358, "step": 2811 }, { "epoch": 4.066522053506869, "grad_norm": 1.6114194827080601, "learning_rate": 1.960680630582895e-06, "logits/chosen": -0.8223308324813843, "logits/rejected": -0.520995020866394, "logps/chosen": -0.03943537548184395, "logps/rejected": -4.510629177093506, "loss": 0.0935, "odds_ratio_loss": 0.002579466672614217, "rewards/accuracies": 1.0, "rewards/chosen": -0.003943537827581167, "rewards/margins": 0.44711941480636597, "rewards/rejected": -0.4510629177093506, "sft_loss": 0.03943537548184395, "step": 2812 }, { "epoch": 4.067968185104845, "grad_norm": 2.896809649533277, "learning_rate": 1.9580093380913516e-06, "logits/chosen": -0.7096729874610901, "logits/rejected": -0.45962074398994446, "logps/chosen": -0.09003637731075287, "logps/rejected": -5.000641822814941, "loss": 0.0789, "odds_ratio_loss": 0.004351029638200998, "rewards/accuracies": 1.0, "rewards/chosen": -0.009003638289868832, "rewards/margins": 0.4910605549812317, "rewards/rejected": -0.5000641942024231, "sft_loss": 0.09003637731075287, "step": 2813 }, { "epoch": 4.06941431670282, "grad_norm": 1.6830278975699327, "learning_rate": 1.955339276723584e-06, "logits/chosen": -0.8133095502853394, "logits/rejected": -0.6556833982467651, "logps/chosen": -0.10588407516479492, "logps/rejected": -4.726909160614014, "loss": 0.0942, "odds_ratio_loss": 0.008478911593556404, "rewards/accuracies": 1.0, "rewards/chosen": -0.010588408447802067, "rewards/margins": 0.4621025025844574, "rewards/rejected": -0.4726909101009369, "sft_loss": 0.10588407516479492, "step": 2814 }, { "epoch": 4.070860448300795, "grad_norm": 1.3555935048586099, "learning_rate": 1.952670448089381e-06, "logits/chosen": -0.7163006067276001, "logits/rejected": -0.587743878364563, "logps/chosen": -0.07247428596019745, "logps/rejected": -5.259910583496094, "loss": 0.0539, "odds_ratio_loss": 0.013823432847857475, "rewards/accuracies": 1.0, "rewards/chosen": -0.007247428875416517, "rewards/margins": 0.5187435746192932, "rewards/rejected": -0.5259910225868225, "sft_loss": 0.07247428596019745, "step": 2815 }, { "epoch": 4.072306579898771, "grad_norm": 1.6383073489651063, "learning_rate": 1.9500028537977916e-06, "logits/chosen": -0.8243443965911865, "logits/rejected": -0.5978400707244873, "logps/chosen": -0.07016627490520477, "logps/rejected": -5.391256809234619, "loss": 0.0842, "odds_ratio_loss": 0.007298076990991831, "rewards/accuracies": 1.0, "rewards/chosen": -0.007016628049314022, "rewards/margins": 0.5321090221405029, "rewards/rejected": -0.5391257405281067, "sft_loss": 0.07016627490520477, "step": 2816 }, { "epoch": 4.073752711496746, "grad_norm": 1.4638936787417978, "learning_rate": 1.9473364954571156e-06, "logits/chosen": -0.852425217628479, "logits/rejected": -0.6924081444740295, "logps/chosen": -0.03700890764594078, "logps/rejected": -5.5652289390563965, "loss": 0.0678, "odds_ratio_loss": 0.0037209701258689165, "rewards/accuracies": 1.0, "rewards/chosen": -0.003700891276821494, "rewards/margins": 0.5528219938278198, "rewards/rejected": -0.5565229654312134, "sft_loss": 0.03700890764594078, "step": 2817 }, { "epoch": 4.0751988430947215, "grad_norm": 1.4007257608951111, "learning_rate": 1.9446713746749124e-06, "logits/chosen": -0.7752952575683594, "logits/rejected": -0.5839805603027344, "logps/chosen": -0.07095170021057129, "logps/rejected": -5.140481948852539, "loss": 0.0557, "odds_ratio_loss": 0.006458982825279236, "rewards/accuracies": 1.0, "rewards/chosen": -0.007095170207321644, "rewards/margins": 0.5069530010223389, "rewards/rejected": -0.5140482187271118, "sft_loss": 0.07095170021057129, "step": 2818 }, { "epoch": 4.076644974692697, "grad_norm": 1.753771771087421, "learning_rate": 1.94200749305799e-06, "logits/chosen": -0.6933737397193909, "logits/rejected": -0.544430673122406, "logps/chosen": -0.07938231527805328, "logps/rejected": -3.3233370780944824, "loss": 0.089, "odds_ratio_loss": 0.006840870250016451, "rewards/accuracies": 1.0, "rewards/chosen": -0.007938231341540813, "rewards/margins": 0.32439547777175903, "rewards/rejected": -0.3323337137699127, "sft_loss": 0.07938231527805328, "step": 2819 }, { "epoch": 4.078091106290673, "grad_norm": 1.377106669791675, "learning_rate": 1.9393448522124154e-06, "logits/chosen": -1.001451015472412, "logits/rejected": -0.8603689670562744, "logps/chosen": -0.04609488323330879, "logps/rejected": -5.648879051208496, "loss": 0.0742, "odds_ratio_loss": 0.0034265825524926186, "rewards/accuracies": 1.0, "rewards/chosen": -0.004609488416463137, "rewards/margins": 0.5602784752845764, "rewards/rejected": -0.5648879408836365, "sft_loss": 0.04609488323330879, "step": 2820 }, { "epoch": 4.079537237888648, "grad_norm": 1.6696172775689424, "learning_rate": 1.9366834537435052e-06, "logits/chosen": -0.7560070753097534, "logits/rejected": -0.5555533170700073, "logps/chosen": -0.09860974550247192, "logps/rejected": -5.792730808258057, "loss": 0.1054, "odds_ratio_loss": 0.007586261723190546, "rewards/accuracies": 1.0, "rewards/chosen": -0.009860975667834282, "rewards/margins": 0.569412112236023, "rewards/rejected": -0.5792730450630188, "sft_loss": 0.09860974550247192, "step": 2821 }, { "epoch": 4.080983369486623, "grad_norm": 1.7064321016532233, "learning_rate": 1.9340232992558242e-06, "logits/chosen": -0.862777590751648, "logits/rejected": -0.6179209351539612, "logps/chosen": -0.15015126764774323, "logps/rejected": -3.6706156730651855, "loss": 0.1044, "odds_ratio_loss": 0.017487984150648117, "rewards/accuracies": 1.0, "rewards/chosen": -0.015015127137303352, "rewards/margins": 0.3520464301109314, "rewards/rejected": -0.3670615553855896, "sft_loss": 0.15015126764774323, "step": 2822 }, { "epoch": 4.082429501084599, "grad_norm": 1.7723353244915958, "learning_rate": 1.9313643903531916e-06, "logits/chosen": -0.8052210211753845, "logits/rejected": -0.7118884325027466, "logps/chosen": -0.07638125866651535, "logps/rejected": -3.0936851501464844, "loss": 0.1025, "odds_ratio_loss": 0.007743437774479389, "rewards/accuracies": 1.0, "rewards/chosen": -0.00763812568038702, "rewards/margins": 0.30173036456108093, "rewards/rejected": -0.3093685209751129, "sft_loss": 0.07638125866651535, "step": 2823 }, { "epoch": 4.083875632682574, "grad_norm": 1.603899631457365, "learning_rate": 1.9287067286386735e-06, "logits/chosen": -0.7378439903259277, "logits/rejected": -0.6304211616516113, "logps/chosen": -0.10273067653179169, "logps/rejected": -4.914271354675293, "loss": 0.0856, "odds_ratio_loss": 0.0075208027847111225, "rewards/accuracies": 1.0, "rewards/chosen": -0.010273068211972713, "rewards/margins": 0.48115405440330505, "rewards/rejected": -0.49142712354660034, "sft_loss": 0.10273067653179169, "step": 2824 }, { "epoch": 4.0853217642805495, "grad_norm": 1.815738070132041, "learning_rate": 1.926050315714582e-06, "logits/chosen": -0.7988303899765015, "logits/rejected": -0.7211605310440063, "logps/chosen": -0.06291496008634567, "logps/rejected": -4.587538719177246, "loss": 0.0739, "odds_ratio_loss": 0.005865362472832203, "rewards/accuracies": 1.0, "rewards/chosen": -0.006291495636105537, "rewards/margins": 0.4524623453617096, "rewards/rejected": -0.4587538540363312, "sft_loss": 0.06291496008634567, "step": 2825 }, { "epoch": 4.086767895878525, "grad_norm": 1.5383179861936356, "learning_rate": 1.923395153182478e-06, "logits/chosen": -0.8308844566345215, "logits/rejected": -0.715095043182373, "logps/chosen": -0.053292643278837204, "logps/rejected": -3.6435461044311523, "loss": 0.0703, "odds_ratio_loss": 0.0045709628611803055, "rewards/accuracies": 1.0, "rewards/chosen": -0.005329264793545008, "rewards/margins": 0.35902532935142517, "rewards/rejected": -0.36435458064079285, "sft_loss": 0.053292643278837204, "step": 2826 }, { "epoch": 4.0882140274765, "grad_norm": 1.5371817275721964, "learning_rate": 1.920741242643172e-06, "logits/chosen": -0.9177175164222717, "logits/rejected": -0.7014694213867188, "logps/chosen": -0.0635892003774643, "logps/rejected": -4.349488258361816, "loss": 0.0495, "odds_ratio_loss": 0.0026953257620334625, "rewards/accuracies": 1.0, "rewards/chosen": -0.0063589196652174, "rewards/margins": 0.4285898804664612, "rewards/rejected": -0.4349488317966461, "sft_loss": 0.0635892003774643, "step": 2827 }, { "epoch": 4.089660159074476, "grad_norm": 1.9984594647479759, "learning_rate": 1.9180885856967133e-06, "logits/chosen": -0.9267109036445618, "logits/rejected": -0.71535325050354, "logps/chosen": -0.0746922492980957, "logps/rejected": -4.762479782104492, "loss": 0.117, "odds_ratio_loss": 0.00817443523555994, "rewards/accuracies": 1.0, "rewards/chosen": -0.007469225209206343, "rewards/margins": 0.4687787592411041, "rewards/rejected": -0.47624802589416504, "sft_loss": 0.0746922492980957, "step": 2828 }, { "epoch": 4.091106290672451, "grad_norm": 1.826210488228371, "learning_rate": 1.9154371839424014e-06, "logits/chosen": -0.9434868097305298, "logits/rejected": -0.7802065014839172, "logps/chosen": -0.04058940336108208, "logps/rejected": -6.8330464363098145, "loss": 0.0835, "odds_ratio_loss": 0.004032096825540066, "rewards/accuracies": 1.0, "rewards/chosen": -0.004058940336108208, "rewards/margins": 0.6792457103729248, "rewards/rejected": -0.6833046674728394, "sft_loss": 0.04058940336108208, "step": 2829 }, { "epoch": 4.092552422270427, "grad_norm": 1.9463274698751436, "learning_rate": 1.912787038978774e-06, "logits/chosen": -0.7986459732055664, "logits/rejected": -0.5903900861740112, "logps/chosen": -0.06586778163909912, "logps/rejected": -4.728672504425049, "loss": 0.0968, "odds_ratio_loss": 0.005637805908918381, "rewards/accuracies": 1.0, "rewards/chosen": -0.006586778908967972, "rewards/margins": 0.4662804901599884, "rewards/rejected": -0.4728672504425049, "sft_loss": 0.06586778163909912, "step": 2830 }, { "epoch": 4.093998553868402, "grad_norm": 1.9187071058912697, "learning_rate": 1.910138152403616e-06, "logits/chosen": -0.7255648970603943, "logits/rejected": -0.5702151656150818, "logps/chosen": -0.15819592773914337, "logps/rejected": -4.314438819885254, "loss": 0.0972, "odds_ratio_loss": 0.014649780467152596, "rewards/accuracies": 1.0, "rewards/chosen": -0.015819592401385307, "rewards/margins": 0.4156242907047272, "rewards/rejected": -0.43144387006759644, "sft_loss": 0.15819592773914337, "step": 2831 }, { "epoch": 4.0954446854663775, "grad_norm": 1.678493531054628, "learning_rate": 1.907490525813947e-06, "logits/chosen": -0.5866957306861877, "logits/rejected": -0.3991992175579071, "logps/chosen": -0.10705491900444031, "logps/rejected": -6.210849761962891, "loss": 0.0792, "odds_ratio_loss": 0.005011118017137051, "rewards/accuracies": 1.0, "rewards/chosen": -0.010705491527915001, "rewards/margins": 0.6103795170783997, "rewards/rejected": -0.621084988117218, "sft_loss": 0.10705491900444031, "step": 2832 }, { "epoch": 4.096890817064353, "grad_norm": 1.6178290940716764, "learning_rate": 1.9048441608060358e-06, "logits/chosen": -0.6119937896728516, "logits/rejected": -0.4112843871116638, "logps/chosen": -0.044672608375549316, "logps/rejected": -4.956965446472168, "loss": 0.0581, "odds_ratio_loss": 0.0036439471878111362, "rewards/accuracies": 1.0, "rewards/chosen": -0.0044672610238194466, "rewards/margins": 0.4912292957305908, "rewards/rejected": -0.4956965446472168, "sft_loss": 0.044672608375549316, "step": 2833 }, { "epoch": 4.098336948662328, "grad_norm": 1.844544275796571, "learning_rate": 1.9021990589753827e-06, "logits/chosen": -0.6817110180854797, "logits/rejected": -0.5111923217773438, "logps/chosen": -0.07816684991121292, "logps/rejected": -5.29497766494751, "loss": 0.0771, "odds_ratio_loss": 0.00837037805467844, "rewards/accuracies": 1.0, "rewards/chosen": -0.007816685363650322, "rewards/margins": 0.5216810703277588, "rewards/rejected": -0.5294978022575378, "sft_loss": 0.07816684991121292, "step": 2834 }, { "epoch": 4.099783080260304, "grad_norm": 1.3457218950980514, "learning_rate": 1.8995552219167284e-06, "logits/chosen": -0.6739135384559631, "logits/rejected": -0.555662989616394, "logps/chosen": -0.04297306016087532, "logps/rejected": -6.009502410888672, "loss": 0.0573, "odds_ratio_loss": 0.0026276421267539263, "rewards/accuracies": 1.0, "rewards/chosen": -0.004297305829823017, "rewards/margins": 0.5966529846191406, "rewards/rejected": -0.6009502410888672, "sft_loss": 0.04297306016087532, "step": 2835 }, { "epoch": 4.101229211858279, "grad_norm": 2.3566402902558714, "learning_rate": 1.8969126512240555e-06, "logits/chosen": -0.8081563711166382, "logits/rejected": -0.6554782390594482, "logps/chosen": -0.13750715553760529, "logps/rejected": -3.394568920135498, "loss": 0.0946, "odds_ratio_loss": 0.016596950590610504, "rewards/accuracies": 1.0, "rewards/chosen": -0.013750715181231499, "rewards/margins": 0.3257061839103699, "rewards/rejected": -0.3394568860530853, "sft_loss": 0.13750715553760529, "step": 2836 }, { "epoch": 4.102675343456254, "grad_norm": 1.7832953293812144, "learning_rate": 1.8942713484905761e-06, "logits/chosen": -0.6158128976821899, "logits/rejected": -0.46845024824142456, "logps/chosen": -0.03573101758956909, "logps/rejected": -4.8370561599731445, "loss": 0.0443, "odds_ratio_loss": 0.005472586024552584, "rewards/accuracies": 1.0, "rewards/chosen": -0.0035731021780520678, "rewards/margins": 0.480132520198822, "rewards/rejected": -0.48370563983917236, "sft_loss": 0.03573101758956909, "step": 2837 }, { "epoch": 4.10412147505423, "grad_norm": 1.4666250779657488, "learning_rate": 1.891631315308745e-06, "logits/chosen": -0.8202795386314392, "logits/rejected": -0.8174320459365845, "logps/chosen": -0.050609488040208817, "logps/rejected": -3.721074104309082, "loss": 0.0552, "odds_ratio_loss": 0.006177366711199284, "rewards/accuracies": 1.0, "rewards/chosen": -0.005060948897153139, "rewards/margins": 0.3670464754104614, "rewards/rejected": -0.3721074163913727, "sft_loss": 0.050609488040208817, "step": 2838 }, { "epoch": 4.1055676066522055, "grad_norm": 1.4677580796088394, "learning_rate": 1.888992553270245e-06, "logits/chosen": -0.6891862154006958, "logits/rejected": -0.5416716933250427, "logps/chosen": -0.05191085860133171, "logps/rejected": -3.703627586364746, "loss": 0.0697, "odds_ratio_loss": 0.0027856866363435984, "rewards/accuracies": 1.0, "rewards/chosen": -0.005191085860133171, "rewards/margins": 0.3651717007160187, "rewards/rejected": -0.370362788438797, "sft_loss": 0.05191085860133171, "step": 2839 }, { "epoch": 4.10701373825018, "grad_norm": 1.7383613915672071, "learning_rate": 1.8863550639659983e-06, "logits/chosen": -0.9624236822128296, "logits/rejected": -0.6738488078117371, "logps/chosen": -0.13818246126174927, "logps/rejected": -4.537891387939453, "loss": 0.1094, "odds_ratio_loss": 0.015407193452119827, "rewards/accuracies": 1.0, "rewards/chosen": -0.013818246312439442, "rewards/margins": 0.43997085094451904, "rewards/rejected": -0.4537891447544098, "sft_loss": 0.13818246126174927, "step": 2840 }, { "epoch": 4.108459869848156, "grad_norm": 1.769141927099359, "learning_rate": 1.883718848986155e-06, "logits/chosen": -0.9640853404998779, "logits/rejected": -0.7224030494689941, "logps/chosen": -0.07164276391267776, "logps/rejected": -5.423728942871094, "loss": 0.1005, "odds_ratio_loss": 0.002488494850695133, "rewards/accuracies": 1.0, "rewards/chosen": -0.007164277136325836, "rewards/margins": 0.5352085828781128, "rewards/rejected": -0.5423728823661804, "sft_loss": 0.07164276391267776, "step": 2841 }, { "epoch": 4.109906001446132, "grad_norm": 1.624625898692095, "learning_rate": 1.8810839099201004e-06, "logits/chosen": -0.8588815927505493, "logits/rejected": -0.7229719161987305, "logps/chosen": -0.08476316183805466, "logps/rejected": -3.678792953491211, "loss": 0.079, "odds_ratio_loss": 0.010054201819002628, "rewards/accuracies": 1.0, "rewards/chosen": -0.008476316928863525, "rewards/margins": 0.35940298438072205, "rewards/rejected": -0.36787930130958557, "sft_loss": 0.08476316183805466, "step": 2842 }, { "epoch": 4.111352133044107, "grad_norm": 1.7008537404128115, "learning_rate": 1.878450248356446e-06, "logits/chosen": -0.8767229318618774, "logits/rejected": -0.6937390565872192, "logps/chosen": -0.12689150869846344, "logps/rejected": -5.257368564605713, "loss": 0.082, "odds_ratio_loss": 0.012621680274605751, "rewards/accuracies": 1.0, "rewards/chosen": -0.012689150869846344, "rewards/margins": 0.5130476951599121, "rewards/rejected": -0.5257368683815002, "sft_loss": 0.12689150869846344, "step": 2843 }, { "epoch": 4.112798264642082, "grad_norm": 1.5221274028586664, "learning_rate": 1.87581786588304e-06, "logits/chosen": -0.9948400259017944, "logits/rejected": -0.6448618173599243, "logps/chosen": -0.11695563793182373, "logps/rejected": -5.0134406089782715, "loss": 0.0724, "odds_ratio_loss": 0.008317345753312111, "rewards/accuracies": 1.0, "rewards/chosen": -0.011695563793182373, "rewards/margins": 0.4896485209465027, "rewards/rejected": -0.5013440847396851, "sft_loss": 0.11695563793182373, "step": 2844 }, { "epoch": 4.114244396240058, "grad_norm": 1.8872482751675728, "learning_rate": 1.8731867640869528e-06, "logits/chosen": -0.982231080532074, "logits/rejected": -0.7305710911750793, "logps/chosen": -0.09674321860074997, "logps/rejected": -4.068989276885986, "loss": 0.1131, "odds_ratio_loss": 0.006529256701469421, "rewards/accuracies": 1.0, "rewards/chosen": -0.009674321860074997, "rewards/margins": 0.3972246050834656, "rewards/rejected": -0.4068989157676697, "sft_loss": 0.09674321860074997, "step": 2845 }, { "epoch": 4.115690527838034, "grad_norm": 1.6369506462728376, "learning_rate": 1.8705569445544875e-06, "logits/chosen": -0.7939417362213135, "logits/rejected": -0.5607274770736694, "logps/chosen": -0.08005495369434357, "logps/rejected": -6.112852573394775, "loss": 0.0843, "odds_ratio_loss": 0.006115018390119076, "rewards/accuracies": 1.0, "rewards/chosen": -0.008005496114492416, "rewards/margins": 0.603279709815979, "rewards/rejected": -0.6112852096557617, "sft_loss": 0.08005495369434357, "step": 2846 }, { "epoch": 4.117136659436008, "grad_norm": 1.7432584620353744, "learning_rate": 1.8679284088711703e-06, "logits/chosen": -1.0481350421905518, "logits/rejected": -0.642801821231842, "logps/chosen": -0.04570764675736427, "logps/rejected": -5.874081611633301, "loss": 0.062, "odds_ratio_loss": 0.0012314720079302788, "rewards/accuracies": 1.0, "rewards/chosen": -0.0045707649551332, "rewards/margins": 0.5828374028205872, "rewards/rejected": -0.587408185005188, "sft_loss": 0.04570764675736427, "step": 2847 }, { "epoch": 4.118582791033984, "grad_norm": 2.034327837129782, "learning_rate": 1.8653011586217575e-06, "logits/chosen": -0.9006962180137634, "logits/rejected": -0.524931013584137, "logps/chosen": -0.07342912256717682, "logps/rejected": -5.247374534606934, "loss": 0.0688, "odds_ratio_loss": 0.0020064630080014467, "rewards/accuracies": 1.0, "rewards/chosen": -0.007342912256717682, "rewards/margins": 0.5173946022987366, "rewards/rejected": -0.5247374773025513, "sft_loss": 0.07342912256717682, "step": 2848 }, { "epoch": 4.12002892263196, "grad_norm": 1.5094420598404346, "learning_rate": 1.8626751953902265e-06, "logits/chosen": -0.8094779849052429, "logits/rejected": -0.6073752641677856, "logps/chosen": -0.061085619032382965, "logps/rejected": -4.160881519317627, "loss": 0.0567, "odds_ratio_loss": 0.0034529813565313816, "rewards/accuracies": 1.0, "rewards/chosen": -0.006108561530709267, "rewards/margins": 0.40997958183288574, "rewards/rejected": -0.41608819365501404, "sft_loss": 0.061085619032382965, "step": 2849 }, { "epoch": 4.1214750542299345, "grad_norm": 1.4627918976068797, "learning_rate": 1.8600505207597789e-06, "logits/chosen": -0.5969792604446411, "logits/rejected": -0.4297393262386322, "logps/chosen": -0.030321719124913216, "logps/rejected": -6.4586896896362305, "loss": 0.0626, "odds_ratio_loss": 0.0009324885904788971, "rewards/accuracies": 1.0, "rewards/chosen": -0.0030321720987558365, "rewards/margins": 0.6428368091583252, "rewards/rejected": -0.6458690166473389, "sft_loss": 0.030321719124913216, "step": 2850 }, { "epoch": 4.12292118582791, "grad_norm": 1.4982677272369433, "learning_rate": 1.857427136312844e-06, "logits/chosen": -0.876201331615448, "logits/rejected": -0.6428571343421936, "logps/chosen": -0.07151912152767181, "logps/rejected": -4.5147175788879395, "loss": 0.0806, "odds_ratio_loss": 0.0038516269996762276, "rewards/accuracies": 1.0, "rewards/chosen": -0.007151912897825241, "rewards/margins": 0.44431987404823303, "rewards/rejected": -0.45147180557250977, "sft_loss": 0.07151912152767181, "step": 2851 }, { "epoch": 4.124367317425886, "grad_norm": 1.5257893605138617, "learning_rate": 1.8548050436310669e-06, "logits/chosen": -0.8841822743415833, "logits/rejected": -0.5845543146133423, "logps/chosen": -0.04778134822845459, "logps/rejected": -5.190133094787598, "loss": 0.0625, "odds_ratio_loss": 0.004256582818925381, "rewards/accuracies": 1.0, "rewards/chosen": -0.004778134636580944, "rewards/margins": 0.5142351984977722, "rewards/rejected": -0.5190134048461914, "sft_loss": 0.04778134822845459, "step": 2852 }, { "epoch": 4.125813449023862, "grad_norm": 1.2651179338589529, "learning_rate": 1.8521842442953198e-06, "logits/chosen": -0.7237213850021362, "logits/rejected": -0.6026439070701599, "logps/chosen": -0.1049027293920517, "logps/rejected": -5.724381923675537, "loss": 0.0709, "odds_ratio_loss": 0.008922792971134186, "rewards/accuracies": 1.0, "rewards/chosen": -0.010490273125469685, "rewards/margins": 0.5619478821754456, "rewards/rejected": -0.5724381804466248, "sft_loss": 0.1049027293920517, "step": 2853 }, { "epoch": 4.127259580621836, "grad_norm": 1.4252690236427485, "learning_rate": 1.84956473988569e-06, "logits/chosen": -0.9042859077453613, "logits/rejected": -0.5686567425727844, "logps/chosen": -0.056515954434871674, "logps/rejected": -5.483891487121582, "loss": 0.0745, "odds_ratio_loss": 0.005240194033831358, "rewards/accuracies": 1.0, "rewards/chosen": -0.005651596002280712, "rewards/margins": 0.542737603187561, "rewards/rejected": -0.548389196395874, "sft_loss": 0.056515954434871674, "step": 2854 }, { "epoch": 4.128705712219812, "grad_norm": 1.3935884148488344, "learning_rate": 1.846946531981489e-06, "logits/chosen": -0.8212897777557373, "logits/rejected": -0.5057724118232727, "logps/chosen": -0.039713770151138306, "logps/rejected": -4.20341157913208, "loss": 0.0708, "odds_ratio_loss": 0.0034693204797804356, "rewards/accuracies": 1.0, "rewards/chosen": -0.00397137738764286, "rewards/margins": 0.41636979579925537, "rewards/rejected": -0.4203411638736725, "sft_loss": 0.039713770151138306, "step": 2855 }, { "epoch": 4.130151843817788, "grad_norm": 1.4700289842978596, "learning_rate": 1.8443296221612426e-06, "logits/chosen": -0.8127454519271851, "logits/rejected": -0.6734610199928284, "logps/chosen": -0.07819758355617523, "logps/rejected": -4.370360374450684, "loss": 0.0792, "odds_ratio_loss": 0.010140936821699142, "rewards/accuracies": 1.0, "rewards/chosen": -0.007819757796823978, "rewards/margins": 0.42921626567840576, "rewards/rejected": -0.43703603744506836, "sft_loss": 0.07819758355617523, "step": 2856 }, { "epoch": 4.131597975415763, "grad_norm": 1.6312862182934151, "learning_rate": 1.8417140120026954e-06, "logits/chosen": -0.6778440475463867, "logits/rejected": -0.5362757444381714, "logps/chosen": -0.054850347340106964, "logps/rejected": -3.9211716651916504, "loss": 0.0752, "odds_ratio_loss": 0.0033841035328805447, "rewards/accuracies": 1.0, "rewards/chosen": -0.0054850345477461815, "rewards/margins": 0.38663214445114136, "rewards/rejected": -0.39211714267730713, "sft_loss": 0.054850347340106964, "step": 2857 }, { "epoch": 4.133044107013738, "grad_norm": 1.3563265826587092, "learning_rate": 1.8390997030828074e-06, "logits/chosen": -0.9520107507705688, "logits/rejected": -0.7877506017684937, "logps/chosen": -0.07438983023166656, "logps/rejected": -4.451206684112549, "loss": 0.0519, "odds_ratio_loss": 0.004518951755017042, "rewards/accuracies": 1.0, "rewards/chosen": -0.007438982836902142, "rewards/margins": 0.4376816749572754, "rewards/rejected": -0.4451206624507904, "sft_loss": 0.07438983023166656, "step": 2858 }, { "epoch": 4.134490238611714, "grad_norm": 1.343261093042372, "learning_rate": 1.836486696977758e-06, "logits/chosen": -0.7512749433517456, "logits/rejected": -0.6193314790725708, "logps/chosen": -0.03754514455795288, "logps/rejected": -5.197000026702881, "loss": 0.0499, "odds_ratio_loss": 0.006154801230877638, "rewards/accuracies": 1.0, "rewards/chosen": -0.003754514502361417, "rewards/margins": 0.5159454941749573, "rewards/rejected": -0.5197000503540039, "sft_loss": 0.03754514455795288, "step": 2859 }, { "epoch": 4.135936370209689, "grad_norm": 1.6673291219985102, "learning_rate": 1.8338749952629353e-06, "logits/chosen": -0.8384102582931519, "logits/rejected": -0.6935494542121887, "logps/chosen": -0.05776578187942505, "logps/rejected": -3.5844650268554688, "loss": 0.0767, "odds_ratio_loss": 0.008841626346111298, "rewards/accuracies": 1.0, "rewards/chosen": -0.005776578560471535, "rewards/margins": 0.35266995429992676, "rewards/rejected": -0.35844647884368896, "sft_loss": 0.05776578187942505, "step": 2860 }, { "epoch": 4.137382501807664, "grad_norm": 1.9142765839106552, "learning_rate": 1.831264599512945e-06, "logits/chosen": -0.8089092969894409, "logits/rejected": -0.6261400580406189, "logps/chosen": -0.16545385122299194, "logps/rejected": -4.192132949829102, "loss": 0.1065, "odds_ratio_loss": 0.02832021750509739, "rewards/accuracies": 1.0, "rewards/chosen": -0.016545386984944344, "rewards/margins": 0.40266790986061096, "rewards/rejected": -0.41921332478523254, "sft_loss": 0.16545385122299194, "step": 2861 }, { "epoch": 4.13882863340564, "grad_norm": 1.8508060502337165, "learning_rate": 1.828655511301607e-06, "logits/chosen": -1.029259204864502, "logits/rejected": -0.7229580283164978, "logps/chosen": -0.08430210500955582, "logps/rejected": -4.091892242431641, "loss": 0.0872, "odds_ratio_loss": 0.007961390540003777, "rewards/accuracies": 1.0, "rewards/chosen": -0.008430209942162037, "rewards/margins": 0.4007590413093567, "rewards/rejected": -0.40918928384780884, "sft_loss": 0.08430210500955582, "step": 2862 }, { "epoch": 4.140274765003615, "grad_norm": 1.4236489094364113, "learning_rate": 1.8260477322019478e-06, "logits/chosen": -0.9422646760940552, "logits/rejected": -0.6158483028411865, "logps/chosen": -0.043667394667863846, "logps/rejected": -5.095708847045898, "loss": 0.049, "odds_ratio_loss": 0.0031190637964755297, "rewards/accuracies": 1.0, "rewards/chosen": -0.0043667396530508995, "rewards/margins": 0.5052041411399841, "rewards/rejected": -0.5095708966255188, "sft_loss": 0.043667394667863846, "step": 2863 }, { "epoch": 4.141720896601591, "grad_norm": 2.237085057444961, "learning_rate": 1.8234412637862078e-06, "logits/chosen": -0.7573752403259277, "logits/rejected": -0.5755871534347534, "logps/chosen": -0.036390505731105804, "logps/rejected": -4.381635665893555, "loss": 0.11, "odds_ratio_loss": 0.003340129740536213, "rewards/accuracies": 1.0, "rewards/chosen": -0.003639050293713808, "rewards/margins": 0.4345245063304901, "rewards/rejected": -0.43816354870796204, "sft_loss": 0.036390505731105804, "step": 2864 }, { "epoch": 4.143167028199566, "grad_norm": 1.6249757873443114, "learning_rate": 1.8208361076258347e-06, "logits/chosen": -0.8179000616073608, "logits/rejected": -0.6705646514892578, "logps/chosen": -0.12193619459867477, "logps/rejected": -4.738859176635742, "loss": 0.087, "odds_ratio_loss": 0.0028922702185809612, "rewards/accuracies": 1.0, "rewards/chosen": -0.012193620204925537, "rewards/margins": 0.46169233322143555, "rewards/rejected": -0.4738859534263611, "sft_loss": 0.12193619459867477, "step": 2865 }, { "epoch": 4.144613159797542, "grad_norm": 1.6742919109805106, "learning_rate": 1.8182322652914897e-06, "logits/chosen": -0.6485641002655029, "logits/rejected": -0.580163300037384, "logps/chosen": -0.030021870508790016, "logps/rejected": -4.312215328216553, "loss": 0.0654, "odds_ratio_loss": 0.003744534682482481, "rewards/accuracies": 1.0, "rewards/chosen": -0.0030021872371435165, "rewards/margins": 0.42821940779685974, "rewards/rejected": -0.4312215745449066, "sft_loss": 0.030021870508790016, "step": 2866 }, { "epoch": 4.146059291395517, "grad_norm": 1.5207496797985913, "learning_rate": 1.8156297383530363e-06, "logits/chosen": -0.8672469258308411, "logits/rejected": -0.7313282489776611, "logps/chosen": -0.051602356135845184, "logps/rejected": -6.097007751464844, "loss": 0.0676, "odds_ratio_loss": 0.002992212073877454, "rewards/accuracies": 1.0, "rewards/chosen": -0.005160235799849033, "rewards/margins": 0.6045405864715576, "rewards/rejected": -0.6097007989883423, "sft_loss": 0.051602356135845184, "step": 2867 }, { "epoch": 4.1475054229934925, "grad_norm": 1.6381642095253177, "learning_rate": 1.8130285283795508e-06, "logits/chosen": -0.7583048343658447, "logits/rejected": -0.5695833563804626, "logps/chosen": -0.13401220738887787, "logps/rejected": -4.404128074645996, "loss": 0.0939, "odds_ratio_loss": 0.010725535452365875, "rewards/accuracies": 1.0, "rewards/chosen": -0.013401219621300697, "rewards/margins": 0.42701154947280884, "rewards/rejected": -0.4404127597808838, "sft_loss": 0.13401220738887787, "step": 2868 }, { "epoch": 4.148951554591468, "grad_norm": 1.4680257306046172, "learning_rate": 1.8104286369393087e-06, "logits/chosen": -0.8767812252044678, "logits/rejected": -0.5856321454048157, "logps/chosen": -0.05747556686401367, "logps/rejected": -5.090399742126465, "loss": 0.0814, "odds_ratio_loss": 0.003675619838759303, "rewards/accuracies": 1.0, "rewards/chosen": -0.0057475571520626545, "rewards/margins": 0.503292441368103, "rewards/rejected": -0.5090399980545044, "sft_loss": 0.05747556686401367, "step": 2869 }, { "epoch": 4.150397686189443, "grad_norm": 1.7573313537408448, "learning_rate": 1.807830065599798e-06, "logits/chosen": -0.7944881916046143, "logits/rejected": -0.6999379992485046, "logps/chosen": -0.0795753002166748, "logps/rejected": -5.302948474884033, "loss": 0.0808, "odds_ratio_loss": 0.0038214053492993116, "rewards/accuracies": 1.0, "rewards/chosen": -0.00795752927660942, "rewards/margins": 0.5223373174667358, "rewards/rejected": -0.5302948355674744, "sft_loss": 0.0795753002166748, "step": 2870 }, { "epoch": 4.151843817787419, "grad_norm": 1.6291075861205213, "learning_rate": 1.8052328159277054e-06, "logits/chosen": -0.7494406700134277, "logits/rejected": -0.7102495431900024, "logps/chosen": -0.06435263156890869, "logps/rejected": -4.151299953460693, "loss": 0.0722, "odds_ratio_loss": 0.010602910071611404, "rewards/accuracies": 1.0, "rewards/chosen": -0.006435262970626354, "rewards/margins": 0.4086947441101074, "rewards/rejected": -0.41513001918792725, "sft_loss": 0.06435263156890869, "step": 2871 }, { "epoch": 4.153289949385394, "grad_norm": 1.6436961304644404, "learning_rate": 1.802636889488922e-06, "logits/chosen": -0.8582144379615784, "logits/rejected": -0.6094192266464233, "logps/chosen": -0.05906105786561966, "logps/rejected": -5.2758402824401855, "loss": 0.0643, "odds_ratio_loss": 0.004993016365915537, "rewards/accuracies": 1.0, "rewards/chosen": -0.005906105972826481, "rewards/margins": 0.5216779708862305, "rewards/rejected": -0.5275840759277344, "sft_loss": 0.05906105786561966, "step": 2872 }, { "epoch": 4.154736080983369, "grad_norm": 1.5066252354267282, "learning_rate": 1.8000422878485403e-06, "logits/chosen": -0.8553987741470337, "logits/rejected": -0.684226393699646, "logps/chosen": -0.12316957116127014, "logps/rejected": -3.842941999435425, "loss": 0.0926, "odds_ratio_loss": 0.019793318584561348, "rewards/accuracies": 1.0, "rewards/chosen": -0.012316957116127014, "rewards/margins": 0.3719772398471832, "rewards/rejected": -0.38429421186447144, "sft_loss": 0.12316957116127014, "step": 2873 }, { "epoch": 4.156182212581345, "grad_norm": 1.7609833375052197, "learning_rate": 1.7974490125708575e-06, "logits/chosen": -0.9975032210350037, "logits/rejected": -0.7744351029396057, "logps/chosen": -0.12477146834135056, "logps/rejected": -3.4359312057495117, "loss": 0.0988, "odds_ratio_loss": 0.009972745552659035, "rewards/accuracies": 1.0, "rewards/chosen": -0.0124771473929286, "rewards/margins": 0.3311160206794739, "rewards/rejected": -0.34359318017959595, "sft_loss": 0.12477146834135056, "step": 2874 }, { "epoch": 4.1576283441793205, "grad_norm": 1.518218164522744, "learning_rate": 1.7948570652193688e-06, "logits/chosen": -0.6091439127922058, "logits/rejected": -0.5392559766769409, "logps/chosen": -0.16808342933654785, "logps/rejected": -2.7966959476470947, "loss": 0.0894, "odds_ratio_loss": 0.03804006054997444, "rewards/accuracies": 1.0, "rewards/chosen": -0.016808344051241875, "rewards/margins": 0.2628612816333771, "rewards/rejected": -0.2796696126461029, "sft_loss": 0.16808342933654785, "step": 2875 }, { "epoch": 4.159074475777295, "grad_norm": 1.6362781472337709, "learning_rate": 1.7922664473567672e-06, "logits/chosen": -0.7585278153419495, "logits/rejected": -0.6109793782234192, "logps/chosen": -0.12611831724643707, "logps/rejected": -4.219643592834473, "loss": 0.1081, "odds_ratio_loss": 0.021449480205774307, "rewards/accuracies": 1.0, "rewards/chosen": -0.012611833401024342, "rewards/margins": 0.40935254096984863, "rewards/rejected": -0.4219644367694855, "sft_loss": 0.12611831724643707, "step": 2876 }, { "epoch": 4.160520607375271, "grad_norm": 3.2649128823647127, "learning_rate": 1.7896771605449485e-06, "logits/chosen": -0.7754852175712585, "logits/rejected": -0.6101595163345337, "logps/chosen": -0.1011345386505127, "logps/rejected": -3.0544514656066895, "loss": 0.0989, "odds_ratio_loss": 0.009415505453944206, "rewards/accuracies": 1.0, "rewards/chosen": -0.010113454423844814, "rewards/margins": 0.2953317165374756, "rewards/rejected": -0.3054451644420624, "sft_loss": 0.1011345386505127, "step": 2877 }, { "epoch": 4.161966738973247, "grad_norm": 1.3888856417078521, "learning_rate": 1.7870892063450001e-06, "logits/chosen": -0.753406822681427, "logits/rejected": -0.4384850263595581, "logps/chosen": -0.08360215276479721, "logps/rejected": -3.66217303276062, "loss": 0.0711, "odds_ratio_loss": 0.0076039680279791355, "rewards/accuracies": 1.0, "rewards/chosen": -0.008360215462744236, "rewards/margins": 0.3578570485115051, "rewards/rejected": -0.36621731519699097, "sft_loss": 0.08360215276479721, "step": 2878 }, { "epoch": 4.163412870571222, "grad_norm": 1.529348069798455, "learning_rate": 1.7845025863172127e-06, "logits/chosen": -0.685241162776947, "logits/rejected": -0.6223734617233276, "logps/chosen": -0.06466331332921982, "logps/rejected": -4.321099758148193, "loss": 0.0725, "odds_ratio_loss": 0.009548970498144627, "rewards/accuracies": 1.0, "rewards/chosen": -0.006466331891715527, "rewards/margins": 0.4256436228752136, "rewards/rejected": -0.4321099519729614, "sft_loss": 0.06466331332921982, "step": 2879 }, { "epoch": 4.164859002169197, "grad_norm": 1.3802640702535711, "learning_rate": 1.781917302021067e-06, "logits/chosen": -0.743414044380188, "logits/rejected": -0.6212598085403442, "logps/chosen": -0.08406087011098862, "logps/rejected": -3.3538153171539307, "loss": 0.0755, "odds_ratio_loss": 0.007001823280006647, "rewards/accuracies": 1.0, "rewards/chosen": -0.008406086824834347, "rewards/margins": 0.32697543501853943, "rewards/rejected": -0.33538153767585754, "sft_loss": 0.08406087011098862, "step": 2880 }, { "epoch": 4.166305133767173, "grad_norm": 1.7454934799111688, "learning_rate": 1.7793333550152413e-06, "logits/chosen": -0.9296549558639526, "logits/rejected": -0.6040893197059631, "logps/chosen": -0.09559880197048187, "logps/rejected": -6.568416118621826, "loss": 0.0685, "odds_ratio_loss": 0.002591322874650359, "rewards/accuracies": 1.0, "rewards/chosen": -0.009559880942106247, "rewards/margins": 0.6472817063331604, "rewards/rejected": -0.6568415760993958, "sft_loss": 0.09559880197048187, "step": 2881 }, { "epoch": 4.1677512653651485, "grad_norm": 1.378926994916888, "learning_rate": 1.776750746857605e-06, "logits/chosen": -1.0216419696807861, "logits/rejected": -0.5682446956634521, "logps/chosen": -0.07705073058605194, "logps/rejected": -5.844695091247559, "loss": 0.0884, "odds_ratio_loss": 0.0033558798022568226, "rewards/accuracies": 1.0, "rewards/chosen": -0.0077050733380019665, "rewards/margins": 0.5767644643783569, "rewards/rejected": -0.5844695568084717, "sft_loss": 0.07705073058605194, "step": 2882 }, { "epoch": 4.169197396963123, "grad_norm": 1.8249380721129438, "learning_rate": 1.7741694791052248e-06, "logits/chosen": -0.7748250961303711, "logits/rejected": -0.5840004682540894, "logps/chosen": -0.08540331572294235, "logps/rejected": -5.7757792472839355, "loss": 0.0915, "odds_ratio_loss": 0.0048137810081243515, "rewards/accuracies": 1.0, "rewards/chosen": -0.008540332317352295, "rewards/margins": 0.5690376162528992, "rewards/rejected": -0.5775779485702515, "sft_loss": 0.08540331572294235, "step": 2883 }, { "epoch": 4.170643528561099, "grad_norm": 1.6439984729479422, "learning_rate": 1.7715895533143543e-06, "logits/chosen": -0.9461988806724548, "logits/rejected": -0.7191964983940125, "logps/chosen": -0.08565716445446014, "logps/rejected": -4.468839168548584, "loss": 0.085, "odds_ratio_loss": 0.003522283863276243, "rewards/accuracies": 1.0, "rewards/chosen": -0.008565716445446014, "rewards/margins": 0.4383182227611542, "rewards/rejected": -0.4468839466571808, "sft_loss": 0.08565716445446014, "step": 2884 }, { "epoch": 4.172089660159075, "grad_norm": 1.9779228119632557, "learning_rate": 1.7690109710404433e-06, "logits/chosen": -0.8428412675857544, "logits/rejected": -0.5799111127853394, "logps/chosen": -0.10756391286849976, "logps/rejected": -4.175439357757568, "loss": 0.1062, "odds_ratio_loss": 0.017258528620004654, "rewards/accuracies": 1.0, "rewards/chosen": -0.010756392031908035, "rewards/margins": 0.40678757429122925, "rewards/rejected": -0.4175439774990082, "sft_loss": 0.10756391286849976, "step": 2885 }, { "epoch": 4.1735357917570495, "grad_norm": 1.5163132148550575, "learning_rate": 1.7664337338381253e-06, "logits/chosen": -0.7353506088256836, "logits/rejected": -0.6050468683242798, "logps/chosen": -0.09535052627325058, "logps/rejected": -3.8978962898254395, "loss": 0.086, "odds_ratio_loss": 0.013053749687969685, "rewards/accuracies": 1.0, "rewards/chosen": -0.009535052813589573, "rewards/margins": 0.3802545666694641, "rewards/rejected": -0.3897896409034729, "sft_loss": 0.09535052627325058, "step": 2886 }, { "epoch": 4.174981923355025, "grad_norm": 4.89411470614702, "learning_rate": 1.7638578432612294e-06, "logits/chosen": -0.5543168783187866, "logits/rejected": -0.49109578132629395, "logps/chosen": -0.07704973220825195, "logps/rejected": -5.383688449859619, "loss": 0.0643, "odds_ratio_loss": 0.00851721502840519, "rewards/accuracies": 1.0, "rewards/chosen": -0.007704972755163908, "rewards/margins": 0.5306639671325684, "rewards/rejected": -0.5383689403533936, "sft_loss": 0.07704973220825195, "step": 2887 }, { "epoch": 4.176428054953001, "grad_norm": 1.6508265050328592, "learning_rate": 1.761283300862768e-06, "logits/chosen": -0.732459545135498, "logits/rejected": -0.6587421298027039, "logps/chosen": -0.061337437480688095, "logps/rejected": -3.2690553665161133, "loss": 0.0917, "odds_ratio_loss": 0.026827432215213776, "rewards/accuracies": 1.0, "rewards/chosen": -0.006133743561804295, "rewards/margins": 0.32077181339263916, "rewards/rejected": -0.3269055485725403, "sft_loss": 0.061337437480688095, "step": 2888 }, { "epoch": 4.1778741865509765, "grad_norm": 1.5687060094740681, "learning_rate": 1.7587101081949406e-06, "logits/chosen": -0.871804416179657, "logits/rejected": -0.6253595352172852, "logps/chosen": -0.09505133330821991, "logps/rejected": -4.816754341125488, "loss": 0.1232, "odds_ratio_loss": 0.0039987508207559586, "rewards/accuracies": 1.0, "rewards/chosen": -0.009505132213234901, "rewards/margins": 0.47217029333114624, "rewards/rejected": -0.4816754162311554, "sft_loss": 0.09505133330821991, "step": 2889 }, { "epoch": 4.179320318148951, "grad_norm": 1.574633627267037, "learning_rate": 1.7561382668091383e-06, "logits/chosen": -0.8467705249786377, "logits/rejected": -0.5515745282173157, "logps/chosen": -0.12236753851175308, "logps/rejected": -6.257351875305176, "loss": 0.1053, "odds_ratio_loss": 0.012112165801227093, "rewards/accuracies": 1.0, "rewards/chosen": -0.012236754409968853, "rewards/margins": 0.6134985089302063, "rewards/rejected": -0.6257352232933044, "sft_loss": 0.12236753851175308, "step": 2890 }, { "epoch": 4.180766449746927, "grad_norm": 1.6617557779312644, "learning_rate": 1.7535677782559306e-06, "logits/chosen": -0.6538142561912537, "logits/rejected": -0.5575007200241089, "logps/chosen": -0.0687403753399849, "logps/rejected": -4.553154945373535, "loss": 0.091, "odds_ratio_loss": 0.008355571888387203, "rewards/accuracies": 1.0, "rewards/chosen": -0.006874037906527519, "rewards/margins": 0.44844144582748413, "rewards/rejected": -0.455315500497818, "sft_loss": 0.0687403753399849, "step": 2891 }, { "epoch": 4.182212581344903, "grad_norm": 1.3237025011825474, "learning_rate": 1.7509986440850773e-06, "logits/chosen": -0.8017591834068298, "logits/rejected": -0.6542471051216125, "logps/chosen": -0.04119560867547989, "logps/rejected": -5.426454067230225, "loss": 0.0717, "odds_ratio_loss": 0.006445649079978466, "rewards/accuracies": 1.0, "rewards/chosen": -0.004119561053812504, "rewards/margins": 0.5385258197784424, "rewards/rejected": -0.5426453948020935, "sft_loss": 0.04119560867547989, "step": 2892 }, { "epoch": 4.1836587129428775, "grad_norm": 1.6426153001965083, "learning_rate": 1.748430865845516e-06, "logits/chosen": -0.7238603234291077, "logits/rejected": -0.7122807502746582, "logps/chosen": -0.09050285816192627, "logps/rejected": -3.2810425758361816, "loss": 0.09, "odds_ratio_loss": 0.011915099807083607, "rewards/accuracies": 1.0, "rewards/chosen": -0.009050285443663597, "rewards/margins": 0.31905397772789, "rewards/rejected": -0.32810425758361816, "sft_loss": 0.09050285816192627, "step": 2893 }, { "epoch": 4.185104844540853, "grad_norm": 1.5864270160960752, "learning_rate": 1.745864445085373e-06, "logits/chosen": -1.0129873752593994, "logits/rejected": -0.6170842051506042, "logps/chosen": -0.045922402292490005, "logps/rejected": -5.002997398376465, "loss": 0.0947, "odds_ratio_loss": 0.0038542391266673803, "rewards/accuracies": 1.0, "rewards/chosen": -0.004592240788042545, "rewards/margins": 0.49570751190185547, "rewards/rejected": -0.5002997517585754, "sft_loss": 0.045922402292490005, "step": 2894 }, { "epoch": 4.186550976138829, "grad_norm": 4.730722961719703, "learning_rate": 1.7432993833519514e-06, "logits/chosen": -0.7456246018409729, "logits/rejected": -0.638870120048523, "logps/chosen": -0.09694996476173401, "logps/rejected": -5.234409809112549, "loss": 0.0961, "odds_ratio_loss": 0.014911282807588577, "rewards/accuracies": 1.0, "rewards/chosen": -0.009694996289908886, "rewards/margins": 0.5137460231781006, "rewards/rejected": -0.5234410166740417, "sft_loss": 0.09694996476173401, "step": 2895 }, { "epoch": 4.187997107736804, "grad_norm": 1.7372021790081584, "learning_rate": 1.7407356821917362e-06, "logits/chosen": -0.7853217124938965, "logits/rejected": -0.5458134412765503, "logps/chosen": -0.1326024830341339, "logps/rejected": -5.669880390167236, "loss": 0.1053, "odds_ratio_loss": 0.015873540192842484, "rewards/accuracies": 1.0, "rewards/chosen": -0.01326024904847145, "rewards/margins": 0.5537277460098267, "rewards/rejected": -0.5669880509376526, "sft_loss": 0.1326024830341339, "step": 2896 }, { "epoch": 4.189443239334779, "grad_norm": 1.7437826506465477, "learning_rate": 1.7381733431503919e-06, "logits/chosen": -0.7777459621429443, "logits/rejected": -0.5212811827659607, "logps/chosen": -0.06154448539018631, "logps/rejected": -6.10374641418457, "loss": 0.0601, "odds_ratio_loss": 0.00723948935046792, "rewards/accuracies": 1.0, "rewards/chosen": -0.006154448725283146, "rewards/margins": 0.6042202711105347, "rewards/rejected": -0.6103746891021729, "sft_loss": 0.06154448539018631, "step": 2897 }, { "epoch": 4.190889370932755, "grad_norm": 1.6087185240794226, "learning_rate": 1.7356123677727634e-06, "logits/chosen": -0.8092604875564575, "logits/rejected": -0.6327940225601196, "logps/chosen": -0.0712791383266449, "logps/rejected": -6.498178958892822, "loss": 0.0952, "odds_ratio_loss": 0.015109434723854065, "rewards/accuracies": 1.0, "rewards/chosen": -0.007127914112061262, "rewards/margins": 0.6426900029182434, "rewards/rejected": -0.649817943572998, "sft_loss": 0.0712791383266449, "step": 2898 }, { "epoch": 4.19233550253073, "grad_norm": 1.810061844063369, "learning_rate": 1.7330527576028713e-06, "logits/chosen": -1.077024221420288, "logits/rejected": -0.8910109996795654, "logps/chosen": -0.14297831058502197, "logps/rejected": -3.079667806625366, "loss": 0.105, "odds_ratio_loss": 0.01574338600039482, "rewards/accuracies": 1.0, "rewards/chosen": -0.014297831803560257, "rewards/margins": 0.2936689555644989, "rewards/rejected": -0.30796679854393005, "sft_loss": 0.14297831058502197, "step": 2899 }, { "epoch": 4.1937816341287055, "grad_norm": 1.4080726234451635, "learning_rate": 1.7304945141839156e-06, "logits/chosen": -0.8134248852729797, "logits/rejected": -0.6756539344787598, "logps/chosen": -0.06007365137338638, "logps/rejected": -5.442124366760254, "loss": 0.0702, "odds_ratio_loss": 0.0070592500269412994, "rewards/accuracies": 1.0, "rewards/chosen": -0.006007364951074123, "rewards/margins": 0.5382050275802612, "rewards/rejected": -0.5442124605178833, "sft_loss": 0.06007365137338638, "step": 2900 }, { "epoch": 4.195227765726681, "grad_norm": 1.8129753504901793, "learning_rate": 1.7279376390582683e-06, "logits/chosen": -0.814599871635437, "logits/rejected": -0.5579865574836731, "logps/chosen": -0.06395604461431503, "logps/rejected": -6.123435974121094, "loss": 0.085, "odds_ratio_loss": 0.0038474637549370527, "rewards/accuracies": 1.0, "rewards/chosen": -0.006395603530108929, "rewards/margins": 0.6059479713439941, "rewards/rejected": -0.6123435497283936, "sft_loss": 0.06395604461431503, "step": 2901 }, { "epoch": 4.196673897324657, "grad_norm": 1.7496011317153113, "learning_rate": 1.725382133767482e-06, "logits/chosen": -0.9168472290039062, "logits/rejected": -0.6373493671417236, "logps/chosen": -0.14649318158626556, "logps/rejected": -6.127131462097168, "loss": 0.0838, "odds_ratio_loss": 0.01623375155031681, "rewards/accuracies": 1.0, "rewards/chosen": -0.014649318531155586, "rewards/margins": 0.5980637669563293, "rewards/rejected": -0.612713098526001, "sft_loss": 0.14649318158626556, "step": 2902 }, { "epoch": 4.198120028922632, "grad_norm": 1.8093572721032025, "learning_rate": 1.7228279998522791e-06, "logits/chosen": -0.8479722738265991, "logits/rejected": -0.6456978917121887, "logps/chosen": -0.0714811310172081, "logps/rejected": -4.065431594848633, "loss": 0.0833, "odds_ratio_loss": 0.015304593369364738, "rewards/accuracies": 1.0, "rewards/chosen": -0.00714811310172081, "rewards/margins": 0.39939504861831665, "rewards/rejected": -0.40654319524765015, "sft_loss": 0.0714811310172081, "step": 2903 }, { "epoch": 4.199566160520607, "grad_norm": 1.7050155520074486, "learning_rate": 1.7202752388525546e-06, "logits/chosen": -0.7397294044494629, "logits/rejected": -0.6496505737304688, "logps/chosen": -0.05917609483003616, "logps/rejected": -4.717207908630371, "loss": 0.0827, "odds_ratio_loss": 0.0097846370190382, "rewards/accuracies": 1.0, "rewards/chosen": -0.005917609669268131, "rewards/margins": 0.46580320596694946, "rewards/rejected": -0.471720814704895, "sft_loss": 0.05917609483003616, "step": 2904 }, { "epoch": 4.201012292118583, "grad_norm": 1.8064910660008877, "learning_rate": 1.7177238523073804e-06, "logits/chosen": -0.7527694702148438, "logits/rejected": -0.6076477766036987, "logps/chosen": -0.12672166526317596, "logps/rejected": -3.709519863128662, "loss": 0.0817, "odds_ratio_loss": 0.025193804875016212, "rewards/accuracies": 1.0, "rewards/chosen": -0.012672166340053082, "rewards/margins": 0.358279824256897, "rewards/rejected": -0.3709520101547241, "sft_loss": 0.12672166526317596, "step": 2905 }, { "epoch": 4.202458423716558, "grad_norm": 1.4769855495178907, "learning_rate": 1.7151738417549945e-06, "logits/chosen": -0.8476330041885376, "logits/rejected": -0.7080752849578857, "logps/chosen": -0.08576930314302444, "logps/rejected": -3.353492021560669, "loss": 0.0825, "odds_ratio_loss": 0.0258241668343544, "rewards/accuracies": 1.0, "rewards/chosen": -0.008576931431889534, "rewards/margins": 0.3267722725868225, "rewards/rejected": -0.3353492021560669, "sft_loss": 0.08576930314302444, "step": 2906 }, { "epoch": 4.2039045553145336, "grad_norm": 1.6047502239398708, "learning_rate": 1.7126252087328106e-06, "logits/chosen": -0.7401002645492554, "logits/rejected": -0.48686888813972473, "logps/chosen": -0.09672141075134277, "logps/rejected": -5.065194129943848, "loss": 0.0748, "odds_ratio_loss": 0.011058647185564041, "rewards/accuracies": 1.0, "rewards/chosen": -0.009672141633927822, "rewards/margins": 0.4968472421169281, "rewards/rejected": -0.5065193772315979, "sft_loss": 0.09672141075134277, "step": 2907 }, { "epoch": 4.205350686912509, "grad_norm": 1.784826024305332, "learning_rate": 1.710077954777406e-06, "logits/chosen": -0.947332501411438, "logits/rejected": -0.6513444185256958, "logps/chosen": -0.11616423726081848, "logps/rejected": -5.848725318908691, "loss": 0.0847, "odds_ratio_loss": 0.021783018484711647, "rewards/accuracies": 1.0, "rewards/chosen": -0.011616423726081848, "rewards/margins": 0.5732560753822327, "rewards/rejected": -0.5848724842071533, "sft_loss": 0.11616423726081848, "step": 2908 }, { "epoch": 4.206796818510484, "grad_norm": 1.6840418840730764, "learning_rate": 1.7075320814245325e-06, "logits/chosen": -0.7306966781616211, "logits/rejected": -0.5873762369155884, "logps/chosen": -0.10011477768421173, "logps/rejected": -4.062368392944336, "loss": 0.089, "odds_ratio_loss": 0.009407087229192257, "rewards/accuracies": 1.0, "rewards/chosen": -0.010011478327214718, "rewards/margins": 0.39622533321380615, "rewards/rejected": -0.40623682737350464, "sft_loss": 0.10011477768421173, "step": 2909 }, { "epoch": 4.20824295010846, "grad_norm": 2.1766607536331595, "learning_rate": 1.7049875902091046e-06, "logits/chosen": -0.9869181513786316, "logits/rejected": -0.6725652813911438, "logps/chosen": -0.07069652527570724, "logps/rejected": -5.175534725189209, "loss": 0.0693, "odds_ratio_loss": 0.004759188741445541, "rewards/accuracies": 1.0, "rewards/chosen": -0.007069652434438467, "rewards/margins": 0.5104838013648987, "rewards/rejected": -0.517553448677063, "sft_loss": 0.07069652527570724, "step": 2910 }, { "epoch": 4.209689081706435, "grad_norm": 1.3402883750903443, "learning_rate": 1.7024444826652067e-06, "logits/chosen": -0.5692568421363831, "logits/rejected": -0.4671230912208557, "logps/chosen": -0.046070732176303864, "logps/rejected": -3.63814640045166, "loss": 0.0817, "odds_ratio_loss": 0.004079668782651424, "rewards/accuracies": 1.0, "rewards/chosen": -0.004607073031365871, "rewards/margins": 0.3592075705528259, "rewards/rejected": -0.36381465196609497, "sft_loss": 0.046070732176303864, "step": 2911 }, { "epoch": 4.211135213304411, "grad_norm": 1.8269769593137486, "learning_rate": 1.6999027603260853e-06, "logits/chosen": -1.0144546031951904, "logits/rejected": -0.7266017198562622, "logps/chosen": -0.12296151369810104, "logps/rejected": -4.764224529266357, "loss": 0.0957, "odds_ratio_loss": 0.00944902841001749, "rewards/accuracies": 1.0, "rewards/chosen": -0.012296151369810104, "rewards/margins": 0.464126318693161, "rewards/rejected": -0.4764224588871002, "sft_loss": 0.12296151369810104, "step": 2912 }, { "epoch": 4.212581344902386, "grad_norm": 1.6723520264588212, "learning_rate": 1.697362424724158e-06, "logits/chosen": -0.8207547664642334, "logits/rejected": -0.6812379956245422, "logps/chosen": -0.06191746145486832, "logps/rejected": -5.384179592132568, "loss": 0.0729, "odds_ratio_loss": 0.006640473380684853, "rewards/accuracies": 1.0, "rewards/chosen": -0.006191745866090059, "rewards/margins": 0.5322262644767761, "rewards/rejected": -0.5384179353713989, "sft_loss": 0.06191746145486832, "step": 2913 }, { "epoch": 4.214027476500362, "grad_norm": 1.6031455868403621, "learning_rate": 1.6948234773909995e-06, "logits/chosen": -0.9777011871337891, "logits/rejected": -0.8530696630477905, "logps/chosen": -0.09695122390985489, "logps/rejected": -4.301798343658447, "loss": 0.1181, "odds_ratio_loss": 0.008988775312900543, "rewards/accuracies": 1.0, "rewards/chosen": -0.009695122018456459, "rewards/margins": 0.4204846918582916, "rewards/rejected": -0.4301798343658447, "sft_loss": 0.09695122390985489, "step": 2914 }, { "epoch": 4.215473608098337, "grad_norm": 1.6686746446757759, "learning_rate": 1.6922859198573516e-06, "logits/chosen": -0.7078945636749268, "logits/rejected": -0.5031204223632812, "logps/chosen": -0.05042143911123276, "logps/rejected": -4.290203094482422, "loss": 0.0847, "odds_ratio_loss": 0.007931235246360302, "rewards/accuracies": 1.0, "rewards/chosen": -0.005042144097387791, "rewards/margins": 0.42397817969322205, "rewards/rejected": -0.4290202856063843, "sft_loss": 0.05042143911123276, "step": 2915 }, { "epoch": 4.216919739696312, "grad_norm": 1.3601920987396054, "learning_rate": 1.6897497536531188e-06, "logits/chosen": -0.8085181713104248, "logits/rejected": -0.473737895488739, "logps/chosen": -0.04474177956581116, "logps/rejected": -4.583406448364258, "loss": 0.0655, "odds_ratio_loss": 0.0023093842901289463, "rewards/accuracies": 1.0, "rewards/chosen": -0.004474177956581116, "rewards/margins": 0.45386651158332825, "rewards/rejected": -0.4583406448364258, "sft_loss": 0.04474177956581116, "step": 2916 }, { "epoch": 4.218365871294288, "grad_norm": 1.9745610489643932, "learning_rate": 1.6872149803073642e-06, "logits/chosen": -0.9446933269500732, "logits/rejected": -0.632041871547699, "logps/chosen": -0.07594672590494156, "logps/rejected": -4.689000129699707, "loss": 0.0593, "odds_ratio_loss": 0.012281806208193302, "rewards/accuracies": 1.0, "rewards/chosen": -0.007594672497361898, "rewards/margins": 0.4613053798675537, "rewards/rejected": -0.46890002489089966, "sft_loss": 0.07594672590494156, "step": 2917 }, { "epoch": 4.219812002892263, "grad_norm": 1.5163951778369975, "learning_rate": 1.6846816013483114e-06, "logits/chosen": -0.8689394593238831, "logits/rejected": -0.5733392834663391, "logps/chosen": -0.04879632219672203, "logps/rejected": -3.380754232406616, "loss": 0.0642, "odds_ratio_loss": 0.004706839565187693, "rewards/accuracies": 1.0, "rewards/chosen": -0.004879632964730263, "rewards/margins": 0.3331957459449768, "rewards/rejected": -0.3380753993988037, "sft_loss": 0.04879632219672203, "step": 2918 }, { "epoch": 4.221258134490238, "grad_norm": 1.5403147100350525, "learning_rate": 1.6821496183033426e-06, "logits/chosen": -0.6562444567680359, "logits/rejected": -0.5436285138130188, "logps/chosen": -0.06507329642772675, "logps/rejected": -3.918442964553833, "loss": 0.1034, "odds_ratio_loss": 0.004862302914261818, "rewards/accuracies": 1.0, "rewards/chosen": -0.006507330108433962, "rewards/margins": 0.3853369653224945, "rewards/rejected": -0.3918442726135254, "sft_loss": 0.06507329642772675, "step": 2919 }, { "epoch": 4.222704266088214, "grad_norm": 1.9639610620744994, "learning_rate": 1.6796190326990035e-06, "logits/chosen": -0.7960516810417175, "logits/rejected": -0.5733585357666016, "logps/chosen": -0.0999024510383606, "logps/rejected": -5.8444013595581055, "loss": 0.0888, "odds_ratio_loss": 0.001729599549435079, "rewards/accuracies": 1.0, "rewards/chosen": -0.00999024510383606, "rewards/margins": 0.574449896812439, "rewards/rejected": -0.5844402313232422, "sft_loss": 0.0999024510383606, "step": 2920 }, { "epoch": 4.22415039768619, "grad_norm": 1.2356981106459832, "learning_rate": 1.6770898460609898e-06, "logits/chosen": -0.9516746401786804, "logits/rejected": -0.6569518446922302, "logps/chosen": -0.04203661531209946, "logps/rejected": -4.199306488037109, "loss": 0.0365, "odds_ratio_loss": 0.004309549927711487, "rewards/accuracies": 1.0, "rewards/chosen": -0.004203661344945431, "rewards/margins": 0.4157269597053528, "rewards/rejected": -0.4199306070804596, "sft_loss": 0.04203661531209946, "step": 2921 }, { "epoch": 4.225596529284164, "grad_norm": 1.7615373206972187, "learning_rate": 1.674562059914161e-06, "logits/chosen": -0.8067028522491455, "logits/rejected": -0.7631421089172363, "logps/chosen": -0.07550188153982162, "logps/rejected": -5.086093902587891, "loss": 0.1018, "odds_ratio_loss": 0.0082497987896204, "rewards/accuracies": 1.0, "rewards/chosen": -0.007550188340246677, "rewards/margins": 0.5010592341423035, "rewards/rejected": -0.508609414100647, "sft_loss": 0.07550188153982162, "step": 2922 }, { "epoch": 4.22704266088214, "grad_norm": 1.6479858436312316, "learning_rate": 1.6720356757825256e-06, "logits/chosen": -0.8297333717346191, "logits/rejected": -0.7453821301460266, "logps/chosen": -0.16031071543693542, "logps/rejected": -4.488302230834961, "loss": 0.098, "odds_ratio_loss": 0.019046666100621223, "rewards/accuracies": 1.0, "rewards/chosen": -0.016031071543693542, "rewards/margins": 0.43279916048049927, "rewards/rejected": -0.448830246925354, "sft_loss": 0.16031071543693542, "step": 2923 }, { "epoch": 4.228488792480116, "grad_norm": 1.7918703336478432, "learning_rate": 1.669510695189253e-06, "logits/chosen": -0.833388090133667, "logits/rejected": -0.6928578615188599, "logps/chosen": -0.0595964752137661, "logps/rejected": -4.915972709655762, "loss": 0.0828, "odds_ratio_loss": 0.005791317671537399, "rewards/accuracies": 1.0, "rewards/chosen": -0.005959647241979837, "rewards/margins": 0.4856376349925995, "rewards/rejected": -0.4915972352027893, "sft_loss": 0.0595964752137661, "step": 2924 }, { "epoch": 4.2299349240780915, "grad_norm": 1.7973661105477898, "learning_rate": 1.6669871196566607e-06, "logits/chosen": -0.6049919128417969, "logits/rejected": -0.48009204864501953, "logps/chosen": -0.09875990450382233, "logps/rejected": -4.366959571838379, "loss": 0.088, "odds_ratio_loss": 0.009393557906150818, "rewards/accuracies": 1.0, "rewards/chosen": -0.009875990450382233, "rewards/margins": 0.4268200099468231, "rewards/rejected": -0.43669599294662476, "sft_loss": 0.09875990450382233, "step": 2925 }, { "epoch": 4.231381055676066, "grad_norm": 1.3643896640774793, "learning_rate": 1.6644649507062241e-06, "logits/chosen": -0.8335921764373779, "logits/rejected": -0.5632571578025818, "logps/chosen": -0.057263340801000595, "logps/rejected": -5.25675630569458, "loss": 0.0885, "odds_ratio_loss": 0.0035503399558365345, "rewards/accuracies": 1.0, "rewards/chosen": -0.00572633370757103, "rewards/margins": 0.5199492573738098, "rewards/rejected": -0.5256756544113159, "sft_loss": 0.057263340801000595, "step": 2926 }, { "epoch": 4.232827187274042, "grad_norm": 1.7117069678343029, "learning_rate": 1.6619441898585676e-06, "logits/chosen": -0.860699474811554, "logits/rejected": -0.7995709180831909, "logps/chosen": -0.06467077136039734, "logps/rejected": -3.649376392364502, "loss": 0.1072, "odds_ratio_loss": 0.009878093376755714, "rewards/accuracies": 1.0, "rewards/chosen": -0.006467076949775219, "rewards/margins": 0.3584705591201782, "rewards/rejected": -0.36493760347366333, "sft_loss": 0.06467077136039734, "step": 2927 }, { "epoch": 4.234273318872018, "grad_norm": 1.4866554904397025, "learning_rate": 1.6594248386334649e-06, "logits/chosen": -0.9298467040061951, "logits/rejected": -0.7085505723953247, "logps/chosen": -0.07608020305633545, "logps/rejected": -4.444356918334961, "loss": 0.0761, "odds_ratio_loss": 0.004563465248793364, "rewards/accuracies": 1.0, "rewards/chosen": -0.007608020678162575, "rewards/margins": 0.43682771921157837, "rewards/rejected": -0.444435715675354, "sft_loss": 0.07608020305633545, "step": 2928 }, { "epoch": 4.235719450469992, "grad_norm": 1.4014182752061624, "learning_rate": 1.6569068985498457e-06, "logits/chosen": -0.943458616733551, "logits/rejected": -0.849433183670044, "logps/chosen": -0.051664408296346664, "logps/rejected": -4.948188781738281, "loss": 0.055, "odds_ratio_loss": 0.006422580685466528, "rewards/accuracies": 1.0, "rewards/chosen": -0.005166441202163696, "rewards/margins": 0.4896524250507355, "rewards/rejected": -0.49481892585754395, "sft_loss": 0.051664408296346664, "step": 2929 }, { "epoch": 4.237165582067968, "grad_norm": 1.854606938644546, "learning_rate": 1.6543903711257832e-06, "logits/chosen": -0.7278324365615845, "logits/rejected": -0.4564022421836853, "logps/chosen": -0.10963063687086105, "logps/rejected": -4.744562149047852, "loss": 0.0732, "odds_ratio_loss": 0.009700143709778786, "rewards/accuracies": 1.0, "rewards/chosen": -0.01096306461840868, "rewards/margins": 0.46349310874938965, "rewards/rejected": -0.47445619106292725, "sft_loss": 0.10963063687086105, "step": 2930 }, { "epoch": 4.238611713665944, "grad_norm": 1.3679336336823964, "learning_rate": 1.651875257878503e-06, "logits/chosen": -0.8306885361671448, "logits/rejected": -0.7784897089004517, "logps/chosen": -0.05908175930380821, "logps/rejected": -4.5930585861206055, "loss": 0.0494, "odds_ratio_loss": 0.011198028922080994, "rewards/accuracies": 1.0, "rewards/chosen": -0.005908175837248564, "rewards/margins": 0.4533976912498474, "rewards/rejected": -0.45930585265159607, "sft_loss": 0.05908175930380821, "step": 2931 }, { "epoch": 4.240057845263919, "grad_norm": 1.495799549172853, "learning_rate": 1.6493615603243733e-06, "logits/chosen": -0.5538917183876038, "logits/rejected": -0.5523948073387146, "logps/chosen": -0.07916896045207977, "logps/rejected": -4.839788436889648, "loss": 0.0927, "odds_ratio_loss": 0.06816166639328003, "rewards/accuracies": 0.9375, "rewards/chosen": -0.007916895672678947, "rewards/margins": 0.47606196999549866, "rewards/rejected": -0.48397886753082275, "sft_loss": 0.07916896045207977, "step": 2932 }, { "epoch": 4.241503976861894, "grad_norm": 1.9770485452901845, "learning_rate": 1.6468492799789155e-06, "logits/chosen": -0.8710498809814453, "logits/rejected": -0.7079587578773499, "logps/chosen": -0.037493444979190826, "logps/rejected": -4.81158447265625, "loss": 0.0908, "odds_ratio_loss": 0.004065762739628553, "rewards/accuracies": 1.0, "rewards/chosen": -0.0037493444979190826, "rewards/margins": 0.47740909457206726, "rewards/rejected": -0.48115843534469604, "sft_loss": 0.037493444979190826, "step": 2933 }, { "epoch": 4.24295010845987, "grad_norm": 1.7584110563510973, "learning_rate": 1.6443384183567907e-06, "logits/chosen": -0.779168963432312, "logits/rejected": -0.7387828826904297, "logps/chosen": -0.12338142096996307, "logps/rejected": -4.761046409606934, "loss": 0.0934, "odds_ratio_loss": 0.015843737870454788, "rewards/accuracies": 1.0, "rewards/chosen": -0.012338140979409218, "rewards/margins": 0.463766485452652, "rewards/rejected": -0.47610461711883545, "sft_loss": 0.12338142096996307, "step": 2934 }, { "epoch": 4.244396240057846, "grad_norm": 1.7602609188825338, "learning_rate": 1.6418289769718072e-06, "logits/chosen": -0.8570691347122192, "logits/rejected": -0.7762001752853394, "logps/chosen": -0.09432914108037949, "logps/rejected": -4.270380973815918, "loss": 0.1054, "odds_ratio_loss": 0.009115164168179035, "rewards/accuracies": 1.0, "rewards/chosen": -0.009432912804186344, "rewards/margins": 0.4176052212715149, "rewards/rejected": -0.4270381033420563, "sft_loss": 0.09432914108037949, "step": 2935 }, { "epoch": 4.2458423716558205, "grad_norm": 1.7361866931127048, "learning_rate": 1.6393209573369146e-06, "logits/chosen": -0.934897780418396, "logits/rejected": -0.7846497297286987, "logps/chosen": -0.07415792346000671, "logps/rejected": -4.217360496520996, "loss": 0.1052, "odds_ratio_loss": 0.011329401284456253, "rewards/accuracies": 1.0, "rewards/chosen": -0.007415792904794216, "rewards/margins": 0.41432029008865356, "rewards/rejected": -0.42173606157302856, "sft_loss": 0.07415792346000671, "step": 2936 }, { "epoch": 4.247288503253796, "grad_norm": 1.622613194938224, "learning_rate": 1.6368143609642102e-06, "logits/chosen": -0.8236643075942993, "logits/rejected": -0.6638423204421997, "logps/chosen": -0.10684844851493835, "logps/rejected": -4.309247016906738, "loss": 0.0832, "odds_ratio_loss": 0.015189705416560173, "rewards/accuracies": 1.0, "rewards/chosen": -0.01068484503775835, "rewards/margins": 0.4202398657798767, "rewards/rejected": -0.4309247136116028, "sft_loss": 0.10684844851493835, "step": 2937 }, { "epoch": 4.248734634851772, "grad_norm": 1.4281181979339255, "learning_rate": 1.6343091893649282e-06, "logits/chosen": -0.64753657579422, "logits/rejected": -0.5138456225395203, "logps/chosen": -0.029012585058808327, "logps/rejected": -6.18804931640625, "loss": 0.0521, "odds_ratio_loss": 0.0036126519553363323, "rewards/accuracies": 1.0, "rewards/chosen": -0.002901258412748575, "rewards/margins": 0.6159037351608276, "rewards/rejected": -0.618804931640625, "sft_loss": 0.029012585058808327, "step": 2938 }, { "epoch": 4.250180766449747, "grad_norm": 1.5929975903122693, "learning_rate": 1.6318054440494473e-06, "logits/chosen": -0.7193521857261658, "logits/rejected": -0.6249187588691711, "logps/chosen": -0.07214511930942535, "logps/rejected": -5.506035804748535, "loss": 0.1076, "odds_ratio_loss": 0.010497845709323883, "rewards/accuracies": 1.0, "rewards/chosen": -0.007214512676000595, "rewards/margins": 0.543389081954956, "rewards/rejected": -0.5506036281585693, "sft_loss": 0.07214511930942535, "step": 2939 }, { "epoch": 4.251626898047722, "grad_norm": 1.6636318072838059, "learning_rate": 1.6293031265272834e-06, "logits/chosen": -0.7778177261352539, "logits/rejected": -0.7233993411064148, "logps/chosen": -0.05430297553539276, "logps/rejected": -3.162222385406494, "loss": 0.0946, "odds_ratio_loss": 0.010178321041166782, "rewards/accuracies": 1.0, "rewards/chosen": -0.005430297926068306, "rewards/margins": 0.310791939496994, "rewards/rejected": -0.31622225046157837, "sft_loss": 0.05430297553539276, "step": 2940 }, { "epoch": 4.253073029645698, "grad_norm": 1.677532945172881, "learning_rate": 1.6268022383070949e-06, "logits/chosen": -0.6373997926712036, "logits/rejected": -0.579150915145874, "logps/chosen": -0.07864230126142502, "logps/rejected": -4.474689483642578, "loss": 0.0894, "odds_ratio_loss": 0.022512556985020638, "rewards/accuracies": 1.0, "rewards/chosen": -0.007864230312407017, "rewards/margins": 0.4396046996116638, "rewards/rejected": -0.44746896624565125, "sft_loss": 0.07864230126142502, "step": 2941 }, { "epoch": 4.254519161243673, "grad_norm": 1.6819675796686997, "learning_rate": 1.6243027808966763e-06, "logits/chosen": -0.9810624122619629, "logits/rejected": -0.6949055194854736, "logps/chosen": -0.1464422345161438, "logps/rejected": -4.525935649871826, "loss": 0.0808, "odds_ratio_loss": 0.008545887656509876, "rewards/accuracies": 1.0, "rewards/chosen": -0.01464422419667244, "rewards/margins": 0.43794935941696167, "rewards/rejected": -0.4525935649871826, "sft_loss": 0.1464422345161438, "step": 2942 }, { "epoch": 4.2559652928416485, "grad_norm": 1.5653710384809179, "learning_rate": 1.6218047558029574e-06, "logits/chosen": -0.9486204385757446, "logits/rejected": -0.6342113018035889, "logps/chosen": -0.09254883229732513, "logps/rejected": -4.13349723815918, "loss": 0.1043, "odds_ratio_loss": 0.009536804631352425, "rewards/accuracies": 1.0, "rewards/chosen": -0.009254883043467999, "rewards/margins": 0.4040948152542114, "rewards/rejected": -0.4133497178554535, "sft_loss": 0.09254883229732513, "step": 2943 }, { "epoch": 4.257411424439624, "grad_norm": 1.8681108971211955, "learning_rate": 1.6193081645320098e-06, "logits/chosen": -0.7743822336196899, "logits/rejected": -0.5420173406600952, "logps/chosen": -0.0828114002943039, "logps/rejected": -4.951780319213867, "loss": 0.0878, "odds_ratio_loss": 0.01936756819486618, "rewards/accuracies": 1.0, "rewards/chosen": -0.00828113965690136, "rewards/margins": 0.4868968725204468, "rewards/rejected": -0.4951780438423157, "sft_loss": 0.0828114002943039, "step": 2944 }, { "epoch": 4.258857556037599, "grad_norm": 1.736072625058886, "learning_rate": 1.6168130085890353e-06, "logits/chosen": -0.9736768007278442, "logits/rejected": -0.6207584738731384, "logps/chosen": -0.07031136751174927, "logps/rejected": -2.8038110733032227, "loss": 0.0748, "odds_ratio_loss": 0.009815733879804611, "rewards/accuracies": 1.0, "rewards/chosen": -0.007031137123703957, "rewards/margins": 0.27334994077682495, "rewards/rejected": -0.28038108348846436, "sft_loss": 0.07031136751174927, "step": 2945 }, { "epoch": 4.260303687635575, "grad_norm": 1.8051801795448943, "learning_rate": 1.6143192894783751e-06, "logits/chosen": -0.671993613243103, "logits/rejected": -0.5487344264984131, "logps/chosen": -0.1085626408457756, "logps/rejected": -4.188941478729248, "loss": 0.0862, "odds_ratio_loss": 0.004737728741019964, "rewards/accuracies": 1.0, "rewards/chosen": -0.010856264270842075, "rewards/margins": 0.4080379009246826, "rewards/rejected": -0.4188941717147827, "sft_loss": 0.1085626408457756, "step": 2946 }, { "epoch": 4.26174981923355, "grad_norm": 1.3776328659708403, "learning_rate": 1.611827008703499e-06, "logits/chosen": -0.7267762422561646, "logits/rejected": -0.4366953372955322, "logps/chosen": -0.10525096952915192, "logps/rejected": -3.887083053588867, "loss": 0.0738, "odds_ratio_loss": 0.007737172767519951, "rewards/accuracies": 1.0, "rewards/chosen": -0.010525096207857132, "rewards/margins": 0.3781832158565521, "rewards/rejected": -0.38870832324028015, "sft_loss": 0.10525096952915192, "step": 2947 }, { "epoch": 4.263195950831526, "grad_norm": 1.6865062513248021, "learning_rate": 1.6093361677670157e-06, "logits/chosen": -0.5784760117530823, "logits/rejected": -0.5577380657196045, "logps/chosen": -0.03789515048265457, "logps/rejected": -4.079635143280029, "loss": 0.0948, "odds_ratio_loss": 0.008148334920406342, "rewards/accuracies": 1.0, "rewards/chosen": -0.003789515234529972, "rewards/margins": 0.4041740298271179, "rewards/rejected": -0.4079635441303253, "sft_loss": 0.03789515048265457, "step": 2948 }, { "epoch": 4.264642082429501, "grad_norm": 1.573544466630816, "learning_rate": 1.6068467681706602e-06, "logits/chosen": -0.6730165481567383, "logits/rejected": -0.38399767875671387, "logps/chosen": -0.05993356555700302, "logps/rejected": -5.047731399536133, "loss": 0.0711, "odds_ratio_loss": 0.002913933712989092, "rewards/accuracies": 1.0, "rewards/chosen": -0.005993356462568045, "rewards/margins": 0.4987798035144806, "rewards/rejected": -0.5047731399536133, "sft_loss": 0.05993356555700302, "step": 2949 }, { "epoch": 4.2660882140274765, "grad_norm": 1.4803755685437572, "learning_rate": 1.6043588114153016e-06, "logits/chosen": -0.8042940497398376, "logits/rejected": -0.545076847076416, "logps/chosen": -0.04276617616415024, "logps/rejected": -4.222831726074219, "loss": 0.0499, "odds_ratio_loss": 0.004342740401625633, "rewards/accuracies": 1.0, "rewards/chosen": -0.0042766183614730835, "rewards/margins": 0.4180065393447876, "rewards/rejected": -0.4222831726074219, "sft_loss": 0.04276617616415024, "step": 2950 }, { "epoch": 4.267534345625452, "grad_norm": 1.546044165144899, "learning_rate": 1.601872299000936e-06, "logits/chosen": -0.830041766166687, "logits/rejected": -0.49972018599510193, "logps/chosen": -0.17610527575016022, "logps/rejected": -4.997128486633301, "loss": 0.1124, "odds_ratio_loss": 0.016635987907648087, "rewards/accuracies": 1.0, "rewards/chosen": -0.01761052943766117, "rewards/margins": 0.48210230469703674, "rewards/rejected": -0.49971282482147217, "sft_loss": 0.17610527575016022, "step": 2951 }, { "epoch": 4.268980477223427, "grad_norm": 1.6893484365315183, "learning_rate": 1.599387232426695e-06, "logits/chosen": -0.8119892477989197, "logits/rejected": -0.5879321694374084, "logps/chosen": -0.09631012380123138, "logps/rejected": -4.048828601837158, "loss": 0.081, "odds_ratio_loss": 0.006678726989775896, "rewards/accuracies": 1.0, "rewards/chosen": -0.009631011635065079, "rewards/margins": 0.3952518701553345, "rewards/rejected": -0.40488284826278687, "sft_loss": 0.09631012380123138, "step": 2952 }, { "epoch": 4.270426608821403, "grad_norm": 1.507223223978156, "learning_rate": 1.5969036131908302e-06, "logits/chosen": -0.9762201905250549, "logits/rejected": -0.7047141790390015, "logps/chosen": -0.05365900322794914, "logps/rejected": -4.9685821533203125, "loss": 0.0596, "odds_ratio_loss": 0.00955482292920351, "rewards/accuracies": 1.0, "rewards/chosen": -0.005365900695323944, "rewards/margins": 0.4914923310279846, "rewards/rejected": -0.49685823917388916, "sft_loss": 0.05365900322794914, "step": 2953 }, { "epoch": 4.271872740419378, "grad_norm": 1.6075610959081317, "learning_rate": 1.5944214427907277e-06, "logits/chosen": -0.8845387697219849, "logits/rejected": -0.6668776869773865, "logps/chosen": -0.07586217671632767, "logps/rejected": -4.427034378051758, "loss": 0.0802, "odds_ratio_loss": 0.00986341293901205, "rewards/accuracies": 1.0, "rewards/chosen": -0.007586217485368252, "rewards/margins": 0.435117244720459, "rewards/rejected": -0.4427034556865692, "sft_loss": 0.07586217671632767, "step": 2954 }, { "epoch": 4.273318872017353, "grad_norm": 1.4176098445474563, "learning_rate": 1.5919407227228976e-06, "logits/chosen": -0.9498851299285889, "logits/rejected": -0.5791040062904358, "logps/chosen": -0.051679011434316635, "logps/rejected": -4.279007434844971, "loss": 0.0494, "odds_ratio_loss": 0.0026139526162296534, "rewards/accuracies": 1.0, "rewards/chosen": -0.005167901515960693, "rewards/margins": 0.4227328300476074, "rewards/rejected": -0.4279007315635681, "sft_loss": 0.051679011434316635, "step": 2955 }, { "epoch": 4.274765003615329, "grad_norm": 1.6257258837083226, "learning_rate": 1.5894614544829747e-06, "logits/chosen": -0.7169545888900757, "logits/rejected": -0.42061808705329895, "logps/chosen": -0.14304006099700928, "logps/rejected": -6.230022430419922, "loss": 0.0903, "odds_ratio_loss": 0.005613164976239204, "rewards/accuracies": 1.0, "rewards/chosen": -0.014304005540907383, "rewards/margins": 0.6086981892585754, "rewards/rejected": -0.6230022311210632, "sft_loss": 0.14304006099700928, "step": 2956 }, { "epoch": 4.2762111352133045, "grad_norm": 1.8611753096508106, "learning_rate": 1.5869836395657185e-06, "logits/chosen": -0.6850326061248779, "logits/rejected": -0.4503934979438782, "logps/chosen": -0.03483531251549721, "logps/rejected": -5.079290866851807, "loss": 0.0554, "odds_ratio_loss": 0.0030643048230558634, "rewards/accuracies": 1.0, "rewards/chosen": -0.0034835312981158495, "rewards/margins": 0.5044455528259277, "rewards/rejected": -0.5079290866851807, "sft_loss": 0.03483531251549721, "step": 2957 }, { "epoch": 4.27765726681128, "grad_norm": 1.4420781987404232, "learning_rate": 1.5845072794650118e-06, "logits/chosen": -0.765277087688446, "logits/rejected": -0.6213148832321167, "logps/chosen": -0.1533205658197403, "logps/rejected": -4.846577167510986, "loss": 0.0834, "odds_ratio_loss": 0.022611288353800774, "rewards/accuracies": 1.0, "rewards/chosen": -0.015332056209445, "rewards/margins": 0.46932563185691833, "rewards/rejected": -0.4846577048301697, "sft_loss": 0.1533205658197403, "step": 2958 }, { "epoch": 4.279103398409255, "grad_norm": 1.5171362176056595, "learning_rate": 1.5820323756738643e-06, "logits/chosen": -0.8511385917663574, "logits/rejected": -0.5496898293495178, "logps/chosen": -0.048428021371364594, "logps/rejected": -5.54318380355835, "loss": 0.0619, "odds_ratio_loss": 0.006257210858166218, "rewards/accuracies": 1.0, "rewards/chosen": -0.004842801950871944, "rewards/margins": 0.5494755506515503, "rewards/rejected": -0.554318368434906, "sft_loss": 0.048428021371364594, "step": 2959 }, { "epoch": 4.280549530007231, "grad_norm": 1.7435049856938651, "learning_rate": 1.579558929684401e-06, "logits/chosen": -0.6494778394699097, "logits/rejected": -0.5383796095848083, "logps/chosen": -0.14231713116168976, "logps/rejected": -3.891136646270752, "loss": 0.1077, "odds_ratio_loss": 0.02279626578092575, "rewards/accuracies": 1.0, "rewards/chosen": -0.01423171442002058, "rewards/margins": 0.37488192319869995, "rewards/rejected": -0.3891136646270752, "sft_loss": 0.14231713116168976, "step": 2960 }, { "epoch": 4.281995661605206, "grad_norm": 1.6390575823940858, "learning_rate": 1.5770869429878752e-06, "logits/chosen": -0.9379094243049622, "logits/rejected": -0.8066362738609314, "logps/chosen": -0.08813867717981339, "logps/rejected": -3.6948931217193604, "loss": 0.0692, "odds_ratio_loss": 0.01244413573294878, "rewards/accuracies": 1.0, "rewards/chosen": -0.008813867345452309, "rewards/margins": 0.3606754541397095, "rewards/rejected": -0.3694893419742584, "sft_loss": 0.08813867717981339, "step": 2961 }, { "epoch": 4.283441793203181, "grad_norm": 1.6843204785703874, "learning_rate": 1.5746164170746542e-06, "logits/chosen": -0.8207695484161377, "logits/rejected": -0.7455189824104309, "logps/chosen": -0.019551407545804977, "logps/rejected": -4.0999836921691895, "loss": 0.0764, "odds_ratio_loss": 0.0014808757696300745, "rewards/accuracies": 1.0, "rewards/chosen": -0.001955140847712755, "rewards/margins": 0.40804323554039, "rewards/rejected": -0.40999835729599, "sft_loss": 0.019551407545804977, "step": 2962 }, { "epoch": 4.284887924801157, "grad_norm": 1.7963696202728958, "learning_rate": 1.5721473534342296e-06, "logits/chosen": -0.8363921642303467, "logits/rejected": -0.680959939956665, "logps/chosen": -0.0901620164513588, "logps/rejected": -5.3850202560424805, "loss": 0.0799, "odds_ratio_loss": 0.008571883663535118, "rewards/accuracies": 1.0, "rewards/chosen": -0.009016201831400394, "rewards/margins": 0.529485821723938, "rewards/rejected": -0.538502037525177, "sft_loss": 0.0901620164513588, "step": 2963 }, { "epoch": 4.286334056399133, "grad_norm": 1.7375869804762725, "learning_rate": 1.5696797535552078e-06, "logits/chosen": -0.9396668672561646, "logits/rejected": -0.5902308821678162, "logps/chosen": -0.05397862195968628, "logps/rejected": -3.661261558532715, "loss": 0.1087, "odds_ratio_loss": 0.006330575793981552, "rewards/accuracies": 1.0, "rewards/chosen": -0.005397862754762173, "rewards/margins": 0.36072826385498047, "rewards/rejected": -0.3661261796951294, "sft_loss": 0.05397862195968628, "step": 2964 }, { "epoch": 4.287780187997107, "grad_norm": 1.759220854338626, "learning_rate": 1.5672136189253143e-06, "logits/chosen": -0.8728683590888977, "logits/rejected": -0.5772249698638916, "logps/chosen": -0.12193911522626877, "logps/rejected": -6.481555461883545, "loss": 0.0981, "odds_ratio_loss": 0.013428367674350739, "rewards/accuracies": 1.0, "rewards/chosen": -0.012193911708891392, "rewards/margins": 0.635961651802063, "rewards/rejected": -0.6481555104255676, "sft_loss": 0.12193911522626877, "step": 2965 }, { "epoch": 4.289226319595083, "grad_norm": 1.4307744608227613, "learning_rate": 1.5647489510313894e-06, "logits/chosen": -0.595003068447113, "logits/rejected": -0.4218134582042694, "logps/chosen": -0.09229257702827454, "logps/rejected": -4.479331970214844, "loss": 0.0776, "odds_ratio_loss": 0.007592702284455299, "rewards/accuracies": 1.0, "rewards/chosen": -0.009229256771504879, "rewards/margins": 0.4387039542198181, "rewards/rejected": -0.44793322682380676, "sft_loss": 0.09229257702827454, "step": 2966 }, { "epoch": 4.290672451193059, "grad_norm": 1.5528265033967315, "learning_rate": 1.562285751359393e-06, "logits/chosen": -0.5394838452339172, "logits/rejected": -0.5432738661766052, "logps/chosen": -0.051818784326314926, "logps/rejected": -5.729471206665039, "loss": 0.0756, "odds_ratio_loss": 0.008107408881187439, "rewards/accuracies": 1.0, "rewards/chosen": -0.0051818788051605225, "rewards/margins": 0.5677652359008789, "rewards/rejected": -0.5729471445083618, "sft_loss": 0.051818784326314926, "step": 2967 }, { "epoch": 4.2921185827910335, "grad_norm": 1.5265022495863947, "learning_rate": 1.5598240213943945e-06, "logits/chosen": -0.8859455585479736, "logits/rejected": -0.606169581413269, "logps/chosen": -0.068026602268219, "logps/rejected": -3.8222718238830566, "loss": 0.0673, "odds_ratio_loss": 0.005845806561410427, "rewards/accuracies": 1.0, "rewards/chosen": -0.006802660878747702, "rewards/margins": 0.3754245638847351, "rewards/rejected": -0.38222724199295044, "sft_loss": 0.068026602268219, "step": 2968 }, { "epoch": 4.293564714389009, "grad_norm": 1.7731344063337215, "learning_rate": 1.5573637626205818e-06, "logits/chosen": -0.7323688864707947, "logits/rejected": -0.6176028847694397, "logps/chosen": -0.05627206712961197, "logps/rejected": -4.247992038726807, "loss": 0.0782, "odds_ratio_loss": 0.004452931694686413, "rewards/accuracies": 1.0, "rewards/chosen": -0.005627206526696682, "rewards/margins": 0.4191719889640808, "rewards/rejected": -0.42479920387268066, "sft_loss": 0.05627206712961197, "step": 2969 }, { "epoch": 4.295010845986985, "grad_norm": 1.6054365983973067, "learning_rate": 1.5549049765212554e-06, "logits/chosen": -0.6429954767227173, "logits/rejected": -0.5249923467636108, "logps/chosen": -0.14273762702941895, "logps/rejected": -4.192305564880371, "loss": 0.0963, "odds_ratio_loss": 0.06254995614290237, "rewards/accuracies": 0.9375, "rewards/chosen": -0.01427376363426447, "rewards/margins": 0.4049568176269531, "rewards/rejected": -0.419230580329895, "sft_loss": 0.14273762702941895, "step": 2970 }, { "epoch": 4.296456977584961, "grad_norm": 1.527875617710426, "learning_rate": 1.5524476645788238e-06, "logits/chosen": -0.8400360941886902, "logits/rejected": -0.5458084344863892, "logps/chosen": -0.06384026259183884, "logps/rejected": -5.687685489654541, "loss": 0.0735, "odds_ratio_loss": 0.0056127458810806274, "rewards/accuracies": 1.0, "rewards/chosen": -0.006384026259183884, "rewards/margins": 0.5623845458030701, "rewards/rejected": -0.5687686204910278, "sft_loss": 0.06384026259183884, "step": 2971 }, { "epoch": 4.297903109182935, "grad_norm": 1.5936572274065168, "learning_rate": 1.5499918282748122e-06, "logits/chosen": -0.8916709423065186, "logits/rejected": -0.6111676692962646, "logps/chosen": -0.0904655009508133, "logps/rejected": -4.318139553070068, "loss": 0.0742, "odds_ratio_loss": 0.0061836340464651585, "rewards/accuracies": 1.0, "rewards/chosen": -0.009046549908816814, "rewards/margins": 0.42276743054389954, "rewards/rejected": -0.43181395530700684, "sft_loss": 0.0904655009508133, "step": 2972 }, { "epoch": 4.299349240780911, "grad_norm": 1.5231636245400404, "learning_rate": 1.5475374690898519e-06, "logits/chosen": -0.7592072486877441, "logits/rejected": -0.706114649772644, "logps/chosen": -0.02892283909022808, "logps/rejected": -4.623927116394043, "loss": 0.0602, "odds_ratio_loss": 0.006367319729179144, "rewards/accuracies": 1.0, "rewards/chosen": -0.002892283955588937, "rewards/margins": 0.45950043201446533, "rewards/rejected": -0.4623927175998688, "sft_loss": 0.02892283909022808, "step": 2973 }, { "epoch": 4.300795372378887, "grad_norm": 1.363212769075135, "learning_rate": 1.5450845885036858e-06, "logits/chosen": -0.7811589241027832, "logits/rejected": -0.6956539750099182, "logps/chosen": -0.13292838633060455, "logps/rejected": -3.8825864791870117, "loss": 0.0763, "odds_ratio_loss": 0.009715279564261436, "rewards/accuracies": 1.0, "rewards/chosen": -0.01329283881932497, "rewards/margins": 0.3749657869338989, "rewards/rejected": -0.3882586359977722, "sft_loss": 0.13292838633060455, "step": 2974 }, { "epoch": 4.302241503976862, "grad_norm": 1.6827472496502511, "learning_rate": 1.5426331879951628e-06, "logits/chosen": -1.0225802659988403, "logits/rejected": -0.652125895023346, "logps/chosen": -0.06771662831306458, "logps/rejected": -5.0016303062438965, "loss": 0.0796, "odds_ratio_loss": 0.008105105720460415, "rewards/accuracies": 1.0, "rewards/chosen": -0.006771662272512913, "rewards/margins": 0.49339133501052856, "rewards/rejected": -0.5001630187034607, "sft_loss": 0.06771662831306458, "step": 2975 }, { "epoch": 4.303687635574837, "grad_norm": 1.928130002839635, "learning_rate": 1.5401832690422448e-06, "logits/chosen": -0.7736106514930725, "logits/rejected": -0.6177071332931519, "logps/chosen": -0.14021962881088257, "logps/rejected": -3.8993277549743652, "loss": 0.1028, "odds_ratio_loss": 0.008643961511552334, "rewards/accuracies": 1.0, "rewards/chosen": -0.014021963812410831, "rewards/margins": 0.37591081857681274, "rewards/rejected": -0.389932781457901, "sft_loss": 0.14021962881088257, "step": 2976 }, { "epoch": 4.305133767172813, "grad_norm": 1.4138003491700672, "learning_rate": 1.5377348331219934e-06, "logits/chosen": -0.6652264595031738, "logits/rejected": -0.5014814734458923, "logps/chosen": -0.045446865260601044, "logps/rejected": -4.814835548400879, "loss": 0.0547, "odds_ratio_loss": 0.003532176371663809, "rewards/accuracies": 1.0, "rewards/chosen": -0.004544686526060104, "rewards/margins": 0.4769388735294342, "rewards/rejected": -0.4814835786819458, "sft_loss": 0.045446865260601044, "step": 2977 }, { "epoch": 4.306579898770788, "grad_norm": 1.2321565482718624, "learning_rate": 1.535287881710583e-06, "logits/chosen": -0.6869823336601257, "logits/rejected": -0.485745906829834, "logps/chosen": -0.016159405931830406, "logps/rejected": -7.835278511047363, "loss": 0.0353, "odds_ratio_loss": 0.0009945888305082917, "rewards/accuracies": 1.0, "rewards/chosen": -0.0016159405931830406, "rewards/margins": 0.7819118499755859, "rewards/rejected": -0.7835278511047363, "sft_loss": 0.016159405931830406, "step": 2978 }, { "epoch": 4.308026030368763, "grad_norm": 1.761381333961773, "learning_rate": 1.5328424162832869e-06, "logits/chosen": -0.8571205139160156, "logits/rejected": -0.7177923917770386, "logps/chosen": -0.09584394842386246, "logps/rejected": -5.071878433227539, "loss": 0.0894, "odds_ratio_loss": 0.01330016739666462, "rewards/accuracies": 1.0, "rewards/chosen": -0.00958439614623785, "rewards/margins": 0.4976034164428711, "rewards/rejected": -0.5071878433227539, "sft_loss": 0.09584394842386246, "step": 2979 }, { "epoch": 4.309472161966739, "grad_norm": 1.560676269187292, "learning_rate": 1.5303984383144881e-06, "logits/chosen": -0.8674488067626953, "logits/rejected": -0.5158795714378357, "logps/chosen": -0.08388251066207886, "logps/rejected": -4.4099578857421875, "loss": 0.0683, "odds_ratio_loss": 0.00677723903208971, "rewards/accuracies": 1.0, "rewards/chosen": -0.008388251066207886, "rewards/margins": 0.432607501745224, "rewards/rejected": -0.4409957528114319, "sft_loss": 0.08388251066207886, "step": 2980 }, { "epoch": 4.310918293564715, "grad_norm": 1.5808008085636467, "learning_rate": 1.5279559492776688e-06, "logits/chosen": -0.6280909180641174, "logits/rejected": -0.4606649577617645, "logps/chosen": -0.06359011679887772, "logps/rejected": -3.5109429359436035, "loss": 0.0882, "odds_ratio_loss": 0.008953486569225788, "rewards/accuracies": 1.0, "rewards/chosen": -0.0063590118661522865, "rewards/margins": 0.3447352945804596, "rewards/rejected": -0.3510943055152893, "sft_loss": 0.06359011679887772, "step": 2981 }, { "epoch": 4.31236442516269, "grad_norm": 2.3823337309263204, "learning_rate": 1.5255149506454127e-06, "logits/chosen": -0.663183867931366, "logits/rejected": -0.63742595911026, "logps/chosen": -0.06929976493120193, "logps/rejected": -3.3315176963806152, "loss": 0.0753, "odds_ratio_loss": 0.006415051873773336, "rewards/accuracies": 1.0, "rewards/chosen": -0.006929976865649223, "rewards/margins": 0.3262217938899994, "rewards/rejected": -0.33315175771713257, "sft_loss": 0.06929976493120193, "step": 2982 }, { "epoch": 4.313810556760665, "grad_norm": 1.9637548548003436, "learning_rate": 1.523075443889411e-06, "logits/chosen": -0.8092422485351562, "logits/rejected": -0.5086420774459839, "logps/chosen": -0.09017635136842728, "logps/rejected": -4.9352827072143555, "loss": 0.0883, "odds_ratio_loss": 0.004887173883616924, "rewards/accuracies": 1.0, "rewards/chosen": -0.009017635136842728, "rewards/margins": 0.4845106899738312, "rewards/rejected": -0.4935283064842224, "sft_loss": 0.09017635136842728, "step": 2983 }, { "epoch": 4.315256688358641, "grad_norm": 1.7855616405277308, "learning_rate": 1.520637430480447e-06, "logits/chosen": -0.6320347785949707, "logits/rejected": -0.5742474794387817, "logps/chosen": -0.10536934435367584, "logps/rejected": -5.842680931091309, "loss": 0.0956, "odds_ratio_loss": 0.009043958969414234, "rewards/accuracies": 1.0, "rewards/chosen": -0.01053693424910307, "rewards/margins": 0.5737311244010925, "rewards/rejected": -0.5842680931091309, "sft_loss": 0.10536934435367584, "step": 2984 }, { "epoch": 4.316702819956616, "grad_norm": 1.4993141920071105, "learning_rate": 1.518200911888412e-06, "logits/chosen": -0.676465630531311, "logits/rejected": -0.4837293028831482, "logps/chosen": -0.03564174845814705, "logps/rejected": -3.781062364578247, "loss": 0.0672, "odds_ratio_loss": 0.002124813385307789, "rewards/accuracies": 1.0, "rewards/chosen": -0.003564174985513091, "rewards/margins": 0.3745420575141907, "rewards/rejected": -0.3781062364578247, "sft_loss": 0.03564174845814705, "step": 2985 }, { "epoch": 4.318148951554591, "grad_norm": 1.7181779387692628, "learning_rate": 1.5157658895822892e-06, "logits/chosen": -0.9328091144561768, "logits/rejected": -0.6783272624015808, "logps/chosen": -0.10630530118942261, "logps/rejected": -5.30551290512085, "loss": 0.1007, "odds_ratio_loss": 0.011326825246214867, "rewards/accuracies": 1.0, "rewards/chosen": -0.010630530305206776, "rewards/margins": 0.519920825958252, "rewards/rejected": -0.5305513739585876, "sft_loss": 0.10630530118942261, "step": 2986 }, { "epoch": 4.319595083152567, "grad_norm": 1.5179729610740627, "learning_rate": 1.5133323650301653e-06, "logits/chosen": -0.8240708112716675, "logits/rejected": -0.643516480922699, "logps/chosen": -0.07202492654323578, "logps/rejected": -4.5885725021362305, "loss": 0.067, "odds_ratio_loss": 0.007801711093634367, "rewards/accuracies": 1.0, "rewards/chosen": -0.007202493026852608, "rewards/margins": 0.4516547620296478, "rewards/rejected": -0.45885729789733887, "sft_loss": 0.07202492654323578, "step": 2987 }, { "epoch": 4.321041214750542, "grad_norm": 1.948795780060964, "learning_rate": 1.5109003396992196e-06, "logits/chosen": -1.0349688529968262, "logits/rejected": -0.46029579639434814, "logps/chosen": -0.06265415251255035, "logps/rejected": -4.769626140594482, "loss": 0.0644, "odds_ratio_loss": 0.003265473060309887, "rewards/accuracies": 1.0, "rewards/chosen": -0.0062654148787260056, "rewards/margins": 0.47069722414016724, "rewards/rejected": -0.4769626259803772, "sft_loss": 0.06265415251255035, "step": 2988 }, { "epoch": 4.322487346348518, "grad_norm": 1.5111670667012016, "learning_rate": 1.5084698150557294e-06, "logits/chosen": -0.8330080509185791, "logits/rejected": -0.5925476551055908, "logps/chosen": -0.05622435361146927, "logps/rejected": -3.154797315597534, "loss": 0.0729, "odds_ratio_loss": 0.006971028633415699, "rewards/accuracies": 1.0, "rewards/chosen": -0.005622435361146927, "rewards/margins": 0.3098573088645935, "rewards/rejected": -0.31547972559928894, "sft_loss": 0.05622435361146927, "step": 2989 }, { "epoch": 4.323933477946493, "grad_norm": 1.9164595471589108, "learning_rate": 1.506040792565066e-06, "logits/chosen": -0.763274610042572, "logits/rejected": -0.5204155445098877, "logps/chosen": -0.0702725499868393, "logps/rejected": -5.756369113922119, "loss": 0.0946, "odds_ratio_loss": 0.003552860114723444, "rewards/accuracies": 1.0, "rewards/chosen": -0.007027255371212959, "rewards/margins": 0.5686096549034119, "rewards/rejected": -0.5756369233131409, "sft_loss": 0.0702725499868393, "step": 2990 }, { "epoch": 4.325379609544468, "grad_norm": 1.5652675124707198, "learning_rate": 1.5036132736916986e-06, "logits/chosen": -0.9880766272544861, "logits/rejected": -0.8328273892402649, "logps/chosen": -0.16169053316116333, "logps/rejected": -4.504159927368164, "loss": 0.0928, "odds_ratio_loss": 0.015480546280741692, "rewards/accuracies": 1.0, "rewards/chosen": -0.016169054433703423, "rewards/margins": 0.4342469573020935, "rewards/rejected": -0.4504159986972809, "sft_loss": 0.16169053316116333, "step": 2991 }, { "epoch": 4.326825741142444, "grad_norm": 1.386084019741126, "learning_rate": 1.5011872598991845e-06, "logits/chosen": -0.891055703163147, "logits/rejected": -0.6146432161331177, "logps/chosen": -0.06819592416286469, "logps/rejected": -5.212233543395996, "loss": 0.058, "odds_ratio_loss": 0.007073340006172657, "rewards/accuracies": 1.0, "rewards/chosen": -0.006819593720138073, "rewards/margins": 0.514403760433197, "rewards/rejected": -0.5212233662605286, "sft_loss": 0.06819592416286469, "step": 2992 }, { "epoch": 4.3282718727404195, "grad_norm": 1.701412371797771, "learning_rate": 1.4987627526501797e-06, "logits/chosen": -0.8779382705688477, "logits/rejected": -0.7203929424285889, "logps/chosen": -0.07334870100021362, "logps/rejected": -5.043249130249023, "loss": 0.0908, "odds_ratio_loss": 0.008891316130757332, "rewards/accuracies": 1.0, "rewards/chosen": -0.007334871217608452, "rewards/margins": 0.49699005484580994, "rewards/rejected": -0.5043249726295471, "sft_loss": 0.07334870100021362, "step": 2993 }, { "epoch": 4.329718004338395, "grad_norm": 1.499832025218408, "learning_rate": 1.4963397534064255e-06, "logits/chosen": -0.6140452027320862, "logits/rejected": -0.5003236532211304, "logps/chosen": -0.0780247375369072, "logps/rejected": -3.900202512741089, "loss": 0.0825, "odds_ratio_loss": 0.006760553922504187, "rewards/accuracies": 1.0, "rewards/chosen": -0.007802474312484264, "rewards/margins": 0.38221779465675354, "rewards/rejected": -0.3900202810764313, "sft_loss": 0.0780247375369072, "step": 2994 }, { "epoch": 4.33116413593637, "grad_norm": 1.1920435557793356, "learning_rate": 1.4939182636287594e-06, "logits/chosen": -0.6822051405906677, "logits/rejected": -0.5335265398025513, "logps/chosen": -0.05268733948469162, "logps/rejected": -5.311867713928223, "loss": 0.0537, "odds_ratio_loss": 0.010495830327272415, "rewards/accuracies": 1.0, "rewards/chosen": -0.005268733948469162, "rewards/margins": 0.5259181261062622, "rewards/rejected": -0.5311868190765381, "sft_loss": 0.05268733948469162, "step": 2995 }, { "epoch": 4.332610267534346, "grad_norm": 1.4166781490558604, "learning_rate": 1.4914982847771063e-06, "logits/chosen": -0.8763676881790161, "logits/rejected": -0.6532676815986633, "logps/chosen": -0.07074081897735596, "logps/rejected": -4.6171064376831055, "loss": 0.0696, "odds_ratio_loss": 0.006446932442486286, "rewards/accuracies": 1.0, "rewards/chosen": -0.007074081804603338, "rewards/margins": 0.4546365737915039, "rewards/rejected": -0.4617106318473816, "sft_loss": 0.07074081897735596, "step": 2996 }, { "epoch": 4.334056399132321, "grad_norm": 1.6560432823034414, "learning_rate": 1.4890798183104788e-06, "logits/chosen": -0.5278356075286865, "logits/rejected": -0.3531097173690796, "logps/chosen": -0.08435966819524765, "logps/rejected": -7.50329065322876, "loss": 0.0967, "odds_ratio_loss": 0.008475156500935555, "rewards/accuracies": 1.0, "rewards/chosen": -0.008435966446995735, "rewards/margins": 0.7418931722640991, "rewards/rejected": -0.7503290772438049, "sft_loss": 0.08435966819524765, "step": 2997 }, { "epoch": 4.335502530730296, "grad_norm": 1.8409694247477042, "learning_rate": 1.4866628656869816e-06, "logits/chosen": -0.8288428783416748, "logits/rejected": -0.5312150716781616, "logps/chosen": -0.10413511097431183, "logps/rejected": -3.546949625015259, "loss": 0.1194, "odds_ratio_loss": 0.012981563806533813, "rewards/accuracies": 1.0, "rewards/chosen": -0.010413511656224728, "rewards/margins": 0.34428146481513977, "rewards/rejected": -0.3546949625015259, "sft_loss": 0.10413511097431183, "step": 2998 }, { "epoch": 4.336948662328272, "grad_norm": 1.7431106907764917, "learning_rate": 1.484247428363802e-06, "logits/chosen": -1.054826259613037, "logits/rejected": -0.7220063209533691, "logps/chosen": -0.0636000782251358, "logps/rejected": -4.4596781730651855, "loss": 0.0817, "odds_ratio_loss": 0.004126345738768578, "rewards/accuracies": 1.0, "rewards/chosen": -0.006360008381307125, "rewards/margins": 0.43960779905319214, "rewards/rejected": -0.44596779346466064, "sft_loss": 0.0636000782251358, "step": 2999 }, { "epoch": 4.3383947939262475, "grad_norm": 1.6516343056764413, "learning_rate": 1.4818335077972188e-06, "logits/chosen": -0.9266605377197266, "logits/rejected": -0.6105201840400696, "logps/chosen": -0.05521458014845848, "logps/rejected": -4.643217086791992, "loss": 0.0699, "odds_ratio_loss": 0.004646781831979752, "rewards/accuracies": 1.0, "rewards/chosen": -0.0055214581079781055, "rewards/margins": 0.4588002860546112, "rewards/rejected": -0.46432173252105713, "sft_loss": 0.05521458014845848, "step": 3000 }, { "epoch": 4.339840925524222, "grad_norm": 1.4289864542975454, "learning_rate": 1.479421105442591e-06, "logits/chosen": -0.9717822074890137, "logits/rejected": -0.62735515832901, "logps/chosen": -0.05365077406167984, "logps/rejected": -4.997622489929199, "loss": 0.0743, "odds_ratio_loss": 0.003372207749634981, "rewards/accuracies": 1.0, "rewards/chosen": -0.005365077406167984, "rewards/margins": 0.4943971633911133, "rewards/rejected": -0.49976223707199097, "sft_loss": 0.05365077406167984, "step": 3001 }, { "epoch": 4.341287057122198, "grad_norm": 1.6480211924822543, "learning_rate": 1.4770102227543678e-06, "logits/chosen": -0.8481490612030029, "logits/rejected": -0.6115496158599854, "logps/chosen": -0.0740613266825676, "logps/rejected": -5.005758285522461, "loss": 0.0932, "odds_ratio_loss": 0.009503107517957687, "rewards/accuracies": 1.0, "rewards/chosen": -0.00740613229572773, "rewards/margins": 0.49316978454589844, "rewards/rejected": -0.5005759000778198, "sft_loss": 0.0740613266825676, "step": 3002 }, { "epoch": 4.342733188720174, "grad_norm": 1.3298459528138336, "learning_rate": 1.474600861186078e-06, "logits/chosen": -0.622434139251709, "logits/rejected": -0.5628266930580139, "logps/chosen": -0.057216983288526535, "logps/rejected": -3.126250982284546, "loss": 0.0674, "odds_ratio_loss": 0.006634141784161329, "rewards/accuracies": 1.0, "rewards/chosen": -0.005721698515117168, "rewards/margins": 0.3069033920764923, "rewards/rejected": -0.31262508034706116, "sft_loss": 0.057216983288526535, "step": 3003 }, { "epoch": 4.344179320318149, "grad_norm": 1.8040818693788596, "learning_rate": 1.4721930221903342e-06, "logits/chosen": -0.7807648181915283, "logits/rejected": -0.6595614552497864, "logps/chosen": -0.04600096866488457, "logps/rejected": -4.082098484039307, "loss": 0.0867, "odds_ratio_loss": 0.003212960669770837, "rewards/accuracies": 1.0, "rewards/chosen": -0.004600096959620714, "rewards/margins": 0.4036097526550293, "rewards/rejected": -0.4082098603248596, "sft_loss": 0.04600096866488457, "step": 3004 }, { "epoch": 4.345625451916124, "grad_norm": 1.8544771440367331, "learning_rate": 1.469786707218831e-06, "logits/chosen": -0.8745797872543335, "logits/rejected": -0.6506798267364502, "logps/chosen": -0.13814522325992584, "logps/rejected": -5.566080570220947, "loss": 0.0952, "odds_ratio_loss": 0.019235284999012947, "rewards/accuracies": 1.0, "rewards/chosen": -0.013814522884786129, "rewards/margins": 0.5427935719490051, "rewards/rejected": -0.5566080808639526, "sft_loss": 0.13814522325992584, "step": 3005 }, { "epoch": 4.3470715835141, "grad_norm": 1.507690637374228, "learning_rate": 1.4673819177223466e-06, "logits/chosen": -0.8183923959732056, "logits/rejected": -0.669651210308075, "logps/chosen": -0.18851414322853088, "logps/rejected": -3.478821277618408, "loss": 0.1186, "odds_ratio_loss": 0.018825456500053406, "rewards/accuracies": 1.0, "rewards/chosen": -0.01885141432285309, "rewards/margins": 0.32903075218200684, "rewards/rejected": -0.34788215160369873, "sft_loss": 0.18851414322853088, "step": 3006 }, { "epoch": 4.3485177151120755, "grad_norm": 1.4777687325743487, "learning_rate": 1.4649786551507354e-06, "logits/chosen": -0.7968933582305908, "logits/rejected": -0.5578837394714355, "logps/chosen": -0.06734529882669449, "logps/rejected": -4.371576309204102, "loss": 0.0649, "odds_ratio_loss": 0.004955535754561424, "rewards/accuracies": 1.0, "rewards/chosen": -0.006734529510140419, "rewards/margins": 0.4304230809211731, "rewards/rejected": -0.43715763092041016, "sft_loss": 0.06734529882669449, "step": 3007 }, { "epoch": 4.34996384671005, "grad_norm": 1.3447914227375248, "learning_rate": 1.4625769209529342e-06, "logits/chosen": -0.7508656978607178, "logits/rejected": -0.6274343729019165, "logps/chosen": -0.06294934451580048, "logps/rejected": -5.2878594398498535, "loss": 0.0614, "odds_ratio_loss": 0.005506738089025021, "rewards/accuracies": 1.0, "rewards/chosen": -0.006294934079051018, "rewards/margins": 0.5224910378456116, "rewards/rejected": -0.5287859439849854, "sft_loss": 0.06294934451580048, "step": 3008 }, { "epoch": 4.351409978308026, "grad_norm": 2.018138682950108, "learning_rate": 1.460176716576959e-06, "logits/chosen": -0.7540772557258606, "logits/rejected": -0.6052873730659485, "logps/chosen": -0.051301129162311554, "logps/rejected": -6.2577667236328125, "loss": 0.0804, "odds_ratio_loss": 0.005588476546108723, "rewards/accuracies": 1.0, "rewards/chosen": -0.00513011310249567, "rewards/margins": 0.620646595954895, "rewards/rejected": -0.6257766485214233, "sft_loss": 0.051301129162311554, "step": 3009 }, { "epoch": 4.352856109906002, "grad_norm": 1.3678448408498183, "learning_rate": 1.4577780434699012e-06, "logits/chosen": -0.8656454086303711, "logits/rejected": -0.6067763566970825, "logps/chosen": -0.07266905903816223, "logps/rejected": -5.833568096160889, "loss": 0.0738, "odds_ratio_loss": 0.013198381289839745, "rewards/accuracies": 1.0, "rewards/chosen": -0.007266905624419451, "rewards/margins": 0.5760899186134338, "rewards/rejected": -0.5833568572998047, "sft_loss": 0.07266905903816223, "step": 3010 }, { "epoch": 4.3543022415039765, "grad_norm": 1.663768760253496, "learning_rate": 1.4553809030779287e-06, "logits/chosen": -0.7601166367530823, "logits/rejected": -0.6807395219802856, "logps/chosen": -0.1088540330529213, "logps/rejected": -3.1109838485717773, "loss": 0.0937, "odds_ratio_loss": 0.009050026535987854, "rewards/accuracies": 1.0, "rewards/chosen": -0.010885403491556644, "rewards/margins": 0.3002129793167114, "rewards/rejected": -0.3110983967781067, "sft_loss": 0.1088540330529213, "step": 3011 }, { "epoch": 4.355748373101952, "grad_norm": 1.5594475178551062, "learning_rate": 1.4529852968462858e-06, "logits/chosen": -0.7941527366638184, "logits/rejected": -0.585971474647522, "logps/chosen": -0.07313597947359085, "logps/rejected": -4.013620376586914, "loss": 0.0593, "odds_ratio_loss": 0.009553024545311928, "rewards/accuracies": 1.0, "rewards/chosen": -0.007313598413020372, "rewards/margins": 0.39404845237731934, "rewards/rejected": -0.4013620615005493, "sft_loss": 0.07313597947359085, "step": 3012 }, { "epoch": 4.357194504699928, "grad_norm": 1.9061263595083393, "learning_rate": 1.450591226219295e-06, "logits/chosen": -1.0241869688034058, "logits/rejected": -0.8023190498352051, "logps/chosen": -0.05216866359114647, "logps/rejected": -4.660516738891602, "loss": 0.073, "odds_ratio_loss": 0.01102613378316164, "rewards/accuracies": 1.0, "rewards/chosen": -0.0052168662659823895, "rewards/margins": 0.460834801197052, "rewards/rejected": -0.46605169773101807, "sft_loss": 0.05216866359114647, "step": 3013 }, { "epoch": 4.358640636297903, "grad_norm": 2.652329974259841, "learning_rate": 1.4481986926403473e-06, "logits/chosen": -0.9176286458969116, "logits/rejected": -0.8531794548034668, "logps/chosen": -0.18076550960540771, "logps/rejected": -4.112682342529297, "loss": 0.1192, "odds_ratio_loss": 0.042850561439991, "rewards/accuracies": 1.0, "rewards/chosen": -0.01807655207812786, "rewards/margins": 0.39319172501564026, "rewards/rejected": -0.4112682640552521, "sft_loss": 0.18076550960540771, "step": 3014 }, { "epoch": 4.360086767895878, "grad_norm": 1.4776010369065984, "learning_rate": 1.445807697551913e-06, "logits/chosen": -0.8927839398384094, "logits/rejected": -0.6999744176864624, "logps/chosen": -0.04306114837527275, "logps/rejected": -5.057520866394043, "loss": 0.0499, "odds_ratio_loss": 0.00421172333881259, "rewards/accuracies": 1.0, "rewards/chosen": -0.004306115210056305, "rewards/margins": 0.5014459490776062, "rewards/rejected": -0.5057520270347595, "sft_loss": 0.04306114837527275, "step": 3015 }, { "epoch": 4.361532899493854, "grad_norm": 1.7796610779704267, "learning_rate": 1.4434182423955296e-06, "logits/chosen": -0.7437611222267151, "logits/rejected": -0.5102348327636719, "logps/chosen": -0.057212602347135544, "logps/rejected": -6.495122909545898, "loss": 0.087, "odds_ratio_loss": 0.00607125461101532, "rewards/accuracies": 1.0, "rewards/chosen": -0.005721260327845812, "rewards/margins": 0.6437910199165344, "rewards/rejected": -0.6495122909545898, "sft_loss": 0.057212602347135544, "step": 3016 }, { "epoch": 4.36297903109183, "grad_norm": 1.7363048835289525, "learning_rate": 1.4410303286118106e-06, "logits/chosen": -0.8889349699020386, "logits/rejected": -0.6399192214012146, "logps/chosen": -0.07837576419115067, "logps/rejected": -5.654128551483154, "loss": 0.0973, "odds_ratio_loss": 0.0068669687025249004, "rewards/accuracies": 1.0, "rewards/chosen": -0.007837576791644096, "rewards/margins": 0.5575752854347229, "rewards/rejected": -0.5654128789901733, "sft_loss": 0.07837576419115067, "step": 3017 }, { "epoch": 4.3644251626898045, "grad_norm": 1.7329045014223299, "learning_rate": 1.438643957640436e-06, "logits/chosen": -0.6671526432037354, "logits/rejected": -0.48754367232322693, "logps/chosen": -0.06948922574520111, "logps/rejected": -4.8217668533325195, "loss": 0.1362, "odds_ratio_loss": 0.006450343877077103, "rewards/accuracies": 1.0, "rewards/chosen": -0.006948922295123339, "rewards/margins": 0.4752277433872223, "rewards/rejected": -0.48217666149139404, "sft_loss": 0.06948922574520111, "step": 3018 }, { "epoch": 4.36587129428778, "grad_norm": 1.547783584280245, "learning_rate": 1.4362591309201618e-06, "logits/chosen": -0.6410992741584778, "logits/rejected": -0.5067773461341858, "logps/chosen": -0.06263686716556549, "logps/rejected": -4.180117607116699, "loss": 0.0802, "odds_ratio_loss": 0.0073380540125072, "rewards/accuracies": 1.0, "rewards/chosen": -0.006263687275350094, "rewards/margins": 0.4117480516433716, "rewards/rejected": -0.41801172494888306, "sft_loss": 0.06263686716556549, "step": 3019 }, { "epoch": 4.367317425885756, "grad_norm": 1.7173950891688827, "learning_rate": 1.4338758498888028e-06, "logits/chosen": -0.7619547843933105, "logits/rejected": -0.5084792375564575, "logps/chosen": -0.06390223652124405, "logps/rejected": -4.947268486022949, "loss": 0.1019, "odds_ratio_loss": 0.006950300186872482, "rewards/accuracies": 1.0, "rewards/chosen": -0.006390223745256662, "rewards/margins": 0.4883366525173187, "rewards/rejected": -0.49472689628601074, "sft_loss": 0.06390223652124405, "step": 3020 }, { "epoch": 4.368763557483731, "grad_norm": 1.6770132351279181, "learning_rate": 1.4314941159832516e-06, "logits/chosen": -0.7331417798995972, "logits/rejected": -0.5848100781440735, "logps/chosen": -0.0755729153752327, "logps/rejected": -4.190185070037842, "loss": 0.0927, "odds_ratio_loss": 0.004576533567160368, "rewards/accuracies": 1.0, "rewards/chosen": -0.007557292003184557, "rewards/margins": 0.4114612340927124, "rewards/rejected": -0.4190185070037842, "sft_loss": 0.0755729153752327, "step": 3021 }, { "epoch": 4.370209689081706, "grad_norm": 1.4978278317319662, "learning_rate": 1.4291139306394651e-06, "logits/chosen": -0.9728338718414307, "logits/rejected": -0.6097639799118042, "logps/chosen": -0.06788711994886398, "logps/rejected": -4.2939534187316895, "loss": 0.0838, "odds_ratio_loss": 0.0016902622301131487, "rewards/accuracies": 1.0, "rewards/chosen": -0.006788711994886398, "rewards/margins": 0.4226066470146179, "rewards/rejected": -0.4293953776359558, "sft_loss": 0.06788711994886398, "step": 3022 }, { "epoch": 4.371655820679682, "grad_norm": 1.5576854515553027, "learning_rate": 1.4267352952924632e-06, "logits/chosen": -0.5633742213249207, "logits/rejected": -0.47203364968299866, "logps/chosen": -0.04447542876005173, "logps/rejected": -5.289262294769287, "loss": 0.0457, "odds_ratio_loss": 0.003245576983317733, "rewards/accuracies": 1.0, "rewards/chosen": -0.0044475425966084, "rewards/margins": 0.5244786739349365, "rewards/rejected": -0.5289261937141418, "sft_loss": 0.04447542876005173, "step": 3023 }, { "epoch": 4.373101952277657, "grad_norm": 1.5445070302860024, "learning_rate": 1.4243582113763376e-06, "logits/chosen": -0.9106212258338928, "logits/rejected": -0.4883584976196289, "logps/chosen": -0.08085661381483078, "logps/rejected": -6.208954811096191, "loss": 0.0742, "odds_ratio_loss": 0.0067681861110031605, "rewards/accuracies": 1.0, "rewards/chosen": -0.008085661567747593, "rewards/margins": 0.6128097772598267, "rewards/rejected": -0.620895504951477, "sft_loss": 0.08085661381483078, "step": 3024 }, { "epoch": 4.3745480838756325, "grad_norm": 1.5797770050022457, "learning_rate": 1.4219826803242372e-06, "logits/chosen": -0.9211044907569885, "logits/rejected": -0.6510443091392517, "logps/chosen": -0.031016170978546143, "logps/rejected": -3.0293474197387695, "loss": 0.0921, "odds_ratio_loss": 0.0027869718614965677, "rewards/accuracies": 1.0, "rewards/chosen": -0.0031016170978546143, "rewards/margins": 0.2998330891132355, "rewards/rejected": -0.3029347360134125, "sft_loss": 0.031016170978546143, "step": 3025 }, { "epoch": 4.375994215473608, "grad_norm": 1.8973584967663573, "learning_rate": 1.4196087035683818e-06, "logits/chosen": -0.7323670387268066, "logits/rejected": -0.6421462893486023, "logps/chosen": -0.08696135878562927, "logps/rejected": -4.719554901123047, "loss": 0.0594, "odds_ratio_loss": 0.010816432535648346, "rewards/accuracies": 1.0, "rewards/chosen": -0.008696136064827442, "rewards/margins": 0.46325939893722534, "rewards/rejected": -0.4719555377960205, "sft_loss": 0.08696135878562927, "step": 3026 }, { "epoch": 4.377440347071584, "grad_norm": 1.4381222411520438, "learning_rate": 1.4172362825400499e-06, "logits/chosen": -0.6959972381591797, "logits/rejected": -0.5631358623504639, "logps/chosen": -0.07375743985176086, "logps/rejected": -4.761948108673096, "loss": 0.0651, "odds_ratio_loss": 0.004052689298987389, "rewards/accuracies": 1.0, "rewards/chosen": -0.007375743240118027, "rewards/margins": 0.4688190221786499, "rewards/rejected": -0.4761947989463806, "sft_loss": 0.07375743985176086, "step": 3027 }, { "epoch": 4.378886478669559, "grad_norm": 1.8112701096091208, "learning_rate": 1.4148654186695818e-06, "logits/chosen": -0.7027171850204468, "logits/rejected": -0.6286312341690063, "logps/chosen": -0.10041756927967072, "logps/rejected": -4.47844123840332, "loss": 0.1241, "odds_ratio_loss": 0.01177249290049076, "rewards/accuracies": 1.0, "rewards/chosen": -0.010041756555438042, "rewards/margins": 0.43780237436294556, "rewards/rejected": -0.44784414768218994, "sft_loss": 0.10041756927967072, "step": 3028 }, { "epoch": 4.380332610267534, "grad_norm": 1.729400708490863, "learning_rate": 1.4124961133863802e-06, "logits/chosen": -0.950899064540863, "logits/rejected": -0.5882576107978821, "logps/chosen": -0.07639265805482864, "logps/rejected": -5.41416072845459, "loss": 0.0693, "odds_ratio_loss": 0.0038110874593257904, "rewards/accuracies": 1.0, "rewards/chosen": -0.007639266550540924, "rewards/margins": 0.5337768197059631, "rewards/rejected": -0.5414160490036011, "sft_loss": 0.07639265805482864, "step": 3029 }, { "epoch": 4.38177874186551, "grad_norm": 1.877374137773176, "learning_rate": 1.4101283681189097e-06, "logits/chosen": -0.7628424167633057, "logits/rejected": -0.5627432465553284, "logps/chosen": -0.08173272758722305, "logps/rejected": -6.892059326171875, "loss": 0.1018, "odds_ratio_loss": 0.003274757880717516, "rewards/accuracies": 1.0, "rewards/chosen": -0.00817327294498682, "rewards/margins": 0.6810327172279358, "rewards/rejected": -0.6892059445381165, "sft_loss": 0.08173272758722305, "step": 3030 }, { "epoch": 4.383224873463485, "grad_norm": 1.8461184307068432, "learning_rate": 1.4077621842946905e-06, "logits/chosen": -0.7165622711181641, "logits/rejected": -0.5991818308830261, "logps/chosen": -0.08831194043159485, "logps/rejected": -3.3247945308685303, "loss": 0.0721, "odds_ratio_loss": 0.01518079824745655, "rewards/accuracies": 1.0, "rewards/chosen": -0.008831193670630455, "rewards/margins": 0.32364824414253235, "rewards/rejected": -0.33247941732406616, "sft_loss": 0.08831194043159485, "step": 3031 }, { "epoch": 4.384671005061461, "grad_norm": 1.5390651399829893, "learning_rate": 1.4053975633403062e-06, "logits/chosen": -0.903634786605835, "logits/rejected": -0.5671402812004089, "logps/chosen": -0.07350382208824158, "logps/rejected": -5.765620708465576, "loss": 0.0779, "odds_ratio_loss": 0.008814067579805851, "rewards/accuracies": 1.0, "rewards/chosen": -0.007350381929427385, "rewards/margins": 0.5692117214202881, "rewards/rejected": -0.5765621066093445, "sft_loss": 0.07350382208824158, "step": 3032 }, { "epoch": 4.386117136659436, "grad_norm": 2.6024145275267627, "learning_rate": 1.4030345066813927e-06, "logits/chosen": -0.9356105327606201, "logits/rejected": -0.6585832834243774, "logps/chosen": -0.11158432811498642, "logps/rejected": -3.363126277923584, "loss": 0.0882, "odds_ratio_loss": 0.006204155273735523, "rewards/accuracies": 1.0, "rewards/chosen": -0.011158432811498642, "rewards/margins": 0.325154185295105, "rewards/rejected": -0.3363126218318939, "sft_loss": 0.11158432811498642, "step": 3033 }, { "epoch": 4.387563268257411, "grad_norm": 1.500933012566789, "learning_rate": 1.400673015742649e-06, "logits/chosen": -0.8368505239486694, "logits/rejected": -0.660033106803894, "logps/chosen": -0.07904787361621857, "logps/rejected": -3.45121169090271, "loss": 0.0637, "odds_ratio_loss": 0.009564665146172047, "rewards/accuracies": 1.0, "rewards/chosen": -0.007904788479208946, "rewards/margins": 0.3372163772583008, "rewards/rejected": -0.3451211750507355, "sft_loss": 0.07904787361621857, "step": 3034 }, { "epoch": 4.389009399855387, "grad_norm": 1.695864175616555, "learning_rate": 1.3983130919478248e-06, "logits/chosen": -0.908363401889801, "logits/rejected": -0.6203923225402832, "logps/chosen": -0.06778942048549652, "logps/rejected": -3.6424999237060547, "loss": 0.0752, "odds_ratio_loss": 0.005892588756978512, "rewards/accuracies": 1.0, "rewards/chosen": -0.006778942421078682, "rewards/margins": 0.3574710488319397, "rewards/rejected": -0.36424997448921204, "sft_loss": 0.06778942048549652, "step": 3035 }, { "epoch": 4.390455531453362, "grad_norm": 1.5718319470054316, "learning_rate": 1.3959547367197262e-06, "logits/chosen": -0.6267784833908081, "logits/rejected": -0.5574684739112854, "logps/chosen": -0.08968807756900787, "logps/rejected": -5.226409435272217, "loss": 0.0907, "odds_ratio_loss": 0.0179790947586298, "rewards/accuracies": 1.0, "rewards/chosen": -0.008968808688223362, "rewards/margins": 0.5136721134185791, "rewards/rejected": -0.5226409435272217, "sft_loss": 0.08968807756900787, "step": 3036 }, { "epoch": 4.391901663051337, "grad_norm": 1.5139362649763861, "learning_rate": 1.3935979514802166e-06, "logits/chosen": -0.628441572189331, "logits/rejected": -0.4122357964515686, "logps/chosen": -0.05464586615562439, "logps/rejected": -3.6563820838928223, "loss": 0.085, "odds_ratio_loss": 0.008319716900587082, "rewards/accuracies": 1.0, "rewards/chosen": -0.005464586429297924, "rewards/margins": 0.36017361283302307, "rewards/rejected": -0.36563819646835327, "sft_loss": 0.05464586615562439, "step": 3037 }, { "epoch": 4.393347794649313, "grad_norm": 1.4656706139150852, "learning_rate": 1.3912427376502075e-06, "logits/chosen": -0.8293111324310303, "logits/rejected": -0.5837162137031555, "logps/chosen": -0.04411669075489044, "logps/rejected": -4.333014011383057, "loss": 0.078, "odds_ratio_loss": 0.0032760649919509888, "rewards/accuracies": 1.0, "rewards/chosen": -0.004411669448018074, "rewards/margins": 0.4288897216320038, "rewards/rejected": -0.4333013892173767, "sft_loss": 0.04411669075489044, "step": 3038 }, { "epoch": 4.394793926247289, "grad_norm": 1.6543272906345592, "learning_rate": 1.3888890966496698e-06, "logits/chosen": -0.6298922300338745, "logits/rejected": -0.4950665831565857, "logps/chosen": -0.05027003213763237, "logps/rejected": -5.746697425842285, "loss": 0.0852, "odds_ratio_loss": 0.003421762026846409, "rewards/accuracies": 1.0, "rewards/chosen": -0.005027003586292267, "rewards/margins": 0.5696427226066589, "rewards/rejected": -0.5746697187423706, "sft_loss": 0.05027003213763237, "step": 3039 }, { "epoch": 4.396240057845264, "grad_norm": 1.7242446848815338, "learning_rate": 1.3865370298976188e-06, "logits/chosen": -0.7039051055908203, "logits/rejected": -0.6944054365158081, "logps/chosen": -0.07602061331272125, "logps/rejected": -3.5846824645996094, "loss": 0.0798, "odds_ratio_loss": 0.008702849969267845, "rewards/accuracies": 1.0, "rewards/chosen": -0.007602061610668898, "rewards/margins": 0.3508661985397339, "rewards/rejected": -0.35846826434135437, "sft_loss": 0.07602061331272125, "step": 3040 }, { "epoch": 4.397686189443239, "grad_norm": 1.5606431444503155, "learning_rate": 1.3841865388121275e-06, "logits/chosen": -0.6780179738998413, "logits/rejected": -0.5885179042816162, "logps/chosen": -0.1251131147146225, "logps/rejected": -4.6654558181762695, "loss": 0.085, "odds_ratio_loss": 0.02848890796303749, "rewards/accuracies": 1.0, "rewards/chosen": -0.01251131109893322, "rewards/margins": 0.4540342688560486, "rewards/rejected": -0.46654558181762695, "sft_loss": 0.1251131147146225, "step": 3041 }, { "epoch": 4.399132321041215, "grad_norm": 1.6872868354444532, "learning_rate": 1.3818376248103144e-06, "logits/chosen": -0.9445464611053467, "logits/rejected": -0.6897961497306824, "logps/chosen": -0.06516368687152863, "logps/rejected": -5.009978771209717, "loss": 0.0916, "odds_ratio_loss": 0.011260229162871838, "rewards/accuracies": 1.0, "rewards/chosen": -0.006516368594020605, "rewards/margins": 0.49448153376579285, "rewards/rejected": -0.5009979009628296, "sft_loss": 0.06516368687152863, "step": 3042 }, { "epoch": 4.4005784526391905, "grad_norm": 1.9080324274666887, "learning_rate": 1.3794902893083485e-06, "logits/chosen": -0.7691489458084106, "logits/rejected": -0.589622974395752, "logps/chosen": -0.09429274499416351, "logps/rejected": -3.665755033493042, "loss": 0.1116, "odds_ratio_loss": 0.0055620986968278885, "rewards/accuracies": 1.0, "rewards/chosen": -0.009429275058209896, "rewards/margins": 0.3571462631225586, "rewards/rejected": -0.3665755093097687, "sft_loss": 0.09429274499416351, "step": 3043 }, { "epoch": 4.402024584237165, "grad_norm": 1.5782764306018584, "learning_rate": 1.377144533721445e-06, "logits/chosen": -0.8291411399841309, "logits/rejected": -0.6172857880592346, "logps/chosen": -0.11644075810909271, "logps/rejected": -6.257889747619629, "loss": 0.0623, "odds_ratio_loss": 0.0051756082102656364, "rewards/accuracies": 1.0, "rewards/chosen": -0.011644075624644756, "rewards/margins": 0.6141449213027954, "rewards/rejected": -0.6257889866828918, "sft_loss": 0.11644075810909271, "step": 3044 }, { "epoch": 4.403470715835141, "grad_norm": 1.6116329655987056, "learning_rate": 1.3748003594638728e-06, "logits/chosen": -0.6932000517845154, "logits/rejected": -0.6144500374794006, "logps/chosen": -0.033836688846349716, "logps/rejected": -3.270808219909668, "loss": 0.0628, "odds_ratio_loss": 0.006877487525343895, "rewards/accuracies": 1.0, "rewards/chosen": -0.0033836690708994865, "rewards/margins": 0.32369717955589294, "rewards/rejected": -0.3270808458328247, "sft_loss": 0.033836688846349716, "step": 3045 }, { "epoch": 4.404916847433117, "grad_norm": 1.5976320593278748, "learning_rate": 1.3724577679489393e-06, "logits/chosen": -0.6334445476531982, "logits/rejected": -0.5385106801986694, "logps/chosen": -0.0667404979467392, "logps/rejected": -5.588997840881348, "loss": 0.0673, "odds_ratio_loss": 0.006370170041918755, "rewards/accuracies": 1.0, "rewards/chosen": -0.0066740503534674644, "rewards/margins": 0.5522257089614868, "rewards/rejected": -0.5588997602462769, "sft_loss": 0.0667404979467392, "step": 3046 }, { "epoch": 4.406362979031091, "grad_norm": 1.5601781729780566, "learning_rate": 1.3701167605890054e-06, "logits/chosen": -0.7392277717590332, "logits/rejected": -0.5148305296897888, "logps/chosen": -0.0697949156165123, "logps/rejected": -3.349868059158325, "loss": 0.0715, "odds_ratio_loss": 0.007275932468473911, "rewards/accuracies": 1.0, "rewards/chosen": -0.00697949156165123, "rewards/margins": 0.3280073404312134, "rewards/rejected": -0.3349868059158325, "sft_loss": 0.0697949156165123, "step": 3047 }, { "epoch": 4.407809110629067, "grad_norm": 1.6185824118809349, "learning_rate": 1.3677773387954696e-06, "logits/chosen": -0.7123594284057617, "logits/rejected": -0.3868655562400818, "logps/chosen": -0.10565149039030075, "logps/rejected": -7.379734992980957, "loss": 0.0901, "odds_ratio_loss": 0.00902761984616518, "rewards/accuracies": 1.0, "rewards/chosen": -0.010565148666501045, "rewards/margins": 0.7274083495140076, "rewards/rejected": -0.7379735112190247, "sft_loss": 0.10565149039030075, "step": 3048 }, { "epoch": 4.409255242227043, "grad_norm": 1.4439418842601006, "learning_rate": 1.3654395039787808e-06, "logits/chosen": -0.8884992599487305, "logits/rejected": -0.5133688449859619, "logps/chosen": -0.01734546385705471, "logps/rejected": -4.463611602783203, "loss": 0.0663, "odds_ratio_loss": 0.00035680370638146996, "rewards/accuracies": 1.0, "rewards/chosen": -0.001734546385705471, "rewards/margins": 0.44462665915489197, "rewards/rejected": -0.4463611841201782, "sft_loss": 0.01734546385705471, "step": 3049 }, { "epoch": 4.4107013738250185, "grad_norm": 1.4545245283187038, "learning_rate": 1.3631032575484276e-06, "logits/chosen": -0.7421780228614807, "logits/rejected": -0.8050009608268738, "logps/chosen": -0.12215909361839294, "logps/rejected": -2.9910664558410645, "loss": 0.0897, "odds_ratio_loss": 0.03483511507511139, "rewards/accuracies": 1.0, "rewards/chosen": -0.01221590954810381, "rewards/margins": 0.28689074516296387, "rewards/rejected": -0.2991066575050354, "sft_loss": 0.12215909361839294, "step": 3050 }, { "epoch": 4.412147505422993, "grad_norm": 1.7163056525259084, "learning_rate": 1.3607686009129395e-06, "logits/chosen": -0.8548030853271484, "logits/rejected": -0.7793415188789368, "logps/chosen": -0.10555724054574966, "logps/rejected": -6.055647373199463, "loss": 0.0987, "odds_ratio_loss": 0.013582490384578705, "rewards/accuracies": 1.0, "rewards/chosen": -0.010555723682045937, "rewards/margins": 0.5950090289115906, "rewards/rejected": -0.6055647730827332, "sft_loss": 0.10555724054574966, "step": 3051 }, { "epoch": 4.413593637020969, "grad_norm": 1.8039202268622319, "learning_rate": 1.3584355354798933e-06, "logits/chosen": -0.894365668296814, "logits/rejected": -0.652009129524231, "logps/chosen": -0.10110814869403839, "logps/rejected": -5.039724826812744, "loss": 0.0822, "odds_ratio_loss": 0.007859394885599613, "rewards/accuracies": 1.0, "rewards/chosen": -0.010110815986990929, "rewards/margins": 0.49386167526245117, "rewards/rejected": -0.5039725303649902, "sft_loss": 0.10110814869403839, "step": 3052 }, { "epoch": 4.415039768618945, "grad_norm": 1.5687944501539148, "learning_rate": 1.3561040626558993e-06, "logits/chosen": -0.8314445614814758, "logits/rejected": -0.5648901462554932, "logps/chosen": -0.12590952217578888, "logps/rejected": -5.708104610443115, "loss": 0.0936, "odds_ratio_loss": 0.007341315969824791, "rewards/accuracies": 1.0, "rewards/chosen": -0.012590952217578888, "rewards/margins": 0.5582195520401001, "rewards/rejected": -0.5708104372024536, "sft_loss": 0.12590952217578888, "step": 3053 }, { "epoch": 4.4164859002169194, "grad_norm": 1.7372836321558947, "learning_rate": 1.3537741838466144e-06, "logits/chosen": -0.753945529460907, "logits/rejected": -0.6398801207542419, "logps/chosen": -0.10332097113132477, "logps/rejected": -3.84722900390625, "loss": 0.093, "odds_ratio_loss": 0.014396404847502708, "rewards/accuracies": 1.0, "rewards/chosen": -0.010332096368074417, "rewards/margins": 0.3743908405303955, "rewards/rejected": -0.38472288846969604, "sft_loss": 0.10332097113132477, "step": 3054 }, { "epoch": 4.417932031814895, "grad_norm": 1.653094550223046, "learning_rate": 1.3514459004567282e-06, "logits/chosen": -0.7897300124168396, "logits/rejected": -0.7482852339744568, "logps/chosen": -0.046823035925626755, "logps/rejected": -5.072388648986816, "loss": 0.0919, "odds_ratio_loss": 0.005791356787085533, "rewards/accuracies": 1.0, "rewards/chosen": -0.0046823034062981606, "rewards/margins": 0.502556562423706, "rewards/rejected": -0.5072388648986816, "sft_loss": 0.046823035925626755, "step": 3055 }, { "epoch": 4.419378163412871, "grad_norm": 1.7870939070781757, "learning_rate": 1.3491192138899746e-06, "logits/chosen": -0.7878323793411255, "logits/rejected": -0.6233074069023132, "logps/chosen": -0.07858381420373917, "logps/rejected": -5.563356399536133, "loss": 0.0799, "odds_ratio_loss": 0.020533833652734756, "rewards/accuracies": 1.0, "rewards/chosen": -0.007858381606638432, "rewards/margins": 0.5484772324562073, "rewards/rejected": -0.5563355684280396, "sft_loss": 0.07858381420373917, "step": 3056 }, { "epoch": 4.420824295010846, "grad_norm": 1.4064986944225257, "learning_rate": 1.3467941255491191e-06, "logits/chosen": -0.6935770511627197, "logits/rejected": -0.5368286967277527, "logps/chosen": -0.061862025409936905, "logps/rejected": -6.563525199890137, "loss": 0.0847, "odds_ratio_loss": 0.005179527681320906, "rewards/accuracies": 1.0, "rewards/chosen": -0.006186202168464661, "rewards/margins": 0.6501663327217102, "rewards/rejected": -0.6563525795936584, "sft_loss": 0.061862025409936905, "step": 3057 }, { "epoch": 4.422270426608821, "grad_norm": 1.4615415491387482, "learning_rate": 1.3444706368359673e-06, "logits/chosen": -0.6259520053863525, "logits/rejected": -0.5293348431587219, "logps/chosen": -0.051740460097789764, "logps/rejected": -3.2695064544677734, "loss": 0.0548, "odds_ratio_loss": 0.004323053639382124, "rewards/accuracies": 1.0, "rewards/chosen": -0.005174045916646719, "rewards/margins": 0.3217766284942627, "rewards/rejected": -0.32695066928863525, "sft_loss": 0.051740460097789764, "step": 3058 }, { "epoch": 4.423716558206797, "grad_norm": 2.5391363837179877, "learning_rate": 1.3421487491513577e-06, "logits/chosen": -0.7854698300361633, "logits/rejected": -0.6050273180007935, "logps/chosen": -0.06993037462234497, "logps/rejected": -6.034948348999023, "loss": 0.1104, "odds_ratio_loss": 0.009069676510989666, "rewards/accuracies": 1.0, "rewards/chosen": -0.006993037648499012, "rewards/margins": 0.5965018272399902, "rewards/rejected": -0.6034948229789734, "sft_loss": 0.06993037462234497, "step": 3059 }, { "epoch": 4.425162689804772, "grad_norm": 1.6143254545828654, "learning_rate": 1.3398284638951674e-06, "logits/chosen": -0.8341240882873535, "logits/rejected": -0.8235796093940735, "logps/chosen": -0.11071653664112091, "logps/rejected": -3.5441067218780518, "loss": 0.0679, "odds_ratio_loss": 0.009739990346133709, "rewards/accuracies": 1.0, "rewards/chosen": -0.011071654036641121, "rewards/margins": 0.3433390259742737, "rewards/rejected": -0.35441067814826965, "sft_loss": 0.11071653664112091, "step": 3060 }, { "epoch": 4.4266088214027475, "grad_norm": 1.7933967896292562, "learning_rate": 1.3375097824663022e-06, "logits/chosen": -0.7039732336997986, "logits/rejected": -0.5318971872329712, "logps/chosen": -0.08936135470867157, "logps/rejected": -5.843852519989014, "loss": 0.1061, "odds_ratio_loss": 0.014453625306487083, "rewards/accuracies": 1.0, "rewards/chosen": -0.008936136029660702, "rewards/margins": 0.5754491686820984, "rewards/rejected": -0.5843852162361145, "sft_loss": 0.08936135470867157, "step": 3061 }, { "epoch": 4.428054953000723, "grad_norm": 1.739041043864036, "learning_rate": 1.3351927062627053e-06, "logits/chosen": -0.817838728427887, "logits/rejected": -0.5577576160430908, "logps/chosen": -0.07707401365041733, "logps/rejected": -4.251495361328125, "loss": 0.1131, "odds_ratio_loss": 0.008257195353507996, "rewards/accuracies": 1.0, "rewards/chosen": -0.007707401178777218, "rewards/margins": 0.41744211316108704, "rewards/rejected": -0.425149530172348, "sft_loss": 0.07707401365041733, "step": 3062 }, { "epoch": 4.429501084598699, "grad_norm": 1.4160499745882797, "learning_rate": 1.332877236681352e-06, "logits/chosen": -0.8466053009033203, "logits/rejected": -0.5242434740066528, "logps/chosen": -0.10801158845424652, "logps/rejected": -3.6215646266937256, "loss": 0.1005, "odds_ratio_loss": 0.00946379266679287, "rewards/accuracies": 1.0, "rewards/chosen": -0.010801158845424652, "rewards/margins": 0.35135528445243835, "rewards/rejected": -0.3621564507484436, "sft_loss": 0.10801158845424652, "step": 3063 }, { "epoch": 4.430947216196674, "grad_norm": 1.5466882379686322, "learning_rate": 1.330563375118245e-06, "logits/chosen": -0.7832115292549133, "logits/rejected": -0.536728024482727, "logps/chosen": -0.07185237854719162, "logps/rejected": -4.798366546630859, "loss": 0.066, "odds_ratio_loss": 0.0033283275552093983, "rewards/accuracies": 1.0, "rewards/chosen": -0.007185238413512707, "rewards/margins": 0.4726513922214508, "rewards/rejected": -0.479836642742157, "sft_loss": 0.07185237854719162, "step": 3064 }, { "epoch": 4.432393347794649, "grad_norm": 1.4196276378868817, "learning_rate": 1.3282511229684223e-06, "logits/chosen": -0.6810212135314941, "logits/rejected": -0.709541916847229, "logps/chosen": -0.059914518147706985, "logps/rejected": -4.5397047996521, "loss": 0.0743, "odds_ratio_loss": 0.01058815699070692, "rewards/accuracies": 1.0, "rewards/chosen": -0.005991451907902956, "rewards/margins": 0.44797906279563904, "rewards/rejected": -0.4539704918861389, "sft_loss": 0.059914518147706985, "step": 3065 }, { "epoch": 4.433839479392625, "grad_norm": 1.3472066677998193, "learning_rate": 1.3259404816259481e-06, "logits/chosen": -0.751527726650238, "logits/rejected": -0.6476584076881409, "logps/chosen": -0.07095544040203094, "logps/rejected": -3.7621631622314453, "loss": 0.0746, "odds_ratio_loss": 0.0034617676865309477, "rewards/accuracies": 1.0, "rewards/chosen": -0.007095544598996639, "rewards/margins": 0.3691207766532898, "rewards/rejected": -0.3762163519859314, "sft_loss": 0.07095544040203094, "step": 3066 }, { "epoch": 4.4352856109906, "grad_norm": 1.628631458493376, "learning_rate": 1.3236314524839172e-06, "logits/chosen": -0.8099523782730103, "logits/rejected": -0.6404974460601807, "logps/chosen": -0.09229202568531036, "logps/rejected": -3.195460081100464, "loss": 0.0832, "odds_ratio_loss": 0.007787358481436968, "rewards/accuracies": 1.0, "rewards/chosen": -0.009229202754795551, "rewards/margins": 0.310316801071167, "rewards/rejected": -0.31954601407051086, "sft_loss": 0.09229202568531036, "step": 3067 }, { "epoch": 4.4367317425885755, "grad_norm": 1.4118554915639228, "learning_rate": 1.3213240369344498e-06, "logits/chosen": -0.8080687522888184, "logits/rejected": -0.50105881690979, "logps/chosen": -0.04950429126620293, "logps/rejected": -4.707045555114746, "loss": 0.0551, "odds_ratio_loss": 0.006314246449619532, "rewards/accuracies": 1.0, "rewards/chosen": -0.004950429312884808, "rewards/margins": 0.4657541513442993, "rewards/rejected": -0.4707045555114746, "sft_loss": 0.04950429126620293, "step": 3068 }, { "epoch": 4.438177874186551, "grad_norm": 1.409611417063734, "learning_rate": 1.319018236368698e-06, "logits/chosen": -0.8788902759552002, "logits/rejected": -0.6900272369384766, "logps/chosen": -0.04148077964782715, "logps/rejected": -3.2429776191711426, "loss": 0.0471, "odds_ratio_loss": 0.005179783795028925, "rewards/accuracies": 1.0, "rewards/chosen": -0.00414807815104723, "rewards/margins": 0.320149689912796, "rewards/rejected": -0.32429778575897217, "sft_loss": 0.04148077964782715, "step": 3069 }, { "epoch": 4.439624005784526, "grad_norm": 2.262076676746834, "learning_rate": 1.3167140521768359e-06, "logits/chosen": -0.9927254915237427, "logits/rejected": -0.6981010437011719, "logps/chosen": -0.08163347840309143, "logps/rejected": -5.204900741577148, "loss": 0.069, "odds_ratio_loss": 0.006048885174095631, "rewards/accuracies": 1.0, "rewards/chosen": -0.008163347840309143, "rewards/margins": 0.512326717376709, "rewards/rejected": -0.5204900503158569, "sft_loss": 0.08163347840309143, "step": 3070 }, { "epoch": 4.441070137382502, "grad_norm": 1.593186487495553, "learning_rate": 1.3144114857480664e-06, "logits/chosen": -0.8842880129814148, "logits/rejected": -0.6513842344284058, "logps/chosen": -0.06474480032920837, "logps/rejected": -3.6384944915771484, "loss": 0.0741, "odds_ratio_loss": 0.0073405965231359005, "rewards/accuracies": 1.0, "rewards/chosen": -0.00647447956725955, "rewards/margins": 0.35737502574920654, "rewards/rejected": -0.3638494610786438, "sft_loss": 0.06474480032920837, "step": 3071 }, { "epoch": 4.442516268980477, "grad_norm": 3.063001386453148, "learning_rate": 1.312110538470613e-06, "logits/chosen": -0.8223741054534912, "logits/rejected": -0.6300759315490723, "logps/chosen": -0.08572448790073395, "logps/rejected": -5.912900447845459, "loss": 0.1048, "odds_ratio_loss": 0.04846544191241264, "rewards/accuracies": 0.9375, "rewards/chosen": -0.00857244897633791, "rewards/margins": 0.5827176570892334, "rewards/rejected": -0.5912900567054749, "sft_loss": 0.08572448790073395, "step": 3072 }, { "epoch": 4.443962400578453, "grad_norm": 1.638978270409598, "learning_rate": 1.3098112117317279e-06, "logits/chosen": -0.6799906492233276, "logits/rejected": -0.49231547117233276, "logps/chosen": -0.08531919121742249, "logps/rejected": -5.583248615264893, "loss": 0.0791, "odds_ratio_loss": 0.006705442443490028, "rewards/accuracies": 1.0, "rewards/chosen": -0.008531919680535793, "rewards/margins": 0.5497928857803345, "rewards/rejected": -0.5583248138427734, "sft_loss": 0.08531919121742249, "step": 3073 }, { "epoch": 4.445408532176428, "grad_norm": 2.1057000892118083, "learning_rate": 1.307513506917683e-06, "logits/chosen": -0.761457085609436, "logits/rejected": -0.5390462875366211, "logps/chosen": -0.1640089601278305, "logps/rejected": -5.138252258300781, "loss": 0.1403, "odds_ratio_loss": 0.014966591261327267, "rewards/accuracies": 1.0, "rewards/chosen": -0.01640089601278305, "rewards/margins": 0.4974243640899658, "rewards/rejected": -0.5138252973556519, "sft_loss": 0.1640089601278305, "step": 3074 }, { "epoch": 4.4468546637744035, "grad_norm": 1.4700041819270353, "learning_rate": 1.3052174254137712e-06, "logits/chosen": -0.6377707719802856, "logits/rejected": -0.5121666789054871, "logps/chosen": -0.024024605751037598, "logps/rejected": -4.902261257171631, "loss": 0.0515, "odds_ratio_loss": 0.0034545832313597202, "rewards/accuracies": 1.0, "rewards/chosen": -0.0024024604354053736, "rewards/margins": 0.4878236651420593, "rewards/rejected": -0.490226149559021, "sft_loss": 0.024024605751037598, "step": 3075 }, { "epoch": 4.448300795372379, "grad_norm": 1.5186502267359456, "learning_rate": 1.3029229686043111e-06, "logits/chosen": -0.8834959268569946, "logits/rejected": -0.5727044343948364, "logps/chosen": -0.036372192203998566, "logps/rejected": -6.26273775100708, "loss": 0.0695, "odds_ratio_loss": 0.0046208943240344524, "rewards/accuracies": 1.0, "rewards/chosen": -0.003637219313532114, "rewards/margins": 0.6226365566253662, "rewards/rejected": -0.6262738108634949, "sft_loss": 0.036372192203998566, "step": 3076 }, { "epoch": 4.449746926970354, "grad_norm": 1.5327777476835118, "learning_rate": 1.300630137872637e-06, "logits/chosen": -0.7297614812850952, "logits/rejected": -0.5649856328964233, "logps/chosen": -0.0725630521774292, "logps/rejected": -5.728961944580078, "loss": 0.0704, "odds_ratio_loss": 0.007540358696132898, "rewards/accuracies": 1.0, "rewards/chosen": -0.007256305776536465, "rewards/margins": 0.5656399130821228, "rewards/rejected": -0.5728961825370789, "sft_loss": 0.0725630521774292, "step": 3077 }, { "epoch": 4.45119305856833, "grad_norm": 1.486418897043296, "learning_rate": 1.2983389346011079e-06, "logits/chosen": -0.6607635617256165, "logits/rejected": -0.6320513486862183, "logps/chosen": -0.0793953388929367, "logps/rejected": -6.55406379699707, "loss": 0.0727, "odds_ratio_loss": 0.01003860030323267, "rewards/accuracies": 1.0, "rewards/chosen": -0.007939533330500126, "rewards/margins": 0.6474668979644775, "rewards/rejected": -0.6554064154624939, "sft_loss": 0.0793953388929367, "step": 3078 }, { "epoch": 4.452639190166305, "grad_norm": 1.5748827094661648, "learning_rate": 1.2960493601710956e-06, "logits/chosen": -0.8328260183334351, "logits/rejected": -0.5687087774276733, "logps/chosen": -0.09150678664445877, "logps/rejected": -4.2515997886657715, "loss": 0.0911, "odds_ratio_loss": 0.006226380355656147, "rewards/accuracies": 1.0, "rewards/chosen": -0.009150679223239422, "rewards/margins": 0.4160093069076538, "rewards/rejected": -0.4251599907875061, "sft_loss": 0.09150678664445877, "step": 3079 }, { "epoch": 4.45408532176428, "grad_norm": 1.5380901831808957, "learning_rate": 1.293761415962996e-06, "logits/chosen": -0.7865862846374512, "logits/rejected": -0.6210229396820068, "logps/chosen": -0.03755870461463928, "logps/rejected": -5.537697792053223, "loss": 0.0941, "odds_ratio_loss": 0.005131867248564959, "rewards/accuracies": 1.0, "rewards/chosen": -0.0037558707408607006, "rewards/margins": 0.5500138998031616, "rewards/rejected": -0.5537697672843933, "sft_loss": 0.03755870461463928, "step": 3080 }, { "epoch": 4.455531453362256, "grad_norm": 1.2266009128818411, "learning_rate": 1.2914751033562178e-06, "logits/chosen": -0.8513956069946289, "logits/rejected": -0.5696482062339783, "logps/chosen": -0.08961950242519379, "logps/rejected": -5.04107141494751, "loss": 0.0765, "odds_ratio_loss": 0.0022056095767766237, "rewards/accuracies": 1.0, "rewards/chosen": -0.008961950428783894, "rewards/margins": 0.49514520168304443, "rewards/rejected": -0.5041071772575378, "sft_loss": 0.08961950242519379, "step": 3081 }, { "epoch": 4.4569775849602316, "grad_norm": 1.4910928711147577, "learning_rate": 1.2891904237291873e-06, "logits/chosen": -0.6649074554443359, "logits/rejected": -0.6417031288146973, "logps/chosen": -0.053086064755916595, "logps/rejected": -2.805670976638794, "loss": 0.1115, "odds_ratio_loss": 0.018563320860266685, "rewards/accuracies": 1.0, "rewards/chosen": -0.005308606196194887, "rewards/margins": 0.27525848150253296, "rewards/rejected": -0.28056707978248596, "sft_loss": 0.053086064755916595, "step": 3082 }, { "epoch": 4.458423716558206, "grad_norm": 1.5262137978721781, "learning_rate": 1.2869073784593453e-06, "logits/chosen": -0.7511852979660034, "logits/rejected": -0.6438009142875671, "logps/chosen": -0.08350561559200287, "logps/rejected": -3.083378553390503, "loss": 0.0719, "odds_ratio_loss": 0.012275271117687225, "rewards/accuracies": 1.0, "rewards/chosen": -0.008350562304258347, "rewards/margins": 0.2999872863292694, "rewards/rejected": -0.30833783745765686, "sft_loss": 0.08350561559200287, "step": 3083 }, { "epoch": 4.459869848156182, "grad_norm": 1.3219017121775374, "learning_rate": 1.2846259689231506e-06, "logits/chosen": -0.8914992809295654, "logits/rejected": -0.5301820039749146, "logps/chosen": -0.057186149060726166, "logps/rejected": -5.1810760498046875, "loss": 0.0701, "odds_ratio_loss": 0.0078020961955189705, "rewards/accuracies": 1.0, "rewards/chosen": -0.005718615371733904, "rewards/margins": 0.512389063835144, "rewards/rejected": -0.5181075930595398, "sft_loss": 0.057186149060726166, "step": 3084 }, { "epoch": 4.461315979754158, "grad_norm": 2.145105467061074, "learning_rate": 1.2823461964960713e-06, "logits/chosen": -0.837430477142334, "logits/rejected": -0.699951171875, "logps/chosen": -0.1344883143901825, "logps/rejected": -3.640721321105957, "loss": 0.1181, "odds_ratio_loss": 0.015670428052544594, "rewards/accuracies": 1.0, "rewards/chosen": -0.01344883069396019, "rewards/margins": 0.3506232500076294, "rewards/rejected": -0.36407211422920227, "sft_loss": 0.1344883143901825, "step": 3085 }, { "epoch": 4.462762111352133, "grad_norm": 1.7994304164534771, "learning_rate": 1.2800680625525933e-06, "logits/chosen": -0.7815194129943848, "logits/rejected": -0.45796847343444824, "logps/chosen": -0.08412434160709381, "logps/rejected": -6.533374786376953, "loss": 0.0596, "odds_ratio_loss": 0.0032411282882094383, "rewards/accuracies": 1.0, "rewards/chosen": -0.008412433788180351, "rewards/margins": 0.6449249982833862, "rewards/rejected": -0.6533374190330505, "sft_loss": 0.08412434160709381, "step": 3086 }, { "epoch": 4.464208242950108, "grad_norm": 1.5619771864210943, "learning_rate": 1.2777915684662088e-06, "logits/chosen": -0.7581502199172974, "logits/rejected": -0.6802593469619751, "logps/chosen": -0.15440955758094788, "logps/rejected": -4.527318477630615, "loss": 0.0886, "odds_ratio_loss": 0.02079537883400917, "rewards/accuracies": 1.0, "rewards/chosen": -0.015440955758094788, "rewards/margins": 0.4372909367084503, "rewards/rejected": -0.4527318775653839, "sft_loss": 0.15440955758094788, "step": 3087 }, { "epoch": 4.465654374548084, "grad_norm": 2.034043709736984, "learning_rate": 1.2755167156094278e-06, "logits/chosen": -0.670623242855072, "logits/rejected": -0.5299463272094727, "logps/chosen": -0.06610743701457977, "logps/rejected": -4.180477142333984, "loss": 0.0689, "odds_ratio_loss": 0.005267998669296503, "rewards/accuracies": 1.0, "rewards/chosen": -0.006610743701457977, "rewards/margins": 0.41143694519996643, "rewards/rejected": -0.418047696352005, "sft_loss": 0.06610743701457977, "step": 3088 }, { "epoch": 4.46710050614606, "grad_norm": 1.7924040631150486, "learning_rate": 1.2732435053537657e-06, "logits/chosen": -1.0045921802520752, "logits/rejected": -0.7713348269462585, "logps/chosen": -0.04516824334859848, "logps/rejected": -5.035297870635986, "loss": 0.063, "odds_ratio_loss": 0.0038515704218298197, "rewards/accuracies": 1.0, "rewards/chosen": -0.004516824148595333, "rewards/margins": 0.49901291728019714, "rewards/rejected": -0.5035297870635986, "sft_loss": 0.04516824334859848, "step": 3089 }, { "epoch": 4.468546637744034, "grad_norm": 1.6682105497831299, "learning_rate": 1.2709719390697484e-06, "logits/chosen": -0.6681150794029236, "logits/rejected": -0.5994194746017456, "logps/chosen": -0.08257712423801422, "logps/rejected": -4.70539665222168, "loss": 0.058, "odds_ratio_loss": 0.010448940098285675, "rewards/accuracies": 1.0, "rewards/chosen": -0.008257712237536907, "rewards/margins": 0.4622820019721985, "rewards/rejected": -0.4705396592617035, "sft_loss": 0.08257712423801422, "step": 3090 }, { "epoch": 4.46999276934201, "grad_norm": 1.7738282748311598, "learning_rate": 1.2687020181269147e-06, "logits/chosen": -0.7731041312217712, "logits/rejected": -0.6896120309829712, "logps/chosen": -0.13104453682899475, "logps/rejected": -5.16012716293335, "loss": 0.0768, "odds_ratio_loss": 0.023107830435037613, "rewards/accuracies": 1.0, "rewards/chosen": -0.0131044527515769, "rewards/margins": 0.5029082298278809, "rewards/rejected": -0.5160127878189087, "sft_loss": 0.13104453682899475, "step": 3091 }, { "epoch": 4.471438900939986, "grad_norm": 1.6843927662054177, "learning_rate": 1.2664337438938052e-06, "logits/chosen": -0.8892203569412231, "logits/rejected": -0.742080807685852, "logps/chosen": -0.12089746445417404, "logps/rejected": -4.260397911071777, "loss": 0.0922, "odds_ratio_loss": 0.012745104730129242, "rewards/accuracies": 1.0, "rewards/chosen": -0.012089746072888374, "rewards/margins": 0.41395002603530884, "rewards/rejected": -0.42603975534439087, "sft_loss": 0.12089746445417404, "step": 3092 }, { "epoch": 4.4728850325379605, "grad_norm": 1.5276591685861431, "learning_rate": 1.264167117737974e-06, "logits/chosen": -0.7408867478370667, "logits/rejected": -0.5432535409927368, "logps/chosen": -0.055726416409015656, "logps/rejected": -5.040454387664795, "loss": 0.0677, "odds_ratio_loss": 0.01312476396560669, "rewards/accuracies": 1.0, "rewards/chosen": -0.00557264219969511, "rewards/margins": 0.49847283959388733, "rewards/rejected": -0.5040454268455505, "sft_loss": 0.055726416409015656, "step": 3093 }, { "epoch": 4.474331164135936, "grad_norm": 1.6530262946310774, "learning_rate": 1.2619021410259749e-06, "logits/chosen": -0.705237627029419, "logits/rejected": -0.39580756425857544, "logps/chosen": -0.06976450234651566, "logps/rejected": -4.587730884552002, "loss": 0.0955, "odds_ratio_loss": 0.003844949882477522, "rewards/accuracies": 1.0, "rewards/chosen": -0.006976449862122536, "rewards/margins": 0.45179668068885803, "rewards/rejected": -0.458773136138916, "sft_loss": 0.06976450234651566, "step": 3094 }, { "epoch": 4.475777295733912, "grad_norm": 1.8816842919449692, "learning_rate": 1.2596388151233749e-06, "logits/chosen": -0.6847041845321655, "logits/rejected": -0.5424312353134155, "logps/chosen": -0.09019088745117188, "logps/rejected": -5.533663749694824, "loss": 0.1015, "odds_ratio_loss": 0.006966853979974985, "rewards/accuracies": 1.0, "rewards/chosen": -0.009019088931381702, "rewards/margins": 0.54434734582901, "rewards/rejected": -0.5533663630485535, "sft_loss": 0.09019088745117188, "step": 3095 }, { "epoch": 4.477223427331888, "grad_norm": 1.5343422977668435, "learning_rate": 1.2573771413947385e-06, "logits/chosen": -0.8714608550071716, "logits/rejected": -0.5927603244781494, "logps/chosen": -0.04954008013010025, "logps/rejected": -5.682701587677002, "loss": 0.0522, "odds_ratio_loss": 0.005804221611469984, "rewards/accuracies": 1.0, "rewards/chosen": -0.00495400745421648, "rewards/margins": 0.5633162260055542, "rewards/rejected": -0.568270206451416, "sft_loss": 0.04954008013010025, "step": 3096 }, { "epoch": 4.478669558929862, "grad_norm": 1.3475022711559164, "learning_rate": 1.2551171212036388e-06, "logits/chosen": -0.8582381010055542, "logits/rejected": -0.744134783744812, "logps/chosen": -0.05268469080328941, "logps/rejected": -4.953915596008301, "loss": 0.0539, "odds_ratio_loss": 0.005596311762928963, "rewards/accuracies": 1.0, "rewards/chosen": -0.005268468987196684, "rewards/margins": 0.49012309312820435, "rewards/rejected": -0.4953915476799011, "sft_loss": 0.05268469080328941, "step": 3097 }, { "epoch": 4.480115690527838, "grad_norm": 1.6798491379007103, "learning_rate": 1.2528587559126482e-06, "logits/chosen": -0.7833631038665771, "logits/rejected": -0.5663323998451233, "logps/chosen": -0.04492802917957306, "logps/rejected": -3.8920435905456543, "loss": 0.0682, "odds_ratio_loss": 0.002451021457090974, "rewards/accuracies": 1.0, "rewards/chosen": -0.004492803476750851, "rewards/margins": 0.3847115635871887, "rewards/rejected": -0.38920438289642334, "sft_loss": 0.04492802917957306, "step": 3098 }, { "epoch": 4.481561822125814, "grad_norm": 1.4371191416192433, "learning_rate": 1.2506020468833467e-06, "logits/chosen": -0.8777024149894714, "logits/rejected": -0.6330994963645935, "logps/chosen": -0.1080128401517868, "logps/rejected": -5.859587669372559, "loss": 0.0664, "odds_ratio_loss": 0.007772314827889204, "rewards/accuracies": 1.0, "rewards/chosen": -0.010801284573972225, "rewards/margins": 0.5751575231552124, "rewards/rejected": -0.5859587788581848, "sft_loss": 0.1080128401517868, "step": 3099 }, { "epoch": 4.483007953723789, "grad_norm": 1.6584298264925346, "learning_rate": 1.2483469954763096e-06, "logits/chosen": -0.7187407612800598, "logits/rejected": -0.6401557326316833, "logps/chosen": -0.19415581226348877, "logps/rejected": -5.682297706604004, "loss": 0.1093, "odds_ratio_loss": 0.006913043558597565, "rewards/accuracies": 1.0, "rewards/chosen": -0.019415581598877907, "rewards/margins": 0.5488142371177673, "rewards/rejected": -0.5682297945022583, "sft_loss": 0.19415581226348877, "step": 3100 }, { "epoch": 4.484454085321764, "grad_norm": 1.490297451192404, "learning_rate": 1.2460936030511184e-06, "logits/chosen": -1.0072453022003174, "logits/rejected": -0.7015312314033508, "logps/chosen": -0.04474491626024246, "logps/rejected": -3.6979777812957764, "loss": 0.062, "odds_ratio_loss": 0.0034636668860912323, "rewards/accuracies": 1.0, "rewards/chosen": -0.0044744922779500484, "rewards/margins": 0.3653232455253601, "rewards/rejected": -0.36979779601097107, "sft_loss": 0.04474491626024246, "step": 3101 }, { "epoch": 4.48590021691974, "grad_norm": 1.4188491354073367, "learning_rate": 1.2438418709663489e-06, "logits/chosen": -0.6652334928512573, "logits/rejected": -0.4683679938316345, "logps/chosen": -0.0296041090041399, "logps/rejected": -4.450751304626465, "loss": 0.0804, "odds_ratio_loss": 0.0026001809164881706, "rewards/accuracies": 1.0, "rewards/chosen": -0.0029604113660752773, "rewards/margins": 0.4421147406101227, "rewards/rejected": -0.445075124502182, "sft_loss": 0.0296041090041399, "step": 3102 }, { "epoch": 4.487346348517715, "grad_norm": 1.6423753492513558, "learning_rate": 1.2415918005795823e-06, "logits/chosen": -0.928438663482666, "logits/rejected": -0.5258166193962097, "logps/chosen": -0.12412159889936447, "logps/rejected": -5.318241119384766, "loss": 0.0877, "odds_ratio_loss": 0.0032795346342027187, "rewards/accuracies": 1.0, "rewards/chosen": -0.012412158772349358, "rewards/margins": 0.5194119215011597, "rewards/rejected": -0.5318241119384766, "sft_loss": 0.12412159889936447, "step": 3103 }, { "epoch": 4.48879248011569, "grad_norm": 2.2357703682444248, "learning_rate": 1.2393433932473922e-06, "logits/chosen": -0.8368874192237854, "logits/rejected": -0.5514189004898071, "logps/chosen": -0.0364052839577198, "logps/rejected": -4.619117736816406, "loss": 0.0767, "odds_ratio_loss": 0.0018499845173209906, "rewards/accuracies": 1.0, "rewards/chosen": -0.003640528302639723, "rewards/margins": 0.4582712650299072, "rewards/rejected": -0.46191176772117615, "sft_loss": 0.0364052839577198, "step": 3104 }, { "epoch": 4.490238611713666, "grad_norm": 1.475287637368776, "learning_rate": 1.237096650325351e-06, "logits/chosen": -1.124525785446167, "logits/rejected": -0.7751089334487915, "logps/chosen": -0.0581241250038147, "logps/rejected": -4.338222503662109, "loss": 0.0609, "odds_ratio_loss": 0.005026786122471094, "rewards/accuracies": 1.0, "rewards/chosen": -0.005812412593513727, "rewards/margins": 0.4280098080635071, "rewards/rejected": -0.4338222146034241, "sft_loss": 0.0581241250038147, "step": 3105 }, { "epoch": 4.491684743311641, "grad_norm": 1.457490704264928, "learning_rate": 1.2348515731680306e-06, "logits/chosen": -0.7537250518798828, "logits/rejected": -0.6656769514083862, "logps/chosen": -0.15097872912883759, "logps/rejected": -4.950982093811035, "loss": 0.0875, "odds_ratio_loss": 0.01871381886303425, "rewards/accuracies": 1.0, "rewards/chosen": -0.015097874216735363, "rewards/margins": 0.480000376701355, "rewards/rejected": -0.4950982332229614, "sft_loss": 0.15097872912883759, "step": 3106 }, { "epoch": 4.493130874909617, "grad_norm": 1.4918833440545058, "learning_rate": 1.2326081631289941e-06, "logits/chosen": -0.787826657295227, "logits/rejected": -0.6312362551689148, "logps/chosen": -0.07020243257284164, "logps/rejected": -6.140321731567383, "loss": 0.0824, "odds_ratio_loss": 0.011244947090744972, "rewards/accuracies": 1.0, "rewards/chosen": -0.007020242977887392, "rewards/margins": 0.6070119142532349, "rewards/rejected": -0.6140321493148804, "sft_loss": 0.07020243257284164, "step": 3107 }, { "epoch": 4.494577006507592, "grad_norm": 1.9028191693388976, "learning_rate": 1.230366421560804e-06, "logits/chosen": -0.8860039710998535, "logits/rejected": -0.6597675681114197, "logps/chosen": -0.05819562450051308, "logps/rejected": -4.343313694000244, "loss": 0.0756, "odds_ratio_loss": 0.009293823502957821, "rewards/accuracies": 1.0, "rewards/chosen": -0.00581956235691905, "rewards/margins": 0.4285118281841278, "rewards/rejected": -0.43433141708374023, "sft_loss": 0.05819562450051308, "step": 3108 }, { "epoch": 4.496023138105568, "grad_norm": 1.6718646559326362, "learning_rate": 1.2281263498150125e-06, "logits/chosen": -0.8646371960639954, "logits/rejected": -0.9306697845458984, "logps/chosen": -0.06505996733903885, "logps/rejected": -6.015259265899658, "loss": 0.0827, "odds_ratio_loss": 0.0021365510765463114, "rewards/accuracies": 1.0, "rewards/chosen": -0.006505997385829687, "rewards/margins": 0.5950199365615845, "rewards/rejected": -0.6015259027481079, "sft_loss": 0.06505996733903885, "step": 3109 }, { "epoch": 4.497469269703543, "grad_norm": 1.4464144348963581, "learning_rate": 1.2258879492421695e-06, "logits/chosen": -0.8180409669876099, "logits/rejected": -0.5849642753601074, "logps/chosen": -0.08725761622190475, "logps/rejected": -6.156432151794434, "loss": 0.074, "odds_ratio_loss": 0.002803635550662875, "rewards/accuracies": 1.0, "rewards/chosen": -0.00872576143592596, "rewards/margins": 0.6069174408912659, "rewards/rejected": -0.6156432032585144, "sft_loss": 0.08725761622190475, "step": 3110 }, { "epoch": 4.4989154013015185, "grad_norm": 1.603659515241102, "learning_rate": 1.2236512211918125e-06, "logits/chosen": -0.7529808282852173, "logits/rejected": -0.6013559103012085, "logps/chosen": -0.06795186549425125, "logps/rejected": -5.375178813934326, "loss": 0.0702, "odds_ratio_loss": 0.009950753301382065, "rewards/accuracies": 1.0, "rewards/chosen": -0.006795186549425125, "rewards/margins": 0.5307227373123169, "rewards/rejected": -0.5375179052352905, "sft_loss": 0.06795186549425125, "step": 3111 }, { "epoch": 4.500361532899494, "grad_norm": 1.6272200025476848, "learning_rate": 1.2214161670124767e-06, "logits/chosen": -0.8545068502426147, "logits/rejected": -0.6369377970695496, "logps/chosen": -0.06836562603712082, "logps/rejected": -4.065532207489014, "loss": 0.0764, "odds_ratio_loss": 0.006054166704416275, "rewards/accuracies": 1.0, "rewards/chosen": -0.006836562883108854, "rewards/margins": 0.39971664547920227, "rewards/rejected": -0.40655317902565, "sft_loss": 0.06836562603712082, "step": 3112 }, { "epoch": 4.501807664497469, "grad_norm": 1.7318462854388412, "learning_rate": 1.2191827880516804e-06, "logits/chosen": -0.8255729675292969, "logits/rejected": -0.6984921097755432, "logps/chosen": -0.045626379549503326, "logps/rejected": -3.896320343017578, "loss": 0.0746, "odds_ratio_loss": 0.008824576623737812, "rewards/accuracies": 1.0, "rewards/chosen": -0.004562637768685818, "rewards/margins": 0.3850694000720978, "rewards/rejected": -0.3896320164203644, "sft_loss": 0.045626379549503326, "step": 3113 }, { "epoch": 4.503253796095445, "grad_norm": 1.553585178335543, "learning_rate": 1.216951085655939e-06, "logits/chosen": -0.8265068531036377, "logits/rejected": -0.6123427152633667, "logps/chosen": -0.0894949659705162, "logps/rejected": -4.58646297454834, "loss": 0.0653, "odds_ratio_loss": 0.009005776606500149, "rewards/accuracies": 1.0, "rewards/chosen": -0.008949496783316135, "rewards/margins": 0.449696809053421, "rewards/rejected": -0.458646297454834, "sft_loss": 0.0894949659705162, "step": 3114 }, { "epoch": 4.50469992769342, "grad_norm": 1.6527698553956316, "learning_rate": 1.214721061170752e-06, "logits/chosen": -0.9693940877914429, "logits/rejected": -0.6829290390014648, "logps/chosen": -0.10791851580142975, "logps/rejected": -4.9538068771362305, "loss": 0.0856, "odds_ratio_loss": 0.005190132651478052, "rewards/accuracies": 1.0, "rewards/chosen": -0.010791851207613945, "rewards/margins": 0.4845888018608093, "rewards/rejected": -0.4953806698322296, "sft_loss": 0.10791851580142975, "step": 3115 }, { "epoch": 4.506146059291396, "grad_norm": 1.6640371254845585, "learning_rate": 1.2124927159406108e-06, "logits/chosen": -0.7707617878913879, "logits/rejected": -0.6420580148696899, "logps/chosen": -0.05679768696427345, "logps/rejected": -3.126427173614502, "loss": 0.0911, "odds_ratio_loss": 0.005179098807275295, "rewards/accuracies": 1.0, "rewards/chosen": -0.00567976851016283, "rewards/margins": 0.30696290731430054, "rewards/rejected": -0.3126426935195923, "sft_loss": 0.05679768696427345, "step": 3116 }, { "epoch": 4.507592190889371, "grad_norm": 2.0472468137344806, "learning_rate": 1.210266051308994e-06, "logits/chosen": -0.8093677759170532, "logits/rejected": -0.6810101270675659, "logps/chosen": -0.06034237891435623, "logps/rejected": -4.68641471862793, "loss": 0.0849, "odds_ratio_loss": 0.009722420014441013, "rewards/accuracies": 1.0, "rewards/chosen": -0.006034237798303366, "rewards/margins": 0.4626072943210602, "rewards/rejected": -0.4686414897441864, "sft_loss": 0.06034237891435623, "step": 3117 }, { "epoch": 4.5090383224873465, "grad_norm": 1.5325845079517588, "learning_rate": 1.208041068618364e-06, "logits/chosen": -0.7304118871688843, "logits/rejected": -0.6360498070716858, "logps/chosen": -0.057711161673069, "logps/rejected": -4.730152130126953, "loss": 0.0471, "odds_ratio_loss": 0.006703021004796028, "rewards/accuracies": 1.0, "rewards/chosen": -0.005771116353571415, "rewards/margins": 0.46724411845207214, "rewards/rejected": -0.4730152487754822, "sft_loss": 0.057711161673069, "step": 3118 }, { "epoch": 4.510484454085322, "grad_norm": 1.6384690787658085, "learning_rate": 1.205817769210173e-06, "logits/chosen": -0.596650242805481, "logits/rejected": -0.5212622284889221, "logps/chosen": -0.10295553505420685, "logps/rejected": -5.2416276931762695, "loss": 0.0868, "odds_ratio_loss": 0.01541278325021267, "rewards/accuracies": 1.0, "rewards/chosen": -0.010295553132891655, "rewards/margins": 0.5138672590255737, "rewards/rejected": -0.524162769317627, "sft_loss": 0.10295553505420685, "step": 3119 }, { "epoch": 4.511930585683297, "grad_norm": 3.2031844268630087, "learning_rate": 1.2035961544248557e-06, "logits/chosen": -0.8341730833053589, "logits/rejected": -0.5385291576385498, "logps/chosen": -0.05033033713698387, "logps/rejected": -5.8386383056640625, "loss": 0.1294, "odds_ratio_loss": 0.005003097467124462, "rewards/accuracies": 1.0, "rewards/chosen": -0.005033033899962902, "rewards/margins": 0.5788308382034302, "rewards/rejected": -0.5838638544082642, "sft_loss": 0.05033033713698387, "step": 3120 }, { "epoch": 4.513376717281273, "grad_norm": 1.8825190631616322, "learning_rate": 1.2013762256018316e-06, "logits/chosen": -0.5786645412445068, "logits/rejected": -0.44824934005737305, "logps/chosen": -0.08105679601430893, "logps/rejected": -6.404214859008789, "loss": 0.0781, "odds_ratio_loss": 0.009483350440859795, "rewards/accuracies": 1.0, "rewards/chosen": -0.008105679415166378, "rewards/margins": 0.6323158144950867, "rewards/rejected": -0.6404215097427368, "sft_loss": 0.08105679601430893, "step": 3121 }, { "epoch": 4.514822848879248, "grad_norm": 1.6008860726040326, "learning_rate": 1.1991579840795037e-06, "logits/chosen": -0.6722384691238403, "logits/rejected": -0.48127275705337524, "logps/chosen": -0.06287876516580582, "logps/rejected": -6.787384986877441, "loss": 0.0912, "odds_ratio_loss": 0.007575103081762791, "rewards/accuracies": 1.0, "rewards/chosen": -0.006287876050919294, "rewards/margins": 0.6724505424499512, "rewards/rejected": -0.6787384152412415, "sft_loss": 0.06287876516580582, "step": 3122 }, { "epoch": 4.516268980477223, "grad_norm": 1.342589231387517, "learning_rate": 1.1969414311952593e-06, "logits/chosen": -0.8379350304603577, "logits/rejected": -0.6947819590568542, "logps/chosen": -0.034005191177129745, "logps/rejected": -5.846619606018066, "loss": 0.0581, "odds_ratio_loss": 0.0031600147485733032, "rewards/accuracies": 1.0, "rewards/chosen": -0.0034005185589194298, "rewards/margins": 0.581261396408081, "rewards/rejected": -0.5846619606018066, "sft_loss": 0.034005191177129745, "step": 3123 }, { "epoch": 4.517715112075199, "grad_norm": 1.5147641999518209, "learning_rate": 1.1947265682854645e-06, "logits/chosen": -0.7644755840301514, "logits/rejected": -0.6600558161735535, "logps/chosen": -0.11227733641862869, "logps/rejected": -5.182661056518555, "loss": 0.0626, "odds_ratio_loss": 0.019188618287444115, "rewards/accuracies": 1.0, "rewards/chosen": -0.011227734386920929, "rewards/margins": 0.5070383548736572, "rewards/rejected": -0.5182661414146423, "sft_loss": 0.11227733641862869, "step": 3124 }, { "epoch": 4.5191612436731745, "grad_norm": 1.5468387885414232, "learning_rate": 1.192513396685471e-06, "logits/chosen": -0.7222833037376404, "logits/rejected": -0.5326530933380127, "logps/chosen": -0.02052503637969494, "logps/rejected": -5.6258955001831055, "loss": 0.0875, "odds_ratio_loss": 0.002066811081022024, "rewards/accuracies": 1.0, "rewards/chosen": -0.002052503637969494, "rewards/margins": 0.5605370402336121, "rewards/rejected": -0.5625895261764526, "sft_loss": 0.02052503637969494, "step": 3125 }, { "epoch": 4.520607375271149, "grad_norm": 1.665196211024822, "learning_rate": 1.190301917729606e-06, "logits/chosen": -0.7487690448760986, "logits/rejected": -0.7268781661987305, "logps/chosen": -0.09351005405187607, "logps/rejected": -5.104481220245361, "loss": 0.1071, "odds_ratio_loss": 0.0030038892291486263, "rewards/accuracies": 1.0, "rewards/chosen": -0.009351005777716637, "rewards/margins": 0.5010970830917358, "rewards/rejected": -0.5104480981826782, "sft_loss": 0.09351005405187607, "step": 3126 }, { "epoch": 4.522053506869125, "grad_norm": 1.8089219923251223, "learning_rate": 1.1880921327511799e-06, "logits/chosen": -0.7335822582244873, "logits/rejected": -0.635443389415741, "logps/chosen": -0.06432907283306122, "logps/rejected": -3.4109225273132324, "loss": 0.087, "odds_ratio_loss": 0.005904484074562788, "rewards/accuracies": 1.0, "rewards/chosen": -0.006432907655835152, "rewards/margins": 0.3346593677997589, "rewards/rejected": -0.3410922884941101, "sft_loss": 0.06432907283306122, "step": 3127 }, { "epoch": 4.523499638467101, "grad_norm": 1.444253112444959, "learning_rate": 1.1858840430824798e-06, "logits/chosen": -0.8454017639160156, "logits/rejected": -0.6811554431915283, "logps/chosen": -0.045448124408721924, "logps/rejected": -4.554635047912598, "loss": 0.0643, "odds_ratio_loss": 0.005859294906258583, "rewards/accuracies": 1.0, "rewards/chosen": -0.004544812720268965, "rewards/margins": 0.45091870427131653, "rewards/rejected": -0.4554634988307953, "sft_loss": 0.045448124408721924, "step": 3128 }, { "epoch": 4.5249457700650755, "grad_norm": 1.4456132302971754, "learning_rate": 1.1836776500547698e-06, "logits/chosen": -0.804023802280426, "logits/rejected": -0.6073635816574097, "logps/chosen": -0.04560978338122368, "logps/rejected": -3.941932201385498, "loss": 0.0558, "odds_ratio_loss": 0.006999202072620392, "rewards/accuracies": 1.0, "rewards/chosen": -0.004560978151857853, "rewards/margins": 0.3896322250366211, "rewards/rejected": -0.394193172454834, "sft_loss": 0.04560978338122368, "step": 3129 }, { "epoch": 4.526391901663051, "grad_norm": 1.7544043039159367, "learning_rate": 1.181472954998295e-06, "logits/chosen": -0.7125527858734131, "logits/rejected": -0.5077294707298279, "logps/chosen": -0.1320764422416687, "logps/rejected": -4.950249671936035, "loss": 0.0929, "odds_ratio_loss": 0.011006537824869156, "rewards/accuracies": 1.0, "rewards/chosen": -0.013207645155489445, "rewards/margins": 0.4818173050880432, "rewards/rejected": -0.49502497911453247, "sft_loss": 0.1320764422416687, "step": 3130 }, { "epoch": 4.527838033261027, "grad_norm": 1.4651207792497591, "learning_rate": 1.1792699592422714e-06, "logits/chosen": -0.6505829691886902, "logits/rejected": -0.49591994285583496, "logps/chosen": -0.06142151355743408, "logps/rejected": -4.931596755981445, "loss": 0.0811, "odds_ratio_loss": 0.008902262896299362, "rewards/accuracies": 1.0, "rewards/chosen": -0.006142151076346636, "rewards/margins": 0.48701754212379456, "rewards/rejected": -0.493159681558609, "sft_loss": 0.06142151355743408, "step": 3131 }, { "epoch": 4.5292841648590025, "grad_norm": 1.37934565512855, "learning_rate": 1.1770686641148966e-06, "logits/chosen": -0.9451402425765991, "logits/rejected": -0.652174174785614, "logps/chosen": -0.042112577706575394, "logps/rejected": -4.959356307983398, "loss": 0.0686, "odds_ratio_loss": 0.00329545047134161, "rewards/accuracies": 1.0, "rewards/chosen": -0.004211257677525282, "rewards/margins": 0.49172443151474, "rewards/rejected": -0.49593567848205566, "sft_loss": 0.042112577706575394, "step": 3132 }, { "epoch": 4.530730296456977, "grad_norm": 1.8806322433014044, "learning_rate": 1.1748690709433361e-06, "logits/chosen": -0.6774564981460571, "logits/rejected": -0.47243747115135193, "logps/chosen": -0.038698915392160416, "logps/rejected": -5.015310764312744, "loss": 0.0835, "odds_ratio_loss": 0.003635758301243186, "rewards/accuracies": 1.0, "rewards/chosen": -0.0038698913995176554, "rewards/margins": 0.49766120314598083, "rewards/rejected": -0.5015311241149902, "sft_loss": 0.038698915392160416, "step": 3133 }, { "epoch": 4.532176428054953, "grad_norm": 1.9020404331515046, "learning_rate": 1.1726711810537366e-06, "logits/chosen": -0.700002133846283, "logits/rejected": -0.5288843512535095, "logps/chosen": -0.15027688443660736, "logps/rejected": -4.056690216064453, "loss": 0.0959, "odds_ratio_loss": 0.005189661867916584, "rewards/accuracies": 1.0, "rewards/chosen": -0.015027688816189766, "rewards/margins": 0.39064133167266846, "rewards/rejected": -0.40566903352737427, "sft_loss": 0.15027688443660736, "step": 3134 }, { "epoch": 4.533622559652929, "grad_norm": 1.4043050533639423, "learning_rate": 1.1704749957712117e-06, "logits/chosen": -0.8214952945709229, "logits/rejected": -0.5681087374687195, "logps/chosen": -0.04724892973899841, "logps/rejected": -4.8302321434021, "loss": 0.0588, "odds_ratio_loss": 0.007608736399561167, "rewards/accuracies": 1.0, "rewards/chosen": -0.004724893253296614, "rewards/margins": 0.4782983064651489, "rewards/rejected": -0.4830232262611389, "sft_loss": 0.04724892973899841, "step": 3135 }, { "epoch": 4.5350686912509035, "grad_norm": 1.6404462461922473, "learning_rate": 1.1682805164198502e-06, "logits/chosen": -0.6733072996139526, "logits/rejected": -0.5581374764442444, "logps/chosen": -0.04132752865552902, "logps/rejected": -4.327413082122803, "loss": 0.0587, "odds_ratio_loss": 0.004820593632757664, "rewards/accuracies": 1.0, "rewards/chosen": -0.004132753238081932, "rewards/margins": 0.4286085367202759, "rewards/rejected": -0.43274131417274475, "sft_loss": 0.04132752865552902, "step": 3136 }, { "epoch": 4.536514822848879, "grad_norm": 1.8346210650122372, "learning_rate": 1.1660877443227106e-06, "logits/chosen": -0.8485831022262573, "logits/rejected": -0.7402029037475586, "logps/chosen": -0.10031185299158096, "logps/rejected": -3.5375704765319824, "loss": 0.1088, "odds_ratio_loss": 0.013819573447108269, "rewards/accuracies": 1.0, "rewards/chosen": -0.010031186044216156, "rewards/margins": 0.3437258303165436, "rewards/rejected": -0.35375702381134033, "sft_loss": 0.10031185299158096, "step": 3137 }, { "epoch": 4.537960954446855, "grad_norm": 1.4162384065240734, "learning_rate": 1.1638966808018258e-06, "logits/chosen": -0.6201837062835693, "logits/rejected": -0.5881571769714355, "logps/chosen": -0.05904054269194603, "logps/rejected": -2.439631938934326, "loss": 0.077, "odds_ratio_loss": 0.009300749748945236, "rewards/accuracies": 1.0, "rewards/chosen": -0.00590405473485589, "rewards/margins": 0.2380591332912445, "rewards/rejected": -0.24396318197250366, "sft_loss": 0.05904054269194603, "step": 3138 }, { "epoch": 4.539407086044831, "grad_norm": 1.4561234361716575, "learning_rate": 1.1617073271781937e-06, "logits/chosen": -0.8747523427009583, "logits/rejected": -0.5830885767936707, "logps/chosen": -0.06753970682621002, "logps/rejected": -3.148228168487549, "loss": 0.0605, "odds_ratio_loss": 0.00502714142203331, "rewards/accuracies": 1.0, "rewards/chosen": -0.006753971334546804, "rewards/margins": 0.3080688416957855, "rewards/rejected": -0.314822793006897, "sft_loss": 0.06753970682621002, "step": 3139 }, { "epoch": 4.540853217642805, "grad_norm": 1.4928781049529505, "learning_rate": 1.1595196847717858e-06, "logits/chosen": -0.537721574306488, "logits/rejected": -0.4988245964050293, "logps/chosen": -0.07080557942390442, "logps/rejected": -4.220191955566406, "loss": 0.085, "odds_ratio_loss": 0.013362744823098183, "rewards/accuracies": 1.0, "rewards/chosen": -0.007080557756125927, "rewards/margins": 0.41493868827819824, "rewards/rejected": -0.42201921343803406, "sft_loss": 0.07080557942390442, "step": 3140 }, { "epoch": 4.542299349240781, "grad_norm": 1.717320927515359, "learning_rate": 1.1573337549015384e-06, "logits/chosen": -0.7508547306060791, "logits/rejected": -0.548348069190979, "logps/chosen": -0.03961345553398132, "logps/rejected": -4.493158340454102, "loss": 0.0565, "odds_ratio_loss": 0.003105518640950322, "rewards/accuracies": 1.0, "rewards/chosen": -0.00396134564653039, "rewards/margins": 0.44535452127456665, "rewards/rejected": -0.4493158459663391, "sft_loss": 0.03961345553398132, "step": 3141 }, { "epoch": 4.543745480838757, "grad_norm": 1.9170535110296663, "learning_rate": 1.1551495388853583e-06, "logits/chosen": -0.6952974796295166, "logits/rejected": -0.6106499433517456, "logps/chosen": -0.12642665207386017, "logps/rejected": -5.010893821716309, "loss": 0.1007, "odds_ratio_loss": 0.012112841010093689, "rewards/accuracies": 1.0, "rewards/chosen": -0.012642664834856987, "rewards/margins": 0.4884466528892517, "rewards/rejected": -0.501089334487915, "sft_loss": 0.12642665207386017, "step": 3142 }, { "epoch": 4.5451916124367315, "grad_norm": 1.7576434765191336, "learning_rate": 1.1529670380401166e-06, "logits/chosen": -0.7798545360565186, "logits/rejected": -0.6895734667778015, "logps/chosen": -0.10837902128696442, "logps/rejected": -3.75516414642334, "loss": 0.0832, "odds_ratio_loss": 0.010169276036322117, "rewards/accuracies": 1.0, "rewards/chosen": -0.010837902314960957, "rewards/margins": 0.3646785318851471, "rewards/rejected": -0.375516414642334, "sft_loss": 0.10837902128696442, "step": 3143 }, { "epoch": 4.546637744034707, "grad_norm": 1.5294315441059878, "learning_rate": 1.15078625368165e-06, "logits/chosen": -0.8288568258285522, "logits/rejected": -0.5528225898742676, "logps/chosen": -0.08221597224473953, "logps/rejected": -4.284519195556641, "loss": 0.0672, "odds_ratio_loss": 0.0071078077889978886, "rewards/accuracies": 1.0, "rewards/chosen": -0.008221596479415894, "rewards/margins": 0.42023032903671265, "rewards/rejected": -0.4284519553184509, "sft_loss": 0.08221597224473953, "step": 3144 }, { "epoch": 4.548083875632683, "grad_norm": 1.6890116765857413, "learning_rate": 1.1486071871247637e-06, "logits/chosen": -0.8703626990318298, "logits/rejected": -0.65482097864151, "logps/chosen": -0.05179030820727348, "logps/rejected": -4.451789855957031, "loss": 0.0888, "odds_ratio_loss": 0.004219289869070053, "rewards/accuracies": 1.0, "rewards/chosen": -0.005179030820727348, "rewards/margins": 0.43999993801116943, "rewards/rejected": -0.4451789855957031, "sft_loss": 0.05179030820727348, "step": 3145 }, { "epoch": 4.549530007230658, "grad_norm": 1.7118473076649063, "learning_rate": 1.1464298396832232e-06, "logits/chosen": -0.746877133846283, "logits/rejected": -0.5233524441719055, "logps/chosen": -0.061596743762493134, "logps/rejected": -3.4973483085632324, "loss": 0.0606, "odds_ratio_loss": 0.0044494750909507275, "rewards/accuracies": 1.0, "rewards/chosen": -0.006159674376249313, "rewards/margins": 0.3435751795768738, "rewards/rejected": -0.3497348725795746, "sft_loss": 0.061596743762493134, "step": 3146 }, { "epoch": 4.550976138828633, "grad_norm": 1.6246389022279693, "learning_rate": 1.144254212669761e-06, "logits/chosen": -0.6675397753715515, "logits/rejected": -0.5680756568908691, "logps/chosen": -0.12102605402469635, "logps/rejected": -4.196258544921875, "loss": 0.089, "odds_ratio_loss": 0.015979530289769173, "rewards/accuracies": 1.0, "rewards/chosen": -0.012102605774998665, "rewards/margins": 0.4075232446193695, "rewards/rejected": -0.41962581872940063, "sft_loss": 0.12102605402469635, "step": 3147 }, { "epoch": 4.552422270426609, "grad_norm": 1.4363813660078086, "learning_rate": 1.142080307396069e-06, "logits/chosen": -0.9800047874450684, "logits/rejected": -0.6176702976226807, "logps/chosen": -0.056100714951753616, "logps/rejected": -4.86061954498291, "loss": 0.0655, "odds_ratio_loss": 0.007833914831280708, "rewards/accuracies": 1.0, "rewards/chosen": -0.005610071122646332, "rewards/margins": 0.4804518520832062, "rewards/rejected": -0.4860619306564331, "sft_loss": 0.056100714951753616, "step": 3148 }, { "epoch": 4.553868402024584, "grad_norm": 1.7603116527742264, "learning_rate": 1.1399081251728047e-06, "logits/chosen": -0.7957381010055542, "logits/rejected": -0.6939653754234314, "logps/chosen": -0.07760776579380035, "logps/rejected": -4.140201568603516, "loss": 0.0792, "odds_ratio_loss": 0.015399527736008167, "rewards/accuracies": 1.0, "rewards/chosen": -0.0077607762068510056, "rewards/margins": 0.4062593877315521, "rewards/rejected": -0.4140201508998871, "sft_loss": 0.07760776579380035, "step": 3149 }, { "epoch": 4.55531453362256, "grad_norm": 1.7762852522056913, "learning_rate": 1.1377376673095836e-06, "logits/chosen": -0.7090476751327515, "logits/rejected": -0.6018462777137756, "logps/chosen": -0.06537199765443802, "logps/rejected": -5.667291641235352, "loss": 0.085, "odds_ratio_loss": 0.010719805024564266, "rewards/accuracies": 1.0, "rewards/chosen": -0.006537200417369604, "rewards/margins": 0.5601919889450073, "rewards/rejected": -0.5667291879653931, "sft_loss": 0.06537199765443802, "step": 3150 }, { "epoch": 4.556760665220535, "grad_norm": 1.7313737183629967, "learning_rate": 1.1355689351149837e-06, "logits/chosen": -0.7923053503036499, "logits/rejected": -0.6517032384872437, "logps/chosen": -0.08571956306695938, "logps/rejected": -5.058340549468994, "loss": 0.0809, "odds_ratio_loss": 0.005782403517514467, "rewards/accuracies": 1.0, "rewards/chosen": -0.008571955375373363, "rewards/margins": 0.4972621202468872, "rewards/rejected": -0.5058341026306152, "sft_loss": 0.08571956306695938, "step": 3151 }, { "epoch": 4.55820679681851, "grad_norm": 1.4263625628365386, "learning_rate": 1.1334019298965394e-06, "logits/chosen": -0.6988058090209961, "logits/rejected": -0.5728355050086975, "logps/chosen": -0.03624889254570007, "logps/rejected": -3.573759078979492, "loss": 0.064, "odds_ratio_loss": 0.0032424121163785458, "rewards/accuracies": 1.0, "rewards/chosen": -0.003624889301136136, "rewards/margins": 0.3537510633468628, "rewards/rejected": -0.3573759198188782, "sft_loss": 0.03624889254570007, "step": 3152 }, { "epoch": 4.559652928416486, "grad_norm": 1.6847747000268567, "learning_rate": 1.1312366529607493e-06, "logits/chosen": -1.2507294416427612, "logits/rejected": -0.6984279751777649, "logps/chosen": -0.07849462330341339, "logps/rejected": -4.19482421875, "loss": 0.0845, "odds_ratio_loss": 0.00989564135670662, "rewards/accuracies": 1.0, "rewards/chosen": -0.007849462330341339, "rewards/margins": 0.4116330146789551, "rewards/rejected": -0.4194824695587158, "sft_loss": 0.07849462330341339, "step": 3153 }, { "epoch": 4.561099060014461, "grad_norm": 1.4166334580337705, "learning_rate": 1.1290731056130645e-06, "logits/chosen": -1.1231694221496582, "logits/rejected": -0.7405078411102295, "logps/chosen": -0.09685708582401276, "logps/rejected": -4.7172393798828125, "loss": 0.074, "odds_ratio_loss": 0.010411504656076431, "rewards/accuracies": 1.0, "rewards/chosen": -0.00968570914119482, "rewards/margins": 0.4620382487773895, "rewards/rejected": -0.4717239737510681, "sft_loss": 0.09685708582401276, "step": 3154 }, { "epoch": 4.562545191612437, "grad_norm": 1.472571547360005, "learning_rate": 1.1269112891578964e-06, "logits/chosen": -0.8097749352455139, "logits/rejected": -0.6882657408714294, "logps/chosen": -0.09382905811071396, "logps/rejected": -3.311452627182007, "loss": 0.0875, "odds_ratio_loss": 0.010742062702775002, "rewards/accuracies": 1.0, "rewards/chosen": -0.009382905438542366, "rewards/margins": 0.321762353181839, "rewards/rejected": -0.3311452567577362, "sft_loss": 0.09382905811071396, "step": 3155 }, { "epoch": 4.563991323210412, "grad_norm": 2.522302408631471, "learning_rate": 1.124751204898614e-06, "logits/chosen": -1.0051918029785156, "logits/rejected": -0.6038036346435547, "logps/chosen": -0.0715935081243515, "logps/rejected": -5.134657382965088, "loss": 0.0943, "odds_ratio_loss": 0.004194090608507395, "rewards/accuracies": 1.0, "rewards/chosen": -0.007159349974244833, "rewards/margins": 0.5063064098358154, "rewards/rejected": -0.5134657621383667, "sft_loss": 0.0715935081243515, "step": 3156 }, { "epoch": 4.565437454808388, "grad_norm": 1.3968773246812638, "learning_rate": 1.1225928541375376e-06, "logits/chosen": -0.8158324956893921, "logits/rejected": -0.6068318486213684, "logps/chosen": -0.07950488477945328, "logps/rejected": -9.155324935913086, "loss": 0.0772, "odds_ratio_loss": 0.005621365271508694, "rewards/accuracies": 1.0, "rewards/chosen": -0.007950487546622753, "rewards/margins": 0.9075820446014404, "rewards/rejected": -0.9155324697494507, "sft_loss": 0.07950488477945328, "step": 3157 }, { "epoch": 4.566883586406363, "grad_norm": 1.6581570448756777, "learning_rate": 1.1204362381759485e-06, "logits/chosen": -1.0890898704528809, "logits/rejected": -0.6594418287277222, "logps/chosen": -0.07467133551836014, "logps/rejected": -5.057663917541504, "loss": 0.0913, "odds_ratio_loss": 0.001974435057491064, "rewards/accuracies": 1.0, "rewards/chosen": -0.007467133458703756, "rewards/margins": 0.4982993006706238, "rewards/rejected": -0.5057663917541504, "sft_loss": 0.07467133551836014, "step": 3158 }, { "epoch": 4.568329718004338, "grad_norm": 2.052336604338936, "learning_rate": 1.1182813583140736e-06, "logits/chosen": -0.9036225080490112, "logits/rejected": -0.690902054309845, "logps/chosen": -0.10851224511861801, "logps/rejected": -6.375421047210693, "loss": 0.1284, "odds_ratio_loss": 0.010672297328710556, "rewards/accuracies": 1.0, "rewards/chosen": -0.010851224884390831, "rewards/margins": 0.6266908645629883, "rewards/rejected": -0.6375421285629272, "sft_loss": 0.10851224511861801, "step": 3159 }, { "epoch": 4.569775849602314, "grad_norm": 1.3507760008718734, "learning_rate": 1.1161282158511016e-06, "logits/chosen": -0.692068874835968, "logits/rejected": -0.607538104057312, "logps/chosen": -0.05230974406003952, "logps/rejected": -4.683342456817627, "loss": 0.0577, "odds_ratio_loss": 0.0022822008468210697, "rewards/accuracies": 1.0, "rewards/chosen": -0.005230974406003952, "rewards/margins": 0.4631032943725586, "rewards/rejected": -0.46833428740501404, "sft_loss": 0.05230974406003952, "step": 3160 }, { "epoch": 4.571221981200289, "grad_norm": 1.3867036048678973, "learning_rate": 1.1139768120851677e-06, "logits/chosen": -0.8184716701507568, "logits/rejected": -0.6516551971435547, "logps/chosen": -0.04152075573801994, "logps/rejected": -6.1955413818359375, "loss": 0.0513, "odds_ratio_loss": 0.0025585112161934376, "rewards/accuracies": 1.0, "rewards/chosen": -0.004152075387537479, "rewards/margins": 0.6154020428657532, "rewards/rejected": -0.6195541024208069, "sft_loss": 0.04152075573801994, "step": 3161 }, { "epoch": 4.572668112798265, "grad_norm": 1.5935727445354109, "learning_rate": 1.1118271483133638e-06, "logits/chosen": -0.6832992434501648, "logits/rejected": -0.6606971621513367, "logps/chosen": -0.031898848712444305, "logps/rejected": -4.598211288452148, "loss": 0.0939, "odds_ratio_loss": 0.0026514395140111446, "rewards/accuracies": 1.0, "rewards/chosen": -0.0031898850575089455, "rewards/margins": 0.45663124322891235, "rewards/rejected": -0.4598211646080017, "sft_loss": 0.031898848712444305, "step": 3162 }, { "epoch": 4.57411424439624, "grad_norm": 1.6348180305027182, "learning_rate": 1.1096792258317273e-06, "logits/chosen": -0.8901809453964233, "logits/rejected": -0.6974164247512817, "logps/chosen": -0.07738389074802399, "logps/rejected": -4.344188690185547, "loss": 0.1015, "odds_ratio_loss": 0.00792864803224802, "rewards/accuracies": 1.0, "rewards/chosen": -0.007738389540463686, "rewards/margins": 0.42668044567108154, "rewards/rejected": -0.43441885709762573, "sft_loss": 0.07738389074802399, "step": 3163 }, { "epoch": 4.575560375994216, "grad_norm": 1.452333306176182, "learning_rate": 1.1075330459352517e-06, "logits/chosen": -0.8866585493087769, "logits/rejected": -0.6407424211502075, "logps/chosen": -0.0721864402294159, "logps/rejected": -4.096374988555908, "loss": 0.0696, "odds_ratio_loss": 0.010981040075421333, "rewards/accuracies": 1.0, "rewards/chosen": -0.007218644022941589, "rewards/margins": 0.402418851852417, "rewards/rejected": -0.4096375107765198, "sft_loss": 0.0721864402294159, "step": 3164 }, { "epoch": 4.577006507592191, "grad_norm": 1.6162076523042697, "learning_rate": 1.1053886099178745e-06, "logits/chosen": -0.7911291718482971, "logits/rejected": -0.6464878916740417, "logps/chosen": -0.10432671755552292, "logps/rejected": -3.5140199661254883, "loss": 0.0838, "odds_ratio_loss": 0.008694362826645374, "rewards/accuracies": 1.0, "rewards/chosen": -0.010432671755552292, "rewards/margins": 0.3409693241119385, "rewards/rejected": -0.35140201449394226, "sft_loss": 0.10432671755552292, "step": 3165 }, { "epoch": 4.578452639190166, "grad_norm": 1.8954756696938493, "learning_rate": 1.1032459190724858e-06, "logits/chosen": -0.8337982892990112, "logits/rejected": -0.7270399332046509, "logps/chosen": -0.11213445663452148, "logps/rejected": -4.699167251586914, "loss": 0.1142, "odds_ratio_loss": 0.007468009367585182, "rewards/accuracies": 1.0, "rewards/chosen": -0.011213446035981178, "rewards/margins": 0.45870327949523926, "rewards/rejected": -0.46991676092147827, "sft_loss": 0.11213445663452148, "step": 3166 }, { "epoch": 4.579898770788142, "grad_norm": 1.972473660722877, "learning_rate": 1.1011049746909216e-06, "logits/chosen": -0.9359748363494873, "logits/rejected": -0.7544848918914795, "logps/chosen": -0.1299418807029724, "logps/rejected": -4.122316360473633, "loss": 0.085, "odds_ratio_loss": 0.009792567230761051, "rewards/accuracies": 1.0, "rewards/chosen": -0.012994188815355301, "rewards/margins": 0.3992374539375305, "rewards/rejected": -0.4122316241264343, "sft_loss": 0.1299418807029724, "step": 3167 }, { "epoch": 4.5813449023861175, "grad_norm": 2.1773388410438055, "learning_rate": 1.0989657780639632e-06, "logits/chosen": -0.8588944673538208, "logits/rejected": -0.5516154766082764, "logps/chosen": -0.08692246675491333, "logps/rejected": -6.501953125, "loss": 0.0723, "odds_ratio_loss": 0.009055457077920437, "rewards/accuracies": 1.0, "rewards/chosen": -0.008692245930433273, "rewards/margins": 0.641503095626831, "rewards/rejected": -0.6501953601837158, "sft_loss": 0.08692246675491333, "step": 3168 }, { "epoch": 4.582791033984092, "grad_norm": 1.71320771588054, "learning_rate": 1.0968283304813435e-06, "logits/chosen": -0.7051694989204407, "logits/rejected": -0.5732921361923218, "logps/chosen": -0.054236263036727905, "logps/rejected": -5.548681735992432, "loss": 0.1064, "odds_ratio_loss": 0.002435041591525078, "rewards/accuracies": 1.0, "rewards/chosen": -0.005423626862466335, "rewards/margins": 0.5494445562362671, "rewards/rejected": -0.5548681616783142, "sft_loss": 0.054236263036727905, "step": 3169 }, { "epoch": 4.584237165582068, "grad_norm": 1.8240270589071466, "learning_rate": 1.0946926332317344e-06, "logits/chosen": -0.7067397832870483, "logits/rejected": -0.8004418015480042, "logps/chosen": -0.10146773606538773, "logps/rejected": -3.536372184753418, "loss": 0.1123, "odds_ratio_loss": 0.024081986397504807, "rewards/accuracies": 1.0, "rewards/chosen": -0.010146773420274258, "rewards/margins": 0.34349045157432556, "rewards/rejected": -0.3536372184753418, "sft_loss": 0.10146773606538773, "step": 3170 }, { "epoch": 4.585683297180044, "grad_norm": 2.3802481561976525, "learning_rate": 1.092558687602758e-06, "logits/chosen": -0.8300359845161438, "logits/rejected": -0.5264511704444885, "logps/chosen": -0.13310036063194275, "logps/rejected": -5.091741561889648, "loss": 0.1434, "odds_ratio_loss": 0.0062806615605950356, "rewards/accuracies": 1.0, "rewards/chosen": -0.01331003662198782, "rewards/margins": 0.4958640933036804, "rewards/rejected": -0.5091741681098938, "sft_loss": 0.13310036063194275, "step": 3171 }, { "epoch": 4.587129428778018, "grad_norm": 1.4752608857026752, "learning_rate": 1.0904264948809769e-06, "logits/chosen": -0.8397960662841797, "logits/rejected": -0.6188733577728271, "logps/chosen": -0.056685350835323334, "logps/rejected": -3.9327101707458496, "loss": 0.0549, "odds_ratio_loss": 0.009330632165074348, "rewards/accuracies": 1.0, "rewards/chosen": -0.005668535362929106, "rewards/margins": 0.3876025080680847, "rewards/rejected": -0.3932710289955139, "sft_loss": 0.056685350835323334, "step": 3172 }, { "epoch": 4.588575560375994, "grad_norm": 1.4437816871536389, "learning_rate": 1.0882960563518993e-06, "logits/chosen": -0.6703709959983826, "logits/rejected": -0.5640783905982971, "logps/chosen": -0.03526798635721207, "logps/rejected": -4.020233154296875, "loss": 0.0908, "odds_ratio_loss": 0.004295479506254196, "rewards/accuracies": 1.0, "rewards/chosen": -0.003526798915117979, "rewards/margins": 0.3984965980052948, "rewards/rejected": -0.4020233452320099, "sft_loss": 0.03526798635721207, "step": 3173 }, { "epoch": 4.59002169197397, "grad_norm": 1.50024639564137, "learning_rate": 1.0861673732999737e-06, "logits/chosen": -0.6850433349609375, "logits/rejected": -0.5941628217697144, "logps/chosen": -0.17146408557891846, "logps/rejected": -4.675539970397949, "loss": 0.0941, "odds_ratio_loss": 0.01795043796300888, "rewards/accuracies": 1.0, "rewards/chosen": -0.017146408557891846, "rewards/margins": 0.45040759444236755, "rewards/rejected": -0.4675540030002594, "sft_loss": 0.17146408557891846, "step": 3174 }, { "epoch": 4.591467823571945, "grad_norm": 1.757910834910301, "learning_rate": 1.0840404470085908e-06, "logits/chosen": -0.8096254467964172, "logits/rejected": -0.5914109945297241, "logps/chosen": -0.06457695364952087, "logps/rejected": -3.9802322387695312, "loss": 0.149, "odds_ratio_loss": 0.009919540025293827, "rewards/accuracies": 1.0, "rewards/chosen": -0.006457695737481117, "rewards/margins": 0.3915655314922333, "rewards/rejected": -0.39802321791648865, "sft_loss": 0.06457695364952087, "step": 3175 }, { "epoch": 4.59291395516992, "grad_norm": 3.319954432745719, "learning_rate": 1.0819152787600815e-06, "logits/chosen": -0.8060805201530457, "logits/rejected": -0.4811224341392517, "logps/chosen": -0.12582726776599884, "logps/rejected": -4.696778297424316, "loss": 0.0859, "odds_ratio_loss": 0.0025469374377280474, "rewards/accuracies": 1.0, "rewards/chosen": -0.012582727707922459, "rewards/margins": 0.45709505677223206, "rewards/rejected": -0.46967780590057373, "sft_loss": 0.12582726776599884, "step": 3176 }, { "epoch": 4.594360086767896, "grad_norm": 1.4319642017221001, "learning_rate": 1.07979186983572e-06, "logits/chosen": -0.7860830426216125, "logits/rejected": -0.6465473771095276, "logps/chosen": -0.08816933631896973, "logps/rejected": -3.463515281677246, "loss": 0.0742, "odds_ratio_loss": 0.005800843238830566, "rewards/accuracies": 1.0, "rewards/chosen": -0.008816934190690517, "rewards/margins": 0.3375346064567566, "rewards/rejected": -0.3463515043258667, "sft_loss": 0.08816933631896973, "step": 3177 }, { "epoch": 4.595806218365872, "grad_norm": 1.4844982954091979, "learning_rate": 1.0776702215157153e-06, "logits/chosen": -0.8793070912361145, "logits/rejected": -0.7314832210540771, "logps/chosen": -0.0764409527182579, "logps/rejected": -3.413910388946533, "loss": 0.06, "odds_ratio_loss": 0.010066686198115349, "rewards/accuracies": 1.0, "rewards/chosen": -0.007644095458090305, "rewards/margins": 0.333746999502182, "rewards/rejected": -0.34139108657836914, "sft_loss": 0.0764409527182579, "step": 3178 }, { "epoch": 4.5972523499638465, "grad_norm": 1.536625642676757, "learning_rate": 1.0755503350792188e-06, "logits/chosen": -0.7018544673919678, "logits/rejected": -0.6762786507606506, "logps/chosen": -0.06070394814014435, "logps/rejected": -2.954383373260498, "loss": 0.1113, "odds_ratio_loss": 0.007944751530885696, "rewards/accuracies": 1.0, "rewards/chosen": -0.00607039500027895, "rewards/margins": 0.2893679738044739, "rewards/rejected": -0.29543834924697876, "sft_loss": 0.06070394814014435, "step": 3179 }, { "epoch": 4.598698481561822, "grad_norm": 1.3876750626742878, "learning_rate": 1.0734322118043158e-06, "logits/chosen": -0.861190676689148, "logits/rejected": -0.7343040704727173, "logps/chosen": -0.03276204317808151, "logps/rejected": -7.235999584197998, "loss": 0.0495, "odds_ratio_loss": 0.001321395393460989, "rewards/accuracies": 1.0, "rewards/chosen": -0.0032762044575065374, "rewards/margins": 0.7203238010406494, "rewards/rejected": -0.7236000299453735, "sft_loss": 0.03276204317808151, "step": 3180 }, { "epoch": 4.600144613159798, "grad_norm": 1.4689343043016123, "learning_rate": 1.0713158529680336e-06, "logits/chosen": -0.8702647686004639, "logits/rejected": -0.7496334314346313, "logps/chosen": -0.05915086343884468, "logps/rejected": -3.7460100650787354, "loss": 0.0443, "odds_ratio_loss": 0.00663282535970211, "rewards/accuracies": 1.0, "rewards/chosen": -0.005915086250752211, "rewards/margins": 0.36868590116500854, "rewards/rejected": -0.37460100650787354, "sft_loss": 0.05915086343884468, "step": 3181 }, { "epoch": 4.601590744757773, "grad_norm": 2.1623315858903696, "learning_rate": 1.069201259846331e-06, "logits/chosen": -0.8386332988739014, "logits/rejected": -0.6750970482826233, "logps/chosen": -0.08837445080280304, "logps/rejected": -4.1748151779174805, "loss": 0.0652, "odds_ratio_loss": 0.009183772839605808, "rewards/accuracies": 1.0, "rewards/chosen": -0.008837445639073849, "rewards/margins": 0.40864408016204834, "rewards/rejected": -0.41748151183128357, "sft_loss": 0.08837445080280304, "step": 3182 }, { "epoch": 4.603036876355748, "grad_norm": 1.4154943795887955, "learning_rate": 1.0670884337141028e-06, "logits/chosen": -0.6935139894485474, "logits/rejected": -0.42883455753326416, "logps/chosen": -0.10457340627908707, "logps/rejected": -5.574097156524658, "loss": 0.0871, "odds_ratio_loss": 0.016215285286307335, "rewards/accuracies": 1.0, "rewards/chosen": -0.010457340627908707, "rewards/margins": 0.5469523668289185, "rewards/rejected": -0.5574097037315369, "sft_loss": 0.10457340627908707, "step": 3183 }, { "epoch": 4.604483007953724, "grad_norm": 1.7813193278087818, "learning_rate": 1.0649773758451832e-06, "logits/chosen": -0.8007773160934448, "logits/rejected": -0.6058996915817261, "logps/chosen": -0.07291635125875473, "logps/rejected": -5.96047306060791, "loss": 0.0747, "odds_ratio_loss": 0.005711485166102648, "rewards/accuracies": 1.0, "rewards/chosen": -0.0072916364297270775, "rewards/margins": 0.58875572681427, "rewards/rejected": -0.5960473418235779, "sft_loss": 0.07291635125875473, "step": 3184 }, { "epoch": 4.6059291395517, "grad_norm": 1.8367648936309657, "learning_rate": 1.0628680875123327e-06, "logits/chosen": -0.5505800843238831, "logits/rejected": -0.4273523986339569, "logps/chosen": -0.06284250319004059, "logps/rejected": -3.4022233486175537, "loss": 0.0687, "odds_ratio_loss": 0.006926396396011114, "rewards/accuracies": 1.0, "rewards/chosen": -0.006284250877797604, "rewards/margins": 0.3339381217956543, "rewards/rejected": -0.3402223289012909, "sft_loss": 0.06284250319004059, "step": 3185 }, { "epoch": 4.6073752711496745, "grad_norm": 1.7856939273411114, "learning_rate": 1.0607605699872534e-06, "logits/chosen": -0.7774771451950073, "logits/rejected": -0.6703144311904907, "logps/chosen": -0.18263757228851318, "logps/rejected": -4.344770908355713, "loss": 0.099, "odds_ratio_loss": 0.021513303741812706, "rewards/accuracies": 1.0, "rewards/chosen": -0.01826375722885132, "rewards/margins": 0.41621333360671997, "rewards/rejected": -0.4344770908355713, "sft_loss": 0.18263757228851318, "step": 3186 }, { "epoch": 4.60882140274765, "grad_norm": 1.779759710533555, "learning_rate": 1.0586548245405715e-06, "logits/chosen": -0.5290587544441223, "logits/rejected": -0.5527068972587585, "logps/chosen": -0.20977208018302917, "logps/rejected": -4.67911434173584, "loss": 0.1121, "odds_ratio_loss": 0.024520423263311386, "rewards/accuracies": 1.0, "rewards/chosen": -0.020977208390831947, "rewards/margins": 0.44693419337272644, "rewards/rejected": -0.46791142225265503, "sft_loss": 0.20977208018302917, "step": 3187 }, { "epoch": 4.610267534345626, "grad_norm": 1.6500003465840882, "learning_rate": 1.0565508524418522e-06, "logits/chosen": -0.826015293598175, "logits/rejected": -0.6062042713165283, "logps/chosen": -0.09776744246482849, "logps/rejected": -3.359614849090576, "loss": 0.0841, "odds_ratio_loss": 0.008568299002945423, "rewards/accuracies": 1.0, "rewards/chosen": -0.009776744991540909, "rewards/margins": 0.3261847198009491, "rewards/rejected": -0.3359614908695221, "sft_loss": 0.09776744246482849, "step": 3188 }, { "epoch": 4.611713665943601, "grad_norm": 1.4752869233630663, "learning_rate": 1.0544486549595868e-06, "logits/chosen": -0.8045449256896973, "logits/rejected": -0.71222323179245, "logps/chosen": -0.04860668256878853, "logps/rejected": -3.213080883026123, "loss": 0.0518, "odds_ratio_loss": 0.007026593200862408, "rewards/accuracies": 1.0, "rewards/chosen": -0.004860668908804655, "rewards/margins": 0.3164474368095398, "rewards/rejected": -0.32130810618400574, "sft_loss": 0.04860668256878853, "step": 3189 }, { "epoch": 4.613159797541576, "grad_norm": 1.349596427506426, "learning_rate": 1.0523482333611973e-06, "logits/chosen": -0.9437330961227417, "logits/rejected": -0.670153021812439, "logps/chosen": -0.027105463668704033, "logps/rejected": -5.5081562995910645, "loss": 0.0588, "odds_ratio_loss": 0.003837391035631299, "rewards/accuracies": 1.0, "rewards/chosen": -0.002710546599701047, "rewards/margins": 0.5481050610542297, "rewards/rejected": -0.5508156418800354, "sft_loss": 0.027105463668704033, "step": 3190 }, { "epoch": 4.614605929139552, "grad_norm": 1.6136975666084887, "learning_rate": 1.0502495889130348e-06, "logits/chosen": -0.6746639609336853, "logits/rejected": -0.5449431538581848, "logps/chosen": -0.04978754371404648, "logps/rejected": -4.931973934173584, "loss": 0.0676, "odds_ratio_loss": 0.01146079320460558, "rewards/accuracies": 1.0, "rewards/chosen": -0.004978754557669163, "rewards/margins": 0.48821866512298584, "rewards/rejected": -0.49319738149642944, "sft_loss": 0.04978754371404648, "step": 3191 }, { "epoch": 4.616052060737527, "grad_norm": 1.8053309321240607, "learning_rate": 1.0481527228803825e-06, "logits/chosen": -0.7316631078720093, "logits/rejected": -0.48841989040374756, "logps/chosen": -0.05751391872763634, "logps/rejected": -7.884844779968262, "loss": 0.0968, "odds_ratio_loss": 0.0013119642389938235, "rewards/accuracies": 1.0, "rewards/chosen": -0.005751391872763634, "rewards/margins": 0.7827330827713013, "rewards/rejected": -0.7884844541549683, "sft_loss": 0.05751391872763634, "step": 3192 }, { "epoch": 4.6174981923355025, "grad_norm": 1.6906203577332746, "learning_rate": 1.0460576365274464e-06, "logits/chosen": -0.5634068846702576, "logits/rejected": -0.4004989564418793, "logps/chosen": -0.09039582312107086, "logps/rejected": -6.0387139320373535, "loss": 0.0674, "odds_ratio_loss": 0.010964239947497845, "rewards/accuracies": 1.0, "rewards/chosen": -0.00903958361595869, "rewards/margins": 0.5948318243026733, "rewards/rejected": -0.6038714051246643, "sft_loss": 0.09039582312107086, "step": 3193 }, { "epoch": 4.618944323933478, "grad_norm": 1.5504675757012243, "learning_rate": 1.043964331117364e-06, "logits/chosen": -0.6690958142280579, "logits/rejected": -0.7253610491752625, "logps/chosen": -0.04411579668521881, "logps/rejected": -3.1955485343933105, "loss": 0.0484, "odds_ratio_loss": 0.011617464944720268, "rewards/accuracies": 1.0, "rewards/chosen": -0.004411579575389624, "rewards/margins": 0.31514328718185425, "rewards/rejected": -0.3195548951625824, "sft_loss": 0.04411579668521881, "step": 3194 }, { "epoch": 4.620390455531453, "grad_norm": 1.9494225748283929, "learning_rate": 1.0418728079121946e-06, "logits/chosen": -0.7559603452682495, "logits/rejected": -0.6388688087463379, "logps/chosen": -0.08622848987579346, "logps/rejected": -4.276580810546875, "loss": 0.0798, "odds_ratio_loss": 0.00728376442566514, "rewards/accuracies": 1.0, "rewards/chosen": -0.008622849360108376, "rewards/margins": 0.4190352261066437, "rewards/rejected": -0.4276581108570099, "sft_loss": 0.08622848987579346, "step": 3195 }, { "epoch": 4.621836587129429, "grad_norm": 1.5533084791527914, "learning_rate": 1.039783068172928e-06, "logits/chosen": -0.704337477684021, "logits/rejected": -0.5278608202934265, "logps/chosen": -0.11907921731472015, "logps/rejected": -3.9224257469177246, "loss": 0.1027, "odds_ratio_loss": 0.014725545421242714, "rewards/accuracies": 1.0, "rewards/chosen": -0.011907922104001045, "rewards/margins": 0.38033467531204224, "rewards/rejected": -0.3922426104545593, "sft_loss": 0.11907921731472015, "step": 3196 }, { "epoch": 4.623282718727404, "grad_norm": 1.5224963198530153, "learning_rate": 1.0376951131594745e-06, "logits/chosen": -1.0034908056259155, "logits/rejected": -0.6819223165512085, "logps/chosen": -0.044741638004779816, "logps/rejected": -5.437371730804443, "loss": 0.0572, "odds_ratio_loss": 0.0024968015495687723, "rewards/accuracies": 1.0, "rewards/chosen": -0.0044741639867424965, "rewards/margins": 0.5392630100250244, "rewards/rejected": -0.5437371730804443, "sft_loss": 0.044741638004779816, "step": 3197 }, { "epoch": 4.624728850325379, "grad_norm": 2.572168604966729, "learning_rate": 1.0356089441306685e-06, "logits/chosen": -1.0177311897277832, "logits/rejected": -0.6533507108688354, "logps/chosen": -0.08731850981712341, "logps/rejected": -4.7309112548828125, "loss": 0.0765, "odds_ratio_loss": 0.01737801730632782, "rewards/accuracies": 1.0, "rewards/chosen": -0.008731850422918797, "rewards/margins": 0.4643592834472656, "rewards/rejected": -0.47309115529060364, "sft_loss": 0.08731850981712341, "step": 3198 }, { "epoch": 4.626174981923355, "grad_norm": 1.7934135984111654, "learning_rate": 1.0335245623442724e-06, "logits/chosen": -0.8442277908325195, "logits/rejected": -0.6539605259895325, "logps/chosen": -0.10159482061862946, "logps/rejected": -4.887598991394043, "loss": 0.1044, "odds_ratio_loss": 0.010155798867344856, "rewards/accuracies": 1.0, "rewards/chosen": -0.01015948224812746, "rewards/margins": 0.4786003828048706, "rewards/rejected": -0.4887598752975464, "sft_loss": 0.10159482061862946, "step": 3199 }, { "epoch": 4.6276211135213305, "grad_norm": 1.4702477282441089, "learning_rate": 1.0314419690569645e-06, "logits/chosen": -0.7017852067947388, "logits/rejected": -0.6079646348953247, "logps/chosen": -0.05808115005493164, "logps/rejected": -4.254312992095947, "loss": 0.0666, "odds_ratio_loss": 0.00289735640399158, "rewards/accuracies": 1.0, "rewards/chosen": -0.005808115005493164, "rewards/margins": 0.4196231961250305, "rewards/rejected": -0.4254313111305237, "sft_loss": 0.05808115005493164, "step": 3200 }, { "epoch": 4.629067245119306, "grad_norm": 1.8173043025233961, "learning_rate": 1.0293611655243508e-06, "logits/chosen": -0.8450669646263123, "logits/rejected": -0.6467236280441284, "logps/chosen": -0.17060142755508423, "logps/rejected": -6.41766357421875, "loss": 0.1134, "odds_ratio_loss": 0.03092418983578682, "rewards/accuracies": 1.0, "rewards/chosen": -0.017060142010450363, "rewards/margins": 0.6247062087059021, "rewards/rejected": -0.641766369342804, "sft_loss": 0.17060142755508423, "step": 3201 }, { "epoch": 4.630513376717281, "grad_norm": 1.7247881706158277, "learning_rate": 1.0272821530009528e-06, "logits/chosen": -0.7554938793182373, "logits/rejected": -0.631575882434845, "logps/chosen": -0.051126834005117416, "logps/rejected": -5.378612518310547, "loss": 0.0964, "odds_ratio_loss": 0.004960625432431698, "rewards/accuracies": 1.0, "rewards/chosen": -0.005112683400511742, "rewards/margins": 0.5327485799789429, "rewards/rejected": -0.5378612279891968, "sft_loss": 0.051126834005117416, "step": 3202 }, { "epoch": 4.631959508315257, "grad_norm": 1.698964824312715, "learning_rate": 1.025204932740218e-06, "logits/chosen": -0.8689213991165161, "logits/rejected": -0.4361826181411743, "logps/chosen": -0.0891483798623085, "logps/rejected": -5.1415205001831055, "loss": 0.0987, "odds_ratio_loss": 0.04759196564555168, "rewards/accuracies": 0.9375, "rewards/chosen": -0.008914838545024395, "rewards/margins": 0.5052372217178345, "rewards/rejected": -0.5141521096229553, "sft_loss": 0.0891483798623085, "step": 3203 }, { "epoch": 4.633405639913232, "grad_norm": 1.543659408045758, "learning_rate": 1.0231295059945084e-06, "logits/chosen": -0.7963493466377258, "logits/rejected": -0.6935844421386719, "logps/chosen": -0.054386600852012634, "logps/rejected": -3.769763708114624, "loss": 0.0832, "odds_ratio_loss": 0.0008780553471297026, "rewards/accuracies": 1.0, "rewards/chosen": -0.005438660271465778, "rewards/margins": 0.3715377449989319, "rewards/rejected": -0.3769764006137848, "sft_loss": 0.054386600852012634, "step": 3204 }, { "epoch": 4.634851771511207, "grad_norm": 1.5964525768359796, "learning_rate": 1.0210558740151065e-06, "logits/chosen": -0.7700487375259399, "logits/rejected": -0.5274595618247986, "logps/chosen": -0.08021484315395355, "logps/rejected": -5.129361152648926, "loss": 0.0822, "odds_ratio_loss": 0.013991568237543106, "rewards/accuracies": 1.0, "rewards/chosen": -0.00802148412913084, "rewards/margins": 0.5049146413803101, "rewards/rejected": -0.5129361152648926, "sft_loss": 0.08021484315395355, "step": 3205 }, { "epoch": 4.636297903109183, "grad_norm": 1.7217177993051314, "learning_rate": 1.0189840380522118e-06, "logits/chosen": -0.7825113534927368, "logits/rejected": -0.5335025787353516, "logps/chosen": -0.048637744039297104, "logps/rejected": -4.941579341888428, "loss": 0.0923, "odds_ratio_loss": 0.006592496298253536, "rewards/accuracies": 1.0, "rewards/chosen": -0.00486377440392971, "rewards/margins": 0.4892942011356354, "rewards/rejected": -0.49415794014930725, "sft_loss": 0.048637744039297104, "step": 3206 }, { "epoch": 4.637744034707159, "grad_norm": 1.6148713420823113, "learning_rate": 1.0169139993549443e-06, "logits/chosen": -0.6278676390647888, "logits/rejected": -0.5886056423187256, "logps/chosen": -0.09759227186441422, "logps/rejected": -6.357240676879883, "loss": 0.076, "odds_ratio_loss": 0.010194497182965279, "rewards/accuracies": 1.0, "rewards/chosen": -0.009759227745234966, "rewards/margins": 0.6259648203849792, "rewards/rejected": -0.6357241272926331, "sft_loss": 0.09759227186441422, "step": 3207 }, { "epoch": 4.639190166305134, "grad_norm": 1.6617822163455997, "learning_rate": 1.0148457591713358e-06, "logits/chosen": -0.7911767363548279, "logits/rejected": -0.7584560513496399, "logps/chosen": -0.12109269201755524, "logps/rejected": -3.515918254852295, "loss": 0.12, "odds_ratio_loss": 0.022366242483258247, "rewards/accuracies": 1.0, "rewards/chosen": -0.012109269388020039, "rewards/margins": 0.33948254585266113, "rewards/rejected": -0.3515918254852295, "sft_loss": 0.12109269201755524, "step": 3208 }, { "epoch": 4.640636297903109, "grad_norm": 1.5696273536778012, "learning_rate": 1.0127793187483367e-06, "logits/chosen": -0.7384767532348633, "logits/rejected": -0.5731381177902222, "logps/chosen": -0.07455767691135406, "logps/rejected": -5.048724174499512, "loss": 0.0901, "odds_ratio_loss": 0.0019971770234405994, "rewards/accuracies": 1.0, "rewards/chosen": -0.007455767132341862, "rewards/margins": 0.497416615486145, "rewards/rejected": -0.5048723816871643, "sft_loss": 0.07455767691135406, "step": 3209 }, { "epoch": 4.642082429501085, "grad_norm": 1.3018747419669137, "learning_rate": 1.010714679331813e-06, "logits/chosen": -0.6185959577560425, "logits/rejected": -0.5798184871673584, "logps/chosen": -0.08040968328714371, "logps/rejected": -4.064995288848877, "loss": 0.0642, "odds_ratio_loss": 0.05532249063253403, "rewards/accuracies": 0.9375, "rewards/chosen": -0.00804096832871437, "rewards/margins": 0.3984585702419281, "rewards/rejected": -0.4064995348453522, "sft_loss": 0.08040968328714371, "step": 3210 }, { "epoch": 4.64352856109906, "grad_norm": 1.5401199624080288, "learning_rate": 1.0086518421665417e-06, "logits/chosen": -0.8253481984138489, "logits/rejected": -0.4594198763370514, "logps/chosen": -0.06197258457541466, "logps/rejected": -4.424698352813721, "loss": 0.0879, "odds_ratio_loss": 0.05480283871293068, "rewards/accuracies": 0.9375, "rewards/chosen": -0.006197258364409208, "rewards/margins": 0.4362725615501404, "rewards/rejected": -0.44246983528137207, "sft_loss": 0.06197258457541466, "step": 3211 }, { "epoch": 4.644974692697035, "grad_norm": 1.7590357910590537, "learning_rate": 1.0065908084962166e-06, "logits/chosen": -0.6786633133888245, "logits/rejected": -0.5764570236206055, "logps/chosen": -0.11568009108304977, "logps/rejected": -3.1933226585388184, "loss": 0.0924, "odds_ratio_loss": 0.018704205751419067, "rewards/accuracies": 1.0, "rewards/chosen": -0.011568009853363037, "rewards/margins": 0.30776429176330566, "rewards/rejected": -0.3193323016166687, "sft_loss": 0.11568009108304977, "step": 3212 }, { "epoch": 4.646420824295011, "grad_norm": 1.4765110012013127, "learning_rate": 1.0045315795634416e-06, "logits/chosen": -0.8340771794319153, "logits/rejected": -0.63593989610672, "logps/chosen": -0.06669916212558746, "logps/rejected": -3.186521530151367, "loss": 0.0646, "odds_ratio_loss": 0.008953748270869255, "rewards/accuracies": 1.0, "rewards/chosen": -0.006669916212558746, "rewards/margins": 0.3119822144508362, "rewards/rejected": -0.3186521530151367, "sft_loss": 0.06669916212558746, "step": 3213 }, { "epoch": 4.647866955892987, "grad_norm": 1.4826650637112055, "learning_rate": 1.002474156609734e-06, "logits/chosen": -0.7655255794525146, "logits/rejected": -0.5246065855026245, "logps/chosen": -0.029604678973555565, "logps/rejected": -7.7826080322265625, "loss": 0.085, "odds_ratio_loss": 0.0013552091550081968, "rewards/accuracies": 1.0, "rewards/chosen": -0.0029604679439216852, "rewards/margins": 0.7753003835678101, "rewards/rejected": -0.7782608270645142, "sft_loss": 0.029604678973555565, "step": 3214 }, { "epoch": 4.649313087490961, "grad_norm": 1.5306493087569484, "learning_rate": 1.0004185408755196e-06, "logits/chosen": -0.9814145565032959, "logits/rejected": -0.6695619821548462, "logps/chosen": -0.0681711882352829, "logps/rejected": -5.490948677062988, "loss": 0.0671, "odds_ratio_loss": 0.008766880258917809, "rewards/accuracies": 1.0, "rewards/chosen": -0.006817118730396032, "rewards/margins": 0.5422778129577637, "rewards/rejected": -0.5490949153900146, "sft_loss": 0.0681711882352829, "step": 3215 }, { "epoch": 4.650759219088937, "grad_norm": 1.4412286059471762, "learning_rate": 9.983647336001401e-07, "logits/chosen": -0.6026754379272461, "logits/rejected": -0.6260681748390198, "logps/chosen": -0.09057678282260895, "logps/rejected": -3.460932731628418, "loss": 0.0653, "odds_ratio_loss": 0.015110738575458527, "rewards/accuracies": 1.0, "rewards/chosen": -0.009057678282260895, "rewards/margins": 0.3370356261730194, "rewards/rejected": -0.3460932672023773, "sft_loss": 0.09057678282260895, "step": 3216 }, { "epoch": 4.652205350686913, "grad_norm": 1.334960822259661, "learning_rate": 9.963127360218409e-07, "logits/chosen": -0.8631464242935181, "logits/rejected": -0.5723793506622314, "logps/chosen": -0.05538426339626312, "logps/rejected": -5.14091157913208, "loss": 0.0499, "odds_ratio_loss": 0.004457543138414621, "rewards/accuracies": 1.0, "rewards/chosen": -0.005538426339626312, "rewards/margins": 0.5085527300834656, "rewards/rejected": -0.5140911340713501, "sft_loss": 0.05538426339626312, "step": 3217 }, { "epoch": 4.653651482284888, "grad_norm": 1.655925383862883, "learning_rate": 9.94262549377781e-07, "logits/chosen": -0.6246498227119446, "logits/rejected": -0.5415275692939758, "logps/chosen": -0.06165987253189087, "logps/rejected": -3.9687774181365967, "loss": 0.0864, "odds_ratio_loss": 0.015562493354082108, "rewards/accuracies": 1.0, "rewards/chosen": -0.006165987346321344, "rewards/margins": 0.3907117545604706, "rewards/rejected": -0.3968777060508728, "sft_loss": 0.06165987253189087, "step": 3218 }, { "epoch": 4.655097613882863, "grad_norm": 1.551288429706046, "learning_rate": 9.922141749040238e-07, "logits/chosen": -0.8679819703102112, "logits/rejected": -0.5541244745254517, "logps/chosen": -0.06448002904653549, "logps/rejected": -4.471212387084961, "loss": 0.0783, "odds_ratio_loss": 0.004951273091137409, "rewards/accuracies": 1.0, "rewards/chosen": -0.006448003463447094, "rewards/margins": 0.44067320227622986, "rewards/rejected": -0.44712120294570923, "sft_loss": 0.06448002904653549, "step": 3219 }, { "epoch": 4.656543745480839, "grad_norm": 1.5372933816352716, "learning_rate": 9.901676138355438e-07, "logits/chosen": -0.6660540699958801, "logits/rejected": -0.5739330649375916, "logps/chosen": -0.04592113569378853, "logps/rejected": -4.954591274261475, "loss": 0.0778, "odds_ratio_loss": 0.004109309054911137, "rewards/accuracies": 1.0, "rewards/chosen": -0.004592114128172398, "rewards/margins": 0.4908669590950012, "rewards/rejected": -0.49545910954475403, "sft_loss": 0.04592113569378853, "step": 3220 }, { "epoch": 4.657989877078814, "grad_norm": 1.5844634244020717, "learning_rate": 9.88122867406219e-07, "logits/chosen": -0.7254536151885986, "logits/rejected": -0.5857067704200745, "logps/chosen": -0.1282578706741333, "logps/rejected": -5.345030307769775, "loss": 0.0979, "odds_ratio_loss": 0.006788488943129778, "rewards/accuracies": 1.0, "rewards/chosen": -0.01282578706741333, "rewards/margins": 0.5216772556304932, "rewards/rejected": -0.5345030426979065, "sft_loss": 0.1282578706741333, "step": 3221 }, { "epoch": 4.659436008676789, "grad_norm": 1.5352851093014344, "learning_rate": 9.860799368488338e-07, "logits/chosen": -0.8546849489212036, "logits/rejected": -0.49446427822113037, "logps/chosen": -0.058683719485998154, "logps/rejected": -6.933520793914795, "loss": 0.0789, "odds_ratio_loss": 0.003612469183281064, "rewards/accuracies": 1.0, "rewards/chosen": -0.005868372041732073, "rewards/margins": 0.6874837279319763, "rewards/rejected": -0.6933520436286926, "sft_loss": 0.058683719485998154, "step": 3222 }, { "epoch": 4.660882140274765, "grad_norm": 1.9348714464409837, "learning_rate": 9.840388233950809e-07, "logits/chosen": -0.878996729850769, "logits/rejected": -0.6803721785545349, "logps/chosen": -0.08524107933044434, "logps/rejected": -3.973269462585449, "loss": 0.1024, "odds_ratio_loss": 0.00521640432998538, "rewards/accuracies": 1.0, "rewards/chosen": -0.008524107746779919, "rewards/margins": 0.38880282640457153, "rewards/rejected": -0.3973269760608673, "sft_loss": 0.08524107933044434, "step": 3223 }, { "epoch": 4.662328271872741, "grad_norm": 2.0220508369894223, "learning_rate": 9.819995282755526e-07, "logits/chosen": -0.7756155133247375, "logits/rejected": -0.6541730761528015, "logps/chosen": -0.14658205211162567, "logps/rejected": -4.749970436096191, "loss": 0.1, "odds_ratio_loss": 0.013168347999453545, "rewards/accuracies": 1.0, "rewards/chosen": -0.014658206142485142, "rewards/margins": 0.460338830947876, "rewards/rejected": -0.47499704360961914, "sft_loss": 0.14658205211162567, "step": 3224 }, { "epoch": 4.663774403470716, "grad_norm": 1.6531658948928443, "learning_rate": 9.799620527197503e-07, "logits/chosen": -0.7469644546508789, "logits/rejected": -0.509243369102478, "logps/chosen": -0.1062968522310257, "logps/rejected": -4.866016864776611, "loss": 0.0707, "odds_ratio_loss": 0.007952062413096428, "rewards/accuracies": 1.0, "rewards/chosen": -0.0106296855956316, "rewards/margins": 0.4759719967842102, "rewards/rejected": -0.48660171031951904, "sft_loss": 0.1062968522310257, "step": 3225 }, { "epoch": 4.665220535068691, "grad_norm": 1.5363536198298622, "learning_rate": 9.779263979560735e-07, "logits/chosen": -0.8867968916893005, "logits/rejected": -0.7032777070999146, "logps/chosen": -0.07279931008815765, "logps/rejected": -3.2184600830078125, "loss": 0.0761, "odds_ratio_loss": 0.006781273987144232, "rewards/accuracies": 1.0, "rewards/chosen": -0.0072799306362867355, "rewards/margins": 0.31456607580184937, "rewards/rejected": -0.32184600830078125, "sft_loss": 0.07279931008815765, "step": 3226 }, { "epoch": 4.666666666666667, "grad_norm": 1.585570187302912, "learning_rate": 9.758925652118276e-07, "logits/chosen": -0.729887068271637, "logits/rejected": -0.5777206420898438, "logps/chosen": -0.05206802114844322, "logps/rejected": -4.347285747528076, "loss": 0.093, "odds_ratio_loss": 0.003964191768318415, "rewards/accuracies": 1.0, "rewards/chosen": -0.005206801928579807, "rewards/margins": 0.42952173948287964, "rewards/rejected": -0.43472856283187866, "sft_loss": 0.05206802114844322, "step": 3227 }, { "epoch": 4.668112798264642, "grad_norm": 1.1830202726874117, "learning_rate": 9.738605557132168e-07, "logits/chosen": -0.7481473088264465, "logits/rejected": -0.5761478543281555, "logps/chosen": -0.03131213039159775, "logps/rejected": -5.612335205078125, "loss": 0.0631, "odds_ratio_loss": 0.003878758754581213, "rewards/accuracies": 1.0, "rewards/chosen": -0.0031312128994613886, "rewards/margins": 0.5581023097038269, "rewards/rejected": -0.5612335801124573, "sft_loss": 0.03131213039159775, "step": 3228 }, { "epoch": 4.6695589298626174, "grad_norm": 1.4602396396613635, "learning_rate": 9.71830370685348e-07, "logits/chosen": -0.80008864402771, "logits/rejected": -0.8370683789253235, "logps/chosen": -0.08197289705276489, "logps/rejected": -4.320422172546387, "loss": 0.0699, "odds_ratio_loss": 0.010543855838477612, "rewards/accuracies": 1.0, "rewards/chosen": -0.008197290822863579, "rewards/margins": 0.4238448441028595, "rewards/rejected": -0.4320421516895294, "sft_loss": 0.08197289705276489, "step": 3229 }, { "epoch": 4.671005061460593, "grad_norm": 1.4630280771677762, "learning_rate": 9.698020113522253e-07, "logits/chosen": -0.6244576573371887, "logits/rejected": -0.5266187787055969, "logps/chosen": -0.04099838435649872, "logps/rejected": -6.273877143859863, "loss": 0.0721, "odds_ratio_loss": 0.0028168242424726486, "rewards/accuracies": 1.0, "rewards/chosen": -0.004099838435649872, "rewards/margins": 0.6232879161834717, "rewards/rejected": -0.6273877620697021, "sft_loss": 0.04099838435649872, "step": 3230 }, { "epoch": 4.672451193058569, "grad_norm": 1.6816336361544946, "learning_rate": 9.677754789367569e-07, "logits/chosen": -1.0483098030090332, "logits/rejected": -0.6950076818466187, "logps/chosen": -0.10204799473285675, "logps/rejected": -5.153908729553223, "loss": 0.1092, "odds_ratio_loss": 0.00561350304633379, "rewards/accuracies": 1.0, "rewards/chosen": -0.0102047985419631, "rewards/margins": 0.5051860809326172, "rewards/rejected": -0.5153908729553223, "sft_loss": 0.10204799473285675, "step": 3231 }, { "epoch": 4.673897324656544, "grad_norm": 1.6650569138582232, "learning_rate": 9.657507746607442e-07, "logits/chosen": -0.8510163426399231, "logits/rejected": -0.6006070971488953, "logps/chosen": -0.061348456889390945, "logps/rejected": -4.452303886413574, "loss": 0.0874, "odds_ratio_loss": 0.008031385019421577, "rewards/accuracies": 1.0, "rewards/chosen": -0.006134845782071352, "rewards/margins": 0.43909552693367004, "rewards/rejected": -0.4452304244041443, "sft_loss": 0.061348456889390945, "step": 3232 }, { "epoch": 4.675343456254519, "grad_norm": 1.4129358383161126, "learning_rate": 9.637278997448919e-07, "logits/chosen": -1.053891658782959, "logits/rejected": -0.7565010190010071, "logps/chosen": -0.03904981166124344, "logps/rejected": -4.420421600341797, "loss": 0.0511, "odds_ratio_loss": 0.0059059979394078255, "rewards/accuracies": 1.0, "rewards/chosen": -0.0039049815386533737, "rewards/margins": 0.4381372034549713, "rewards/rejected": -0.44204211235046387, "sft_loss": 0.03904981166124344, "step": 3233 }, { "epoch": 4.676789587852495, "grad_norm": 1.6502092599765257, "learning_rate": 9.617068554087953e-07, "logits/chosen": -0.7731687426567078, "logits/rejected": -0.6064621806144714, "logps/chosen": -0.07107666879892349, "logps/rejected": -4.7771806716918945, "loss": 0.0717, "odds_ratio_loss": 0.004392960108816624, "rewards/accuracies": 1.0, "rewards/chosen": -0.007107666693627834, "rewards/margins": 0.4706103801727295, "rewards/rejected": -0.4777180850505829, "sft_loss": 0.07107666879892349, "step": 3234 }, { "epoch": 4.67823571945047, "grad_norm": 1.828353358209516, "learning_rate": 9.596876428709531e-07, "logits/chosen": -0.7077252864837646, "logits/rejected": -0.49990472197532654, "logps/chosen": -0.09420859813690186, "logps/rejected": -4.479310035705566, "loss": 0.0771, "odds_ratio_loss": 0.012622407637536526, "rewards/accuracies": 1.0, "rewards/chosen": -0.00942085962742567, "rewards/margins": 0.4385101795196533, "rewards/rejected": -0.4479310214519501, "sft_loss": 0.09420859813690186, "step": 3235 }, { "epoch": 4.6796818510484455, "grad_norm": 1.7821117898301133, "learning_rate": 9.576702633487537e-07, "logits/chosen": -0.8034138679504395, "logits/rejected": -0.6696498990058899, "logps/chosen": -0.14594009518623352, "logps/rejected": -3.874324321746826, "loss": 0.0969, "odds_ratio_loss": 0.024632243439555168, "rewards/accuracies": 1.0, "rewards/chosen": -0.014594011008739471, "rewards/margins": 0.37283843755722046, "rewards/rejected": -0.38743242621421814, "sft_loss": 0.14594009518623352, "step": 3236 }, { "epoch": 4.681127982646421, "grad_norm": 1.9260742548491816, "learning_rate": 9.556547180584828e-07, "logits/chosen": -0.6212836503982544, "logits/rejected": -0.5890661478042603, "logps/chosen": -0.15807095170021057, "logps/rejected": -3.948227882385254, "loss": 0.126, "odds_ratio_loss": 0.0165469441562891, "rewards/accuracies": 1.0, "rewards/chosen": -0.01580709218978882, "rewards/margins": 0.3790157437324524, "rewards/rejected": -0.3948228359222412, "sft_loss": 0.15807095170021057, "step": 3237 }, { "epoch": 4.682574114244396, "grad_norm": 1.7002914000121425, "learning_rate": 9.536410082153215e-07, "logits/chosen": -0.7422010898590088, "logits/rejected": -0.556551992893219, "logps/chosen": -0.19723354279994965, "logps/rejected": -5.211292743682861, "loss": 0.106, "odds_ratio_loss": 0.009756670333445072, "rewards/accuracies": 1.0, "rewards/chosen": -0.019723355770111084, "rewards/margins": 0.5014059543609619, "rewards/rejected": -0.521129310131073, "sft_loss": 0.19723354279994965, "step": 3238 }, { "epoch": 4.684020245842372, "grad_norm": 1.3402956800471524, "learning_rate": 9.516291350333414e-07, "logits/chosen": -0.8872632384300232, "logits/rejected": -0.7011750936508179, "logps/chosen": -0.04245447367429733, "logps/rejected": -4.0774006843566895, "loss": 0.053, "odds_ratio_loss": 0.0019003519555553794, "rewards/accuracies": 1.0, "rewards/chosen": -0.004245447926223278, "rewards/margins": 0.40349453687667847, "rewards/rejected": -0.4077400267124176, "sft_loss": 0.04245447367429733, "step": 3239 }, { "epoch": 4.685466377440347, "grad_norm": 1.6582515741923567, "learning_rate": 9.496190997255098e-07, "logits/chosen": -0.6277506947517395, "logits/rejected": -0.443684846162796, "logps/chosen": -0.07038243860006332, "logps/rejected": -4.194167137145996, "loss": 0.0861, "odds_ratio_loss": 0.014950907789170742, "rewards/accuracies": 1.0, "rewards/chosen": -0.00703824358060956, "rewards/margins": 0.4123784899711609, "rewards/rejected": -0.41941672563552856, "sft_loss": 0.07038243860006332, "step": 3240 }, { "epoch": 4.686912509038322, "grad_norm": 1.5661539655850352, "learning_rate": 9.476109035036831e-07, "logits/chosen": -0.833951473236084, "logits/rejected": -0.7562543153762817, "logps/chosen": -0.10855622589588165, "logps/rejected": -3.645920753479004, "loss": 0.076, "odds_ratio_loss": 0.010784967802464962, "rewards/accuracies": 1.0, "rewards/chosen": -0.010855622589588165, "rewards/margins": 0.35373643040657043, "rewards/rejected": -0.3645920753479004, "sft_loss": 0.10855622589588165, "step": 3241 }, { "epoch": 4.688358640636298, "grad_norm": 1.4776366552591538, "learning_rate": 9.456045475786121e-07, "logits/chosen": -0.7049121260643005, "logits/rejected": -0.518916130065918, "logps/chosen": -0.04573969542980194, "logps/rejected": -5.782866954803467, "loss": 0.0754, "odds_ratio_loss": 0.006768940016627312, "rewards/accuracies": 1.0, "rewards/chosen": -0.004573969170451164, "rewards/margins": 0.5737127065658569, "rewards/rejected": -0.5782866477966309, "sft_loss": 0.04573969542980194, "step": 3242 }, { "epoch": 4.6898047722342735, "grad_norm": 1.615380623509659, "learning_rate": 9.436000331599347e-07, "logits/chosen": -0.865658164024353, "logits/rejected": -0.7146108150482178, "logps/chosen": -0.038462672382593155, "logps/rejected": -4.004945755004883, "loss": 0.0563, "odds_ratio_loss": 0.007846795953810215, "rewards/accuracies": 1.0, "rewards/chosen": -0.003846267005428672, "rewards/margins": 0.3966482877731323, "rewards/rejected": -0.40049460530281067, "sft_loss": 0.038462672382593155, "step": 3243 }, { "epoch": 4.691250903832248, "grad_norm": 1.7145212188015946, "learning_rate": 9.415973614561812e-07, "logits/chosen": -0.6654316782951355, "logits/rejected": -0.5937279462814331, "logps/chosen": -0.07255206257104874, "logps/rejected": -5.105835914611816, "loss": 0.0939, "odds_ratio_loss": 0.010402324609458447, "rewards/accuracies": 1.0, "rewards/chosen": -0.007255205884575844, "rewards/margins": 0.5033283233642578, "rewards/rejected": -0.5105835199356079, "sft_loss": 0.07255206257104874, "step": 3244 }, { "epoch": 4.692697035430224, "grad_norm": 1.5620231531870004, "learning_rate": 9.395965336747677e-07, "logits/chosen": -0.7461257576942444, "logits/rejected": -0.6894717216491699, "logps/chosen": -0.06823474168777466, "logps/rejected": -5.050882339477539, "loss": 0.0625, "odds_ratio_loss": 0.007007937878370285, "rewards/accuracies": 1.0, "rewards/chosen": -0.006823473609983921, "rewards/margins": 0.49826475977897644, "rewards/rejected": -0.505088210105896, "sft_loss": 0.06823474168777466, "step": 3245 }, { "epoch": 4.6941431670282, "grad_norm": 1.5250744458372743, "learning_rate": 9.375975510220033e-07, "logits/chosen": -0.6381784677505493, "logits/rejected": -0.47818252444267273, "logps/chosen": -0.06893990188837051, "logps/rejected": -4.53413200378418, "loss": 0.0681, "odds_ratio_loss": 0.006821871735155582, "rewards/accuracies": 1.0, "rewards/chosen": -0.006893990095704794, "rewards/margins": 0.446519136428833, "rewards/rejected": -0.4534131586551666, "sft_loss": 0.06893990188837051, "step": 3246 }, { "epoch": 4.695589298626175, "grad_norm": 1.3255197344195617, "learning_rate": 9.356004147030798e-07, "logits/chosen": -0.7541120052337646, "logits/rejected": -0.6116973161697388, "logps/chosen": -0.10232388228178024, "logps/rejected": -4.735118865966797, "loss": 0.0695, "odds_ratio_loss": 0.005717164371162653, "rewards/accuracies": 1.0, "rewards/chosen": -0.01023238804191351, "rewards/margins": 0.4632795453071594, "rewards/rejected": -0.4735119044780731, "sft_loss": 0.10232388228178024, "step": 3247 }, { "epoch": 4.69703543022415, "grad_norm": 1.6754504716143828, "learning_rate": 9.336051259220807e-07, "logits/chosen": -0.813551127910614, "logits/rejected": -0.5718523859977722, "logps/chosen": -0.08019101619720459, "logps/rejected": -4.289122104644775, "loss": 0.0796, "odds_ratio_loss": 0.008219133131206036, "rewards/accuracies": 1.0, "rewards/chosen": -0.008019101805984974, "rewards/margins": 0.4208931028842926, "rewards/rejected": -0.4289122223854065, "sft_loss": 0.08019101619720459, "step": 3248 }, { "epoch": 4.698481561822126, "grad_norm": 1.4682579826607338, "learning_rate": 9.316116858819704e-07, "logits/chosen": -0.9900127053260803, "logits/rejected": -0.6790282726287842, "logps/chosen": -0.04047767072916031, "logps/rejected": -4.796731948852539, "loss": 0.0736, "odds_ratio_loss": 0.002619100734591484, "rewards/accuracies": 1.0, "rewards/chosen": -0.0040477667935192585, "rewards/margins": 0.4756254553794861, "rewards/rejected": -0.47967323660850525, "sft_loss": 0.04047767072916031, "step": 3249 }, { "epoch": 4.6999276934201015, "grad_norm": 1.7451548851827552, "learning_rate": 9.296200957846028e-07, "logits/chosen": -0.8112927079200745, "logits/rejected": -0.715489387512207, "logps/chosen": -0.07249405980110168, "logps/rejected": -4.894453525543213, "loss": 0.0778, "odds_ratio_loss": 0.00706165935844183, "rewards/accuracies": 1.0, "rewards/chosen": -0.007249406538903713, "rewards/margins": 0.48219597339630127, "rewards/rejected": -0.48944535851478577, "sft_loss": 0.07249405980110168, "step": 3250 }, { "epoch": 4.701373825018076, "grad_norm": 1.649219451393204, "learning_rate": 9.276303568307167e-07, "logits/chosen": -0.9887948036193848, "logits/rejected": -0.7613665461540222, "logps/chosen": -0.0671960636973381, "logps/rejected": -3.415968179702759, "loss": 0.098, "odds_ratio_loss": 0.003112174803391099, "rewards/accuracies": 1.0, "rewards/chosen": -0.006719606928527355, "rewards/margins": 0.33487722277641296, "rewards/rejected": -0.3415968418121338, "sft_loss": 0.0671960636973381, "step": 3251 }, { "epoch": 4.702819956616052, "grad_norm": 2.515817384014982, "learning_rate": 9.25642470219929e-07, "logits/chosen": -0.8247559070587158, "logits/rejected": -0.6773849129676819, "logps/chosen": -0.07466698437929153, "logps/rejected": -4.479544162750244, "loss": 0.0735, "odds_ratio_loss": 0.0051119765266776085, "rewards/accuracies": 1.0, "rewards/chosen": -0.007466698996722698, "rewards/margins": 0.44048771262168884, "rewards/rejected": -0.4479544162750244, "sft_loss": 0.07466698437929153, "step": 3252 }, { "epoch": 4.704266088214028, "grad_norm": 1.491972360970979, "learning_rate": 9.236564371507474e-07, "logits/chosen": -0.7877534031867981, "logits/rejected": -0.6748812794685364, "logps/chosen": -0.13032501935958862, "logps/rejected": -4.983989715576172, "loss": 0.0713, "odds_ratio_loss": 0.01147688739001751, "rewards/accuracies": 1.0, "rewards/chosen": -0.013032502494752407, "rewards/margins": 0.48536643385887146, "rewards/rejected": -0.49839890003204346, "sft_loss": 0.13032501935958862, "step": 3253 }, { "epoch": 4.705712219812003, "grad_norm": 1.624021210716625, "learning_rate": 9.216722588205561e-07, "logits/chosen": -0.8625186085700989, "logits/rejected": -0.8469754457473755, "logps/chosen": -0.08548222482204437, "logps/rejected": -4.792725086212158, "loss": 0.0718, "odds_ratio_loss": 0.008853597566485405, "rewards/accuracies": 1.0, "rewards/chosen": -0.008548222482204437, "rewards/margins": 0.47072431445121765, "rewards/rejected": -0.4792725443840027, "sft_loss": 0.08548222482204437, "step": 3254 }, { "epoch": 4.707158351409978, "grad_norm": 1.7508042191690136, "learning_rate": 9.196899364256259e-07, "logits/chosen": -0.6333081722259521, "logits/rejected": -0.4871371388435364, "logps/chosen": -0.08562222123146057, "logps/rejected": -4.179487705230713, "loss": 0.1186, "odds_ratio_loss": 0.008961411193013191, "rewards/accuracies": 1.0, "rewards/chosen": -0.008562222123146057, "rewards/margins": 0.4093865156173706, "rewards/rejected": -0.41794872283935547, "sft_loss": 0.08562222123146057, "step": 3255 }, { "epoch": 4.708604483007954, "grad_norm": 1.4369748515968988, "learning_rate": 9.177094711611041e-07, "logits/chosen": -0.7007789611816406, "logits/rejected": -0.4575268030166626, "logps/chosen": -0.034006789326667786, "logps/rejected": -4.8226637840271, "loss": 0.0648, "odds_ratio_loss": 0.001526236068457365, "rewards/accuracies": 1.0, "rewards/chosen": -0.0034006789792329073, "rewards/margins": 0.47886568307876587, "rewards/rejected": -0.4822664260864258, "sft_loss": 0.034006789326667786, "step": 3256 }, { "epoch": 4.7100506146059296, "grad_norm": 1.93287908188121, "learning_rate": 9.157308642210235e-07, "logits/chosen": -0.7941340804100037, "logits/rejected": -0.5466873049736023, "logps/chosen": -0.06813658028841019, "logps/rejected": -3.8739757537841797, "loss": 0.0663, "odds_ratio_loss": 0.014670598320662975, "rewards/accuracies": 1.0, "rewards/chosen": -0.0068136584013700485, "rewards/margins": 0.3805839419364929, "rewards/rejected": -0.3873975872993469, "sft_loss": 0.06813658028841019, "step": 3257 }, { "epoch": 4.711496746203904, "grad_norm": 1.6984279813334189, "learning_rate": 9.137541167982905e-07, "logits/chosen": -0.997580349445343, "logits/rejected": -0.6903654932975769, "logps/chosen": -0.11518067121505737, "logps/rejected": -4.90092658996582, "loss": 0.0901, "odds_ratio_loss": 0.011495430953800678, "rewards/accuracies": 1.0, "rewards/chosen": -0.011518066748976707, "rewards/margins": 0.47857460379600525, "rewards/rejected": -0.4900926947593689, "sft_loss": 0.11518067121505737, "step": 3258 }, { "epoch": 4.71294287780188, "grad_norm": 1.5717400046835743, "learning_rate": 9.117792300846958e-07, "logits/chosen": -0.8282598257064819, "logits/rejected": -0.6946208477020264, "logps/chosen": -0.031053097918629646, "logps/rejected": -4.154017925262451, "loss": 0.0545, "odds_ratio_loss": 0.0034298747777938843, "rewards/accuracies": 1.0, "rewards/chosen": -0.0031053097918629646, "rewards/margins": 0.41229650378227234, "rewards/rejected": -0.41540175676345825, "sft_loss": 0.031053097918629646, "step": 3259 }, { "epoch": 4.714389009399856, "grad_norm": 1.4768072947611934, "learning_rate": 9.098062052709052e-07, "logits/chosen": -0.8885550498962402, "logits/rejected": -0.587072491645813, "logps/chosen": -0.07804380357265472, "logps/rejected": -6.696857929229736, "loss": 0.0927, "odds_ratio_loss": 0.008093244396150112, "rewards/accuracies": 1.0, "rewards/chosen": -0.007804380729794502, "rewards/margins": 0.6618814468383789, "rewards/rejected": -0.6696858406066895, "sft_loss": 0.07804380357265472, "step": 3260 }, { "epoch": 4.7158351409978305, "grad_norm": 1.5119276549388956, "learning_rate": 9.078350435464627e-07, "logits/chosen": -0.7504903078079224, "logits/rejected": -0.6390421390533447, "logps/chosen": -0.11923964321613312, "logps/rejected": -4.330544471740723, "loss": 0.0777, "odds_ratio_loss": 0.007041964679956436, "rewards/accuracies": 1.0, "rewards/chosen": -0.011923965066671371, "rewards/margins": 0.4211304783821106, "rewards/rejected": -0.4330544173717499, "sft_loss": 0.11923964321613312, "step": 3261 }, { "epoch": 4.717281272595806, "grad_norm": 1.848013472334746, "learning_rate": 9.058657460997876e-07, "logits/chosen": -1.02030611038208, "logits/rejected": -0.9156399965286255, "logps/chosen": -0.07417614012956619, "logps/rejected": -3.798497200012207, "loss": 0.0791, "odds_ratio_loss": 0.008186925202608109, "rewards/accuracies": 1.0, "rewards/chosen": -0.007417613640427589, "rewards/margins": 0.3724321126937866, "rewards/rejected": -0.37984973192214966, "sft_loss": 0.07417614012956619, "step": 3262 }, { "epoch": 4.718727404193782, "grad_norm": 2.0126285877800028, "learning_rate": 9.03898314118178e-07, "logits/chosen": -0.9355478882789612, "logits/rejected": -0.703206479549408, "logps/chosen": -0.1144571453332901, "logps/rejected": -6.172203063964844, "loss": 0.0773, "odds_ratio_loss": 0.011154555715620518, "rewards/accuracies": 1.0, "rewards/chosen": -0.011445715092122555, "rewards/margins": 0.6057746410369873, "rewards/rejected": -0.6172203421592712, "sft_loss": 0.1144571453332901, "step": 3263 }, { "epoch": 4.720173535791757, "grad_norm": 1.5994780503232926, "learning_rate": 9.019327487878072e-07, "logits/chosen": -0.7576562166213989, "logits/rejected": -0.6976518630981445, "logps/chosen": -0.05885721370577812, "logps/rejected": -5.845735549926758, "loss": 0.0676, "odds_ratio_loss": 0.005953238345682621, "rewards/accuracies": 1.0, "rewards/chosen": -0.005885721184313297, "rewards/margins": 0.5786879062652588, "rewards/rejected": -0.5845736265182495, "sft_loss": 0.05885721370577812, "step": 3264 }, { "epoch": 4.721619667389732, "grad_norm": 1.8736737702028536, "learning_rate": 8.999690512937195e-07, "logits/chosen": -0.9201067686080933, "logits/rejected": -0.7886765003204346, "logps/chosen": -0.03042430989444256, "logps/rejected": -2.8618273735046387, "loss": 0.0658, "odds_ratio_loss": 0.003192545147612691, "rewards/accuracies": 1.0, "rewards/chosen": -0.0030424310825765133, "rewards/margins": 0.2831403315067291, "rewards/rejected": -0.2861827611923218, "sft_loss": 0.03042430989444256, "step": 3265 }, { "epoch": 4.723065798987708, "grad_norm": 1.5332286243868711, "learning_rate": 8.980072228198374e-07, "logits/chosen": -1.1953673362731934, "logits/rejected": -0.7290738224983215, "logps/chosen": -0.08467172086238861, "logps/rejected": -4.777332305908203, "loss": 0.0647, "odds_ratio_loss": 0.003652008716017008, "rewards/accuracies": 1.0, "rewards/chosen": -0.008467172272503376, "rewards/margins": 0.4692661166191101, "rewards/rejected": -0.4777332842350006, "sft_loss": 0.08467172086238861, "step": 3266 }, { "epoch": 4.724511930585683, "grad_norm": 1.421654983561279, "learning_rate": 8.960472645489536e-07, "logits/chosen": -0.9492431879043579, "logits/rejected": -0.63211590051651, "logps/chosen": -0.05600889027118683, "logps/rejected": -6.399876594543457, "loss": 0.0529, "odds_ratio_loss": 0.0036964938044548035, "rewards/accuracies": 1.0, "rewards/chosen": -0.005600889213383198, "rewards/margins": 0.6343867778778076, "rewards/rejected": -0.639987587928772, "sft_loss": 0.05600889027118683, "step": 3267 }, { "epoch": 4.7259580621836585, "grad_norm": 1.5880771854974947, "learning_rate": 8.940891776627348e-07, "logits/chosen": -0.8621605634689331, "logits/rejected": -0.5667805671691895, "logps/chosen": -0.09114475548267365, "logps/rejected": -7.375045299530029, "loss": 0.0932, "odds_ratio_loss": 0.00746044609695673, "rewards/accuracies": 1.0, "rewards/chosen": -0.00911447498947382, "rewards/margins": 0.728390097618103, "rewards/rejected": -0.7375046014785767, "sft_loss": 0.09114475548267365, "step": 3268 }, { "epoch": 4.727404193781634, "grad_norm": 1.5189323585430745, "learning_rate": 8.921329633417172e-07, "logits/chosen": -0.9254083633422852, "logits/rejected": -0.7080655097961426, "logps/chosen": -0.11067543923854828, "logps/rejected": -3.79581356048584, "loss": 0.0815, "odds_ratio_loss": 0.026493411511182785, "rewards/accuracies": 1.0, "rewards/chosen": -0.011067543178796768, "rewards/margins": 0.368513822555542, "rewards/rejected": -0.37958139181137085, "sft_loss": 0.11067543923854828, "step": 3269 }, { "epoch": 4.72885032537961, "grad_norm": 1.7915520979090995, "learning_rate": 8.901786227653119e-07, "logits/chosen": -1.0431417226791382, "logits/rejected": -0.6968197822570801, "logps/chosen": -0.06255242228507996, "logps/rejected": -5.142256259918213, "loss": 0.1002, "odds_ratio_loss": 0.004798182286322117, "rewards/accuracies": 1.0, "rewards/chosen": -0.006255242042243481, "rewards/margins": 0.50797039270401, "rewards/rejected": -0.5142256617546082, "sft_loss": 0.06255242228507996, "step": 3270 }, { "epoch": 4.730296456977585, "grad_norm": 1.5051111918909226, "learning_rate": 8.882261571117959e-07, "logits/chosen": -0.9455627202987671, "logits/rejected": -0.6978684663772583, "logps/chosen": -0.058345332741737366, "logps/rejected": -3.4079577922821045, "loss": 0.0952, "odds_ratio_loss": 0.0066518341191112995, "rewards/accuracies": 1.0, "rewards/chosen": -0.005834532901644707, "rewards/margins": 0.33496126532554626, "rewards/rejected": -0.3407958149909973, "sft_loss": 0.058345332741737366, "step": 3271 }, { "epoch": 4.73174258857556, "grad_norm": 1.6356443459061143, "learning_rate": 8.862755675583207e-07, "logits/chosen": -0.5706143379211426, "logits/rejected": -0.42333659529685974, "logps/chosen": -0.0708899050951004, "logps/rejected": -6.153177261352539, "loss": 0.0879, "odds_ratio_loss": 0.0040434496477246284, "rewards/accuracies": 1.0, "rewards/chosen": -0.007088990416377783, "rewards/margins": 0.6082287430763245, "rewards/rejected": -0.6153177618980408, "sft_loss": 0.0708899050951004, "step": 3272 }, { "epoch": 4.733188720173536, "grad_norm": 1.667441586060215, "learning_rate": 8.843268552809009e-07, "logits/chosen": -0.7408003211021423, "logits/rejected": -0.5686386227607727, "logps/chosen": -0.10141906142234802, "logps/rejected": -4.278630256652832, "loss": 0.0906, "odds_ratio_loss": 0.010225032456219196, "rewards/accuracies": 1.0, "rewards/chosen": -0.010141907259821892, "rewards/margins": 0.417721152305603, "rewards/rejected": -0.42786306142807007, "sft_loss": 0.10141906142234802, "step": 3273 }, { "epoch": 4.734634851771511, "grad_norm": 1.646184662641339, "learning_rate": 8.823800214544257e-07, "logits/chosen": -0.8300518989562988, "logits/rejected": -0.771294116973877, "logps/chosen": -0.06484097242355347, "logps/rejected": -5.420320510864258, "loss": 0.081, "odds_ratio_loss": 0.00832346174865961, "rewards/accuracies": 1.0, "rewards/chosen": -0.006484096869826317, "rewards/margins": 0.5355479121208191, "rewards/rejected": -0.54203200340271, "sft_loss": 0.06484097242355347, "step": 3274 }, { "epoch": 4.736080983369487, "grad_norm": 1.6407642969954903, "learning_rate": 8.804350672526469e-07, "logits/chosen": -0.8406962156295776, "logits/rejected": -0.6480029821395874, "logps/chosen": -0.06862354278564453, "logps/rejected": -4.057324409484863, "loss": 0.0744, "odds_ratio_loss": 0.017293782904744148, "rewards/accuracies": 1.0, "rewards/chosen": -0.006862354464828968, "rewards/margins": 0.398870050907135, "rewards/rejected": -0.4057324230670929, "sft_loss": 0.06862354278564453, "step": 3275 }, { "epoch": 4.737527114967462, "grad_norm": 1.243947005110168, "learning_rate": 8.784919938481832e-07, "logits/chosen": -0.9880828261375427, "logits/rejected": -0.694316029548645, "logps/chosen": -0.04657265543937683, "logps/rejected": -4.722322463989258, "loss": 0.0611, "odds_ratio_loss": 0.004112796392291784, "rewards/accuracies": 1.0, "rewards/chosen": -0.004657265730202198, "rewards/margins": 0.46757495403289795, "rewards/rejected": -0.4722321927547455, "sft_loss": 0.04657265543937683, "step": 3276 }, { "epoch": 4.738973246565438, "grad_norm": 1.4949852325477238, "learning_rate": 8.76550802412523e-07, "logits/chosen": -0.43887242674827576, "logits/rejected": -0.36812540888786316, "logps/chosen": -0.03462660312652588, "logps/rejected": -5.46714973449707, "loss": 0.0631, "odds_ratio_loss": 0.013562695123255253, "rewards/accuracies": 1.0, "rewards/chosen": -0.0034626605920493603, "rewards/margins": 0.5432523488998413, "rewards/rejected": -0.5467150211334229, "sft_loss": 0.03462660312652588, "step": 3277 }, { "epoch": 4.740419378163413, "grad_norm": 1.4479273066737757, "learning_rate": 8.746114941160163e-07, "logits/chosen": -0.691802978515625, "logits/rejected": -0.5151437520980835, "logps/chosen": -0.1676384061574936, "logps/rejected": -5.6583147048950195, "loss": 0.0748, "odds_ratio_loss": 0.016385193914175034, "rewards/accuracies": 1.0, "rewards/chosen": -0.01676384173333645, "rewards/margins": 0.5490676760673523, "rewards/rejected": -0.5658314824104309, "sft_loss": 0.1676384061574936, "step": 3278 }, { "epoch": 4.741865509761388, "grad_norm": 1.6250065519446988, "learning_rate": 8.726740701278808e-07, "logits/chosen": -0.9104323387145996, "logits/rejected": -0.8115368485450745, "logps/chosen": -0.02846909500658512, "logps/rejected": -3.3297507762908936, "loss": 0.0595, "odds_ratio_loss": 0.0022020572796463966, "rewards/accuracies": 1.0, "rewards/chosen": -0.0028469092212617397, "rewards/margins": 0.33012816309928894, "rewards/rejected": -0.3329750597476959, "sft_loss": 0.02846909500658512, "step": 3279 }, { "epoch": 4.743311641359364, "grad_norm": 1.5518030908611087, "learning_rate": 8.707385316161953e-07, "logits/chosen": -0.8623483180999756, "logits/rejected": -0.645122766494751, "logps/chosen": -0.05945183336734772, "logps/rejected": -4.296452045440674, "loss": 0.0591, "odds_ratio_loss": 0.005059407092630863, "rewards/accuracies": 1.0, "rewards/chosen": -0.005945183336734772, "rewards/margins": 0.4237000346183777, "rewards/rejected": -0.4296451807022095, "sft_loss": 0.05945183336734772, "step": 3280 }, { "epoch": 4.744757772957339, "grad_norm": 1.7926493160828691, "learning_rate": 8.688048797479042e-07, "logits/chosen": -1.0307143926620483, "logits/rejected": -0.6179426908493042, "logps/chosen": -0.06234199181199074, "logps/rejected": -5.2709503173828125, "loss": 0.0493, "odds_ratio_loss": 0.003489615162834525, "rewards/accuracies": 1.0, "rewards/chosen": -0.0062341997399926186, "rewards/margins": 0.5208608508110046, "rewards/rejected": -0.5270950198173523, "sft_loss": 0.06234199181199074, "step": 3281 }, { "epoch": 4.746203904555315, "grad_norm": 1.658140114417297, "learning_rate": 8.668731156888131e-07, "logits/chosen": -0.9141590595245361, "logits/rejected": -0.656516432762146, "logps/chosen": -0.11738917231559753, "logps/rejected": -3.92860746383667, "loss": 0.063, "odds_ratio_loss": 0.012846600264310837, "rewards/accuracies": 1.0, "rewards/chosen": -0.011738916859030724, "rewards/margins": 0.38112184405326843, "rewards/rejected": -0.3928607702255249, "sft_loss": 0.11738917231559753, "step": 3282 }, { "epoch": 4.74765003615329, "grad_norm": 2.812226274576573, "learning_rate": 8.649432406035897e-07, "logits/chosen": -0.6458684206008911, "logits/rejected": -0.5954203009605408, "logps/chosen": -0.14193274080753326, "logps/rejected": -5.994907379150391, "loss": 0.1098, "odds_ratio_loss": 0.005412849597632885, "rewards/accuracies": 1.0, "rewards/chosen": -0.014193275012075901, "rewards/margins": 0.5852974653244019, "rewards/rejected": -0.5994908213615417, "sft_loss": 0.14193274080753326, "step": 3283 }, { "epoch": 4.749096167751265, "grad_norm": 1.5820536753740884, "learning_rate": 8.630152556557613e-07, "logits/chosen": -0.6483851671218872, "logits/rejected": -0.5950357913970947, "logps/chosen": -0.04669715836644173, "logps/rejected": -3.50252628326416, "loss": 0.0653, "odds_ratio_loss": 0.0048396410420536995, "rewards/accuracies": 1.0, "rewards/chosen": -0.004669716116040945, "rewards/margins": 0.34558290243148804, "rewards/rejected": -0.350252628326416, "sft_loss": 0.04669715836644173, "step": 3284 }, { "epoch": 4.750542299349241, "grad_norm": 1.691255261241321, "learning_rate": 8.610891620077198e-07, "logits/chosen": -0.6523389220237732, "logits/rejected": -0.4981173872947693, "logps/chosen": -0.11179995536804199, "logps/rejected": -4.435649871826172, "loss": 0.1057, "odds_ratio_loss": 0.007845187559723854, "rewards/accuracies": 1.0, "rewards/chosen": -0.011179995723068714, "rewards/margins": 0.4323849678039551, "rewards/rejected": -0.4435649812221527, "sft_loss": 0.11179995536804199, "step": 3285 }, { "epoch": 4.7519884309472165, "grad_norm": 1.625313677026407, "learning_rate": 8.59164960820712e-07, "logits/chosen": -0.8503109216690063, "logits/rejected": -0.7520562410354614, "logps/chosen": -0.16012218594551086, "logps/rejected": -3.7297375202178955, "loss": 0.1028, "odds_ratio_loss": 0.033738117665052414, "rewards/accuracies": 1.0, "rewards/chosen": -0.016012219712138176, "rewards/margins": 0.35696154832839966, "rewards/rejected": -0.3729737401008606, "sft_loss": 0.16012218594551086, "step": 3286 }, { "epoch": 4.753434562545191, "grad_norm": 1.4980789666005607, "learning_rate": 8.572426532548487e-07, "logits/chosen": -0.6264578104019165, "logits/rejected": -0.6370917558670044, "logps/chosen": -0.09624272584915161, "logps/rejected": -3.5333688259124756, "loss": 0.0647, "odds_ratio_loss": 0.013179374858736992, "rewards/accuracies": 1.0, "rewards/chosen": -0.009624271653592587, "rewards/margins": 0.34371256828308105, "rewards/rejected": -0.3533368706703186, "sft_loss": 0.09624272584915161, "step": 3287 }, { "epoch": 4.754880694143167, "grad_norm": 1.7420127662395781, "learning_rate": 8.553222404690928e-07, "logits/chosen": -0.8942149877548218, "logits/rejected": -0.7594558000564575, "logps/chosen": -0.08043821156024933, "logps/rejected": -4.053690433502197, "loss": 0.0754, "odds_ratio_loss": 0.005299334414303303, "rewards/accuracies": 1.0, "rewards/chosen": -0.008043821901082993, "rewards/margins": 0.39732521772384644, "rewards/rejected": -0.4053690433502197, "sft_loss": 0.08043821156024933, "step": 3288 }, { "epoch": 4.756326825741143, "grad_norm": 1.6059975248009308, "learning_rate": 8.534037236212715e-07, "logits/chosen": -0.8111061453819275, "logits/rejected": -0.6317873597145081, "logps/chosen": -0.06460095942020416, "logps/rejected": -5.87692928314209, "loss": 0.0611, "odds_ratio_loss": 0.00885202456265688, "rewards/accuracies": 1.0, "rewards/chosen": -0.006460095755755901, "rewards/margins": 0.5812327861785889, "rewards/rejected": -0.58769291639328, "sft_loss": 0.06460095942020416, "step": 3289 }, { "epoch": 4.757772957339117, "grad_norm": 1.8821353090731472, "learning_rate": 8.514871038680644e-07, "logits/chosen": -0.7282547950744629, "logits/rejected": -0.5171594619750977, "logps/chosen": -0.09477448463439941, "logps/rejected": -6.691037654876709, "loss": 0.1061, "odds_ratio_loss": 0.00491682905703783, "rewards/accuracies": 1.0, "rewards/chosen": -0.009477448649704456, "rewards/margins": 0.659626305103302, "rewards/rejected": -0.6691038012504578, "sft_loss": 0.09477448463439941, "step": 3290 }, { "epoch": 4.759219088937093, "grad_norm": 1.3869499519014088, "learning_rate": 8.495723823650078e-07, "logits/chosen": -0.8584036231040955, "logits/rejected": -0.7203078866004944, "logps/chosen": -0.06936081498861313, "logps/rejected": -5.100736618041992, "loss": 0.0654, "odds_ratio_loss": 0.005390543024986982, "rewards/accuracies": 1.0, "rewards/chosen": -0.006936081685125828, "rewards/margins": 0.5031375885009766, "rewards/rejected": -0.5100736618041992, "sft_loss": 0.06936081498861313, "step": 3291 }, { "epoch": 4.760665220535069, "grad_norm": 1.572558923505267, "learning_rate": 8.476595602664965e-07, "logits/chosen": -0.7464091777801514, "logits/rejected": -0.6228684186935425, "logps/chosen": -0.07792852073907852, "logps/rejected": -4.133586883544922, "loss": 0.0911, "odds_ratio_loss": 0.008083056658506393, "rewards/accuracies": 1.0, "rewards/chosen": -0.0077928523533046246, "rewards/margins": 0.4055657982826233, "rewards/rejected": -0.4133586883544922, "sft_loss": 0.07792852073907852, "step": 3292 }, { "epoch": 4.7621113521330445, "grad_norm": 1.8275520156329068, "learning_rate": 8.457486387257753e-07, "logits/chosen": -0.6914631128311157, "logits/rejected": -0.5352708697319031, "logps/chosen": -0.06538383662700653, "logps/rejected": -4.57308292388916, "loss": 0.0546, "odds_ratio_loss": 0.005749408155679703, "rewards/accuracies": 1.0, "rewards/chosen": -0.006538383662700653, "rewards/margins": 0.45076990127563477, "rewards/rejected": -0.4573083519935608, "sft_loss": 0.06538383662700653, "step": 3293 }, { "epoch": 4.763557483731019, "grad_norm": 1.6277201198392217, "learning_rate": 8.438396188949486e-07, "logits/chosen": -0.6373045444488525, "logits/rejected": -0.615994930267334, "logps/chosen": -0.11351317912340164, "logps/rejected": -6.261795997619629, "loss": 0.0821, "odds_ratio_loss": 0.02115788869559765, "rewards/accuracies": 1.0, "rewards/chosen": -0.011351317167282104, "rewards/margins": 0.61482834815979, "rewards/rejected": -0.6261796355247498, "sft_loss": 0.11351317912340164, "step": 3294 }, { "epoch": 4.765003615328995, "grad_norm": 1.771173716342708, "learning_rate": 8.419325019249699e-07, "logits/chosen": -0.8777015209197998, "logits/rejected": -0.7351939082145691, "logps/chosen": -0.09541021287441254, "logps/rejected": -4.427221298217773, "loss": 0.0785, "odds_ratio_loss": 0.0014403918758034706, "rewards/accuracies": 1.0, "rewards/chosen": -0.009541021659970284, "rewards/margins": 0.43318116664886475, "rewards/rejected": -0.4427221417427063, "sft_loss": 0.09541021287441254, "step": 3295 }, { "epoch": 4.766449746926971, "grad_norm": 1.4668075763215394, "learning_rate": 8.400272889656484e-07, "logits/chosen": -0.9218635559082031, "logits/rejected": -0.8534926176071167, "logps/chosen": -0.044725202023983, "logps/rejected": -3.9485654830932617, "loss": 0.0502, "odds_ratio_loss": 0.007518763653934002, "rewards/accuracies": 1.0, "rewards/chosen": -0.0044725202023983, "rewards/margins": 0.3903840184211731, "rewards/rejected": -0.3948565125465393, "sft_loss": 0.044725202023983, "step": 3296 }, { "epoch": 4.7678958785249455, "grad_norm": 1.754333085943087, "learning_rate": 8.381239811656434e-07, "logits/chosen": -0.5121525526046753, "logits/rejected": -0.393355131149292, "logps/chosen": -0.0673723891377449, "logps/rejected": -4.3197021484375, "loss": 0.0698, "odds_ratio_loss": 0.006755652837455273, "rewards/accuracies": 1.0, "rewards/chosen": -0.006737238727509975, "rewards/margins": 0.42523300647735596, "rewards/rejected": -0.4319702386856079, "sft_loss": 0.0673723891377449, "step": 3297 }, { "epoch": 4.769342010122921, "grad_norm": 1.429132077368146, "learning_rate": 8.362225796724662e-07, "logits/chosen": -0.8575242757797241, "logits/rejected": -0.8655635714530945, "logps/chosen": -0.10043197125196457, "logps/rejected": -4.142751693725586, "loss": 0.0738, "odds_ratio_loss": 0.010751158930361271, "rewards/accuracies": 1.0, "rewards/chosen": -0.010043196380138397, "rewards/margins": 0.4042320251464844, "rewards/rejected": -0.414275199174881, "sft_loss": 0.10043197125196457, "step": 3298 }, { "epoch": 4.770788141720897, "grad_norm": 2.0904057729420122, "learning_rate": 8.343230856324779e-07, "logits/chosen": -0.6189036965370178, "logits/rejected": -0.5577619671821594, "logps/chosen": -0.06153484806418419, "logps/rejected": -5.164226531982422, "loss": 0.0832, "odds_ratio_loss": 0.007398117333650589, "rewards/accuracies": 1.0, "rewards/chosen": -0.006153484806418419, "rewards/margins": 0.5102692246437073, "rewards/rejected": -0.5164227485656738, "sft_loss": 0.06153484806418419, "step": 3299 }, { "epoch": 4.7722342733188725, "grad_norm": 1.809363032411696, "learning_rate": 8.324255001908929e-07, "logits/chosen": -0.6867760419845581, "logits/rejected": -0.5739983916282654, "logps/chosen": -0.08187718689441681, "logps/rejected": -4.6687726974487305, "loss": 0.0809, "odds_ratio_loss": 0.01753270998597145, "rewards/accuracies": 1.0, "rewards/chosen": -0.008187718689441681, "rewards/margins": 0.4586895704269409, "rewards/rejected": -0.4668773114681244, "sft_loss": 0.08187718689441681, "step": 3300 }, { "epoch": 4.773680404916847, "grad_norm": 1.6080700276039386, "learning_rate": 8.305298244917698e-07, "logits/chosen": -1.0285183191299438, "logits/rejected": -0.8908055424690247, "logps/chosen": -0.1339968740940094, "logps/rejected": -2.8577942848205566, "loss": 0.0732, "odds_ratio_loss": 0.020126506686210632, "rewards/accuracies": 1.0, "rewards/chosen": -0.01339968666434288, "rewards/margins": 0.2723797559738159, "rewards/rejected": -0.2857794165611267, "sft_loss": 0.1339968740940094, "step": 3301 }, { "epoch": 4.775126536514823, "grad_norm": 1.5805169753771973, "learning_rate": 8.286360596780197e-07, "logits/chosen": -0.8294731378555298, "logits/rejected": -0.64952552318573, "logps/chosen": -0.061815451830625534, "logps/rejected": -4.596172332763672, "loss": 0.0699, "odds_ratio_loss": 0.0055434140376746655, "rewards/accuracies": 1.0, "rewards/chosen": -0.006181545555591583, "rewards/margins": 0.4534357190132141, "rewards/rejected": -0.4596172571182251, "sft_loss": 0.061815451830625534, "step": 3302 }, { "epoch": 4.776572668112799, "grad_norm": 1.7640904225463478, "learning_rate": 8.267442068914019e-07, "logits/chosen": -0.8276826739311218, "logits/rejected": -0.5451090931892395, "logps/chosen": -0.04526611045002937, "logps/rejected": -5.430952072143555, "loss": 0.0699, "odds_ratio_loss": 0.0029891584999859333, "rewards/accuracies": 1.0, "rewards/chosen": -0.00452661095187068, "rewards/margins": 0.5385686159133911, "rewards/rejected": -0.5430951714515686, "sft_loss": 0.04526611045002937, "step": 3303 }, { "epoch": 4.7780187997107735, "grad_norm": 1.4000226060925451, "learning_rate": 8.248542672725189e-07, "logits/chosen": -0.8920959830284119, "logits/rejected": -0.6845582127571106, "logps/chosen": -0.04449871927499771, "logps/rejected": -3.2585527896881104, "loss": 0.0678, "odds_ratio_loss": 0.005256593693047762, "rewards/accuracies": 1.0, "rewards/chosen": -0.004449872300028801, "rewards/margins": 0.3214053809642792, "rewards/rejected": -0.3258552551269531, "sft_loss": 0.04449871927499771, "step": 3304 }, { "epoch": 4.779464931308749, "grad_norm": 1.467193415411127, "learning_rate": 8.229662419608252e-07, "logits/chosen": -0.9364633560180664, "logits/rejected": -0.799718976020813, "logps/chosen": -0.09500187635421753, "logps/rejected": -4.936524391174316, "loss": 0.0788, "odds_ratio_loss": 0.007705302909016609, "rewards/accuracies": 1.0, "rewards/chosen": -0.009500187821686268, "rewards/margins": 0.48415225744247437, "rewards/rejected": -0.49365243315696716, "sft_loss": 0.09500187635421753, "step": 3305 }, { "epoch": 4.780911062906725, "grad_norm": 1.4723060397784287, "learning_rate": 8.210801320946163e-07, "logits/chosen": -0.8033458590507507, "logits/rejected": -0.514359712600708, "logps/chosen": -0.07291413843631744, "logps/rejected": -6.4514875411987305, "loss": 0.0735, "odds_ratio_loss": 0.004370196722447872, "rewards/accuracies": 1.0, "rewards/chosen": -0.007291413843631744, "rewards/margins": 0.6378573179244995, "rewards/rejected": -0.6451488137245178, "sft_loss": 0.07291413843631744, "step": 3306 }, { "epoch": 4.7823571945047, "grad_norm": 1.4452690512837696, "learning_rate": 8.191959388110356e-07, "logits/chosen": -0.767648458480835, "logits/rejected": -0.6331923007965088, "logps/chosen": -0.06505173444747925, "logps/rejected": -3.9707190990448, "loss": 0.0706, "odds_ratio_loss": 0.0042672669515013695, "rewards/accuracies": 1.0, "rewards/chosen": -0.006505173165351152, "rewards/margins": 0.39056673645973206, "rewards/rejected": -0.397071897983551, "sft_loss": 0.06505173444747925, "step": 3307 }, { "epoch": 4.783803326102675, "grad_norm": 1.4691145708870954, "learning_rate": 8.173136632460687e-07, "logits/chosen": -0.8610515594482422, "logits/rejected": -0.6491256356239319, "logps/chosen": -0.05403333902359009, "logps/rejected": -3.766594648361206, "loss": 0.0779, "odds_ratio_loss": 0.004882722161710262, "rewards/accuracies": 1.0, "rewards/chosen": -0.005403334274888039, "rewards/margins": 0.37125617265701294, "rewards/rejected": -0.3766595125198364, "sft_loss": 0.05403333902359009, "step": 3308 }, { "epoch": 4.785249457700651, "grad_norm": 2.0942384834532115, "learning_rate": 8.154333065345489e-07, "logits/chosen": -0.7129379510879517, "logits/rejected": -0.49292656779289246, "logps/chosen": -0.08610312640666962, "logps/rejected": -4.657783031463623, "loss": 0.1099, "odds_ratio_loss": 0.007286392152309418, "rewards/accuracies": 1.0, "rewards/chosen": -0.008610313758254051, "rewards/margins": 0.45716798305511475, "rewards/rejected": -0.46577832102775574, "sft_loss": 0.08610312640666962, "step": 3309 }, { "epoch": 4.786695589298626, "grad_norm": 1.9279725748993217, "learning_rate": 8.135548698101482e-07, "logits/chosen": -0.6503070592880249, "logits/rejected": -0.5543050765991211, "logps/chosen": -0.0853266641497612, "logps/rejected": -4.004879951477051, "loss": 0.08, "odds_ratio_loss": 0.019836550578475, "rewards/accuracies": 1.0, "rewards/chosen": -0.008532666601240635, "rewards/margins": 0.39195531606674194, "rewards/rejected": -0.4004879891872406, "sft_loss": 0.0853266641497612, "step": 3310 }, { "epoch": 4.7881417208966015, "grad_norm": 1.967430729296363, "learning_rate": 8.116783542053855e-07, "logits/chosen": -0.7817206382751465, "logits/rejected": -0.6119512915611267, "logps/chosen": -0.12662483751773834, "logps/rejected": -4.243107795715332, "loss": 0.1184, "odds_ratio_loss": 0.009148865006864071, "rewards/accuracies": 1.0, "rewards/chosen": -0.012662483379244804, "rewards/margins": 0.41164830327033997, "rewards/rejected": -0.4243107736110687, "sft_loss": 0.12662483751773834, "step": 3311 }, { "epoch": 4.789587852494577, "grad_norm": 1.6003137675765775, "learning_rate": 8.09803760851616e-07, "logits/chosen": -0.8021121621131897, "logits/rejected": -0.5248170495033264, "logps/chosen": -0.05905238538980484, "logps/rejected": -4.108620643615723, "loss": 0.0577, "odds_ratio_loss": 0.007532968185842037, "rewards/accuracies": 1.0, "rewards/chosen": -0.005905238911509514, "rewards/margins": 0.4049568772315979, "rewards/rejected": -0.4108620882034302, "sft_loss": 0.05905238538980484, "step": 3312 }, { "epoch": 4.791033984092552, "grad_norm": 1.5645076243387899, "learning_rate": 8.079310908790419e-07, "logits/chosen": -0.934636116027832, "logits/rejected": -0.722909688949585, "logps/chosen": -0.1410127729177475, "logps/rejected": -5.098935127258301, "loss": 0.1074, "odds_ratio_loss": 0.0036122482270002365, "rewards/accuracies": 1.0, "rewards/chosen": -0.01410127803683281, "rewards/margins": 0.4957922697067261, "rewards/rejected": -0.509893536567688, "sft_loss": 0.1410127729177475, "step": 3313 }, { "epoch": 4.792480115690528, "grad_norm": 1.3982813202070725, "learning_rate": 8.060603454167019e-07, "logits/chosen": -1.0576629638671875, "logits/rejected": -0.6195839047431946, "logps/chosen": -0.08455665409564972, "logps/rejected": -5.276945114135742, "loss": 0.084, "odds_ratio_loss": 0.007596557028591633, "rewards/accuracies": 1.0, "rewards/chosen": -0.008455665782094002, "rewards/margins": 0.5192388296127319, "rewards/rejected": -0.5276945233345032, "sft_loss": 0.08455665409564972, "step": 3314 }, { "epoch": 4.793926247288503, "grad_norm": 1.6979981582653707, "learning_rate": 8.041915255924747e-07, "logits/chosen": -0.7179811596870422, "logits/rejected": -0.5930829644203186, "logps/chosen": -0.04754379764199257, "logps/rejected": -4.652839183807373, "loss": 0.0682, "odds_ratio_loss": 0.0038252677768468857, "rewards/accuracies": 1.0, "rewards/chosen": -0.004754380322992802, "rewards/margins": 0.46052953600883484, "rewards/rejected": -0.46528393030166626, "sft_loss": 0.04754379764199257, "step": 3315 }, { "epoch": 4.795372378886479, "grad_norm": 1.6512873697364088, "learning_rate": 8.023246325330784e-07, "logits/chosen": -0.9477057456970215, "logits/rejected": -0.686681866645813, "logps/chosen": -0.11634199321269989, "logps/rejected": -5.4780354499816895, "loss": 0.0937, "odds_ratio_loss": 0.008887776173651218, "rewards/accuracies": 1.0, "rewards/chosen": -0.01163419894874096, "rewards/margins": 0.5361693501472473, "rewards/rejected": -0.5478035807609558, "sft_loss": 0.11634199321269989, "step": 3316 }, { "epoch": 4.796818510484454, "grad_norm": 1.5667079079881334, "learning_rate": 8.004596673640707e-07, "logits/chosen": -0.7140193581581116, "logits/rejected": -0.4710603654384613, "logps/chosen": -0.08510614186525345, "logps/rejected": -3.7450685501098633, "loss": 0.0795, "odds_ratio_loss": 0.016938410699367523, "rewards/accuracies": 1.0, "rewards/chosen": -0.008510613813996315, "rewards/margins": 0.36599624156951904, "rewards/rejected": -0.3745068609714508, "sft_loss": 0.08510614186525345, "step": 3317 }, { "epoch": 4.7982646420824295, "grad_norm": 1.6338412747663902, "learning_rate": 7.985966312098469e-07, "logits/chosen": -0.6758091449737549, "logits/rejected": -0.4951193928718567, "logps/chosen": -0.12785102427005768, "logps/rejected": -6.292760848999023, "loss": 0.0977, "odds_ratio_loss": 0.012771306559443474, "rewards/accuracies": 1.0, "rewards/chosen": -0.012785101309418678, "rewards/margins": 0.6164909601211548, "rewards/rejected": -0.6292760968208313, "sft_loss": 0.12785102427005768, "step": 3318 }, { "epoch": 4.799710773680405, "grad_norm": 1.496919749181908, "learning_rate": 7.967355251936361e-07, "logits/chosen": -0.7086659073829651, "logits/rejected": -0.5454675555229187, "logps/chosen": -0.10819420963525772, "logps/rejected": -6.755625247955322, "loss": 0.0749, "odds_ratio_loss": 0.004403555765748024, "rewards/accuracies": 1.0, "rewards/chosen": -0.010819422081112862, "rewards/margins": 0.6647431254386902, "rewards/rejected": -0.6755625009536743, "sft_loss": 0.10819420963525772, "step": 3319 }, { "epoch": 4.80115690527838, "grad_norm": 1.8413203272352143, "learning_rate": 7.948763504375087e-07, "logits/chosen": -0.9324480891227722, "logits/rejected": -0.6381005048751831, "logps/chosen": -0.06453929096460342, "logps/rejected": -6.1513166427612305, "loss": 0.0978, "odds_ratio_loss": 0.0025580860674381256, "rewards/accuracies": 1.0, "rewards/chosen": -0.006453928537666798, "rewards/margins": 0.6086777448654175, "rewards/rejected": -0.6151317358016968, "sft_loss": 0.06453929096460342, "step": 3320 }, { "epoch": 4.802603036876356, "grad_norm": 1.5277251212604894, "learning_rate": 7.930191080623668e-07, "logits/chosen": -0.6400019526481628, "logits/rejected": -0.5209491848945618, "logps/chosen": -0.07888957858085632, "logps/rejected": -4.372533321380615, "loss": 0.0621, "odds_ratio_loss": 0.010060323402285576, "rewards/accuracies": 1.0, "rewards/chosen": -0.007888957858085632, "rewards/margins": 0.429364413022995, "rewards/rejected": -0.4372533857822418, "sft_loss": 0.07888957858085632, "step": 3321 }, { "epoch": 4.804049168474331, "grad_norm": 2.2584557191850894, "learning_rate": 7.911637991879483e-07, "logits/chosen": -1.0325016975402832, "logits/rejected": -0.7070356011390686, "logps/chosen": -0.09169142693281174, "logps/rejected": -4.366299152374268, "loss": 0.1086, "odds_ratio_loss": 0.012625223957002163, "rewards/accuracies": 1.0, "rewards/chosen": -0.009169142693281174, "rewards/margins": 0.42746075987815857, "rewards/rejected": -0.43662989139556885, "sft_loss": 0.09169142693281174, "step": 3322 }, { "epoch": 4.805495300072307, "grad_norm": 1.64122497738142, "learning_rate": 7.893104249328258e-07, "logits/chosen": -0.9511623978614807, "logits/rejected": -0.7628656625747681, "logps/chosen": -0.09314146637916565, "logps/rejected": -4.508124828338623, "loss": 0.0803, "odds_ratio_loss": 0.005370480008423328, "rewards/accuracies": 1.0, "rewards/chosen": -0.009314147755503654, "rewards/margins": 0.4414983093738556, "rewards/rejected": -0.4508124887943268, "sft_loss": 0.09314146637916565, "step": 3323 }, { "epoch": 4.806941431670282, "grad_norm": 1.5377080828397056, "learning_rate": 7.874589864144066e-07, "logits/chosen": -0.8962692022323608, "logits/rejected": -0.5798559784889221, "logps/chosen": -0.06909617781639099, "logps/rejected": -7.326718807220459, "loss": 0.0594, "odds_ratio_loss": 0.00794798880815506, "rewards/accuracies": 1.0, "rewards/chosen": -0.006909618619829416, "rewards/margins": 0.7257623076438904, "rewards/rejected": -0.7326719164848328, "sft_loss": 0.06909617781639099, "step": 3324 }, { "epoch": 4.808387563268258, "grad_norm": 1.5937599520948071, "learning_rate": 7.856094847489286e-07, "logits/chosen": -0.6906980872154236, "logits/rejected": -0.5133391618728638, "logps/chosen": -0.09170494973659515, "logps/rejected": -4.876096725463867, "loss": 0.071, "odds_ratio_loss": 0.00837009958922863, "rewards/accuracies": 1.0, "rewards/chosen": -0.00917049590498209, "rewards/margins": 0.47843921184539795, "rewards/rejected": -0.4876096844673157, "sft_loss": 0.09170494973659515, "step": 3325 }, { "epoch": 4.809833694866233, "grad_norm": 1.6849063604132934, "learning_rate": 7.837619210514645e-07, "logits/chosen": -0.8195112943649292, "logits/rejected": -0.6542753577232361, "logps/chosen": -0.08772353827953339, "logps/rejected": -5.507904052734375, "loss": 0.0818, "odds_ratio_loss": 0.009588465094566345, "rewards/accuracies": 1.0, "rewards/chosen": -0.008772353641688824, "rewards/margins": 0.5420180559158325, "rewards/rejected": -0.5507904291152954, "sft_loss": 0.08772353827953339, "step": 3326 }, { "epoch": 4.811279826464208, "grad_norm": 1.597280251006234, "learning_rate": 7.819162964359161e-07, "logits/chosen": -0.8942530155181885, "logits/rejected": -0.6368886232376099, "logps/chosen": -0.07599680870771408, "logps/rejected": -4.553309440612793, "loss": 0.0985, "odds_ratio_loss": 0.005070159211754799, "rewards/accuracies": 1.0, "rewards/chosen": -0.007599681615829468, "rewards/margins": 0.44773125648498535, "rewards/rejected": -0.45533090829849243, "sft_loss": 0.07599680870771408, "step": 3327 }, { "epoch": 4.812725958062184, "grad_norm": 1.5081792858017415, "learning_rate": 7.800726120150188e-07, "logits/chosen": -0.9549357891082764, "logits/rejected": -0.6117710471153259, "logps/chosen": -0.08728785812854767, "logps/rejected": -5.800983428955078, "loss": 0.074, "odds_ratio_loss": 0.008302710950374603, "rewards/accuracies": 1.0, "rewards/chosen": -0.008728786371648312, "rewards/margins": 0.5713695883750916, "rewards/rejected": -0.5800983905792236, "sft_loss": 0.08728785812854767, "step": 3328 }, { "epoch": 4.814172089660159, "grad_norm": 1.6857913081849396, "learning_rate": 7.782308689003359e-07, "logits/chosen": -0.9277090430259705, "logits/rejected": -0.642306923866272, "logps/chosen": -0.04577184468507767, "logps/rejected": -4.958084583282471, "loss": 0.0645, "odds_ratio_loss": 0.0022096242755651474, "rewards/accuracies": 1.0, "rewards/chosen": -0.004577184561640024, "rewards/margins": 0.4912312626838684, "rewards/rejected": -0.495808482170105, "sft_loss": 0.04577184468507767, "step": 3329 }, { "epoch": 4.815618221258134, "grad_norm": 2.1125135146669125, "learning_rate": 7.763910682022606e-07, "logits/chosen": -0.7076646089553833, "logits/rejected": -0.5731692910194397, "logps/chosen": -0.05527088791131973, "logps/rejected": -6.649045944213867, "loss": 0.0628, "odds_ratio_loss": 0.005929501727223396, "rewards/accuracies": 1.0, "rewards/chosen": -0.005527088418602943, "rewards/margins": 0.6593775153160095, "rewards/rejected": -0.6649045944213867, "sft_loss": 0.05527088791131973, "step": 3330 }, { "epoch": 4.81706435285611, "grad_norm": 1.5913484401509341, "learning_rate": 7.74553211030017e-07, "logits/chosen": -0.7423536777496338, "logits/rejected": -0.6629868745803833, "logps/chosen": -0.06886343657970428, "logps/rejected": -3.8152801990509033, "loss": 0.0913, "odds_ratio_loss": 0.014335782267153263, "rewards/accuracies": 1.0, "rewards/chosen": -0.0068863434717059135, "rewards/margins": 0.3746417164802551, "rewards/rejected": -0.38152801990509033, "sft_loss": 0.06886343657970428, "step": 3331 }, { "epoch": 4.818510484454086, "grad_norm": 1.6210084217221785, "learning_rate": 7.727172984916545e-07, "logits/chosen": -0.8341798782348633, "logits/rejected": -0.7047946453094482, "logps/chosen": -0.1327381432056427, "logps/rejected": -4.129507064819336, "loss": 0.1181, "odds_ratio_loss": 0.02535739727318287, "rewards/accuracies": 1.0, "rewards/chosen": -0.013273816555738449, "rewards/margins": 0.39967694878578186, "rewards/rejected": -0.4129507541656494, "sft_loss": 0.1327381432056427, "step": 3332 }, { "epoch": 4.81995661605206, "grad_norm": 1.6399711939380055, "learning_rate": 7.708833316940535e-07, "logits/chosen": -0.8430829048156738, "logits/rejected": -0.6608396768569946, "logps/chosen": -0.05684221535921097, "logps/rejected": -4.846701622009277, "loss": 0.086, "odds_ratio_loss": 0.0052174655720591545, "rewards/accuracies": 1.0, "rewards/chosen": -0.005684222094714642, "rewards/margins": 0.478985995054245, "rewards/rejected": -0.4846702218055725, "sft_loss": 0.05684221535921097, "step": 3333 }, { "epoch": 4.821402747650036, "grad_norm": 1.5697641034160588, "learning_rate": 7.690513117429169e-07, "logits/chosen": -0.9114935398101807, "logits/rejected": -0.7490615248680115, "logps/chosen": -0.07970191538333893, "logps/rejected": -3.7014589309692383, "loss": 0.0887, "odds_ratio_loss": 0.009289674460887909, "rewards/accuracies": 1.0, "rewards/chosen": -0.007970191538333893, "rewards/margins": 0.36217570304870605, "rewards/rejected": -0.37014591693878174, "sft_loss": 0.07970191538333893, "step": 3334 }, { "epoch": 4.822848879248012, "grad_norm": 1.5121408909888006, "learning_rate": 7.672212397427795e-07, "logits/chosen": -0.8596815466880798, "logits/rejected": -0.5858790278434753, "logps/chosen": -0.15773321688175201, "logps/rejected": -4.535950660705566, "loss": 0.1105, "odds_ratio_loss": 0.011155569925904274, "rewards/accuracies": 1.0, "rewards/chosen": -0.015773320570588112, "rewards/margins": 0.43782174587249756, "rewards/rejected": -0.4535950720310211, "sft_loss": 0.15773321688175201, "step": 3335 }, { "epoch": 4.8242950108459866, "grad_norm": 1.675437148395465, "learning_rate": 7.653931167969965e-07, "logits/chosen": -0.7641026973724365, "logits/rejected": -0.5949983596801758, "logps/chosen": -0.05824780464172363, "logps/rejected": -5.220847129821777, "loss": 0.0899, "odds_ratio_loss": 0.00477550970390439, "rewards/accuracies": 1.0, "rewards/chosen": -0.005824780557304621, "rewards/margins": 0.5162599086761475, "rewards/rejected": -0.5220847129821777, "sft_loss": 0.05824780464172363, "step": 3336 }, { "epoch": 4.825741142443962, "grad_norm": 1.509678398919964, "learning_rate": 7.635669440077502e-07, "logits/chosen": -0.7890247106552124, "logits/rejected": -0.4827120006084442, "logps/chosen": -0.08683416247367859, "logps/rejected": -5.226888179779053, "loss": 0.0642, "odds_ratio_loss": 0.0037988172844052315, "rewards/accuracies": 1.0, "rewards/chosen": -0.008683416061103344, "rewards/margins": 0.5140054225921631, "rewards/rejected": -0.5226888656616211, "sft_loss": 0.08683416247367859, "step": 3337 }, { "epoch": 4.827187274041938, "grad_norm": 1.3819942621004409, "learning_rate": 7.61742722476046e-07, "logits/chosen": -0.8519781827926636, "logits/rejected": -0.6214016675949097, "logps/chosen": -0.0465102382004261, "logps/rejected": -4.396292686462402, "loss": 0.0694, "odds_ratio_loss": 0.003573787398636341, "rewards/accuracies": 1.0, "rewards/chosen": -0.004651024006307125, "rewards/margins": 0.43497827649116516, "rewards/rejected": -0.43962928652763367, "sft_loss": 0.0465102382004261, "step": 3338 }, { "epoch": 4.828633405639914, "grad_norm": 1.6948199742458905, "learning_rate": 7.599204533017163e-07, "logits/chosen": -0.8539596796035767, "logits/rejected": -0.6457651257514954, "logps/chosen": -0.08881109207868576, "logps/rejected": -4.225852012634277, "loss": 0.0877, "odds_ratio_loss": 0.015900075435638428, "rewards/accuracies": 1.0, "rewards/chosen": -0.008881110697984695, "rewards/margins": 0.41370415687561035, "rewards/rejected": -0.42258524894714355, "sft_loss": 0.08881109207868576, "step": 3339 }, { "epoch": 4.830079537237888, "grad_norm": 1.597991480719571, "learning_rate": 7.581001375834115e-07, "logits/chosen": -0.7218050360679626, "logits/rejected": -0.5693727731704712, "logps/chosen": -0.04720648005604744, "logps/rejected": -5.685175895690918, "loss": 0.0757, "odds_ratio_loss": 0.0023048685397952795, "rewards/accuracies": 1.0, "rewards/chosen": -0.004720647819340229, "rewards/margins": 0.5637969374656677, "rewards/rejected": -0.5685176253318787, "sft_loss": 0.04720648005604744, "step": 3340 }, { "epoch": 4.831525668835864, "grad_norm": 1.8196226662982362, "learning_rate": 7.562817764186089e-07, "logits/chosen": -0.9130833148956299, "logits/rejected": -0.6541707515716553, "logps/chosen": -0.03671087324619293, "logps/rejected": -4.781536102294922, "loss": 0.0823, "odds_ratio_loss": 0.002584438305348158, "rewards/accuracies": 1.0, "rewards/chosen": -0.0036710870917886496, "rewards/margins": 0.4744824767112732, "rewards/rejected": -0.47815364599227905, "sft_loss": 0.03671087324619293, "step": 3341 }, { "epoch": 4.83297180043384, "grad_norm": 1.3978966669157058, "learning_rate": 7.544653709036031e-07, "logits/chosen": -0.75586998462677, "logits/rejected": -0.6667506694793701, "logps/chosen": -0.02506261318922043, "logps/rejected": -5.60970401763916, "loss": 0.0442, "odds_ratio_loss": 0.002245509997010231, "rewards/accuracies": 1.0, "rewards/chosen": -0.0025062612257897854, "rewards/margins": 0.5584641098976135, "rewards/rejected": -0.5609703660011292, "sft_loss": 0.02506261318922043, "step": 3342 }, { "epoch": 4.834417932031815, "grad_norm": 1.6595972252761546, "learning_rate": 7.52650922133514e-07, "logits/chosen": -0.8522578477859497, "logits/rejected": -0.6769514679908752, "logps/chosen": -0.10625261068344116, "logps/rejected": -5.128435134887695, "loss": 0.0849, "odds_ratio_loss": 0.00923290103673935, "rewards/accuracies": 1.0, "rewards/chosen": -0.010625261813402176, "rewards/margins": 0.5022182464599609, "rewards/rejected": -0.5128434896469116, "sft_loss": 0.10625261068344116, "step": 3343 }, { "epoch": 4.83586406362979, "grad_norm": 4.07614397131639, "learning_rate": 7.508384312022782e-07, "logits/chosen": -0.6114479303359985, "logits/rejected": -0.5096147656440735, "logps/chosen": -0.20816218852996826, "logps/rejected": -5.687783241271973, "loss": 0.1081, "odds_ratio_loss": 0.014260072261095047, "rewards/accuracies": 1.0, "rewards/chosen": -0.020816218107938766, "rewards/margins": 0.5479621291160583, "rewards/rejected": -0.5687783360481262, "sft_loss": 0.20816218852996826, "step": 3344 }, { "epoch": 4.837310195227766, "grad_norm": 1.537217968234896, "learning_rate": 7.490278992026527e-07, "logits/chosen": -0.7360656261444092, "logits/rejected": -0.5381304025650024, "logps/chosen": -0.05491877347230911, "logps/rejected": -6.194522380828857, "loss": 0.0955, "odds_ratio_loss": 0.006263894494622946, "rewards/accuracies": 1.0, "rewards/chosen": -0.005491877440363169, "rewards/margins": 0.613960325717926, "rewards/rejected": -0.6194522380828857, "sft_loss": 0.05491877347230911, "step": 3345 }, { "epoch": 4.838756326825742, "grad_norm": 1.2221302872235238, "learning_rate": 7.472193272262153e-07, "logits/chosen": -0.7894086837768555, "logits/rejected": -0.7410459518432617, "logps/chosen": -0.08093753457069397, "logps/rejected": -5.554490566253662, "loss": 0.0606, "odds_ratio_loss": 0.006017337553203106, "rewards/accuracies": 1.0, "rewards/chosen": -0.008093753829598427, "rewards/margins": 0.5473552942276001, "rewards/rejected": -0.5554490685462952, "sft_loss": 0.08093753457069397, "step": 3346 }, { "epoch": 4.840202458423716, "grad_norm": 1.6006744327868236, "learning_rate": 7.454127163633592e-07, "logits/chosen": -0.7655268311500549, "logits/rejected": -0.5101924538612366, "logps/chosen": -0.07313164323568344, "logps/rejected": -5.084259986877441, "loss": 0.0724, "odds_ratio_loss": 0.004324886482208967, "rewards/accuracies": 1.0, "rewards/chosen": -0.007313164416700602, "rewards/margins": 0.5011128187179565, "rewards/rejected": -0.5084260106086731, "sft_loss": 0.07313164323568344, "step": 3347 }, { "epoch": 4.841648590021692, "grad_norm": 1.791665583703619, "learning_rate": 7.43608067703299e-07, "logits/chosen": -0.7527843713760376, "logits/rejected": -0.525779128074646, "logps/chosen": -0.11555786430835724, "logps/rejected": -5.336285591125488, "loss": 0.1089, "odds_ratio_loss": 0.012387518770992756, "rewards/accuracies": 1.0, "rewards/chosen": -0.011555787175893784, "rewards/margins": 0.5220727920532227, "rewards/rejected": -0.5336285829544067, "sft_loss": 0.11555786430835724, "step": 3348 }, { "epoch": 4.843094721619668, "grad_norm": 1.661398734647164, "learning_rate": 7.418053823340619e-07, "logits/chosen": -0.7001569271087646, "logits/rejected": -0.6337046027183533, "logps/chosen": -0.053295012563467026, "logps/rejected": -3.712512493133545, "loss": 0.0658, "odds_ratio_loss": 0.00555766187608242, "rewards/accuracies": 1.0, "rewards/chosen": -0.005329500883817673, "rewards/margins": 0.3659217357635498, "rewards/rejected": -0.3712512254714966, "sft_loss": 0.053295012563467026, "step": 3349 }, { "epoch": 4.844540853217643, "grad_norm": 1.7115225809925612, "learning_rate": 7.400046613424953e-07, "logits/chosen": -0.9585342407226562, "logits/rejected": -0.7728517055511475, "logps/chosen": -0.06088492274284363, "logps/rejected": -4.558899879455566, "loss": 0.0988, "odds_ratio_loss": 0.009430285543203354, "rewards/accuracies": 1.0, "rewards/chosen": -0.00608849199488759, "rewards/margins": 0.449801504611969, "rewards/rejected": -0.4558899998664856, "sft_loss": 0.06088492274284363, "step": 3350 }, { "epoch": 4.845986984815618, "grad_norm": 1.5834390739287287, "learning_rate": 7.382059058142593e-07, "logits/chosen": -0.6291943192481995, "logits/rejected": -0.5301669239997864, "logps/chosen": -0.10131149739027023, "logps/rejected": -4.346987724304199, "loss": 0.1024, "odds_ratio_loss": 0.02162271924316883, "rewards/accuracies": 1.0, "rewards/chosen": -0.010131150484085083, "rewards/margins": 0.42456763982772827, "rewards/rejected": -0.43469879031181335, "sft_loss": 0.10131149739027023, "step": 3351 }, { "epoch": 4.847433116413594, "grad_norm": 1.503712398335488, "learning_rate": 7.364091168338316e-07, "logits/chosen": -0.8372702598571777, "logits/rejected": -0.4649083614349365, "logps/chosen": -0.08138298988342285, "logps/rejected": -5.166512489318848, "loss": 0.0685, "odds_ratio_loss": 0.00755068426951766, "rewards/accuracies": 1.0, "rewards/chosen": -0.008138298988342285, "rewards/margins": 0.5085129737854004, "rewards/rejected": -0.5166512727737427, "sft_loss": 0.08138298988342285, "step": 3352 }, { "epoch": 4.848879248011569, "grad_norm": 1.5724541370779448, "learning_rate": 7.346142954845023e-07, "logits/chosen": -0.8317492008209229, "logits/rejected": -0.6612436771392822, "logps/chosen": -0.08570529520511627, "logps/rejected": -5.606926918029785, "loss": 0.0873, "odds_ratio_loss": 0.004792380146682262, "rewards/accuracies": 1.0, "rewards/chosen": -0.008570530451834202, "rewards/margins": 0.552122175693512, "rewards/rejected": -0.5606927275657654, "sft_loss": 0.08570529520511627, "step": 3353 }, { "epoch": 4.8503253796095445, "grad_norm": 1.5864421670245932, "learning_rate": 7.328214428483761e-07, "logits/chosen": -0.8036106824874878, "logits/rejected": -0.6917225122451782, "logps/chosen": -0.03204449266195297, "logps/rejected": -5.145187854766846, "loss": 0.0606, "odds_ratio_loss": 0.004465484991669655, "rewards/accuracies": 1.0, "rewards/chosen": -0.003204449312761426, "rewards/margins": 0.5113143920898438, "rewards/rejected": -0.5145187973976135, "sft_loss": 0.03204449266195297, "step": 3354 }, { "epoch": 4.85177151120752, "grad_norm": 1.8370978992534, "learning_rate": 7.310305600063689e-07, "logits/chosen": -0.7530336380004883, "logits/rejected": -0.5965844988822937, "logps/chosen": -0.08221124112606049, "logps/rejected": -5.269443988800049, "loss": 0.0844, "odds_ratio_loss": 0.006964895874261856, "rewards/accuracies": 1.0, "rewards/chosen": -0.008221123367547989, "rewards/margins": 0.5187232494354248, "rewards/rejected": -0.5269443988800049, "sft_loss": 0.08221124112606049, "step": 3355 }, { "epoch": 4.853217642805495, "grad_norm": 1.599729561734898, "learning_rate": 7.292416480382124e-07, "logits/chosen": -0.8736181259155273, "logits/rejected": -0.61781907081604, "logps/chosen": -0.118461973965168, "logps/rejected": -4.439339637756348, "loss": 0.0943, "odds_ratio_loss": 0.006806999910622835, "rewards/accuracies": 1.0, "rewards/chosen": -0.01184619776904583, "rewards/margins": 0.432087779045105, "rewards/rejected": -0.44393399357795715, "sft_loss": 0.118461973965168, "step": 3356 }, { "epoch": 4.854663774403471, "grad_norm": 1.41699904404708, "learning_rate": 7.274547080224484e-07, "logits/chosen": -0.9623618125915527, "logits/rejected": -0.6903282403945923, "logps/chosen": -0.09254752844572067, "logps/rejected": -5.285045623779297, "loss": 0.0765, "odds_ratio_loss": 0.005762843415141106, "rewards/accuracies": 1.0, "rewards/chosen": -0.009254752658307552, "rewards/margins": 0.5192498564720154, "rewards/rejected": -0.5285046100616455, "sft_loss": 0.09254752844572067, "step": 3357 }, { "epoch": 4.856109906001446, "grad_norm": 1.5705078152447065, "learning_rate": 7.256697410364285e-07, "logits/chosen": -1.076545000076294, "logits/rejected": -0.740825891494751, "logps/chosen": -0.09734838455915451, "logps/rejected": -4.460480690002441, "loss": 0.1055, "odds_ratio_loss": 0.004817990120500326, "rewards/accuracies": 1.0, "rewards/chosen": -0.009734838269650936, "rewards/margins": 0.4363132119178772, "rewards/rejected": -0.4460480809211731, "sft_loss": 0.09734838455915451, "step": 3358 }, { "epoch": 4.857556037599421, "grad_norm": 1.4897765346710856, "learning_rate": 7.23886748156318e-07, "logits/chosen": -0.7283959984779358, "logits/rejected": -0.5863144397735596, "logps/chosen": -0.09330400079488754, "logps/rejected": -6.437313079833984, "loss": 0.0829, "odds_ratio_loss": 0.010229337960481644, "rewards/accuracies": 1.0, "rewards/chosen": -0.00933040026575327, "rewards/margins": 0.6344009637832642, "rewards/rejected": -0.6437313556671143, "sft_loss": 0.09330400079488754, "step": 3359 }, { "epoch": 4.859002169197397, "grad_norm": 2.2661257282257825, "learning_rate": 7.221057304570881e-07, "logits/chosen": -0.5533820986747742, "logits/rejected": -0.5214040279388428, "logps/chosen": -0.12027622759342194, "logps/rejected": -5.463490009307861, "loss": 0.0807, "odds_ratio_loss": 0.05367731302976608, "rewards/accuracies": 0.9375, "rewards/chosen": -0.012027623131871223, "rewards/margins": 0.5343213677406311, "rewards/rejected": -0.5463489890098572, "sft_loss": 0.12027622759342194, "step": 3360 }, { "epoch": 4.8604483007953725, "grad_norm": 1.5140500816134663, "learning_rate": 7.203266890125217e-07, "logits/chosen": -0.5703849792480469, "logits/rejected": -0.5185967683792114, "logps/chosen": -0.040762219578027725, "logps/rejected": -6.055505275726318, "loss": 0.0789, "odds_ratio_loss": 0.004309056792408228, "rewards/accuracies": 1.0, "rewards/chosen": -0.0040762219578027725, "rewards/margins": 0.6014742851257324, "rewards/rejected": -0.6055505275726318, "sft_loss": 0.040762219578027725, "step": 3361 }, { "epoch": 4.861894432393348, "grad_norm": 1.8219579547368454, "learning_rate": 7.185496248952078e-07, "logits/chosen": -0.507846474647522, "logits/rejected": -0.41886287927627563, "logps/chosen": -0.04386391118168831, "logps/rejected": -5.788032054901123, "loss": 0.0736, "odds_ratio_loss": 0.004342243075370789, "rewards/accuracies": 1.0, "rewards/chosen": -0.004386391025036573, "rewards/margins": 0.5744168758392334, "rewards/rejected": -0.578803300857544, "sft_loss": 0.04386391118168831, "step": 3362 }, { "epoch": 4.863340563991323, "grad_norm": 1.6904218040386811, "learning_rate": 7.167745391765483e-07, "logits/chosen": -0.8306548595428467, "logits/rejected": -0.6804887056350708, "logps/chosen": -0.10468629747629166, "logps/rejected": -4.65870475769043, "loss": 0.0904, "odds_ratio_loss": 0.008187741041183472, "rewards/accuracies": 1.0, "rewards/chosen": -0.010468630120158195, "rewards/margins": 0.4554018974304199, "rewards/rejected": -0.4658704996109009, "sft_loss": 0.10468629747629166, "step": 3363 }, { "epoch": 4.864786695589299, "grad_norm": 1.406382721212895, "learning_rate": 7.150014329267456e-07, "logits/chosen": -0.7490118741989136, "logits/rejected": -0.5290810465812683, "logps/chosen": -0.04611920937895775, "logps/rejected": -5.029910564422607, "loss": 0.0674, "odds_ratio_loss": 0.004671413917094469, "rewards/accuracies": 1.0, "rewards/chosen": -0.004611921031028032, "rewards/margins": 0.4983791708946228, "rewards/rejected": -0.5029910802841187, "sft_loss": 0.04611920937895775, "step": 3364 }, { "epoch": 4.866232827187274, "grad_norm": 1.4718586864445116, "learning_rate": 7.132303072148147e-07, "logits/chosen": -0.6270997524261475, "logits/rejected": -0.5362582802772522, "logps/chosen": -0.14092980325222015, "logps/rejected": -5.525834560394287, "loss": 0.1005, "odds_ratio_loss": 0.013515939004719257, "rewards/accuracies": 1.0, "rewards/chosen": -0.014092981815338135, "rewards/margins": 0.5384904742240906, "rewards/rejected": -0.5525834560394287, "sft_loss": 0.14092980325222015, "step": 3365 }, { "epoch": 4.867678958785249, "grad_norm": 1.3171658711366296, "learning_rate": 7.114611631085719e-07, "logits/chosen": -0.8933150172233582, "logits/rejected": -0.7245310544967651, "logps/chosen": -0.06106524541974068, "logps/rejected": -4.623565673828125, "loss": 0.0743, "odds_ratio_loss": 0.007010796573013067, "rewards/accuracies": 1.0, "rewards/chosen": -0.00610652519389987, "rewards/margins": 0.45624998211860657, "rewards/rejected": -0.4623565375804901, "sft_loss": 0.06106524541974068, "step": 3366 }, { "epoch": 4.869125090383225, "grad_norm": 2.0077338029846494, "learning_rate": 7.096940016746429e-07, "logits/chosen": -0.762084424495697, "logits/rejected": -0.6642420291900635, "logps/chosen": -0.11195956915616989, "logps/rejected": -3.8559727668762207, "loss": 0.0896, "odds_ratio_loss": 0.008951609954237938, "rewards/accuracies": 1.0, "rewards/chosen": -0.011195957660675049, "rewards/margins": 0.3744013011455536, "rewards/rejected": -0.385597288608551, "sft_loss": 0.11195956915616989, "step": 3367 }, { "epoch": 4.8705712219812005, "grad_norm": 1.5746236198419516, "learning_rate": 7.079288239784542e-07, "logits/chosen": -0.6728679537773132, "logits/rejected": -0.5606586337089539, "logps/chosen": -0.1795307844877243, "logps/rejected": -5.230622291564941, "loss": 0.1015, "odds_ratio_loss": 0.023649588227272034, "rewards/accuracies": 1.0, "rewards/chosen": -0.01795307919383049, "rewards/margins": 0.5051091313362122, "rewards/rejected": -0.5230622291564941, "sft_loss": 0.1795307844877243, "step": 3368 }, { "epoch": 4.872017353579176, "grad_norm": 1.4742014680311728, "learning_rate": 7.061656310842381e-07, "logits/chosen": -0.68035888671875, "logits/rejected": -0.6797270774841309, "logps/chosen": -0.08322112262248993, "logps/rejected": -3.5652883052825928, "loss": 0.0882, "odds_ratio_loss": 0.012553151696920395, "rewards/accuracies": 1.0, "rewards/chosen": -0.008322112262248993, "rewards/margins": 0.3482067584991455, "rewards/rejected": -0.3565288782119751, "sft_loss": 0.08322112262248993, "step": 3369 }, { "epoch": 4.873463485177151, "grad_norm": 1.4925291304976303, "learning_rate": 7.044044240550313e-07, "logits/chosen": -0.8202152252197266, "logits/rejected": -0.563833475112915, "logps/chosen": -0.06742648780345917, "logps/rejected": -4.185554027557373, "loss": 0.0591, "odds_ratio_loss": 0.005834805779159069, "rewards/accuracies": 1.0, "rewards/chosen": -0.006742648780345917, "rewards/margins": 0.4118127226829529, "rewards/rejected": -0.4185553789138794, "sft_loss": 0.06742648780345917, "step": 3370 }, { "epoch": 4.874909616775127, "grad_norm": 2.5154884893336478, "learning_rate": 7.026452039526703e-07, "logits/chosen": -0.8807682991027832, "logits/rejected": -0.5594336986541748, "logps/chosen": -0.05363871157169342, "logps/rejected": -5.723832130432129, "loss": 0.1029, "odds_ratio_loss": 0.005470744799822569, "rewards/accuracies": 1.0, "rewards/chosen": -0.005363871343433857, "rewards/margins": 0.5670193433761597, "rewards/rejected": -0.5723832249641418, "sft_loss": 0.05363871157169342, "step": 3371 }, { "epoch": 4.876355748373102, "grad_norm": 1.6423306994229212, "learning_rate": 7.008879718377976e-07, "logits/chosen": -0.6424853801727295, "logits/rejected": -0.5146550536155701, "logps/chosen": -0.07374648004770279, "logps/rejected": -6.507882118225098, "loss": 0.0858, "odds_ratio_loss": 0.005046168342232704, "rewards/accuracies": 1.0, "rewards/chosen": -0.007374648004770279, "rewards/margins": 0.6434135437011719, "rewards/rejected": -0.6507881879806519, "sft_loss": 0.07374648004770279, "step": 3372 }, { "epoch": 4.877801879971077, "grad_norm": 1.6355065912046813, "learning_rate": 6.991327287698525e-07, "logits/chosen": -0.7128841876983643, "logits/rejected": -0.6331945061683655, "logps/chosen": -0.08015397936105728, "logps/rejected": -3.10212779045105, "loss": 0.0926, "odds_ratio_loss": 0.008713113144040108, "rewards/accuracies": 1.0, "rewards/chosen": -0.008015397936105728, "rewards/margins": 0.3021973967552185, "rewards/rejected": -0.31021279096603394, "sft_loss": 0.08015397936105728, "step": 3373 }, { "epoch": 4.879248011569053, "grad_norm": 1.7960911189141748, "learning_rate": 6.973794758070806e-07, "logits/chosen": -0.7306787967681885, "logits/rejected": -0.6239840984344482, "logps/chosen": -0.13009199500083923, "logps/rejected": -3.731572389602661, "loss": 0.0737, "odds_ratio_loss": 0.007687950972467661, "rewards/accuracies": 1.0, "rewards/chosen": -0.013009199872612953, "rewards/margins": 0.3601480722427368, "rewards/rejected": -0.373157262802124, "sft_loss": 0.13009199500083923, "step": 3374 }, { "epoch": 4.8806941431670285, "grad_norm": 1.6128639569714278, "learning_rate": 6.956282140065224e-07, "logits/chosen": -0.5749595165252686, "logits/rejected": -0.48999878764152527, "logps/chosen": -0.08304701745510101, "logps/rejected": -6.037389755249023, "loss": 0.0804, "odds_ratio_loss": 0.0033578509464859962, "rewards/accuracies": 1.0, "rewards/chosen": -0.008304702118039131, "rewards/margins": 0.595434308052063, "rewards/rejected": -0.6037390232086182, "sft_loss": 0.08304701745510101, "step": 3375 }, { "epoch": 4.882140274765003, "grad_norm": 1.61063155767896, "learning_rate": 6.93878944424021e-07, "logits/chosen": -0.6542560458183289, "logits/rejected": -0.710060715675354, "logps/chosen": -0.10780586302280426, "logps/rejected": -5.035168647766113, "loss": 0.0817, "odds_ratio_loss": 0.008214112371206284, "rewards/accuracies": 1.0, "rewards/chosen": -0.01078058686107397, "rewards/margins": 0.4927363097667694, "rewards/rejected": -0.5035169124603271, "sft_loss": 0.10780586302280426, "step": 3376 }, { "epoch": 4.883586406362979, "grad_norm": 1.931393542363766, "learning_rate": 6.921316681142167e-07, "logits/chosen": -0.8681274056434631, "logits/rejected": -0.6243171691894531, "logps/chosen": -0.1537507176399231, "logps/rejected": -5.715590000152588, "loss": 0.114, "odds_ratio_loss": 0.014606855809688568, "rewards/accuracies": 1.0, "rewards/chosen": -0.01537507213652134, "rewards/margins": 0.5561839938163757, "rewards/rejected": -0.5715590119361877, "sft_loss": 0.1537507176399231, "step": 3377 }, { "epoch": 4.885032537960955, "grad_norm": 1.6623591535238764, "learning_rate": 6.903863861305498e-07, "logits/chosen": -0.7357972264289856, "logits/rejected": -0.553459882736206, "logps/chosen": -0.06771313399076462, "logps/rejected": -3.4469971656799316, "loss": 0.1139, "odds_ratio_loss": 0.01092704851180315, "rewards/accuracies": 1.0, "rewards/chosen": -0.006771313492208719, "rewards/margins": 0.3379283845424652, "rewards/rejected": -0.3446996808052063, "sft_loss": 0.06771313399076462, "step": 3378 }, { "epoch": 4.8864786695589295, "grad_norm": 1.744530404431733, "learning_rate": 6.886430995252564e-07, "logits/chosen": -0.8139889240264893, "logits/rejected": -0.6457674503326416, "logps/chosen": -0.15386618673801422, "logps/rejected": -4.273682117462158, "loss": 0.0993, "odds_ratio_loss": 0.01531638391315937, "rewards/accuracies": 1.0, "rewards/chosen": -0.015386618673801422, "rewards/margins": 0.4119815528392792, "rewards/rejected": -0.4273681938648224, "sft_loss": 0.15386618673801422, "step": 3379 }, { "epoch": 4.887924801156905, "grad_norm": 1.3904580673138525, "learning_rate": 6.869018093493721e-07, "logits/chosen": -0.8261375427246094, "logits/rejected": -0.7308131456375122, "logps/chosen": -0.17565055191516876, "logps/rejected": -3.2135813236236572, "loss": 0.0742, "odds_ratio_loss": 0.025811482220888138, "rewards/accuracies": 1.0, "rewards/chosen": -0.017565056681632996, "rewards/margins": 0.3037930727005005, "rewards/rejected": -0.3213581442832947, "sft_loss": 0.17565055191516876, "step": 3380 }, { "epoch": 4.889370932754881, "grad_norm": 1.594133966204827, "learning_rate": 6.851625166527255e-07, "logits/chosen": -0.8713509440422058, "logits/rejected": -0.6780038475990295, "logps/chosen": -0.0992034375667572, "logps/rejected": -4.472326755523682, "loss": 0.0729, "odds_ratio_loss": 0.015876563265919685, "rewards/accuracies": 1.0, "rewards/chosen": -0.00992034375667572, "rewards/margins": 0.4373123347759247, "rewards/rejected": -0.4472326636314392, "sft_loss": 0.0992034375667572, "step": 3381 }, { "epoch": 4.890817064352856, "grad_norm": 1.3939413364635795, "learning_rate": 6.834252224839438e-07, "logits/chosen": -0.516639232635498, "logits/rejected": -0.49215567111968994, "logps/chosen": -0.06323473155498505, "logps/rejected": -2.7271199226379395, "loss": 0.0867, "odds_ratio_loss": 0.01052488386631012, "rewards/accuracies": 1.0, "rewards/chosen": -0.006323473993688822, "rewards/margins": 0.26638853549957275, "rewards/rejected": -0.27271202206611633, "sft_loss": 0.06323473155498505, "step": 3382 }, { "epoch": 4.892263195950831, "grad_norm": 2.4650325801464246, "learning_rate": 6.81689927890448e-07, "logits/chosen": -0.6514842510223389, "logits/rejected": -0.4817507266998291, "logps/chosen": -0.061137307435274124, "logps/rejected": -6.235174179077148, "loss": 0.0912, "odds_ratio_loss": 0.0024059026036411524, "rewards/accuracies": 1.0, "rewards/chosen": -0.00611373083665967, "rewards/margins": 0.6174037456512451, "rewards/rejected": -0.6235173940658569, "sft_loss": 0.061137307435274124, "step": 3383 }, { "epoch": 4.893709327548807, "grad_norm": 2.0299733895183385, "learning_rate": 6.799566339184526e-07, "logits/chosen": -0.8810280561447144, "logits/rejected": -0.4609205722808838, "logps/chosen": -0.0726509764790535, "logps/rejected": -5.877103805541992, "loss": 0.0678, "odds_ratio_loss": 0.004596366081386805, "rewards/accuracies": 1.0, "rewards/chosen": -0.007265097927302122, "rewards/margins": 0.5804452896118164, "rewards/rejected": -0.5877103805541992, "sft_loss": 0.0726509764790535, "step": 3384 }, { "epoch": 4.895155459146783, "grad_norm": 1.53183380817701, "learning_rate": 6.782253416129684e-07, "logits/chosen": -0.7259580492973328, "logits/rejected": -0.6520754098892212, "logps/chosen": -0.022825662046670914, "logps/rejected": -4.62690544128418, "loss": 0.0748, "odds_ratio_loss": 0.003376226406544447, "rewards/accuracies": 1.0, "rewards/chosen": -0.0022825661581009626, "rewards/margins": 0.46040797233581543, "rewards/rejected": -0.462690532207489, "sft_loss": 0.022825662046670914, "step": 3385 }, { "epoch": 4.8966015907447575, "grad_norm": 1.3488999656463687, "learning_rate": 6.764960520177965e-07, "logits/chosen": -0.817007303237915, "logits/rejected": -0.6087720990180969, "logps/chosen": -0.05678170174360275, "logps/rejected": -6.089994430541992, "loss": 0.0592, "odds_ratio_loss": 0.0028764172457158566, "rewards/accuracies": 1.0, "rewards/chosen": -0.00567817036062479, "rewards/margins": 0.6033213138580322, "rewards/rejected": -0.6089994311332703, "sft_loss": 0.05678170174360275, "step": 3386 }, { "epoch": 4.898047722342733, "grad_norm": 1.4230099428004324, "learning_rate": 6.747687661755339e-07, "logits/chosen": -0.747872531414032, "logits/rejected": -0.6560572385787964, "logps/chosen": -0.06658175587654114, "logps/rejected": -5.573917865753174, "loss": 0.0793, "odds_ratio_loss": 0.01551305316388607, "rewards/accuracies": 1.0, "rewards/chosen": -0.006658175960183144, "rewards/margins": 0.5507336258888245, "rewards/rejected": -0.5573917627334595, "sft_loss": 0.06658175587654114, "step": 3387 }, { "epoch": 4.899493853940709, "grad_norm": 1.7422732363316469, "learning_rate": 6.73043485127566e-07, "logits/chosen": -0.7491236329078674, "logits/rejected": -0.6787482500076294, "logps/chosen": -0.07064135372638702, "logps/rejected": -5.146181106567383, "loss": 0.0753, "odds_ratio_loss": 0.00906374678015709, "rewards/accuracies": 1.0, "rewards/chosen": -0.007064135279506445, "rewards/margins": 0.5075539946556091, "rewards/rejected": -0.5146180987358093, "sft_loss": 0.07064135372638702, "step": 3388 }, { "epoch": 4.900939985538684, "grad_norm": 1.9246122301587167, "learning_rate": 6.713202099140725e-07, "logits/chosen": -1.0471328496932983, "logits/rejected": -0.6187826991081238, "logps/chosen": -0.1254141926765442, "logps/rejected": -5.144631862640381, "loss": 0.1055, "odds_ratio_loss": 0.006312836892902851, "rewards/accuracies": 1.0, "rewards/chosen": -0.012541419826447964, "rewards/margins": 0.5019217729568481, "rewards/rejected": -0.5144631862640381, "sft_loss": 0.1254141926765442, "step": 3389 }, { "epoch": 4.902386117136659, "grad_norm": 2.6512806023231543, "learning_rate": 6.695989415740215e-07, "logits/chosen": -0.7667117118835449, "logits/rejected": -0.5550485253334045, "logps/chosen": -0.056425757706165314, "logps/rejected": -4.555874347686768, "loss": 0.0858, "odds_ratio_loss": 0.0027704143431037664, "rewards/accuracies": 1.0, "rewards/chosen": -0.005642575677484274, "rewards/margins": 0.4499448835849762, "rewards/rejected": -0.4555874764919281, "sft_loss": 0.056425757706165314, "step": 3390 }, { "epoch": 4.903832248734635, "grad_norm": 1.5656569412623513, "learning_rate": 6.678796811451727e-07, "logits/chosen": -0.761025071144104, "logits/rejected": -0.5918843746185303, "logps/chosen": -0.07612086087465286, "logps/rejected": -5.696231365203857, "loss": 0.0731, "odds_ratio_loss": 0.003577734809368849, "rewards/accuracies": 1.0, "rewards/chosen": -0.007612085901200771, "rewards/margins": 0.5620110630989075, "rewards/rejected": -0.5696231722831726, "sft_loss": 0.07612086087465286, "step": 3391 }, { "epoch": 4.905278380332611, "grad_norm": 1.610031355891547, "learning_rate": 6.661624296640731e-07, "logits/chosen": -0.6648229956626892, "logits/rejected": -0.6174643039703369, "logps/chosen": -0.04702949523925781, "logps/rejected": -5.392739772796631, "loss": 0.0891, "odds_ratio_loss": 0.0024822133127599955, "rewards/accuracies": 1.0, "rewards/chosen": -0.004702950362116098, "rewards/margins": 0.5345709919929504, "rewards/rejected": -0.5392739176750183, "sft_loss": 0.04702949523925781, "step": 3392 }, { "epoch": 4.906724511930586, "grad_norm": 1.3906640545233522, "learning_rate": 6.644471881660623e-07, "logits/chosen": -0.7568591833114624, "logits/rejected": -0.4995534420013428, "logps/chosen": -0.0706809014081955, "logps/rejected": -5.783414840698242, "loss": 0.0842, "odds_ratio_loss": 0.0026557797100394964, "rewards/accuracies": 1.0, "rewards/chosen": -0.007068090606480837, "rewards/margins": 0.5712733864784241, "rewards/rejected": -0.578341543674469, "sft_loss": 0.0706809014081955, "step": 3393 }, { "epoch": 4.908170643528561, "grad_norm": 1.5512790240610077, "learning_rate": 6.627339576852637e-07, "logits/chosen": -0.8462902307510376, "logits/rejected": -0.7097457647323608, "logps/chosen": -0.06939011067152023, "logps/rejected": -2.7737913131713867, "loss": 0.0757, "odds_ratio_loss": 0.007057640701532364, "rewards/accuracies": 1.0, "rewards/chosen": -0.006939011625945568, "rewards/margins": 0.27044010162353516, "rewards/rejected": -0.2773791253566742, "sft_loss": 0.06939011067152023, "step": 3394 }, { "epoch": 4.909616775126537, "grad_norm": 1.65898596137372, "learning_rate": 6.610227392545922e-07, "logits/chosen": -0.6548706293106079, "logits/rejected": -0.4947068393230438, "logps/chosen": -0.09346526116132736, "logps/rejected": -5.793054580688477, "loss": 0.0901, "odds_ratio_loss": 0.013216537423431873, "rewards/accuracies": 1.0, "rewards/chosen": -0.00934652704745531, "rewards/margins": 0.5699589252471924, "rewards/rejected": -0.5793054699897766, "sft_loss": 0.09346526116132736, "step": 3395 }, { "epoch": 4.911062906724512, "grad_norm": 1.5699203400624258, "learning_rate": 6.593135339057463e-07, "logits/chosen": -0.8828110694885254, "logits/rejected": -0.6490928530693054, "logps/chosen": -0.03655397891998291, "logps/rejected": -4.743061065673828, "loss": 0.0744, "odds_ratio_loss": 0.004315060563385487, "rewards/accuracies": 1.0, "rewards/chosen": -0.003655398031696677, "rewards/margins": 0.47065073251724243, "rewards/rejected": -0.4743060767650604, "sft_loss": 0.03655397891998291, "step": 3396 }, { "epoch": 4.912509038322487, "grad_norm": 1.5275318305815881, "learning_rate": 6.576063426692125e-07, "logits/chosen": -0.6870374083518982, "logits/rejected": -0.49240612983703613, "logps/chosen": -0.09693014621734619, "logps/rejected": -7.673630714416504, "loss": 0.0751, "odds_ratio_loss": 0.010385874658823013, "rewards/accuracies": 1.0, "rewards/chosen": -0.009693015366792679, "rewards/margins": 0.7576700448989868, "rewards/rejected": -0.7673630714416504, "sft_loss": 0.09693014621734619, "step": 3397 }, { "epoch": 4.913955169920463, "grad_norm": 1.3408507320992828, "learning_rate": 6.559011665742642e-07, "logits/chosen": -0.7937183380126953, "logits/rejected": -0.5760884284973145, "logps/chosen": -0.12189510464668274, "logps/rejected": -5.872251033782959, "loss": 0.0693, "odds_ratio_loss": 0.01099199429154396, "rewards/accuracies": 1.0, "rewards/chosen": -0.012189511209726334, "rewards/margins": 0.5750356316566467, "rewards/rejected": -0.587225079536438, "sft_loss": 0.12189510464668274, "step": 3398 }, { "epoch": 4.915401301518438, "grad_norm": 1.6638328443107613, "learning_rate": 6.541980066489569e-07, "logits/chosen": -0.7404236793518066, "logits/rejected": -0.638684093952179, "logps/chosen": -0.15662673115730286, "logps/rejected": -4.5496721267700195, "loss": 0.1009, "odds_ratio_loss": 0.010028908029198647, "rewards/accuracies": 1.0, "rewards/chosen": -0.015662673860788345, "rewards/margins": 0.43930453062057495, "rewards/rejected": -0.454967200756073, "sft_loss": 0.15662673115730286, "step": 3399 }, { "epoch": 4.916847433116414, "grad_norm": 1.7502597021129793, "learning_rate": 6.524968639201329e-07, "logits/chosen": -0.920937180519104, "logits/rejected": -0.6321260333061218, "logps/chosen": -0.08427523076534271, "logps/rejected": -6.011247158050537, "loss": 0.091, "odds_ratio_loss": 0.004902101121842861, "rewards/accuracies": 1.0, "rewards/chosen": -0.008427524007856846, "rewards/margins": 0.5926971435546875, "rewards/rejected": -0.60112464427948, "sft_loss": 0.08427523076534271, "step": 3400 }, { "epoch": 4.918293564714389, "grad_norm": 1.4833107024960808, "learning_rate": 6.507977394134161e-07, "logits/chosen": -0.5898668169975281, "logits/rejected": -0.5003875494003296, "logps/chosen": -0.13638347387313843, "logps/rejected": -3.1561052799224854, "loss": 0.1005, "odds_ratio_loss": 0.013293557800352573, "rewards/accuracies": 1.0, "rewards/chosen": -0.013638347387313843, "rewards/margins": 0.3019721806049347, "rewards/rejected": -0.3156105577945709, "sft_loss": 0.13638347387313843, "step": 3401 }, { "epoch": 4.919739696312364, "grad_norm": 1.5630456229191343, "learning_rate": 6.491006341532169e-07, "logits/chosen": -0.7746043801307678, "logits/rejected": -0.586871325969696, "logps/chosen": -0.09608809649944305, "logps/rejected": -4.432920455932617, "loss": 0.0824, "odds_ratio_loss": 0.0049471426755189896, "rewards/accuracies": 1.0, "rewards/chosen": -0.009608810767531395, "rewards/margins": 0.4336831867694855, "rewards/rejected": -0.4432920217514038, "sft_loss": 0.09608809649944305, "step": 3402 }, { "epoch": 4.92118582791034, "grad_norm": 1.7588181040876307, "learning_rate": 6.474055491627246e-07, "logits/chosen": -0.7171893119812012, "logits/rejected": -0.6028531789779663, "logps/chosen": -0.13583296537399292, "logps/rejected": -4.825489521026611, "loss": 0.0862, "odds_ratio_loss": 0.004513260908424854, "rewards/accuracies": 1.0, "rewards/chosen": -0.013583295047283173, "rewards/margins": 0.46896564960479736, "rewards/rejected": -0.48254895210266113, "sft_loss": 0.13583296537399292, "step": 3403 }, { "epoch": 4.9226319595083154, "grad_norm": 1.6924015889641442, "learning_rate": 6.45712485463914e-07, "logits/chosen": -0.89324951171875, "logits/rejected": -0.666816771030426, "logps/chosen": -0.1056736633181572, "logps/rejected": -4.227684020996094, "loss": 0.0801, "odds_ratio_loss": 0.016112789511680603, "rewards/accuracies": 1.0, "rewards/chosen": -0.010567366145551205, "rewards/margins": 0.41220101714134216, "rewards/rejected": -0.42276835441589355, "sft_loss": 0.1056736633181572, "step": 3404 }, { "epoch": 4.92407809110629, "grad_norm": 1.5042913816203431, "learning_rate": 6.440214440775374e-07, "logits/chosen": -0.8448358774185181, "logits/rejected": -0.6646611094474792, "logps/chosen": -0.05981362611055374, "logps/rejected": -4.67680549621582, "loss": 0.0644, "odds_ratio_loss": 0.006723261438310146, "rewards/accuracies": 1.0, "rewards/chosen": -0.0059813628904521465, "rewards/margins": 0.4616991877555847, "rewards/rejected": -0.46768054366111755, "sft_loss": 0.05981362611055374, "step": 3405 }, { "epoch": 4.925524222704266, "grad_norm": 1.661160117863071, "learning_rate": 6.423324260231324e-07, "logits/chosen": -0.5030421614646912, "logits/rejected": -0.4442903399467468, "logps/chosen": -0.19611015915870667, "logps/rejected": -4.853398323059082, "loss": 0.1093, "odds_ratio_loss": 0.03798852115869522, "rewards/accuracies": 1.0, "rewards/chosen": -0.019611012190580368, "rewards/margins": 0.4657288193702698, "rewards/rejected": -0.48533985018730164, "sft_loss": 0.19611015915870667, "step": 3406 }, { "epoch": 4.926970354302242, "grad_norm": 1.5571396653798601, "learning_rate": 6.406454323190127e-07, "logits/chosen": -0.722494900226593, "logits/rejected": -0.6708986163139343, "logps/chosen": -0.10587897151708603, "logps/rejected": -5.6761274337768555, "loss": 0.0664, "odds_ratio_loss": 0.014984402805566788, "rewards/accuracies": 1.0, "rewards/chosen": -0.010587898083031178, "rewards/margins": 0.5570248365402222, "rewards/rejected": -0.5676127672195435, "sft_loss": 0.10587897151708603, "step": 3407 }, { "epoch": 4.928416485900217, "grad_norm": 1.6336998087518904, "learning_rate": 6.389604639822739e-07, "logits/chosen": -0.8598955869674683, "logits/rejected": -0.600988507270813, "logps/chosen": -0.0970025435090065, "logps/rejected": -3.2531440258026123, "loss": 0.1, "odds_ratio_loss": 0.0073122261092066765, "rewards/accuracies": 1.0, "rewards/chosen": -0.009700254537165165, "rewards/margins": 0.31561416387557983, "rewards/rejected": -0.32531440258026123, "sft_loss": 0.0970025435090065, "step": 3408 }, { "epoch": 4.929862617498192, "grad_norm": 1.8046682859163223, "learning_rate": 6.372775220287878e-07, "logits/chosen": -0.7766846418380737, "logits/rejected": -0.5947623252868652, "logps/chosen": -0.128305122256279, "logps/rejected": -4.95772647857666, "loss": 0.1133, "odds_ratio_loss": 0.026372672989964485, "rewards/accuracies": 1.0, "rewards/chosen": -0.012830512598156929, "rewards/margins": 0.4829421639442444, "rewards/rejected": -0.4957726001739502, "sft_loss": 0.128305122256279, "step": 3409 }, { "epoch": 4.931308749096168, "grad_norm": 1.8430904450067187, "learning_rate": 6.355966074732082e-07, "logits/chosen": -0.8618395924568176, "logits/rejected": -0.628158688545227, "logps/chosen": -0.04948505386710167, "logps/rejected": -4.483854293823242, "loss": 0.1178, "odds_ratio_loss": 0.004379372112452984, "rewards/accuracies": 1.0, "rewards/chosen": -0.004948505666106939, "rewards/margins": 0.4434368908405304, "rewards/rejected": -0.44838541746139526, "sft_loss": 0.04948505386710167, "step": 3410 }, { "epoch": 4.9327548806941435, "grad_norm": 1.5954896703058488, "learning_rate": 6.339177213289652e-07, "logits/chosen": -0.5719497799873352, "logits/rejected": -0.44910386204719543, "logps/chosen": -0.052914202213287354, "logps/rejected": -6.891765594482422, "loss": 0.0525, "odds_ratio_loss": 0.0075008077546954155, "rewards/accuracies": 1.0, "rewards/chosen": -0.005291420966386795, "rewards/margins": 0.6838852167129517, "rewards/rejected": -0.689176619052887, "sft_loss": 0.052914202213287354, "step": 3411 }, { "epoch": 4.934201012292118, "grad_norm": 1.4200289969330797, "learning_rate": 6.322408646082635e-07, "logits/chosen": -0.8700981140136719, "logits/rejected": -0.599973201751709, "logps/chosen": -0.06421230733394623, "logps/rejected": -5.4810638427734375, "loss": 0.0889, "odds_ratio_loss": 0.0070202648639678955, "rewards/accuracies": 1.0, "rewards/chosen": -0.006421231664717197, "rewards/margins": 0.541685163974762, "rewards/rejected": -0.5481064319610596, "sft_loss": 0.06421230733394623, "step": 3412 }, { "epoch": 4.935647143890094, "grad_norm": 1.5988810203628434, "learning_rate": 6.305660383220885e-07, "logits/chosen": -0.8789199590682983, "logits/rejected": -0.7220363616943359, "logps/chosen": -0.09391533583402634, "logps/rejected": -4.320672988891602, "loss": 0.0727, "odds_ratio_loss": 0.0037208469584584236, "rewards/accuracies": 1.0, "rewards/chosen": -0.009391534142196178, "rewards/margins": 0.4226757884025574, "rewards/rejected": -0.43206727504730225, "sft_loss": 0.09391533583402634, "step": 3413 }, { "epoch": 4.93709327548807, "grad_norm": 2.964711941744502, "learning_rate": 6.288932434801979e-07, "logits/chosen": -0.8900729417800903, "logits/rejected": -0.6892324090003967, "logps/chosen": -0.0752728208899498, "logps/rejected": -4.755692481994629, "loss": 0.0925, "odds_ratio_loss": 0.008502209559082985, "rewards/accuracies": 1.0, "rewards/chosen": -0.007527281995862722, "rewards/margins": 0.46804195642471313, "rewards/rejected": -0.4755692481994629, "sft_loss": 0.0752728208899498, "step": 3414 }, { "epoch": 4.938539407086044, "grad_norm": 1.226582221767045, "learning_rate": 6.272224810911262e-07, "logits/chosen": -0.6443370580673218, "logits/rejected": -0.46328115463256836, "logps/chosen": -0.0617620050907135, "logps/rejected": -6.892882347106934, "loss": 0.0539, "odds_ratio_loss": 0.007957663387060165, "rewards/accuracies": 1.0, "rewards/chosen": -0.006176200229674578, "rewards/margins": 0.6831120252609253, "rewards/rejected": -0.6892882585525513, "sft_loss": 0.0617620050907135, "step": 3415 }, { "epoch": 4.93998553868402, "grad_norm": 1.7930112841480699, "learning_rate": 6.255537521621814e-07, "logits/chosen": -0.8234100341796875, "logits/rejected": -0.7632617950439453, "logps/chosen": -0.0765080451965332, "logps/rejected": -5.386995315551758, "loss": 0.1174, "odds_ratio_loss": 0.006928831338882446, "rewards/accuracies": 1.0, "rewards/chosen": -0.0076508051715791225, "rewards/margins": 0.5310487747192383, "rewards/rejected": -0.5386995673179626, "sft_loss": 0.0765080451965332, "step": 3416 }, { "epoch": 4.941431670281996, "grad_norm": 1.5497605374260457, "learning_rate": 6.238870576994482e-07, "logits/chosen": -0.9648762345314026, "logits/rejected": -0.679747462272644, "logps/chosen": -0.06289042532444, "logps/rejected": -6.040827751159668, "loss": 0.0695, "odds_ratio_loss": 0.0020546133164316416, "rewards/accuracies": 1.0, "rewards/chosen": -0.0062890429981052876, "rewards/margins": 0.5977937579154968, "rewards/rejected": -0.6040828227996826, "sft_loss": 0.06289042532444, "step": 3417 }, { "epoch": 4.9428778018799715, "grad_norm": 1.5782236984356215, "learning_rate": 6.222223987077808e-07, "logits/chosen": -0.9196997284889221, "logits/rejected": -0.7327249646186829, "logps/chosen": -0.02562006562948227, "logps/rejected": -3.8854827880859375, "loss": 0.0997, "odds_ratio_loss": 0.0034806833136826754, "rewards/accuracies": 1.0, "rewards/chosen": -0.002562006702646613, "rewards/margins": 0.3859862685203552, "rewards/rejected": -0.3885482847690582, "sft_loss": 0.02562006562948227, "step": 3418 }, { "epoch": 4.944323933477946, "grad_norm": 1.5751576479599592, "learning_rate": 6.205597761908104e-07, "logits/chosen": -0.5640643835067749, "logits/rejected": -0.5260137319564819, "logps/chosen": -0.15139015018939972, "logps/rejected": -3.5538008213043213, "loss": 0.1029, "odds_ratio_loss": 0.015284059569239616, "rewards/accuracies": 1.0, "rewards/chosen": -0.015139015391469002, "rewards/margins": 0.34024107456207275, "rewards/rejected": -0.3553800880908966, "sft_loss": 0.15139015018939972, "step": 3419 }, { "epoch": 4.945770065075922, "grad_norm": 1.4752020186966457, "learning_rate": 6.188991911509367e-07, "logits/chosen": -1.0183162689208984, "logits/rejected": -0.5773686170578003, "logps/chosen": -0.04004896432161331, "logps/rejected": -5.913060188293457, "loss": 0.0642, "odds_ratio_loss": 0.0022270409390330315, "rewards/accuracies": 1.0, "rewards/chosen": -0.004004896618425846, "rewards/margins": 0.5873011350631714, "rewards/rejected": -0.5913060307502747, "sft_loss": 0.04004896432161331, "step": 3420 }, { "epoch": 4.947216196673898, "grad_norm": 1.4465994196175587, "learning_rate": 6.172406445893337e-07, "logits/chosen": -0.5818377137184143, "logits/rejected": -0.6574020385742188, "logps/chosen": -0.16453906893730164, "logps/rejected": -4.426886558532715, "loss": 0.091, "odds_ratio_loss": 0.029384538531303406, "rewards/accuracies": 1.0, "rewards/chosen": -0.016453908756375313, "rewards/margins": 0.42623475193977356, "rewards/rejected": -0.44268864393234253, "sft_loss": 0.16453906893730164, "step": 3421 }, { "epoch": 4.9486623282718725, "grad_norm": 1.447822137306616, "learning_rate": 6.15584137505945e-07, "logits/chosen": -0.8531981706619263, "logits/rejected": -0.6357840895652771, "logps/chosen": -0.07084062695503235, "logps/rejected": -7.386335372924805, "loss": 0.0794, "odds_ratio_loss": 0.0017525034490972757, "rewards/accuracies": 1.0, "rewards/chosen": -0.00708406325429678, "rewards/margins": 0.7315495014190674, "rewards/rejected": -0.7386335134506226, "sft_loss": 0.07084062695503235, "step": 3422 }, { "epoch": 4.950108459869848, "grad_norm": 1.8491072383950693, "learning_rate": 6.139296708994837e-07, "logits/chosen": -0.747806966304779, "logits/rejected": -0.6480367183685303, "logps/chosen": -0.06436759978532791, "logps/rejected": -5.5151543617248535, "loss": 0.1032, "odds_ratio_loss": 0.005401041358709335, "rewards/accuracies": 1.0, "rewards/chosen": -0.006436760071665049, "rewards/margins": 0.545078694820404, "rewards/rejected": -0.5515154600143433, "sft_loss": 0.06436759978532791, "step": 3423 }, { "epoch": 4.951554591467824, "grad_norm": 1.4690335807293746, "learning_rate": 6.122772457674359e-07, "logits/chosen": -0.5605374574661255, "logits/rejected": -0.46818655729293823, "logps/chosen": -0.09475595504045486, "logps/rejected": -4.236270904541016, "loss": 0.0888, "odds_ratio_loss": 0.01986742950975895, "rewards/accuracies": 1.0, "rewards/chosen": -0.009475595317780972, "rewards/margins": 0.41415154933929443, "rewards/rejected": -0.4236271381378174, "sft_loss": 0.09475595504045486, "step": 3424 }, { "epoch": 4.953000723065799, "grad_norm": 1.1868102449763724, "learning_rate": 6.106268631060527e-07, "logits/chosen": -0.8078406453132629, "logits/rejected": -0.5437741875648499, "logps/chosen": -0.03639750927686691, "logps/rejected": -5.105257034301758, "loss": 0.0472, "odds_ratio_loss": 0.00621524965390563, "rewards/accuracies": 1.0, "rewards/chosen": -0.003639751113951206, "rewards/margins": 0.5068859457969666, "rewards/rejected": -0.5105257034301758, "sft_loss": 0.03639750927686691, "step": 3425 }, { "epoch": 4.954446854663774, "grad_norm": 1.5291294485877678, "learning_rate": 6.089785239103582e-07, "logits/chosen": -0.9439482688903809, "logits/rejected": -0.6107262372970581, "logps/chosen": -0.03788119927048683, "logps/rejected": -3.4490232467651367, "loss": 0.0818, "odds_ratio_loss": 0.00338040036149323, "rewards/accuracies": 1.0, "rewards/chosen": -0.003788120113313198, "rewards/margins": 0.34111422300338745, "rewards/rejected": -0.3449023365974426, "sft_loss": 0.03788119927048683, "step": 3426 }, { "epoch": 4.95589298626175, "grad_norm": 1.5203093371616432, "learning_rate": 6.073322291741405e-07, "logits/chosen": -0.7352548241615295, "logits/rejected": -0.6759071946144104, "logps/chosen": -0.04905076324939728, "logps/rejected": -3.5433459281921387, "loss": 0.056, "odds_ratio_loss": 0.0025512652937322855, "rewards/accuracies": 1.0, "rewards/chosen": -0.004905076697468758, "rewards/margins": 0.34942954778671265, "rewards/rejected": -0.35433462262153625, "sft_loss": 0.04905076324939728, "step": 3427 }, { "epoch": 4.957339117859725, "grad_norm": 1.337702504072495, "learning_rate": 6.056879798899581e-07, "logits/chosen": -0.8775549530982971, "logits/rejected": -0.8563462495803833, "logps/chosen": -0.036653611809015274, "logps/rejected": -4.592785835266113, "loss": 0.0516, "odds_ratio_loss": 0.005046031437814236, "rewards/accuracies": 1.0, "rewards/chosen": -0.0036653613205999136, "rewards/margins": 0.4556131958961487, "rewards/rejected": -0.45927858352661133, "sft_loss": 0.036653611809015274, "step": 3428 }, { "epoch": 4.9587852494577005, "grad_norm": 1.7224191092429069, "learning_rate": 6.040457770491345e-07, "logits/chosen": -0.9732665419578552, "logits/rejected": -0.621749222278595, "logps/chosen": -0.07956325262784958, "logps/rejected": -4.399288177490234, "loss": 0.0908, "odds_ratio_loss": 0.005940181203186512, "rewards/accuracies": 1.0, "rewards/chosen": -0.007956326007843018, "rewards/margins": 0.4319724440574646, "rewards/rejected": -0.43992879986763, "sft_loss": 0.07956325262784958, "step": 3429 }, { "epoch": 4.960231381055676, "grad_norm": 1.254116051896288, "learning_rate": 6.024056216417595e-07, "logits/chosen": -0.9306346774101257, "logits/rejected": -0.6040125489234924, "logps/chosen": -0.029323115944862366, "logps/rejected": -6.736618995666504, "loss": 0.037, "odds_ratio_loss": 0.002192482352256775, "rewards/accuracies": 1.0, "rewards/chosen": -0.002932311501353979, "rewards/margins": 0.6707295179367065, "rewards/rejected": -0.6736618280410767, "sft_loss": 0.029323115944862366, "step": 3430 }, { "epoch": 4.961677512653652, "grad_norm": 1.293836056578048, "learning_rate": 6.007675146566886e-07, "logits/chosen": -0.7885620594024658, "logits/rejected": -0.5741199851036072, "logps/chosen": -0.05781673640012741, "logps/rejected": -6.402325630187988, "loss": 0.0713, "odds_ratio_loss": 0.007596355862915516, "rewards/accuracies": 1.0, "rewards/chosen": -0.005781673826277256, "rewards/margins": 0.6344509720802307, "rewards/rejected": -0.6402326822280884, "sft_loss": 0.05781673640012741, "step": 3431 }, { "epoch": 4.963123644251627, "grad_norm": 1.5357216450846016, "learning_rate": 5.99131457081544e-07, "logits/chosen": -0.645007848739624, "logits/rejected": -0.5896191596984863, "logps/chosen": -0.06823316216468811, "logps/rejected": -5.339206218719482, "loss": 0.0845, "odds_ratio_loss": 0.00628221919760108, "rewards/accuracies": 1.0, "rewards/chosen": -0.006823315750807524, "rewards/margins": 0.5270972847938538, "rewards/rejected": -0.5339206457138062, "sft_loss": 0.06823316216468811, "step": 3432 }, { "epoch": 4.964569775849602, "grad_norm": 1.6104289227539481, "learning_rate": 5.974974499027094e-07, "logits/chosen": -0.868780791759491, "logits/rejected": -0.577593207359314, "logps/chosen": -0.05608178302645683, "logps/rejected": -6.6864471435546875, "loss": 0.108, "odds_ratio_loss": 0.002942313440144062, "rewards/accuracies": 1.0, "rewards/chosen": -0.005608178209513426, "rewards/margins": 0.6630365252494812, "rewards/rejected": -0.6686447858810425, "sft_loss": 0.05608178302645683, "step": 3433 }, { "epoch": 4.966015907447578, "grad_norm": 1.6332927485572049, "learning_rate": 5.958654941053352e-07, "logits/chosen": -0.7936917543411255, "logits/rejected": -0.6169689893722534, "logps/chosen": -0.1516617238521576, "logps/rejected": -3.740570306777954, "loss": 0.0813, "odds_ratio_loss": 0.027436548843979836, "rewards/accuracies": 1.0, "rewards/chosen": -0.015166172757744789, "rewards/margins": 0.3588908612728119, "rewards/rejected": -0.37405702471733093, "sft_loss": 0.1516617238521576, "step": 3434 }, { "epoch": 4.967462039045553, "grad_norm": 2.132685715380471, "learning_rate": 5.942355906733318e-07, "logits/chosen": -0.77684485912323, "logits/rejected": -0.5762640833854675, "logps/chosen": -0.026029953733086586, "logps/rejected": -5.362104415893555, "loss": 0.053, "odds_ratio_loss": 0.003081801813095808, "rewards/accuracies": 1.0, "rewards/chosen": -0.0026029953733086586, "rewards/margins": 0.5336074233055115, "rewards/rejected": -0.5362104177474976, "sft_loss": 0.026029953733086586, "step": 3435 }, { "epoch": 4.9689081706435285, "grad_norm": 1.81155485785451, "learning_rate": 5.926077405893766e-07, "logits/chosen": -0.743396520614624, "logits/rejected": -0.6381196975708008, "logps/chosen": -0.06945671886205673, "logps/rejected": -4.84320068359375, "loss": 0.0892, "odds_ratio_loss": 0.005398800130933523, "rewards/accuracies": 1.0, "rewards/chosen": -0.006945671979337931, "rewards/margins": 0.477374404668808, "rewards/rejected": -0.4843200743198395, "sft_loss": 0.06945671886205673, "step": 3436 }, { "epoch": 4.970354302241504, "grad_norm": 1.5592717402220886, "learning_rate": 5.909819448349051e-07, "logits/chosen": -0.8907531499862671, "logits/rejected": -0.6397050023078918, "logps/chosen": -0.1254955232143402, "logps/rejected": -4.674949645996094, "loss": 0.0766, "odds_ratio_loss": 0.005577145144343376, "rewards/accuracies": 1.0, "rewards/chosen": -0.01254955306649208, "rewards/margins": 0.45494544506073, "rewards/rejected": -0.46749502420425415, "sft_loss": 0.1254955232143402, "step": 3437 }, { "epoch": 4.971800433839479, "grad_norm": 2.65297190852868, "learning_rate": 5.893582043901144e-07, "logits/chosen": -0.7786065340042114, "logits/rejected": -0.6070635318756104, "logps/chosen": -0.08002370595932007, "logps/rejected": -3.568556070327759, "loss": 0.0902, "odds_ratio_loss": 0.007114575244486332, "rewards/accuracies": 1.0, "rewards/chosen": -0.008002370595932007, "rewards/margins": 0.3488532304763794, "rewards/rejected": -0.3568556308746338, "sft_loss": 0.08002370595932007, "step": 3438 }, { "epoch": 4.973246565437455, "grad_norm": 1.6585001087257703, "learning_rate": 5.877365202339657e-07, "logits/chosen": -1.0660187005996704, "logits/rejected": -0.6834717392921448, "logps/chosen": -0.05151912942528725, "logps/rejected": -7.605033874511719, "loss": 0.0773, "odds_ratio_loss": 0.0027478046249598265, "rewards/accuracies": 1.0, "rewards/chosen": -0.005151913035660982, "rewards/margins": 0.7553514838218689, "rewards/rejected": -0.7605034112930298, "sft_loss": 0.05151912942528725, "step": 3439 }, { "epoch": 4.97469269703543, "grad_norm": 1.525514052881944, "learning_rate": 5.861168933441769e-07, "logits/chosen": -0.7787050008773804, "logits/rejected": -0.6245294809341431, "logps/chosen": -0.10993112623691559, "logps/rejected": -5.3284454345703125, "loss": 0.0818, "odds_ratio_loss": 0.012700337916612625, "rewards/accuracies": 1.0, "rewards/chosen": -0.0109931118786335, "rewards/margins": 0.5218514800071716, "rewards/rejected": -0.5328445434570312, "sft_loss": 0.10993112623691559, "step": 3440 }, { "epoch": 4.976138828633406, "grad_norm": 1.7760939346248212, "learning_rate": 5.844993246972288e-07, "logits/chosen": -1.0633209943771362, "logits/rejected": -0.9356187582015991, "logps/chosen": -0.04119650647044182, "logps/rejected": -3.5258665084838867, "loss": 0.083, "odds_ratio_loss": 0.00574019318446517, "rewards/accuracies": 1.0, "rewards/chosen": -0.004119650926440954, "rewards/margins": 0.34846699237823486, "rewards/rejected": -0.35258665680885315, "sft_loss": 0.04119650647044182, "step": 3441 }, { "epoch": 4.977584960231381, "grad_norm": 1.7749277326694144, "learning_rate": 5.828838152683575e-07, "logits/chosen": -0.8471897840499878, "logits/rejected": -0.6608887314796448, "logps/chosen": -0.06945478171110153, "logps/rejected": -3.0917301177978516, "loss": 0.0627, "odds_ratio_loss": 0.008055892772972584, "rewards/accuracies": 1.0, "rewards/chosen": -0.006945478729903698, "rewards/margins": 0.30222752690315247, "rewards/rejected": -0.30917298793792725, "sft_loss": 0.06945478171110153, "step": 3442 }, { "epoch": 4.9790310918293565, "grad_norm": 1.4906959459399594, "learning_rate": 5.812703660315614e-07, "logits/chosen": -1.0138330459594727, "logits/rejected": -0.7253773212432861, "logps/chosen": -0.10903214663267136, "logps/rejected": -3.4463255405426025, "loss": 0.0798, "odds_ratio_loss": 0.0062078689225018024, "rewards/accuracies": 1.0, "rewards/chosen": -0.010903215035796165, "rewards/margins": 0.3337293863296509, "rewards/rejected": -0.3446325957775116, "sft_loss": 0.10903214663267136, "step": 3443 }, { "epoch": 4.980477223427332, "grad_norm": 1.4923880084454153, "learning_rate": 5.796589779595936e-07, "logits/chosen": -0.8511115312576294, "logits/rejected": -0.6175816059112549, "logps/chosen": -0.09215164184570312, "logps/rejected": -5.326424598693848, "loss": 0.0994, "odds_ratio_loss": 0.0075257387943565845, "rewards/accuracies": 1.0, "rewards/chosen": -0.009215164929628372, "rewards/margins": 0.5234273076057434, "rewards/rejected": -0.5326424241065979, "sft_loss": 0.09215164184570312, "step": 3444 }, { "epoch": 4.981923355025307, "grad_norm": 1.9255239760331049, "learning_rate": 5.780496520239672e-07, "logits/chosen": -0.6826792359352112, "logits/rejected": -0.6077350974082947, "logps/chosen": -0.06906363368034363, "logps/rejected": -3.452920436859131, "loss": 0.0731, "odds_ratio_loss": 0.0063808392733335495, "rewards/accuracies": 1.0, "rewards/chosen": -0.006906363181769848, "rewards/margins": 0.3383857011795044, "rewards/rejected": -0.34529203176498413, "sft_loss": 0.06906363368034363, "step": 3445 }, { "epoch": 4.983369486623283, "grad_norm": 1.657251305881418, "learning_rate": 5.764423891949506e-07, "logits/chosen": -0.7294119000434875, "logits/rejected": -0.5812273621559143, "logps/chosen": -0.07187902182340622, "logps/rejected": -4.1396989822387695, "loss": 0.0809, "odds_ratio_loss": 0.003367446595802903, "rewards/accuracies": 1.0, "rewards/chosen": -0.007187901996076107, "rewards/margins": 0.40678200125694275, "rewards/rejected": -0.41396987438201904, "sft_loss": 0.07187902182340622, "step": 3446 }, { "epoch": 4.984815618221258, "grad_norm": 1.4641805442906786, "learning_rate": 5.748371904415683e-07, "logits/chosen": -0.7924904823303223, "logits/rejected": -0.6872941255569458, "logps/chosen": -0.15684622526168823, "logps/rejected": -3.0761542320251465, "loss": 0.0963, "odds_ratio_loss": 0.02701270952820778, "rewards/accuracies": 1.0, "rewards/chosen": -0.015684621408581734, "rewards/margins": 0.29193079471588135, "rewards/rejected": -0.30761539936065674, "sft_loss": 0.15684622526168823, "step": 3447 }, { "epoch": 4.986261749819233, "grad_norm": 1.6232973011439962, "learning_rate": 5.732340567315997e-07, "logits/chosen": -0.8529285192489624, "logits/rejected": -0.7546834349632263, "logps/chosen": -0.08991379290819168, "logps/rejected": -5.203977584838867, "loss": 0.0853, "odds_ratio_loss": 0.005872908979654312, "rewards/accuracies": 1.0, "rewards/chosen": -0.008991379290819168, "rewards/margins": 0.511406421661377, "rewards/rejected": -0.5203977823257446, "sft_loss": 0.08991379290819168, "step": 3448 }, { "epoch": 4.987707881417209, "grad_norm": 1.609326930688802, "learning_rate": 5.716329890315816e-07, "logits/chosen": -0.7413277626037598, "logits/rejected": -0.6836074590682983, "logps/chosen": -0.13507165014743805, "logps/rejected": -3.857135534286499, "loss": 0.0762, "odds_ratio_loss": 0.00941159576177597, "rewards/accuracies": 1.0, "rewards/chosen": -0.01350716594606638, "rewards/margins": 0.3722064197063446, "rewards/rejected": -0.3857135474681854, "sft_loss": 0.13507165014743805, "step": 3449 }, { "epoch": 4.989154013015185, "grad_norm": 1.9671365529158957, "learning_rate": 5.700339883068026e-07, "logits/chosen": -0.844429075717926, "logits/rejected": -0.6580359935760498, "logps/chosen": -0.07697506248950958, "logps/rejected": -3.4200170040130615, "loss": 0.0736, "odds_ratio_loss": 0.011311711743474007, "rewards/accuracies": 1.0, "rewards/chosen": -0.007697506807744503, "rewards/margins": 0.33430421352386475, "rewards/rejected": -0.34200170636177063, "sft_loss": 0.07697506248950958, "step": 3450 }, { "epoch": 4.990600144613159, "grad_norm": 1.676385478801375, "learning_rate": 5.684370555213061e-07, "logits/chosen": -0.6407653093338013, "logits/rejected": -0.5257874727249146, "logps/chosen": -0.05387239158153534, "logps/rejected": -4.147794246673584, "loss": 0.0712, "odds_ratio_loss": 0.0045845406129956245, "rewards/accuracies": 1.0, "rewards/chosen": -0.005387239158153534, "rewards/margins": 0.40939223766326904, "rewards/rejected": -0.4147794544696808, "sft_loss": 0.05387239158153534, "step": 3451 }, { "epoch": 4.992046276211135, "grad_norm": 1.5851457373820437, "learning_rate": 5.668421916378907e-07, "logits/chosen": -0.9896453022956848, "logits/rejected": -0.6115586757659912, "logps/chosen": -0.03699421137571335, "logps/rejected": -3.2008819580078125, "loss": 0.0717, "odds_ratio_loss": 0.007299942895770073, "rewards/accuracies": 1.0, "rewards/chosen": -0.0036994214169681072, "rewards/margins": 0.3163887560367584, "rewards/rejected": -0.3200881779193878, "sft_loss": 0.03699421137571335, "step": 3452 }, { "epoch": 4.993492407809111, "grad_norm": 1.6443996781685672, "learning_rate": 5.652493976181039e-07, "logits/chosen": -0.7869142293930054, "logits/rejected": -0.5275483727455139, "logps/chosen": -0.05778158828616142, "logps/rejected": -4.564939498901367, "loss": 0.067, "odds_ratio_loss": 0.003800423815846443, "rewards/accuracies": 1.0, "rewards/chosen": -0.005778159014880657, "rewards/margins": 0.4507158100605011, "rewards/rejected": -0.45649394392967224, "sft_loss": 0.05778158828616142, "step": 3453 }, { "epoch": 4.994938539407086, "grad_norm": 1.6418530507761049, "learning_rate": 5.636586744222481e-07, "logits/chosen": -0.7569884061813354, "logits/rejected": -0.5459706783294678, "logps/chosen": -0.059836842119693756, "logps/rejected": -4.32498836517334, "loss": 0.0911, "odds_ratio_loss": 0.0062176999635994434, "rewards/accuracies": 1.0, "rewards/chosen": -0.005983684211969376, "rewards/margins": 0.4265151619911194, "rewards/rejected": -0.43249887228012085, "sft_loss": 0.059836842119693756, "step": 3454 }, { "epoch": 4.996384671005061, "grad_norm": 1.7891231577091458, "learning_rate": 5.620700230093742e-07, "logits/chosen": -0.7332042455673218, "logits/rejected": -0.626262366771698, "logps/chosen": -0.11887304484844208, "logps/rejected": -4.1333417892456055, "loss": 0.077, "odds_ratio_loss": 0.008874907158315182, "rewards/accuracies": 1.0, "rewards/chosen": -0.011887304484844208, "rewards/margins": 0.4014468789100647, "rewards/rejected": -0.4133341908454895, "sft_loss": 0.11887304484844208, "step": 3455 }, { "epoch": 4.997830802603037, "grad_norm": 1.4062046371133936, "learning_rate": 5.604834443372892e-07, "logits/chosen": -1.033819317817688, "logits/rejected": -0.594970703125, "logps/chosen": -0.10729336738586426, "logps/rejected": -5.250903606414795, "loss": 0.063, "odds_ratio_loss": 0.007175394333899021, "rewards/accuracies": 1.0, "rewards/chosen": -0.010729337111115456, "rewards/margins": 0.5143610239028931, "rewards/rejected": -0.5250903367996216, "sft_loss": 0.10729336738586426, "step": 3456 }, { "epoch": 4.999276934201013, "grad_norm": 1.8521632428679058, "learning_rate": 5.588989393625447e-07, "logits/chosen": -0.8611646294593811, "logits/rejected": -0.5696526765823364, "logps/chosen": -0.0587548166513443, "logps/rejected": -4.783111572265625, "loss": 0.0781, "odds_ratio_loss": 0.00351850432343781, "rewards/accuracies": 1.0, "rewards/chosen": -0.005875481758266687, "rewards/margins": 0.47243571281433105, "rewards/rejected": -0.4783111810684204, "sft_loss": 0.0587548166513443, "step": 3457 }, { "epoch": 5.000723065798987, "grad_norm": 1.4265437130237972, "learning_rate": 5.573165090404464e-07, "logits/chosen": -0.709952175617218, "logits/rejected": -0.5098187327384949, "logps/chosen": -0.047502551227808, "logps/rejected": -4.869006156921387, "loss": 0.0423, "odds_ratio_loss": 0.002838584128767252, "rewards/accuracies": 1.0, "rewards/chosen": -0.004750255029648542, "rewards/margins": 0.4821503758430481, "rewards/rejected": -0.48690059781074524, "sft_loss": 0.047502551227808, "step": 3458 }, { "epoch": 5.002169197396963, "grad_norm": 1.085421565449066, "learning_rate": 5.55736154325046e-07, "logits/chosen": -0.7732264399528503, "logits/rejected": -0.5995284914970398, "logps/chosen": -0.039276450872421265, "logps/rejected": -4.488852024078369, "loss": 0.0527, "odds_ratio_loss": 0.010519957169890404, "rewards/accuracies": 1.0, "rewards/chosen": -0.003927645273506641, "rewards/margins": 0.44495758414268494, "rewards/rejected": -0.4488852322101593, "sft_loss": 0.039276450872421265, "step": 3459 }, { "epoch": 5.003615328994939, "grad_norm": 1.0464265932381913, "learning_rate": 5.54157876169147e-07, "logits/chosen": -0.8605638742446899, "logits/rejected": -0.6857526302337646, "logps/chosen": -0.009190384298563004, "logps/rejected": -5.061098575592041, "loss": 0.0324, "odds_ratio_loss": 0.0008861377718858421, "rewards/accuracies": 1.0, "rewards/chosen": -0.0009190384298563004, "rewards/margins": 0.5051908493041992, "rewards/rejected": -0.506109893321991, "sft_loss": 0.009190384298563004, "step": 3460 }, { "epoch": 5.005061460592914, "grad_norm": 1.1326646780174328, "learning_rate": 5.525816755242978e-07, "logits/chosen": -0.7338491678237915, "logits/rejected": -0.5906820297241211, "logps/chosen": -0.05195004120469093, "logps/rejected": -4.651167392730713, "loss": 0.0347, "odds_ratio_loss": 0.0044695246033370495, "rewards/accuracies": 1.0, "rewards/chosen": -0.005195004865527153, "rewards/margins": 0.4599217176437378, "rewards/rejected": -0.4651166796684265, "sft_loss": 0.05195004120469093, "step": 3461 }, { "epoch": 5.006507592190889, "grad_norm": 0.9565385507003581, "learning_rate": 5.510075533407961e-07, "logits/chosen": -0.8748555183410645, "logits/rejected": -0.730570912361145, "logps/chosen": -0.02134561724960804, "logps/rejected": -3.8099608421325684, "loss": 0.044, "odds_ratio_loss": 0.002423565834760666, "rewards/accuracies": 1.0, "rewards/chosen": -0.002134561538696289, "rewards/margins": 0.37886154651641846, "rewards/rejected": -0.38099610805511475, "sft_loss": 0.02134561724960804, "step": 3462 }, { "epoch": 5.007953723788865, "grad_norm": 0.9112409719292605, "learning_rate": 5.494355105676853e-07, "logits/chosen": -1.012371301651001, "logits/rejected": -0.8462579250335693, "logps/chosen": -0.03380393236875534, "logps/rejected": -3.5552430152893066, "loss": 0.0256, "odds_ratio_loss": 0.004212790168821812, "rewards/accuracies": 1.0, "rewards/chosen": -0.0033803931437432766, "rewards/margins": 0.35214394330978394, "rewards/rejected": -0.35552430152893066, "sft_loss": 0.03380393236875534, "step": 3463 }, { "epoch": 5.009399855386841, "grad_norm": 1.1716206047793647, "learning_rate": 5.478655481527559e-07, "logits/chosen": -0.7203569412231445, "logits/rejected": -0.5603138208389282, "logps/chosen": -0.036938704550266266, "logps/rejected": -5.195960998535156, "loss": 0.0375, "odds_ratio_loss": 0.003028081264346838, "rewards/accuracies": 1.0, "rewards/chosen": -0.0036938705015927553, "rewards/margins": 0.5159022212028503, "rewards/rejected": -0.5195960402488708, "sft_loss": 0.036938704550266266, "step": 3464 }, { "epoch": 5.010845986984815, "grad_norm": 0.8655848449570359, "learning_rate": 5.462976670425461e-07, "logits/chosen": -0.8955216407775879, "logits/rejected": -0.7210637927055359, "logps/chosen": -0.013777623884379864, "logps/rejected": -6.22860860824585, "loss": 0.0343, "odds_ratio_loss": 0.0025187258142977953, "rewards/accuracies": 1.0, "rewards/chosen": -0.0013777624117210507, "rewards/margins": 0.6214830875396729, "rewards/rejected": -0.622860848903656, "sft_loss": 0.013777623884379864, "step": 3465 }, { "epoch": 5.012292118582791, "grad_norm": 1.009337477557826, "learning_rate": 5.447318681823346e-07, "logits/chosen": -1.2388375997543335, "logits/rejected": -0.7586451172828674, "logps/chosen": -0.07897603511810303, "logps/rejected": -4.735720634460449, "loss": 0.0448, "odds_ratio_loss": 0.003169738221913576, "rewards/accuracies": 1.0, "rewards/chosen": -0.007897603325545788, "rewards/margins": 0.46567443013191223, "rewards/rejected": -0.4735720157623291, "sft_loss": 0.07897603511810303, "step": 3466 }, { "epoch": 5.013738250180767, "grad_norm": 1.121337861751337, "learning_rate": 5.431681525161495e-07, "logits/chosen": -0.7842767834663391, "logits/rejected": -0.6963058710098267, "logps/chosen": -0.10743860900402069, "logps/rejected": -3.431588649749756, "loss": 0.0624, "odds_ratio_loss": 0.012040664441883564, "rewards/accuracies": 1.0, "rewards/chosen": -0.010743859224021435, "rewards/margins": 0.33241501450538635, "rewards/rejected": -0.3431588411331177, "sft_loss": 0.10743860900402069, "step": 3467 }, { "epoch": 5.015184381778742, "grad_norm": 1.0922147755297598, "learning_rate": 5.416065209867598e-07, "logits/chosen": -0.7255322337150574, "logits/rejected": -0.561846137046814, "logps/chosen": -0.05708377808332443, "logps/rejected": -4.307957649230957, "loss": 0.0513, "odds_ratio_loss": 0.003463734406977892, "rewards/accuracies": 1.0, "rewards/chosen": -0.005708377808332443, "rewards/margins": 0.4250873625278473, "rewards/rejected": -0.43079572916030884, "sft_loss": 0.05708377808332443, "step": 3468 }, { "epoch": 5.016630513376717, "grad_norm": 0.9156674198366694, "learning_rate": 5.40046974535679e-07, "logits/chosen": -0.8717478513717651, "logits/rejected": -0.8460903167724609, "logps/chosen": -0.05093216896057129, "logps/rejected": -2.9140570163726807, "loss": 0.0363, "odds_ratio_loss": 0.009572173468768597, "rewards/accuracies": 1.0, "rewards/chosen": -0.005093216896057129, "rewards/margins": 0.2863124907016754, "rewards/rejected": -0.29140573740005493, "sft_loss": 0.05093216896057129, "step": 3469 }, { "epoch": 5.018076644974693, "grad_norm": 1.2516930398057569, "learning_rate": 5.384895141031629e-07, "logits/chosen": -0.9424750804901123, "logits/rejected": -0.701157808303833, "logps/chosen": -0.027423281222581863, "logps/rejected": -7.161192417144775, "loss": 0.0475, "odds_ratio_loss": 0.0023427875712513924, "rewards/accuracies": 1.0, "rewards/chosen": -0.002742328215390444, "rewards/margins": 0.7133768796920776, "rewards/rejected": -0.7161192893981934, "sft_loss": 0.027423281222581863, "step": 3470 }, { "epoch": 5.019522776572668, "grad_norm": 1.446751345973255, "learning_rate": 5.369341406282113e-07, "logits/chosen": -0.9990890026092529, "logits/rejected": -0.6447659730911255, "logps/chosen": -0.019773146137595177, "logps/rejected": -5.728797435760498, "loss": 0.0269, "odds_ratio_loss": 0.0009713853942230344, "rewards/accuracies": 1.0, "rewards/chosen": -0.001977314706891775, "rewards/margins": 0.570902407169342, "rewards/rejected": -0.5728797316551208, "sft_loss": 0.019773146137595177, "step": 3471 }, { "epoch": 5.0209689081706435, "grad_norm": 1.2865825800855881, "learning_rate": 5.353808550485635e-07, "logits/chosen": -0.9069744348526001, "logits/rejected": -0.7498742938041687, "logps/chosen": -0.017203805968165398, "logps/rejected": -6.09996223449707, "loss": 0.0346, "odds_ratio_loss": 0.0010495948372408748, "rewards/accuracies": 1.0, "rewards/chosen": -0.001720380736514926, "rewards/margins": 0.6082758903503418, "rewards/rejected": -0.6099961996078491, "sft_loss": 0.017203805968165398, "step": 3472 }, { "epoch": 5.022415039768619, "grad_norm": 1.5006156541725042, "learning_rate": 5.338296583007027e-07, "logits/chosen": -0.9074760675430298, "logits/rejected": -0.6146718263626099, "logps/chosen": -0.06319965422153473, "logps/rejected": -5.602901458740234, "loss": 0.0575, "odds_ratio_loss": 0.0013854659628123045, "rewards/accuracies": 1.0, "rewards/chosen": -0.00631996663287282, "rewards/margins": 0.553970217704773, "rewards/rejected": -0.5602901577949524, "sft_loss": 0.06319965422153473, "step": 3473 }, { "epoch": 5.023861171366594, "grad_norm": 1.5874623391961369, "learning_rate": 5.322805513198494e-07, "logits/chosen": -0.9391703605651855, "logits/rejected": -0.777937650680542, "logps/chosen": -0.08251549303531647, "logps/rejected": -4.623871803283691, "loss": 0.0606, "odds_ratio_loss": 0.0030107577331364155, "rewards/accuracies": 1.0, "rewards/chosen": -0.008251549676060677, "rewards/margins": 0.4541356563568115, "rewards/rejected": -0.46238720417022705, "sft_loss": 0.08251549303531647, "step": 3474 }, { "epoch": 5.02530730296457, "grad_norm": 1.4057184124461983, "learning_rate": 5.307335350399675e-07, "logits/chosen": -0.8874735832214355, "logits/rejected": -0.7688844799995422, "logps/chosen": -0.10471326112747192, "logps/rejected": -3.2333931922912598, "loss": 0.0434, "odds_ratio_loss": 0.010410060174763203, "rewards/accuracies": 1.0, "rewards/chosen": -0.010471326299011707, "rewards/margins": 0.31286799907684326, "rewards/rejected": -0.3233392834663391, "sft_loss": 0.10471326112747192, "step": 3475 }, { "epoch": 5.026753434562545, "grad_norm": 1.2902962205289839, "learning_rate": 5.291886103937586e-07, "logits/chosen": -0.9923503398895264, "logits/rejected": -0.6792709231376648, "logps/chosen": -0.015890207141637802, "logps/rejected": -3.8003733158111572, "loss": 0.0294, "odds_ratio_loss": 0.0025172270834445953, "rewards/accuracies": 1.0, "rewards/chosen": -0.0015890207141637802, "rewards/margins": 0.3784483075141907, "rewards/rejected": -0.3800373375415802, "sft_loss": 0.015890207141637802, "step": 3476 }, { "epoch": 5.028199566160521, "grad_norm": 1.2875767952399477, "learning_rate": 5.276457783126624e-07, "logits/chosen": -0.8897219300270081, "logits/rejected": -0.6773457527160645, "logps/chosen": -0.049017373472452164, "logps/rejected": -4.137425899505615, "loss": 0.0361, "odds_ratio_loss": 0.0024960683658719063, "rewards/accuracies": 1.0, "rewards/chosen": -0.004901737906038761, "rewards/margins": 0.4088408648967743, "rewards/rejected": -0.41374263167381287, "sft_loss": 0.049017373472452164, "step": 3477 }, { "epoch": 5.029645697758496, "grad_norm": 1.372847049337572, "learning_rate": 5.261050397268594e-07, "logits/chosen": -0.9416077733039856, "logits/rejected": -0.664757251739502, "logps/chosen": -0.05168641358613968, "logps/rejected": -4.555634498596191, "loss": 0.0561, "odds_ratio_loss": 0.002042082604020834, "rewards/accuracies": 1.0, "rewards/chosen": -0.005168641917407513, "rewards/margins": 0.4503948390483856, "rewards/rejected": -0.45556342601776123, "sft_loss": 0.05168641358613968, "step": 3478 }, { "epoch": 5.0310918293564715, "grad_norm": 1.2067829756850041, "learning_rate": 5.245663955652655e-07, "logits/chosen": -0.9000402688980103, "logits/rejected": -0.79487544298172, "logps/chosen": -0.028296932578086853, "logps/rejected": -4.450551986694336, "loss": 0.0379, "odds_ratio_loss": 0.00259765493683517, "rewards/accuracies": 1.0, "rewards/chosen": -0.0028296932578086853, "rewards/margins": 0.44222545623779297, "rewards/rejected": -0.44505518674850464, "sft_loss": 0.028296932578086853, "step": 3479 }, { "epoch": 5.032537960954447, "grad_norm": 1.0215153751903814, "learning_rate": 5.230298467555361e-07, "logits/chosen": -0.7747162580490112, "logits/rejected": -0.48691344261169434, "logps/chosen": -0.02550322189927101, "logps/rejected": -8.04299259185791, "loss": 0.0276, "odds_ratio_loss": 0.0014245238853618503, "rewards/accuracies": 1.0, "rewards/chosen": -0.0025503220967948437, "rewards/margins": 0.8017489910125732, "rewards/rejected": -0.8042992353439331, "sft_loss": 0.02550322189927101, "step": 3480 }, { "epoch": 5.033984092552422, "grad_norm": 1.1753442740039661, "learning_rate": 5.214953942240612e-07, "logits/chosen": -0.9011898040771484, "logits/rejected": -0.7257996201515198, "logps/chosen": -0.007925175130367279, "logps/rejected": -5.687167644500732, "loss": 0.0415, "odds_ratio_loss": 0.00017209735233336687, "rewards/accuracies": 1.0, "rewards/chosen": -0.0007925175596028566, "rewards/margins": 0.5679242610931396, "rewards/rejected": -0.5687167644500732, "sft_loss": 0.007925175130367279, "step": 3481 }, { "epoch": 5.035430224150398, "grad_norm": 1.0682223299406943, "learning_rate": 5.199630388959693e-07, "logits/chosen": -1.0496717691421509, "logits/rejected": -0.7140789031982422, "logps/chosen": -0.019513538107275963, "logps/rejected": -4.503504276275635, "loss": 0.0383, "odds_ratio_loss": 0.0018820172408595681, "rewards/accuracies": 1.0, "rewards/chosen": -0.001951353857293725, "rewards/margins": 0.44839906692504883, "rewards/rejected": -0.45035040378570557, "sft_loss": 0.019513538107275963, "step": 3482 }, { "epoch": 5.036876355748373, "grad_norm": 0.9663346190107585, "learning_rate": 5.184327816951221e-07, "logits/chosen": -0.9517099857330322, "logits/rejected": -0.5351824164390564, "logps/chosen": -0.047696616500616074, "logps/rejected": -4.995662689208984, "loss": 0.0298, "odds_ratio_loss": 0.0005317270988598466, "rewards/accuracies": 1.0, "rewards/chosen": -0.00476966192945838, "rewards/margins": 0.49479660391807556, "rewards/rejected": -0.49956628680229187, "sft_loss": 0.047696616500616074, "step": 3483 }, { "epoch": 5.038322487346348, "grad_norm": 0.99346643446749, "learning_rate": 5.169046235441175e-07, "logits/chosen": -0.755199670791626, "logits/rejected": -0.641110897064209, "logps/chosen": -0.022800451144576073, "logps/rejected": -3.9687485694885254, "loss": 0.0388, "odds_ratio_loss": 0.001115166931413114, "rewards/accuracies": 1.0, "rewards/chosen": -0.0022800450678914785, "rewards/margins": 0.39459484815597534, "rewards/rejected": -0.3968748450279236, "sft_loss": 0.022800451144576073, "step": 3484 }, { "epoch": 5.039768618944324, "grad_norm": 0.8647052648937972, "learning_rate": 5.153785653642875e-07, "logits/chosen": -0.9964678287506104, "logits/rejected": -0.8342572450637817, "logps/chosen": -0.030764533206820488, "logps/rejected": -5.155585765838623, "loss": 0.0248, "odds_ratio_loss": 0.002030643867328763, "rewards/accuracies": 1.0, "rewards/chosen": -0.0030764532275497913, "rewards/margins": 0.5124821662902832, "rewards/rejected": -0.5155586004257202, "sft_loss": 0.030764533206820488, "step": 3485 }, { "epoch": 5.0412147505422995, "grad_norm": 1.0490770833472225, "learning_rate": 5.13854608075699e-07, "logits/chosen": -0.8880283236503601, "logits/rejected": -0.8042649626731873, "logps/chosen": -0.032138124108314514, "logps/rejected": -4.834699630737305, "loss": 0.0413, "odds_ratio_loss": 0.0069372812286019325, "rewards/accuracies": 1.0, "rewards/chosen": -0.0032138123642653227, "rewards/margins": 0.48025619983673096, "rewards/rejected": -0.48347002267837524, "sft_loss": 0.032138124108314514, "step": 3486 }, { "epoch": 5.042660882140275, "grad_norm": 1.3260914708649687, "learning_rate": 5.123327525971501e-07, "logits/chosen": -0.9982175230979919, "logits/rejected": -0.6111714839935303, "logps/chosen": -0.08715112507343292, "logps/rejected": -6.953100204467773, "loss": 0.0725, "odds_ratio_loss": 0.008616614155471325, "rewards/accuracies": 1.0, "rewards/chosen": -0.008715112693607807, "rewards/margins": 0.6865949034690857, "rewards/rejected": -0.6953100562095642, "sft_loss": 0.08715112507343292, "step": 3487 }, { "epoch": 5.04410701373825, "grad_norm": 1.2545860974001273, "learning_rate": 5.108129998461752e-07, "logits/chosen": -0.9156689643859863, "logits/rejected": -0.7918872237205505, "logps/chosen": -0.1049668937921524, "logps/rejected": -3.2424416542053223, "loss": 0.0559, "odds_ratio_loss": 0.013586277142167091, "rewards/accuracies": 1.0, "rewards/chosen": -0.01049669086933136, "rewards/margins": 0.31374746561050415, "rewards/rejected": -0.3242441713809967, "sft_loss": 0.1049668937921524, "step": 3488 }, { "epoch": 5.045553145336226, "grad_norm": 0.9690430135015121, "learning_rate": 5.092953507390368e-07, "logits/chosen": -0.8073430061340332, "logits/rejected": -0.7314937710762024, "logps/chosen": -0.028009576722979546, "logps/rejected": -4.2882981300354, "loss": 0.0272, "odds_ratio_loss": 0.0006923983455635607, "rewards/accuracies": 1.0, "rewards/chosen": -0.002800957765430212, "rewards/margins": 0.426028847694397, "rewards/rejected": -0.4288298189640045, "sft_loss": 0.028009576722979546, "step": 3489 }, { "epoch": 5.046999276934201, "grad_norm": 0.9797258107602022, "learning_rate": 5.077798061907322e-07, "logits/chosen": -0.8194557428359985, "logits/rejected": -0.6159572601318359, "logps/chosen": -0.030368125066161156, "logps/rejected": -4.142151832580566, "loss": 0.0403, "odds_ratio_loss": 0.0020896908827126026, "rewards/accuracies": 1.0, "rewards/chosen": -0.0030368126463145018, "rewards/margins": 0.4111783504486084, "rewards/rejected": -0.4142151474952698, "sft_loss": 0.030368125066161156, "step": 3490 }, { "epoch": 5.048445408532176, "grad_norm": 1.1084600799985203, "learning_rate": 5.062663671149896e-07, "logits/chosen": -0.7641061544418335, "logits/rejected": -0.5759693384170532, "logps/chosen": -0.07876641303300858, "logps/rejected": -4.516148567199707, "loss": 0.0499, "odds_ratio_loss": 0.004298293497413397, "rewards/accuracies": 1.0, "rewards/chosen": -0.007876641117036343, "rewards/margins": 0.4437382221221924, "rewards/rejected": -0.4516148567199707, "sft_loss": 0.07876641303300858, "step": 3491 }, { "epoch": 5.049891540130152, "grad_norm": 0.9494340453804003, "learning_rate": 5.047550344242668e-07, "logits/chosen": -0.8096336722373962, "logits/rejected": -0.5875648856163025, "logps/chosen": -0.020555946975946426, "logps/rejected": -4.530761241912842, "loss": 0.0319, "odds_ratio_loss": 0.0013419195311143994, "rewards/accuracies": 1.0, "rewards/chosen": -0.002055594464763999, "rewards/margins": 0.4510205388069153, "rewards/rejected": -0.4530760645866394, "sft_loss": 0.020555946975946426, "step": 3492 }, { "epoch": 5.0513376717281275, "grad_norm": 1.1190637140374868, "learning_rate": 5.032458090297509e-07, "logits/chosen": -0.7927988767623901, "logits/rejected": -0.6353293657302856, "logps/chosen": -0.047286052256822586, "logps/rejected": -3.4200520515441895, "loss": 0.0454, "odds_ratio_loss": 0.0031753459479659796, "rewards/accuracies": 1.0, "rewards/chosen": -0.004728605039417744, "rewards/margins": 0.3372766077518463, "rewards/rejected": -0.34200519323349, "sft_loss": 0.047286052256822586, "step": 3493 }, { "epoch": 5.052783803326102, "grad_norm": 0.9510278762097967, "learning_rate": 5.017386918413598e-07, "logits/chosen": -0.6925092935562134, "logits/rejected": -0.5789450407028198, "logps/chosen": -0.0461755096912384, "logps/rejected": -3.9706852436065674, "loss": 0.038, "odds_ratio_loss": 0.0035718618892133236, "rewards/accuracies": 1.0, "rewards/chosen": -0.00461755134165287, "rewards/margins": 0.3924509286880493, "rewards/rejected": -0.39706850051879883, "sft_loss": 0.0461755096912384, "step": 3494 }, { "epoch": 5.054229934924078, "grad_norm": 0.8347912311398593, "learning_rate": 5.002336837677408e-07, "logits/chosen": -1.0878486633300781, "logits/rejected": -0.8472960591316223, "logps/chosen": -0.07545731961727142, "logps/rejected": -5.063971996307373, "loss": 0.0312, "odds_ratio_loss": 0.004800775554031134, "rewards/accuracies": 1.0, "rewards/chosen": -0.007545732893049717, "rewards/margins": 0.4988514482975006, "rewards/rejected": -0.5063971877098083, "sft_loss": 0.07545731961727142, "step": 3495 }, { "epoch": 5.055676066522054, "grad_norm": 1.1106826128697525, "learning_rate": 4.987307857162672e-07, "logits/chosen": -1.1164376735687256, "logits/rejected": -0.9372444748878479, "logps/chosen": -0.07102254778146744, "logps/rejected": -3.9332127571105957, "loss": 0.0463, "odds_ratio_loss": 0.005403660237789154, "rewards/accuracies": 1.0, "rewards/chosen": -0.007102255243808031, "rewards/margins": 0.3862190842628479, "rewards/rejected": -0.39332127571105957, "sft_loss": 0.07102254778146744, "step": 3496 }, { "epoch": 5.0571221981200285, "grad_norm": 0.9065018234326692, "learning_rate": 4.972299985930441e-07, "logits/chosen": -0.7647346258163452, "logits/rejected": -0.568095862865448, "logps/chosen": -0.047225866466760635, "logps/rejected": -4.031482696533203, "loss": 0.0332, "odds_ratio_loss": 0.0016099303029477596, "rewards/accuracies": 1.0, "rewards/chosen": -0.0047225868329405785, "rewards/margins": 0.3984256982803345, "rewards/rejected": -0.4031482934951782, "sft_loss": 0.047225866466760635, "step": 3497 }, { "epoch": 5.058568329718004, "grad_norm": 1.235864047380871, "learning_rate": 4.957313233029001e-07, "logits/chosen": -0.9007717967033386, "logits/rejected": -0.6701961159706116, "logps/chosen": -0.04815499857068062, "logps/rejected": -4.450354099273682, "loss": 0.0414, "odds_ratio_loss": 0.002228476107120514, "rewards/accuracies": 1.0, "rewards/chosen": -0.004815499763935804, "rewards/margins": 0.4402199387550354, "rewards/rejected": -0.4450354278087616, "sft_loss": 0.04815499857068062, "step": 3498 }, { "epoch": 5.06001446131598, "grad_norm": 1.3037735453615267, "learning_rate": 4.942347607493929e-07, "logits/chosen": -0.7589642405509949, "logits/rejected": -0.6731579303741455, "logps/chosen": -0.07891285419464111, "logps/rejected": -4.627119064331055, "loss": 0.0565, "odds_ratio_loss": 0.006445009261369705, "rewards/accuracies": 1.0, "rewards/chosen": -0.007891286164522171, "rewards/margins": 0.4548206329345703, "rewards/rejected": -0.462711900472641, "sft_loss": 0.07891285419464111, "step": 3499 }, { "epoch": 5.061460592913956, "grad_norm": 1.0390461211048085, "learning_rate": 4.927403118348055e-07, "logits/chosen": -0.8114687204360962, "logits/rejected": -0.5995222330093384, "logps/chosen": -0.023380516096949577, "logps/rejected": -5.17085075378418, "loss": 0.04, "odds_ratio_loss": 0.0011913108173757792, "rewards/accuracies": 1.0, "rewards/chosen": -0.0023380513302981853, "rewards/margins": 0.5147470235824585, "rewards/rejected": -0.517085075378418, "sft_loss": 0.023380516096949577, "step": 3500 }, { "epoch": 5.06290672451193, "grad_norm": 1.1592534723874754, "learning_rate": 4.912479774601465e-07, "logits/chosen": -0.7811102271080017, "logits/rejected": -0.6491488814353943, "logps/chosen": -0.035685863345861435, "logps/rejected": -4.719926834106445, "loss": 0.0449, "odds_ratio_loss": 0.0020906818099319935, "rewards/accuracies": 1.0, "rewards/chosen": -0.0035685861948877573, "rewards/margins": 0.46842408180236816, "rewards/rejected": -0.4719926714897156, "sft_loss": 0.035685863345861435, "step": 3501 }, { "epoch": 5.064352856109906, "grad_norm": 0.9900282846292379, "learning_rate": 4.897577585251493e-07, "logits/chosen": -0.9106247425079346, "logits/rejected": -0.506430447101593, "logps/chosen": -0.034568313509225845, "logps/rejected": -5.847599506378174, "loss": 0.0336, "odds_ratio_loss": 0.0007620738469995558, "rewards/accuracies": 1.0, "rewards/chosen": -0.003456831444054842, "rewards/margins": 0.5813031792640686, "rewards/rejected": -0.5847600698471069, "sft_loss": 0.034568313509225845, "step": 3502 }, { "epoch": 5.065798987707882, "grad_norm": 1.3192284096007874, "learning_rate": 4.882696559282728e-07, "logits/chosen": -0.7228418588638306, "logits/rejected": -0.6639438271522522, "logps/chosen": -0.05023326724767685, "logps/rejected": -4.81099796295166, "loss": 0.0544, "odds_ratio_loss": 0.003748027142137289, "rewards/accuracies": 1.0, "rewards/chosen": -0.005023326724767685, "rewards/margins": 0.47607648372650146, "rewards/rejected": -0.48109978437423706, "sft_loss": 0.05023326724767685, "step": 3503 }, { "epoch": 5.0672451193058565, "grad_norm": 1.4630431883421127, "learning_rate": 4.867836705667008e-07, "logits/chosen": -0.5559595227241516, "logits/rejected": -0.4254433512687683, "logps/chosen": -0.03188515082001686, "logps/rejected": -5.043299674987793, "loss": 0.0303, "odds_ratio_loss": 0.00941612757742405, "rewards/accuracies": 1.0, "rewards/chosen": -0.003188515082001686, "rewards/margins": 0.5011414289474487, "rewards/rejected": -0.5043299794197083, "sft_loss": 0.03188515082001686, "step": 3504 }, { "epoch": 5.068691250903832, "grad_norm": 1.1457513160225274, "learning_rate": 4.852998033363374e-07, "logits/chosen": -0.8476303815841675, "logits/rejected": -0.5761409401893616, "logps/chosen": -0.05392240732908249, "logps/rejected": -6.117643356323242, "loss": 0.0486, "odds_ratio_loss": 0.0024544643238186836, "rewards/accuracies": 1.0, "rewards/chosen": -0.005392240826040506, "rewards/margins": 0.6063721179962158, "rewards/rejected": -0.6117643713951111, "sft_loss": 0.05392240732908249, "step": 3505 }, { "epoch": 5.070137382501808, "grad_norm": 0.9423254883145569, "learning_rate": 4.838180551318137e-07, "logits/chosen": -0.9022258520126343, "logits/rejected": -0.6234160661697388, "logps/chosen": -0.047306958585977554, "logps/rejected": -6.651636123657227, "loss": 0.033, "odds_ratio_loss": 0.000815044913906604, "rewards/accuracies": 1.0, "rewards/chosen": -0.004730695858597755, "rewards/margins": 0.6604328751564026, "rewards/rejected": -0.6651636362075806, "sft_loss": 0.047306958585977554, "step": 3506 }, { "epoch": 5.071583514099783, "grad_norm": 0.8296553426657953, "learning_rate": 4.823384268464798e-07, "logits/chosen": -1.0280296802520752, "logits/rejected": -0.623708963394165, "logps/chosen": -0.007112645544111729, "logps/rejected": -5.498015403747559, "loss": 0.0226, "odds_ratio_loss": 0.0005501247942447662, "rewards/accuracies": 1.0, "rewards/chosen": -0.000711264496203512, "rewards/margins": 0.5490902662277222, "rewards/rejected": -0.5498015284538269, "sft_loss": 0.007112645544111729, "step": 3507 }, { "epoch": 5.073029645697758, "grad_norm": 0.9846452216260417, "learning_rate": 4.8086091937241e-07, "logits/chosen": -0.853293776512146, "logits/rejected": -0.660023033618927, "logps/chosen": -0.04029952734708786, "logps/rejected": -4.744614601135254, "loss": 0.0329, "odds_ratio_loss": 0.0031669042073190212, "rewards/accuracies": 1.0, "rewards/chosen": -0.004029952920973301, "rewards/margins": 0.47043153643608093, "rewards/rejected": -0.47446149587631226, "sft_loss": 0.04029952734708786, "step": 3508 }, { "epoch": 5.074475777295734, "grad_norm": 1.1616560808192198, "learning_rate": 4.793855336003973e-07, "logits/chosen": -0.8065732717514038, "logits/rejected": -0.7198399901390076, "logps/chosen": -0.030910378322005272, "logps/rejected": -4.870575904846191, "loss": 0.0367, "odds_ratio_loss": 0.000926865846849978, "rewards/accuracies": 1.0, "rewards/chosen": -0.0030910377390682697, "rewards/margins": 0.483966588973999, "rewards/rejected": -0.487057626247406, "sft_loss": 0.030910378322005272, "step": 3509 }, { "epoch": 5.07592190889371, "grad_norm": 1.460148493444669, "learning_rate": 4.7791227041996e-07, "logits/chosen": -1.0374577045440674, "logits/rejected": -0.7249501347541809, "logps/chosen": -0.035487934947013855, "logps/rejected": -6.014105796813965, "loss": 0.0256, "odds_ratio_loss": 0.0014334238367155194, "rewards/accuracies": 1.0, "rewards/chosen": -0.0035487934947013855, "rewards/margins": 0.5978617668151855, "rewards/rejected": -0.6014105677604675, "sft_loss": 0.035487934947013855, "step": 3510 }, { "epoch": 5.0773680404916846, "grad_norm": 1.24139935366732, "learning_rate": 4.764411307193312e-07, "logits/chosen": -1.1357358694076538, "logits/rejected": -0.7631771564483643, "logps/chosen": -0.051570966839790344, "logps/rejected": -4.790763854980469, "loss": 0.0336, "odds_ratio_loss": 0.0028275088407099247, "rewards/accuracies": 1.0, "rewards/chosen": -0.005157096311450005, "rewards/margins": 0.4739193320274353, "rewards/rejected": -0.47907641530036926, "sft_loss": 0.051570966839790344, "step": 3511 }, { "epoch": 5.07881417208966, "grad_norm": 1.2128643594847868, "learning_rate": 4.749721153854689e-07, "logits/chosen": -0.8864186406135559, "logits/rejected": -0.7438483238220215, "logps/chosen": -0.05787164717912674, "logps/rejected": -3.919307231903076, "loss": 0.0467, "odds_ratio_loss": 0.012838841415941715, "rewards/accuracies": 1.0, "rewards/chosen": -0.005787164904177189, "rewards/margins": 0.38614362478256226, "rewards/rejected": -0.3919307589530945, "sft_loss": 0.05787164717912674, "step": 3512 }, { "epoch": 5.080260303687636, "grad_norm": 0.8749759144457411, "learning_rate": 4.735052253040459e-07, "logits/chosen": -0.8959901332855225, "logits/rejected": -0.7548617124557495, "logps/chosen": -0.03229469805955887, "logps/rejected": -3.567509651184082, "loss": 0.0229, "odds_ratio_loss": 0.005712658166885376, "rewards/accuracies": 1.0, "rewards/chosen": -0.0032294695265591145, "rewards/margins": 0.353521466255188, "rewards/rejected": -0.3567509353160858, "sft_loss": 0.03229469805955887, "step": 3513 }, { "epoch": 5.081706435285611, "grad_norm": 1.123889356229595, "learning_rate": 4.720404613594575e-07, "logits/chosen": -0.8648489713668823, "logits/rejected": -0.6787019968032837, "logps/chosen": -0.07309830188751221, "logps/rejected": -5.374810695648193, "loss": 0.0453, "odds_ratio_loss": 0.002101986203342676, "rewards/accuracies": 1.0, "rewards/chosen": -0.007309830281883478, "rewards/margins": 0.5301712155342102, "rewards/rejected": -0.5374810099601746, "sft_loss": 0.07309830188751221, "step": 3514 }, { "epoch": 5.083152566883586, "grad_norm": 0.9471250243679993, "learning_rate": 4.7057782443481464e-07, "logits/chosen": -0.8933843970298767, "logits/rejected": -0.5827321410179138, "logps/chosen": -0.00811697170138359, "logps/rejected": -5.076469421386719, "loss": 0.0269, "odds_ratio_loss": 0.000255201623076573, "rewards/accuracies": 1.0, "rewards/chosen": -0.0008116972167044878, "rewards/margins": 0.5068352222442627, "rewards/rejected": -0.507646918296814, "sft_loss": 0.00811697170138359, "step": 3515 }, { "epoch": 5.084598698481562, "grad_norm": 1.1133186028853113, "learning_rate": 4.691173154119461e-07, "logits/chosen": -1.0953632593154907, "logits/rejected": -0.741931140422821, "logps/chosen": -0.04556608200073242, "logps/rejected": -5.745540618896484, "loss": 0.0409, "odds_ratio_loss": 0.0018734941259026527, "rewards/accuracies": 1.0, "rewards/chosen": -0.004556608386337757, "rewards/margins": 0.5699974298477173, "rewards/rejected": -0.5745540857315063, "sft_loss": 0.04556608200073242, "step": 3516 }, { "epoch": 5.086044830079537, "grad_norm": 0.9562504605078648, "learning_rate": 4.6765893517139775e-07, "logits/chosen": -0.7702620625495911, "logits/rejected": -0.5316519141197205, "logps/chosen": -0.0347968190908432, "logps/rejected": -5.1375651359558105, "loss": 0.031, "odds_ratio_loss": 0.004826520103961229, "rewards/accuracies": 1.0, "rewards/chosen": -0.00347968190908432, "rewards/margins": 0.5102768540382385, "rewards/rejected": -0.5137565732002258, "sft_loss": 0.0347968190908432, "step": 3517 }, { "epoch": 5.087490961677513, "grad_norm": 0.9150585192597411, "learning_rate": 4.662026845924334e-07, "logits/chosen": -0.9138441681861877, "logits/rejected": -0.624638020992279, "logps/chosen": -0.019125226885080338, "logps/rejected": -5.921594142913818, "loss": 0.0292, "odds_ratio_loss": 0.0004167947336100042, "rewards/accuracies": 1.0, "rewards/chosen": -0.00191252282820642, "rewards/margins": 0.5902469158172607, "rewards/rejected": -0.5921594500541687, "sft_loss": 0.019125226885080338, "step": 3518 }, { "epoch": 5.088937093275488, "grad_norm": 0.9595462720990704, "learning_rate": 4.647485645530325e-07, "logits/chosen": -0.6247076988220215, "logits/rejected": -0.4782922565937042, "logps/chosen": -0.024594586342573166, "logps/rejected": -3.9685606956481934, "loss": 0.0443, "odds_ratio_loss": 0.0030578388832509518, "rewards/accuracies": 1.0, "rewards/chosen": -0.002459458541125059, "rewards/margins": 0.3943966031074524, "rewards/rejected": -0.39685600996017456, "sft_loss": 0.024594586342573166, "step": 3519 }, { "epoch": 5.090383224873463, "grad_norm": 1.2927196416567392, "learning_rate": 4.632965759298879e-07, "logits/chosen": -0.9268531799316406, "logits/rejected": -0.7796077728271484, "logps/chosen": -0.11328568309545517, "logps/rejected": -3.251885414123535, "loss": 0.0521, "odds_ratio_loss": 0.05676887556910515, "rewards/accuracies": 0.9375, "rewards/chosen": -0.011328568682074547, "rewards/margins": 0.3138599693775177, "rewards/rejected": -0.325188547372818, "sft_loss": 0.11328568309545517, "step": 3520 }, { "epoch": 5.091829356471439, "grad_norm": 1.1535490074379513, "learning_rate": 4.618467195984106e-07, "logits/chosen": -1.0983455181121826, "logits/rejected": -0.9028714895248413, "logps/chosen": -0.04674210399389267, "logps/rejected": -4.079455375671387, "loss": 0.0532, "odds_ratio_loss": 0.003504629246890545, "rewards/accuracies": 1.0, "rewards/chosen": -0.004674210678786039, "rewards/margins": 0.403271347284317, "rewards/rejected": -0.40794551372528076, "sft_loss": 0.04674210399389267, "step": 3521 }, { "epoch": 5.093275488069414, "grad_norm": 1.0675836746198444, "learning_rate": 4.603989964327235e-07, "logits/chosen": -1.1998631954193115, "logits/rejected": -0.9866847395896912, "logps/chosen": -0.03705126419663429, "logps/rejected": -4.348075866699219, "loss": 0.0375, "odds_ratio_loss": 0.0019462001509964466, "rewards/accuracies": 1.0, "rewards/chosen": -0.003705126466229558, "rewards/margins": 0.431102454662323, "rewards/rejected": -0.43480759859085083, "sft_loss": 0.03705126419663429, "step": 3522 }, { "epoch": 5.09472161966739, "grad_norm": 0.9915894254744622, "learning_rate": 4.58953407305664e-07, "logits/chosen": -1.121100664138794, "logits/rejected": -0.7228765487670898, "logps/chosen": -0.026232311502099037, "logps/rejected": -6.338787078857422, "loss": 0.0325, "odds_ratio_loss": 0.0014822124503552914, "rewards/accuracies": 1.0, "rewards/chosen": -0.0026232311502099037, "rewards/margins": 0.6312555074691772, "rewards/rejected": -0.633878767490387, "sft_loss": 0.026232311502099037, "step": 3523 }, { "epoch": 5.096167751265365, "grad_norm": 1.0533120114898256, "learning_rate": 4.5750995308878336e-07, "logits/chosen": -1.0108604431152344, "logits/rejected": -0.7267636656761169, "logps/chosen": -0.06367263197898865, "logps/rejected": -4.689202308654785, "loss": 0.0441, "odds_ratio_loss": 0.0063222860917449, "rewards/accuracies": 1.0, "rewards/chosen": -0.006367263849824667, "rewards/margins": 0.4625529646873474, "rewards/rejected": -0.46892020106315613, "sft_loss": 0.06367263197898865, "step": 3524 }, { "epoch": 5.097613882863341, "grad_norm": 1.0608449766673291, "learning_rate": 4.560686346523459e-07, "logits/chosen": -0.8225513696670532, "logits/rejected": -0.6037327647209167, "logps/chosen": -0.01524380873888731, "logps/rejected": -5.777705669403076, "loss": 0.0403, "odds_ratio_loss": 0.0009167609387077391, "rewards/accuracies": 1.0, "rewards/chosen": -0.001524380873888731, "rewards/margins": 0.5762461423873901, "rewards/rejected": -0.5777705907821655, "sft_loss": 0.01524380873888731, "step": 3525 }, { "epoch": 5.099060014461316, "grad_norm": 0.801826400322166, "learning_rate": 4.546294528653272e-07, "logits/chosen": -0.8241301774978638, "logits/rejected": -0.6081610918045044, "logps/chosen": -0.017750507220625877, "logps/rejected": -5.126501560211182, "loss": 0.0228, "odds_ratio_loss": 0.00040757720125839114, "rewards/accuracies": 1.0, "rewards/chosen": -0.0017750507686287165, "rewards/margins": 0.5108751058578491, "rewards/rejected": -0.5126501321792603, "sft_loss": 0.017750507220625877, "step": 3526 }, { "epoch": 5.100506146059291, "grad_norm": 1.2157011286624781, "learning_rate": 4.531924085954162e-07, "logits/chosen": -0.6494585275650024, "logits/rejected": -0.5403815507888794, "logps/chosen": -0.00851333886384964, "logps/rejected": -7.799459457397461, "loss": 0.0363, "odds_ratio_loss": 0.00040093838470056653, "rewards/accuracies": 1.0, "rewards/chosen": -0.0008513339562341571, "rewards/margins": 0.7790946960449219, "rewards/rejected": -0.779945969581604, "sft_loss": 0.00851333886384964, "step": 3527 }, { "epoch": 5.101952277657267, "grad_norm": 1.141295091187854, "learning_rate": 4.5175750270901105e-07, "logits/chosen": -0.8939746022224426, "logits/rejected": -0.5252866744995117, "logps/chosen": -0.01969527266919613, "logps/rejected": -7.3741254806518555, "loss": 0.0357, "odds_ratio_loss": 0.0005816838820464909, "rewards/accuracies": 1.0, "rewards/chosen": -0.001969527220353484, "rewards/margins": 0.735443115234375, "rewards/rejected": -0.7374125719070435, "sft_loss": 0.01969527266919613, "step": 3528 }, { "epoch": 5.1033984092552425, "grad_norm": 1.4337925233330169, "learning_rate": 4.5032473607122366e-07, "logits/chosen": -0.78791344165802, "logits/rejected": -0.6291660070419312, "logps/chosen": -0.03807862848043442, "logps/rejected": -4.798315048217773, "loss": 0.0534, "odds_ratio_loss": 0.0023045032285153866, "rewards/accuracies": 1.0, "rewards/chosen": -0.003807862987741828, "rewards/margins": 0.4760235846042633, "rewards/rejected": -0.4798315167427063, "sft_loss": 0.03807862848043442, "step": 3529 }, { "epoch": 5.104844540853217, "grad_norm": 1.0117496069143304, "learning_rate": 4.4889410954587294e-07, "logits/chosen": -0.8519380688667297, "logits/rejected": -0.6767643690109253, "logps/chosen": -0.06508355587720871, "logps/rejected": -5.1234869956970215, "loss": 0.0355, "odds_ratio_loss": 0.007920399308204651, "rewards/accuracies": 1.0, "rewards/chosen": -0.0065083554945886135, "rewards/margins": 0.5058403015136719, "rewards/rejected": -0.5123487114906311, "sft_loss": 0.06508355587720871, "step": 3530 }, { "epoch": 5.106290672451193, "grad_norm": 0.9701742964861159, "learning_rate": 4.4746562399548884e-07, "logits/chosen": -0.9526634812355042, "logits/rejected": -0.6120060086250305, "logps/chosen": -0.0644385814666748, "logps/rejected": -6.016313552856445, "loss": 0.0499, "odds_ratio_loss": 0.002297525992617011, "rewards/accuracies": 1.0, "rewards/chosen": -0.006443857681006193, "rewards/margins": 0.5951874852180481, "rewards/rejected": -0.6016313433647156, "sft_loss": 0.0644385814666748, "step": 3531 }, { "epoch": 5.107736804049169, "grad_norm": 1.1593006104315133, "learning_rate": 4.460392802813118e-07, "logits/chosen": -0.837909996509552, "logits/rejected": -0.5113778710365295, "logps/chosen": -0.06349142640829086, "logps/rejected": -6.190591812133789, "loss": 0.044, "odds_ratio_loss": 0.0006093709962442517, "rewards/accuracies": 1.0, "rewards/chosen": -0.006349142640829086, "rewards/margins": 0.6127101182937622, "rewards/rejected": -0.6190592646598816, "sft_loss": 0.06349142640829086, "step": 3532 }, { "epoch": 5.109182935647144, "grad_norm": 1.134106514295472, "learning_rate": 4.4461507926328813e-07, "logits/chosen": -1.0936976671218872, "logits/rejected": -0.8021297454833984, "logps/chosen": -0.03575780615210533, "logps/rejected": -4.95504093170166, "loss": 0.0414, "odds_ratio_loss": 0.001436459249816835, "rewards/accuracies": 1.0, "rewards/chosen": -0.0035757804289460182, "rewards/margins": 0.4919283390045166, "rewards/rejected": -0.49550408124923706, "sft_loss": 0.03575780615210533, "step": 3533 }, { "epoch": 5.110629067245119, "grad_norm": 1.3570453478023297, "learning_rate": 4.4319302180007544e-07, "logits/chosen": -1.0296474695205688, "logits/rejected": -0.7075653076171875, "logps/chosen": -0.046218566596508026, "logps/rejected": -4.962112903594971, "loss": 0.0569, "odds_ratio_loss": 0.0019982897210866213, "rewards/accuracies": 1.0, "rewards/chosen": -0.00462185638025403, "rewards/margins": 0.4915894865989685, "rewards/rejected": -0.49621132016181946, "sft_loss": 0.046218566596508026, "step": 3534 }, { "epoch": 5.112075198843095, "grad_norm": 1.0757747569105476, "learning_rate": 4.417731087490364e-07, "logits/chosen": -0.9343793392181396, "logits/rejected": -0.631847620010376, "logps/chosen": -0.07113965600728989, "logps/rejected": -4.919393539428711, "loss": 0.0413, "odds_ratio_loss": 0.001878237002529204, "rewards/accuracies": 1.0, "rewards/chosen": -0.007113965693861246, "rewards/margins": 0.48482537269592285, "rewards/rejected": -0.4919393062591553, "sft_loss": 0.07113965600728989, "step": 3535 }, { "epoch": 5.1135213304410705, "grad_norm": 1.112657526372105, "learning_rate": 4.4035534096624303e-07, "logits/chosen": -0.8756897449493408, "logits/rejected": -0.7892172932624817, "logps/chosen": -0.04378293454647064, "logps/rejected": -3.8384764194488525, "loss": 0.0336, "odds_ratio_loss": 0.003223699051886797, "rewards/accuracies": 1.0, "rewards/chosen": -0.004378293640911579, "rewards/margins": 0.3794693648815155, "rewards/rejected": -0.3838476538658142, "sft_loss": 0.04378293454647064, "step": 3536 }, { "epoch": 5.114967462039045, "grad_norm": 1.0098252973514328, "learning_rate": 4.389397193064717e-07, "logits/chosen": -0.8488441705703735, "logits/rejected": -0.7493760585784912, "logps/chosen": -0.01889643445611, "logps/rejected": -4.077645301818848, "loss": 0.0225, "odds_ratio_loss": 0.0024902906734496355, "rewards/accuracies": 1.0, "rewards/chosen": -0.0018896434921771288, "rewards/margins": 0.4058748781681061, "rewards/rejected": -0.4077645540237427, "sft_loss": 0.01889643445611, "step": 3537 }, { "epoch": 5.116413593637021, "grad_norm": 1.1998715837416576, "learning_rate": 4.375262446232066e-07, "logits/chosen": -0.6818573474884033, "logits/rejected": -0.4833465814590454, "logps/chosen": -0.15603819489479065, "logps/rejected": -6.571020603179932, "loss": 0.0731, "odds_ratio_loss": 0.002038759645074606, "rewards/accuracies": 1.0, "rewards/chosen": -0.015603819862008095, "rewards/margins": 0.641498327255249, "rewards/rejected": -0.657102108001709, "sft_loss": 0.15603819489479065, "step": 3538 }, { "epoch": 5.117859725234997, "grad_norm": 1.399862952925075, "learning_rate": 4.36114917768637e-07, "logits/chosen": -0.9014131426811218, "logits/rejected": -0.7489269971847534, "logps/chosen": -0.020976906642317772, "logps/rejected": -4.785622596740723, "loss": 0.0412, "odds_ratio_loss": 0.0008768899133428931, "rewards/accuracies": 1.0, "rewards/chosen": -0.002097690710797906, "rewards/margins": 0.47646456956863403, "rewards/rejected": -0.47856229543685913, "sft_loss": 0.020976906642317772, "step": 3539 }, { "epoch": 5.1193058568329715, "grad_norm": 1.3979035193631253, "learning_rate": 4.3470573959365665e-07, "logits/chosen": -0.9969203472137451, "logits/rejected": -0.704779863357544, "logps/chosen": -0.022899752482771873, "logps/rejected": -3.8194432258605957, "loss": 0.0423, "odds_ratio_loss": 0.0018510606605559587, "rewards/accuracies": 1.0, "rewards/chosen": -0.0022899750620126724, "rewards/margins": 0.3796543478965759, "rewards/rejected": -0.38194429874420166, "sft_loss": 0.022899752482771873, "step": 3540 }, { "epoch": 5.120751988430947, "grad_norm": 1.1232720454542073, "learning_rate": 4.3329871094786383e-07, "logits/chosen": -0.9428566694259644, "logits/rejected": -0.6366056799888611, "logps/chosen": -0.02941741608083248, "logps/rejected": -7.422673225402832, "loss": 0.0437, "odds_ratio_loss": 0.0006806895835325122, "rewards/accuracies": 1.0, "rewards/chosen": -0.002941741608083248, "rewards/margins": 0.7393256425857544, "rewards/rejected": -0.742267370223999, "sft_loss": 0.02941741608083248, "step": 3541 }, { "epoch": 5.122198120028923, "grad_norm": 1.0411339399045239, "learning_rate": 4.318938326795627e-07, "logits/chosen": -0.8791346549987793, "logits/rejected": -0.6777920722961426, "logps/chosen": -0.022412490099668503, "logps/rejected": -5.245683670043945, "loss": 0.0359, "odds_ratio_loss": 0.0014050828758627176, "rewards/accuracies": 1.0, "rewards/chosen": -0.0022412489634007215, "rewards/margins": 0.5223270654678345, "rewards/rejected": -0.5245683193206787, "sft_loss": 0.022412490099668503, "step": 3542 }, { "epoch": 5.123644251626898, "grad_norm": 1.07489111007517, "learning_rate": 4.304911056357583e-07, "logits/chosen": -0.7371894121170044, "logits/rejected": -0.5516063570976257, "logps/chosen": -0.010756559669971466, "logps/rejected": -6.3359551429748535, "loss": 0.0307, "odds_ratio_loss": 0.0009100245079025626, "rewards/accuracies": 1.0, "rewards/chosen": -0.0010756559204310179, "rewards/margins": 0.6325198411941528, "rewards/rejected": -0.6335955262184143, "sft_loss": 0.010756559669971466, "step": 3543 }, { "epoch": 5.125090383224873, "grad_norm": 1.0239088458657568, "learning_rate": 4.290905306621604e-07, "logits/chosen": -0.9055813550949097, "logits/rejected": -0.6601625680923462, "logps/chosen": -0.014378732070326805, "logps/rejected": -6.063301086425781, "loss": 0.0239, "odds_ratio_loss": 0.0005538854748010635, "rewards/accuracies": 1.0, "rewards/chosen": -0.0014378732303157449, "rewards/margins": 0.6048922538757324, "rewards/rejected": -0.606330156326294, "sft_loss": 0.014378732070326805, "step": 3544 }, { "epoch": 5.126536514822849, "grad_norm": 1.2712624476901007, "learning_rate": 4.276921086031815e-07, "logits/chosen": -0.9251024127006531, "logits/rejected": -0.7072352170944214, "logps/chosen": -0.057558659464120865, "logps/rejected": -4.377816677093506, "loss": 0.0389, "odds_ratio_loss": 0.0034858500584959984, "rewards/accuracies": 1.0, "rewards/chosen": -0.005755866877734661, "rewards/margins": 0.4320257604122162, "rewards/rejected": -0.4377816617488861, "sft_loss": 0.057558659464120865, "step": 3545 }, { "epoch": 5.127982646420825, "grad_norm": 0.983345093814679, "learning_rate": 4.2629584030193564e-07, "logits/chosen": -0.8956122994422913, "logits/rejected": -0.6748299598693848, "logps/chosen": -0.07077537477016449, "logps/rejected": -4.568532943725586, "loss": 0.0295, "odds_ratio_loss": 0.000756526249460876, "rewards/accuracies": 1.0, "rewards/chosen": -0.007077537477016449, "rewards/margins": 0.4497757852077484, "rewards/rejected": -0.45685333013534546, "sft_loss": 0.07077537477016449, "step": 3546 }, { "epoch": 5.1294287780187995, "grad_norm": 1.0892353016146112, "learning_rate": 4.2490172660023705e-07, "logits/chosen": -0.8571529984474182, "logits/rejected": -0.6739146709442139, "logps/chosen": -0.014833889901638031, "logps/rejected": -6.780934810638428, "loss": 0.0406, "odds_ratio_loss": 0.0011146971955895424, "rewards/accuracies": 1.0, "rewards/chosen": -0.0014833889435976744, "rewards/margins": 0.6766101121902466, "rewards/rejected": -0.6780935525894165, "sft_loss": 0.014833889901638031, "step": 3547 }, { "epoch": 5.130874909616775, "grad_norm": 1.1610728467235858, "learning_rate": 4.235097683386022e-07, "logits/chosen": -0.913325309753418, "logits/rejected": -0.6378690004348755, "logps/chosen": -0.011191330850124359, "logps/rejected": -4.821349143981934, "loss": 0.0471, "odds_ratio_loss": 0.0005949364276602864, "rewards/accuracies": 1.0, "rewards/chosen": -0.0011191332014277577, "rewards/margins": 0.48101580142974854, "rewards/rejected": -0.4821348786354065, "sft_loss": 0.011191330850124359, "step": 3548 }, { "epoch": 5.132321041214751, "grad_norm": 1.1498882814890075, "learning_rate": 4.2211996635624867e-07, "logits/chosen": -0.8509616255760193, "logits/rejected": -0.7862538695335388, "logps/chosen": -0.027645738795399666, "logps/rejected": -5.407499313354492, "loss": 0.0304, "odds_ratio_loss": 0.0012020114809274673, "rewards/accuracies": 1.0, "rewards/chosen": -0.0027645740192383528, "rewards/margins": 0.5379853844642639, "rewards/rejected": -0.5407499670982361, "sft_loss": 0.027645738795399666, "step": 3549 }, { "epoch": 5.133767172812726, "grad_norm": 1.045508537733952, "learning_rate": 4.207323214910925e-07, "logits/chosen": -0.8098764419555664, "logits/rejected": -0.673413872718811, "logps/chosen": -0.019348647445440292, "logps/rejected": -4.263792037963867, "loss": 0.0281, "odds_ratio_loss": 0.0014103625435382128, "rewards/accuracies": 1.0, "rewards/chosen": -0.0019348645582795143, "rewards/margins": 0.4244443476200104, "rewards/rejected": -0.4263792037963867, "sft_loss": 0.019348647445440292, "step": 3550 }, { "epoch": 5.135213304410701, "grad_norm": 0.9118889895034206, "learning_rate": 4.193468345797511e-07, "logits/chosen": -0.6067082285881042, "logits/rejected": -0.3711710274219513, "logps/chosen": -0.019313864409923553, "logps/rejected": -6.416814804077148, "loss": 0.0337, "odds_ratio_loss": 0.0010191251058131456, "rewards/accuracies": 1.0, "rewards/chosen": -0.0019313863012939692, "rewards/margins": 0.6397501230239868, "rewards/rejected": -0.6416815519332886, "sft_loss": 0.019313864409923553, "step": 3551 }, { "epoch": 5.136659436008677, "grad_norm": 1.3245124770343912, "learning_rate": 4.1796350645753795e-07, "logits/chosen": -0.93780517578125, "logits/rejected": -0.7108378410339355, "logps/chosen": -0.04983704164624214, "logps/rejected": -3.885666608810425, "loss": 0.0479, "odds_ratio_loss": 0.0015096311690285802, "rewards/accuracies": 1.0, "rewards/chosen": -0.004983704537153244, "rewards/margins": 0.3835829496383667, "rewards/rejected": -0.38856664299964905, "sft_loss": 0.04983704164624214, "step": 3552 }, { "epoch": 5.138105567606652, "grad_norm": 1.1490700715634683, "learning_rate": 4.1658233795846833e-07, "logits/chosen": -0.8055789470672607, "logits/rejected": -0.6550903916358948, "logps/chosen": -0.017309710383415222, "logps/rejected": -4.0452656745910645, "loss": 0.0241, "odds_ratio_loss": 0.013794164173305035, "rewards/accuracies": 1.0, "rewards/chosen": -0.0017309710383415222, "rewards/margins": 0.40279555320739746, "rewards/rejected": -0.40452659130096436, "sft_loss": 0.017309710383415222, "step": 3553 }, { "epoch": 5.1395516992046275, "grad_norm": 1.0066328630819037, "learning_rate": 4.152033299152533e-07, "logits/chosen": -1.1695951223373413, "logits/rejected": -0.7323697805404663, "logps/chosen": -0.01908089965581894, "logps/rejected": -6.578709125518799, "loss": 0.0284, "odds_ratio_loss": 0.004583199508488178, "rewards/accuracies": 1.0, "rewards/chosen": -0.001908089965581894, "rewards/margins": 0.6559628844261169, "rewards/rejected": -0.657870888710022, "sft_loss": 0.01908089965581894, "step": 3554 }, { "epoch": 5.140997830802603, "grad_norm": 1.2003316460791233, "learning_rate": 4.138264831593021e-07, "logits/chosen": -0.8967068791389465, "logits/rejected": -0.6410534381866455, "logps/chosen": -0.05355698615312576, "logps/rejected": -5.4609832763671875, "loss": 0.0489, "odds_ratio_loss": 0.045562319457530975, "rewards/accuracies": 0.9375, "rewards/chosen": -0.005355698522180319, "rewards/margins": 0.5407426357269287, "rewards/rejected": -0.5460983514785767, "sft_loss": 0.05355698615312576, "step": 3555 }, { "epoch": 5.142443962400579, "grad_norm": 1.2024952735105818, "learning_rate": 4.1245179852071967e-07, "logits/chosen": -0.9525247812271118, "logits/rejected": -0.7273567914962769, "logps/chosen": -0.0484442338347435, "logps/rejected": -5.277726173400879, "loss": 0.0394, "odds_ratio_loss": 0.004583648405969143, "rewards/accuracies": 1.0, "rewards/chosen": -0.00484442338347435, "rewards/margins": 0.5229281783103943, "rewards/rejected": -0.5277726054191589, "sft_loss": 0.0484442338347435, "step": 3556 }, { "epoch": 5.143890093998554, "grad_norm": 1.248422159406862, "learning_rate": 4.110792768283091e-07, "logits/chosen": -0.9277870059013367, "logits/rejected": -0.5297996401786804, "logps/chosen": -0.03848704323172569, "logps/rejected": -6.8542633056640625, "loss": 0.0542, "odds_ratio_loss": 0.002286599949002266, "rewards/accuracies": 1.0, "rewards/chosen": -0.003848704043775797, "rewards/margins": 0.6815776228904724, "rewards/rejected": -0.6854263544082642, "sft_loss": 0.03848704323172569, "step": 3557 }, { "epoch": 5.145336225596529, "grad_norm": 0.9575974433447748, "learning_rate": 4.0970891890956995e-07, "logits/chosen": -1.039902925491333, "logits/rejected": -0.7400610446929932, "logps/chosen": -0.012508687563240528, "logps/rejected": -6.0867815017700195, "loss": 0.0329, "odds_ratio_loss": 0.0008413865580223501, "rewards/accuracies": 1.0, "rewards/chosen": -0.0012508687796071172, "rewards/margins": 0.6074272990226746, "rewards/rejected": -0.6086782217025757, "sft_loss": 0.012508687563240528, "step": 3558 }, { "epoch": 5.146782357194505, "grad_norm": 1.514298510265316, "learning_rate": 4.0834072559069457e-07, "logits/chosen": -0.8292897939682007, "logits/rejected": -0.5984725952148438, "logps/chosen": -0.034817490726709366, "logps/rejected": -5.678520202636719, "loss": 0.0387, "odds_ratio_loss": 0.006148543208837509, "rewards/accuracies": 1.0, "rewards/chosen": -0.003481748979538679, "rewards/margins": 0.5643702745437622, "rewards/rejected": -0.5678520202636719, "sft_loss": 0.034817490726709366, "step": 3559 }, { "epoch": 5.14822848879248, "grad_norm": 1.236658751957511, "learning_rate": 4.069746976965733e-07, "logits/chosen": -0.9042907953262329, "logits/rejected": -0.6579476594924927, "logps/chosen": -0.05839492008090019, "logps/rejected": -5.013850688934326, "loss": 0.0544, "odds_ratio_loss": 0.0038068797439336777, "rewards/accuracies": 1.0, "rewards/chosen": -0.005839492194354534, "rewards/margins": 0.4955455958843231, "rewards/rejected": -0.5013850927352905, "sft_loss": 0.05839492008090019, "step": 3560 }, { "epoch": 5.1496746203904555, "grad_norm": 1.566794146100296, "learning_rate": 4.0561083605078884e-07, "logits/chosen": -0.7548511028289795, "logits/rejected": -0.6357748508453369, "logps/chosen": -0.06090143322944641, "logps/rejected": -3.7928695678710938, "loss": 0.044, "odds_ratio_loss": 0.009436688385903835, "rewards/accuracies": 1.0, "rewards/chosen": -0.006090143695473671, "rewards/margins": 0.3731968402862549, "rewards/rejected": -0.3792869448661804, "sft_loss": 0.06090143322944641, "step": 3561 }, { "epoch": 5.151120751988431, "grad_norm": 0.815848357915086, "learning_rate": 4.0424914147561794e-07, "logits/chosen": -0.8279675245285034, "logits/rejected": -0.7303451299667358, "logps/chosen": -0.014521737582981586, "logps/rejected": -6.461040019989014, "loss": 0.024, "odds_ratio_loss": 0.0017956249648705125, "rewards/accuracies": 1.0, "rewards/chosen": -0.0014521738048642874, "rewards/margins": 0.6446517705917358, "rewards/rejected": -0.6461040377616882, "sft_loss": 0.014521737582981586, "step": 3562 }, { "epoch": 5.152566883586406, "grad_norm": 1.71662469859775, "learning_rate": 4.028896147920311e-07, "logits/chosen": -0.964228630065918, "logits/rejected": -0.7277406454086304, "logps/chosen": -0.05546468123793602, "logps/rejected": -4.663585186004639, "loss": 0.043, "odds_ratio_loss": 0.0061240424402058125, "rewards/accuracies": 1.0, "rewards/chosen": -0.005546468310058117, "rewards/margins": 0.4608120024204254, "rewards/rejected": -0.466358482837677, "sft_loss": 0.05546468123793602, "step": 3563 }, { "epoch": 5.154013015184382, "grad_norm": 1.041441761654688, "learning_rate": 4.0153225681969305e-07, "logits/chosen": -0.9566247463226318, "logits/rejected": -0.5977626442909241, "logps/chosen": -0.04568367078900337, "logps/rejected": -4.601743698120117, "loss": 0.0402, "odds_ratio_loss": 0.002992587396875024, "rewards/accuracies": 1.0, "rewards/chosen": -0.004568367265164852, "rewards/margins": 0.45560598373413086, "rewards/rejected": -0.4601743519306183, "sft_loss": 0.04568367078900337, "step": 3564 }, { "epoch": 5.155459146782357, "grad_norm": 1.1027106642119917, "learning_rate": 4.0017706837695897e-07, "logits/chosen": -0.8168602585792542, "logits/rejected": -0.6069326400756836, "logps/chosen": -0.019194474443793297, "logps/rejected": -4.3833513259887695, "loss": 0.0617, "odds_ratio_loss": 0.0007897147443145514, "rewards/accuracies": 1.0, "rewards/chosen": -0.001919447211548686, "rewards/margins": 0.4364157021045685, "rewards/rejected": -0.4383351504802704, "sft_loss": 0.019194474443793297, "step": 3565 }, { "epoch": 5.156905278380332, "grad_norm": 1.1202583156584103, "learning_rate": 3.988240502808784e-07, "logits/chosen": -0.8883881568908691, "logits/rejected": -0.6389689445495605, "logps/chosen": -0.11527302116155624, "logps/rejected": -4.131171226501465, "loss": 0.0563, "odds_ratio_loss": 0.004591973032802343, "rewards/accuracies": 1.0, "rewards/chosen": -0.011527301743626595, "rewards/margins": 0.4015898108482361, "rewards/rejected": -0.41311711072921753, "sft_loss": 0.11527302116155624, "step": 3566 }, { "epoch": 5.158351409978308, "grad_norm": 1.0016267585153877, "learning_rate": 3.97473203347189e-07, "logits/chosen": -1.000503420829773, "logits/rejected": -0.7691615223884583, "logps/chosen": -0.02070147544145584, "logps/rejected": -6.152464866638184, "loss": 0.026, "odds_ratio_loss": 0.0008875165949575603, "rewards/accuracies": 1.0, "rewards/chosen": -0.002070147544145584, "rewards/margins": 0.6131762862205505, "rewards/rejected": -0.6152464151382446, "sft_loss": 0.02070147544145584, "step": 3567 }, { "epoch": 5.159797541576284, "grad_norm": 0.7536591073806905, "learning_rate": 3.9612452839032384e-07, "logits/chosen": -0.7870176434516907, "logits/rejected": -0.5479042530059814, "logps/chosen": -0.018979590386152267, "logps/rejected": -5.483916282653809, "loss": 0.0241, "odds_ratio_loss": 0.0007888816762715578, "rewards/accuracies": 1.0, "rewards/chosen": -0.0018979592714458704, "rewards/margins": 0.5464937090873718, "rewards/rejected": -0.5483916401863098, "sft_loss": 0.018979590386152267, "step": 3568 }, { "epoch": 5.161243673174259, "grad_norm": 0.9994488183017014, "learning_rate": 3.9477802622340217e-07, "logits/chosen": -0.8722624778747559, "logits/rejected": -0.7047505974769592, "logps/chosen": -0.030218884348869324, "logps/rejected": -5.5768723487854, "loss": 0.0393, "odds_ratio_loss": 0.0025864059571176767, "rewards/accuracies": 1.0, "rewards/chosen": -0.0030218884348869324, "rewards/margins": 0.5546653270721436, "rewards/rejected": -0.5576872229576111, "sft_loss": 0.030218884348869324, "step": 3569 }, { "epoch": 5.162689804772234, "grad_norm": 1.0533039507181938, "learning_rate": 3.934336976582355e-07, "logits/chosen": -1.0874637365341187, "logits/rejected": -0.7242406010627747, "logps/chosen": -0.022548483684659004, "logps/rejected": -5.560819625854492, "loss": 0.0218, "odds_ratio_loss": 0.0016226425068452954, "rewards/accuracies": 1.0, "rewards/chosen": -0.0022548483684659004, "rewards/margins": 0.5538271069526672, "rewards/rejected": -0.5560819506645203, "sft_loss": 0.022548483684659004, "step": 3570 }, { "epoch": 5.16413593637021, "grad_norm": 1.0401194910356404, "learning_rate": 3.9209154350532535e-07, "logits/chosen": -0.6649314165115356, "logits/rejected": -0.5735057592391968, "logps/chosen": -0.0342964343726635, "logps/rejected": -4.208144664764404, "loss": 0.0313, "odds_ratio_loss": 0.005412380211055279, "rewards/accuracies": 1.0, "rewards/chosen": -0.0034296438097953796, "rewards/margins": 0.4173848330974579, "rewards/rejected": -0.4208144545555115, "sft_loss": 0.0342964343726635, "step": 3571 }, { "epoch": 5.165582067968185, "grad_norm": 1.12759993077361, "learning_rate": 3.9075156457385994e-07, "logits/chosen": -0.8029493093490601, "logits/rejected": -0.6737925410270691, "logps/chosen": -0.03257250413298607, "logps/rejected": -4.218453407287598, "loss": 0.0459, "odds_ratio_loss": 0.0021414184011518955, "rewards/accuracies": 1.0, "rewards/chosen": -0.003257250413298607, "rewards/margins": 0.41858813166618347, "rewards/rejected": -0.42184534668922424, "sft_loss": 0.03257250413298607, "step": 3572 }, { "epoch": 5.16702819956616, "grad_norm": 1.228504842476863, "learning_rate": 3.894137616717197e-07, "logits/chosen": -0.9006476402282715, "logits/rejected": -0.9127538204193115, "logps/chosen": -0.02118997648358345, "logps/rejected": -5.368014335632324, "loss": 0.0517, "odds_ratio_loss": 0.0006409134948626161, "rewards/accuracies": 1.0, "rewards/chosen": -0.0021189977414906025, "rewards/margins": 0.5346824526786804, "rewards/rejected": -0.5368014574050903, "sft_loss": 0.02118997648358345, "step": 3573 }, { "epoch": 5.168474331164136, "grad_norm": 1.081205055296991, "learning_rate": 3.8807813560546876e-07, "logits/chosen": -0.8775331377983093, "logits/rejected": -0.7043969035148621, "logps/chosen": -0.021752668544650078, "logps/rejected": -3.4318575859069824, "loss": 0.0464, "odds_ratio_loss": 0.004134073853492737, "rewards/accuracies": 1.0, "rewards/chosen": -0.0021752668544650078, "rewards/margins": 0.34101051092147827, "rewards/rejected": -0.34318578243255615, "sft_loss": 0.021752668544650078, "step": 3574 }, { "epoch": 5.169920462762112, "grad_norm": 1.0730844091789573, "learning_rate": 3.86744687180363e-07, "logits/chosen": -0.725942850112915, "logits/rejected": -0.6615370512008667, "logps/chosen": -0.022522861137986183, "logps/rejected": -5.256943702697754, "loss": 0.0366, "odds_ratio_loss": 0.002798852976411581, "rewards/accuracies": 1.0, "rewards/chosen": -0.0022522863000631332, "rewards/margins": 0.5234421491622925, "rewards/rejected": -0.5256944298744202, "sft_loss": 0.022522861137986183, "step": 3575 }, { "epoch": 5.171366594360086, "grad_norm": 0.9166018136013868, "learning_rate": 3.8541341720034247e-07, "logits/chosen": -0.8853042721748352, "logits/rejected": -0.6347314715385437, "logps/chosen": -0.03648467734456062, "logps/rejected": -6.572963237762451, "loss": 0.033, "odds_ratio_loss": 0.003740268060937524, "rewards/accuracies": 1.0, "rewards/chosen": -0.003648467594757676, "rewards/margins": 0.6536478400230408, "rewards/rejected": -0.6572964191436768, "sft_loss": 0.03648467734456062, "step": 3576 }, { "epoch": 5.172812725958062, "grad_norm": 0.9377469551604307, "learning_rate": 3.840843264680349e-07, "logits/chosen": -0.8311636447906494, "logits/rejected": -0.6866295337677002, "logps/chosen": -0.06475996226072311, "logps/rejected": -5.250019550323486, "loss": 0.0387, "odds_ratio_loss": 0.005017520859837532, "rewards/accuracies": 1.0, "rewards/chosen": -0.006475997157394886, "rewards/margins": 0.518526017665863, "rewards/rejected": -0.5250020027160645, "sft_loss": 0.06475996226072311, "step": 3577 }, { "epoch": 5.174258857556038, "grad_norm": 1.2834851376400325, "learning_rate": 3.8275741578475306e-07, "logits/chosen": -0.8156872987747192, "logits/rejected": -0.6633107662200928, "logps/chosen": -0.06075110286474228, "logps/rejected": -5.4929046630859375, "loss": 0.074, "odds_ratio_loss": 0.0036945268511772156, "rewards/accuracies": 1.0, "rewards/chosen": -0.006075110752135515, "rewards/margins": 0.5432153940200806, "rewards/rejected": -0.5492904782295227, "sft_loss": 0.06075110286474228, "step": 3578 }, { "epoch": 5.1757049891540134, "grad_norm": 1.1359955274660638, "learning_rate": 3.814326859504984e-07, "logits/chosen": -0.9458404183387756, "logits/rejected": -0.8643139600753784, "logps/chosen": -0.04484132304787636, "logps/rejected": -3.9378392696380615, "loss": 0.0407, "odds_ratio_loss": 0.0018446637550368905, "rewards/accuracies": 1.0, "rewards/chosen": -0.004484132397919893, "rewards/margins": 0.38929980993270874, "rewards/rejected": -0.39378395676612854, "sft_loss": 0.04484132304787636, "step": 3579 }, { "epoch": 5.177151120751988, "grad_norm": 1.1248681812196126, "learning_rate": 3.801101377639533e-07, "logits/chosen": -0.885506272315979, "logits/rejected": -0.6628941893577576, "logps/chosen": -0.05343920737504959, "logps/rejected": -6.036564826965332, "loss": 0.0496, "odds_ratio_loss": 0.0014431718736886978, "rewards/accuracies": 1.0, "rewards/chosen": -0.005343920085579157, "rewards/margins": 0.5983126163482666, "rewards/rejected": -0.603656530380249, "sft_loss": 0.05343920737504959, "step": 3580 }, { "epoch": 5.178597252349964, "grad_norm": 2.1787270917311545, "learning_rate": 3.7878977202248887e-07, "logits/chosen": -1.0105148553848267, "logits/rejected": -0.766710102558136, "logps/chosen": -0.028319966048002243, "logps/rejected": -5.47403621673584, "loss": 0.0372, "odds_ratio_loss": 0.0020666704513132572, "rewards/accuracies": 1.0, "rewards/chosen": -0.002831996651366353, "rewards/margins": 0.5445716381072998, "rewards/rejected": -0.5474036335945129, "sft_loss": 0.028319966048002243, "step": 3581 }, { "epoch": 5.18004338394794, "grad_norm": 1.0220245059137052, "learning_rate": 3.7747158952215716e-07, "logits/chosen": -0.7910235524177551, "logits/rejected": -0.7252139449119568, "logps/chosen": -0.027546580880880356, "logps/rejected": -4.437729835510254, "loss": 0.0296, "odds_ratio_loss": 0.003080027410760522, "rewards/accuracies": 1.0, "rewards/chosen": -0.0027546582277864218, "rewards/margins": 0.44101834297180176, "rewards/rejected": -0.4437730312347412, "sft_loss": 0.027546580880880356, "step": 3582 }, { "epoch": 5.181489515545914, "grad_norm": 1.0665922920451818, "learning_rate": 3.7615559105769633e-07, "logits/chosen": -0.9450021982192993, "logits/rejected": -0.7053030729293823, "logps/chosen": -0.05629965662956238, "logps/rejected": -5.696890354156494, "loss": 0.0455, "odds_ratio_loss": 0.0044265990145504475, "rewards/accuracies": 1.0, "rewards/chosen": -0.00562996556982398, "rewards/margins": 0.5640590786933899, "rewards/rejected": -0.5696890354156494, "sft_loss": 0.05629965662956238, "step": 3583 }, { "epoch": 5.18293564714389, "grad_norm": 1.1682569222454637, "learning_rate": 3.748417774225259e-07, "logits/chosen": -1.0742039680480957, "logits/rejected": -0.7987134456634521, "logps/chosen": -0.023768005892634392, "logps/rejected": -5.302433490753174, "loss": 0.0289, "odds_ratio_loss": 0.0009636090835556388, "rewards/accuracies": 1.0, "rewards/chosen": -0.002376800635829568, "rewards/margins": 0.5278666019439697, "rewards/rejected": -0.5302433967590332, "sft_loss": 0.023768005892634392, "step": 3584 }, { "epoch": 5.184381778741866, "grad_norm": 1.1421657305611848, "learning_rate": 3.7353014940874993e-07, "logits/chosen": -0.9357766509056091, "logits/rejected": -0.648613452911377, "logps/chosen": -0.06362772732973099, "logps/rejected": -6.172553539276123, "loss": 0.047, "odds_ratio_loss": 0.00930915866047144, "rewards/accuracies": 1.0, "rewards/chosen": -0.006362773012369871, "rewards/margins": 0.6108925938606262, "rewards/rejected": -0.6172553300857544, "sft_loss": 0.06362772732973099, "step": 3585 }, { "epoch": 5.185827910339841, "grad_norm": 0.8659853390467411, "learning_rate": 3.722207078071533e-07, "logits/chosen": -1.0251359939575195, "logits/rejected": -0.760617733001709, "logps/chosen": -0.05932139605283737, "logps/rejected": -5.85208797454834, "loss": 0.0315, "odds_ratio_loss": 0.00122246949467808, "rewards/accuracies": 1.0, "rewards/chosen": -0.005932139698415995, "rewards/margins": 0.5792766809463501, "rewards/rejected": -0.5852087736129761, "sft_loss": 0.05932139605283737, "step": 3586 }, { "epoch": 5.187274041937816, "grad_norm": 0.7003907957478049, "learning_rate": 3.7091345340720226e-07, "logits/chosen": -0.8258812427520752, "logits/rejected": -0.7379279136657715, "logps/chosen": -0.006125980988144875, "logps/rejected": -5.172117233276367, "loss": 0.0164, "odds_ratio_loss": 0.0004759537405334413, "rewards/accuracies": 1.0, "rewards/chosen": -0.0006125981453806162, "rewards/margins": 0.516599178314209, "rewards/rejected": -0.5172117352485657, "sft_loss": 0.006125980988144875, "step": 3587 }, { "epoch": 5.188720173535792, "grad_norm": 0.9972301335770085, "learning_rate": 3.696083869970472e-07, "logits/chosen": -0.8752817511558533, "logits/rejected": -0.8511292934417725, "logps/chosen": -0.010707555338740349, "logps/rejected": -6.42230749130249, "loss": 0.0448, "odds_ratio_loss": 0.0006266254931688309, "rewards/accuracies": 1.0, "rewards/chosen": -0.0010707555338740349, "rewards/margins": 0.6411600112915039, "rewards/rejected": -0.642230749130249, "sft_loss": 0.010707555338740349, "step": 3588 }, { "epoch": 5.190166305133767, "grad_norm": 1.0174824776073386, "learning_rate": 3.683055093635161e-07, "logits/chosen": -0.9354603290557861, "logits/rejected": -0.7528461217880249, "logps/chosen": -0.04549839720129967, "logps/rejected": -4.9645256996154785, "loss": 0.0321, "odds_ratio_loss": 0.003526362357661128, "rewards/accuracies": 1.0, "rewards/chosen": -0.004549840465188026, "rewards/margins": 0.4919027090072632, "rewards/rejected": -0.49645254015922546, "sft_loss": 0.04549839720129967, "step": 3589 }, { "epoch": 5.191612436731742, "grad_norm": 1.044001053692902, "learning_rate": 3.670048212921202e-07, "logits/chosen": -0.8328427672386169, "logits/rejected": -0.7321763038635254, "logps/chosen": -0.06193680316209793, "logps/rejected": -5.603020668029785, "loss": 0.0464, "odds_ratio_loss": 0.0021486992482095957, "rewards/accuracies": 1.0, "rewards/chosen": -0.006193680688738823, "rewards/margins": 0.5541083812713623, "rewards/rejected": -0.5603021383285522, "sft_loss": 0.06193680316209793, "step": 3590 }, { "epoch": 5.193058568329718, "grad_norm": 1.0798462980834291, "learning_rate": 3.657063235670468e-07, "logits/chosen": -0.8687400221824646, "logits/rejected": -0.6818861961364746, "logps/chosen": -0.014150983653962612, "logps/rejected": -5.715865135192871, "loss": 0.0237, "odds_ratio_loss": 0.0009401044226251543, "rewards/accuracies": 1.0, "rewards/chosen": -0.0014150984352454543, "rewards/margins": 0.5701714158058167, "rewards/rejected": -0.5715864896774292, "sft_loss": 0.014150983653962612, "step": 3591 }, { "epoch": 5.194504699927694, "grad_norm": 1.15719150666348, "learning_rate": 3.644100169711679e-07, "logits/chosen": -1.0285594463348389, "logits/rejected": -0.725699245929718, "logps/chosen": -0.03213905915617943, "logps/rejected": -4.655622482299805, "loss": 0.0505, "odds_ratio_loss": 0.001043865573592484, "rewards/accuracies": 1.0, "rewards/chosen": -0.0032139059621840715, "rewards/margins": 0.4623483717441559, "rewards/rejected": -0.46556228399276733, "sft_loss": 0.03213905915617943, "step": 3592 }, { "epoch": 5.195950831525669, "grad_norm": 1.2436932447416695, "learning_rate": 3.6311590228602995e-07, "logits/chosen": -0.9513838887214661, "logits/rejected": -0.8044277429580688, "logps/chosen": -0.06614846736192703, "logps/rejected": -3.153280735015869, "loss": 0.0561, "odds_ratio_loss": 0.04593777656555176, "rewards/accuracies": 0.9375, "rewards/chosen": -0.006614846643060446, "rewards/margins": 0.30871322751045227, "rewards/rejected": -0.31532809138298035, "sft_loss": 0.06614846736192703, "step": 3593 }, { "epoch": 5.197396963123644, "grad_norm": 1.119076034023042, "learning_rate": 3.618239802918595e-07, "logits/chosen": -0.8513356447219849, "logits/rejected": -0.6646866202354431, "logps/chosen": -0.05874083191156387, "logps/rejected": -5.149399757385254, "loss": 0.0623, "odds_ratio_loss": 0.0020238799042999744, "rewards/accuracies": 1.0, "rewards/chosen": -0.005874083377420902, "rewards/margins": 0.5090658664703369, "rewards/rejected": -0.5149399638175964, "sft_loss": 0.05874083191156387, "step": 3594 }, { "epoch": 5.19884309472162, "grad_norm": 1.283803578447028, "learning_rate": 3.605342517675609e-07, "logits/chosen": -1.0322444438934326, "logits/rejected": -0.7828861474990845, "logps/chosen": -0.03251064568758011, "logps/rejected": -4.867115497589111, "loss": 0.0395, "odds_ratio_loss": 0.0020458081271499395, "rewards/accuracies": 1.0, "rewards/chosen": -0.003251064568758011, "rewards/margins": 0.48346051573753357, "rewards/rejected": -0.4867115318775177, "sft_loss": 0.03251064568758011, "step": 3595 }, { "epoch": 5.200289226319595, "grad_norm": 1.0396061625618158, "learning_rate": 3.592467174907172e-07, "logits/chosen": -0.6187576651573181, "logits/rejected": -0.5112270712852478, "logps/chosen": -0.022534213960170746, "logps/rejected": -5.44777774810791, "loss": 0.0381, "odds_ratio_loss": 0.002426896011456847, "rewards/accuracies": 1.0, "rewards/chosen": -0.0022534215822815895, "rewards/margins": 0.5425243973731995, "rewards/rejected": -0.5447777509689331, "sft_loss": 0.022534213960170746, "step": 3596 }, { "epoch": 5.2017353579175705, "grad_norm": 0.8656320470579015, "learning_rate": 3.5796137823758653e-07, "logits/chosen": -0.8526214957237244, "logits/rejected": -0.6017423272132874, "logps/chosen": -0.02538936212658882, "logps/rejected": -4.2573161125183105, "loss": 0.028, "odds_ratio_loss": 0.0021512117236852646, "rewards/accuracies": 1.0, "rewards/chosen": -0.002538936212658882, "rewards/margins": 0.42319270968437195, "rewards/rejected": -0.4257315993309021, "sft_loss": 0.02538936212658882, "step": 3597 }, { "epoch": 5.203181489515546, "grad_norm": 1.1406021212993034, "learning_rate": 3.5667823478310545e-07, "logits/chosen": -1.1594440937042236, "logits/rejected": -0.7526164650917053, "logps/chosen": -0.01821194216609001, "logps/rejected": -5.545048236846924, "loss": 0.0433, "odds_ratio_loss": 0.0004932095180265605, "rewards/accuracies": 1.0, "rewards/chosen": -0.0018211941933259368, "rewards/margins": 0.5526836514472961, "rewards/rejected": -0.5545048713684082, "sft_loss": 0.01821194216609001, "step": 3598 }, { "epoch": 5.204627621113521, "grad_norm": 1.1263883171005709, "learning_rate": 3.553972879008862e-07, "logits/chosen": -0.9884305000305176, "logits/rejected": -0.7626382112503052, "logps/chosen": -0.04639644920825958, "logps/rejected": -5.2319722175598145, "loss": 0.036, "odds_ratio_loss": 0.004956886637955904, "rewards/accuracies": 1.0, "rewards/chosen": -0.004639644641429186, "rewards/margins": 0.518557608127594, "rewards/rejected": -0.5231972336769104, "sft_loss": 0.04639644920825958, "step": 3599 }, { "epoch": 5.206073752711497, "grad_norm": 0.9227492998883203, "learning_rate": 3.5411853836321634e-07, "logits/chosen": -0.9538089036941528, "logits/rejected": -0.7468767166137695, "logps/chosen": -0.02350049838423729, "logps/rejected": -4.7276201248168945, "loss": 0.0261, "odds_ratio_loss": 0.0009906530613079667, "rewards/accuracies": 1.0, "rewards/chosen": -0.002350050024688244, "rewards/margins": 0.4704119563102722, "rewards/rejected": -0.47276201844215393, "sft_loss": 0.02350049838423729, "step": 3600 }, { "epoch": 5.207519884309472, "grad_norm": 0.8800865521581338, "learning_rate": 3.528419869410584e-07, "logits/chosen": -0.9478657245635986, "logits/rejected": -0.6646949648857117, "logps/chosen": -0.0317431278526783, "logps/rejected": -4.364032745361328, "loss": 0.026, "odds_ratio_loss": 0.0042336005717515945, "rewards/accuracies": 1.0, "rewards/chosen": -0.0031743128784000874, "rewards/margins": 0.4332289695739746, "rewards/rejected": -0.4364032745361328, "sft_loss": 0.0317431278526783, "step": 3601 }, { "epoch": 5.208966015907448, "grad_norm": 0.6992216686421211, "learning_rate": 3.51567634404049e-07, "logits/chosen": -0.8134947419166565, "logits/rejected": -0.8427659869194031, "logps/chosen": -0.010465124621987343, "logps/rejected": -3.6230411529541016, "loss": 0.0108, "odds_ratio_loss": 0.0008967835456132889, "rewards/accuracies": 1.0, "rewards/chosen": -0.001046512508764863, "rewards/margins": 0.3612575829029083, "rewards/rejected": -0.36230409145355225, "sft_loss": 0.010465124621987343, "step": 3602 }, { "epoch": 5.210412147505423, "grad_norm": 1.2174837052780079, "learning_rate": 3.5029548152050214e-07, "logits/chosen": -0.8519737124443054, "logits/rejected": -0.7378709316253662, "logps/chosen": -0.02950909174978733, "logps/rejected": -4.193276405334473, "loss": 0.0394, "odds_ratio_loss": 0.00271158991381526, "rewards/accuracies": 1.0, "rewards/chosen": -0.0029509093146771193, "rewards/margins": 0.4163767695426941, "rewards/rejected": -0.41932767629623413, "sft_loss": 0.02950909174978733, "step": 3603 }, { "epoch": 5.2118582791033985, "grad_norm": 1.1976004631004957, "learning_rate": 3.490255290574011e-07, "logits/chosen": -0.6854555606842041, "logits/rejected": -0.5996547937393188, "logps/chosen": -0.057780683040618896, "logps/rejected": -5.569599628448486, "loss": 0.0346, "odds_ratio_loss": 0.010212712921202183, "rewards/accuracies": 1.0, "rewards/chosen": -0.005778068210929632, "rewards/margins": 0.5511819124221802, "rewards/rejected": -0.5569599270820618, "sft_loss": 0.057780683040618896, "step": 3604 }, { "epoch": 5.213304410701374, "grad_norm": 0.9238247408528008, "learning_rate": 3.4775777778040774e-07, "logits/chosen": -1.1148799657821655, "logits/rejected": -0.7859690189361572, "logps/chosen": -0.03792307525873184, "logps/rejected": -5.748502731323242, "loss": 0.0281, "odds_ratio_loss": 0.001623988151550293, "rewards/accuracies": 1.0, "rewards/chosen": -0.003792307572439313, "rewards/margins": 0.5710579752922058, "rewards/rejected": -0.5748502612113953, "sft_loss": 0.03792307525873184, "step": 3605 }, { "epoch": 5.214750542299349, "grad_norm": 1.077370433581713, "learning_rate": 3.464922284538514e-07, "logits/chosen": -1.1062169075012207, "logits/rejected": -0.7723128795623779, "logps/chosen": -0.013403604738414288, "logps/rejected": -7.84562873840332, "loss": 0.0353, "odds_ratio_loss": 0.00027603365015238523, "rewards/accuracies": 1.0, "rewards/chosen": -0.0013403603807091713, "rewards/margins": 0.783222496509552, "rewards/rejected": -0.784562885761261, "sft_loss": 0.013403604738414288, "step": 3606 }, { "epoch": 5.216196673897325, "grad_norm": 1.0207445296410194, "learning_rate": 3.4522888184073827e-07, "logits/chosen": -0.8922250270843506, "logits/rejected": -0.7089736461639404, "logps/chosen": -0.02101828157901764, "logps/rejected": -5.1013641357421875, "loss": 0.0418, "odds_ratio_loss": 0.002262045629322529, "rewards/accuracies": 1.0, "rewards/chosen": -0.002101828111335635, "rewards/margins": 0.5080346465110779, "rewards/rejected": -0.5101364850997925, "sft_loss": 0.02101828157901764, "step": 3607 }, { "epoch": 5.2176428054953, "grad_norm": 0.9616877093511352, "learning_rate": 3.439677387027444e-07, "logits/chosen": -0.8647952675819397, "logits/rejected": -0.7060336470603943, "logps/chosen": -0.03401113301515579, "logps/rejected": -4.374388694763184, "loss": 0.0328, "odds_ratio_loss": 0.002195565262809396, "rewards/accuracies": 1.0, "rewards/chosen": -0.0034011139068752527, "rewards/margins": 0.4340377449989319, "rewards/rejected": -0.4374389052391052, "sft_loss": 0.03401113301515579, "step": 3608 }, { "epoch": 5.219088937093275, "grad_norm": 1.5959179216953558, "learning_rate": 3.427087998002172e-07, "logits/chosen": -0.7751748561859131, "logits/rejected": -0.7734540104866028, "logps/chosen": -0.02873246558010578, "logps/rejected": -4.718321800231934, "loss": 0.0603, "odds_ratio_loss": 0.001208885689266026, "rewards/accuracies": 1.0, "rewards/chosen": -0.0028732463251799345, "rewards/margins": 0.46895891427993774, "rewards/rejected": -0.47183215618133545, "sft_loss": 0.02873246558010578, "step": 3609 }, { "epoch": 5.220535068691251, "grad_norm": 1.0541829578203996, "learning_rate": 3.4145206589217515e-07, "logits/chosen": -0.8666414022445679, "logits/rejected": -0.6371941566467285, "logps/chosen": -0.010170397348701954, "logps/rejected": -6.458046913146973, "loss": 0.0202, "odds_ratio_loss": 0.00042813006439246237, "rewards/accuracies": 1.0, "rewards/chosen": -0.0010170398745685816, "rewards/margins": 0.6447876691818237, "rewards/rejected": -0.6458047032356262, "sft_loss": 0.010170397348701954, "step": 3610 }, { "epoch": 5.2219812002892265, "grad_norm": 1.146404624092723, "learning_rate": 3.401975377363082e-07, "logits/chosen": -0.8071302175521851, "logits/rejected": -0.5387327075004578, "logps/chosen": -0.012531624175608158, "logps/rejected": -6.042186737060547, "loss": 0.0357, "odds_ratio_loss": 0.0007533429889008403, "rewards/accuracies": 1.0, "rewards/chosen": -0.0012531625106930733, "rewards/margins": 0.6029655933380127, "rewards/rejected": -0.6042186617851257, "sft_loss": 0.012531624175608158, "step": 3611 }, { "epoch": 5.223427331887201, "grad_norm": 1.0744901126786583, "learning_rate": 3.3894521608897765e-07, "logits/chosen": -0.7540593147277832, "logits/rejected": -0.7037791609764099, "logps/chosen": -0.059880174696445465, "logps/rejected": -4.644082069396973, "loss": 0.0566, "odds_ratio_loss": 0.0015735100023448467, "rewards/accuracies": 1.0, "rewards/chosen": -0.005988018121570349, "rewards/margins": 0.4584202468395233, "rewards/rejected": -0.4644082486629486, "sft_loss": 0.059880174696445465, "step": 3612 }, { "epoch": 5.224873463485177, "grad_norm": 0.9823145699084468, "learning_rate": 3.376951017052101e-07, "logits/chosen": -0.85802161693573, "logits/rejected": -0.617484986782074, "logps/chosen": -0.03595628961920738, "logps/rejected": -6.122747421264648, "loss": 0.0285, "odds_ratio_loss": 0.0011885353596881032, "rewards/accuracies": 1.0, "rewards/chosen": -0.0035956294741481543, "rewards/margins": 0.6086791753768921, "rewards/rejected": -0.6122748255729675, "sft_loss": 0.03595628961920738, "step": 3613 }, { "epoch": 5.226319595083153, "grad_norm": 1.017079576772903, "learning_rate": 3.364471953387067e-07, "logits/chosen": -0.7847793102264404, "logits/rejected": -0.6176234483718872, "logps/chosen": -0.008673002012073994, "logps/rejected": -7.5913262367248535, "loss": 0.0459, "odds_ratio_loss": 0.0004855759325437248, "rewards/accuracies": 1.0, "rewards/chosen": -0.0008673002012073994, "rewards/margins": 0.7582653760910034, "rewards/rejected": -0.7591326236724854, "sft_loss": 0.008673002012073994, "step": 3614 }, { "epoch": 5.227765726681128, "grad_norm": 0.9512168280789924, "learning_rate": 3.3520149774183406e-07, "logits/chosen": -0.9897427558898926, "logits/rejected": -0.7149899005889893, "logps/chosen": -0.02276892587542534, "logps/rejected": -6.676823616027832, "loss": 0.0346, "odds_ratio_loss": 0.0011359690688550472, "rewards/accuracies": 1.0, "rewards/chosen": -0.002276892773807049, "rewards/margins": 0.6654054522514343, "rewards/rejected": -0.6676823496818542, "sft_loss": 0.02276892587542534, "step": 3615 }, { "epoch": 5.229211858279103, "grad_norm": 0.9427603088962667, "learning_rate": 3.339580096656269e-07, "logits/chosen": -0.9399840235710144, "logits/rejected": -0.733213484287262, "logps/chosen": -0.03748312592506409, "logps/rejected": -3.965913772583008, "loss": 0.0268, "odds_ratio_loss": 0.0014005664270371199, "rewards/accuracies": 1.0, "rewards/chosen": -0.0037483125925064087, "rewards/margins": 0.3928430676460266, "rewards/rejected": -0.3965914249420166, "sft_loss": 0.03748312592506409, "step": 3616 }, { "epoch": 5.230657989877079, "grad_norm": 1.1554821711969416, "learning_rate": 3.327167318597892e-07, "logits/chosen": -0.9230865836143494, "logits/rejected": -0.6254661679267883, "logps/chosen": -0.03255566582083702, "logps/rejected": -6.558465480804443, "loss": 0.0366, "odds_ratio_loss": 0.0009877877309918404, "rewards/accuracies": 1.0, "rewards/chosen": -0.0032555670477449894, "rewards/margins": 0.6525909900665283, "rewards/rejected": -0.6558465957641602, "sft_loss": 0.03255566582083702, "step": 3617 }, { "epoch": 5.2321041214750545, "grad_norm": 0.9584221143268201, "learning_rate": 3.3147766507269295e-07, "logits/chosen": -0.9985564947128296, "logits/rejected": -0.804498553276062, "logps/chosen": -0.05001387745141983, "logps/rejected": -4.83536434173584, "loss": 0.0304, "odds_ratio_loss": 0.0040518310852348804, "rewards/accuracies": 1.0, "rewards/chosen": -0.005001388024538755, "rewards/margins": 0.4785350561141968, "rewards/rejected": -0.4835364520549774, "sft_loss": 0.05001387745141983, "step": 3618 }, { "epoch": 5.233550253073029, "grad_norm": 1.2707849962144895, "learning_rate": 3.3024081005137514e-07, "logits/chosen": -0.9007452726364136, "logits/rejected": -0.6459848284721375, "logps/chosen": -0.014544611796736717, "logps/rejected": -6.230293273925781, "loss": 0.0342, "odds_ratio_loss": 0.0008624520851299167, "rewards/accuracies": 1.0, "rewards/chosen": -0.0014544612495228648, "rewards/margins": 0.621574878692627, "rewards/rejected": -0.623029351234436, "sft_loss": 0.014544611796736717, "step": 3619 }, { "epoch": 5.234996384671005, "grad_norm": 0.9682404550627938, "learning_rate": 3.2900616754154075e-07, "logits/chosen": -0.8395979404449463, "logits/rejected": -0.6162151098251343, "logps/chosen": -0.08151617646217346, "logps/rejected": -5.730838775634766, "loss": 0.0429, "odds_ratio_loss": 0.0006240714574232697, "rewards/accuracies": 1.0, "rewards/chosen": -0.008151616901159286, "rewards/margins": 0.5649322271347046, "rewards/rejected": -0.5730838775634766, "sft_loss": 0.08151617646217346, "step": 3620 }, { "epoch": 5.236442516268981, "grad_norm": 1.2687445581896963, "learning_rate": 3.277737382875596e-07, "logits/chosen": -0.9045553803443909, "logits/rejected": -0.7344595193862915, "logps/chosen": -0.058114439249038696, "logps/rejected": -5.224374771118164, "loss": 0.0321, "odds_ratio_loss": 0.0007322908495552838, "rewards/accuracies": 1.0, "rewards/chosen": -0.005811444483697414, "rewards/margins": 0.5166260004043579, "rewards/rejected": -0.5224374532699585, "sft_loss": 0.058114439249038696, "step": 3621 }, { "epoch": 5.2378886478669555, "grad_norm": 1.3522088419469593, "learning_rate": 3.2654352303246935e-07, "logits/chosen": -0.895087718963623, "logits/rejected": -0.734992265701294, "logps/chosen": -0.04217066988348961, "logps/rejected": -4.8282928466796875, "loss": 0.0498, "odds_ratio_loss": 0.0015827922616153955, "rewards/accuracies": 1.0, "rewards/chosen": -0.004217067267745733, "rewards/margins": 0.4786122441291809, "rewards/rejected": -0.4828292727470398, "sft_loss": 0.04217066988348961, "step": 3622 }, { "epoch": 5.239334779464931, "grad_norm": 1.0023821915021813, "learning_rate": 3.2531552251797045e-07, "logits/chosen": -0.8689554333686829, "logits/rejected": -0.6638770699501038, "logps/chosen": -0.0453006774187088, "logps/rejected": -6.682604789733887, "loss": 0.0355, "odds_ratio_loss": 0.00197001826018095, "rewards/accuracies": 1.0, "rewards/chosen": -0.004530067555606365, "rewards/margins": 0.6637303829193115, "rewards/rejected": -0.6682605147361755, "sft_loss": 0.0453006774187088, "step": 3623 }, { "epoch": 5.240780911062907, "grad_norm": 1.1418476339361996, "learning_rate": 3.2408973748442803e-07, "logits/chosen": -0.880037248134613, "logits/rejected": -0.5957615375518799, "logps/chosen": -0.060221537947654724, "logps/rejected": -7.038527488708496, "loss": 0.0551, "odds_ratio_loss": 0.0034473263658583164, "rewards/accuracies": 1.0, "rewards/chosen": -0.006022154353559017, "rewards/margins": 0.6978305578231812, "rewards/rejected": -0.7038527727127075, "sft_loss": 0.060221537947654724, "step": 3624 }, { "epoch": 5.242227042660883, "grad_norm": 1.073359967589388, "learning_rate": 3.2286616867087445e-07, "logits/chosen": -0.9191368818283081, "logits/rejected": -0.6997087001800537, "logps/chosen": -0.016141196712851524, "logps/rejected": -4.670000076293945, "loss": 0.03, "odds_ratio_loss": 0.0009297472424805164, "rewards/accuracies": 1.0, "rewards/chosen": -0.0016141196247190237, "rewards/margins": 0.46538591384887695, "rewards/rejected": -0.4670000374317169, "sft_loss": 0.016141196712851524, "step": 3625 }, { "epoch": 5.243673174258857, "grad_norm": 0.9690415562551309, "learning_rate": 3.216448168150019e-07, "logits/chosen": -0.9207147359848022, "logits/rejected": -0.6570227742195129, "logps/chosen": -0.03261332958936691, "logps/rejected": -7.021213531494141, "loss": 0.0324, "odds_ratio_loss": 0.003072987077757716, "rewards/accuracies": 1.0, "rewards/chosen": -0.003261332865804434, "rewards/margins": 0.6988599896430969, "rewards/rejected": -0.702121376991272, "sft_loss": 0.03261332958936691, "step": 3626 }, { "epoch": 5.245119305856833, "grad_norm": 0.9459142870812083, "learning_rate": 3.2042568265316974e-07, "logits/chosen": -0.8446757793426514, "logits/rejected": -0.6071657538414001, "logps/chosen": -0.04911842197179794, "logps/rejected": -6.459288597106934, "loss": 0.0422, "odds_ratio_loss": 0.0015342968981713057, "rewards/accuracies": 1.0, "rewards/chosen": -0.004911842290312052, "rewards/margins": 0.6410170197486877, "rewards/rejected": -0.6459288597106934, "sft_loss": 0.04911842197179794, "step": 3627 }, { "epoch": 5.246565437454809, "grad_norm": 1.0470928306134115, "learning_rate": 3.192087669203971e-07, "logits/chosen": -1.0162910223007202, "logits/rejected": -0.8110986948013306, "logps/chosen": -0.03871820867061615, "logps/rejected": -4.199558258056641, "loss": 0.0339, "odds_ratio_loss": 0.0035957153886556625, "rewards/accuracies": 1.0, "rewards/chosen": -0.003871820867061615, "rewards/margins": 0.4160839915275574, "rewards/rejected": -0.4199557900428772, "sft_loss": 0.03871820867061615, "step": 3628 }, { "epoch": 5.2480115690527835, "grad_norm": 1.2074291502339676, "learning_rate": 3.179940703503683e-07, "logits/chosen": -0.9160524606704712, "logits/rejected": -0.7021023035049438, "logps/chosen": -0.08635597676038742, "logps/rejected": -4.336616039276123, "loss": 0.0801, "odds_ratio_loss": 0.004517072346061468, "rewards/accuracies": 1.0, "rewards/chosen": -0.008635598234832287, "rewards/margins": 0.425025999546051, "rewards/rejected": -0.4336616098880768, "sft_loss": 0.08635597676038742, "step": 3629 }, { "epoch": 5.249457700650759, "grad_norm": 1.0665810428402844, "learning_rate": 3.167815936754272e-07, "logits/chosen": -0.9472986459732056, "logits/rejected": -0.7375108003616333, "logps/chosen": -0.04879705607891083, "logps/rejected": -4.624138832092285, "loss": 0.037, "odds_ratio_loss": 0.002154088346287608, "rewards/accuracies": 1.0, "rewards/chosen": -0.0048797051422297955, "rewards/margins": 0.45753422379493713, "rewards/rejected": -0.4624139070510864, "sft_loss": 0.04879705607891083, "step": 3630 }, { "epoch": 5.250903832248735, "grad_norm": 0.9773223496389528, "learning_rate": 3.155713376265816e-07, "logits/chosen": -0.8467633724212646, "logits/rejected": -0.5423392653465271, "logps/chosen": -0.018480662256479263, "logps/rejected": -5.556922912597656, "loss": 0.031, "odds_ratio_loss": 0.0009833640651777387, "rewards/accuracies": 1.0, "rewards/chosen": -0.0018480663420632482, "rewards/margins": 0.5538442730903625, "rewards/rejected": -0.5556923151016235, "sft_loss": 0.018480662256479263, "step": 3631 }, { "epoch": 5.25234996384671, "grad_norm": 1.0079363653872178, "learning_rate": 3.143633029334989e-07, "logits/chosen": -0.7981322407722473, "logits/rejected": -0.7674664258956909, "logps/chosen": -0.03736051917076111, "logps/rejected": -4.720569610595703, "loss": 0.0279, "odds_ratio_loss": 0.0012767021544277668, "rewards/accuracies": 1.0, "rewards/chosen": -0.0037360521964728832, "rewards/margins": 0.46832096576690674, "rewards/rejected": -0.4720570147037506, "sft_loss": 0.03736051917076111, "step": 3632 }, { "epoch": 5.253796095444685, "grad_norm": 1.768817806827234, "learning_rate": 3.131574903245071e-07, "logits/chosen": -1.0161972045898438, "logits/rejected": -0.7614036798477173, "logps/chosen": -0.10368499159812927, "logps/rejected": -4.6447930335998535, "loss": 0.0543, "odds_ratio_loss": 0.008217780850827694, "rewards/accuracies": 1.0, "rewards/chosen": -0.010368499904870987, "rewards/margins": 0.4541108310222626, "rewards/rejected": -0.4644792973995209, "sft_loss": 0.10368499159812927, "step": 3633 }, { "epoch": 5.255242227042661, "grad_norm": 1.276381303497112, "learning_rate": 3.119539005265954e-07, "logits/chosen": -1.0342979431152344, "logits/rejected": -0.7442997694015503, "logps/chosen": -0.01700776070356369, "logps/rejected": -5.295718193054199, "loss": 0.0303, "odds_ratio_loss": 0.0007272001821547747, "rewards/accuracies": 1.0, "rewards/chosen": -0.0017007759306579828, "rewards/margins": 0.5278711318969727, "rewards/rejected": -0.5295718908309937, "sft_loss": 0.01700776070356369, "step": 3634 }, { "epoch": 5.256688358640636, "grad_norm": 1.1448473614666281, "learning_rate": 3.1075253426541357e-07, "logits/chosen": -1.0042964220046997, "logits/rejected": -0.7400972247123718, "logps/chosen": -0.024408066645264626, "logps/rejected": -7.579883575439453, "loss": 0.0317, "odds_ratio_loss": 0.00141650321893394, "rewards/accuracies": 1.0, "rewards/chosen": -0.0024408064782619476, "rewards/margins": 0.7555475831031799, "rewards/rejected": -0.7579883337020874, "sft_loss": 0.024408066645264626, "step": 3635 }, { "epoch": 5.258134490238612, "grad_norm": 1.3010295902650386, "learning_rate": 3.095533922652684e-07, "logits/chosen": -0.7892809510231018, "logits/rejected": -0.6047863364219666, "logps/chosen": -0.015120243653655052, "logps/rejected": -6.354447364807129, "loss": 0.0472, "odds_ratio_loss": 0.00048150643124245107, "rewards/accuracies": 1.0, "rewards/chosen": -0.0015120243187993765, "rewards/margins": 0.6339326500892639, "rewards/rejected": -0.635444700717926, "sft_loss": 0.015120243653655052, "step": 3636 }, { "epoch": 5.259580621836587, "grad_norm": 1.068873809854963, "learning_rate": 3.0835647524912744e-07, "logits/chosen": -0.9303781986236572, "logits/rejected": -0.7163772583007812, "logps/chosen": -0.04354140907526016, "logps/rejected": -5.129241943359375, "loss": 0.029, "odds_ratio_loss": 0.0013979937648400664, "rewards/accuracies": 1.0, "rewards/chosen": -0.004354140721261501, "rewards/margins": 0.5085700750350952, "rewards/rejected": -0.5129241943359375, "sft_loss": 0.04354140907526016, "step": 3637 }, { "epoch": 5.261026753434563, "grad_norm": 0.8962780000482491, "learning_rate": 3.071617839386178e-07, "logits/chosen": -0.8089204430580139, "logits/rejected": -0.4361693561077118, "logps/chosen": -0.04192004352807999, "logps/rejected": -4.811159610748291, "loss": 0.0228, "odds_ratio_loss": 0.0021075494587421417, "rewards/accuracies": 1.0, "rewards/chosen": -0.004192003980278969, "rewards/margins": 0.47692394256591797, "rewards/rejected": -0.48111599683761597, "sft_loss": 0.04192004352807999, "step": 3638 }, { "epoch": 5.262472885032538, "grad_norm": 0.8829913670211695, "learning_rate": 3.05969319054022e-07, "logits/chosen": -0.8424152135848999, "logits/rejected": -0.5214813351631165, "logps/chosen": -0.05059170350432396, "logps/rejected": -3.3692402839660645, "loss": 0.0291, "odds_ratio_loss": 0.005258472170680761, "rewards/accuracies": 1.0, "rewards/chosen": -0.005059171002358198, "rewards/margins": 0.3318648636341095, "rewards/rejected": -0.3369240164756775, "sft_loss": 0.05059170350432396, "step": 3639 }, { "epoch": 5.263919016630513, "grad_norm": 1.1489498228376966, "learning_rate": 3.047790813142819e-07, "logits/chosen": -0.8711774349212646, "logits/rejected": -0.6267974972724915, "logps/chosen": -0.02280341647565365, "logps/rejected": -6.154178619384766, "loss": 0.0389, "odds_ratio_loss": 0.0012691746233031154, "rewards/accuracies": 1.0, "rewards/chosen": -0.0022803416941314936, "rewards/margins": 0.6131375432014465, "rewards/rejected": -0.6154178380966187, "sft_loss": 0.02280341647565365, "step": 3640 }, { "epoch": 5.265365148228489, "grad_norm": 0.9965937815243027, "learning_rate": 3.0359107143699536e-07, "logits/chosen": -0.7954994440078735, "logits/rejected": -0.726338803768158, "logps/chosen": -0.027501408010721207, "logps/rejected": -5.424743175506592, "loss": 0.0283, "odds_ratio_loss": 0.006880844011902809, "rewards/accuracies": 1.0, "rewards/chosen": -0.002750141080468893, "rewards/margins": 0.5397241711616516, "rewards/rejected": -0.5424743294715881, "sft_loss": 0.027501408010721207, "step": 3641 }, { "epoch": 5.266811279826464, "grad_norm": 1.1401410273538075, "learning_rate": 3.024052901384193e-07, "logits/chosen": -0.8828942179679871, "logits/rejected": -0.6594923734664917, "logps/chosen": -0.08100616931915283, "logps/rejected": -4.891511917114258, "loss": 0.0521, "odds_ratio_loss": 0.0096984738484025, "rewards/accuracies": 1.0, "rewards/chosen": -0.008100616745650768, "rewards/margins": 0.481050580739975, "rewards/rejected": -0.4891512393951416, "sft_loss": 0.08100616931915283, "step": 3642 }, { "epoch": 5.26825741142444, "grad_norm": 1.0742692127897753, "learning_rate": 3.0122173813346454e-07, "logits/chosen": -0.662639856338501, "logits/rejected": -0.5347196459770203, "logps/chosen": -0.035757362842559814, "logps/rejected": -5.963934421539307, "loss": 0.0396, "odds_ratio_loss": 0.0011856274213641882, "rewards/accuracies": 1.0, "rewards/chosen": -0.0035757366567850113, "rewards/margins": 0.5928177237510681, "rewards/rejected": -0.5963934659957886, "sft_loss": 0.035757362842559814, "step": 3643 }, { "epoch": 5.269703543022415, "grad_norm": 1.0375189083630167, "learning_rate": 3.000404161357002e-07, "logits/chosen": -0.9065849184989929, "logits/rejected": -0.7200076580047607, "logps/chosen": -0.02265940047800541, "logps/rejected": -5.422698020935059, "loss": 0.0291, "odds_ratio_loss": 0.0008585632895119488, "rewards/accuracies": 1.0, "rewards/chosen": -0.0022659399546682835, "rewards/margins": 0.5400038957595825, "rewards/rejected": -0.5422698259353638, "sft_loss": 0.02265940047800541, "step": 3644 }, { "epoch": 5.27114967462039, "grad_norm": 1.1315767384118531, "learning_rate": 2.988613248573486e-07, "logits/chosen": -0.824489951133728, "logits/rejected": -0.6248018741607666, "logps/chosen": -0.058798667043447495, "logps/rejected": -5.063660621643066, "loss": 0.0388, "odds_ratio_loss": 0.04671594500541687, "rewards/accuracies": 0.9375, "rewards/chosen": -0.005879866890609264, "rewards/margins": 0.5004861950874329, "rewards/rejected": -0.5063660740852356, "sft_loss": 0.058798667043447495, "step": 3645 }, { "epoch": 5.272595806218366, "grad_norm": 1.2204605587673572, "learning_rate": 2.9768446500928915e-07, "logits/chosen": -1.0131525993347168, "logits/rejected": -0.9568882584571838, "logps/chosen": -0.04865047708153725, "logps/rejected": -5.075432777404785, "loss": 0.0305, "odds_ratio_loss": 0.0028830140363425016, "rewards/accuracies": 1.0, "rewards/chosen": -0.004865047987550497, "rewards/margins": 0.5026782751083374, "rewards/rejected": -0.5075433254241943, "sft_loss": 0.04865047708153725, "step": 3646 }, { "epoch": 5.2740419378163415, "grad_norm": 0.9682725426804464, "learning_rate": 2.9650983730105503e-07, "logits/chosen": -0.8504335880279541, "logits/rejected": -0.846091091632843, "logps/chosen": -0.029791761189699173, "logps/rejected": -4.313238143920898, "loss": 0.0311, "odds_ratio_loss": 0.0022929785773158073, "rewards/accuracies": 1.0, "rewards/chosen": -0.0029791758861392736, "rewards/margins": 0.4283446669578552, "rewards/rejected": -0.4313238561153412, "sft_loss": 0.029791761189699173, "step": 3647 }, { "epoch": 5.275488069414317, "grad_norm": 1.0233502761452942, "learning_rate": 2.953374424408328e-07, "logits/chosen": -0.6208338737487793, "logits/rejected": -0.6304566860198975, "logps/chosen": -0.027606675401329994, "logps/rejected": -4.565278053283691, "loss": 0.0385, "odds_ratio_loss": 0.0008353454759344459, "rewards/accuracies": 1.0, "rewards/chosen": -0.0027606673538684845, "rewards/margins": 0.4537671208381653, "rewards/rejected": -0.45652779936790466, "sft_loss": 0.027606675401329994, "step": 3648 }, { "epoch": 5.276934201012292, "grad_norm": 0.9556294865840138, "learning_rate": 2.9416728113546363e-07, "logits/chosen": -0.974734902381897, "logits/rejected": -0.6098443865776062, "logps/chosen": -0.039620328694581985, "logps/rejected": -7.678589820861816, "loss": 0.0374, "odds_ratio_loss": 0.0007251370116136968, "rewards/accuracies": 1.0, "rewards/chosen": -0.003962032496929169, "rewards/margins": 0.7638970017433167, "rewards/rejected": -0.7678591012954712, "sft_loss": 0.039620328694581985, "step": 3649 }, { "epoch": 5.278380332610268, "grad_norm": 1.1251246650139406, "learning_rate": 2.929993540904436e-07, "logits/chosen": -0.9035037755966187, "logits/rejected": -0.7772727608680725, "logps/chosen": -0.020724035799503326, "logps/rejected": -4.813797473907471, "loss": 0.0343, "odds_ratio_loss": 0.0009286232525482774, "rewards/accuracies": 1.0, "rewards/chosen": -0.00207240367308259, "rewards/margins": 0.4793073534965515, "rewards/rejected": -0.4813797175884247, "sft_loss": 0.020724035799503326, "step": 3650 }, { "epoch": 5.279826464208243, "grad_norm": 0.8900567501477809, "learning_rate": 2.918336620099184e-07, "logits/chosen": -0.7675405740737915, "logits/rejected": -0.6586223840713501, "logps/chosen": -0.01650574989616871, "logps/rejected": -6.721004009246826, "loss": 0.0315, "odds_ratio_loss": 0.0005637137801386416, "rewards/accuracies": 1.0, "rewards/chosen": -0.0016505750827491283, "rewards/margins": 0.6704498529434204, "rewards/rejected": -0.6721004247665405, "sft_loss": 0.01650574989616871, "step": 3651 }, { "epoch": 5.281272595806218, "grad_norm": 1.1633737173097518, "learning_rate": 2.9067020559668945e-07, "logits/chosen": -0.8274843692779541, "logits/rejected": -0.6375514268875122, "logps/chosen": -0.053619079291820526, "logps/rejected": -6.285444259643555, "loss": 0.0341, "odds_ratio_loss": 0.003317378694191575, "rewards/accuracies": 1.0, "rewards/chosen": -0.0053619081154465675, "rewards/margins": 0.6231825351715088, "rewards/rejected": -0.6285443902015686, "sft_loss": 0.053619079291820526, "step": 3652 }, { "epoch": 5.282718727404194, "grad_norm": 1.157606261307862, "learning_rate": 2.895089855522088e-07, "logits/chosen": -0.8747952580451965, "logits/rejected": -0.7444599866867065, "logps/chosen": -0.04207966476678848, "logps/rejected": -5.985525131225586, "loss": 0.0419, "odds_ratio_loss": 0.004523225128650665, "rewards/accuracies": 1.0, "rewards/chosen": -0.004207966383546591, "rewards/margins": 0.5943444967269897, "rewards/rejected": -0.5985524654388428, "sft_loss": 0.04207966476678848, "step": 3653 }, { "epoch": 5.2841648590021695, "grad_norm": 1.4338523494202897, "learning_rate": 2.8835000257658016e-07, "logits/chosen": -0.9471051692962646, "logits/rejected": -0.5582077503204346, "logps/chosen": -0.01802823692560196, "logps/rejected": -4.929285526275635, "loss": 0.0317, "odds_ratio_loss": 0.0007497454062104225, "rewards/accuracies": 1.0, "rewards/chosen": -0.0018028237391263247, "rewards/margins": 0.49112576246261597, "rewards/rejected": -0.49292856454849243, "sft_loss": 0.01802823692560196, "step": 3654 }, { "epoch": 5.285610990600144, "grad_norm": 1.2172564226035754, "learning_rate": 2.8719325736855873e-07, "logits/chosen": -0.8020838499069214, "logits/rejected": -0.7115020155906677, "logps/chosen": -0.04903501272201538, "logps/rejected": -4.307126045227051, "loss": 0.0442, "odds_ratio_loss": 0.0023487398866564035, "rewards/accuracies": 1.0, "rewards/chosen": -0.0049035013653337955, "rewards/margins": 0.4258091449737549, "rewards/rejected": -0.43071264028549194, "sft_loss": 0.04903501272201538, "step": 3655 }, { "epoch": 5.28705712219812, "grad_norm": 1.2232310635801635, "learning_rate": 2.860387506255497e-07, "logits/chosen": -0.7558038234710693, "logits/rejected": -0.5430575609207153, "logps/chosen": -0.031046954914927483, "logps/rejected": -6.496105194091797, "loss": 0.0503, "odds_ratio_loss": 0.0024577996227890253, "rewards/accuracies": 1.0, "rewards/chosen": -0.003104696050286293, "rewards/margins": 0.6465058326721191, "rewards/rejected": -0.6496106386184692, "sft_loss": 0.031046954914927483, "step": 3656 }, { "epoch": 5.288503253796096, "grad_norm": 1.2243215854146698, "learning_rate": 2.848864830436111e-07, "logits/chosen": -0.9211279153823853, "logits/rejected": -0.5959931015968323, "logps/chosen": -0.09139852970838547, "logps/rejected": -4.205726623535156, "loss": 0.0597, "odds_ratio_loss": 0.005354328081011772, "rewards/accuracies": 1.0, "rewards/chosen": -0.009139853529632092, "rewards/margins": 0.41143280267715454, "rewards/rejected": -0.4205726683139801, "sft_loss": 0.09139852970838547, "step": 3657 }, { "epoch": 5.2899493853940704, "grad_norm": 1.121340360401462, "learning_rate": 2.837364553174475e-07, "logits/chosen": -1.0993698835372925, "logits/rejected": -0.5834642052650452, "logps/chosen": -0.05646635964512825, "logps/rejected": -6.199383735656738, "loss": 0.0398, "odds_ratio_loss": 0.001794778392650187, "rewards/accuracies": 1.0, "rewards/chosen": -0.00564663577824831, "rewards/margins": 0.6142917275428772, "rewards/rejected": -0.6199383735656738, "sft_loss": 0.05646635964512825, "step": 3658 }, { "epoch": 5.291395516992046, "grad_norm": 1.3170381422047455, "learning_rate": 2.825886681404164e-07, "logits/chosen": -0.7939042448997498, "logits/rejected": -0.5887212157249451, "logps/chosen": -0.03460104390978813, "logps/rejected": -5.4101104736328125, "loss": 0.0285, "odds_ratio_loss": 0.0012456791009753942, "rewards/accuracies": 1.0, "rewards/chosen": -0.0034601043444126844, "rewards/margins": 0.5375509262084961, "rewards/rejected": -0.5410110354423523, "sft_loss": 0.03460104390978813, "step": 3659 }, { "epoch": 5.292841648590022, "grad_norm": 0.919145469791782, "learning_rate": 2.8144312220452194e-07, "logits/chosen": -0.6260144710540771, "logits/rejected": -0.6858643293380737, "logps/chosen": -0.01188691146671772, "logps/rejected": -4.332086086273193, "loss": 0.028, "odds_ratio_loss": 0.0014277580194175243, "rewards/accuracies": 1.0, "rewards/chosen": -0.0011886911233887076, "rewards/margins": 0.4320199489593506, "rewards/rejected": -0.4332086145877838, "sft_loss": 0.01188691146671772, "step": 3660 }, { "epoch": 5.2942877801879975, "grad_norm": 1.1722799731959772, "learning_rate": 2.802998182004188e-07, "logits/chosen": -0.865659236907959, "logits/rejected": -0.5501079559326172, "logps/chosen": -0.058488842099905014, "logps/rejected": -5.399849891662598, "loss": 0.0557, "odds_ratio_loss": 0.002198958769440651, "rewards/accuracies": 1.0, "rewards/chosen": -0.005848885513842106, "rewards/margins": 0.5341361165046692, "rewards/rejected": -0.5399850010871887, "sft_loss": 0.058488842099905014, "step": 3661 }, { "epoch": 5.295733911785972, "grad_norm": 1.0278872351784067, "learning_rate": 2.791587568174094e-07, "logits/chosen": -0.8193449974060059, "logits/rejected": -0.7792457342147827, "logps/chosen": -0.04936240613460541, "logps/rejected": -4.964071273803711, "loss": 0.025, "odds_ratio_loss": 0.0040309252217411995, "rewards/accuracies": 1.0, "rewards/chosen": -0.004936240613460541, "rewards/margins": 0.49147090315818787, "rewards/rejected": -0.496407151222229, "sft_loss": 0.04936240613460541, "step": 3662 }, { "epoch": 5.297180043383948, "grad_norm": 1.203031731438833, "learning_rate": 2.7801993874344297e-07, "logits/chosen": -0.8412383198738098, "logits/rejected": -0.7706151008605957, "logps/chosen": -0.027266865596175194, "logps/rejected": -5.0899739265441895, "loss": 0.0517, "odds_ratio_loss": 0.001991531578823924, "rewards/accuracies": 1.0, "rewards/chosen": -0.002726686652749777, "rewards/margins": 0.5062706470489502, "rewards/rejected": -0.50899738073349, "sft_loss": 0.027266865596175194, "step": 3663 }, { "epoch": 5.298626174981924, "grad_norm": 1.023947653615631, "learning_rate": 2.7688336466511743e-07, "logits/chosen": -0.9421272873878479, "logits/rejected": -0.6665881872177124, "logps/chosen": -0.036513958126306534, "logps/rejected": -5.57096529006958, "loss": 0.0252, "odds_ratio_loss": 0.0018653357401490211, "rewards/accuracies": 1.0, "rewards/chosen": -0.003651396371424198, "rewards/margins": 0.5534451007843018, "rewards/rejected": -0.557096540927887, "sft_loss": 0.036513958126306534, "step": 3664 }, { "epoch": 5.3000723065798985, "grad_norm": 1.110676709749025, "learning_rate": 2.7574903526767746e-07, "logits/chosen": -0.6372553110122681, "logits/rejected": -0.5765061378479004, "logps/chosen": -0.02319909632205963, "logps/rejected": -4.816458702087402, "loss": 0.0382, "odds_ratio_loss": 0.0015413042856380343, "rewards/accuracies": 1.0, "rewards/chosen": -0.002319909632205963, "rewards/margins": 0.47932595014572144, "rewards/rejected": -0.4816458225250244, "sft_loss": 0.02319909632205963, "step": 3665 }, { "epoch": 5.301518438177874, "grad_norm": 1.0978912010109163, "learning_rate": 2.746169512350152e-07, "logits/chosen": -0.7656617164611816, "logits/rejected": -0.5390478372573853, "logps/chosen": -0.04759720712900162, "logps/rejected": -4.945181846618652, "loss": 0.0338, "odds_ratio_loss": 0.001722943503409624, "rewards/accuracies": 1.0, "rewards/chosen": -0.004759720992296934, "rewards/margins": 0.4897584915161133, "rewards/rejected": -0.4945182204246521, "sft_loss": 0.04759720712900162, "step": 3666 }, { "epoch": 5.30296456977585, "grad_norm": 1.012794228374218, "learning_rate": 2.734871132496672e-07, "logits/chosen": -1.0013247728347778, "logits/rejected": -0.7348878383636475, "logps/chosen": -0.023679528385400772, "logps/rejected": -4.578136920928955, "loss": 0.0206, "odds_ratio_loss": 0.0015165224904194474, "rewards/accuracies": 1.0, "rewards/chosen": -0.002367952838540077, "rewards/margins": 0.4554457664489746, "rewards/rejected": -0.45781373977661133, "sft_loss": 0.023679528385400772, "step": 3667 }, { "epoch": 5.304410701373825, "grad_norm": 1.4279003426255663, "learning_rate": 2.7235952199281854e-07, "logits/chosen": -0.9840003252029419, "logits/rejected": -0.8327428102493286, "logps/chosen": -0.041510019451379776, "logps/rejected": -4.400811195373535, "loss": 0.0386, "odds_ratio_loss": 0.0024359440430998802, "rewards/accuracies": 1.0, "rewards/chosen": -0.004151002038270235, "rewards/margins": 0.43593013286590576, "rewards/rejected": -0.4400811195373535, "sft_loss": 0.041510019451379776, "step": 3668 }, { "epoch": 5.3058568329718, "grad_norm": 1.1038023610062029, "learning_rate": 2.712341781442973e-07, "logits/chosen": -0.8254796266555786, "logits/rejected": -0.7522153854370117, "logps/chosen": -0.022977057844400406, "logps/rejected": -5.291687488555908, "loss": 0.0455, "odds_ratio_loss": 0.001104579190723598, "rewards/accuracies": 1.0, "rewards/chosen": -0.0022977059707045555, "rewards/margins": 0.5268710851669312, "rewards/rejected": -0.5291687846183777, "sft_loss": 0.022977057844400406, "step": 3669 }, { "epoch": 5.307302964569776, "grad_norm": 1.072757689228915, "learning_rate": 2.701110823825772e-07, "logits/chosen": -0.9172158241271973, "logits/rejected": -0.7597464323043823, "logps/chosen": -0.03024320863187313, "logps/rejected": -4.548722743988037, "loss": 0.0425, "odds_ratio_loss": 0.0015936695272102952, "rewards/accuracies": 1.0, "rewards/chosen": -0.0030243208166211843, "rewards/margins": 0.45184794068336487, "rewards/rejected": -0.4548722505569458, "sft_loss": 0.03024320863187313, "step": 3670 }, { "epoch": 5.308749096167752, "grad_norm": 1.3097028126399897, "learning_rate": 2.689902353847766e-07, "logits/chosen": -1.211554765701294, "logits/rejected": -0.7927272915840149, "logps/chosen": -0.04887429624795914, "logps/rejected": -4.841168403625488, "loss": 0.041, "odds_ratio_loss": 0.001834406517446041, "rewards/accuracies": 1.0, "rewards/chosen": -0.004887429066002369, "rewards/margins": 0.4792294502258301, "rewards/rejected": -0.4841168522834778, "sft_loss": 0.04887429624795914, "step": 3671 }, { "epoch": 5.3101952277657265, "grad_norm": 1.107848445275236, "learning_rate": 2.678716378266599e-07, "logits/chosen": -0.8874702453613281, "logits/rejected": -0.7778452634811401, "logps/chosen": -0.013926111161708832, "logps/rejected": -6.384116172790527, "loss": 0.0593, "odds_ratio_loss": 0.0006129151443019509, "rewards/accuracies": 1.0, "rewards/chosen": -0.0013926110696047544, "rewards/margins": 0.6370189785957336, "rewards/rejected": -0.6384116411209106, "sft_loss": 0.013926111161708832, "step": 3672 }, { "epoch": 5.311641359363702, "grad_norm": 1.086942198501488, "learning_rate": 2.6675529038263157e-07, "logits/chosen": -0.855125904083252, "logits/rejected": -0.7381969690322876, "logps/chosen": -0.018707137554883957, "logps/rejected": -3.6854591369628906, "loss": 0.0419, "odds_ratio_loss": 0.0006958214216865599, "rewards/accuracies": 1.0, "rewards/chosen": -0.00187071377877146, "rewards/margins": 0.36667516827583313, "rewards/rejected": -0.36854591965675354, "sft_loss": 0.018707137554883957, "step": 3673 }, { "epoch": 5.313087490961678, "grad_norm": 1.1795744742774177, "learning_rate": 2.6564119372574347e-07, "logits/chosen": -0.8077712059020996, "logits/rejected": -0.6191185116767883, "logps/chosen": -0.06234436109662056, "logps/rejected": -6.990647315979004, "loss": 0.0462, "odds_ratio_loss": 0.007911072112619877, "rewards/accuracies": 1.0, "rewards/chosen": -0.006234435830265284, "rewards/margins": 0.6928303241729736, "rewards/rejected": -0.6990647315979004, "sft_loss": 0.06234436109662056, "step": 3674 }, { "epoch": 5.314533622559653, "grad_norm": 0.968028462978872, "learning_rate": 2.6452934852768714e-07, "logits/chosen": -0.7257063388824463, "logits/rejected": -0.6271748542785645, "logps/chosen": -0.016601260751485825, "logps/rejected": -4.678224563598633, "loss": 0.0308, "odds_ratio_loss": 0.0008440697565674782, "rewards/accuracies": 1.0, "rewards/chosen": -0.0016601260285824537, "rewards/margins": 0.4661623239517212, "rewards/rejected": -0.46782243251800537, "sft_loss": 0.016601260751485825, "step": 3675 }, { "epoch": 5.315979754157628, "grad_norm": 1.1731861474929364, "learning_rate": 2.634197554587998e-07, "logits/chosen": -1.1070533990859985, "logits/rejected": -0.7468483448028564, "logps/chosen": -0.05605795979499817, "logps/rejected": -6.352745532989502, "loss": 0.037, "odds_ratio_loss": 0.0028356327675282955, "rewards/accuracies": 1.0, "rewards/chosen": -0.005605795886367559, "rewards/margins": 0.6296687722206116, "rewards/rejected": -0.6352745890617371, "sft_loss": 0.05605795979499817, "step": 3676 }, { "epoch": 5.317425885755604, "grad_norm": 1.077797013393816, "learning_rate": 2.623124151880578e-07, "logits/chosen": -0.8413434028625488, "logits/rejected": -0.6536193490028381, "logps/chosen": -0.04118379205465317, "logps/rejected": -5.763056755065918, "loss": 0.042, "odds_ratio_loss": 0.004730723798274994, "rewards/accuracies": 1.0, "rewards/chosen": -0.004118379671126604, "rewards/margins": 0.5721873641014099, "rewards/rejected": -0.5763057470321655, "sft_loss": 0.04118379205465317, "step": 3677 }, { "epoch": 5.318872017353579, "grad_norm": 1.3456733360766195, "learning_rate": 2.612073283830818e-07, "logits/chosen": -0.9398325681686401, "logits/rejected": -0.6609033346176147, "logps/chosen": -0.02295401319861412, "logps/rejected": -6.218092441558838, "loss": 0.0263, "odds_ratio_loss": 0.0007425962830893695, "rewards/accuracies": 1.0, "rewards/chosen": -0.0022954014129936695, "rewards/margins": 0.6195138692855835, "rewards/rejected": -0.6218092441558838, "sft_loss": 0.02295401319861412, "step": 3678 }, { "epoch": 5.3203181489515545, "grad_norm": 1.3016144488878012, "learning_rate": 2.6010449571013215e-07, "logits/chosen": -0.8513684868812561, "logits/rejected": -0.6279063820838928, "logps/chosen": -0.07777070999145508, "logps/rejected": -6.661202907562256, "loss": 0.0392, "odds_ratio_loss": 0.0036332891322672367, "rewards/accuracies": 1.0, "rewards/chosen": -0.007777070626616478, "rewards/margins": 0.6583431959152222, "rewards/rejected": -0.6661202907562256, "sft_loss": 0.07777070999145508, "step": 3679 }, { "epoch": 5.32176428054953, "grad_norm": 1.0491356786995216, "learning_rate": 2.5900391783411035e-07, "logits/chosen": -1.0663862228393555, "logits/rejected": -0.8108906149864197, "logps/chosen": -0.03312306851148605, "logps/rejected": -4.926994323730469, "loss": 0.0266, "odds_ratio_loss": 0.0015353760682046413, "rewards/accuracies": 1.0, "rewards/chosen": -0.003312306944280863, "rewards/margins": 0.4893871545791626, "rewards/rejected": -0.49269944429397583, "sft_loss": 0.03312306851148605, "step": 3680 }, { "epoch": 5.323210412147505, "grad_norm": 1.5135907197437433, "learning_rate": 2.579055954185603e-07, "logits/chosen": -1.0617835521697998, "logits/rejected": -0.8939220905303955, "logps/chosen": -0.05323568731546402, "logps/rejected": -4.0288519859313965, "loss": 0.0397, "odds_ratio_loss": 0.0026793223805725574, "rewards/accuracies": 1.0, "rewards/chosen": -0.0053235688246786594, "rewards/margins": 0.39756157994270325, "rewards/rejected": -0.40288519859313965, "sft_loss": 0.05323568731546402, "step": 3681 }, { "epoch": 5.324656543745481, "grad_norm": 1.1680256177062494, "learning_rate": 2.5680952912566334e-07, "logits/chosen": -0.8926557302474976, "logits/rejected": -0.7793102264404297, "logps/chosen": -0.05652204155921936, "logps/rejected": -4.692435264587402, "loss": 0.039, "odds_ratio_loss": 0.006315143778920174, "rewards/accuracies": 1.0, "rewards/chosen": -0.005652204155921936, "rewards/margins": 0.4635912775993347, "rewards/rejected": -0.46924352645874023, "sft_loss": 0.05652204155921936, "step": 3682 }, { "epoch": 5.326102675343456, "grad_norm": 1.495092313640595, "learning_rate": 2.557157196162425e-07, "logits/chosen": -1.017024040222168, "logits/rejected": -0.7390504479408264, "logps/chosen": -0.046689312905073166, "logps/rejected": -6.402212142944336, "loss": 0.0614, "odds_ratio_loss": 0.00260357023216784, "rewards/accuracies": 1.0, "rewards/chosen": -0.004668931011110544, "rewards/margins": 0.6355522871017456, "rewards/rejected": -0.6402212381362915, "sft_loss": 0.046689312905073166, "step": 3683 }, { "epoch": 5.327548806941432, "grad_norm": 0.9797352539134356, "learning_rate": 2.546241675497591e-07, "logits/chosen": -0.9606846570968628, "logits/rejected": -0.7178159952163696, "logps/chosen": -0.05462236702442169, "logps/rejected": -7.01438570022583, "loss": 0.0277, "odds_ratio_loss": 0.0016510548302903771, "rewards/accuracies": 1.0, "rewards/chosen": -0.005462236702442169, "rewards/margins": 0.6959763169288635, "rewards/rejected": -0.7014386057853699, "sft_loss": 0.05462236702442169, "step": 3684 }, { "epoch": 5.328994938539407, "grad_norm": 0.9631824763305563, "learning_rate": 2.5353487358431527e-07, "logits/chosen": -0.9838446378707886, "logits/rejected": -0.7861350774765015, "logps/chosen": -0.035723727196455, "logps/rejected": -4.819340229034424, "loss": 0.0255, "odds_ratio_loss": 0.002584266010671854, "rewards/accuracies": 1.0, "rewards/chosen": -0.003572372952476144, "rewards/margins": 0.47836166620254517, "rewards/rejected": -0.4819340407848358, "sft_loss": 0.035723727196455, "step": 3685 }, { "epoch": 5.3304410701373826, "grad_norm": 1.2332280571344059, "learning_rate": 2.524478383766491e-07, "logits/chosen": -0.9059146642684937, "logits/rejected": -0.7595691680908203, "logps/chosen": -0.02609408274292946, "logps/rejected": -4.347700119018555, "loss": 0.0646, "odds_ratio_loss": 0.002554179634898901, "rewards/accuracies": 1.0, "rewards/chosen": -0.002609408460557461, "rewards/margins": 0.4321606159210205, "rewards/rejected": -0.43476998805999756, "sft_loss": 0.02609408274292946, "step": 3686 }, { "epoch": 5.331887201735358, "grad_norm": 0.8983449394324652, "learning_rate": 2.5136306258213857e-07, "logits/chosen": -0.9400614500045776, "logits/rejected": -0.7500208616256714, "logps/chosen": -0.028386441990733147, "logps/rejected": -4.063958644866943, "loss": 0.0234, "odds_ratio_loss": 0.0013379440642893314, "rewards/accuracies": 1.0, "rewards/chosen": -0.0028386441990733147, "rewards/margins": 0.4035572111606598, "rewards/rejected": -0.4063958525657654, "sft_loss": 0.028386441990733147, "step": 3687 }, { "epoch": 5.333333333333333, "grad_norm": 0.9751487290928306, "learning_rate": 2.502805468547984e-07, "logits/chosen": -1.0151832103729248, "logits/rejected": -0.7417925596237183, "logps/chosen": -0.03261277452111244, "logps/rejected": -5.35139274597168, "loss": 0.0328, "odds_ratio_loss": 0.001738175400532782, "rewards/accuracies": 1.0, "rewards/chosen": -0.003261277452111244, "rewards/margins": 0.5318779945373535, "rewards/rejected": -0.535139262676239, "sft_loss": 0.03261277452111244, "step": 3688 }, { "epoch": 5.334779464931309, "grad_norm": 1.1841885288210054, "learning_rate": 2.4920029184728285e-07, "logits/chosen": -0.8535258173942566, "logits/rejected": -0.711528480052948, "logps/chosen": -0.04814765602350235, "logps/rejected": -4.029658317565918, "loss": 0.0384, "odds_ratio_loss": 0.0027197981253266335, "rewards/accuracies": 1.0, "rewards/chosen": -0.00481476541608572, "rewards/margins": 0.3981510400772095, "rewards/rejected": -0.40296584367752075, "sft_loss": 0.04814765602350235, "step": 3689 }, { "epoch": 5.336225596529284, "grad_norm": 1.0863611869122904, "learning_rate": 2.481222982108799e-07, "logits/chosen": -0.7337764501571655, "logits/rejected": -0.6401863694190979, "logps/chosen": -0.02598814107477665, "logps/rejected": -5.963187217712402, "loss": 0.0348, "odds_ratio_loss": 0.0013941468205302954, "rewards/accuracies": 1.0, "rewards/chosen": -0.0025988139677792788, "rewards/margins": 0.5937198996543884, "rewards/rejected": -0.5963187217712402, "sft_loss": 0.02598814107477665, "step": 3690 }, { "epoch": 5.337671728127259, "grad_norm": 1.112598630427125, "learning_rate": 2.470465665955173e-07, "logits/chosen": -1.239977240562439, "logits/rejected": -0.7182906866073608, "logps/chosen": -0.03799346834421158, "logps/rejected": -7.847053527832031, "loss": 0.0339, "odds_ratio_loss": 0.002206320408731699, "rewards/accuracies": 1.0, "rewards/chosen": -0.0037993467412889004, "rewards/margins": 0.7809059619903564, "rewards/rejected": -0.784705400466919, "sft_loss": 0.03799346834421158, "step": 3691 }, { "epoch": 5.339117859725235, "grad_norm": 0.8147289146984105, "learning_rate": 2.4597309764975737e-07, "logits/chosen": -0.7928810119628906, "logits/rejected": -0.7506893873214722, "logps/chosen": -0.005609842017292976, "logps/rejected": -7.077286243438721, "loss": 0.0131, "odds_ratio_loss": 0.0003021466836798936, "rewards/accuracies": 1.0, "rewards/chosen": -0.0005609841900877655, "rewards/margins": 0.7071676254272461, "rewards/rejected": -0.7077286243438721, "sft_loss": 0.005609842017292976, "step": 3692 }, { "epoch": 5.340563991323211, "grad_norm": 1.3433904863941526, "learning_rate": 2.449018920207986e-07, "logits/chosen": -0.9152747988700867, "logits/rejected": -0.6203851699829102, "logps/chosen": -0.05753957852721214, "logps/rejected": -6.203818321228027, "loss": 0.0708, "odds_ratio_loss": 0.0015491548692807555, "rewards/accuracies": 1.0, "rewards/chosen": -0.005753958132117987, "rewards/margins": 0.6146278381347656, "rewards/rejected": -0.6203818321228027, "sft_loss": 0.05753957852721214, "step": 3693 }, { "epoch": 5.342010122921186, "grad_norm": 1.3976027188444067, "learning_rate": 2.438329503544745e-07, "logits/chosen": -0.9144161939620972, "logits/rejected": -0.783090353012085, "logps/chosen": -0.04463702812790871, "logps/rejected": -4.492361068725586, "loss": 0.0524, "odds_ratio_loss": 0.002019796520471573, "rewards/accuracies": 1.0, "rewards/chosen": -0.004463702440261841, "rewards/margins": 0.4447723627090454, "rewards/rejected": -0.44923609495162964, "sft_loss": 0.04463702812790871, "step": 3694 }, { "epoch": 5.343456254519161, "grad_norm": 0.9595421865106609, "learning_rate": 2.427662732952531e-07, "logits/chosen": -0.891353189945221, "logits/rejected": -0.7665066719055176, "logps/chosen": -0.017603110522031784, "logps/rejected": -5.0525360107421875, "loss": 0.0355, "odds_ratio_loss": 0.0015732902102172375, "rewards/accuracies": 1.0, "rewards/chosen": -0.0017603111919015646, "rewards/margins": 0.5034933090209961, "rewards/rejected": -0.5052536129951477, "sft_loss": 0.017603110522031784, "step": 3695 }, { "epoch": 5.344902386117137, "grad_norm": 0.9786549040478829, "learning_rate": 2.4170186148624003e-07, "logits/chosen": -0.8309451341629028, "logits/rejected": -0.7362473011016846, "logps/chosen": -0.02056121453642845, "logps/rejected": -4.408735752105713, "loss": 0.0377, "odds_ratio_loss": 0.0009791934862732887, "rewards/accuracies": 1.0, "rewards/chosen": -0.002056121826171875, "rewards/margins": 0.43881747126579285, "rewards/rejected": -0.44087356328964233, "sft_loss": 0.02056121453642845, "step": 3696 }, { "epoch": 5.346348517715112, "grad_norm": 0.9266218741647381, "learning_rate": 2.406397155691713e-07, "logits/chosen": -1.0537879467010498, "logits/rejected": -0.9023940563201904, "logps/chosen": -0.04712200164794922, "logps/rejected": -3.5155301094055176, "loss": 0.0293, "odds_ratio_loss": 0.003726676106452942, "rewards/accuracies": 1.0, "rewards/chosen": -0.004712200257927179, "rewards/margins": 0.34684082865715027, "rewards/rejected": -0.3515530228614807, "sft_loss": 0.04712200164794922, "step": 3697 }, { "epoch": 5.347794649313087, "grad_norm": 0.8161323163215715, "learning_rate": 2.3957983618442037e-07, "logits/chosen": -0.8924893140792847, "logits/rejected": -0.6488431692123413, "logps/chosen": -0.019642913714051247, "logps/rejected": -5.622814178466797, "loss": 0.0344, "odds_ratio_loss": 0.0009415658423677087, "rewards/accuracies": 1.0, "rewards/chosen": -0.0019642915576696396, "rewards/margins": 0.5603170990943909, "rewards/rejected": -0.5622814297676086, "sft_loss": 0.019642913714051247, "step": 3698 }, { "epoch": 5.349240780911063, "grad_norm": 1.0073435267680912, "learning_rate": 2.385222239709903e-07, "logits/chosen": -0.939033031463623, "logits/rejected": -0.7678767442703247, "logps/chosen": -0.023407379165291786, "logps/rejected": -5.834734916687012, "loss": 0.0317, "odds_ratio_loss": 0.001715653808787465, "rewards/accuracies": 1.0, "rewards/chosen": -0.0023407377302646637, "rewards/margins": 0.5811327695846558, "rewards/rejected": -0.5834734439849854, "sft_loss": 0.023407379165291786, "step": 3699 }, { "epoch": 5.350686912509039, "grad_norm": 1.3694533433954266, "learning_rate": 2.374668795665218e-07, "logits/chosen": -0.9073183536529541, "logits/rejected": -0.8298658132553101, "logps/chosen": -0.01922656036913395, "logps/rejected": -4.1897687911987305, "loss": 0.0341, "odds_ratio_loss": 0.0013735933462157845, "rewards/accuracies": 1.0, "rewards/chosen": -0.0019226560834795237, "rewards/margins": 0.4170542359352112, "rewards/rejected": -0.41897690296173096, "sft_loss": 0.01922656036913395, "step": 3700 }, { "epoch": 5.352133044107013, "grad_norm": 0.8168201385481627, "learning_rate": 2.3641380360728447e-07, "logits/chosen": -0.9471131563186646, "logits/rejected": -0.6616654396057129, "logps/chosen": -0.019169360399246216, "logps/rejected": -6.367465972900391, "loss": 0.0228, "odds_ratio_loss": 0.0011902657570317388, "rewards/accuracies": 1.0, "rewards/chosen": -0.0019169360166415572, "rewards/margins": 0.6348296403884888, "rewards/rejected": -0.6367465853691101, "sft_loss": 0.019169360399246216, "step": 3701 }, { "epoch": 5.353579175704989, "grad_norm": 0.9598137915603296, "learning_rate": 2.3536299672818205e-07, "logits/chosen": -0.6729775071144104, "logits/rejected": -0.6349964737892151, "logps/chosen": -0.05535079538822174, "logps/rejected": -4.376411437988281, "loss": 0.0344, "odds_ratio_loss": 0.0037783747538924217, "rewards/accuracies": 1.0, "rewards/chosen": -0.005535079166293144, "rewards/margins": 0.432106077671051, "rewards/rejected": -0.4376411736011505, "sft_loss": 0.05535079538822174, "step": 3702 }, { "epoch": 5.355025307302965, "grad_norm": 1.06738741943502, "learning_rate": 2.3431445956274954e-07, "logits/chosen": -1.015805721282959, "logits/rejected": -0.7145316004753113, "logps/chosen": -0.027575181797146797, "logps/rejected": -4.448596000671387, "loss": 0.0476, "odds_ratio_loss": 0.0016211337642744184, "rewards/accuracies": 1.0, "rewards/chosen": -0.002757518319413066, "rewards/margins": 0.4421020746231079, "rewards/rejected": -0.4448596239089966, "sft_loss": 0.027575181797146797, "step": 3703 }, { "epoch": 5.35647143890094, "grad_norm": 1.1261370264518713, "learning_rate": 2.3326819274315368e-07, "logits/chosen": -0.7991698980331421, "logits/rejected": -0.5997868776321411, "logps/chosen": -0.008198452182114124, "logps/rejected": -5.460474967956543, "loss": 0.036, "odds_ratio_loss": 0.00043488398659974337, "rewards/accuracies": 1.0, "rewards/chosen": -0.0008198452414944768, "rewards/margins": 0.5452276468276978, "rewards/rejected": -0.5460475087165833, "sft_loss": 0.008198452182114124, "step": 3704 }, { "epoch": 5.357917570498915, "grad_norm": 1.0861893135806593, "learning_rate": 2.3222419690019435e-07, "logits/chosen": -0.8181073069572449, "logits/rejected": -0.6101012229919434, "logps/chosen": -0.044420160353183746, "logps/rejected": -5.721585750579834, "loss": 0.0363, "odds_ratio_loss": 0.0018686356488615274, "rewards/accuracies": 1.0, "rewards/chosen": -0.004442016128450632, "rewards/margins": 0.5677164793014526, "rewards/rejected": -0.5721585154533386, "sft_loss": 0.044420160353183746, "step": 3705 }, { "epoch": 5.359363702096891, "grad_norm": 0.9995579308978673, "learning_rate": 2.3118247266329872e-07, "logits/chosen": -0.7560127973556519, "logits/rejected": -0.5476009845733643, "logps/chosen": -0.04683084785938263, "logps/rejected": -7.408351421356201, "loss": 0.0357, "odds_ratio_loss": 0.0008836622582748532, "rewards/accuracies": 1.0, "rewards/chosen": -0.004683084320276976, "rewards/margins": 0.7361520528793335, "rewards/rejected": -0.7408351302146912, "sft_loss": 0.04683084785938263, "step": 3706 }, { "epoch": 5.360809833694867, "grad_norm": 0.8843187410781583, "learning_rate": 2.3014302066052748e-07, "logits/chosen": -0.827779233455658, "logits/rejected": -0.6347323656082153, "logps/chosen": -0.010669386014342308, "logps/rejected": -3.1830084323883057, "loss": 0.0342, "odds_ratio_loss": 0.0007001932244747877, "rewards/accuracies": 1.0, "rewards/chosen": -0.001066938741132617, "rewards/margins": 0.3172339200973511, "rewards/rejected": -0.31830084323883057, "sft_loss": 0.010669386014342308, "step": 3707 }, { "epoch": 5.362255965292841, "grad_norm": 0.9632499279044018, "learning_rate": 2.291058415185696e-07, "logits/chosen": -0.9718351364135742, "logits/rejected": -0.7804862260818481, "logps/chosen": -0.02261853590607643, "logps/rejected": -3.541126012802124, "loss": 0.0345, "odds_ratio_loss": 0.044430263340473175, "rewards/accuracies": 0.9375, "rewards/chosen": -0.002261853776872158, "rewards/margins": 0.3518507480621338, "rewards/rejected": -0.3541126251220703, "sft_loss": 0.02261853590607643, "step": 3708 }, { "epoch": 5.363702096890817, "grad_norm": 1.055009170351824, "learning_rate": 2.2807093586274396e-07, "logits/chosen": -0.840499758720398, "logits/rejected": -0.7038973569869995, "logps/chosen": -0.04808530956506729, "logps/rejected": -5.187693119049072, "loss": 0.0315, "odds_ratio_loss": 0.0014545705635100603, "rewards/accuracies": 1.0, "rewards/chosen": -0.004808531142771244, "rewards/margins": 0.5139607787132263, "rewards/rejected": -0.5187693238258362, "sft_loss": 0.04808530956506729, "step": 3709 }, { "epoch": 5.365148228488793, "grad_norm": 1.1656407978335859, "learning_rate": 2.27038304316999e-07, "logits/chosen": -0.6774870157241821, "logits/rejected": -0.48925572633743286, "logps/chosen": -0.009755231440067291, "logps/rejected": -6.098196983337402, "loss": 0.0354, "odds_ratio_loss": 0.0006270483136177063, "rewards/accuracies": 1.0, "rewards/chosen": -0.00097552320221439, "rewards/margins": 0.6088441610336304, "rewards/rejected": -0.6098197102546692, "sft_loss": 0.009755231440067291, "step": 3710 }, { "epoch": 5.366594360086768, "grad_norm": 1.0338534343629824, "learning_rate": 2.2600794750391273e-07, "logits/chosen": -0.7158972024917603, "logits/rejected": -0.6297093629837036, "logps/chosen": -0.022365085780620575, "logps/rejected": -4.423712730407715, "loss": 0.0354, "odds_ratio_loss": 0.0009890659712255, "rewards/accuracies": 1.0, "rewards/chosen": -0.0022365087643265724, "rewards/margins": 0.4401347041130066, "rewards/rejected": -0.4423712491989136, "sft_loss": 0.022365085780620575, "step": 3711 }, { "epoch": 5.368040491684743, "grad_norm": 1.1168479872493293, "learning_rate": 2.2497986604469e-07, "logits/chosen": -0.7737710475921631, "logits/rejected": -0.6601718068122864, "logps/chosen": -0.008890267461538315, "logps/rejected": -5.52163028717041, "loss": 0.0382, "odds_ratio_loss": 0.002172160428017378, "rewards/accuracies": 1.0, "rewards/chosen": -0.0008890267345122993, "rewards/margins": 0.5512740612030029, "rewards/rejected": -0.5521630644798279, "sft_loss": 0.008890267461538315, "step": 3712 }, { "epoch": 5.369486623282719, "grad_norm": 1.1080662987117444, "learning_rate": 2.2395406055916655e-07, "logits/chosen": -0.8222386837005615, "logits/rejected": -0.668541431427002, "logps/chosen": -0.0324178971350193, "logps/rejected": -5.162585735321045, "loss": 0.037, "odds_ratio_loss": 0.0027628212701529264, "rewards/accuracies": 1.0, "rewards/chosen": -0.0032417899928987026, "rewards/margins": 0.5130168199539185, "rewards/rejected": -0.5162585973739624, "sft_loss": 0.0324178971350193, "step": 3713 }, { "epoch": 5.370932754880694, "grad_norm": 1.4577184250784672, "learning_rate": 2.2293053166580278e-07, "logits/chosen": -0.9864585995674133, "logits/rejected": -0.5810337066650391, "logps/chosen": -0.02716916799545288, "logps/rejected": -5.216684341430664, "loss": 0.0637, "odds_ratio_loss": 0.002008678624406457, "rewards/accuracies": 1.0, "rewards/chosen": -0.002716916613280773, "rewards/margins": 0.5189515352249146, "rewards/rejected": -0.5216684341430664, "sft_loss": 0.02716916799545288, "step": 3714 }, { "epoch": 5.3723788864786695, "grad_norm": 1.471600650771753, "learning_rate": 2.2190927998168952e-07, "logits/chosen": -0.794467568397522, "logits/rejected": -0.6563206315040588, "logps/chosen": -0.056384190917015076, "logps/rejected": -5.826344966888428, "loss": 0.0401, "odds_ratio_loss": 0.005085381213575602, "rewards/accuracies": 1.0, "rewards/chosen": -0.005638418719172478, "rewards/margins": 0.5769960880279541, "rewards/rejected": -0.5826345086097717, "sft_loss": 0.056384190917015076, "step": 3715 }, { "epoch": 5.373825018076645, "grad_norm": 1.1637947808860327, "learning_rate": 2.2089030612254223e-07, "logits/chosen": -0.579232394695282, "logits/rejected": -0.5474807024002075, "logps/chosen": -0.016251739114522934, "logps/rejected": -4.210132122039795, "loss": 0.0401, "odds_ratio_loss": 0.0010360369924455881, "rewards/accuracies": 1.0, "rewards/chosen": -0.0016251739580184221, "rewards/margins": 0.4193880558013916, "rewards/rejected": -0.4210132360458374, "sft_loss": 0.016251739114522934, "step": 3716 }, { "epoch": 5.375271149674621, "grad_norm": 1.4630343781840194, "learning_rate": 2.198736107027046e-07, "logits/chosen": -1.1237835884094238, "logits/rejected": -0.7257318496704102, "logps/chosen": -0.008886278606951237, "logps/rejected": -7.131657123565674, "loss": 0.0322, "odds_ratio_loss": 0.00011410063598304987, "rewards/accuracies": 1.0, "rewards/chosen": -0.0008886277792043984, "rewards/margins": 0.7122771143913269, "rewards/rejected": -0.7131657004356384, "sft_loss": 0.008886278606951237, "step": 3717 }, { "epoch": 5.376717281272596, "grad_norm": 1.022017673541387, "learning_rate": 2.18859194335145e-07, "logits/chosen": -0.9754039645195007, "logits/rejected": -0.7429482340812683, "logps/chosen": -0.024609964340925217, "logps/rejected": -5.291652679443359, "loss": 0.0202, "odds_ratio_loss": 0.0017671944806352258, "rewards/accuracies": 1.0, "rewards/chosen": -0.002460996387526393, "rewards/margins": 0.5267042517662048, "rewards/rejected": -0.5291653275489807, "sft_loss": 0.024609964340925217, "step": 3718 }, { "epoch": 5.378163412870571, "grad_norm": 1.1961288331402615, "learning_rate": 2.178470576314595e-07, "logits/chosen": -0.8683935403823853, "logits/rejected": -0.6840660572052002, "logps/chosen": -0.021254369989037514, "logps/rejected": -6.397770881652832, "loss": 0.0351, "odds_ratio_loss": 0.002227251650765538, "rewards/accuracies": 1.0, "rewards/chosen": -0.002125436905771494, "rewards/margins": 0.6376516819000244, "rewards/rejected": -0.6397770643234253, "sft_loss": 0.021254369989037514, "step": 3719 }, { "epoch": 5.379609544468547, "grad_norm": 1.1777353978631238, "learning_rate": 2.1683720120186977e-07, "logits/chosen": -0.7626063823699951, "logits/rejected": -0.6675928831100464, "logps/chosen": -0.006277444772422314, "logps/rejected": -4.882328987121582, "loss": 0.0248, "odds_ratio_loss": 0.0005971640930511057, "rewards/accuracies": 1.0, "rewards/chosen": -0.0006277445354498923, "rewards/margins": 0.48760515451431274, "rewards/rejected": -0.48823288083076477, "sft_loss": 0.006277444772422314, "step": 3720 }, { "epoch": 5.381055676066522, "grad_norm": 0.9532634639236007, "learning_rate": 2.1582962565522124e-07, "logits/chosen": -0.9451834559440613, "logits/rejected": -0.7273751497268677, "logps/chosen": -0.011347649618983269, "logps/rejected": -4.3883376121521, "loss": 0.0325, "odds_ratio_loss": 0.0006653146119788289, "rewards/accuracies": 1.0, "rewards/chosen": -0.001134765101596713, "rewards/margins": 0.43769901990890503, "rewards/rejected": -0.4388337731361389, "sft_loss": 0.011347649618983269, "step": 3721 }, { "epoch": 5.3825018076644975, "grad_norm": 0.9895790538702688, "learning_rate": 2.1482433159898528e-07, "logits/chosen": -1.0272815227508545, "logits/rejected": -0.8672689199447632, "logps/chosen": -0.016958389431238174, "logps/rejected": -4.27727746963501, "loss": 0.0295, "odds_ratio_loss": 0.0013183187693357468, "rewards/accuracies": 1.0, "rewards/chosen": -0.001695839106105268, "rewards/margins": 0.4260319173336029, "rewards/rejected": -0.4277278184890747, "sft_loss": 0.016958389431238174, "step": 3722 }, { "epoch": 5.383947939262473, "grad_norm": 1.1363864606811938, "learning_rate": 2.138213196392571e-07, "logits/chosen": -1.0011087656021118, "logits/rejected": -0.7626456022262573, "logps/chosen": -0.041216447949409485, "logps/rejected": -5.496888160705566, "loss": 0.0293, "odds_ratio_loss": 0.0034582449588924646, "rewards/accuracies": 1.0, "rewards/chosen": -0.004121644888073206, "rewards/margins": 0.5455672144889832, "rewards/rejected": -0.5496888160705566, "sft_loss": 0.041216447949409485, "step": 3723 }, { "epoch": 5.385394070860448, "grad_norm": 1.236532097092624, "learning_rate": 2.128205903807574e-07, "logits/chosen": -0.9743160605430603, "logits/rejected": -0.8304657936096191, "logps/chosen": -0.013960372656583786, "logps/rejected": -3.3339335918426514, "loss": 0.043, "odds_ratio_loss": 0.001198888523504138, "rewards/accuracies": 1.0, "rewards/chosen": -0.0013960371725261211, "rewards/margins": 0.3319973349571228, "rewards/rejected": -0.3333933651447296, "sft_loss": 0.013960372656583786, "step": 3724 }, { "epoch": 5.386840202458424, "grad_norm": 1.1081513188918342, "learning_rate": 2.1182214442682755e-07, "logits/chosen": -0.8094361424446106, "logits/rejected": -0.5704917311668396, "logps/chosen": -0.05938174948096275, "logps/rejected": -6.215296268463135, "loss": 0.0384, "odds_ratio_loss": 0.004498139023780823, "rewards/accuracies": 1.0, "rewards/chosen": -0.00593817513436079, "rewards/margins": 0.6155914068222046, "rewards/rejected": -0.6215296387672424, "sft_loss": 0.05938174948096275, "step": 3725 }, { "epoch": 5.388286334056399, "grad_norm": 1.3102679271672462, "learning_rate": 2.1082598237943627e-07, "logits/chosen": -0.904535174369812, "logits/rejected": -0.7191603183746338, "logps/chosen": -0.10017338395118713, "logps/rejected": -4.470028877258301, "loss": 0.0492, "odds_ratio_loss": 0.0035221197176724672, "rewards/accuracies": 1.0, "rewards/chosen": -0.010017338208854198, "rewards/margins": 0.4369855523109436, "rewards/rejected": -0.4470028877258301, "sft_loss": 0.10017338395118713, "step": 3726 }, { "epoch": 5.389732465654374, "grad_norm": 1.1096478004364896, "learning_rate": 2.09832104839172e-07, "logits/chosen": -0.9260188341140747, "logits/rejected": -0.6095644235610962, "logps/chosen": -0.04054839536547661, "logps/rejected": -5.756593704223633, "loss": 0.0386, "odds_ratio_loss": 0.0023188358172774315, "rewards/accuracies": 1.0, "rewards/chosen": -0.004054839722812176, "rewards/margins": 0.5716045498847961, "rewards/rejected": -0.5756593942642212, "sft_loss": 0.04054839536547661, "step": 3727 }, { "epoch": 5.39117859725235, "grad_norm": 1.0116071233532076, "learning_rate": 2.0884051240524837e-07, "logits/chosen": -0.8574241399765015, "logits/rejected": -0.7022998332977295, "logps/chosen": -0.011260045692324638, "logps/rejected": -5.040678977966309, "loss": 0.0398, "odds_ratio_loss": 0.0007709235651418567, "rewards/accuracies": 1.0, "rewards/chosen": -0.0011260046157985926, "rewards/margins": 0.502941906452179, "rewards/rejected": -0.5040678977966309, "sft_loss": 0.011260045692324638, "step": 3728 }, { "epoch": 5.3926247288503255, "grad_norm": 1.1338471650039585, "learning_rate": 2.0785120567549906e-07, "logits/chosen": -1.079079031944275, "logits/rejected": -0.7133069038391113, "logps/chosen": -0.047831885516643524, "logps/rejected": -5.607325077056885, "loss": 0.0412, "odds_ratio_loss": 0.0006497707217931747, "rewards/accuracies": 1.0, "rewards/chosen": -0.004783188924193382, "rewards/margins": 0.5559492707252502, "rewards/rejected": -0.5607325434684753, "sft_loss": 0.047831885516643524, "step": 3729 }, { "epoch": 5.394070860448301, "grad_norm": 1.257604801635166, "learning_rate": 2.0686418524638172e-07, "logits/chosen": -1.0921964645385742, "logits/rejected": -0.5764094591140747, "logps/chosen": -0.06157654896378517, "logps/rejected": -5.977487564086914, "loss": 0.0485, "odds_ratio_loss": 0.003265151521191001, "rewards/accuracies": 1.0, "rewards/chosen": -0.006157655268907547, "rewards/margins": 0.5915911197662354, "rewards/rejected": -0.5977488160133362, "sft_loss": 0.06157654896378517, "step": 3730 }, { "epoch": 5.395516992046276, "grad_norm": 1.1152765120174375, "learning_rate": 2.058794517129736e-07, "logits/chosen": -1.0628501176834106, "logits/rejected": -0.7114624977111816, "logps/chosen": -0.017897220328450203, "logps/rejected": -4.574059963226318, "loss": 0.0478, "odds_ratio_loss": 0.0005718549946323037, "rewards/accuracies": 1.0, "rewards/chosen": -0.0017897221259772778, "rewards/margins": 0.455616295337677, "rewards/rejected": -0.4574059844017029, "sft_loss": 0.017897220328450203, "step": 3731 }, { "epoch": 5.396963123644252, "grad_norm": 1.43232482896934, "learning_rate": 2.0489700566897583e-07, "logits/chosen": -0.8116693496704102, "logits/rejected": -0.612543523311615, "logps/chosen": -0.05777638033032417, "logps/rejected": -3.095944404602051, "loss": 0.0409, "odds_ratio_loss": 0.004115620162338018, "rewards/accuracies": 1.0, "rewards/chosen": -0.005777638405561447, "rewards/margins": 0.3038167953491211, "rewards/rejected": -0.30959442257881165, "sft_loss": 0.05777638033032417, "step": 3732 }, { "epoch": 5.398409255242227, "grad_norm": 1.2261328303484362, "learning_rate": 2.0391684770670747e-07, "logits/chosen": -1.0025750398635864, "logits/rejected": -0.8340614438056946, "logps/chosen": -0.03172256425023079, "logps/rejected": -4.1400065422058105, "loss": 0.0499, "odds_ratio_loss": 0.0006993028800934553, "rewards/accuracies": 1.0, "rewards/chosen": -0.003172256052494049, "rewards/margins": 0.4108284115791321, "rewards/rejected": -0.4140006899833679, "sft_loss": 0.03172256425023079, "step": 3733 }, { "epoch": 5.399855386840202, "grad_norm": 1.2380490360257563, "learning_rate": 2.029389784171096e-07, "logits/chosen": -0.9819266200065613, "logits/rejected": -0.7181769609451294, "logps/chosen": -0.027787383645772934, "logps/rejected": -5.828107833862305, "loss": 0.0399, "odds_ratio_loss": 0.002232284052297473, "rewards/accuracies": 1.0, "rewards/chosen": -0.0027787385042756796, "rewards/margins": 0.5800320506095886, "rewards/rejected": -0.5828108191490173, "sft_loss": 0.027787383645772934, "step": 3734 }, { "epoch": 5.401301518438178, "grad_norm": 1.1825096755119007, "learning_rate": 2.0196339838974353e-07, "logits/chosen": -0.796126663684845, "logits/rejected": -0.738174557685852, "logps/chosen": -0.01109451986849308, "logps/rejected": -5.02370548248291, "loss": 0.0318, "odds_ratio_loss": 0.001242693280801177, "rewards/accuracies": 1.0, "rewards/chosen": -0.001109451986849308, "rewards/margins": 0.5012611150741577, "rewards/rejected": -0.5023705959320068, "sft_loss": 0.01109451986849308, "step": 3735 }, { "epoch": 5.4027476500361535, "grad_norm": 1.286684498068449, "learning_rate": 2.009901082127894e-07, "logits/chosen": -0.9500235915184021, "logits/rejected": -0.5735840201377869, "logps/chosen": -0.03152807801961899, "logps/rejected": -4.867311477661133, "loss": 0.0536, "odds_ratio_loss": 0.001049485756084323, "rewards/accuracies": 1.0, "rewards/chosen": -0.0031528077088296413, "rewards/margins": 0.4835783541202545, "rewards/rejected": -0.4867311418056488, "sft_loss": 0.03152807801961899, "step": 3736 }, { "epoch": 5.404193781634128, "grad_norm": 1.0746790175253347, "learning_rate": 2.0001910847304893e-07, "logits/chosen": -1.0120949745178223, "logits/rejected": -0.7268694639205933, "logps/chosen": -0.017900846898555756, "logps/rejected": -7.724081039428711, "loss": 0.0353, "odds_ratio_loss": 0.00031501834746450186, "rewards/accuracies": 1.0, "rewards/chosen": -0.0017900847597047687, "rewards/margins": 0.7706180810928345, "rewards/rejected": -0.7724080681800842, "sft_loss": 0.017900846898555756, "step": 3737 }, { "epoch": 5.405639913232104, "grad_norm": 1.0475786833357763, "learning_rate": 1.9905039975594008e-07, "logits/chosen": -1.074700117111206, "logits/rejected": -0.692499577999115, "logps/chosen": -0.04430760070681572, "logps/rejected": -5.837507247924805, "loss": 0.0468, "odds_ratio_loss": 0.0013199535897001624, "rewards/accuracies": 1.0, "rewards/chosen": -0.004430760163813829, "rewards/margins": 0.579319953918457, "rewards/rejected": -0.5837507247924805, "sft_loss": 0.04430760070681572, "step": 3738 }, { "epoch": 5.40708604483008, "grad_norm": 1.0466069420380553, "learning_rate": 1.9808398264550142e-07, "logits/chosen": -0.8342309594154358, "logits/rejected": -0.5902801752090454, "logps/chosen": -0.01617673598229885, "logps/rejected": -6.466917037963867, "loss": 0.0397, "odds_ratio_loss": 0.0010513067245483398, "rewards/accuracies": 1.0, "rewards/chosen": -0.0016176735516637564, "rewards/margins": 0.6450740098953247, "rewards/rejected": -0.6466916799545288, "sft_loss": 0.01617673598229885, "step": 3739 }, { "epoch": 5.408532176428055, "grad_norm": 1.1221433314828366, "learning_rate": 1.9711985772438998e-07, "logits/chosen": -0.9502919912338257, "logits/rejected": -0.8300921320915222, "logps/chosen": -0.03330773860216141, "logps/rejected": -3.3701939582824707, "loss": 0.0392, "odds_ratio_loss": 0.003339210757985711, "rewards/accuracies": 1.0, "rewards/chosen": -0.003330774139612913, "rewards/margins": 0.3336886465549469, "rewards/rejected": -0.3370194435119629, "sft_loss": 0.03330773860216141, "step": 3740 }, { "epoch": 5.40997830802603, "grad_norm": 0.82956288869114, "learning_rate": 1.9615802557387995e-07, "logits/chosen": -0.7121748924255371, "logits/rejected": -0.6548438668251038, "logps/chosen": -0.015078652650117874, "logps/rejected": -6.196207523345947, "loss": 0.0187, "odds_ratio_loss": 0.002625955268740654, "rewards/accuracies": 1.0, "rewards/chosen": -0.0015078652650117874, "rewards/margins": 0.6181129217147827, "rewards/rejected": -0.6196207404136658, "sft_loss": 0.015078652650117874, "step": 3741 }, { "epoch": 5.411424439624006, "grad_norm": 1.1509802573765, "learning_rate": 1.9519848677386207e-07, "logits/chosen": -1.094519019126892, "logits/rejected": -0.7549199461936951, "logps/chosen": -0.03511492535471916, "logps/rejected": -5.343645095825195, "loss": 0.0348, "odds_ratio_loss": 0.0017946298466995358, "rewards/accuracies": 1.0, "rewards/chosen": -0.0035114926286041737, "rewards/margins": 0.5308530330657959, "rewards/rejected": -0.5343644618988037, "sft_loss": 0.03511492535471916, "step": 3742 }, { "epoch": 5.412870571221982, "grad_norm": 1.1964810085365847, "learning_rate": 1.942412419028483e-07, "logits/chosen": -0.9508788585662842, "logits/rejected": -0.7533563375473022, "logps/chosen": -0.03399357944726944, "logps/rejected": -5.815676212310791, "loss": 0.051, "odds_ratio_loss": 0.0037674754858016968, "rewards/accuracies": 1.0, "rewards/chosen": -0.0033993578981608152, "rewards/margins": 0.5781682729721069, "rewards/rejected": -0.581567645072937, "sft_loss": 0.03399357944726944, "step": 3743 }, { "epoch": 5.414316702819956, "grad_norm": 1.0946416488877777, "learning_rate": 1.9328629153796317e-07, "logits/chosen": -1.065413236618042, "logits/rejected": -0.7837972640991211, "logps/chosen": -0.025034120306372643, "logps/rejected": -5.018168926239014, "loss": 0.0377, "odds_ratio_loss": 0.0013940563658252358, "rewards/accuracies": 1.0, "rewards/chosen": -0.002503412077203393, "rewards/margins": 0.49931347370147705, "rewards/rejected": -0.5018168687820435, "sft_loss": 0.025034120306372643, "step": 3744 }, { "epoch": 5.415762834417932, "grad_norm": 0.916282563166462, "learning_rate": 1.9233363625495057e-07, "logits/chosen": -0.8925954699516296, "logits/rejected": -0.6070317625999451, "logps/chosen": -0.018731582909822464, "logps/rejected": -3.630098342895508, "loss": 0.0252, "odds_ratio_loss": 0.0011892381589859724, "rewards/accuracies": 1.0, "rewards/chosen": -0.0018731581512838602, "rewards/margins": 0.36113670468330383, "rewards/rejected": -0.36300989985466003, "sft_loss": 0.018731582909822464, "step": 3745 }, { "epoch": 5.417208966015908, "grad_norm": 0.9606132598371431, "learning_rate": 1.9138327662817065e-07, "logits/chosen": -0.8515411615371704, "logits/rejected": -0.6688830852508545, "logps/chosen": -0.01788986660540104, "logps/rejected": -5.797184944152832, "loss": 0.0263, "odds_ratio_loss": 0.0017000783700495958, "rewards/accuracies": 1.0, "rewards/chosen": -0.0017889868468046188, "rewards/margins": 0.5779294967651367, "rewards/rejected": -0.5797185301780701, "sft_loss": 0.01788986660540104, "step": 3746 }, { "epoch": 5.4186550976138825, "grad_norm": 1.0628812282654512, "learning_rate": 1.9043521323059752e-07, "logits/chosen": -0.8878743052482605, "logits/rejected": -0.6467165946960449, "logps/chosen": -0.061739467084407806, "logps/rejected": -6.201615810394287, "loss": 0.0443, "odds_ratio_loss": 0.001052954001352191, "rewards/accuracies": 1.0, "rewards/chosen": -0.006173947360366583, "rewards/margins": 0.6139876842498779, "rewards/rejected": -0.6201615929603577, "sft_loss": 0.061739467084407806, "step": 3747 }, { "epoch": 5.420101229211858, "grad_norm": 1.0019449779058243, "learning_rate": 1.8948944663382328e-07, "logits/chosen": -0.8111994862556458, "logits/rejected": -0.5611193180084229, "logps/chosen": -0.023934748023748398, "logps/rejected": -6.190435409545898, "loss": 0.0274, "odds_ratio_loss": 0.0013642843114212155, "rewards/accuracies": 1.0, "rewards/chosen": -0.0023934748023748398, "rewards/margins": 0.6166501045227051, "rewards/rejected": -0.6190435290336609, "sft_loss": 0.023934748023748398, "step": 3748 }, { "epoch": 5.421547360809834, "grad_norm": 3.0322591142715507, "learning_rate": 1.8854597740805267e-07, "logits/chosen": -0.8009194731712341, "logits/rejected": -0.5865261554718018, "logps/chosen": -0.047983746975660324, "logps/rejected": -4.174554824829102, "loss": 0.0246, "odds_ratio_loss": 0.001906828721985221, "rewards/accuracies": 1.0, "rewards/chosen": -0.004798375070095062, "rewards/margins": 0.41265714168548584, "rewards/rejected": -0.4174554944038391, "sft_loss": 0.047983746975660324, "step": 3749 }, { "epoch": 5.422993492407809, "grad_norm": 1.2038166729132256, "learning_rate": 1.8760480612210848e-07, "logits/chosen": -0.7556824088096619, "logits/rejected": -0.704839289188385, "logps/chosen": -0.046191371977329254, "logps/rejected": -3.388821601867676, "loss": 0.0348, "odds_ratio_loss": 0.004520849324762821, "rewards/accuracies": 1.0, "rewards/chosen": -0.00461913738399744, "rewards/margins": 0.33426302671432495, "rewards/rejected": -0.338882178068161, "sft_loss": 0.046191371977329254, "step": 3750 }, { "epoch": 5.424439624005784, "grad_norm": 1.366842333937244, "learning_rate": 1.866659333434244e-07, "logits/chosen": -0.8494126796722412, "logits/rejected": -0.5990194082260132, "logps/chosen": -0.06559625267982483, "logps/rejected": -5.316402912139893, "loss": 0.0417, "odds_ratio_loss": 0.005418956745415926, "rewards/accuracies": 1.0, "rewards/chosen": -0.006559625267982483, "rewards/margins": 0.525080680847168, "rewards/rejected": -0.531640350818634, "sft_loss": 0.06559625267982483, "step": 3751 }, { "epoch": 5.42588575560376, "grad_norm": 1.0710079807891604, "learning_rate": 1.8572935963805246e-07, "logits/chosen": -0.991873562335968, "logits/rejected": -0.8343634605407715, "logps/chosen": -0.026091061532497406, "logps/rejected": -5.874682426452637, "loss": 0.0445, "odds_ratio_loss": 0.0013955554459244013, "rewards/accuracies": 1.0, "rewards/chosen": -0.002609106246381998, "rewards/margins": 0.5848591327667236, "rewards/rejected": -0.5874682664871216, "sft_loss": 0.026091061532497406, "step": 3752 }, { "epoch": 5.427331887201736, "grad_norm": 1.127959440254014, "learning_rate": 1.8479508557065525e-07, "logits/chosen": -0.8945366740226746, "logits/rejected": -0.5752571821212769, "logps/chosen": -0.012399179860949516, "logps/rejected": -8.56202507019043, "loss": 0.0271, "odds_ratio_loss": 0.0005926304729655385, "rewards/accuracies": 1.0, "rewards/chosen": -0.001239917939528823, "rewards/margins": 0.8549625873565674, "rewards/rejected": -0.8562024235725403, "sft_loss": 0.012399179860949516, "step": 3753 }, { "epoch": 5.428778018799711, "grad_norm": 1.0389549793471313, "learning_rate": 1.838631117045102e-07, "logits/chosen": -0.8418703079223633, "logits/rejected": -0.6339346766471863, "logps/chosen": -0.013523176312446594, "logps/rejected": -6.4567999839782715, "loss": 0.0346, "odds_ratio_loss": 0.0007304397877305746, "rewards/accuracies": 1.0, "rewards/chosen": -0.0013523175148293376, "rewards/margins": 0.6443277597427368, "rewards/rejected": -0.6456800103187561, "sft_loss": 0.013523176312446594, "step": 3754 }, { "epoch": 5.430224150397686, "grad_norm": 1.2868339892184242, "learning_rate": 1.8293343860150823e-07, "logits/chosen": -0.9397022128105164, "logits/rejected": -0.6365588903427124, "logps/chosen": -0.04777473583817482, "logps/rejected": -4.0239644050598145, "loss": 0.0416, "odds_ratio_loss": 0.0024959403090178967, "rewards/accuracies": 1.0, "rewards/chosen": -0.004777473863214254, "rewards/margins": 0.39761897921562195, "rewards/rejected": -0.40239647030830383, "sft_loss": 0.04777473583817482, "step": 3755 }, { "epoch": 5.431670281995662, "grad_norm": 0.9056856128260714, "learning_rate": 1.8200606682215215e-07, "logits/chosen": -0.9845834374427795, "logits/rejected": -0.6842622756958008, "logps/chosen": -0.07160784304141998, "logps/rejected": -5.863983631134033, "loss": 0.0313, "odds_ratio_loss": 0.005471718031913042, "rewards/accuracies": 1.0, "rewards/chosen": -0.007160785607993603, "rewards/margins": 0.5792375802993774, "rewards/rejected": -0.5863983631134033, "sft_loss": 0.07160784304141998, "step": 3756 }, { "epoch": 5.433116413593637, "grad_norm": 1.1195527461945456, "learning_rate": 1.810809969255578e-07, "logits/chosen": -0.8662546873092651, "logits/rejected": -0.6356720924377441, "logps/chosen": -0.047709230333566666, "logps/rejected": -5.315559387207031, "loss": 0.0366, "odds_ratio_loss": 0.0005675168940797448, "rewards/accuracies": 1.0, "rewards/chosen": -0.004770922940224409, "rewards/margins": 0.5267850160598755, "rewards/rejected": -0.5315558910369873, "sft_loss": 0.047709230333566666, "step": 3757 }, { "epoch": 5.434562545191612, "grad_norm": 1.0522209896848032, "learning_rate": 1.801582294694537e-07, "logits/chosen": -0.690969705581665, "logits/rejected": -0.4468667805194855, "logps/chosen": -0.01992463506758213, "logps/rejected": -5.407593727111816, "loss": 0.0331, "odds_ratio_loss": 0.0010238731047138572, "rewards/accuracies": 1.0, "rewards/chosen": -0.0019924635998904705, "rewards/margins": 0.5387669205665588, "rewards/rejected": -0.5407594442367554, "sft_loss": 0.01992463506758213, "step": 3758 }, { "epoch": 5.436008676789588, "grad_norm": 1.1908143743026758, "learning_rate": 1.7923776501018017e-07, "logits/chosen": -0.8162471055984497, "logits/rejected": -0.5732954740524292, "logps/chosen": -0.04143443703651428, "logps/rejected": -6.230381011962891, "loss": 0.0379, "odds_ratio_loss": 0.00850686989724636, "rewards/accuracies": 1.0, "rewards/chosen": -0.004143443889915943, "rewards/margins": 0.618894636631012, "rewards/rejected": -0.623038113117218, "sft_loss": 0.04143443703651428, "step": 3759 }, { "epoch": 5.437454808387563, "grad_norm": 1.1326155288447952, "learning_rate": 1.783196041026871e-07, "logits/chosen": -1.0182111263275146, "logits/rejected": -0.8080993294715881, "logps/chosen": -0.028092101216316223, "logps/rejected": -3.2441256046295166, "loss": 0.0268, "odds_ratio_loss": 0.0010873202700167894, "rewards/accuracies": 1.0, "rewards/chosen": -0.0028092102147638798, "rewards/margins": 0.32160335779190063, "rewards/rejected": -0.3244125545024872, "sft_loss": 0.028092101216316223, "step": 3760 }, { "epoch": 5.438900939985539, "grad_norm": 1.0215706085996055, "learning_rate": 1.774037473005392e-07, "logits/chosen": -1.03084135055542, "logits/rejected": -0.6289142370223999, "logps/chosen": -0.04927384853363037, "logps/rejected": -6.17075252532959, "loss": 0.0429, "odds_ratio_loss": 0.0005610902444459498, "rewards/accuracies": 1.0, "rewards/chosen": -0.004927385598421097, "rewards/margins": 0.612147867679596, "rewards/rejected": -0.6170752048492432, "sft_loss": 0.04927384853363037, "step": 3761 }, { "epoch": 5.440347071583514, "grad_norm": 0.7157495419275686, "learning_rate": 1.7649019515590902e-07, "logits/chosen": -0.9550158381462097, "logits/rejected": -0.5582857131958008, "logps/chosen": -0.010214051231741905, "logps/rejected": -5.4750213623046875, "loss": 0.0143, "odds_ratio_loss": 0.000640549638774246, "rewards/accuracies": 1.0, "rewards/chosen": -0.0010214050998911262, "rewards/margins": 0.5464807152748108, "rewards/rejected": -0.5475021600723267, "sft_loss": 0.010214051231741905, "step": 3762 }, { "epoch": 5.44179320318149, "grad_norm": 1.2906910872242197, "learning_rate": 1.7557894821957996e-07, "logits/chosen": -0.8110982179641724, "logits/rejected": -0.6344393491744995, "logps/chosen": -0.032719340175390244, "logps/rejected": -4.284968376159668, "loss": 0.0422, "odds_ratio_loss": 0.0016023035859689116, "rewards/accuracies": 1.0, "rewards/chosen": -0.0032719343435019255, "rewards/margins": 0.4252249002456665, "rewards/rejected": -0.4284968376159668, "sft_loss": 0.032719340175390244, "step": 3763 }, { "epoch": 5.443239334779465, "grad_norm": 1.20926488363375, "learning_rate": 1.7467000704094635e-07, "logits/chosen": -1.0415936708450317, "logits/rejected": -0.7036805748939514, "logps/chosen": -0.07510319352149963, "logps/rejected": -5.472287178039551, "loss": 0.0323, "odds_ratio_loss": 0.004779131151735783, "rewards/accuracies": 1.0, "rewards/chosen": -0.007510320283472538, "rewards/margins": 0.5397183895111084, "rewards/rejected": -0.5472287535667419, "sft_loss": 0.07510319352149963, "step": 3764 }, { "epoch": 5.44468546637744, "grad_norm": 1.705118382870525, "learning_rate": 1.737633721680134e-07, "logits/chosen": -0.8406180739402771, "logits/rejected": -0.6496351957321167, "logps/chosen": -0.037446528673172, "logps/rejected": -6.6907958984375, "loss": 0.0355, "odds_ratio_loss": 0.002988762455061078, "rewards/accuracies": 1.0, "rewards/chosen": -0.003744652960449457, "rewards/margins": 0.665334939956665, "rewards/rejected": -0.669079601764679, "sft_loss": 0.037446528673172, "step": 3765 }, { "epoch": 5.446131597975416, "grad_norm": 1.0836249668191964, "learning_rate": 1.7285904414739316e-07, "logits/chosen": -0.6858553886413574, "logits/rejected": -0.6467568278312683, "logps/chosen": -0.033721573650836945, "logps/rejected": -4.718149185180664, "loss": 0.032, "odds_ratio_loss": 0.002487786579877138, "rewards/accuracies": 1.0, "rewards/chosen": -0.0033721576910465956, "rewards/margins": 0.46844279766082764, "rewards/rejected": -0.47181493043899536, "sft_loss": 0.033721573650836945, "step": 3766 }, { "epoch": 5.447577729573391, "grad_norm": 1.2582651528740414, "learning_rate": 1.7195702352430907e-07, "logits/chosen": -0.7453847527503967, "logits/rejected": -0.525572657585144, "logps/chosen": -0.05578567460179329, "logps/rejected": -5.3713698387146, "loss": 0.0575, "odds_ratio_loss": 0.0022328821942210197, "rewards/accuracies": 1.0, "rewards/chosen": -0.005578567273914814, "rewards/margins": 0.5315583944320679, "rewards/rejected": -0.537136971950531, "sft_loss": 0.05578567460179329, "step": 3767 }, { "epoch": 5.449023861171367, "grad_norm": 1.1303720815241651, "learning_rate": 1.7105731084259278e-07, "logits/chosen": -0.7345448732376099, "logits/rejected": -0.5634473562240601, "logps/chosen": -0.04168213903903961, "logps/rejected": -5.113981246948242, "loss": 0.0329, "odds_ratio_loss": 0.005578060168772936, "rewards/accuracies": 1.0, "rewards/chosen": -0.004168213810771704, "rewards/margins": 0.5072299242019653, "rewards/rejected": -0.5113981366157532, "sft_loss": 0.04168213903903961, "step": 3768 }, { "epoch": 5.450469992769342, "grad_norm": 1.005911181986314, "learning_rate": 1.7015990664468415e-07, "logits/chosen": -1.0068395137786865, "logits/rejected": -0.6776146292686462, "logps/chosen": -0.04365962743759155, "logps/rejected": -4.5361223220825195, "loss": 0.0292, "odds_ratio_loss": 0.003663485636934638, "rewards/accuracies": 1.0, "rewards/chosen": -0.00436596293002367, "rewards/margins": 0.44924625754356384, "rewards/rejected": -0.45361220836639404, "sft_loss": 0.04365962743759155, "step": 3769 }, { "epoch": 5.451916124367317, "grad_norm": 1.4564721360998285, "learning_rate": 1.6926481147163173e-07, "logits/chosen": -0.907903790473938, "logits/rejected": -0.6354748010635376, "logps/chosen": -0.03070155158638954, "logps/rejected": -5.978352069854736, "loss": 0.0857, "odds_ratio_loss": 0.0009719114750623703, "rewards/accuracies": 1.0, "rewards/chosen": -0.003070155158638954, "rewards/margins": 0.5947650671005249, "rewards/rejected": -0.5978351831436157, "sft_loss": 0.03070155158638954, "step": 3770 }, { "epoch": 5.453362255965293, "grad_norm": 1.0511769484125821, "learning_rate": 1.6837202586309185e-07, "logits/chosen": -0.9518678188323975, "logits/rejected": -0.609727144241333, "logps/chosen": -0.014927854761481285, "logps/rejected": -7.192644119262695, "loss": 0.0293, "odds_ratio_loss": 0.0006604095688089728, "rewards/accuracies": 1.0, "rewards/chosen": -0.0014927855227142572, "rewards/margins": 0.7177716493606567, "rewards/rejected": -0.7192643880844116, "sft_loss": 0.014927854761481285, "step": 3771 }, { "epoch": 5.4548083875632685, "grad_norm": 1.347646014723408, "learning_rate": 1.6748155035732813e-07, "logits/chosen": -1.0877472162246704, "logits/rejected": -0.6998102068901062, "logps/chosen": -0.053270746022462845, "logps/rejected": -6.024081707000732, "loss": 0.0624, "odds_ratio_loss": 0.0028402383904904127, "rewards/accuracies": 1.0, "rewards/chosen": -0.005327074788510799, "rewards/margins": 0.5970811247825623, "rewards/rejected": -0.6024081707000732, "sft_loss": 0.053270746022462845, "step": 3772 }, { "epoch": 5.456254519161243, "grad_norm": 0.7840991919195984, "learning_rate": 1.6659338549121117e-07, "logits/chosen": -0.6631830334663391, "logits/rejected": -0.6605867743492126, "logps/chosen": -0.004134493414312601, "logps/rejected": -4.816030502319336, "loss": 0.0235, "odds_ratio_loss": 0.0005341377691365778, "rewards/accuracies": 1.0, "rewards/chosen": -0.000413449335610494, "rewards/margins": 0.4811896085739136, "rewards/rejected": -0.4816030263900757, "sft_loss": 0.004134493414312601, "step": 3773 }, { "epoch": 5.457700650759219, "grad_norm": 1.1194943030611022, "learning_rate": 1.6570753180021925e-07, "logits/chosen": -0.8060338497161865, "logits/rejected": -0.675667941570282, "logps/chosen": -0.06142951920628548, "logps/rejected": -4.382281303405762, "loss": 0.0487, "odds_ratio_loss": 0.008763323538005352, "rewards/accuracies": 1.0, "rewards/chosen": -0.006142952013760805, "rewards/margins": 0.43208521604537964, "rewards/rejected": -0.43822813034057617, "sft_loss": 0.06142951920628548, "step": 3774 }, { "epoch": 5.459146782357195, "grad_norm": 1.258915842060084, "learning_rate": 1.648239898184367e-07, "logits/chosen": -0.7317820191383362, "logits/rejected": -0.609990119934082, "logps/chosen": -0.047425124794244766, "logps/rejected": -4.409567832946777, "loss": 0.046, "odds_ratio_loss": 0.006635497324168682, "rewards/accuracies": 1.0, "rewards/chosen": -0.004742512945085764, "rewards/margins": 0.43621429800987244, "rewards/rejected": -0.4409567713737488, "sft_loss": 0.047425124794244766, "step": 3775 }, { "epoch": 5.46059291395517, "grad_norm": 1.3023881499886933, "learning_rate": 1.639427600785548e-07, "logits/chosen": -0.6300375461578369, "logits/rejected": -0.5426366329193115, "logps/chosen": -0.01603570207953453, "logps/rejected": -6.023974418640137, "loss": 0.0337, "odds_ratio_loss": 0.00035005330573767424, "rewards/accuracies": 1.0, "rewards/chosen": -0.0016035701846703887, "rewards/margins": 0.6007938385009766, "rewards/rejected": -0.6023973822593689, "sft_loss": 0.01603570207953453, "step": 3776 }, { "epoch": 5.462039045553145, "grad_norm": 1.1724540495948623, "learning_rate": 1.6306384311186938e-07, "logits/chosen": -0.6645495891571045, "logits/rejected": -0.45771369338035583, "logps/chosen": -0.013555881567299366, "logps/rejected": -5.649051666259766, "loss": 0.038, "odds_ratio_loss": 0.0011070972541347146, "rewards/accuracies": 1.0, "rewards/chosen": -0.0013555882032960653, "rewards/margins": 0.5635495781898499, "rewards/rejected": -0.5649051666259766, "sft_loss": 0.013555881567299366, "step": 3777 }, { "epoch": 5.463485177151121, "grad_norm": 1.0593367688075082, "learning_rate": 1.6218723944828416e-07, "logits/chosen": -0.9113006591796875, "logits/rejected": -0.8548449873924255, "logps/chosen": -0.052851203829050064, "logps/rejected": -3.4363300800323486, "loss": 0.0516, "odds_ratio_loss": 0.01034949254244566, "rewards/accuracies": 1.0, "rewards/chosen": -0.005285120103508234, "rewards/margins": 0.3383478820323944, "rewards/rejected": -0.3436330258846283, "sft_loss": 0.052851203829050064, "step": 3778 }, { "epoch": 5.4649313087490965, "grad_norm": 0.8107933568205687, "learning_rate": 1.6131294961630526e-07, "logits/chosen": -0.8878468871116638, "logits/rejected": -0.6497321724891663, "logps/chosen": -0.013816386461257935, "logps/rejected": -6.9916181564331055, "loss": 0.0238, "odds_ratio_loss": 0.0004862607456743717, "rewards/accuracies": 1.0, "rewards/chosen": -0.0013816386926919222, "rewards/margins": 0.697780191898346, "rewards/rejected": -0.6991618871688843, "sft_loss": 0.013816386461257935, "step": 3779 }, { "epoch": 5.466377440347071, "grad_norm": 1.1183439742376857, "learning_rate": 1.6044097414304614e-07, "logits/chosen": -1.0266106128692627, "logits/rejected": -0.7022008895874023, "logps/chosen": -0.0061122276820242405, "logps/rejected": -7.002909183502197, "loss": 0.0498, "odds_ratio_loss": 0.0008495476795360446, "rewards/accuracies": 1.0, "rewards/chosen": -0.0006112227565608919, "rewards/margins": 0.6996797323226929, "rewards/rejected": -0.7002909183502197, "sft_loss": 0.0061122276820242405, "step": 3780 }, { "epoch": 5.467823571945047, "grad_norm": 1.049467888638357, "learning_rate": 1.5957131355422315e-07, "logits/chosen": -0.73927241563797, "logits/rejected": -0.6167184114456177, "logps/chosen": -0.026645377278327942, "logps/rejected": -3.968277931213379, "loss": 0.0297, "odds_ratio_loss": 0.0038094024639576674, "rewards/accuracies": 1.0, "rewards/chosen": -0.0026645378675311804, "rewards/margins": 0.39416325092315674, "rewards/rejected": -0.3968278169631958, "sft_loss": 0.026645377278327942, "step": 3781 }, { "epoch": 5.469269703543023, "grad_norm": 0.9677568438433538, "learning_rate": 1.5870396837415868e-07, "logits/chosen": -0.8513559699058533, "logits/rejected": -0.7282320261001587, "logps/chosen": -0.008808178827166557, "logps/rejected": -4.3206939697265625, "loss": 0.0331, "odds_ratio_loss": 0.0005816483753733337, "rewards/accuracies": 1.0, "rewards/chosen": -0.0008808178827166557, "rewards/margins": 0.4311886429786682, "rewards/rejected": -0.43206942081451416, "sft_loss": 0.008808178827166557, "step": 3782 }, { "epoch": 5.4707158351409975, "grad_norm": 1.0715870165143846, "learning_rate": 1.5783893912577794e-07, "logits/chosen": -0.9287527203559875, "logits/rejected": -0.66707444190979, "logps/chosen": -0.06083793565630913, "logps/rejected": -4.999859809875488, "loss": 0.0463, "odds_ratio_loss": 0.0012267936253920197, "rewards/accuracies": 1.0, "rewards/chosen": -0.006083793938159943, "rewards/margins": 0.49390220642089844, "rewards/rejected": -0.4999859929084778, "sft_loss": 0.06083793565630913, "step": 3783 }, { "epoch": 5.472161966738973, "grad_norm": 1.064733100710332, "learning_rate": 1.5697622633061137e-07, "logits/chosen": -1.0185866355895996, "logits/rejected": -0.7323493957519531, "logps/chosen": -0.02041870355606079, "logps/rejected": -5.43452787399292, "loss": 0.0333, "odds_ratio_loss": 0.0006175260059535503, "rewards/accuracies": 1.0, "rewards/chosen": -0.0020418702624738216, "rewards/margins": 0.5414108633995056, "rewards/rejected": -0.5434527397155762, "sft_loss": 0.02041870355606079, "step": 3784 }, { "epoch": 5.473608098336949, "grad_norm": 1.0432839302721317, "learning_rate": 1.5611583050878997e-07, "logits/chosen": -0.8604859113693237, "logits/rejected": -0.6487876176834106, "logps/chosen": -0.0667792409658432, "logps/rejected": -6.021456718444824, "loss": 0.0364, "odds_ratio_loss": 0.0029010814614593983, "rewards/accuracies": 1.0, "rewards/chosen": -0.006677924655377865, "rewards/margins": 0.5954678058624268, "rewards/rejected": -0.602145791053772, "sft_loss": 0.0667792409658432, "step": 3785 }, { "epoch": 5.4750542299349245, "grad_norm": 1.1140883737750038, "learning_rate": 1.5525775217905125e-07, "logits/chosen": -0.7549240589141846, "logits/rejected": -0.5772415399551392, "logps/chosen": -0.04872463271021843, "logps/rejected": -5.993658065795898, "loss": 0.0415, "odds_ratio_loss": 0.003810073481872678, "rewards/accuracies": 1.0, "rewards/chosen": -0.0048724631778895855, "rewards/margins": 0.5944933891296387, "rewards/rejected": -0.5993658304214478, "sft_loss": 0.04872463271021843, "step": 3786 }, { "epoch": 5.476500361532899, "grad_norm": 1.323116620766381, "learning_rate": 1.5440199185873294e-07, "logits/chosen": -0.9334354996681213, "logits/rejected": -0.9096828103065491, "logps/chosen": -0.049010857939720154, "logps/rejected": -4.4386420249938965, "loss": 0.0353, "odds_ratio_loss": 0.0016335193067789078, "rewards/accuracies": 1.0, "rewards/chosen": -0.00490108598023653, "rewards/margins": 0.4389631450176239, "rewards/rejected": -0.44386419653892517, "sft_loss": 0.049010857939720154, "step": 3787 }, { "epoch": 5.477946493130875, "grad_norm": 1.030544588663756, "learning_rate": 1.5354855006377565e-07, "logits/chosen": -1.0335475206375122, "logits/rejected": -0.6511580944061279, "logps/chosen": -0.041097480803728104, "logps/rejected": -5.674369812011719, "loss": 0.0245, "odds_ratio_loss": 0.00024172822304535657, "rewards/accuracies": 1.0, "rewards/chosen": -0.004109748173505068, "rewards/margins": 0.5633272528648376, "rewards/rejected": -0.5674369931221008, "sft_loss": 0.041097480803728104, "step": 3788 }, { "epoch": 5.479392624728851, "grad_norm": 1.1010779889775346, "learning_rate": 1.526974273087238e-07, "logits/chosen": -0.7723639011383057, "logits/rejected": -0.5067367553710938, "logps/chosen": -0.026965174823999405, "logps/rejected": -5.289927959442139, "loss": 0.0328, "odds_ratio_loss": 0.00020186560868751258, "rewards/accuracies": 1.0, "rewards/chosen": -0.0026965176220983267, "rewards/margins": 0.5262963175773621, "rewards/rejected": -0.5289928317070007, "sft_loss": 0.026965174823999405, "step": 3789 }, { "epoch": 5.4808387563268255, "grad_norm": 1.0022160795883452, "learning_rate": 1.518486241067216e-07, "logits/chosen": -0.8188205361366272, "logits/rejected": -0.5412254333496094, "logps/chosen": -0.030381806194782257, "logps/rejected": -5.232437610626221, "loss": 0.0318, "odds_ratio_loss": 0.0006156917079351842, "rewards/accuracies": 1.0, "rewards/chosen": -0.0030381809920072556, "rewards/margins": 0.520205557346344, "rewards/rejected": -0.52324378490448, "sft_loss": 0.030381806194782257, "step": 3790 }, { "epoch": 5.482284887924801, "grad_norm": 0.8690394603358813, "learning_rate": 1.5100214096951658e-07, "logits/chosen": -0.923912525177002, "logits/rejected": -0.7971416115760803, "logps/chosen": -0.01047974918037653, "logps/rejected": -5.119403839111328, "loss": 0.011, "odds_ratio_loss": 0.0005628624348901212, "rewards/accuracies": 1.0, "rewards/chosen": -0.0010479751508682966, "rewards/margins": 0.510892391204834, "rewards/rejected": -0.5119403600692749, "sft_loss": 0.01047974918037653, "step": 3791 }, { "epoch": 5.483731019522777, "grad_norm": 1.065241764993738, "learning_rate": 1.5015797840745515e-07, "logits/chosen": -1.1814494132995605, "logits/rejected": -0.6983795166015625, "logps/chosen": -0.02755448967218399, "logps/rejected": -5.53033971786499, "loss": 0.0342, "odds_ratio_loss": 0.00043123989598825574, "rewards/accuracies": 1.0, "rewards/chosen": -0.0027554489206522703, "rewards/margins": 0.5502785444259644, "rewards/rejected": -0.5530340075492859, "sft_loss": 0.02755448967218399, "step": 3792 }, { "epoch": 5.485177151120752, "grad_norm": 0.9422856535843215, "learning_rate": 1.4931613692948753e-07, "logits/chosen": -0.9157315492630005, "logits/rejected": -0.8082464933395386, "logps/chosen": -0.010187807492911816, "logps/rejected": -4.607365608215332, "loss": 0.0246, "odds_ratio_loss": 0.001327059231698513, "rewards/accuracies": 1.0, "rewards/chosen": -0.0010187807492911816, "rewards/margins": 0.4597177505493164, "rewards/rejected": -0.4607365131378174, "sft_loss": 0.010187807492911816, "step": 3793 }, { "epoch": 5.486623282718727, "grad_norm": 1.2102618901021103, "learning_rate": 1.484766170431624e-07, "logits/chosen": -0.7651352286338806, "logits/rejected": -0.6443876028060913, "logps/chosen": -0.050633467733860016, "logps/rejected": -5.098237991333008, "loss": 0.0429, "odds_ratio_loss": 0.001227756729349494, "rewards/accuracies": 1.0, "rewards/chosen": -0.005063347052782774, "rewards/margins": 0.5047605037689209, "rewards/rejected": -0.5098237991333008, "sft_loss": 0.050633467733860016, "step": 3794 }, { "epoch": 5.488069414316703, "grad_norm": 1.1833110758817549, "learning_rate": 1.4763941925462954e-07, "logits/chosen": -0.8539189100265503, "logits/rejected": -0.7393923401832581, "logps/chosen": -0.05777610465884209, "logps/rejected": -3.9655282497406006, "loss": 0.0452, "odds_ratio_loss": 0.00516713410615921, "rewards/accuracies": 1.0, "rewards/chosen": -0.005777610465884209, "rewards/margins": 0.390775203704834, "rewards/rejected": -0.39655283093452454, "sft_loss": 0.05777610465884209, "step": 3795 }, { "epoch": 5.489515545914678, "grad_norm": 1.214943791765227, "learning_rate": 1.4680454406863763e-07, "logits/chosen": -0.6452954411506653, "logits/rejected": -0.5753892660140991, "logps/chosen": -0.015782607719302177, "logps/rejected": -5.62052059173584, "loss": 0.0454, "odds_ratio_loss": 0.00034531878191046417, "rewards/accuracies": 1.0, "rewards/chosen": -0.001578260911628604, "rewards/margins": 0.5604738593101501, "rewards/rejected": -0.5620521306991577, "sft_loss": 0.015782607719302177, "step": 3796 }, { "epoch": 5.4909616775126535, "grad_norm": 1.4352775561501738, "learning_rate": 1.4597199198853782e-07, "logits/chosen": -1.0027751922607422, "logits/rejected": -0.7584199905395508, "logps/chosen": -0.02233309857547283, "logps/rejected": -6.617671489715576, "loss": 0.0524, "odds_ratio_loss": 0.0024742737878113985, "rewards/accuracies": 1.0, "rewards/chosen": -0.0022333101369440556, "rewards/margins": 0.6595338582992554, "rewards/rejected": -0.6617671847343445, "sft_loss": 0.02233309857547283, "step": 3797 }, { "epoch": 5.492407809110629, "grad_norm": 1.23422188608422, "learning_rate": 1.451417635162775e-07, "logits/chosen": -0.8085861802101135, "logits/rejected": -0.6014904975891113, "logps/chosen": -0.0513208769261837, "logps/rejected": -5.220034599304199, "loss": 0.0471, "odds_ratio_loss": 0.0029555868823081255, "rewards/accuracies": 1.0, "rewards/chosen": -0.005132087506353855, "rewards/margins": 0.5168713331222534, "rewards/rejected": -0.5220034122467041, "sft_loss": 0.0513208769261837, "step": 3798 }, { "epoch": 5.493853940708605, "grad_norm": 1.1457292152366192, "learning_rate": 1.4431385915240513e-07, "logits/chosen": -0.862919270992279, "logits/rejected": -0.627816379070282, "logps/chosen": -0.03642456978559494, "logps/rejected": -5.0828752517700195, "loss": 0.0481, "odds_ratio_loss": 0.003557011019438505, "rewards/accuracies": 1.0, "rewards/chosen": -0.0036424570716917515, "rewards/margins": 0.5046451091766357, "rewards/rejected": -0.5082875490188599, "sft_loss": 0.03642456978559494, "step": 3799 }, { "epoch": 5.49530007230658, "grad_norm": 0.9247872005994776, "learning_rate": 1.4348827939606723e-07, "logits/chosen": -0.8906946182250977, "logits/rejected": -0.6677629351615906, "logps/chosen": -0.01218663901090622, "logps/rejected": -5.341521739959717, "loss": 0.034, "odds_ratio_loss": 0.0004317264538258314, "rewards/accuracies": 1.0, "rewards/chosen": -0.0012186639942228794, "rewards/margins": 0.5329335331916809, "rewards/rejected": -0.5341522097587585, "sft_loss": 0.01218663901090622, "step": 3800 }, { "epoch": 5.496746203904555, "grad_norm": 0.9295214085597644, "learning_rate": 1.426650247450092e-07, "logits/chosen": -0.8590705990791321, "logits/rejected": -0.7467610836029053, "logps/chosen": -0.012503638863563538, "logps/rejected": -5.067593097686768, "loss": 0.0382, "odds_ratio_loss": 0.0009685508557595313, "rewards/accuracies": 1.0, "rewards/chosen": -0.0012503638863563538, "rewards/margins": 0.5055089592933655, "rewards/rejected": -0.5067592859268188, "sft_loss": 0.012503638863563538, "step": 3801 }, { "epoch": 5.498192335502531, "grad_norm": 0.9849191492306798, "learning_rate": 1.41844095695574e-07, "logits/chosen": -1.0076994895935059, "logits/rejected": -0.4970240294933319, "logps/chosen": -0.03561976179480553, "logps/rejected": -7.417174339294434, "loss": 0.0355, "odds_ratio_loss": 0.0002764179080259055, "rewards/accuracies": 1.0, "rewards/chosen": -0.0035619763657450676, "rewards/margins": 0.7381554841995239, "rewards/rejected": -0.7417174577713013, "sft_loss": 0.03561976179480553, "step": 3802 }, { "epoch": 5.499638467100506, "grad_norm": 1.3091784075782222, "learning_rate": 1.4102549274270214e-07, "logits/chosen": -0.9715660214424133, "logits/rejected": -0.8041377067565918, "logps/chosen": -0.05225411802530289, "logps/rejected": -5.449763298034668, "loss": 0.0405, "odds_ratio_loss": 0.004342366475611925, "rewards/accuracies": 1.0, "rewards/chosen": -0.005225412547588348, "rewards/margins": 0.5397509336471558, "rewards/rejected": -0.5449763536453247, "sft_loss": 0.05225411802530289, "step": 3803 }, { "epoch": 5.5010845986984815, "grad_norm": 1.0966034645413594, "learning_rate": 1.4020921637993356e-07, "logits/chosen": -1.114017367362976, "logits/rejected": -0.7877224683761597, "logps/chosen": -0.016746368259191513, "logps/rejected": -5.36176061630249, "loss": 0.0411, "odds_ratio_loss": 0.00036486214958131313, "rewards/accuracies": 1.0, "rewards/chosen": -0.0016746367327868938, "rewards/margins": 0.5345014333724976, "rewards/rejected": -0.5361760854721069, "sft_loss": 0.016746368259191513, "step": 3804 }, { "epoch": 5.502530730296457, "grad_norm": 1.1649674357871873, "learning_rate": 1.3939526709940342e-07, "logits/chosen": -0.9118725061416626, "logits/rejected": -0.7113963961601257, "logps/chosen": -0.04102957993745804, "logps/rejected": -4.159704208374023, "loss": 0.0406, "odds_ratio_loss": 0.0027073349338024855, "rewards/accuracies": 1.0, "rewards/chosen": -0.004102957900613546, "rewards/margins": 0.4118674397468567, "rewards/rejected": -0.41597044467926025, "sft_loss": 0.04102957993745804, "step": 3805 }, { "epoch": 5.503976861894432, "grad_norm": 1.1222388477018694, "learning_rate": 1.385836453918454e-07, "logits/chosen": -0.8803320527076721, "logits/rejected": -0.738538920879364, "logps/chosen": -0.03088982217013836, "logps/rejected": -3.4778146743774414, "loss": 0.0437, "odds_ratio_loss": 0.0015149106038734317, "rewards/accuracies": 1.0, "rewards/chosen": -0.0030889820773154497, "rewards/margins": 0.3446924686431885, "rewards/rejected": -0.3477814793586731, "sft_loss": 0.03088982217013836, "step": 3806 }, { "epoch": 5.505422993492408, "grad_norm": 0.8878311657170987, "learning_rate": 1.3777435174658903e-07, "logits/chosen": -1.147339105606079, "logits/rejected": -0.8466560244560242, "logps/chosen": -0.02538106217980385, "logps/rejected": -4.762484550476074, "loss": 0.0388, "odds_ratio_loss": 0.001554648159071803, "rewards/accuracies": 1.0, "rewards/chosen": -0.0025381064042448997, "rewards/margins": 0.4737103581428528, "rewards/rejected": -0.47624844312667847, "sft_loss": 0.02538106217980385, "step": 3807 }, { "epoch": 5.506869125090383, "grad_norm": 1.1476764572136273, "learning_rate": 1.3696738665156038e-07, "logits/chosen": -0.9432680010795593, "logits/rejected": -0.7418482303619385, "logps/chosen": -0.062033966183662415, "logps/rejected": -4.3220930099487305, "loss": 0.0452, "odds_ratio_loss": 0.0031421349849551916, "rewards/accuracies": 1.0, "rewards/chosen": -0.006203396245837212, "rewards/margins": 0.42600592970848083, "rewards/rejected": -0.432209312915802, "sft_loss": 0.062033966183662415, "step": 3808 }, { "epoch": 5.508315256688359, "grad_norm": 0.946154333369156, "learning_rate": 1.361627505932823e-07, "logits/chosen": -1.051152229309082, "logits/rejected": -0.6539809703826904, "logps/chosen": -0.022800182923674583, "logps/rejected": -4.930760383605957, "loss": 0.0418, "odds_ratio_loss": 0.0010601935209706426, "rewards/accuracies": 1.0, "rewards/chosen": -0.0022800182923674583, "rewards/margins": 0.4907959997653961, "rewards/rejected": -0.49307602643966675, "sft_loss": 0.022800182923674583, "step": 3809 }, { "epoch": 5.509761388286334, "grad_norm": 1.1866627966271115, "learning_rate": 1.3536044405687208e-07, "logits/chosen": -0.6882243156433105, "logits/rejected": -0.4875558018684387, "logps/chosen": -0.035662174224853516, "logps/rejected": -5.658996105194092, "loss": 0.0525, "odds_ratio_loss": 0.0008063373970799148, "rewards/accuracies": 1.0, "rewards/chosen": -0.0035662176087498665, "rewards/margins": 0.5623334646224976, "rewards/rejected": -0.565899670124054, "sft_loss": 0.035662174224853516, "step": 3810 }, { "epoch": 5.51120751988431, "grad_norm": 0.9004650888610211, "learning_rate": 1.3456046752604323e-07, "logits/chosen": -1.0605201721191406, "logits/rejected": -0.5877838134765625, "logps/chosen": -0.010158369317650795, "logps/rejected": -6.685586929321289, "loss": 0.0362, "odds_ratio_loss": 0.0003088012454099953, "rewards/accuracies": 1.0, "rewards/chosen": -0.0010158369550481439, "rewards/margins": 0.6675429344177246, "rewards/rejected": -0.6685587167739868, "sft_loss": 0.010158369317650795, "step": 3811 }, { "epoch": 5.512653651482285, "grad_norm": 1.1943879093949175, "learning_rate": 1.3376282148310457e-07, "logits/chosen": -0.8050583600997925, "logits/rejected": -0.7147977948188782, "logps/chosen": -0.01518384087830782, "logps/rejected": -6.681069374084473, "loss": 0.0403, "odds_ratio_loss": 0.0011472441256046295, "rewards/accuracies": 1.0, "rewards/chosen": -0.001518384087830782, "rewards/margins": 0.666588544845581, "rewards/rejected": -0.6681069731712341, "sft_loss": 0.01518384087830782, "step": 3812 }, { "epoch": 5.51409978308026, "grad_norm": 1.362313364484537, "learning_rate": 1.3296750640896126e-07, "logits/chosen": -1.0888941287994385, "logits/rejected": -0.8052299618721008, "logps/chosen": -0.03261774033308029, "logps/rejected": -5.1450724601745605, "loss": 0.0468, "odds_ratio_loss": 0.0014005877310410142, "rewards/accuracies": 1.0, "rewards/chosen": -0.0032617738470435143, "rewards/margins": 0.5112454295158386, "rewards/rejected": -0.5145072937011719, "sft_loss": 0.03261774033308029, "step": 3813 }, { "epoch": 5.515545914678236, "grad_norm": 1.286476278575021, "learning_rate": 1.3217452278311014e-07, "logits/chosen": -0.7903501391410828, "logits/rejected": -0.6983763575553894, "logps/chosen": -0.01914118230342865, "logps/rejected": -5.037604331970215, "loss": 0.0488, "odds_ratio_loss": 0.0035344092175364494, "rewards/accuracies": 1.0, "rewards/chosen": -0.0019141181837767363, "rewards/margins": 0.5018463730812073, "rewards/rejected": -0.5037604570388794, "sft_loss": 0.01914118230342865, "step": 3814 }, { "epoch": 5.516992046276211, "grad_norm": 1.2606595706769816, "learning_rate": 1.3138387108364478e-07, "logits/chosen": -1.197737216949463, "logits/rejected": -0.8296566009521484, "logps/chosen": -0.052802085876464844, "logps/rejected": -4.593600749969482, "loss": 0.0515, "odds_ratio_loss": 0.003228149376809597, "rewards/accuracies": 1.0, "rewards/chosen": -0.005280209239572287, "rewards/margins": 0.45407986640930176, "rewards/rejected": -0.4593600630760193, "sft_loss": 0.052802085876464844, "step": 3815 }, { "epoch": 5.518438177874186, "grad_norm": 1.322864716916177, "learning_rate": 1.3059555178725145e-07, "logits/chosen": -0.8976123929023743, "logits/rejected": -0.626125156879425, "logps/chosen": -0.020292531698942184, "logps/rejected": -5.278232574462891, "loss": 0.0306, "odds_ratio_loss": 0.0016008391976356506, "rewards/accuracies": 1.0, "rewards/chosen": -0.002029252937063575, "rewards/margins": 0.5257940292358398, "rewards/rejected": -0.527823269367218, "sft_loss": 0.020292531698942184, "step": 3816 }, { "epoch": 5.519884309472162, "grad_norm": 1.1411805522730452, "learning_rate": 1.2980956536921217e-07, "logits/chosen": -0.8470653295516968, "logits/rejected": -0.7692535519599915, "logps/chosen": -0.036463767290115356, "logps/rejected": -5.199902534484863, "loss": 0.0409, "odds_ratio_loss": 0.0013247218448668718, "rewards/accuracies": 1.0, "rewards/chosen": -0.0036463765427470207, "rewards/margins": 0.516343891620636, "rewards/rejected": -0.5199902653694153, "sft_loss": 0.036463767290115356, "step": 3817 }, { "epoch": 5.521330441070138, "grad_norm": 1.1491703460727254, "learning_rate": 1.2902591230339897e-07, "logits/chosen": -0.9239856600761414, "logits/rejected": -0.7358173727989197, "logps/chosen": -0.014124940149486065, "logps/rejected": -5.4396443367004395, "loss": 0.0579, "odds_ratio_loss": 0.0010359040461480618, "rewards/accuracies": 1.0, "rewards/chosen": -0.001412494108080864, "rewards/margins": 0.5425519347190857, "rewards/rejected": -0.5439644455909729, "sft_loss": 0.014124940149486065, "step": 3818 }, { "epoch": 5.522776572668112, "grad_norm": 1.3151003884044896, "learning_rate": 1.2824459306228064e-07, "logits/chosen": -0.6562463045120239, "logits/rejected": -0.5251370072364807, "logps/chosen": -0.04931947961449623, "logps/rejected": -4.668588638305664, "loss": 0.0453, "odds_ratio_loss": 0.002963886596262455, "rewards/accuracies": 1.0, "rewards/chosen": -0.004931948147714138, "rewards/margins": 0.4619269371032715, "rewards/rejected": -0.4668588638305664, "sft_loss": 0.04931947961449623, "step": 3819 }, { "epoch": 5.524222704266088, "grad_norm": 0.9885430023773936, "learning_rate": 1.2746560811691674e-07, "logits/chosen": -1.072084903717041, "logits/rejected": -0.89268958568573, "logps/chosen": -0.03535031899809837, "logps/rejected": -6.679300785064697, "loss": 0.04, "odds_ratio_loss": 0.0026512041222304106, "rewards/accuracies": 1.0, "rewards/chosen": -0.003535032505169511, "rewards/margins": 0.6643950939178467, "rewards/rejected": -0.6679300665855408, "sft_loss": 0.03535031899809837, "step": 3820 }, { "epoch": 5.525668835864064, "grad_norm": 1.061112092644839, "learning_rate": 1.2668895793696144e-07, "logits/chosen": -1.0231108665466309, "logits/rejected": -0.6869786977767944, "logps/chosen": -0.02733795717358589, "logps/rejected": -6.306743621826172, "loss": 0.0235, "odds_ratio_loss": 0.000864526373334229, "rewards/accuracies": 1.0, "rewards/chosen": -0.002733795903623104, "rewards/margins": 0.6279405355453491, "rewards/rejected": -0.6306743621826172, "sft_loss": 0.02733795717358589, "step": 3821 }, { "epoch": 5.527114967462039, "grad_norm": 1.308480386878777, "learning_rate": 1.2591464299065834e-07, "logits/chosen": -0.8179947137832642, "logits/rejected": -0.6588776111602783, "logps/chosen": -0.06917057186365128, "logps/rejected": -5.604111671447754, "loss": 0.0651, "odds_ratio_loss": 0.0042171357199549675, "rewards/accuracies": 1.0, "rewards/chosen": -0.006917057558894157, "rewards/margins": 0.5534940958023071, "rewards/rejected": -0.5604111552238464, "sft_loss": 0.06917057186365128, "step": 3822 }, { "epoch": 5.528561099060014, "grad_norm": 1.155715414947543, "learning_rate": 1.2514266374484606e-07, "logits/chosen": -0.9295008182525635, "logits/rejected": -0.6454451084136963, "logps/chosen": -0.03257459029555321, "logps/rejected": -6.534205436706543, "loss": 0.0209, "odds_ratio_loss": 0.0021665338426828384, "rewards/accuracies": 1.0, "rewards/chosen": -0.0032574590295553207, "rewards/margins": 0.6501630544662476, "rewards/rejected": -0.6534205079078674, "sft_loss": 0.03257459029555321, "step": 3823 }, { "epoch": 5.53000723065799, "grad_norm": 0.9684929471760406, "learning_rate": 1.243730206649527e-07, "logits/chosen": -1.0152008533477783, "logits/rejected": -0.8463742733001709, "logps/chosen": -0.026521919295191765, "logps/rejected": -4.2680816650390625, "loss": 0.0446, "odds_ratio_loss": 0.002520361915230751, "rewards/accuracies": 1.0, "rewards/chosen": -0.002652192022651434, "rewards/margins": 0.42415598034858704, "rewards/rejected": -0.4268081784248352, "sft_loss": 0.026521919295191765, "step": 3824 }, { "epoch": 5.531453362255966, "grad_norm": 1.1373144870061254, "learning_rate": 1.2360571421500044e-07, "logits/chosen": -0.9026045799255371, "logits/rejected": -0.739682674407959, "logps/chosen": -0.033317435532808304, "logps/rejected": -4.749547481536865, "loss": 0.039, "odds_ratio_loss": 0.0017841738881543279, "rewards/accuracies": 1.0, "rewards/chosen": -0.003331744112074375, "rewards/margins": 0.471623033285141, "rewards/rejected": -0.4749547839164734, "sft_loss": 0.033317435532808304, "step": 3825 }, { "epoch": 5.53289949385394, "grad_norm": 0.9615381215348964, "learning_rate": 1.2284074485760009e-07, "logits/chosen": -0.8409866094589233, "logits/rejected": -0.6542403697967529, "logps/chosen": -0.0341576412320137, "logps/rejected": -4.307783126831055, "loss": 0.0338, "odds_ratio_loss": 0.00148858898319304, "rewards/accuracies": 1.0, "rewards/chosen": -0.003415764309465885, "rewards/margins": 0.4273625612258911, "rewards/rejected": -0.4307783246040344, "sft_loss": 0.0341576412320137, "step": 3826 }, { "epoch": 5.534345625451916, "grad_norm": 1.1509488481197494, "learning_rate": 1.2207811305395388e-07, "logits/chosen": -0.8256188035011292, "logits/rejected": -0.8999884128570557, "logps/chosen": -0.03968646377325058, "logps/rejected": -4.411059379577637, "loss": 0.0258, "odds_ratio_loss": 0.0018667414551600814, "rewards/accuracies": 1.0, "rewards/chosen": -0.0039686462841928005, "rewards/margins": 0.437137246131897, "rewards/rejected": -0.4411059319972992, "sft_loss": 0.03968646377325058, "step": 3827 }, { "epoch": 5.535791757049892, "grad_norm": 0.9584637218026467, "learning_rate": 1.213178192638571e-07, "logits/chosen": -0.7055195569992065, "logits/rejected": -0.49205851554870605, "logps/chosen": -0.04943579062819481, "logps/rejected": -6.112897872924805, "loss": 0.0412, "odds_ratio_loss": 0.0011792955920100212, "rewards/accuracies": 1.0, "rewards/chosen": -0.004943579435348511, "rewards/margins": 0.6063462495803833, "rewards/rejected": -0.6112898588180542, "sft_loss": 0.04943579062819481, "step": 3828 }, { "epoch": 5.537237888647867, "grad_norm": 1.1558628450281507, "learning_rate": 1.205598639456924e-07, "logits/chosen": -1.1022794246673584, "logits/rejected": -0.8147293329238892, "logps/chosen": -0.05759043246507645, "logps/rejected": -5.212984085083008, "loss": 0.0574, "odds_ratio_loss": 0.0012822567950934172, "rewards/accuracies": 1.0, "rewards/chosen": -0.005759043153375387, "rewards/margins": 0.5155394077301025, "rewards/rejected": -0.5212984085083008, "sft_loss": 0.05759043246507645, "step": 3829 }, { "epoch": 5.538684020245842, "grad_norm": 1.1812263848411928, "learning_rate": 1.198042475564347e-07, "logits/chosen": -0.9783557653427124, "logits/rejected": -0.8160287141799927, "logps/chosen": -0.09639565646648407, "logps/rejected": -7.522600173950195, "loss": 0.0447, "odds_ratio_loss": 0.008017424494028091, "rewards/accuracies": 1.0, "rewards/chosen": -0.009639564901590347, "rewards/margins": 0.7426204681396484, "rewards/rejected": -0.7522600293159485, "sft_loss": 0.09639565646648407, "step": 3830 }, { "epoch": 5.540130151843818, "grad_norm": 1.2066995822125324, "learning_rate": 1.1905097055164714e-07, "logits/chosen": -0.8420087099075317, "logits/rejected": -0.6522636413574219, "logps/chosen": -0.03687209263443947, "logps/rejected": -3.5625946521759033, "loss": 0.0414, "odds_ratio_loss": 0.0023647851776331663, "rewards/accuracies": 1.0, "rewards/chosen": -0.0036872094497084618, "rewards/margins": 0.3525722324848175, "rewards/rejected": -0.3562594950199127, "sft_loss": 0.03687209263443947, "step": 3831 }, { "epoch": 5.541576283441794, "grad_norm": 0.8654303496060788, "learning_rate": 1.1830003338548423e-07, "logits/chosen": -1.0921556949615479, "logits/rejected": -0.850192129611969, "logps/chosen": -0.013794208876788616, "logps/rejected": -6.353579521179199, "loss": 0.027, "odds_ratio_loss": 0.001722605200484395, "rewards/accuracies": 1.0, "rewards/chosen": -0.001379420980811119, "rewards/margins": 0.6339784860610962, "rewards/rejected": -0.6353579163551331, "sft_loss": 0.013794208876788616, "step": 3832 }, { "epoch": 5.5430224150397684, "grad_norm": 0.977867382746297, "learning_rate": 1.1755143651068822e-07, "logits/chosen": -0.7232560515403748, "logits/rejected": -0.6421326398849487, "logps/chosen": -0.03204415738582611, "logps/rejected": -4.54522705078125, "loss": 0.0241, "odds_ratio_loss": 0.005562347825616598, "rewards/accuracies": 1.0, "rewards/chosen": -0.003204415552318096, "rewards/margins": 0.45131826400756836, "rewards/rejected": -0.45452266931533813, "sft_loss": 0.03204415738582611, "step": 3833 }, { "epoch": 5.544468546637744, "grad_norm": 1.0993207812461236, "learning_rate": 1.1680518037859054e-07, "logits/chosen": -0.9139794111251831, "logits/rejected": -0.622349202632904, "logps/chosen": -0.04837263375520706, "logps/rejected": -7.495334625244141, "loss": 0.0411, "odds_ratio_loss": 0.005845530424267054, "rewards/accuracies": 1.0, "rewards/chosen": -0.004837263375520706, "rewards/margins": 0.7446962594985962, "rewards/rejected": -0.7495335340499878, "sft_loss": 0.04837263375520706, "step": 3834 }, { "epoch": 5.54591467823572, "grad_norm": 1.089833611551456, "learning_rate": 1.1606126543911177e-07, "logits/chosen": -0.9266623854637146, "logits/rejected": -0.5840288400650024, "logps/chosen": -0.028001118451356888, "logps/rejected": -7.177366733551025, "loss": 0.0306, "odds_ratio_loss": 0.0006162969511933625, "rewards/accuracies": 1.0, "rewards/chosen": -0.0028001118917018175, "rewards/margins": 0.7149365544319153, "rewards/rejected": -0.7177366018295288, "sft_loss": 0.028001118451356888, "step": 3835 }, { "epoch": 5.547360809833695, "grad_norm": 1.204977731108127, "learning_rate": 1.1531969214076198e-07, "logits/chosen": -0.8889631628990173, "logits/rejected": -0.7671388387680054, "logps/chosen": -0.04466234892606735, "logps/rejected": -3.038712501525879, "loss": 0.0505, "odds_ratio_loss": 0.0071504805237054825, "rewards/accuracies": 1.0, "rewards/chosen": -0.00446623470634222, "rewards/margins": 0.299405038356781, "rewards/rejected": -0.3038712739944458, "sft_loss": 0.04466234892606735, "step": 3836 }, { "epoch": 5.54880694143167, "grad_norm": 1.7867119071684434, "learning_rate": 1.1458046093063733e-07, "logits/chosen": -1.1271165609359741, "logits/rejected": -0.827365517616272, "logps/chosen": -0.00812723208218813, "logps/rejected": -5.488351821899414, "loss": 0.0466, "odds_ratio_loss": 0.0012041174340993166, "rewards/accuracies": 1.0, "rewards/chosen": -0.0008127232431434095, "rewards/margins": 0.548022449016571, "rewards/rejected": -0.5488352179527283, "sft_loss": 0.00812723208218813, "step": 3837 }, { "epoch": 5.550253073029646, "grad_norm": 1.1182672956514397, "learning_rate": 1.1384357225442398e-07, "logits/chosen": -0.706390380859375, "logits/rejected": -0.6600760817527771, "logps/chosen": -0.03536364808678627, "logps/rejected": -6.150212287902832, "loss": 0.0308, "odds_ratio_loss": 0.001387185649946332, "rewards/accuracies": 1.0, "rewards/chosen": -0.003536364994943142, "rewards/margins": 0.611484944820404, "rewards/rejected": -0.615021288394928, "sft_loss": 0.03536364808678627, "step": 3838 }, { "epoch": 5.551699204627621, "grad_norm": 1.0491739599778058, "learning_rate": 1.1310902655639454e-07, "logits/chosen": -0.7480611801147461, "logits/rejected": -0.7505444288253784, "logps/chosen": -0.03836183249950409, "logps/rejected": -4.293073654174805, "loss": 0.0509, "odds_ratio_loss": 0.0020930215250700712, "rewards/accuracies": 1.0, "rewards/chosen": -0.0038361833430826664, "rewards/margins": 0.4254711866378784, "rewards/rejected": -0.42930734157562256, "sft_loss": 0.03836183249950409, "step": 3839 }, { "epoch": 5.5531453362255965, "grad_norm": 1.2106392513909316, "learning_rate": 1.1237682427940942e-07, "logits/chosen": -0.8354946374893188, "logits/rejected": -0.564354658126831, "logps/chosen": -0.042547404766082764, "logps/rejected": -5.4656572341918945, "loss": 0.0358, "odds_ratio_loss": 0.0024263339582830667, "rewards/accuracies": 1.0, "rewards/chosen": -0.004254741128534079, "rewards/margins": 0.5423109531402588, "rewards/rejected": -0.5465657114982605, "sft_loss": 0.042547404766082764, "step": 3840 }, { "epoch": 5.554591467823572, "grad_norm": 1.115882395839301, "learning_rate": 1.1164696586491639e-07, "logits/chosen": -0.7860689163208008, "logits/rejected": -0.60820472240448, "logps/chosen": -0.038941726088523865, "logps/rejected": -5.316815376281738, "loss": 0.0391, "odds_ratio_loss": 0.005218683276325464, "rewards/accuracies": 1.0, "rewards/chosen": -0.00389417284168303, "rewards/margins": 0.5277873873710632, "rewards/rejected": -0.5316815376281738, "sft_loss": 0.038941726088523865, "step": 3841 }, { "epoch": 5.556037599421547, "grad_norm": 0.8934107860559392, "learning_rate": 1.1091945175294836e-07, "logits/chosen": -0.8913710713386536, "logits/rejected": -0.8013601303100586, "logps/chosen": -0.009748661890625954, "logps/rejected": -4.847626686096191, "loss": 0.0175, "odds_ratio_loss": 0.0006343543063849211, "rewards/accuracies": 1.0, "rewards/chosen": -0.0009748662123456597, "rewards/margins": 0.4837878346443176, "rewards/rejected": -0.48476266860961914, "sft_loss": 0.009748661890625954, "step": 3842 }, { "epoch": 5.557483731019523, "grad_norm": 1.15693764012382, "learning_rate": 1.1019428238212825e-07, "logits/chosen": -0.779668927192688, "logits/rejected": -0.6887148022651672, "logps/chosen": -0.05625636503100395, "logps/rejected": -5.346826076507568, "loss": 0.0428, "odds_ratio_loss": 0.007378511130809784, "rewards/accuracies": 1.0, "rewards/chosen": -0.005625636782497168, "rewards/margins": 0.5290569067001343, "rewards/rejected": -0.53468257188797, "sft_loss": 0.05625636503100395, "step": 3843 }, { "epoch": 5.558929862617498, "grad_norm": 1.0916272154464464, "learning_rate": 1.0947145818966186e-07, "logits/chosen": -0.8776683807373047, "logits/rejected": -0.6630545854568481, "logps/chosen": -0.01515410840511322, "logps/rejected": -6.905978679656982, "loss": 0.0341, "odds_ratio_loss": 0.0017984689911827445, "rewards/accuracies": 1.0, "rewards/chosen": -0.001515410840511322, "rewards/margins": 0.6890825033187866, "rewards/rejected": -0.6905978918075562, "sft_loss": 0.01515410840511322, "step": 3844 }, { "epoch": 5.560375994215473, "grad_norm": 1.26311105122681, "learning_rate": 1.0875097961134372e-07, "logits/chosen": -0.8992950916290283, "logits/rejected": -0.7164445519447327, "logps/chosen": -0.03995127975940704, "logps/rejected": -3.731813430786133, "loss": 0.0421, "odds_ratio_loss": 0.004833152983337641, "rewards/accuracies": 1.0, "rewards/chosen": -0.003995127510279417, "rewards/margins": 0.3691861927509308, "rewards/rejected": -0.3731813430786133, "sft_loss": 0.03995127975940704, "step": 3845 }, { "epoch": 5.561822125813449, "grad_norm": 0.9533168885486478, "learning_rate": 1.0803284708155213e-07, "logits/chosen": -0.8431378602981567, "logits/rejected": -0.6289638876914978, "logps/chosen": -0.050756603479385376, "logps/rejected": -4.342966079711914, "loss": 0.0278, "odds_ratio_loss": 0.004125285893678665, "rewards/accuracies": 1.0, "rewards/chosen": -0.0050756605342030525, "rewards/margins": 0.42922091484069824, "rewards/rejected": -0.4342966079711914, "sft_loss": 0.050756603479385376, "step": 3846 }, { "epoch": 5.5632682574114245, "grad_norm": 1.1297521726341693, "learning_rate": 1.0731706103325233e-07, "logits/chosen": -1.105302095413208, "logits/rejected": -0.6120625138282776, "logps/chosen": -0.022785475477576256, "logps/rejected": -6.2301530838012695, "loss": 0.036, "odds_ratio_loss": 0.0008173306123353541, "rewards/accuracies": 1.0, "rewards/chosen": -0.0022785477340221405, "rewards/margins": 0.6207367777824402, "rewards/rejected": -0.623015284538269, "sft_loss": 0.022785475477576256, "step": 3847 }, { "epoch": 5.5647143890094, "grad_norm": 1.0183290147990447, "learning_rate": 1.0660362189799465e-07, "logits/chosen": -0.8919758796691895, "logits/rejected": -0.7195218801498413, "logps/chosen": -0.03661835193634033, "logps/rejected": -5.3943705558776855, "loss": 0.0272, "odds_ratio_loss": 0.0016591616440564394, "rewards/accuracies": 1.0, "rewards/chosen": -0.003661835566163063, "rewards/margins": 0.5357752442359924, "rewards/rejected": -0.5394370555877686, "sft_loss": 0.03661835193634033, "step": 3848 }, { "epoch": 5.566160520607375, "grad_norm": 1.0784389721266068, "learning_rate": 1.058925301059137e-07, "logits/chosen": -0.8064311742782593, "logits/rejected": -0.6183796525001526, "logps/chosen": -0.039033301174640656, "logps/rejected": -4.795423984527588, "loss": 0.0287, "odds_ratio_loss": 0.0012832069769501686, "rewards/accuracies": 1.0, "rewards/chosen": -0.0039033303037285805, "rewards/margins": 0.47563910484313965, "rewards/rejected": -0.47954240441322327, "sft_loss": 0.039033301174640656, "step": 3849 }, { "epoch": 5.567606652205351, "grad_norm": 0.8967336618921115, "learning_rate": 1.0518378608572964e-07, "logits/chosen": -1.0078152418136597, "logits/rejected": -0.7482145428657532, "logps/chosen": -0.015405582264065742, "logps/rejected": -3.899319648742676, "loss": 0.0197, "odds_ratio_loss": 0.0012769848108291626, "rewards/accuracies": 1.0, "rewards/chosen": -0.0015405581798404455, "rewards/margins": 0.3883914053440094, "rewards/rejected": -0.38993197679519653, "sft_loss": 0.015405582264065742, "step": 3850 }, { "epoch": 5.569052783803326, "grad_norm": 1.1995841884111325, "learning_rate": 1.0447739026474645e-07, "logits/chosen": -0.6040602922439575, "logits/rejected": -0.5330982208251953, "logps/chosen": -0.04164326936006546, "logps/rejected": -5.938526153564453, "loss": 0.0619, "odds_ratio_loss": 0.0008817025227472186, "rewards/accuracies": 1.0, "rewards/chosen": -0.004164327401667833, "rewards/margins": 0.5896883606910706, "rewards/rejected": -0.593852698802948, "sft_loss": 0.04164326936006546, "step": 3851 }, { "epoch": 5.570498915401301, "grad_norm": 0.9183906800328296, "learning_rate": 1.0377334306885322e-07, "logits/chosen": -0.6867368817329407, "logits/rejected": -0.7344608902931213, "logps/chosen": -0.058505572378635406, "logps/rejected": -5.076149940490723, "loss": 0.0417, "odds_ratio_loss": 0.013662266544997692, "rewards/accuracies": 1.0, "rewards/chosen": -0.005850557237863541, "rewards/margins": 0.5017644762992859, "rewards/rejected": -0.5076150298118591, "sft_loss": 0.058505572378635406, "step": 3852 }, { "epoch": 5.571945046999277, "grad_norm": 1.1114949751042007, "learning_rate": 1.030716449225224e-07, "logits/chosen": -0.8221277594566345, "logits/rejected": -0.6434429287910461, "logps/chosen": -0.027723146602511406, "logps/rejected": -4.452295303344727, "loss": 0.0497, "odds_ratio_loss": 0.045040663331747055, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0027723144739866257, "rewards/margins": 0.4424571990966797, "rewards/rejected": -0.44522953033447266, "sft_loss": 0.027723146602511406, "step": 3853 }, { "epoch": 5.5733911785972525, "grad_norm": 1.0600027055877603, "learning_rate": 1.0237229624881116e-07, "logits/chosen": -0.8413448929786682, "logits/rejected": -0.5777408480644226, "logps/chosen": -0.023316072300076485, "logps/rejected": -5.208468437194824, "loss": 0.0336, "odds_ratio_loss": 0.001969376113265753, "rewards/accuracies": 1.0, "rewards/chosen": -0.002331607509404421, "rewards/margins": 0.5185152292251587, "rewards/rejected": -0.5208468437194824, "sft_loss": 0.023316072300076485, "step": 3854 }, { "epoch": 5.574837310195228, "grad_norm": 0.8143625402129068, "learning_rate": 1.0167529746935866e-07, "logits/chosen": -0.8654884099960327, "logits/rejected": -0.5924917459487915, "logps/chosen": -0.011484618298709393, "logps/rejected": -8.372770309448242, "loss": 0.0213, "odds_ratio_loss": 0.001312417909502983, "rewards/accuracies": 1.0, "rewards/chosen": -0.0011484617134556174, "rewards/margins": 0.8361285924911499, "rewards/rejected": -0.8372770547866821, "sft_loss": 0.011484618298709393, "step": 3855 }, { "epoch": 5.576283441793203, "grad_norm": 0.9756613799933704, "learning_rate": 1.009806490043883e-07, "logits/chosen": -0.7985130548477173, "logits/rejected": -0.7025433778762817, "logps/chosen": -0.027194581925868988, "logps/rejected": -4.829598426818848, "loss": 0.0327, "odds_ratio_loss": 0.0021613496355712414, "rewards/accuracies": 1.0, "rewards/chosen": -0.002719458192586899, "rewards/margins": 0.4802403748035431, "rewards/rejected": -0.4829598069190979, "sft_loss": 0.027194581925868988, "step": 3856 }, { "epoch": 5.577729573391179, "grad_norm": 1.3785283373997754, "learning_rate": 1.0028835127270552e-07, "logits/chosen": -1.0098246335983276, "logits/rejected": -0.864977240562439, "logps/chosen": -0.018646353855729103, "logps/rejected": -4.30742073059082, "loss": 0.0339, "odds_ratio_loss": 0.0010222363052889705, "rewards/accuracies": 1.0, "rewards/chosen": -0.0018646355019882321, "rewards/margins": 0.42887741327285767, "rewards/rejected": -0.4307420551776886, "sft_loss": 0.018646353855729103, "step": 3857 }, { "epoch": 5.579175704989154, "grad_norm": 0.9558186473549585, "learning_rate": 9.959840469170044e-08, "logits/chosen": -0.9542050957679749, "logits/rejected": -0.6801411509513855, "logps/chosen": -0.017612973228096962, "logps/rejected": -5.700991630554199, "loss": 0.0347, "odds_ratio_loss": 0.001063553267158568, "rewards/accuracies": 1.0, "rewards/chosen": -0.0017612973460927606, "rewards/margins": 0.5683378577232361, "rewards/rejected": -0.5700991153717041, "sft_loss": 0.017612973228096962, "step": 3858 }, { "epoch": 5.580621836587129, "grad_norm": 1.1468823365770084, "learning_rate": 9.891080967734345e-08, "logits/chosen": -1.0696161985397339, "logits/rejected": -0.7148972153663635, "logps/chosen": -0.03979545831680298, "logps/rejected": -6.093506813049316, "loss": 0.0352, "odds_ratio_loss": 0.0021976944990456104, "rewards/accuracies": 1.0, "rewards/chosen": -0.003979546017944813, "rewards/margins": 0.6053711771965027, "rewards/rejected": -0.6093507409095764, "sft_loss": 0.03979545831680298, "step": 3859 }, { "epoch": 5.582067968185105, "grad_norm": 1.0516361506414187, "learning_rate": 9.822556664418913e-08, "logits/chosen": -0.9046825766563416, "logits/rejected": -0.6393083333969116, "logps/chosen": -0.059828951954841614, "logps/rejected": -4.932526111602783, "loss": 0.0438, "odds_ratio_loss": 0.004070539027452469, "rewards/accuracies": 1.0, "rewards/chosen": -0.005982895381748676, "rewards/margins": 0.4872697591781616, "rewards/rejected": -0.49325263500213623, "sft_loss": 0.059828951954841614, "step": 3860 }, { "epoch": 5.5835140997830806, "grad_norm": 1.1184859002794636, "learning_rate": 9.754267600537148e-08, "logits/chosen": -1.1668930053710938, "logits/rejected": -0.8483262658119202, "logps/chosen": -0.07912689447402954, "logps/rejected": -4.509005546569824, "loss": 0.0481, "odds_ratio_loss": 0.003705930430442095, "rewards/accuracies": 1.0, "rewards/chosen": -0.00791268888860941, "rewards/margins": 0.44298791885375977, "rewards/rejected": -0.4509005546569824, "sft_loss": 0.07912689447402954, "step": 3861 }, { "epoch": 5.584960231381055, "grad_norm": 1.0245326939879393, "learning_rate": 9.686213817260957e-08, "logits/chosen": -0.8007428646087646, "logits/rejected": -0.7032575011253357, "logps/chosen": -0.024395223706960678, "logps/rejected": -5.995718002319336, "loss": 0.0308, "odds_ratio_loss": 0.0045672086998820305, "rewards/accuracies": 1.0, "rewards/chosen": -0.0024395224172621965, "rewards/margins": 0.5971323251724243, "rewards/rejected": -0.5995718240737915, "sft_loss": 0.024395223706960678, "step": 3862 }, { "epoch": 5.586406362979031, "grad_norm": 0.792686060002036, "learning_rate": 9.618395355620146e-08, "logits/chosen": -0.9847527742385864, "logits/rejected": -0.9884693622589111, "logps/chosen": -0.016595035791397095, "logps/rejected": -4.327853202819824, "loss": 0.0219, "odds_ratio_loss": 0.001219768775627017, "rewards/accuracies": 1.0, "rewards/chosen": -0.001659503672271967, "rewards/margins": 0.43112584948539734, "rewards/rejected": -0.4327853322029114, "sft_loss": 0.016595035791397095, "step": 3863 }, { "epoch": 5.587852494577007, "grad_norm": 1.0262122915001675, "learning_rate": 9.550812256502671e-08, "logits/chosen": -0.7402524948120117, "logits/rejected": -0.650458037853241, "logps/chosen": -0.05518195778131485, "logps/rejected": -6.806795120239258, "loss": 0.0316, "odds_ratio_loss": 0.005077338311821222, "rewards/accuracies": 1.0, "rewards/chosen": -0.0055181956849992275, "rewards/margins": 0.6751613020896912, "rewards/rejected": -0.6806795597076416, "sft_loss": 0.05518195778131485, "step": 3864 }, { "epoch": 5.5892986261749815, "grad_norm": 0.9213171627134767, "learning_rate": 9.483464560654653e-08, "logits/chosen": -0.7630059719085693, "logits/rejected": -0.5672922730445862, "logps/chosen": -0.02237100899219513, "logps/rejected": -5.245082855224609, "loss": 0.0263, "odds_ratio_loss": 0.0012939394218847156, "rewards/accuracies": 1.0, "rewards/chosen": -0.002237100852653384, "rewards/margins": 0.5222712159156799, "rewards/rejected": -0.5245083570480347, "sft_loss": 0.02237100899219513, "step": 3865 }, { "epoch": 5.590744757772957, "grad_norm": 0.9947073685141449, "learning_rate": 9.41635230868032e-08, "logits/chosen": -0.8050935864448547, "logits/rejected": -0.6179883480072021, "logps/chosen": -0.02250969037413597, "logps/rejected": -5.135504722595215, "loss": 0.0357, "odds_ratio_loss": 0.0018040683353319764, "rewards/accuracies": 1.0, "rewards/chosen": -0.0022509689442813396, "rewards/margins": 0.5112994909286499, "rewards/rejected": -0.5135504603385925, "sft_loss": 0.02250969037413597, "step": 3866 }, { "epoch": 5.592190889370933, "grad_norm": 1.0721662892607096, "learning_rate": 9.349475541041885e-08, "logits/chosen": -1.047861099243164, "logits/rejected": -0.7601584196090698, "logps/chosen": -0.07620863616466522, "logps/rejected": -5.097728252410889, "loss": 0.0393, "odds_ratio_loss": 0.003793524345383048, "rewards/accuracies": 1.0, "rewards/chosen": -0.007620863616466522, "rewards/margins": 0.5021519660949707, "rewards/rejected": -0.5097728371620178, "sft_loss": 0.07620863616466522, "step": 3867 }, { "epoch": 5.593637020968908, "grad_norm": 1.2066394169606451, "learning_rate": 9.282834298059539e-08, "logits/chosen": -0.8130267858505249, "logits/rejected": -0.7708932161331177, "logps/chosen": -0.016214922070503235, "logps/rejected": -5.153808116912842, "loss": 0.0443, "odds_ratio_loss": 0.0012468149652704597, "rewards/accuracies": 1.0, "rewards/chosen": -0.0016214922070503235, "rewards/margins": 0.5137593150138855, "rewards/rejected": -0.5153807997703552, "sft_loss": 0.016214922070503235, "step": 3868 }, { "epoch": 5.595083152566883, "grad_norm": 1.157611327012197, "learning_rate": 9.216428619911587e-08, "logits/chosen": -0.827752411365509, "logits/rejected": -0.6157627701759338, "logps/chosen": -0.031848832964897156, "logps/rejected": -5.636829376220703, "loss": 0.04, "odds_ratio_loss": 0.0006597494357265532, "rewards/accuracies": 1.0, "rewards/chosen": -0.003184883389621973, "rewards/margins": 0.5604981184005737, "rewards/rejected": -0.5636829733848572, "sft_loss": 0.031848832964897156, "step": 3869 }, { "epoch": 5.596529284164859, "grad_norm": 1.440751755555854, "learning_rate": 9.150258546634271e-08, "logits/chosen": -0.6490340232849121, "logits/rejected": -0.48528796434402466, "logps/chosen": -0.02013351395726204, "logps/rejected": -6.363707542419434, "loss": 0.0612, "odds_ratio_loss": 0.0006388231995515525, "rewards/accuracies": 1.0, "rewards/chosen": -0.0020133513025939465, "rewards/margins": 0.6343573927879333, "rewards/rejected": -0.6363707780838013, "sft_loss": 0.02013351395726204, "step": 3870 }, { "epoch": 5.597975415762835, "grad_norm": 0.9224358342142104, "learning_rate": 9.084324118121767e-08, "logits/chosen": -0.8425424695014954, "logits/rejected": -0.6517443060874939, "logps/chosen": -0.012473606504499912, "logps/rejected": -5.4249186515808105, "loss": 0.0261, "odds_ratio_loss": 0.0011486727744340897, "rewards/accuracies": 1.0, "rewards/chosen": -0.0012473606038838625, "rewards/margins": 0.5412445068359375, "rewards/rejected": -0.5424919128417969, "sft_loss": 0.012473606504499912, "step": 3871 }, { "epoch": 5.5994215473608095, "grad_norm": 1.047728691464437, "learning_rate": 9.018625374126188e-08, "logits/chosen": -0.787708044052124, "logits/rejected": -0.6361563801765442, "logps/chosen": -0.040176477283239365, "logps/rejected": -5.250421524047852, "loss": 0.0409, "odds_ratio_loss": 0.0012823616852983832, "rewards/accuracies": 1.0, "rewards/chosen": -0.004017648287117481, "rewards/margins": 0.5210245251655579, "rewards/rejected": -0.5250421166419983, "sft_loss": 0.040176477283239365, "step": 3872 }, { "epoch": 5.600867678958785, "grad_norm": 1.005615697820088, "learning_rate": 8.953162354257538e-08, "logits/chosen": -0.9109957218170166, "logits/rejected": -0.7220777273178101, "logps/chosen": -0.017372364178299904, "logps/rejected": -5.085519313812256, "loss": 0.0377, "odds_ratio_loss": 0.0010335225379094481, "rewards/accuracies": 1.0, "rewards/chosen": -0.001737236394546926, "rewards/margins": 0.50681471824646, "rewards/rejected": -0.5085519552230835, "sft_loss": 0.017372364178299904, "step": 3873 }, { "epoch": 5.602313810556761, "grad_norm": 0.859332407138211, "learning_rate": 8.887935097983712e-08, "logits/chosen": -0.8807803392410278, "logits/rejected": -0.7796051502227783, "logps/chosen": -0.039813119918107986, "logps/rejected": -5.857760906219482, "loss": 0.0247, "odds_ratio_loss": 0.004182462580502033, "rewards/accuracies": 1.0, "rewards/chosen": -0.003981312271207571, "rewards/margins": 0.581794798374176, "rewards/rejected": -0.585776150226593, "sft_loss": 0.039813119918107986, "step": 3874 }, { "epoch": 5.603759942154736, "grad_norm": 0.8777433120960881, "learning_rate": 8.822943644630454e-08, "logits/chosen": -0.9151740074157715, "logits/rejected": -0.6227646470069885, "logps/chosen": -0.06858855485916138, "logps/rejected": -5.1172966957092285, "loss": 0.0345, "odds_ratio_loss": 0.001364107825793326, "rewards/accuracies": 1.0, "rewards/chosen": -0.006858856417238712, "rewards/margins": 0.5048707723617554, "rewards/rejected": -0.5117296576499939, "sft_loss": 0.06858855485916138, "step": 3875 }, { "epoch": 5.605206073752711, "grad_norm": 1.226504746702496, "learning_rate": 8.758188033381353e-08, "logits/chosen": -0.7460124492645264, "logits/rejected": -0.5613774061203003, "logps/chosen": -0.05687641724944115, "logps/rejected": -5.019678115844727, "loss": 0.0342, "odds_ratio_loss": 0.003194852964952588, "rewards/accuracies": 1.0, "rewards/chosen": -0.005687640979886055, "rewards/margins": 0.49628016352653503, "rewards/rejected": -0.5019677877426147, "sft_loss": 0.05687641724944115, "step": 3876 }, { "epoch": 5.606652205350687, "grad_norm": 1.1254751656230593, "learning_rate": 8.69366830327789e-08, "logits/chosen": -0.8707925081253052, "logits/rejected": -0.6477944850921631, "logps/chosen": -0.12705475091934204, "logps/rejected": -3.9010202884674072, "loss": 0.0603, "odds_ratio_loss": 0.006006310693919659, "rewards/accuracies": 1.0, "rewards/chosen": -0.012705476023256779, "rewards/margins": 0.3773965835571289, "rewards/rejected": -0.3901020586490631, "sft_loss": 0.12705475091934204, "step": 3877 }, { "epoch": 5.608098336948663, "grad_norm": 0.8131598612601004, "learning_rate": 8.629384493219128e-08, "logits/chosen": -0.8724921345710754, "logits/rejected": -0.7644715309143066, "logps/chosen": -0.05918511375784874, "logps/rejected": -4.924114227294922, "loss": 0.0357, "odds_ratio_loss": 0.00036744706449098885, "rewards/accuracies": 1.0, "rewards/chosen": -0.005918511189520359, "rewards/margins": 0.48649293184280396, "rewards/rejected": -0.49241143465042114, "sft_loss": 0.05918511375784874, "step": 3878 }, { "epoch": 5.609544468546638, "grad_norm": 1.1354278291872029, "learning_rate": 8.565336641962106e-08, "logits/chosen": -0.7309091687202454, "logits/rejected": -0.5149399638175964, "logps/chosen": -0.017144229263067245, "logps/rejected": -5.465782642364502, "loss": 0.0358, "odds_ratio_loss": 0.000733592314645648, "rewards/accuracies": 1.0, "rewards/chosen": -0.001714422949589789, "rewards/margins": 0.5448638200759888, "rewards/rejected": -0.5465782284736633, "sft_loss": 0.017144229263067245, "step": 3879 }, { "epoch": 5.610990600144613, "grad_norm": 0.907687038565481, "learning_rate": 8.501524788121494e-08, "logits/chosen": -0.8225124478340149, "logits/rejected": -0.456741601228714, "logps/chosen": -0.012008003890514374, "logps/rejected": -6.019430160522461, "loss": 0.0395, "odds_ratio_loss": 0.0003659829089883715, "rewards/accuracies": 1.0, "rewards/chosen": -0.00120080029591918, "rewards/margins": 0.6007422208786011, "rewards/rejected": -0.6019430160522461, "sft_loss": 0.012008003890514374, "step": 3880 }, { "epoch": 5.612436731742589, "grad_norm": 0.8500819053650278, "learning_rate": 8.437948970169629e-08, "logits/chosen": -0.8494061231613159, "logits/rejected": -0.6348491311073303, "logps/chosen": -0.01760895922780037, "logps/rejected": -5.886898040771484, "loss": 0.0273, "odds_ratio_loss": 0.0010809004306793213, "rewards/accuracies": 1.0, "rewards/chosen": -0.0017608960624784231, "rewards/margins": 0.5869288444519043, "rewards/rejected": -0.5886898040771484, "sft_loss": 0.01760895922780037, "step": 3881 }, { "epoch": 5.613882863340564, "grad_norm": 1.107555678550244, "learning_rate": 8.374609226436735e-08, "logits/chosen": -1.0346620082855225, "logits/rejected": -0.6543772220611572, "logps/chosen": -0.04019223153591156, "logps/rejected": -6.5853447914123535, "loss": 0.0296, "odds_ratio_loss": 0.0007978305802680552, "rewards/accuracies": 1.0, "rewards/chosen": -0.004019223153591156, "rewards/margins": 0.654515266418457, "rewards/rejected": -0.6585345268249512, "sft_loss": 0.04019223153591156, "step": 3882 }, { "epoch": 5.615328994938539, "grad_norm": 1.6594008368972768, "learning_rate": 8.311505595110446e-08, "logits/chosen": -0.9576680064201355, "logits/rejected": -0.7107992172241211, "logps/chosen": -0.03443386033177376, "logps/rejected": -4.302804946899414, "loss": 0.0296, "odds_ratio_loss": 0.0024423853028565645, "rewards/accuracies": 1.0, "rewards/chosen": -0.003443386172875762, "rewards/margins": 0.42683711647987366, "rewards/rejected": -0.43028050661087036, "sft_loss": 0.03443386033177376, "step": 3883 }, { "epoch": 5.616775126536515, "grad_norm": 0.8736460272012623, "learning_rate": 8.248638114236283e-08, "logits/chosen": -0.8958337306976318, "logits/rejected": -0.6962901949882507, "logps/chosen": -0.035607580095529556, "logps/rejected": -4.740180015563965, "loss": 0.0246, "odds_ratio_loss": 0.0013252833159640431, "rewards/accuracies": 1.0, "rewards/chosen": -0.003560757962986827, "rewards/margins": 0.4704572558403015, "rewards/rejected": -0.4740179777145386, "sft_loss": 0.035607580095529556, "step": 3884 }, { "epoch": 5.61822125813449, "grad_norm": 1.1111572783157635, "learning_rate": 8.186006821717173e-08, "logits/chosen": -0.9697292447090149, "logits/rejected": -0.8010947704315186, "logps/chosen": -0.08695336431264877, "logps/rejected": -5.5294318199157715, "loss": 0.0365, "odds_ratio_loss": 0.000782729999627918, "rewards/accuracies": 1.0, "rewards/chosen": -0.008695336058735847, "rewards/margins": 0.5442478656768799, "rewards/rejected": -0.552943229675293, "sft_loss": 0.08695336431264877, "step": 3885 }, { "epoch": 5.619667389732466, "grad_norm": 1.0281330550146877, "learning_rate": 8.123611755313887e-08, "logits/chosen": -1.1211961507797241, "logits/rejected": -0.7908611297607422, "logps/chosen": -0.022736605256795883, "logps/rejected": -6.762401580810547, "loss": 0.0396, "odds_ratio_loss": 0.001896974048577249, "rewards/accuracies": 1.0, "rewards/chosen": -0.0022736606188118458, "rewards/margins": 0.6739665269851685, "rewards/rejected": -0.6762401461601257, "sft_loss": 0.022736605256795883, "step": 3886 }, { "epoch": 5.621113521330441, "grad_norm": 1.2903301444306965, "learning_rate": 8.061452952644598e-08, "logits/chosen": -0.7981445789337158, "logits/rejected": -0.6683976054191589, "logps/chosen": -0.051232676953077316, "logps/rejected": -4.502172470092773, "loss": 0.0434, "odds_ratio_loss": 0.0008279865724034607, "rewards/accuracies": 1.0, "rewards/chosen": -0.005123267415910959, "rewards/margins": 0.4450939893722534, "rewards/rejected": -0.45021724700927734, "sft_loss": 0.051232676953077316, "step": 3887 }, { "epoch": 5.622559652928416, "grad_norm": 1.4460602872131916, "learning_rate": 7.999530451185022e-08, "logits/chosen": -0.9000306129455566, "logits/rejected": -0.6654245853424072, "logps/chosen": -0.06389784812927246, "logps/rejected": -5.316483497619629, "loss": 0.0643, "odds_ratio_loss": 0.006937685422599316, "rewards/accuracies": 1.0, "rewards/chosen": -0.006389784626662731, "rewards/margins": 0.5252585411071777, "rewards/rejected": -0.5316482782363892, "sft_loss": 0.06389784812927246, "step": 3888 }, { "epoch": 5.624005784526392, "grad_norm": 1.245773734697951, "learning_rate": 7.937844288268447e-08, "logits/chosen": -1.00962495803833, "logits/rejected": -0.862888514995575, "logps/chosen": -0.06227367743849754, "logps/rejected": -5.003812789916992, "loss": 0.0382, "odds_ratio_loss": 0.0036341436207294464, "rewards/accuracies": 1.0, "rewards/chosen": -0.006227367557585239, "rewards/margins": 0.4941539466381073, "rewards/rejected": -0.5003812909126282, "sft_loss": 0.06227367743849754, "step": 3889 }, { "epoch": 5.6254519161243675, "grad_norm": 1.0056455308192838, "learning_rate": 7.876394501085837e-08, "logits/chosen": -0.9098703861236572, "logits/rejected": -0.7144231796264648, "logps/chosen": -0.07516808062791824, "logps/rejected": -4.229522705078125, "loss": 0.0351, "odds_ratio_loss": 0.0012169769033789635, "rewards/accuracies": 1.0, "rewards/chosen": -0.007516808342188597, "rewards/margins": 0.41543543338775635, "rewards/rejected": -0.42295223474502563, "sft_loss": 0.07516808062791824, "step": 3890 }, { "epoch": 5.626898047722342, "grad_norm": 1.070424100517713, "learning_rate": 7.815181126685332e-08, "logits/chosen": -0.8877123594284058, "logits/rejected": -0.5519871115684509, "logps/chosen": -0.024476561695337296, "logps/rejected": -5.705207824707031, "loss": 0.0375, "odds_ratio_loss": 0.0009460779256187379, "rewards/accuracies": 1.0, "rewards/chosen": -0.002447655890136957, "rewards/margins": 0.5680731534957886, "rewards/rejected": -0.5705208778381348, "sft_loss": 0.024476561695337296, "step": 3891 }, { "epoch": 5.628344179320318, "grad_norm": 1.002367017711253, "learning_rate": 7.754204201972791e-08, "logits/chosen": -1.0830185413360596, "logits/rejected": -0.8066056966781616, "logps/chosen": -0.02625204622745514, "logps/rejected": -5.084045886993408, "loss": 0.0426, "odds_ratio_loss": 0.0028547344263643026, "rewards/accuracies": 1.0, "rewards/chosen": -0.0026252048555761576, "rewards/margins": 0.5057793855667114, "rewards/rejected": -0.5084046125411987, "sft_loss": 0.02625204622745514, "step": 3892 }, { "epoch": 5.629790310918294, "grad_norm": 1.3223399901665873, "learning_rate": 7.693463763711472e-08, "logits/chosen": -0.7070760726928711, "logits/rejected": -0.5678169131278992, "logps/chosen": -0.028207141906023026, "logps/rejected": -5.583174228668213, "loss": 0.0541, "odds_ratio_loss": 0.0005133537924848497, "rewards/accuracies": 1.0, "rewards/chosen": -0.0028207143768668175, "rewards/margins": 0.5554966330528259, "rewards/rejected": -0.5583173632621765, "sft_loss": 0.028207141906023026, "step": 3893 }, { "epoch": 5.631236442516269, "grad_norm": 1.0036968645987125, "learning_rate": 7.632959848521903e-08, "logits/chosen": -0.7904921770095825, "logits/rejected": -0.7324553728103638, "logps/chosen": -0.03105638548731804, "logps/rejected": -4.606823921203613, "loss": 0.043, "odds_ratio_loss": 0.0032230219803750515, "rewards/accuracies": 1.0, "rewards/chosen": -0.003105638548731804, "rewards/margins": 0.4575767517089844, "rewards/rejected": -0.46068239212036133, "sft_loss": 0.03105638548731804, "step": 3894 }, { "epoch": 5.632682574114244, "grad_norm": 1.2934557508771576, "learning_rate": 7.572692492882237e-08, "logits/chosen": -1.0913989543914795, "logits/rejected": -0.7185604572296143, "logps/chosen": -0.06864849478006363, "logps/rejected": -5.759372711181641, "loss": 0.04, "odds_ratio_loss": 0.0022043841890990734, "rewards/accuracies": 1.0, "rewards/chosen": -0.00686484994366765, "rewards/margins": 0.5690724849700928, "rewards/rejected": -0.5759373307228088, "sft_loss": 0.06864849478006363, "step": 3895 }, { "epoch": 5.63412870571222, "grad_norm": 0.9311986244435818, "learning_rate": 7.512661733127723e-08, "logits/chosen": -0.8719162940979004, "logits/rejected": -0.5234925746917725, "logps/chosen": -0.013395091518759727, "logps/rejected": -7.250702857971191, "loss": 0.0307, "odds_ratio_loss": 0.00034357167896814644, "rewards/accuracies": 1.0, "rewards/chosen": -0.0013395091518759727, "rewards/margins": 0.7237308025360107, "rewards/rejected": -0.7250703573226929, "sft_loss": 0.013395091518759727, "step": 3896 }, { "epoch": 5.6355748373101955, "grad_norm": 1.3758621137819032, "learning_rate": 7.452867605451318e-08, "logits/chosen": -0.7366227507591248, "logits/rejected": -0.581299901008606, "logps/chosen": -0.033642686903476715, "logps/rejected": -6.8972673416137695, "loss": 0.0332, "odds_ratio_loss": 0.0029167046304792166, "rewards/accuracies": 1.0, "rewards/chosen": -0.0033642686903476715, "rewards/margins": 0.6863625049591064, "rewards/rejected": -0.689726710319519, "sft_loss": 0.033642686903476715, "step": 3897 }, { "epoch": 5.63702096890817, "grad_norm": 1.1931931524825197, "learning_rate": 7.393310145902987e-08, "logits/chosen": -0.883941650390625, "logits/rejected": -0.7122170925140381, "logps/chosen": -0.038125600665807724, "logps/rejected": -5.787795066833496, "loss": 0.0342, "odds_ratio_loss": 0.0028251700568944216, "rewards/accuracies": 1.0, "rewards/chosen": -0.0038125598803162575, "rewards/margins": 0.5749669671058655, "rewards/rejected": -0.5787795186042786, "sft_loss": 0.038125600665807724, "step": 3898 }, { "epoch": 5.638467100506146, "grad_norm": 0.8511199612804827, "learning_rate": 7.333989390390183e-08, "logits/chosen": -1.15104341506958, "logits/rejected": -0.6643425226211548, "logps/chosen": -0.05288753658533096, "logps/rejected": -6.539332389831543, "loss": 0.0303, "odds_ratio_loss": 0.006646599620580673, "rewards/accuracies": 1.0, "rewards/chosen": -0.005288753658533096, "rewards/margins": 0.6486445069313049, "rewards/rejected": -0.6539332866668701, "sft_loss": 0.05288753658533096, "step": 3899 }, { "epoch": 5.639913232104122, "grad_norm": 0.9691714602883686, "learning_rate": 7.274905374677631e-08, "logits/chosen": -1.0755349397659302, "logits/rejected": -0.599575400352478, "logps/chosen": -0.03918579965829849, "logps/rejected": -4.2741804122924805, "loss": 0.0328, "odds_ratio_loss": 0.0014571095816791058, "rewards/accuracies": 1.0, "rewards/chosen": -0.003918579779565334, "rewards/margins": 0.4234994649887085, "rewards/rejected": -0.427418053150177, "sft_loss": 0.03918579965829849, "step": 3900 }, { "epoch": 5.641359363702097, "grad_norm": 0.8756828917317308, "learning_rate": 7.216058134387326e-08, "logits/chosen": -0.8658799529075623, "logits/rejected": -0.6761690378189087, "logps/chosen": -0.01636935956776142, "logps/rejected": -5.539029598236084, "loss": 0.0242, "odds_ratio_loss": 0.000594843877479434, "rewards/accuracies": 1.0, "rewards/chosen": -0.0016369358636438847, "rewards/margins": 0.552266001701355, "rewards/rejected": -0.5539029836654663, "sft_loss": 0.01636935956776142, "step": 3901 }, { "epoch": 5.642805495300072, "grad_norm": 1.2086469436479532, "learning_rate": 7.157447704998443e-08, "logits/chosen": -0.72590172290802, "logits/rejected": -0.47788357734680176, "logps/chosen": -0.05741055682301521, "logps/rejected": -6.098160743713379, "loss": 0.0431, "odds_ratio_loss": 0.0019345948239788413, "rewards/accuracies": 1.0, "rewards/chosen": -0.005741056054830551, "rewards/margins": 0.6040751338005066, "rewards/rejected": -0.6098161339759827, "sft_loss": 0.05741055682301521, "step": 3902 }, { "epoch": 5.644251626898048, "grad_norm": 0.960222765821685, "learning_rate": 7.099074121847426e-08, "logits/chosen": -0.8502532243728638, "logits/rejected": -0.6490471363067627, "logps/chosen": -0.027551371604204178, "logps/rejected": -4.097101211547852, "loss": 0.0514, "odds_ratio_loss": 0.0017816171748563647, "rewards/accuracies": 1.0, "rewards/chosen": -0.0027551373932510614, "rewards/margins": 0.4069550037384033, "rewards/rejected": -0.409710168838501, "sft_loss": 0.027551371604204178, "step": 3903 }, { "epoch": 5.6456977584960235, "grad_norm": 1.6231756077512831, "learning_rate": 7.040937420127946e-08, "logits/chosen": -0.9650408029556274, "logits/rejected": -0.8903849720954895, "logps/chosen": -0.08151555061340332, "logps/rejected": -4.457081317901611, "loss": 0.0511, "odds_ratio_loss": 0.0022657865192741156, "rewards/accuracies": 1.0, "rewards/chosen": -0.008151555433869362, "rewards/margins": 0.43755653500556946, "rewards/rejected": -0.44570815563201904, "sft_loss": 0.08151555061340332, "step": 3904 }, { "epoch": 5.647143890093998, "grad_norm": 0.9400284707795573, "learning_rate": 6.983037634890809e-08, "logits/chosen": -0.7602153420448303, "logits/rejected": -0.5387284159660339, "logps/chosen": -0.009475589729845524, "logps/rejected": -6.176396369934082, "loss": 0.0221, "odds_ratio_loss": 0.0016149263828992844, "rewards/accuracies": 1.0, "rewards/chosen": -0.0009475590195506811, "rewards/margins": 0.616692066192627, "rewards/rejected": -0.6176395416259766, "sft_loss": 0.009475589729845524, "step": 3905 }, { "epoch": 5.648590021691974, "grad_norm": 0.8122607809365746, "learning_rate": 6.925374801044048e-08, "logits/chosen": -0.7360423803329468, "logits/rejected": -0.7772014737129211, "logps/chosen": -0.029816610738635063, "logps/rejected": -5.955297470092773, "loss": 0.0296, "odds_ratio_loss": 0.003913013264536858, "rewards/accuracies": 1.0, "rewards/chosen": -0.002981661120429635, "rewards/margins": 0.592548131942749, "rewards/rejected": -0.5955298542976379, "sft_loss": 0.029816610738635063, "step": 3906 }, { "epoch": 5.65003615328995, "grad_norm": 1.3780036646742946, "learning_rate": 6.867948953352787e-08, "logits/chosen": -0.9336844682693481, "logits/rejected": -0.586871862411499, "logps/chosen": -0.06249716877937317, "logps/rejected": -4.8502020835876465, "loss": 0.058, "odds_ratio_loss": 0.0021649636328220367, "rewards/accuracies": 1.0, "rewards/chosen": -0.006249716971069574, "rewards/margins": 0.47877049446105957, "rewards/rejected": -0.4850202202796936, "sft_loss": 0.06249716877937317, "step": 3907 }, { "epoch": 5.6514822848879245, "grad_norm": 0.9200298747281895, "learning_rate": 6.810760126439285e-08, "logits/chosen": -0.903180718421936, "logits/rejected": -0.6644023060798645, "logps/chosen": -0.028901048004627228, "logps/rejected": -5.7049994468688965, "loss": 0.0239, "odds_ratio_loss": 0.0007545308326371014, "rewards/accuracies": 1.0, "rewards/chosen": -0.0028901048935949802, "rewards/margins": 0.5676099061965942, "rewards/rejected": -0.5705000162124634, "sft_loss": 0.028901048004627228, "step": 3908 }, { "epoch": 5.6529284164859, "grad_norm": 1.024708514255704, "learning_rate": 6.753808354782898e-08, "logits/chosen": -0.8410828113555908, "logits/rejected": -0.6992049217224121, "logps/chosen": -0.020299401134252548, "logps/rejected": -3.7354702949523926, "loss": 0.0264, "odds_ratio_loss": 0.0014652770478278399, "rewards/accuracies": 1.0, "rewards/chosen": -0.002029940253123641, "rewards/margins": 0.3715171217918396, "rewards/rejected": -0.3735470175743103, "sft_loss": 0.020299401134252548, "step": 3909 }, { "epoch": 5.654374548083876, "grad_norm": 0.9967547588704444, "learning_rate": 6.697093672720067e-08, "logits/chosen": -0.9125003218650818, "logits/rejected": -0.7792860269546509, "logps/chosen": -0.027637945488095284, "logps/rejected": -5.146880149841309, "loss": 0.0521, "odds_ratio_loss": 0.0023928144946694374, "rewards/accuracies": 1.0, "rewards/chosen": -0.0027637947350740433, "rewards/margins": 0.5119242668151855, "rewards/rejected": -0.5146880149841309, "sft_loss": 0.027637945488095284, "step": 3910 }, { "epoch": 5.655820679681851, "grad_norm": 1.0733336260472248, "learning_rate": 6.640616114444287e-08, "logits/chosen": -0.8398350477218628, "logits/rejected": -0.598203718662262, "logps/chosen": -0.019008953124284744, "logps/rejected": -3.937981605529785, "loss": 0.0404, "odds_ratio_loss": 0.0007247254834510386, "rewards/accuracies": 1.0, "rewards/chosen": -0.0019008952658623457, "rewards/margins": 0.3918972611427307, "rewards/rejected": -0.39379817247390747, "sft_loss": 0.019008953124284744, "step": 3911 }, { "epoch": 5.657266811279826, "grad_norm": 1.0696577971049983, "learning_rate": 6.584375714006052e-08, "logits/chosen": -0.9154694676399231, "logits/rejected": -0.65110182762146, "logps/chosen": -0.011146700009703636, "logps/rejected": -6.324227809906006, "loss": 0.0469, "odds_ratio_loss": 0.0005139351123943925, "rewards/accuracies": 1.0, "rewards/chosen": -0.0011146699544042349, "rewards/margins": 0.6313080787658691, "rewards/rejected": -0.6324228048324585, "sft_loss": 0.011146700009703636, "step": 3912 }, { "epoch": 5.658712942877802, "grad_norm": 1.1318905034129576, "learning_rate": 6.528372505312907e-08, "logits/chosen": -1.1783943176269531, "logits/rejected": -0.9010468125343323, "logps/chosen": -0.03388334438204765, "logps/rejected": -6.663463592529297, "loss": 0.0409, "odds_ratio_loss": 0.002394784474745393, "rewards/accuracies": 1.0, "rewards/chosen": -0.0033883345313370228, "rewards/margins": 0.662958025932312, "rewards/rejected": -0.6663463115692139, "sft_loss": 0.03388334438204765, "step": 3913 }, { "epoch": 5.660159074475777, "grad_norm": 0.9399824898174974, "learning_rate": 6.472606522129487e-08, "logits/chosen": -0.9045823812484741, "logits/rejected": -0.847064733505249, "logps/chosen": -0.0137400571256876, "logps/rejected": -4.138844966888428, "loss": 0.0284, "odds_ratio_loss": 0.0009114354033954442, "rewards/accuracies": 1.0, "rewards/chosen": -0.0013740058057010174, "rewards/margins": 0.412510484457016, "rewards/rejected": -0.4138845205307007, "sft_loss": 0.0137400571256876, "step": 3914 }, { "epoch": 5.6616052060737525, "grad_norm": 0.9973586984702861, "learning_rate": 6.417077798077209e-08, "logits/chosen": -1.010211706161499, "logits/rejected": -0.861878514289856, "logps/chosen": -0.061721622943878174, "logps/rejected": -4.818317413330078, "loss": 0.0299, "odds_ratio_loss": 0.004441537894308567, "rewards/accuracies": 1.0, "rewards/chosen": -0.006172161549329758, "rewards/margins": 0.4756595492362976, "rewards/rejected": -0.48183172941207886, "sft_loss": 0.061721622943878174, "step": 3915 }, { "epoch": 5.663051337671728, "grad_norm": 0.998537365179468, "learning_rate": 6.361786366634625e-08, "logits/chosen": -1.0144376754760742, "logits/rejected": -0.8828725218772888, "logps/chosen": -0.038881972432136536, "logps/rejected": -5.611100196838379, "loss": 0.0355, "odds_ratio_loss": 0.0021481101866811514, "rewards/accuracies": 1.0, "rewards/chosen": -0.0038881972432136536, "rewards/margins": 0.5572217702865601, "rewards/rejected": -0.5611100196838379, "sft_loss": 0.038881972432136536, "step": 3916 }, { "epoch": 5.664497469269704, "grad_norm": 1.3821746245355928, "learning_rate": 6.306732261137027e-08, "logits/chosen": -1.0927997827529907, "logits/rejected": -0.7127183675765991, "logps/chosen": -0.03770868852734566, "logps/rejected": -5.420331001281738, "loss": 0.0344, "odds_ratio_loss": 0.0027799506206065416, "rewards/accuracies": 1.0, "rewards/chosen": -0.0037708692252635956, "rewards/margins": 0.5382621884346008, "rewards/rejected": -0.5420330762863159, "sft_loss": 0.03770868852734566, "step": 3917 }, { "epoch": 5.665943600867679, "grad_norm": 1.2771633707593975, "learning_rate": 6.251915514776884e-08, "logits/chosen": -0.7677035331726074, "logits/rejected": -0.5734249353408813, "logps/chosen": -0.03404954820871353, "logps/rejected": -6.219034194946289, "loss": 0.0283, "odds_ratio_loss": 0.00040826547774486244, "rewards/accuracies": 1.0, "rewards/chosen": -0.003404954681172967, "rewards/margins": 0.6184984445571899, "rewards/rejected": -0.6219034194946289, "sft_loss": 0.03404954820871353, "step": 3918 }, { "epoch": 5.667389732465654, "grad_norm": 1.0819061444739764, "learning_rate": 6.197336160603362e-08, "logits/chosen": -1.1142338514328003, "logits/rejected": -0.6450099349021912, "logps/chosen": -0.08008662611246109, "logps/rejected": -5.973351955413818, "loss": 0.0478, "odds_ratio_loss": 0.0011582360602915287, "rewards/accuracies": 1.0, "rewards/chosen": -0.008008661679923534, "rewards/margins": 0.5893265008926392, "rewards/rejected": -0.5973352193832397, "sft_loss": 0.08008662611246109, "step": 3919 }, { "epoch": 5.66883586406363, "grad_norm": 0.979593206342996, "learning_rate": 6.142994231522492e-08, "logits/chosen": -0.7907422184944153, "logits/rejected": -0.5625496506690979, "logps/chosen": -0.012852794490754604, "logps/rejected": -4.704351425170898, "loss": 0.0259, "odds_ratio_loss": 0.0008764659287407994, "rewards/accuracies": 1.0, "rewards/chosen": -0.0012852795189246535, "rewards/margins": 0.4691498577594757, "rewards/rejected": -0.47043511271476746, "sft_loss": 0.012852794490754604, "step": 3920 }, { "epoch": 5.670281995661605, "grad_norm": 0.9326543092206134, "learning_rate": 6.088889760297312e-08, "logits/chosen": -0.769900918006897, "logits/rejected": -0.5406790971755981, "logps/chosen": -0.01830982230603695, "logps/rejected": -5.646090507507324, "loss": 0.027, "odds_ratio_loss": 0.002648351714015007, "rewards/accuracies": 1.0, "rewards/chosen": -0.0018309823935851455, "rewards/margins": 0.562778115272522, "rewards/rejected": -0.5646090507507324, "sft_loss": 0.01830982230603695, "step": 3921 }, { "epoch": 5.6717281272595805, "grad_norm": 1.1029927893990603, "learning_rate": 6.035022779547549e-08, "logits/chosen": -0.7700226306915283, "logits/rejected": -0.752061128616333, "logps/chosen": -0.06235011667013168, "logps/rejected": -5.587028503417969, "loss": 0.0411, "odds_ratio_loss": 0.0030640983022749424, "rewards/accuracies": 1.0, "rewards/chosen": -0.006235011853277683, "rewards/margins": 0.5524678230285645, "rewards/rejected": -0.558702826499939, "sft_loss": 0.06235011667013168, "step": 3922 }, { "epoch": 5.673174258857556, "grad_norm": 1.1957365894214984, "learning_rate": 5.981393321749894e-08, "logits/chosen": -0.8172686100006104, "logits/rejected": -0.7470732927322388, "logps/chosen": -0.06232059746980667, "logps/rejected": -3.8870420455932617, "loss": 0.0631, "odds_ratio_loss": 0.007645574398338795, "rewards/accuracies": 1.0, "rewards/chosen": -0.006232059560716152, "rewards/margins": 0.38247212767601013, "rewards/rejected": -0.38870421051979065, "sft_loss": 0.06232059746980667, "step": 3923 }, { "epoch": 5.674620390455532, "grad_norm": 0.9288076500148977, "learning_rate": 5.928001419237638e-08, "logits/chosen": -0.9686081409454346, "logits/rejected": -0.8795914649963379, "logps/chosen": -0.009057226590812206, "logps/rejected": -6.156771659851074, "loss": 0.0374, "odds_ratio_loss": 0.0005254077259451151, "rewards/accuracies": 1.0, "rewards/chosen": -0.0009057226707227528, "rewards/margins": 0.6147714853286743, "rewards/rejected": -0.6156772375106812, "sft_loss": 0.009057226590812206, "step": 3924 }, { "epoch": 5.676066522053507, "grad_norm": 1.0560130705009327, "learning_rate": 5.8748471042010305e-08, "logits/chosen": -1.093846321105957, "logits/rejected": -0.7616704702377319, "logps/chosen": -0.03402628004550934, "logps/rejected": -6.118165969848633, "loss": 0.0326, "odds_ratio_loss": 0.0014892476610839367, "rewards/accuracies": 1.0, "rewards/chosen": -0.0034026282373815775, "rewards/margins": 0.6084139943122864, "rewards/rejected": -0.6118166446685791, "sft_loss": 0.03402628004550934, "step": 3925 }, { "epoch": 5.677512653651482, "grad_norm": 0.8648937856981704, "learning_rate": 5.8219304086869705e-08, "logits/chosen": -0.9695035219192505, "logits/rejected": -0.7986159324645996, "logps/chosen": -0.03180558234453201, "logps/rejected": -4.99746561050415, "loss": 0.0244, "odds_ratio_loss": 0.003096992615610361, "rewards/accuracies": 1.0, "rewards/chosen": -0.003180558094754815, "rewards/margins": 0.4965660274028778, "rewards/rejected": -0.49974656105041504, "sft_loss": 0.03180558234453201, "step": 3926 }, { "epoch": 5.678958785249458, "grad_norm": 0.9465923606611053, "learning_rate": 5.7692513645991814e-08, "logits/chosen": -0.9850341081619263, "logits/rejected": -0.7746860980987549, "logps/chosen": -0.024150801822543144, "logps/rejected": -5.850712776184082, "loss": 0.0322, "odds_ratio_loss": 0.0011510425247251987, "rewards/accuracies": 1.0, "rewards/chosen": -0.002415080089122057, "rewards/margins": 0.58265620470047, "rewards/rejected": -0.585071325302124, "sft_loss": 0.024150801822543144, "step": 3927 }, { "epoch": 5.680404916847433, "grad_norm": 1.0225694246772676, "learning_rate": 5.716810003697947e-08, "logits/chosen": -0.9132033586502075, "logits/rejected": -0.6539651155471802, "logps/chosen": -0.03499581664800644, "logps/rejected": -5.632237911224365, "loss": 0.0302, "odds_ratio_loss": 0.0011239980813115835, "rewards/accuracies": 1.0, "rewards/chosen": -0.003499581478536129, "rewards/margins": 0.5597242116928101, "rewards/rejected": -0.5632237195968628, "sft_loss": 0.03499581664800644, "step": 3928 }, { "epoch": 5.681851048445409, "grad_norm": 1.2386475856717893, "learning_rate": 5.664606357600465e-08, "logits/chosen": -0.8918399214744568, "logits/rejected": -0.7364063262939453, "logps/chosen": -0.05718269944190979, "logps/rejected": -5.176689147949219, "loss": 0.0438, "odds_ratio_loss": 0.0016231231857091188, "rewards/accuracies": 1.0, "rewards/chosen": -0.005718270316720009, "rewards/margins": 0.511950671672821, "rewards/rejected": -0.5176689028739929, "sft_loss": 0.05718269944190979, "step": 3929 }, { "epoch": 5.683297180043384, "grad_norm": 1.2202685542736273, "learning_rate": 5.612640457780449e-08, "logits/chosen": -0.8680728673934937, "logits/rejected": -0.6936816573143005, "logps/chosen": -0.035446494817733765, "logps/rejected": -3.8519763946533203, "loss": 0.0305, "odds_ratio_loss": 0.002002793364226818, "rewards/accuracies": 1.0, "rewards/chosen": -0.003544649574905634, "rewards/margins": 0.3816530108451843, "rewards/rejected": -0.38519763946533203, "sft_loss": 0.035446494817733765, "step": 3930 }, { "epoch": 5.684743311641359, "grad_norm": 1.1173651380498892, "learning_rate": 5.5609123355683906e-08, "logits/chosen": -0.9929153919219971, "logits/rejected": -0.6234613060951233, "logps/chosen": -0.020111847668886185, "logps/rejected": -6.586942672729492, "loss": 0.0348, "odds_ratio_loss": 0.00016166100976988673, "rewards/accuracies": 1.0, "rewards/chosen": -0.002011184813454747, "rewards/margins": 0.656683087348938, "rewards/rejected": -0.6586943864822388, "sft_loss": 0.020111847668886185, "step": 3931 }, { "epoch": 5.686189443239335, "grad_norm": 0.8926495914234045, "learning_rate": 5.5094220221513e-08, "logits/chosen": -1.0222707986831665, "logits/rejected": -0.8124725222587585, "logps/chosen": -0.01710999198257923, "logps/rejected": -4.495701313018799, "loss": 0.0258, "odds_ratio_loss": 0.0012623387156054378, "rewards/accuracies": 1.0, "rewards/chosen": -0.0017109992913901806, "rewards/margins": 0.44785916805267334, "rewards/rejected": -0.4495701789855957, "sft_loss": 0.01710999198257923, "step": 3932 }, { "epoch": 5.68763557483731, "grad_norm": 1.2731963859242328, "learning_rate": 5.4581695485729665e-08, "logits/chosen": -0.7314611077308655, "logits/rejected": -0.7094411849975586, "logps/chosen": -0.024839885532855988, "logps/rejected": -6.205300331115723, "loss": 0.0328, "odds_ratio_loss": 0.0008890972239896655, "rewards/accuracies": 1.0, "rewards/chosen": -0.0024839891120791435, "rewards/margins": 0.6180460453033447, "rewards/rejected": -0.6205300688743591, "sft_loss": 0.024839885532855988, "step": 3933 }, { "epoch": 5.689081706435285, "grad_norm": 1.1808337226365984, "learning_rate": 5.407154945733605e-08, "logits/chosen": -0.7870131731033325, "logits/rejected": -0.5856943130493164, "logps/chosen": -0.09459728002548218, "logps/rejected": -5.333677291870117, "loss": 0.0516, "odds_ratio_loss": 0.005132491700351238, "rewards/accuracies": 1.0, "rewards/chosen": -0.009459727443754673, "rewards/margins": 0.5239080190658569, "rewards/rejected": -0.5333677530288696, "sft_loss": 0.09459728002548218, "step": 3934 }, { "epoch": 5.690527838033261, "grad_norm": 0.9096943069943451, "learning_rate": 5.3563782443901254e-08, "logits/chosen": -0.8053758144378662, "logits/rejected": -0.6625349521636963, "logps/chosen": -0.030897527933120728, "logps/rejected": -3.146284580230713, "loss": 0.0285, "odds_ratio_loss": 0.0019149701111018658, "rewards/accuracies": 1.0, "rewards/chosen": -0.003089752746745944, "rewards/margins": 0.3115387260913849, "rewards/rejected": -0.3146284520626068, "sft_loss": 0.030897527933120728, "step": 3935 }, { "epoch": 5.691973969631237, "grad_norm": 1.3116878149312352, "learning_rate": 5.305839475156082e-08, "logits/chosen": -0.9307630658149719, "logits/rejected": -0.6568589806556702, "logps/chosen": -0.05684416741132736, "logps/rejected": -5.24498176574707, "loss": 0.0543, "odds_ratio_loss": 0.002522802911698818, "rewards/accuracies": 1.0, "rewards/chosen": -0.005684417672455311, "rewards/margins": 0.5188138484954834, "rewards/rejected": -0.5244981646537781, "sft_loss": 0.05684416741132736, "step": 3936 }, { "epoch": 5.693420101229211, "grad_norm": 1.133640478705631, "learning_rate": 5.2555386685013247e-08, "logits/chosen": -0.8474830389022827, "logits/rejected": -0.6886616349220276, "logps/chosen": -0.07056474685668945, "logps/rejected": -3.7979629039764404, "loss": 0.0575, "odds_ratio_loss": 0.0038194474764168262, "rewards/accuracies": 1.0, "rewards/chosen": -0.00705647561699152, "rewards/margins": 0.3727398216724396, "rewards/rejected": -0.37979626655578613, "sft_loss": 0.07056474685668945, "step": 3937 }, { "epoch": 5.694866232827187, "grad_norm": 0.9958326615719902, "learning_rate": 5.2054758547525724e-08, "logits/chosen": -0.7519106268882751, "logits/rejected": -0.6501749753952026, "logps/chosen": -0.06976844370365143, "logps/rejected": -6.353296279907227, "loss": 0.0386, "odds_ratio_loss": 0.018397467210888863, "rewards/accuracies": 1.0, "rewards/chosen": -0.006976844742894173, "rewards/margins": 0.6283528208732605, "rewards/rejected": -0.6353296041488647, "sft_loss": 0.06976844370365143, "step": 3938 }, { "epoch": 5.696312364425163, "grad_norm": 0.9255804107593475, "learning_rate": 5.1556510640927476e-08, "logits/chosen": -0.729579746723175, "logits/rejected": -0.6220409870147705, "logps/chosen": -0.02262992598116398, "logps/rejected": -4.2784223556518555, "loss": 0.0388, "odds_ratio_loss": 0.0035235087852925062, "rewards/accuracies": 1.0, "rewards/chosen": -0.002262992551550269, "rewards/margins": 0.4255792498588562, "rewards/rejected": -0.42784222960472107, "sft_loss": 0.02262992598116398, "step": 3939 }, { "epoch": 5.697758496023138, "grad_norm": 1.0616569156321858, "learning_rate": 5.1060643265614655e-08, "logits/chosen": -0.8722561597824097, "logits/rejected": -0.7307120561599731, "logps/chosen": -0.018616218119859695, "logps/rejected": -4.420228481292725, "loss": 0.0281, "odds_ratio_loss": 0.0006978900055401027, "rewards/accuracies": 1.0, "rewards/chosen": -0.0018616218585520983, "rewards/margins": 0.44016122817993164, "rewards/rejected": -0.4420228898525238, "sft_loss": 0.018616218119859695, "step": 3940 }, { "epoch": 5.699204627621113, "grad_norm": 1.0380237647568364, "learning_rate": 5.056715672054768e-08, "logits/chosen": -0.9298012256622314, "logits/rejected": -0.6356141567230225, "logps/chosen": -0.04043480008840561, "logps/rejected": -4.05310583114624, "loss": 0.0241, "odds_ratio_loss": 0.0012896271655336022, "rewards/accuracies": 1.0, "rewards/chosen": -0.0040434799157083035, "rewards/margins": 0.4012671113014221, "rewards/rejected": -0.40531060099601746, "sft_loss": 0.04043480008840561, "step": 3941 }, { "epoch": 5.700650759219089, "grad_norm": 0.9822767657440377, "learning_rate": 5.007605130325121e-08, "logits/chosen": -0.8983743190765381, "logits/rejected": -0.6858455538749695, "logps/chosen": -0.02578556537628174, "logps/rejected": -3.956112861633301, "loss": 0.022, "odds_ratio_loss": 0.0009663483360782266, "rewards/accuracies": 1.0, "rewards/chosen": -0.002578556537628174, "rewards/margins": 0.3930327296257019, "rewards/rejected": -0.3956112861633301, "sft_loss": 0.02578556537628174, "step": 3942 }, { "epoch": 5.702096890817065, "grad_norm": 0.9640921864883987, "learning_rate": 4.958732730981374e-08, "logits/chosen": -0.5660995841026306, "logits/rejected": -0.5064358115196228, "logps/chosen": -0.04737599939107895, "logps/rejected": -5.079545021057129, "loss": 0.0378, "odds_ratio_loss": 0.004981108475476503, "rewards/accuracies": 1.0, "rewards/chosen": -0.00473759975284338, "rewards/margins": 0.5032169222831726, "rewards/rejected": -0.5079545378684998, "sft_loss": 0.04737599939107895, "step": 3943 }, { "epoch": 5.703543022415039, "grad_norm": 1.0044203070515496, "learning_rate": 4.910098503489024e-08, "logits/chosen": -0.5947985649108887, "logits/rejected": -0.4939855635166168, "logps/chosen": -0.04397990554571152, "logps/rejected": -5.396112442016602, "loss": 0.0367, "odds_ratio_loss": 0.006674241274595261, "rewards/accuracies": 1.0, "rewards/chosen": -0.004397990647703409, "rewards/margins": 0.53521329164505, "rewards/rejected": -0.5396112203598022, "sft_loss": 0.04397990554571152, "step": 3944 }, { "epoch": 5.704989154013015, "grad_norm": 1.149313184171066, "learning_rate": 4.861702477169727e-08, "logits/chosen": -0.9143013954162598, "logits/rejected": -0.6934011578559875, "logps/chosen": -0.009784114547073841, "logps/rejected": -4.77062463760376, "loss": 0.0218, "odds_ratio_loss": 0.0006179651827551425, "rewards/accuracies": 1.0, "rewards/chosen": -0.0009784114081412554, "rewards/margins": 0.4760840833187103, "rewards/rejected": -0.477062463760376, "sft_loss": 0.009784114547073841, "step": 3945 }, { "epoch": 5.706435285610991, "grad_norm": 1.2237874570885532, "learning_rate": 4.8135446812016536e-08, "logits/chosen": -1.1765408515930176, "logits/rejected": -0.8189411163330078, "logps/chosen": -0.06264963001012802, "logps/rejected": -6.33249044418335, "loss": 0.0452, "odds_ratio_loss": 0.008238528855144978, "rewards/accuracies": 1.0, "rewards/chosen": -0.006264963187277317, "rewards/margins": 0.6269841194152832, "rewards/rejected": -0.633249044418335, "sft_loss": 0.06264963001012802, "step": 3946 }, { "epoch": 5.7078814172089665, "grad_norm": 1.02404607668381, "learning_rate": 4.765625144619356e-08, "logits/chosen": -0.9042776823043823, "logits/rejected": -0.6364085674285889, "logps/chosen": -0.04948314279317856, "logps/rejected": -4.562315940856934, "loss": 0.039, "odds_ratio_loss": 0.0030646594241261482, "rewards/accuracies": 1.0, "rewards/chosen": -0.004948314279317856, "rewards/margins": 0.4512832760810852, "rewards/rejected": -0.45623159408569336, "sft_loss": 0.04948314279317856, "step": 3947 }, { "epoch": 5.709327548806941, "grad_norm": 1.1728511222826667, "learning_rate": 4.717943896313681e-08, "logits/chosen": -0.9472683072090149, "logits/rejected": -0.7687931060791016, "logps/chosen": -0.03766496852040291, "logps/rejected": -5.983687400817871, "loss": 0.0461, "odds_ratio_loss": 0.0009112533880397677, "rewards/accuracies": 1.0, "rewards/chosen": -0.003766496665775776, "rewards/margins": 0.5946022272109985, "rewards/rejected": -0.598368763923645, "sft_loss": 0.03766496852040291, "step": 3948 }, { "epoch": 5.710773680404917, "grad_norm": 0.9785284414828915, "learning_rate": 4.670500965031765e-08, "logits/chosen": -0.8775100708007812, "logits/rejected": -0.6726743578910828, "logps/chosen": -0.055906932801008224, "logps/rejected": -5.177661418914795, "loss": 0.0339, "odds_ratio_loss": 0.004313563462346792, "rewards/accuracies": 1.0, "rewards/chosen": -0.005590693559497595, "rewards/margins": 0.512175440788269, "rewards/rejected": -0.5177661180496216, "sft_loss": 0.055906932801008224, "step": 3949 }, { "epoch": 5.712219812002893, "grad_norm": 4.807573142568754, "learning_rate": 4.623296379377217e-08, "logits/chosen": -0.856716513633728, "logits/rejected": -0.6778161525726318, "logps/chosen": -0.0741441547870636, "logps/rejected": -5.552947998046875, "loss": 0.0455, "odds_ratio_loss": 0.008630833588540554, "rewards/accuracies": 1.0, "rewards/chosen": -0.0074144164100289345, "rewards/margins": 0.5478804707527161, "rewards/rejected": -0.5552948713302612, "sft_loss": 0.0741441547870636, "step": 3950 }, { "epoch": 5.713665943600867, "grad_norm": 0.792542970153592, "learning_rate": 4.5763301678098053e-08, "logits/chosen": -1.1035850048065186, "logits/rejected": -0.9252421855926514, "logps/chosen": -0.04065769910812378, "logps/rejected": -3.750488758087158, "loss": 0.0269, "odds_ratio_loss": 0.0019774322863668203, "rewards/accuracies": 1.0, "rewards/chosen": -0.004065769724547863, "rewards/margins": 0.3709830939769745, "rewards/rejected": -0.37504884600639343, "sft_loss": 0.04065769910812378, "step": 3951 }, { "epoch": 5.715112075198843, "grad_norm": 0.9629993811033565, "learning_rate": 4.5296023586456345e-08, "logits/chosen": -0.7754743099212646, "logits/rejected": -0.6971641778945923, "logps/chosen": -0.020119009539484978, "logps/rejected": -3.7750449180603027, "loss": 0.0333, "odds_ratio_loss": 0.04542544111609459, "rewards/accuracies": 0.9375, "rewards/chosen": -0.002011900767683983, "rewards/margins": 0.3754926323890686, "rewards/rejected": -0.37750452756881714, "sft_loss": 0.020119009539484978, "step": 3952 }, { "epoch": 5.716558206796819, "grad_norm": 0.7073799305020456, "learning_rate": 4.483112980057147e-08, "logits/chosen": -1.0943269729614258, "logits/rejected": -0.6168371438980103, "logps/chosen": -0.013947287574410439, "logps/rejected": -5.2490081787109375, "loss": 0.0179, "odds_ratio_loss": 0.0006277449429035187, "rewards/accuracies": 1.0, "rewards/chosen": -0.0013947287807241082, "rewards/margins": 0.5235061645507812, "rewards/rejected": -0.5249009132385254, "sft_loss": 0.013947287574410439, "step": 3953 }, { "epoch": 5.718004338394794, "grad_norm": 0.9432485852232118, "learning_rate": 4.436862060072855e-08, "logits/chosen": -0.8107536435127258, "logits/rejected": -0.7643659114837646, "logps/chosen": -0.02018115669488907, "logps/rejected": -3.54256010055542, "loss": 0.0125, "odds_ratio_loss": 0.001157900900579989, "rewards/accuracies": 1.0, "rewards/chosen": -0.0020181157160550356, "rewards/margins": 0.35223788022994995, "rewards/rejected": -0.3542560338973999, "sft_loss": 0.02018115669488907, "step": 3954 }, { "epoch": 5.719450469992769, "grad_norm": 1.6223932525353142, "learning_rate": 4.3908496265776973e-08, "logits/chosen": -0.651569128036499, "logits/rejected": -0.5832317471504211, "logps/chosen": -0.06823825091123581, "logps/rejected": -4.777907848358154, "loss": 0.0692, "odds_ratio_loss": 0.004125387407839298, "rewards/accuracies": 1.0, "rewards/chosen": -0.006823825184255838, "rewards/margins": 0.47096699476242065, "rewards/rejected": -0.47779080271720886, "sft_loss": 0.06823825091123581, "step": 3955 }, { "epoch": 5.720896601590745, "grad_norm": 1.0945610279479845, "learning_rate": 4.3450757073126844e-08, "logits/chosen": -1.1655569076538086, "logits/rejected": -0.6393382549285889, "logps/chosen": -0.023160209879279137, "logps/rejected": -6.733799457550049, "loss": 0.0396, "odds_ratio_loss": 0.0007181918481364846, "rewards/accuracies": 1.0, "rewards/chosen": -0.0023160211276263, "rewards/margins": 0.6710639595985413, "rewards/rejected": -0.6733798980712891, "sft_loss": 0.023160209879279137, "step": 3956 }, { "epoch": 5.72234273318872, "grad_norm": 0.9881255041140731, "learning_rate": 4.2995403298751176e-08, "logits/chosen": -0.8412911295890808, "logits/rejected": -0.7012451887130737, "logps/chosen": -0.035623397678136826, "logps/rejected": -3.769057035446167, "loss": 0.0458, "odds_ratio_loss": 0.002924562431871891, "rewards/accuracies": 1.0, "rewards/chosen": -0.003562340047210455, "rewards/margins": 0.3733433783054352, "rewards/rejected": -0.3769057095050812, "sft_loss": 0.035623397678136826, "step": 3957 }, { "epoch": 5.7237888647866955, "grad_norm": 1.2450271286281953, "learning_rate": 4.2542435217184146e-08, "logits/chosen": -0.7657533884048462, "logits/rejected": -0.7611573934555054, "logps/chosen": -0.04250839352607727, "logps/rejected": -5.744058609008789, "loss": 0.0353, "odds_ratio_loss": 0.003883287776261568, "rewards/accuracies": 1.0, "rewards/chosen": -0.004250839352607727, "rewards/margins": 0.5701550245285034, "rewards/rejected": -0.5744057893753052, "sft_loss": 0.04250839352607727, "step": 3958 }, { "epoch": 5.725234996384671, "grad_norm": 0.9684630858692042, "learning_rate": 4.209185310152197e-08, "logits/chosen": -1.0507510900497437, "logits/rejected": -0.7392194867134094, "logps/chosen": -0.020687285810709, "logps/rejected": -5.629661560058594, "loss": 0.0284, "odds_ratio_loss": 0.0012234591413289309, "rewards/accuracies": 1.0, "rewards/chosen": -0.0020687286742031574, "rewards/margins": 0.5608974695205688, "rewards/rejected": -0.5629662275314331, "sft_loss": 0.020687285810709, "step": 3959 }, { "epoch": 5.726681127982646, "grad_norm": 1.0111903958418516, "learning_rate": 4.164365722342245e-08, "logits/chosen": -0.8547532558441162, "logits/rejected": -0.7358726859092712, "logps/chosen": -0.03250580281019211, "logps/rejected": -4.449581623077393, "loss": 0.0251, "odds_ratio_loss": 0.0019836989231407642, "rewards/accuracies": 1.0, "rewards/chosen": -0.0032505805138498545, "rewards/margins": 0.4417075514793396, "rewards/rejected": -0.4449581503868103, "sft_loss": 0.03250580281019211, "step": 3960 }, { "epoch": 5.728127259580622, "grad_norm": 0.8954341522636333, "learning_rate": 4.119784785310454e-08, "logits/chosen": -0.97066330909729, "logits/rejected": -0.7040786743164062, "logps/chosen": -0.014331339858472347, "logps/rejected": -5.2800421714782715, "loss": 0.0229, "odds_ratio_loss": 0.0006313954363577068, "rewards/accuracies": 1.0, "rewards/chosen": -0.0014331340789794922, "rewards/margins": 0.5265710949897766, "rewards/rejected": -0.5280042290687561, "sft_loss": 0.014331339858472347, "step": 3961 }, { "epoch": 5.729573391178597, "grad_norm": 0.8325532370707646, "learning_rate": 4.0754425259348355e-08, "logits/chosen": -0.9457544088363647, "logits/rejected": -0.6079109907150269, "logps/chosen": -0.01744541898369789, "logps/rejected": -5.451538562774658, "loss": 0.0226, "odds_ratio_loss": 0.0007178646046668291, "rewards/accuracies": 1.0, "rewards/chosen": -0.0017445420380681753, "rewards/margins": 0.5434092879295349, "rewards/rejected": -0.5451538562774658, "sft_loss": 0.01744541898369789, "step": 3962 }, { "epoch": 5.731019522776573, "grad_norm": 1.112537502022365, "learning_rate": 4.031338970949516e-08, "logits/chosen": -0.7553691864013672, "logits/rejected": -0.5517176985740662, "logps/chosen": -0.03140060976147652, "logps/rejected": -5.191702842712402, "loss": 0.0459, "odds_ratio_loss": 0.0007400895119644701, "rewards/accuracies": 1.0, "rewards/chosen": -0.0031400611624121666, "rewards/margins": 0.5160301923751831, "rewards/rejected": -0.5191702842712402, "sft_loss": 0.03140060976147652, "step": 3963 }, { "epoch": 5.732465654374548, "grad_norm": 1.01456036033676, "learning_rate": 3.987474146944647e-08, "logits/chosen": -1.02970552444458, "logits/rejected": -0.8884641528129578, "logps/chosen": -0.013003876432776451, "logps/rejected": -4.971007823944092, "loss": 0.0344, "odds_ratio_loss": 0.0005930347251705825, "rewards/accuracies": 1.0, "rewards/chosen": -0.0013003875501453876, "rewards/margins": 0.4958004057407379, "rewards/rejected": -0.4971008002758026, "sft_loss": 0.013003876432776451, "step": 3964 }, { "epoch": 5.7339117859725235, "grad_norm": 1.2032055887513502, "learning_rate": 3.943848080366541e-08, "logits/chosen": -1.0313166379928589, "logits/rejected": -0.6610112190246582, "logps/chosen": -0.013816236518323421, "logps/rejected": -5.627511978149414, "loss": 0.0335, "odds_ratio_loss": 0.00041836718446575105, "rewards/accuracies": 1.0, "rewards/chosen": -0.0013816235587000847, "rewards/margins": 0.5613695383071899, "rewards/rejected": -0.5627512335777283, "sft_loss": 0.013816236518323421, "step": 3965 }, { "epoch": 5.735357917570499, "grad_norm": 1.0401773408038113, "learning_rate": 3.9004607975174905e-08, "logits/chosen": -0.9395818114280701, "logits/rejected": -0.770209789276123, "logps/chosen": -0.027776891365647316, "logps/rejected": -4.5239949226379395, "loss": 0.0187, "odds_ratio_loss": 0.001369029050692916, "rewards/accuracies": 1.0, "rewards/chosen": -0.0027776893693953753, "rewards/margins": 0.4496217966079712, "rewards/rejected": -0.45239946246147156, "sft_loss": 0.027776891365647316, "step": 3966 }, { "epoch": 5.736804049168474, "grad_norm": 0.9541026537621927, "learning_rate": 3.857312324555862e-08, "logits/chosen": -0.7973968386650085, "logits/rejected": -0.5857910513877869, "logps/chosen": -0.018500562757253647, "logps/rejected": -5.999993324279785, "loss": 0.0199, "odds_ratio_loss": 0.0004114778130315244, "rewards/accuracies": 1.0, "rewards/chosen": -0.001850056229159236, "rewards/margins": 0.598149299621582, "rewards/rejected": -0.5999993681907654, "sft_loss": 0.018500562757253647, "step": 3967 }, { "epoch": 5.73825018076645, "grad_norm": 1.1225011262565519, "learning_rate": 3.814402687496043e-08, "logits/chosen": -1.0490517616271973, "logits/rejected": -0.8305553793907166, "logps/chosen": -0.005585083272308111, "logps/rejected": -5.262382984161377, "loss": 0.0306, "odds_ratio_loss": 0.0003785730223171413, "rewards/accuracies": 1.0, "rewards/chosen": -0.0005585083272308111, "rewards/margins": 0.5256798267364502, "rewards/rejected": -0.5262383222579956, "sft_loss": 0.005585083272308111, "step": 3968 }, { "epoch": 5.739696312364425, "grad_norm": 1.026685490340686, "learning_rate": 3.7717319122083645e-08, "logits/chosen": -1.0224274396896362, "logits/rejected": -0.8586872816085815, "logps/chosen": -0.006939824670553207, "logps/rejected": -5.025428295135498, "loss": 0.0293, "odds_ratio_loss": 0.00030064102611504495, "rewards/accuracies": 1.0, "rewards/chosen": -0.0006939824670553207, "rewards/margins": 0.5018488764762878, "rewards/rejected": -0.5025428533554077, "sft_loss": 0.006939824670553207, "step": 3969 }, { "epoch": 5.741142443962401, "grad_norm": 1.1829241105985957, "learning_rate": 3.729300024419224e-08, "logits/chosen": -0.8081978559494019, "logits/rejected": -0.518790602684021, "logps/chosen": -0.026094887405633926, "logps/rejected": -4.8559250831604, "loss": 0.0539, "odds_ratio_loss": 0.001554901129566133, "rewards/accuracies": 1.0, "rewards/chosen": -0.0026094887871295214, "rewards/margins": 0.4829829931259155, "rewards/rejected": -0.48559248447418213, "sft_loss": 0.026094887405633926, "step": 3970 }, { "epoch": 5.742588575560376, "grad_norm": 1.1125981286298714, "learning_rate": 3.687107049710958e-08, "logits/chosen": -0.969440758228302, "logits/rejected": -0.677169919013977, "logps/chosen": -0.040819112211465836, "logps/rejected": -5.158903121948242, "loss": 0.044, "odds_ratio_loss": 0.0023379060439765453, "rewards/accuracies": 1.0, "rewards/chosen": -0.0040819114074110985, "rewards/margins": 0.511808454990387, "rewards/rejected": -0.51589035987854, "sft_loss": 0.040819112211465836, "step": 3971 }, { "epoch": 5.7440347071583515, "grad_norm": 1.0875350672506494, "learning_rate": 3.645153013521929e-08, "logits/chosen": -1.1375861167907715, "logits/rejected": -0.7329793572425842, "logps/chosen": -0.025086410343647003, "logps/rejected": -6.695981025695801, "loss": 0.0328, "odds_ratio_loss": 0.0005195082630962133, "rewards/accuracies": 1.0, "rewards/chosen": -0.0025086409877985716, "rewards/margins": 0.6670895218849182, "rewards/rejected": -0.6695981621742249, "sft_loss": 0.025086410343647003, "step": 3972 }, { "epoch": 5.745480838756327, "grad_norm": 1.2322911875291596, "learning_rate": 3.603437941146303e-08, "logits/chosen": -0.9515436887741089, "logits/rejected": -0.791551947593689, "logps/chosen": -0.009043844416737556, "logps/rejected": -4.902947425842285, "loss": 0.0559, "odds_ratio_loss": 0.0005873933550901711, "rewards/accuracies": 1.0, "rewards/chosen": -0.0009043845348060131, "rewards/margins": 0.48939037322998047, "rewards/rejected": -0.49029478430747986, "sft_loss": 0.009043844416737556, "step": 3973 }, { "epoch": 5.746926970354302, "grad_norm": 1.1251125949181537, "learning_rate": 3.561961857734275e-08, "logits/chosen": -0.7727161645889282, "logits/rejected": -0.610916793346405, "logps/chosen": -0.03913598507642746, "logps/rejected": -4.861371040344238, "loss": 0.0393, "odds_ratio_loss": 0.007291535846889019, "rewards/accuracies": 1.0, "rewards/chosen": -0.003913598600775003, "rewards/margins": 0.4822235107421875, "rewards/rejected": -0.4861370921134949, "sft_loss": 0.03913598507642746, "step": 3974 }, { "epoch": 5.748373101952278, "grad_norm": 1.5473900691851843, "learning_rate": 3.520724788291973e-08, "logits/chosen": -0.9996442794799805, "logits/rejected": -0.7617815732955933, "logps/chosen": -0.06447423249483109, "logps/rejected": -6.056490421295166, "loss": 0.0534, "odds_ratio_loss": 0.0018590696854516864, "rewards/accuracies": 1.0, "rewards/chosen": -0.006447423715144396, "rewards/margins": 0.5992016792297363, "rewards/rejected": -0.6056490540504456, "sft_loss": 0.06447423249483109, "step": 3975 }, { "epoch": 5.749819233550253, "grad_norm": 1.215426207668915, "learning_rate": 3.479726757681289e-08, "logits/chosen": -0.7984232902526855, "logits/rejected": -0.5945329666137695, "logps/chosen": -0.022249722853302956, "logps/rejected": -6.509105205535889, "loss": 0.0322, "odds_ratio_loss": 0.0006876873667351902, "rewards/accuracies": 1.0, "rewards/chosen": -0.0022249724715948105, "rewards/margins": 0.6486855745315552, "rewards/rejected": -0.650910496711731, "sft_loss": 0.022249722853302956, "step": 3976 }, { "epoch": 5.751265365148228, "grad_norm": 1.1150668910106851, "learning_rate": 3.4389677906201843e-08, "logits/chosen": -0.9797489643096924, "logits/rejected": -0.6965649127960205, "logps/chosen": -0.014419843442738056, "logps/rejected": -5.9641337394714355, "loss": 0.0456, "odds_ratio_loss": 0.0016502912621945143, "rewards/accuracies": 1.0, "rewards/chosen": -0.001441984437406063, "rewards/margins": 0.5949714183807373, "rewards/rejected": -0.5964133739471436, "sft_loss": 0.014419843442738056, "step": 3977 }, { "epoch": 5.752711496746204, "grad_norm": 0.9972980551037653, "learning_rate": 3.3984479116822896e-08, "logits/chosen": -0.7742958664894104, "logits/rejected": -0.6961202025413513, "logps/chosen": -0.021118473261594772, "logps/rejected": -4.253470420837402, "loss": 0.0226, "odds_ratio_loss": 0.001717887818813324, "rewards/accuracies": 1.0, "rewards/chosen": -0.002111847046762705, "rewards/margins": 0.4232351779937744, "rewards/rejected": -0.42534705996513367, "sft_loss": 0.021118473261594772, "step": 3978 }, { "epoch": 5.7541576283441795, "grad_norm": 1.1409967671513455, "learning_rate": 3.3581671452973084e-08, "logits/chosen": -0.895980954170227, "logits/rejected": -0.7544868588447571, "logps/chosen": -0.0419120267033577, "logps/rejected": -4.296963214874268, "loss": 0.038, "odds_ratio_loss": 0.0015717416536062956, "rewards/accuracies": 1.0, "rewards/chosen": -0.0041912030428647995, "rewards/margins": 0.4255051016807556, "rewards/rejected": -0.42969632148742676, "sft_loss": 0.0419120267033577, "step": 3979 }, { "epoch": 5.755603759942154, "grad_norm": 1.054092428879142, "learning_rate": 3.318125515750614e-08, "logits/chosen": -1.0647022724151611, "logits/rejected": -0.7363189458847046, "logps/chosen": -0.05184062197804451, "logps/rejected": -6.126248836517334, "loss": 0.0282, "odds_ratio_loss": 0.0024300923105329275, "rewards/accuracies": 1.0, "rewards/chosen": -0.0051840622909367085, "rewards/margins": 0.6074408292770386, "rewards/rejected": -0.6126248836517334, "sft_loss": 0.05184062197804451, "step": 3980 }, { "epoch": 5.75704989154013, "grad_norm": 1.1317499613490938, "learning_rate": 3.278323047183429e-08, "logits/chosen": -0.673163890838623, "logits/rejected": -0.6283612251281738, "logps/chosen": -0.024727502837777138, "logps/rejected": -5.055089473724365, "loss": 0.0313, "odds_ratio_loss": 0.004691070411354303, "rewards/accuracies": 1.0, "rewards/chosen": -0.0024727503769099712, "rewards/margins": 0.5030362606048584, "rewards/rejected": -0.5055089592933655, "sft_loss": 0.024727502837777138, "step": 3981 }, { "epoch": 5.758496023138106, "grad_norm": 1.2086838902742276, "learning_rate": 3.238759763592824e-08, "logits/chosen": -0.9168479442596436, "logits/rejected": -0.7292149662971497, "logps/chosen": -0.05325371026992798, "logps/rejected": -5.35057258605957, "loss": 0.0847, "odds_ratio_loss": 0.0033136121928691864, "rewards/accuracies": 1.0, "rewards/chosen": -0.005325371399521828, "rewards/margins": 0.5297318696975708, "rewards/rejected": -0.5350572466850281, "sft_loss": 0.05325371026992798, "step": 3982 }, { "epoch": 5.7599421547360805, "grad_norm": 1.2961516935194144, "learning_rate": 3.199435688831631e-08, "logits/chosen": -0.8049898147583008, "logits/rejected": -0.677711009979248, "logps/chosen": -0.03922729194164276, "logps/rejected": -3.4130773544311523, "loss": 0.0436, "odds_ratio_loss": 0.0017275214195251465, "rewards/accuracies": 1.0, "rewards/chosen": -0.003922729752957821, "rewards/margins": 0.33738502860069275, "rewards/rejected": -0.34130772948265076, "sft_loss": 0.03922729194164276, "step": 3983 }, { "epoch": 5.761388286334056, "grad_norm": 1.188349583853395, "learning_rate": 3.1603508466085284e-08, "logits/chosen": -0.9943714141845703, "logits/rejected": -0.6691036224365234, "logps/chosen": -0.025151284411549568, "logps/rejected": -5.576414108276367, "loss": 0.0331, "odds_ratio_loss": 0.0018484786851331592, "rewards/accuracies": 1.0, "rewards/chosen": -0.002515128580853343, "rewards/margins": 0.5551262497901917, "rewards/rejected": -0.5576413869857788, "sft_loss": 0.025151284411549568, "step": 3984 }, { "epoch": 5.762834417932032, "grad_norm": 1.1340347678222187, "learning_rate": 3.1215052604879114e-08, "logits/chosen": -0.8503623008728027, "logits/rejected": -0.7320210933685303, "logps/chosen": -0.02072015404701233, "logps/rejected": -3.383143901824951, "loss": 0.0354, "odds_ratio_loss": 0.0017923712730407715, "rewards/accuracies": 1.0, "rewards/chosen": -0.002072015544399619, "rewards/margins": 0.3362423777580261, "rewards/rejected": -0.338314414024353, "sft_loss": 0.02072015404701233, "step": 3985 }, { "epoch": 5.764280549530008, "grad_norm": 1.3273836595656827, "learning_rate": 3.082898953889845e-08, "logits/chosen": -0.8875366449356079, "logits/rejected": -0.5429030060768127, "logps/chosen": -0.04038511961698532, "logps/rejected": -6.3761820793151855, "loss": 0.0474, "odds_ratio_loss": 0.004100095946341753, "rewards/accuracies": 1.0, "rewards/chosen": -0.004038511775434017, "rewards/margins": 0.6335797309875488, "rewards/rejected": -0.6376181840896606, "sft_loss": 0.04038511961698532, "step": 3986 }, { "epoch": 5.765726681127982, "grad_norm": 1.0007986876611032, "learning_rate": 3.044531950090334e-08, "logits/chosen": -0.8018543720245361, "logits/rejected": -0.6697442531585693, "logps/chosen": -0.05521427094936371, "logps/rejected": -5.116559982299805, "loss": 0.0362, "odds_ratio_loss": 0.0010527849663048983, "rewards/accuracies": 1.0, "rewards/chosen": -0.0055214278399944305, "rewards/margins": 0.506134569644928, "rewards/rejected": -0.5116559863090515, "sft_loss": 0.05521427094936371, "step": 3987 }, { "epoch": 5.767172812725958, "grad_norm": 1.3050876636722932, "learning_rate": 3.006404272220919e-08, "logits/chosen": -1.0676295757293701, "logits/rejected": -0.7167788147926331, "logps/chosen": -0.06772436201572418, "logps/rejected": -4.485447883605957, "loss": 0.0367, "odds_ratio_loss": 0.009542109444737434, "rewards/accuracies": 1.0, "rewards/chosen": -0.006772437132894993, "rewards/margins": 0.44177234172821045, "rewards/rejected": -0.44854480028152466, "sft_loss": 0.06772436201572418, "step": 3988 }, { "epoch": 5.768618944323934, "grad_norm": 1.275730282332951, "learning_rate": 2.9685159432689012e-08, "logits/chosen": -0.8527493476867676, "logits/rejected": -0.6094472408294678, "logps/chosen": -0.05909667909145355, "logps/rejected": -7.165646553039551, "loss": 0.0344, "odds_ratio_loss": 0.003317814087495208, "rewards/accuracies": 1.0, "rewards/chosen": -0.005909668281674385, "rewards/margins": 0.7106549739837646, "rewards/rejected": -0.7165646553039551, "sft_loss": 0.05909667909145355, "step": 3989 }, { "epoch": 5.7700650759219085, "grad_norm": 1.2166176608174204, "learning_rate": 2.9308669860773848e-08, "logits/chosen": -0.9953593015670776, "logits/rejected": -0.6956831216812134, "logps/chosen": -0.01776827871799469, "logps/rejected": -6.028247833251953, "loss": 0.0369, "odds_ratio_loss": 0.002291513839736581, "rewards/accuracies": 1.0, "rewards/chosen": -0.0017768278485164046, "rewards/margins": 0.6010479927062988, "rewards/rejected": -0.6028247475624084, "sft_loss": 0.01776827871799469, "step": 3990 }, { "epoch": 5.771511207519884, "grad_norm": 1.2792370639647088, "learning_rate": 2.893457423344925e-08, "logits/chosen": -0.6821169853210449, "logits/rejected": -0.5734527111053467, "logps/chosen": -0.019364001229405403, "logps/rejected": -4.074793815612793, "loss": 0.0419, "odds_ratio_loss": 0.0023358322214335203, "rewards/accuracies": 1.0, "rewards/chosen": -0.0019364003092050552, "rewards/margins": 0.4055430293083191, "rewards/rejected": -0.4074794352054596, "sft_loss": 0.019364001229405403, "step": 3991 }, { "epoch": 5.77295733911786, "grad_norm": 0.9186218043755977, "learning_rate": 2.8562872776260126e-08, "logits/chosen": -0.7581536769866943, "logits/rejected": -0.6958929300308228, "logps/chosen": -0.025422383099794388, "logps/rejected": -5.949764728546143, "loss": 0.0207, "odds_ratio_loss": 0.0005549125489778817, "rewards/accuracies": 1.0, "rewards/chosen": -0.0025422382168471813, "rewards/margins": 0.5924341678619385, "rewards/rejected": -0.5949764251708984, "sft_loss": 0.025422383099794388, "step": 3992 }, { "epoch": 5.774403470715836, "grad_norm": 0.9030764581302508, "learning_rate": 2.8193565713306335e-08, "logits/chosen": -0.9597541093826294, "logits/rejected": -0.6982543468475342, "logps/chosen": -0.010779261589050293, "logps/rejected": -5.799703598022461, "loss": 0.0222, "odds_ratio_loss": 0.0005417458014562726, "rewards/accuracies": 1.0, "rewards/chosen": -0.0010779262520372868, "rewards/margins": 0.5788924098014832, "rewards/rejected": -0.5799703598022461, "sft_loss": 0.010779261589050293, "step": 3993 }, { "epoch": 5.77584960231381, "grad_norm": 0.8221212441297421, "learning_rate": 2.7826653267243984e-08, "logits/chosen": -0.9700363874435425, "logits/rejected": -0.885134756565094, "logps/chosen": -0.01687433198094368, "logps/rejected": -5.0756330490112305, "loss": 0.0204, "odds_ratio_loss": 0.0016347735654562712, "rewards/accuracies": 1.0, "rewards/chosen": -0.0016874329885467887, "rewards/margins": 0.5058758854866028, "rewards/rejected": -0.5075633525848389, "sft_loss": 0.01687433198094368, "step": 3994 }, { "epoch": 5.777295733911786, "grad_norm": 1.0148568039942327, "learning_rate": 2.746213565928679e-08, "logits/chosen": -0.9796086549758911, "logits/rejected": -0.5948249697685242, "logps/chosen": -0.02354905754327774, "logps/rejected": -5.957253456115723, "loss": 0.0458, "odds_ratio_loss": 0.0007033887668512762, "rewards/accuracies": 1.0, "rewards/chosen": -0.0023549057077616453, "rewards/margins": 0.5933704972267151, "rewards/rejected": -0.5957253575325012, "sft_loss": 0.02354905754327774, "step": 3995 }, { "epoch": 5.778741865509762, "grad_norm": 1.0693204506045595, "learning_rate": 2.7100013109202957e-08, "logits/chosen": -1.2479994297027588, "logits/rejected": -0.8474553823471069, "logps/chosen": -0.04476558417081833, "logps/rejected": -4.169618606567383, "loss": 0.0323, "odds_ratio_loss": 0.004479460418224335, "rewards/accuracies": 1.0, "rewards/chosen": -0.004476558417081833, "rewards/margins": 0.4124853014945984, "rewards/rejected": -0.4169618785381317, "sft_loss": 0.04476558417081833, "step": 3996 }, { "epoch": 5.780187997107737, "grad_norm": 1.6249830066844961, "learning_rate": 2.6740285835317844e-08, "logits/chosen": -0.7211387157440186, "logits/rejected": -0.5791776776313782, "logps/chosen": -0.08303683251142502, "logps/rejected": -6.241660118103027, "loss": 0.0439, "odds_ratio_loss": 0.005026006139814854, "rewards/accuracies": 1.0, "rewards/chosen": -0.008303683251142502, "rewards/margins": 0.615862250328064, "rewards/rejected": -0.624165952205658, "sft_loss": 0.08303683251142502, "step": 3997 }, { "epoch": 5.781634128705712, "grad_norm": 1.112640675962451, "learning_rate": 2.638295405451263e-08, "logits/chosen": -0.973607063293457, "logits/rejected": -0.6138404607772827, "logps/chosen": -0.034950047731399536, "logps/rejected": -5.843950271606445, "loss": 0.0291, "odds_ratio_loss": 0.0012881564907729626, "rewards/accuracies": 1.0, "rewards/chosen": -0.003495004726573825, "rewards/margins": 0.5808999538421631, "rewards/rejected": -0.5843949913978577, "sft_loss": 0.034950047731399536, "step": 3998 }, { "epoch": 5.783080260303688, "grad_norm": 0.8817721981331313, "learning_rate": 2.602801798222387e-08, "logits/chosen": -1.0843451023101807, "logits/rejected": -0.8955305218696594, "logps/chosen": -0.014239782467484474, "logps/rejected": -3.2525007724761963, "loss": 0.0249, "odds_ratio_loss": 0.0006716042989864945, "rewards/accuracies": 1.0, "rewards/chosen": -0.0014239782467484474, "rewards/margins": 0.3238261044025421, "rewards/rejected": -0.3252500891685486, "sft_loss": 0.014239782467484474, "step": 3999 }, { "epoch": 5.784526391901663, "grad_norm": 0.9708590418536193, "learning_rate": 2.567547783244306e-08, "logits/chosen": -0.9054265022277832, "logits/rejected": -0.6644700765609741, "logps/chosen": -0.03736726939678192, "logps/rejected": -4.768521785736084, "loss": 0.0316, "odds_ratio_loss": 0.0016982683446258307, "rewards/accuracies": 1.0, "rewards/chosen": -0.003736727172508836, "rewards/margins": 0.4731154441833496, "rewards/rejected": -0.4768521785736084, "sft_loss": 0.03736726939678192, "step": 4000 }, { "epoch": 5.785972523499638, "grad_norm": 1.6144577125437707, "learning_rate": 2.5325333817719285e-08, "logits/chosen": -0.8311997056007385, "logits/rejected": -0.7566261291503906, "logps/chosen": -0.05182730779051781, "logps/rejected": -3.975294828414917, "loss": 0.0513, "odds_ratio_loss": 0.00553969107568264, "rewards/accuracies": 1.0, "rewards/chosen": -0.005182730499655008, "rewards/margins": 0.3923467695713043, "rewards/rejected": -0.3975295126438141, "sft_loss": 0.05182730779051781, "step": 4001 }, { "epoch": 5.787418655097614, "grad_norm": 1.0544511861272714, "learning_rate": 2.4977586149154793e-08, "logits/chosen": -0.8234452605247498, "logits/rejected": -0.6371239423751831, "logps/chosen": -0.055266425013542175, "logps/rejected": -4.471240520477295, "loss": 0.0312, "odds_ratio_loss": 0.002024412387982011, "rewards/accuracies": 1.0, "rewards/chosen": -0.00552664278075099, "rewards/margins": 0.4415974020957947, "rewards/rejected": -0.44712403416633606, "sft_loss": 0.055266425013542175, "step": 4002 }, { "epoch": 5.788864786695589, "grad_norm": 0.9448910379395407, "learning_rate": 2.4632235036408544e-08, "logits/chosen": -1.094842553138733, "logits/rejected": -0.8462741374969482, "logps/chosen": -0.020257651805877686, "logps/rejected": -5.7198896408081055, "loss": 0.0359, "odds_ratio_loss": 0.0031071146950125694, "rewards/accuracies": 1.0, "rewards/chosen": -0.00202576513402164, "rewards/margins": 0.5699632167816162, "rewards/rejected": -0.5719889402389526, "sft_loss": 0.020257651805877686, "step": 4003 }, { "epoch": 5.790310918293565, "grad_norm": 1.2029466658124404, "learning_rate": 2.4289280687693093e-08, "logits/chosen": -1.007336974143982, "logits/rejected": -0.8913165330886841, "logps/chosen": -0.04219206050038338, "logps/rejected": -3.9805102348327637, "loss": 0.0373, "odds_ratio_loss": 0.007664866745471954, "rewards/accuracies": 1.0, "rewards/chosen": -0.004219206050038338, "rewards/margins": 0.3938317894935608, "rewards/rejected": -0.39805102348327637, "sft_loss": 0.04219206050038338, "step": 4004 }, { "epoch": 5.79175704989154, "grad_norm": 1.5579647159504868, "learning_rate": 2.3948723309777706e-08, "logits/chosen": -0.90506911277771, "logits/rejected": -0.6676803827285767, "logps/chosen": -0.04042967036366463, "logps/rejected": -3.899822950363159, "loss": 0.0528, "odds_ratio_loss": 0.0016432093689218163, "rewards/accuracies": 1.0, "rewards/chosen": -0.004042967222630978, "rewards/margins": 0.3859393000602722, "rewards/rejected": -0.38998228311538696, "sft_loss": 0.04042967036366463, "step": 4005 }, { "epoch": 5.793203181489515, "grad_norm": 0.9664642233082743, "learning_rate": 2.361056310798526e-08, "logits/chosen": -1.0167044401168823, "logits/rejected": -0.588829517364502, "logps/chosen": -0.047890324145555496, "logps/rejected": -5.325345039367676, "loss": 0.0256, "odds_ratio_loss": 0.0006860626745037735, "rewards/accuracies": 1.0, "rewards/chosen": -0.004789032973349094, "rewards/margins": 0.5277454853057861, "rewards/rejected": -0.5325345396995544, "sft_loss": 0.047890324145555496, "step": 4006 }, { "epoch": 5.794649313087491, "grad_norm": 1.2513957261398723, "learning_rate": 2.3274800286193997e-08, "logits/chosen": -1.0148444175720215, "logits/rejected": -0.7270374298095703, "logps/chosen": -0.026285681873559952, "logps/rejected": -3.6439900398254395, "loss": 0.0293, "odds_ratio_loss": 0.004125781357288361, "rewards/accuracies": 1.0, "rewards/chosen": -0.0026285680942237377, "rewards/margins": 0.3617704510688782, "rewards/rejected": -0.3643990159034729, "sft_loss": 0.026285681873559952, "step": 4007 }, { "epoch": 5.7960954446854664, "grad_norm": 1.4118563796416221, "learning_rate": 2.2941435046836654e-08, "logits/chosen": -0.9013469219207764, "logits/rejected": -0.8452996015548706, "logps/chosen": -0.04646843299269676, "logps/rejected": -6.354674816131592, "loss": 0.033, "odds_ratio_loss": 0.0004940081853419542, "rewards/accuracies": 1.0, "rewards/chosen": -0.0046468437649309635, "rewards/margins": 0.6308206915855408, "rewards/rejected": -0.635467529296875, "sft_loss": 0.04646843299269676, "step": 4008 }, { "epoch": 5.797541576283442, "grad_norm": 1.5375611646583789, "learning_rate": 2.2610467590900463e-08, "logits/chosen": -1.030297875404358, "logits/rejected": -0.726286768913269, "logps/chosen": -0.022617783397436142, "logps/rejected": -4.68665885925293, "loss": 0.0272, "odds_ratio_loss": 0.0016010688850656152, "rewards/accuracies": 1.0, "rewards/chosen": -0.002261778572574258, "rewards/margins": 0.46640413999557495, "rewards/rejected": -0.4686659276485443, "sft_loss": 0.022617783397436142, "step": 4009 }, { "epoch": 5.798987707881417, "grad_norm": 0.9161126912343743, "learning_rate": 2.2281898117926244e-08, "logits/chosen": -1.0119342803955078, "logits/rejected": -0.6778802275657654, "logps/chosen": -0.019781216979026794, "logps/rejected": -5.734238624572754, "loss": 0.0274, "odds_ratio_loss": 0.002621131483465433, "rewards/accuracies": 1.0, "rewards/chosen": -0.0019781216979026794, "rewards/margins": 0.5714457631111145, "rewards/rejected": -0.5734239220619202, "sft_loss": 0.019781216979026794, "step": 4010 }, { "epoch": 5.800433839479393, "grad_norm": 0.8694353605814207, "learning_rate": 2.1955726826010655e-08, "logits/chosen": -0.8888824582099915, "logits/rejected": -0.6554268002510071, "logps/chosen": -0.008277302607893944, "logps/rejected": -4.462595462799072, "loss": 0.0235, "odds_ratio_loss": 0.0004954091855324805, "rewards/accuracies": 1.0, "rewards/chosen": -0.0008277302840724587, "rewards/margins": 0.44543182849884033, "rewards/rejected": -0.4462595582008362, "sft_loss": 0.008277302607893944, "step": 4011 }, { "epoch": 5.801879971077368, "grad_norm": 1.0813962888466517, "learning_rate": 2.1631953911803058e-08, "logits/chosen": -0.9825663566589355, "logits/rejected": -0.7994692325592041, "logps/chosen": -0.014045009389519691, "logps/rejected": -5.245484352111816, "loss": 0.0358, "odds_ratio_loss": 0.044613584876060486, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0014045010320842266, "rewards/margins": 0.5231439471244812, "rewards/rejected": -0.5245484113693237, "sft_loss": 0.014045009389519691, "step": 4012 }, { "epoch": 5.803326102675343, "grad_norm": 0.9881938059874377, "learning_rate": 2.131057957050775e-08, "logits/chosen": -0.9296178221702576, "logits/rejected": -0.662962794303894, "logps/chosen": -0.015109667554497719, "logps/rejected": -5.480372428894043, "loss": 0.0432, "odds_ratio_loss": 0.0005065487348474562, "rewards/accuracies": 1.0, "rewards/chosen": -0.0015109669184312224, "rewards/margins": 0.546526312828064, "rewards/rejected": -0.5480372905731201, "sft_loss": 0.015109667554497719, "step": 4013 }, { "epoch": 5.804772234273319, "grad_norm": 1.4249186140475822, "learning_rate": 2.0991603995881736e-08, "logits/chosen": -1.1877706050872803, "logits/rejected": -0.7925946116447449, "logps/chosen": -0.0431302934885025, "logps/rejected": -6.408783912658691, "loss": 0.0465, "odds_ratio_loss": 0.0022244546562433243, "rewards/accuracies": 1.0, "rewards/chosen": -0.00431302934885025, "rewards/margins": 0.6365653872489929, "rewards/rejected": -0.6408783793449402, "sft_loss": 0.0431302934885025, "step": 4014 }, { "epoch": 5.8062183658712945, "grad_norm": 1.0239024998010973, "learning_rate": 2.0675027380237408e-08, "logits/chosen": -0.9312243461608887, "logits/rejected": -0.8070486783981323, "logps/chosen": -0.02381393127143383, "logps/rejected": -5.620185852050781, "loss": 0.0412, "odds_ratio_loss": 0.0018174147699028254, "rewards/accuracies": 1.0, "rewards/chosen": -0.002381392987444997, "rewards/margins": 0.559637188911438, "rewards/rejected": -0.562018632888794, "sft_loss": 0.02381393127143383, "step": 4015 }, { "epoch": 5.80766449746927, "grad_norm": 1.1124521948539061, "learning_rate": 2.0360849914439427e-08, "logits/chosen": -0.933996319770813, "logits/rejected": -0.8640280961990356, "logps/chosen": -0.049488216638565063, "logps/rejected": -4.251837730407715, "loss": 0.0513, "odds_ratio_loss": 0.004922335967421532, "rewards/accuracies": 1.0, "rewards/chosen": -0.004948821850121021, "rewards/margins": 0.4202350080013275, "rewards/rejected": -0.42518380284309387, "sft_loss": 0.049488216638565063, "step": 4016 }, { "epoch": 5.809110629067245, "grad_norm": 1.50884766745822, "learning_rate": 2.0049071787906933e-08, "logits/chosen": -0.7553597092628479, "logits/rejected": -0.6329598426818848, "logps/chosen": -0.020680755376815796, "logps/rejected": -4.397462368011475, "loss": 0.037, "odds_ratio_loss": 0.0012878633569926023, "rewards/accuracies": 1.0, "rewards/chosen": -0.0020680755842477083, "rewards/margins": 0.43767818808555603, "rewards/rejected": -0.43974626064300537, "sft_loss": 0.020680755376815796, "step": 4017 }, { "epoch": 5.810556760665221, "grad_norm": 1.254658529361879, "learning_rate": 1.973969318861224e-08, "logits/chosen": -0.9069069623947144, "logits/rejected": -0.8351166248321533, "logps/chosen": -0.07286441326141357, "logps/rejected": -4.8769145011901855, "loss": 0.0591, "odds_ratio_loss": 0.004483464173972607, "rewards/accuracies": 1.0, "rewards/chosen": -0.00728644197806716, "rewards/margins": 0.4804050326347351, "rewards/rejected": -0.4876914620399475, "sft_loss": 0.07286441326141357, "step": 4018 }, { "epoch": 5.812002892263196, "grad_norm": 1.443364183996849, "learning_rate": 1.9432714303080354e-08, "logits/chosen": -0.9718085527420044, "logits/rejected": -0.824963390827179, "logps/chosen": -0.062426358461380005, "logps/rejected": -4.95826530456543, "loss": 0.0591, "odds_ratio_loss": 0.002314309123903513, "rewards/accuracies": 1.0, "rewards/chosen": -0.006242636125534773, "rewards/margins": 0.4895838499069214, "rewards/rejected": -0.49582648277282715, "sft_loss": 0.062426358461380005, "step": 4019 }, { "epoch": 5.813449023861171, "grad_norm": 1.0809438633806718, "learning_rate": 1.9128135316390348e-08, "logits/chosen": -0.9536488652229309, "logits/rejected": -0.690174400806427, "logps/chosen": -0.03028007224202156, "logps/rejected": -4.903285980224609, "loss": 0.0359, "odds_ratio_loss": 0.0007200206746347249, "rewards/accuracies": 1.0, "rewards/chosen": -0.0030280069913715124, "rewards/margins": 0.4873005747795105, "rewards/rejected": -0.4903286099433899, "sft_loss": 0.03028007224202156, "step": 4020 }, { "epoch": 5.814895155459147, "grad_norm": 1.1851391157157583, "learning_rate": 1.882595641217355e-08, "logits/chosen": -1.0572259426116943, "logits/rejected": -0.785851240158081, "logps/chosen": -0.01872413232922554, "logps/rejected": -4.886478900909424, "loss": 0.0436, "odds_ratio_loss": 0.0007939153583720326, "rewards/accuracies": 1.0, "rewards/chosen": -0.0018724132096394897, "rewards/margins": 0.4867754578590393, "rewards/rejected": -0.4886478781700134, "sft_loss": 0.01872413232922554, "step": 4021 }, { "epoch": 5.8163412870571225, "grad_norm": 1.0114293223244128, "learning_rate": 1.8526177772615336e-08, "logits/chosen": -0.8728271126747131, "logits/rejected": -0.6743614077568054, "logps/chosen": -0.03183284401893616, "logps/rejected": -4.913818836212158, "loss": 0.022, "odds_ratio_loss": 0.0015718166250735521, "rewards/accuracies": 1.0, "rewards/chosen": -0.003183284541592002, "rewards/margins": 0.4881986081600189, "rewards/rejected": -0.4913818836212158, "sft_loss": 0.03183284401893616, "step": 4022 }, { "epoch": 5.817787418655097, "grad_norm": 0.9150372634239352, "learning_rate": 1.8228799578452914e-08, "logits/chosen": -0.879639208316803, "logits/rejected": -0.6301390528678894, "logps/chosen": -0.015895256772637367, "logps/rejected": -5.926244258880615, "loss": 0.0277, "odds_ratio_loss": 0.0004677917459048331, "rewards/accuracies": 1.0, "rewards/chosen": -0.0015895256074145436, "rewards/margins": 0.5910348892211914, "rewards/rejected": -0.5926244258880615, "sft_loss": 0.015895256772637367, "step": 4023 }, { "epoch": 5.819233550253073, "grad_norm": 0.8348971690681323, "learning_rate": 1.7933822008977527e-08, "logits/chosen": -0.8647024631500244, "logits/rejected": -0.7957019805908203, "logps/chosen": -0.017145434394478798, "logps/rejected": -4.466237545013428, "loss": 0.0225, "odds_ratio_loss": 0.0012718301732093096, "rewards/accuracies": 1.0, "rewards/chosen": -0.0017145435558632016, "rewards/margins": 0.4449092149734497, "rewards/rejected": -0.4466237723827362, "sft_loss": 0.017145434394478798, "step": 4024 }, { "epoch": 5.820679681851049, "grad_norm": 1.1128379037048841, "learning_rate": 1.764124524203092e-08, "logits/chosen": -1.0014090538024902, "logits/rejected": -0.808147668838501, "logps/chosen": -0.0898212268948555, "logps/rejected": -3.219820499420166, "loss": 0.0551, "odds_ratio_loss": 0.00917213223874569, "rewards/accuracies": 1.0, "rewards/chosen": -0.008982122875750065, "rewards/margins": 0.3129999041557312, "rewards/rejected": -0.3219820261001587, "sft_loss": 0.0898212268948555, "step": 4025 }, { "epoch": 5.8221258134490235, "grad_norm": 1.0459193455299765, "learning_rate": 1.735106945400977e-08, "logits/chosen": -0.9716974496841431, "logits/rejected": -0.6809044480323792, "logps/chosen": -0.02012844756245613, "logps/rejected": -5.090946674346924, "loss": 0.0366, "odds_ratio_loss": 0.0011630118824541569, "rewards/accuracies": 1.0, "rewards/chosen": -0.0020128446631133556, "rewards/margins": 0.5070818066596985, "rewards/rejected": -0.5090946555137634, "sft_loss": 0.02012844756245613, "step": 4026 }, { "epoch": 5.823571945046999, "grad_norm": 1.0703583145731792, "learning_rate": 1.706329481986213e-08, "logits/chosen": -0.8691219687461853, "logits/rejected": -0.5219488739967346, "logps/chosen": -0.014108894392848015, "logps/rejected": -5.664173126220703, "loss": 0.0341, "odds_ratio_loss": 0.00077531993156299, "rewards/accuracies": 1.0, "rewards/chosen": -0.0014108894392848015, "rewards/margins": 0.56500643491745, "rewards/rejected": -0.5664172768592834, "sft_loss": 0.014108894392848015, "step": 4027 }, { "epoch": 5.825018076644975, "grad_norm": 0.7928782376077622, "learning_rate": 1.6777921513087433e-08, "logits/chosen": -0.8064377307891846, "logits/rejected": -0.6964023113250732, "logps/chosen": -0.013883027248084545, "logps/rejected": -3.529005765914917, "loss": 0.0165, "odds_ratio_loss": 0.000492023304104805, "rewards/accuracies": 1.0, "rewards/chosen": -0.0013883027713745832, "rewards/margins": 0.3515123128890991, "rewards/rejected": -0.35290059447288513, "sft_loss": 0.013883027248084545, "step": 4028 }, { "epoch": 5.82646420824295, "grad_norm": 1.116988376167779, "learning_rate": 1.6494949705739613e-08, "logits/chosen": -1.0281099081039429, "logits/rejected": -0.9229446649551392, "logps/chosen": -0.012211017310619354, "logps/rejected": -4.516913414001465, "loss": 0.0414, "odds_ratio_loss": 0.0013461960479617119, "rewards/accuracies": 1.0, "rewards/chosen": -0.0012211017310619354, "rewards/margins": 0.4504702091217041, "rewards/rejected": -0.45169129967689514, "sft_loss": 0.012211017310619354, "step": 4029 }, { "epoch": 5.827910339840925, "grad_norm": 1.0687469194910033, "learning_rate": 1.62143795684222e-08, "logits/chosen": -0.8571509718894958, "logits/rejected": -0.6989127397537231, "logps/chosen": -0.03518003597855568, "logps/rejected": -4.067432403564453, "loss": 0.0362, "odds_ratio_loss": 0.0014950365293771029, "rewards/accuracies": 1.0, "rewards/chosen": -0.003518004436045885, "rewards/margins": 0.40322527289390564, "rewards/rejected": -0.40674328804016113, "sft_loss": 0.03518003597855568, "step": 4030 }, { "epoch": 5.829356471438901, "grad_norm": 1.1180683825844981, "learning_rate": 1.5936211270292765e-08, "logits/chosen": -1.0204957723617554, "logits/rejected": -0.8662518262863159, "logps/chosen": -0.060904499143362045, "logps/rejected": -4.2967681884765625, "loss": 0.0523, "odds_ratio_loss": 0.001792628550902009, "rewards/accuracies": 1.0, "rewards/chosen": -0.006090449169278145, "rewards/margins": 0.423586368560791, "rewards/rejected": -0.42967677116394043, "sft_loss": 0.060904499143362045, "step": 4031 }, { "epoch": 5.830802603036877, "grad_norm": 1.1415004321422098, "learning_rate": 1.566044497905983e-08, "logits/chosen": -0.7944949865341187, "logits/rejected": -0.6358327269554138, "logps/chosen": -0.030271006748080254, "logps/rejected": -5.495286464691162, "loss": 0.0406, "odds_ratio_loss": 0.0012499038130044937, "rewards/accuracies": 1.0, "rewards/chosen": -0.003027101047337055, "rewards/margins": 0.5465015172958374, "rewards/rejected": -0.5495285987854004, "sft_loss": 0.030271006748080254, "step": 4032 }, { "epoch": 5.8322487346348515, "grad_norm": 0.9578429236167958, "learning_rate": 1.538708086098417e-08, "logits/chosen": -0.8756827116012573, "logits/rejected": -0.5940501689910889, "logps/chosen": -0.01414337195456028, "logps/rejected": -4.33171272277832, "loss": 0.0358, "odds_ratio_loss": 0.0012647686526179314, "rewards/accuracies": 1.0, "rewards/chosen": -0.001414337195456028, "rewards/margins": 0.43175697326660156, "rewards/rejected": -0.4331713020801544, "sft_loss": 0.01414337195456028, "step": 4033 }, { "epoch": 5.833694866232827, "grad_norm": 1.023502337177867, "learning_rate": 1.511611908087751e-08, "logits/chosen": -0.9340368509292603, "logits/rejected": -0.7580222487449646, "logps/chosen": -0.029262201860547066, "logps/rejected": -5.634925842285156, "loss": 0.0434, "odds_ratio_loss": 0.0013582634273916483, "rewards/accuracies": 1.0, "rewards/chosen": -0.0029262204188853502, "rewards/margins": 0.5605663061141968, "rewards/rejected": -0.5634925365447998, "sft_loss": 0.029262201860547066, "step": 4034 }, { "epoch": 5.835140997830803, "grad_norm": 1.1248514719985416, "learning_rate": 1.4847559802103837e-08, "logits/chosen": -0.9322491884231567, "logits/rejected": -0.6265773177146912, "logps/chosen": -0.0576665922999382, "logps/rejected": -4.378793239593506, "loss": 0.0477, "odds_ratio_loss": 0.0018896459368988872, "rewards/accuracies": 1.0, "rewards/chosen": -0.005766659043729305, "rewards/margins": 0.4321126639842987, "rewards/rejected": -0.4378793239593506, "sft_loss": 0.0576665922999382, "step": 4035 }, { "epoch": 5.836587129428778, "grad_norm": 1.2991973773075023, "learning_rate": 1.4581403186578523e-08, "logits/chosen": -0.8614668846130371, "logits/rejected": -0.6605913043022156, "logps/chosen": -0.06553688645362854, "logps/rejected": -4.359433650970459, "loss": 0.0473, "odds_ratio_loss": 0.00791736226528883, "rewards/accuracies": 1.0, "rewards/chosen": -0.006553689017891884, "rewards/margins": 0.42938968539237976, "rewards/rejected": -0.4359433948993683, "sft_loss": 0.06553688645362854, "step": 4036 }, { "epoch": 5.838033261026753, "grad_norm": 0.9138828175946025, "learning_rate": 1.4317649394768761e-08, "logits/chosen": -0.9851160049438477, "logits/rejected": -0.6789707541465759, "logps/chosen": -0.009366050362586975, "logps/rejected": -6.126029014587402, "loss": 0.0244, "odds_ratio_loss": 0.0009717661887407303, "rewards/accuracies": 1.0, "rewards/chosen": -0.0009366050362586975, "rewards/margins": 0.6116663217544556, "rewards/rejected": -0.6126028895378113, "sft_loss": 0.009366050362586975, "step": 4037 }, { "epoch": 5.839479392624729, "grad_norm": 1.2300109167391127, "learning_rate": 1.4056298585692238e-08, "logits/chosen": -1.1383479833602905, "logits/rejected": -0.7096400856971741, "logps/chosen": -0.13449586927890778, "logps/rejected": -5.386882781982422, "loss": 0.0668, "odds_ratio_loss": 0.0036082954611629248, "rewards/accuracies": 1.0, "rewards/chosen": -0.013449587859213352, "rewards/margins": 0.5252387523651123, "rewards/rejected": -0.5386883616447449, "sft_loss": 0.13449586927890778, "step": 4038 }, { "epoch": 5.840925524222705, "grad_norm": 1.380616416611711, "learning_rate": 1.3797350916918914e-08, "logits/chosen": -0.9529186487197876, "logits/rejected": -0.7264933586120605, "logps/chosen": -0.02300257608294487, "logps/rejected": -4.905316352844238, "loss": 0.0418, "odds_ratio_loss": 0.0007615200011059642, "rewards/accuracies": 1.0, "rewards/chosen": -0.002300257794559002, "rewards/margins": 0.4882313907146454, "rewards/rejected": -0.49053165316581726, "sft_loss": 0.02300257608294487, "step": 4039 }, { "epoch": 5.8423716558206795, "grad_norm": 1.0373658257981904, "learning_rate": 1.3540806544568794e-08, "logits/chosen": -0.6172590255737305, "logits/rejected": -0.42707759141921997, "logps/chosen": -0.020513903349637985, "logps/rejected": -5.75158166885376, "loss": 0.0309, "odds_ratio_loss": 0.001809641718864441, "rewards/accuracies": 1.0, "rewards/chosen": -0.002051390241831541, "rewards/margins": 0.5731067657470703, "rewards/rejected": -0.5751581788063049, "sft_loss": 0.020513903349637985, "step": 4040 }, { "epoch": 5.843817787418655, "grad_norm": 1.0530917340846524, "learning_rate": 1.3286665623313264e-08, "logits/chosen": -0.6495931148529053, "logits/rejected": -0.6027662754058838, "logps/chosen": -0.018377287313342094, "logps/rejected": -4.457110404968262, "loss": 0.0297, "odds_ratio_loss": 0.0023091405164450407, "rewards/accuracies": 1.0, "rewards/chosen": -0.00183772889431566, "rewards/margins": 0.4438733458518982, "rewards/rejected": -0.44571104645729065, "sft_loss": 0.018377287313342094, "step": 4041 }, { "epoch": 5.845263919016631, "grad_norm": 0.8971616920905066, "learning_rate": 1.3034928306375537e-08, "logits/chosen": -0.9880181550979614, "logits/rejected": -0.7460534572601318, "logps/chosen": -0.007409685291349888, "logps/rejected": -6.935779571533203, "loss": 0.0375, "odds_ratio_loss": 0.0009576200391165912, "rewards/accuracies": 1.0, "rewards/chosen": -0.0007409685058519244, "rewards/margins": 0.6928369998931885, "rewards/rejected": -0.6935780048370361, "sft_loss": 0.007409685291349888, "step": 4042 }, { "epoch": 5.846710050614606, "grad_norm": 1.1496577963793577, "learning_rate": 1.2785594745528427e-08, "logits/chosen": -0.7424166798591614, "logits/rejected": -0.7751407623291016, "logps/chosen": -0.08187150955200195, "logps/rejected": -4.955060958862305, "loss": 0.0482, "odds_ratio_loss": 0.004987210966646671, "rewards/accuracies": 1.0, "rewards/chosen": -0.008187152445316315, "rewards/margins": 0.4873189330101013, "rewards/rejected": -0.4955061078071594, "sft_loss": 0.08187150955200195, "step": 4043 }, { "epoch": 5.848156182212581, "grad_norm": 1.1586948812349995, "learning_rate": 1.2538665091096135e-08, "logits/chosen": -1.0148038864135742, "logits/rejected": -0.7616249322891235, "logps/chosen": -0.051184628158807755, "logps/rejected": -4.20012903213501, "loss": 0.0561, "odds_ratio_loss": 0.0011503919959068298, "rewards/accuracies": 1.0, "rewards/chosen": -0.005118463188409805, "rewards/margins": 0.4148944616317749, "rewards/rejected": -0.420012891292572, "sft_loss": 0.051184628158807755, "step": 4044 }, { "epoch": 5.849602313810557, "grad_norm": 1.000916813030637, "learning_rate": 1.2294139491953348e-08, "logits/chosen": -1.1537386178970337, "logits/rejected": -0.8139678835868835, "logps/chosen": -0.07197534292936325, "logps/rejected": -5.211237907409668, "loss": 0.0403, "odds_ratio_loss": 0.0006955383578315377, "rewards/accuracies": 1.0, "rewards/chosen": -0.00719753373414278, "rewards/margins": 0.5139262080192566, "rewards/rejected": -0.5211237668991089, "sft_loss": 0.07197534292936325, "step": 4045 }, { "epoch": 5.851048445408532, "grad_norm": 0.7275097227346992, "learning_rate": 1.205201809552614e-08, "logits/chosen": -0.868467390537262, "logits/rejected": -0.738372802734375, "logps/chosen": -0.008017164655029774, "logps/rejected": -5.244393825531006, "loss": 0.0164, "odds_ratio_loss": 0.0011052724439650774, "rewards/accuracies": 1.0, "rewards/chosen": -0.0008017165237106383, "rewards/margins": 0.5236376523971558, "rewards/rejected": -0.5244393348693848, "sft_loss": 0.008017164655029774, "step": 4046 }, { "epoch": 5.8524945770065075, "grad_norm": 1.3911350884327334, "learning_rate": 1.1812301047789741e-08, "logits/chosen": -0.9431469440460205, "logits/rejected": -0.5744068622589111, "logps/chosen": -0.028782209381461143, "logps/rejected": -5.3098039627075195, "loss": 0.0439, "odds_ratio_loss": 0.0012701970990747213, "rewards/accuracies": 1.0, "rewards/chosen": -0.002878220984712243, "rewards/margins": 0.5281022191047668, "rewards/rejected": -0.5309804677963257, "sft_loss": 0.028782209381461143, "step": 4047 }, { "epoch": 5.853940708604483, "grad_norm": 1.797128317747924, "learning_rate": 1.157498849327032e-08, "logits/chosen": -0.7650055289268494, "logits/rejected": -0.6522244811058044, "logps/chosen": -0.019537970423698425, "logps/rejected": -4.800078392028809, "loss": 0.038, "odds_ratio_loss": 0.0024411689955741167, "rewards/accuracies": 1.0, "rewards/chosen": -0.001953796949237585, "rewards/margins": 0.478054016828537, "rewards/rejected": -0.4800078272819519, "sft_loss": 0.019537970423698425, "step": 4048 }, { "epoch": 5.855386840202458, "grad_norm": 1.0121678029353458, "learning_rate": 1.134008057504543e-08, "logits/chosen": -1.054601788520813, "logits/rejected": -0.7696323990821838, "logps/chosen": -0.024660352617502213, "logps/rejected": -4.637537479400635, "loss": 0.0491, "odds_ratio_loss": 0.0005984175368212163, "rewards/accuracies": 1.0, "rewards/chosen": -0.0024660350754857063, "rewards/margins": 0.4612876772880554, "rewards/rejected": -0.46375373005867004, "sft_loss": 0.024660352617502213, "step": 4049 }, { "epoch": 5.856832971800434, "grad_norm": 0.9120344722940604, "learning_rate": 1.110757743474089e-08, "logits/chosen": -0.880794882774353, "logits/rejected": -0.620921790599823, "logps/chosen": -0.027760500088334084, "logps/rejected": -4.301288604736328, "loss": 0.0361, "odds_ratio_loss": 0.04534320533275604, "rewards/accuracies": 0.9375, "rewards/chosen": -0.002776050241664052, "rewards/margins": 0.4273528456687927, "rewards/rejected": -0.43012890219688416, "sft_loss": 0.027760500088334084, "step": 4050 }, { "epoch": 5.858279103398409, "grad_norm": 0.9597825250130472, "learning_rate": 1.0877479212534347e-08, "logits/chosen": -0.7906923294067383, "logits/rejected": -0.7207321524620056, "logps/chosen": -0.026014819741249084, "logps/rejected": -4.051596164703369, "loss": 0.0285, "odds_ratio_loss": 0.045702412724494934, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0026014824397861958, "rewards/margins": 0.4025581479072571, "rewards/rejected": -0.405159592628479, "sft_loss": 0.026014819741249084, "step": 4051 }, { "epoch": 5.859725234996384, "grad_norm": 1.0139612740090866, "learning_rate": 1.0649786047152164e-08, "logits/chosen": -0.8806436657905579, "logits/rejected": -0.8103398084640503, "logps/chosen": -0.009493944235146046, "logps/rejected": -6.130977153778076, "loss": 0.0262, "odds_ratio_loss": 0.00031167615088634193, "rewards/accuracies": 1.0, "rewards/chosen": -0.0009493945399299264, "rewards/margins": 0.6121483445167542, "rewards/rejected": -0.6130977869033813, "sft_loss": 0.009493944235146046, "step": 4052 }, { "epoch": 5.86117136659436, "grad_norm": 1.1549340742035668, "learning_rate": 1.0424498075872534e-08, "logits/chosen": -1.0818321704864502, "logits/rejected": -0.6798063516616821, "logps/chosen": -0.02330070734024048, "logps/rejected": -6.7499237060546875, "loss": 0.0461, "odds_ratio_loss": 0.00016744097229093313, "rewards/accuracies": 1.0, "rewards/chosen": -0.0023300708271563053, "rewards/margins": 0.6726623773574829, "rewards/rejected": -0.6749923825263977, "sft_loss": 0.02330070734024048, "step": 4053 }, { "epoch": 5.862617498192336, "grad_norm": 1.1474299609090526, "learning_rate": 1.020161543452147e-08, "logits/chosen": -0.9909787178039551, "logits/rejected": -0.7948153018951416, "logps/chosen": -0.06000065430998802, "logps/rejected": -5.568455219268799, "loss": 0.0374, "odds_ratio_loss": 0.0031150178983807564, "rewards/accuracies": 1.0, "rewards/chosen": -0.006000065244734287, "rewards/margins": 0.5508455038070679, "rewards/rejected": -0.5568455457687378, "sft_loss": 0.06000065430998802, "step": 4054 }, { "epoch": 5.864063629790311, "grad_norm": 1.2769821049762768, "learning_rate": 9.98113825747593e-09, "logits/chosen": -0.5809721946716309, "logits/rejected": -0.4099165201187134, "logps/chosen": -0.05332209914922714, "logps/rejected": -5.088987827301025, "loss": 0.0353, "odds_ratio_loss": 0.0025865475181490183, "rewards/accuracies": 1.0, "rewards/chosen": -0.005332210101187229, "rewards/margins": 0.503566563129425, "rewards/rejected": -0.5088987946510315, "sft_loss": 0.05332209914922714, "step": 4055 }, { "epoch": 5.865509761388286, "grad_norm": 0.9667710277502991, "learning_rate": 9.763066677662912e-09, "logits/chosen": -0.8778347969055176, "logits/rejected": -0.7671928405761719, "logps/chosen": -0.01711271144449711, "logps/rejected": -4.43693733215332, "loss": 0.0314, "odds_ratio_loss": 0.0010325959883630276, "rewards/accuracies": 1.0, "rewards/chosen": -0.0017112712375819683, "rewards/margins": 0.4419825077056885, "rewards/rejected": -0.44369375705718994, "sft_loss": 0.01711271144449711, "step": 4056 }, { "epoch": 5.866955892986262, "grad_norm": 1.2189116844595043, "learning_rate": 9.547400826557694e-09, "logits/chosen": -0.8398338556289673, "logits/rejected": -0.68487548828125, "logps/chosen": -0.06597983837127686, "logps/rejected": -7.124354839324951, "loss": 0.0583, "odds_ratio_loss": 0.005012186244130135, "rewards/accuracies": 1.0, "rewards/chosen": -0.006597983185201883, "rewards/margins": 0.7058374881744385, "rewards/rejected": -0.7124354839324951, "sft_loss": 0.06597983837127686, "step": 4057 }, { "epoch": 5.868402024584237, "grad_norm": 0.8571045099648362, "learning_rate": 9.334140834186933e-09, "logits/chosen": -0.7491556406021118, "logits/rejected": -0.6324608325958252, "logps/chosen": -0.020480100065469742, "logps/rejected": -6.475594997406006, "loss": 0.0228, "odds_ratio_loss": 0.0007072009611874819, "rewards/accuracies": 1.0, "rewards/chosen": -0.002048010006546974, "rewards/margins": 0.6455115079879761, "rewards/rejected": -0.6475595235824585, "sft_loss": 0.020480100065469742, "step": 4058 }, { "epoch": 5.869848156182212, "grad_norm": 1.1139143128225042, "learning_rate": 9.123286829125554e-09, "logits/chosen": -0.9277915954589844, "logits/rejected": -0.49561363458633423, "logps/chosen": -0.05119159817695618, "logps/rejected": -5.136085510253906, "loss": 0.0391, "odds_ratio_loss": 0.0015518446452915668, "rewards/accuracies": 1.0, "rewards/chosen": -0.005119160283356905, "rewards/margins": 0.5084893703460693, "rewards/rejected": -0.5136085152626038, "sft_loss": 0.05119159817695618, "step": 4059 }, { "epoch": 5.871294287780188, "grad_norm": 0.9491723434647289, "learning_rate": 8.914838938498093e-09, "logits/chosen": -0.7413885593414307, "logits/rejected": -0.6934369802474976, "logps/chosen": -0.01211594045162201, "logps/rejected": -4.734433174133301, "loss": 0.0284, "odds_ratio_loss": 0.0015892100054770708, "rewards/accuracies": 1.0, "rewards/chosen": -0.0012115939753130078, "rewards/margins": 0.4722316861152649, "rewards/rejected": -0.47344329953193665, "sft_loss": 0.01211594045162201, "step": 4060 }, { "epoch": 5.872740419378164, "grad_norm": 1.237417223112888, "learning_rate": 8.708797287978687e-09, "logits/chosen": -0.7332858443260193, "logits/rejected": -0.5800739526748657, "logps/chosen": -0.04828640818595886, "logps/rejected": -3.8269550800323486, "loss": 0.0499, "odds_ratio_loss": 0.0024078148417174816, "rewards/accuracies": 1.0, "rewards/chosen": -0.004828641191124916, "rewards/margins": 0.37786686420440674, "rewards/rejected": -0.3826954960823059, "sft_loss": 0.04828640818595886, "step": 4061 }, { "epoch": 5.874186550976139, "grad_norm": 0.9927827938054246, "learning_rate": 8.505162001790189e-09, "logits/chosen": -0.6603043079376221, "logits/rejected": -0.5567718744277954, "logps/chosen": -0.03948831930756569, "logps/rejected": -6.371557235717773, "loss": 0.0292, "odds_ratio_loss": 0.002562589943408966, "rewards/accuracies": 1.0, "rewards/chosen": -0.003948831930756569, "rewards/margins": 0.6332069039344788, "rewards/rejected": -0.6371557712554932, "sft_loss": 0.03948831930756569, "step": 4062 }, { "epoch": 5.875632682574114, "grad_norm": 1.145853515466299, "learning_rate": 8.303933202705949e-09, "logits/chosen": -0.8784758448600769, "logits/rejected": -0.6068578958511353, "logps/chosen": -0.06510598957538605, "logps/rejected": -4.118241310119629, "loss": 0.0385, "odds_ratio_loss": 0.004363437183201313, "rewards/accuracies": 1.0, "rewards/chosen": -0.0065105995163321495, "rewards/margins": 0.40531349182128906, "rewards/rejected": -0.411824107170105, "sft_loss": 0.06510598957538605, "step": 4063 }, { "epoch": 5.87707881417209, "grad_norm": 1.0442896336680918, "learning_rate": 8.105111012046696e-09, "logits/chosen": -0.8733867406845093, "logits/rejected": -0.4977574944496155, "logps/chosen": -0.039922237396240234, "logps/rejected": -5.064108848571777, "loss": 0.0383, "odds_ratio_loss": 0.001056623412296176, "rewards/accuracies": 1.0, "rewards/chosen": -0.003992223646491766, "rewards/margins": 0.5024186372756958, "rewards/rejected": -0.5064108967781067, "sft_loss": 0.039922237396240234, "step": 4064 }, { "epoch": 5.8785249457700655, "grad_norm": 0.9958511906403898, "learning_rate": 7.908695549683653e-09, "logits/chosen": -0.8687781095504761, "logits/rejected": -0.6260200142860413, "logps/chosen": -0.04101001098752022, "logps/rejected": -7.215920448303223, "loss": 0.0274, "odds_ratio_loss": 0.0009698215289972723, "rewards/accuracies": 1.0, "rewards/chosen": -0.0041010016575455666, "rewards/margins": 0.7174910306930542, "rewards/rejected": -0.7215920686721802, "sft_loss": 0.04101001098752022, "step": 4065 }, { "epoch": 5.87997107736804, "grad_norm": 1.1871995791926684, "learning_rate": 7.714686934035874e-09, "logits/chosen": -0.7915902137756348, "logits/rejected": -0.627166211605072, "logps/chosen": -0.05223681777715683, "logps/rejected": -6.953492164611816, "loss": 0.0521, "odds_ratio_loss": 0.008496535010635853, "rewards/accuracies": 1.0, "rewards/chosen": -0.005223682150244713, "rewards/margins": 0.6901255249977112, "rewards/rejected": -0.6953492164611816, "sft_loss": 0.05223681777715683, "step": 4066 }, { "epoch": 5.881417208966016, "grad_norm": 1.109079649068121, "learning_rate": 7.523085282072461e-09, "logits/chosen": -0.9851240515708923, "logits/rejected": -0.7789430618286133, "logps/chosen": -0.03919222205877304, "logps/rejected": -5.137519836425781, "loss": 0.0302, "odds_ratio_loss": 0.001989867305383086, "rewards/accuracies": 1.0, "rewards/chosen": -0.003919222392141819, "rewards/margins": 0.5098327398300171, "rewards/rejected": -0.5137519836425781, "sft_loss": 0.03919222205877304, "step": 4067 }, { "epoch": 5.882863340563992, "grad_norm": 0.9660218011534741, "learning_rate": 7.333890709310342e-09, "logits/chosen": -0.8588817715644836, "logits/rejected": -0.6250803470611572, "logps/chosen": -0.011442586779594421, "logps/rejected": -7.525029182434082, "loss": 0.0248, "odds_ratio_loss": 0.0010403214255347848, "rewards/accuracies": 1.0, "rewards/chosen": -0.0011442586546763778, "rewards/margins": 0.7513586282730103, "rewards/rejected": -0.7525028586387634, "sft_loss": 0.011442586779594421, "step": 4068 }, { "epoch": 5.884309472161966, "grad_norm": 0.8837517132609889, "learning_rate": 7.147103329816051e-09, "logits/chosen": -0.8711720705032349, "logits/rejected": -0.5237680673599243, "logps/chosen": -0.04525275528430939, "logps/rejected": -6.475998878479004, "loss": 0.0377, "odds_ratio_loss": 0.003399358130991459, "rewards/accuracies": 1.0, "rewards/chosen": -0.004525275435298681, "rewards/margins": 0.643074631690979, "rewards/rejected": -0.6475998759269714, "sft_loss": 0.04525275528430939, "step": 4069 }, { "epoch": 5.885755603759942, "grad_norm": 1.288411564629479, "learning_rate": 6.962723256203951e-09, "logits/chosen": -0.7793477773666382, "logits/rejected": -0.7089977264404297, "logps/chosen": -0.02952570654451847, "logps/rejected": -6.359105110168457, "loss": 0.0389, "odds_ratio_loss": 0.0031170076690614223, "rewards/accuracies": 1.0, "rewards/chosen": -0.002952571026980877, "rewards/margins": 0.6329580545425415, "rewards/rejected": -0.6359105706214905, "sft_loss": 0.02952570654451847, "step": 4070 }, { "epoch": 5.887201735357918, "grad_norm": 1.036569757462945, "learning_rate": 6.780750599637564e-09, "logits/chosen": -0.9587365984916687, "logits/rejected": -0.8759207129478455, "logps/chosen": -0.017817378044128418, "logps/rejected": -5.159303188323975, "loss": 0.0256, "odds_ratio_loss": 0.0029128207825124264, "rewards/accuracies": 1.0, "rewards/chosen": -0.001781738013960421, "rewards/margins": 0.5141485929489136, "rewards/rejected": -0.5159302949905396, "sft_loss": 0.017817378044128418, "step": 4071 }, { "epoch": 5.888647866955893, "grad_norm": 1.0826197981867185, "learning_rate": 6.601185469829129e-09, "logits/chosen": -0.9221165776252747, "logits/rejected": -0.646438479423523, "logps/chosen": -0.013732078485190868, "logps/rejected": -5.320560932159424, "loss": 0.0397, "odds_ratio_loss": 0.0002973644877783954, "rewards/accuracies": 1.0, "rewards/chosen": -0.0013732078950852156, "rewards/margins": 0.5306828618049622, "rewards/rejected": -0.5320560932159424, "sft_loss": 0.013732078485190868, "step": 4072 }, { "epoch": 5.890093998553868, "grad_norm": 1.0761047766337628, "learning_rate": 6.424027975038715e-09, "logits/chosen": -0.9772175550460815, "logits/rejected": -0.7124834656715393, "logps/chosen": -0.06356082856655121, "logps/rejected": -5.914202690124512, "loss": 0.0368, "odds_ratio_loss": 0.0028968786355108023, "rewards/accuracies": 1.0, "rewards/chosen": -0.006356082856655121, "rewards/margins": 0.585064172744751, "rewards/rejected": -0.5914202928543091, "sft_loss": 0.06356082856655121, "step": 4073 }, { "epoch": 5.891540130151844, "grad_norm": 0.981644402128146, "learning_rate": 6.2492782220759935e-09, "logits/chosen": -0.8365199565887451, "logits/rejected": -0.5724873542785645, "logps/chosen": -0.029665423557162285, "logps/rejected": -6.783669471740723, "loss": 0.0422, "odds_ratio_loss": 0.00043808904592879117, "rewards/accuracies": 1.0, "rewards/chosen": -0.0029665424954146147, "rewards/margins": 0.6754004955291748, "rewards/rejected": -0.678367018699646, "sft_loss": 0.029665423557162285, "step": 4074 }, { "epoch": 5.892986261749819, "grad_norm": 1.2626934450956793, "learning_rate": 6.076936316297132e-09, "logits/chosen": -0.7279162406921387, "logits/rejected": -0.46087944507598877, "logps/chosen": -0.05320172384381294, "logps/rejected": -5.229532718658447, "loss": 0.0402, "odds_ratio_loss": 0.00094027747400105, "rewards/accuracies": 1.0, "rewards/chosen": -0.005320172291249037, "rewards/margins": 0.5176330804824829, "rewards/rejected": -0.5229532718658447, "sft_loss": 0.05320172384381294, "step": 4075 }, { "epoch": 5.8944323933477945, "grad_norm": 0.8948413475516522, "learning_rate": 5.907002361608793e-09, "logits/chosen": -0.8324787616729736, "logits/rejected": -0.8268545866012573, "logps/chosen": -0.026477977633476257, "logps/rejected": -4.179195404052734, "loss": 0.028, "odds_ratio_loss": 0.0011238600127398968, "rewards/accuracies": 1.0, "rewards/chosen": -0.002647798042744398, "rewards/margins": 0.4152717590332031, "rewards/rejected": -0.4179195463657379, "sft_loss": 0.026477977633476257, "step": 4076 }, { "epoch": 5.89587852494577, "grad_norm": 1.1699254325701098, "learning_rate": 5.739476460464132e-09, "logits/chosen": -1.1227381229400635, "logits/rejected": -0.7359456419944763, "logps/chosen": -0.016686441376805305, "logps/rejected": -5.694275856018066, "loss": 0.0226, "odds_ratio_loss": 0.0007282921578735113, "rewards/accuracies": 1.0, "rewards/chosen": -0.0016686442540958524, "rewards/margins": 0.5677589774131775, "rewards/rejected": -0.5694276094436646, "sft_loss": 0.016686441376805305, "step": 4077 }, { "epoch": 5.897324656543746, "grad_norm": 1.093944591158634, "learning_rate": 5.574358713865468e-09, "logits/chosen": -0.9437555074691772, "logits/rejected": -0.7433983683586121, "logps/chosen": -0.11696302890777588, "logps/rejected": -4.217023849487305, "loss": 0.0546, "odds_ratio_loss": 0.007626968435943127, "rewards/accuracies": 1.0, "rewards/chosen": -0.011696303263306618, "rewards/margins": 0.4100060760974884, "rewards/rejected": -0.42170241475105286, "sft_loss": 0.11696302890777588, "step": 4078 }, { "epoch": 5.898770788141721, "grad_norm": 0.9801406350864609, "learning_rate": 5.411649221362502e-09, "logits/chosen": -0.710649311542511, "logits/rejected": -0.6215068697929382, "logps/chosen": -0.02466878667473793, "logps/rejected": -4.457521915435791, "loss": 0.0285, "odds_ratio_loss": 0.001121659530326724, "rewards/accuracies": 1.0, "rewards/chosen": -0.0024668786209076643, "rewards/margins": 0.44328534603118896, "rewards/rejected": -0.44575223326683044, "sft_loss": 0.02466878667473793, "step": 4079 }, { "epoch": 5.900216919739696, "grad_norm": 0.8747769046834969, "learning_rate": 5.251348081054097e-09, "logits/chosen": -0.8850862979888916, "logits/rejected": -0.5741367936134338, "logps/chosen": -0.023076584562659264, "logps/rejected": -6.402840614318848, "loss": 0.0327, "odds_ratio_loss": 0.0006071379175409675, "rewards/accuracies": 1.0, "rewards/chosen": -0.002307658549398184, "rewards/margins": 0.6379764080047607, "rewards/rejected": -0.6402841210365295, "sft_loss": 0.023076584562659264, "step": 4080 }, { "epoch": 5.901663051337672, "grad_norm": 0.9270878906304059, "learning_rate": 5.09345538958561e-09, "logits/chosen": -0.8760882019996643, "logits/rejected": -0.5293622016906738, "logps/chosen": -0.03015088476240635, "logps/rejected": -6.523216247558594, "loss": 0.0332, "odds_ratio_loss": 0.0021572900004684925, "rewards/accuracies": 1.0, "rewards/chosen": -0.0030150888487696648, "rewards/margins": 0.6493065357208252, "rewards/rejected": -0.6523215770721436, "sft_loss": 0.03015088476240635, "step": 4081 }, { "epoch": 5.903109182935647, "grad_norm": 0.9858558759772296, "learning_rate": 4.9379712421515615e-09, "logits/chosen": -1.0308140516281128, "logits/rejected": -0.6831567287445068, "logps/chosen": -0.025113025680184364, "logps/rejected": -6.239609718322754, "loss": 0.0274, "odds_ratio_loss": 0.0007894702721387148, "rewards/accuracies": 1.0, "rewards/chosen": -0.002511302474886179, "rewards/margins": 0.6214496493339539, "rewards/rejected": -0.6239609718322754, "sft_loss": 0.025113025680184364, "step": 4082 }, { "epoch": 5.9045553145336225, "grad_norm": 1.236514338792531, "learning_rate": 4.784895732493854e-09, "logits/chosen": -0.9402459859848022, "logits/rejected": -0.7553094029426575, "logps/chosen": -0.01938311569392681, "logps/rejected": -5.070533275604248, "loss": 0.0356, "odds_ratio_loss": 0.0006489180377684534, "rewards/accuracies": 1.0, "rewards/chosen": -0.0019383116159588099, "rewards/margins": 0.5051149725914001, "rewards/rejected": -0.5070533156394958, "sft_loss": 0.01938311569392681, "step": 4083 }, { "epoch": 5.906001446131598, "grad_norm": 0.8111350183711371, "learning_rate": 4.634228952902219e-09, "logits/chosen": -0.8608274459838867, "logits/rejected": -0.7472224235534668, "logps/chosen": -0.023615194484591484, "logps/rejected": -4.508618354797363, "loss": 0.0267, "odds_ratio_loss": 0.003320841584354639, "rewards/accuracies": 1.0, "rewards/chosen": -0.002361519727855921, "rewards/margins": 0.4485003352165222, "rewards/rejected": -0.4508618712425232, "sft_loss": 0.023615194484591484, "step": 4084 }, { "epoch": 5.907447577729574, "grad_norm": 1.0202535691554027, "learning_rate": 4.485970994214661e-09, "logits/chosen": -0.8583634495735168, "logits/rejected": -0.5765115022659302, "logps/chosen": -0.04926097393035889, "logps/rejected": -4.539674758911133, "loss": 0.0491, "odds_ratio_loss": 0.0009557848679833114, "rewards/accuracies": 1.0, "rewards/chosen": -0.004926097579300404, "rewards/margins": 0.4490413963794708, "rewards/rejected": -0.45396748185157776, "sft_loss": 0.04926097393035889, "step": 4085 }, { "epoch": 5.908893709327549, "grad_norm": 1.632546593521506, "learning_rate": 4.340121945815678e-09, "logits/chosen": -0.7539007067680359, "logits/rejected": -0.818702220916748, "logps/chosen": -0.051897093653678894, "logps/rejected": -3.8027875423431396, "loss": 0.0464, "odds_ratio_loss": 0.006976235192269087, "rewards/accuracies": 1.0, "rewards/chosen": -0.005189709831029177, "rewards/margins": 0.37508904933929443, "rewards/rejected": -0.3802787661552429, "sft_loss": 0.051897093653678894, "step": 4086 }, { "epoch": 5.910339840925524, "grad_norm": 0.996259047615235, "learning_rate": 4.196681895638487e-09, "logits/chosen": -1.0506974458694458, "logits/rejected": -0.7812222242355347, "logps/chosen": -0.023906368762254715, "logps/rejected": -6.1207275390625, "loss": 0.0326, "odds_ratio_loss": 0.002137015340849757, "rewards/accuracies": 1.0, "rewards/chosen": -0.0023906370624899864, "rewards/margins": 0.6096821427345276, "rewards/rejected": -0.6120727062225342, "sft_loss": 0.023906368762254715, "step": 4087 }, { "epoch": 5.9117859725235, "grad_norm": 1.012901522073042, "learning_rate": 4.055650930164134e-09, "logits/chosen": -0.6814955472946167, "logits/rejected": -0.5374518036842346, "logps/chosen": -0.019741419702768326, "logps/rejected": -4.390100955963135, "loss": 0.0445, "odds_ratio_loss": 0.0011601306032389402, "rewards/accuracies": 1.0, "rewards/chosen": -0.0019741421565413475, "rewards/margins": 0.4370359778404236, "rewards/rejected": -0.4390101432800293, "sft_loss": 0.019741419702768326, "step": 4088 }, { "epoch": 5.913232104121475, "grad_norm": 1.1705865509991935, "learning_rate": 3.917029134420158e-09, "logits/chosen": -0.8490438461303711, "logits/rejected": -0.6280502080917358, "logps/chosen": -0.014538794755935669, "logps/rejected": -5.418238162994385, "loss": 0.0617, "odds_ratio_loss": 0.0008393567986786366, "rewards/accuracies": 1.0, "rewards/chosen": -0.0014538795221596956, "rewards/margins": 0.5403699278831482, "rewards/rejected": -0.5418238043785095, "sft_loss": 0.014538794755935669, "step": 4089 }, { "epoch": 5.9146782357194505, "grad_norm": 0.8268715863355088, "learning_rate": 3.780816591981928e-09, "logits/chosen": -0.925495982170105, "logits/rejected": -0.7166245579719543, "logps/chosen": -0.02626338042318821, "logps/rejected": -5.465725898742676, "loss": 0.0239, "odds_ratio_loss": 0.000887808040715754, "rewards/accuracies": 1.0, "rewards/chosen": -0.0026263382751494646, "rewards/margins": 0.5439462661743164, "rewards/rejected": -0.5465726852416992, "sft_loss": 0.02626338042318821, "step": 4090 }, { "epoch": 5.916124367317426, "grad_norm": 1.104740571942427, "learning_rate": 3.6470133849735297e-09, "logits/chosen": -0.9207170009613037, "logits/rejected": -0.827392041683197, "logps/chosen": -0.04282090440392494, "logps/rejected": -4.764659881591797, "loss": 0.0384, "odds_ratio_loss": 0.0029545726720243692, "rewards/accuracies": 1.0, "rewards/chosen": -0.004282090347260237, "rewards/margins": 0.47218388319015503, "rewards/rejected": -0.47646600008010864, "sft_loss": 0.04282090440392494, "step": 4091 }, { "epoch": 5.917570498915401, "grad_norm": 0.9063259918507958, "learning_rate": 3.515619594064212e-09, "logits/chosen": -1.0181001424789429, "logits/rejected": -0.8038081526756287, "logps/chosen": -0.03374544531106949, "logps/rejected": -4.169094085693359, "loss": 0.0323, "odds_ratio_loss": 0.0018833805806934834, "rewards/accuracies": 1.0, "rewards/chosen": -0.003374544670805335, "rewards/margins": 0.41353487968444824, "rewards/rejected": -0.41690942645072937, "sft_loss": 0.03374544531106949, "step": 4092 }, { "epoch": 5.919016630513377, "grad_norm": 1.0335977859705563, "learning_rate": 3.3866352984728285e-09, "logits/chosen": -1.0784063339233398, "logits/rejected": -0.7443411350250244, "logps/chosen": -0.03114999271929264, "logps/rejected": -4.165608882904053, "loss": 0.0326, "odds_ratio_loss": 0.0012884940952062607, "rewards/accuracies": 1.0, "rewards/chosen": -0.003114999271929264, "rewards/margins": 0.4134458899497986, "rewards/rejected": -0.4165608882904053, "sft_loss": 0.03114999271929264, "step": 4093 }, { "epoch": 5.920462762111352, "grad_norm": 0.8634076399115647, "learning_rate": 3.260060575963841e-09, "logits/chosen": -0.8990287780761719, "logits/rejected": -0.723576545715332, "logps/chosen": -0.055409032851457596, "logps/rejected": -4.412961959838867, "loss": 0.0307, "odds_ratio_loss": 0.005224099848419428, "rewards/accuracies": 1.0, "rewards/chosen": -0.005540903657674789, "rewards/margins": 0.43575531244277954, "rewards/rejected": -0.44129621982574463, "sft_loss": 0.055409032851457596, "step": 4094 }, { "epoch": 5.921908893709327, "grad_norm": 1.0221510601388344, "learning_rate": 3.1358955028495393e-09, "logits/chosen": -0.9376885890960693, "logits/rejected": -0.6291782855987549, "logps/chosen": -0.016738008707761765, "logps/rejected": -5.539161682128906, "loss": 0.0392, "odds_ratio_loss": 0.0006912790704518557, "rewards/accuracies": 1.0, "rewards/chosen": -0.0016738008707761765, "rewards/margins": 0.5522423982620239, "rewards/rejected": -0.5539162158966064, "sft_loss": 0.016738008707761765, "step": 4095 }, { "epoch": 5.923355025307303, "grad_norm": 1.1064587046513101, "learning_rate": 3.0141401539900415e-09, "logits/chosen": -0.6891365647315979, "logits/rejected": -0.5489487648010254, "logps/chosen": -0.09406330436468124, "logps/rejected": -4.19540548324585, "loss": 0.0651, "odds_ratio_loss": 0.0437358096241951, "rewards/accuracies": 0.9375, "rewards/chosen": -0.00940632913261652, "rewards/margins": 0.4101342260837555, "rewards/rejected": -0.4195405840873718, "sft_loss": 0.09406330436468124, "step": 4096 }, { "epoch": 5.9248011569052785, "grad_norm": 1.3920590957005463, "learning_rate": 2.894794602791517e-09, "logits/chosen": -1.16707181930542, "logits/rejected": -0.9460763931274414, "logps/chosen": -0.023493018001317978, "logps/rejected": -5.3490495681762695, "loss": 0.072, "odds_ratio_loss": 0.001594938919879496, "rewards/accuracies": 1.0, "rewards/chosen": -0.002349301939830184, "rewards/margins": 0.5325556993484497, "rewards/rejected": -0.534904956817627, "sft_loss": 0.023493018001317978, "step": 4097 }, { "epoch": 5.926247288503253, "grad_norm": 1.091799372443769, "learning_rate": 2.777858921208409e-09, "logits/chosen": -0.741195559501648, "logits/rejected": -0.6491609811782837, "logps/chosen": -0.02929551713168621, "logps/rejected": -3.8806638717651367, "loss": 0.0334, "odds_ratio_loss": 0.0010335511760786176, "rewards/accuracies": 1.0, "rewards/chosen": -0.0029295519925653934, "rewards/margins": 0.38513684272766113, "rewards/rejected": -0.3880664110183716, "sft_loss": 0.02929551713168621, "step": 4098 }, { "epoch": 5.927693420101229, "grad_norm": 1.2381780680971113, "learning_rate": 2.663333179741212e-09, "logits/chosen": -0.9526777267456055, "logits/rejected": -0.7601011395454407, "logps/chosen": -0.05737978592514992, "logps/rejected": -4.47212553024292, "loss": 0.0468, "odds_ratio_loss": 0.0037496332079172134, "rewards/accuracies": 1.0, "rewards/chosen": -0.005737978499382734, "rewards/margins": 0.441474586725235, "rewards/rejected": -0.4472126066684723, "sft_loss": 0.05737978592514992, "step": 4099 }, { "epoch": 5.929139551699205, "grad_norm": 1.1290870074892114, "learning_rate": 2.5512174474382475e-09, "logits/chosen": -0.8009154796600342, "logits/rejected": -0.6760156154632568, "logps/chosen": -0.038477495312690735, "logps/rejected": -4.925846099853516, "loss": 0.0438, "odds_ratio_loss": 0.0019320531282573938, "rewards/accuracies": 1.0, "rewards/chosen": -0.003847749438136816, "rewards/margins": 0.4887368679046631, "rewards/rejected": -0.49258458614349365, "sft_loss": 0.038477495312690735, "step": 4100 }, { "epoch": 5.93058568329718, "grad_norm": 1.1307251208355675, "learning_rate": 2.441511791894335e-09, "logits/chosen": -0.9180901050567627, "logits/rejected": -0.581337034702301, "logps/chosen": -0.07019197195768356, "logps/rejected": -6.812994956970215, "loss": 0.0422, "odds_ratio_loss": 0.0022215438075363636, "rewards/accuracies": 1.0, "rewards/chosen": -0.007019197568297386, "rewards/margins": 0.6742802858352661, "rewards/rejected": -0.6812995076179504, "sft_loss": 0.07019197195768356, "step": 4101 }, { "epoch": 5.932031814895155, "grad_norm": 0.9974677158336643, "learning_rate": 2.3342162792516772e-09, "logits/chosen": -0.832883358001709, "logits/rejected": -0.5870314836502075, "logps/chosen": -0.03074686974287033, "logps/rejected": -5.959352493286133, "loss": 0.0416, "odds_ratio_loss": 0.001421600696630776, "rewards/accuracies": 1.0, "rewards/chosen": -0.0030746874399483204, "rewards/margins": 0.5928605794906616, "rewards/rejected": -0.5959352254867554, "sft_loss": 0.03074686974287033, "step": 4102 }, { "epoch": 5.933477946493131, "grad_norm": 1.4469006514086222, "learning_rate": 2.229330974198529e-09, "logits/chosen": -0.8372644186019897, "logits/rejected": -0.6750832200050354, "logps/chosen": -0.04740104451775551, "logps/rejected": -6.3367462158203125, "loss": 0.0425, "odds_ratio_loss": 0.002821737201884389, "rewards/accuracies": 1.0, "rewards/chosen": -0.004740104544907808, "rewards/margins": 0.6289345622062683, "rewards/rejected": -0.6336746215820312, "sft_loss": 0.04740104451775551, "step": 4103 }, { "epoch": 5.934924078091107, "grad_norm": 0.9240492769234988, "learning_rate": 2.126855939971417e-09, "logits/chosen": -0.8022180795669556, "logits/rejected": -0.7096238732337952, "logps/chosen": -0.01701393537223339, "logps/rejected": -3.7467222213745117, "loss": 0.0328, "odds_ratio_loss": 0.0013247316237539053, "rewards/accuracies": 1.0, "rewards/chosen": -0.0017013936303555965, "rewards/margins": 0.372970849275589, "rewards/rejected": -0.3746722340583801, "sft_loss": 0.01701393537223339, "step": 4104 }, { "epoch": 5.936370209689081, "grad_norm": 0.9598971144707034, "learning_rate": 2.0267912383520324e-09, "logits/chosen": -0.9782860279083252, "logits/rejected": -0.621020495891571, "logps/chosen": -0.05000707879662514, "logps/rejected": -5.905955791473389, "loss": 0.0432, "odds_ratio_loss": 0.0016283662989735603, "rewards/accuracies": 1.0, "rewards/chosen": -0.005000708159059286, "rewards/margins": 0.585594892501831, "rewards/rejected": -0.5905956625938416, "sft_loss": 0.05000707879662514, "step": 4105 }, { "epoch": 5.937816341287057, "grad_norm": 2.7765226016012057, "learning_rate": 1.9291369296707825e-09, "logits/chosen": -0.8960604071617126, "logits/rejected": -0.817623496055603, "logps/chosen": -0.014780182391405106, "logps/rejected": -6.727514266967773, "loss": 0.054, "odds_ratio_loss": 0.0007389021338894963, "rewards/accuracies": 1.0, "rewards/chosen": -0.0014780182391405106, "rewards/margins": 0.671273410320282, "rewards/rejected": -0.6727514266967773, "sft_loss": 0.014780182391405106, "step": 4106 }, { "epoch": 5.939262472885033, "grad_norm": 1.1274069239257218, "learning_rate": 1.8338930728027946e-09, "logits/chosen": -0.964221715927124, "logits/rejected": -0.6402559876441956, "logps/chosen": -0.05291515588760376, "logps/rejected": -7.261722087860107, "loss": 0.0399, "odds_ratio_loss": 0.0011750500416383147, "rewards/accuracies": 1.0, "rewards/chosen": -0.005291515961289406, "rewards/margins": 0.7208806276321411, "rewards/rejected": -0.7261722087860107, "sft_loss": 0.05291515588760376, "step": 4107 }, { "epoch": 5.940708604483008, "grad_norm": 0.924331347038122, "learning_rate": 1.7410597251719116e-09, "logits/chosen": -0.9784330725669861, "logits/rejected": -0.5999252796173096, "logps/chosen": -0.03772180899977684, "logps/rejected": -6.771916389465332, "loss": 0.0248, "odds_ratio_loss": 0.0036124063190072775, "rewards/accuracies": 1.0, "rewards/chosen": -0.0037721809931099415, "rewards/margins": 0.6734194755554199, "rewards/rejected": -0.6771916151046753, "sft_loss": 0.03772180899977684, "step": 4108 }, { "epoch": 5.942154736080983, "grad_norm": 1.3007265136932227, "learning_rate": 1.650636942746697e-09, "logits/chosen": -0.9911519289016724, "logits/rejected": -0.7490329742431641, "logps/chosen": -0.04981255158782005, "logps/rejected": -5.104490280151367, "loss": 0.0629, "odds_ratio_loss": 0.0008861341630108654, "rewards/accuracies": 1.0, "rewards/chosen": -0.004981255158782005, "rewards/margins": 0.5054677724838257, "rewards/rejected": -0.5104490518569946, "sft_loss": 0.04981255158782005, "step": 4109 }, { "epoch": 5.943600867678959, "grad_norm": 0.9034752594114349, "learning_rate": 1.5626247800444303e-09, "logits/chosen": -0.9556131362915039, "logits/rejected": -0.6480990648269653, "logps/chosen": -0.02443695440888405, "logps/rejected": -5.567079544067383, "loss": 0.0258, "odds_ratio_loss": 0.0025377324782311916, "rewards/accuracies": 1.0, "rewards/chosen": -0.002443695208057761, "rewards/margins": 0.5542643070220947, "rewards/rejected": -0.5567079782485962, "sft_loss": 0.02443695440888405, "step": 4110 }, { "epoch": 5.945046999276935, "grad_norm": 1.27019890219729, "learning_rate": 1.4770232901271107e-09, "logits/chosen": -0.7591812610626221, "logits/rejected": -0.641418993473053, "logps/chosen": -0.024198981001973152, "logps/rejected": -3.361138343811035, "loss": 0.0345, "odds_ratio_loss": 0.002360550919547677, "rewards/accuracies": 1.0, "rewards/chosen": -0.002419898519292474, "rewards/margins": 0.3336939215660095, "rewards/rejected": -0.336113840341568, "sft_loss": 0.024198981001973152, "step": 4111 }, { "epoch": 5.946493130874909, "grad_norm": 1.4078149386948933, "learning_rate": 1.3938325246045656e-09, "logits/chosen": -1.0027142763137817, "logits/rejected": -0.9888021945953369, "logps/chosen": -0.02432049624621868, "logps/rejected": -3.972032070159912, "loss": 0.0295, "odds_ratio_loss": 0.003202601568773389, "rewards/accuracies": 1.0, "rewards/chosen": -0.0024320497177541256, "rewards/margins": 0.39477115869522095, "rewards/rejected": -0.3972032070159912, "sft_loss": 0.02432049624621868, "step": 4112 }, { "epoch": 5.947939262472885, "grad_norm": 0.955294787070582, "learning_rate": 1.313052533633119e-09, "logits/chosen": -0.8551870584487915, "logits/rejected": -0.6820370554924011, "logps/chosen": -0.010347655974328518, "logps/rejected": -4.61668586730957, "loss": 0.0274, "odds_ratio_loss": 0.00016555214824620634, "rewards/accuracies": 1.0, "rewards/chosen": -0.0010347655043005943, "rewards/margins": 0.460633784532547, "rewards/rejected": -0.46166858077049255, "sft_loss": 0.010347655974328518, "step": 4113 }, { "epoch": 5.949385394070861, "grad_norm": 1.2465098226086702, "learning_rate": 1.2346833659147016e-09, "logits/chosen": -0.8677682876586914, "logits/rejected": -0.7311995029449463, "logps/chosen": -0.03744862973690033, "logps/rejected": -4.943976402282715, "loss": 0.0577, "odds_ratio_loss": 0.0047299060970544815, "rewards/accuracies": 1.0, "rewards/chosen": -0.0037448632065206766, "rewards/margins": 0.49065274000167847, "rewards/rejected": -0.4943976402282715, "sft_loss": 0.03744862973690033, "step": 4114 }, { "epoch": 5.9508315256688356, "grad_norm": 1.020861812684583, "learning_rate": 1.1587250686986294e-09, "logits/chosen": -0.9044225811958313, "logits/rejected": -0.594450056552887, "logps/chosen": -0.03550455719232559, "logps/rejected": -6.127573013305664, "loss": 0.0468, "odds_ratio_loss": 0.0004433983704075217, "rewards/accuracies": 1.0, "rewards/chosen": -0.003550455439835787, "rewards/margins": 0.6092069149017334, "rewards/rejected": -0.6127573847770691, "sft_loss": 0.03550455719232559, "step": 4115 }, { "epoch": 5.952277657266811, "grad_norm": 1.0219147309467258, "learning_rate": 1.085177687780714e-09, "logits/chosen": -0.8875457644462585, "logits/rejected": -0.6773673295974731, "logps/chosen": -0.03830548748373985, "logps/rejected": -5.578269004821777, "loss": 0.0257, "odds_ratio_loss": 0.0014839848736301064, "rewards/accuracies": 1.0, "rewards/chosen": -0.0038305488415062428, "rewards/margins": 0.5539963841438293, "rewards/rejected": -0.5578269362449646, "sft_loss": 0.03830548748373985, "step": 4116 }, { "epoch": 5.953723788864787, "grad_norm": 1.0919805731483818, "learning_rate": 1.0140412675023747e-09, "logits/chosen": -1.0210849046707153, "logits/rejected": -0.7989510297775269, "logps/chosen": -0.011519749648869038, "logps/rejected": -4.2892608642578125, "loss": 0.051, "odds_ratio_loss": 0.0009958500741049647, "rewards/accuracies": 1.0, "rewards/chosen": -0.0011519748950377107, "rewards/margins": 0.4277741014957428, "rewards/rejected": -0.42892611026763916, "sft_loss": 0.011519749648869038, "step": 4117 }, { "epoch": 5.955169920462762, "grad_norm": 1.038445647093431, "learning_rate": 9.453158507528592e-10, "logits/chosen": -0.9300838112831116, "logits/rejected": -0.8889990448951721, "logps/chosen": -0.08415620028972626, "logps/rejected": -4.28843879699707, "loss": 0.0552, "odds_ratio_loss": 0.0070391446352005005, "rewards/accuracies": 1.0, "rewards/chosen": -0.008415620774030685, "rewards/margins": 0.4204282760620117, "rewards/rejected": -0.4288438856601715, "sft_loss": 0.08415620028972626, "step": 4118 }, { "epoch": 5.956616052060737, "grad_norm": 1.8490327493708787, "learning_rate": 8.790014789661348e-10, "logits/chosen": -0.842028796672821, "logits/rejected": -0.7200061678886414, "logps/chosen": -0.04375598579645157, "logps/rejected": -4.8058576583862305, "loss": 0.0314, "odds_ratio_loss": 0.0012701171217486262, "rewards/accuracies": 1.0, "rewards/chosen": -0.004375598393380642, "rewards/margins": 0.47621017694473267, "rewards/rejected": -0.4805857241153717, "sft_loss": 0.04375598579645157, "step": 4119 }, { "epoch": 5.958062183658713, "grad_norm": 1.5751740362087219, "learning_rate": 8.150981921239975e-10, "logits/chosen": -1.0434660911560059, "logits/rejected": -0.7287979125976562, "logps/chosen": -0.026583680883049965, "logps/rejected": -6.822535991668701, "loss": 0.0562, "odds_ratio_loss": 0.0017852864693850279, "rewards/accuracies": 1.0, "rewards/chosen": -0.0026583680883049965, "rewards/margins": 0.6795952320098877, "rewards/rejected": -0.6822535991668701, "sft_loss": 0.026583680883049965, "step": 4120 }, { "epoch": 5.959508315256688, "grad_norm": 1.1963090985219154, "learning_rate": 7.536060287534063e-10, "logits/chosen": -0.8197766542434692, "logits/rejected": -0.6914808750152588, "logps/chosen": -0.05560974031686783, "logps/rejected": -7.24658727645874, "loss": 0.0612, "odds_ratio_loss": 0.010990302078425884, "rewards/accuracies": 1.0, "rewards/chosen": -0.00556097412481904, "rewards/margins": 0.7190977931022644, "rewards/rejected": -0.724658727645874, "sft_loss": 0.05560974031686783, "step": 4121 }, { "epoch": 5.960954446854664, "grad_norm": 1.8334323902812746, "learning_rate": 6.94525025928705e-10, "logits/chosen": -0.8709729313850403, "logits/rejected": -0.594711184501648, "logps/chosen": -0.03179216384887695, "logps/rejected": -10.035598754882812, "loss": 0.035, "odds_ratio_loss": 0.0020171799696981907, "rewards/accuracies": 1.0, "rewards/chosen": -0.003179216757416725, "rewards/margins": 1.0003807544708252, "rewards/rejected": -1.0035598278045654, "sft_loss": 0.03179216384887695, "step": 4122 }, { "epoch": 5.962400578452639, "grad_norm": 0.8575311426686644, "learning_rate": 6.37855219269845e-10, "logits/chosen": -0.7879198789596558, "logits/rejected": -0.7165445685386658, "logps/chosen": -0.015290379524230957, "logps/rejected": -4.180186748504639, "loss": 0.0287, "odds_ratio_loss": 0.0012166141532361507, "rewards/accuracies": 1.0, "rewards/chosen": -0.0015290379524230957, "rewards/margins": 0.4164896011352539, "rewards/rejected": -0.418018639087677, "sft_loss": 0.015290379524230957, "step": 4123 }, { "epoch": 5.963846710050615, "grad_norm": 1.037426781554966, "learning_rate": 5.835966429432737e-10, "logits/chosen": -0.7037444710731506, "logits/rejected": -0.450633704662323, "logps/chosen": -0.05810614302754402, "logps/rejected": -5.363962650299072, "loss": 0.0327, "odds_ratio_loss": 0.0023976389784365892, "rewards/accuracies": 1.0, "rewards/chosen": -0.005810614209622145, "rewards/margins": 0.5305856466293335, "rewards/rejected": -0.5363962650299072, "sft_loss": 0.05810614302754402, "step": 4124 }, { "epoch": 5.96529284164859, "grad_norm": 1.2952351363680497, "learning_rate": 5.317493296614906e-10, "logits/chosen": -0.9828850030899048, "logits/rejected": -0.8129862546920776, "logps/chosen": -0.027900131419301033, "logps/rejected": -4.454830169677734, "loss": 0.0626, "odds_ratio_loss": 0.0007958858041092753, "rewards/accuracies": 1.0, "rewards/chosen": -0.0027900133281946182, "rewards/margins": 0.44269299507141113, "rewards/rejected": -0.4454830288887024, "sft_loss": 0.027900131419301033, "step": 4125 }, { "epoch": 5.966738973246565, "grad_norm": 0.9134255978257638, "learning_rate": 4.82313310683935e-10, "logits/chosen": -0.9281235933303833, "logits/rejected": -0.725083589553833, "logps/chosen": -0.01804085075855255, "logps/rejected": -5.371147155761719, "loss": 0.0258, "odds_ratio_loss": 0.0014887560391798615, "rewards/accuracies": 1.0, "rewards/chosen": -0.0018040850991383195, "rewards/margins": 0.535310685634613, "rewards/rejected": -0.5371147394180298, "sft_loss": 0.01804085075855255, "step": 4126 }, { "epoch": 5.968185104844541, "grad_norm": 0.9621230174545535, "learning_rate": 4.3528861581521026e-10, "logits/chosen": -0.9386307001113892, "logits/rejected": -0.6423209309577942, "logps/chosen": -0.01551287341862917, "logps/rejected": -5.554164886474609, "loss": 0.0274, "odds_ratio_loss": 0.0015682197408750653, "rewards/accuracies": 1.0, "rewards/chosen": -0.0015512873651459813, "rewards/margins": 0.5538651943206787, "rewards/rejected": -0.555416464805603, "sft_loss": 0.01551287341862917, "step": 4127 }, { "epoch": 5.969631236442516, "grad_norm": 1.4551812506826591, "learning_rate": 3.906752734073038e-10, "logits/chosen": -0.9425130486488342, "logits/rejected": -0.6164754629135132, "logps/chosen": -0.03378310054540634, "logps/rejected": -5.581912994384766, "loss": 0.0562, "odds_ratio_loss": 0.0012506656348705292, "rewards/accuracies": 1.0, "rewards/chosen": -0.003378310240805149, "rewards/margins": 0.5548129677772522, "rewards/rejected": -0.5581912994384766, "sft_loss": 0.03378310054540634, "step": 4128 }, { "epoch": 5.971077368040492, "grad_norm": 1.430946768404967, "learning_rate": 3.4847331035736673e-10, "logits/chosen": -0.7740320563316345, "logits/rejected": -0.625731348991394, "logps/chosen": -0.11199574172496796, "logps/rejected": -3.3949060440063477, "loss": 0.0756, "odds_ratio_loss": 0.010786962695419788, "rewards/accuracies": 1.0, "rewards/chosen": -0.011199574917554855, "rewards/margins": 0.32829102873802185, "rewards/rejected": -0.3394905924797058, "sft_loss": 0.11199574172496796, "step": 4129 }, { "epoch": 5.972523499638467, "grad_norm": 1.0795984072522444, "learning_rate": 3.0868275210904624e-10, "logits/chosen": -0.8547477722167969, "logits/rejected": -0.7329878807067871, "logps/chosen": -0.03964013606309891, "logps/rejected": -5.210457801818848, "loss": 0.0474, "odds_ratio_loss": 0.003418078413233161, "rewards/accuracies": 1.0, "rewards/chosen": -0.003964013420045376, "rewards/margins": 0.5170817971229553, "rewards/rejected": -0.5210458040237427, "sft_loss": 0.03964013606309891, "step": 4130 }, { "epoch": 5.973969631236443, "grad_norm": 0.9997914713259782, "learning_rate": 2.713036226520415e-10, "logits/chosen": -0.8122600317001343, "logits/rejected": -0.599828839302063, "logps/chosen": -0.014296657405793667, "logps/rejected": -6.734135627746582, "loss": 0.0385, "odds_ratio_loss": 0.000533695740159601, "rewards/accuracies": 1.0, "rewards/chosen": -0.0014296658337116241, "rewards/margins": 0.6719839572906494, "rewards/rejected": -0.6734135746955872, "sft_loss": 0.014296657405793667, "step": 4131 }, { "epoch": 5.975415762834418, "grad_norm": 0.955240881670924, "learning_rate": 2.363359445229918e-10, "logits/chosen": -0.9173808693885803, "logits/rejected": -0.8124196529388428, "logps/chosen": -0.046773605048656464, "logps/rejected": -4.923510551452637, "loss": 0.0306, "odds_ratio_loss": 0.0019419525051489472, "rewards/accuracies": 1.0, "rewards/chosen": -0.004677360877394676, "rewards/margins": 0.48767372965812683, "rewards/rejected": -0.49235108494758606, "sft_loss": 0.046773605048656464, "step": 4132 }, { "epoch": 5.9768618944323935, "grad_norm": 1.0552507197470187, "learning_rate": 2.037797388036999e-10, "logits/chosen": -0.6335623860359192, "logits/rejected": -0.5481550693511963, "logps/chosen": -0.014158733189105988, "logps/rejected": -5.313653469085693, "loss": 0.028, "odds_ratio_loss": 0.0016354866093024611, "rewards/accuracies": 1.0, "rewards/chosen": -0.0014158734120428562, "rewards/margins": 0.5299494862556458, "rewards/rejected": -0.5313653349876404, "sft_loss": 0.014158733189105988, "step": 4133 }, { "epoch": 5.978308026030369, "grad_norm": 1.3318819399781283, "learning_rate": 1.7363502512246497e-10, "logits/chosen": -0.861269474029541, "logits/rejected": -0.8041742444038391, "logps/chosen": -0.06389249116182327, "logps/rejected": -4.00693416595459, "loss": 0.061, "odds_ratio_loss": 0.003401533467695117, "rewards/accuracies": 1.0, "rewards/chosen": -0.006389249116182327, "rewards/margins": 0.39430415630340576, "rewards/rejected": -0.400693416595459, "sft_loss": 0.06389249116182327, "step": 4134 }, { "epoch": 5.979754157628344, "grad_norm": 1.1397991365853217, "learning_rate": 1.4590182165363785e-10, "logits/chosen": -0.9926788210868835, "logits/rejected": -0.7052085399627686, "logps/chosen": -0.01665268838405609, "logps/rejected": -6.901638984680176, "loss": 0.0334, "odds_ratio_loss": 0.0005437415675260127, "rewards/accuracies": 1.0, "rewards/chosen": -0.001665269024670124, "rewards/margins": 0.6884986758232117, "rewards/rejected": -0.6901639103889465, "sft_loss": 0.01665268838405609, "step": 4135 }, { "epoch": 5.98120028922632, "grad_norm": 1.0293090818178157, "learning_rate": 1.2058014511717728e-10, "logits/chosen": -0.7687342762947083, "logits/rejected": -0.5340158343315125, "logps/chosen": -0.012591686099767685, "logps/rejected": -6.006010055541992, "loss": 0.032, "odds_ratio_loss": 0.0005818761419504881, "rewards/accuracies": 1.0, "rewards/chosen": -0.0012591686099767685, "rewards/margins": 0.5993418097496033, "rewards/rejected": -0.600601077079773, "sft_loss": 0.012591686099767685, "step": 4136 }, { "epoch": 5.982646420824295, "grad_norm": 1.0500799088028352, "learning_rate": 9.767001078087034e-11, "logits/chosen": -0.8796525001525879, "logits/rejected": -0.6548293232917786, "logps/chosen": -0.05132802948355675, "logps/rejected": -5.8061323165893555, "loss": 0.0436, "odds_ratio_loss": 0.002121829893440008, "rewards/accuracies": 1.0, "rewards/chosen": -0.005132803227752447, "rewards/margins": 0.5754804015159607, "rewards/rejected": -0.5806131958961487, "sft_loss": 0.05132802948355675, "step": 4137 }, { "epoch": 5.98409255242227, "grad_norm": 1.3440124989758566, "learning_rate": 7.717143245589142e-11, "logits/chosen": -1.0992934703826904, "logits/rejected": -0.7260419130325317, "logps/chosen": -0.04053273797035217, "logps/rejected": -5.290022373199463, "loss": 0.0648, "odds_ratio_loss": 0.0017471958417445421, "rewards/accuracies": 1.0, "rewards/chosen": -0.00405327370390296, "rewards/margins": 0.5249489545822144, "rewards/rejected": -0.5290022492408752, "sft_loss": 0.04053273797035217, "step": 4138 }, { "epoch": 5.985538684020246, "grad_norm": 1.1554344347871206, "learning_rate": 5.908442250168733e-11, "logits/chosen": -0.9692047834396362, "logits/rejected": -0.6804442405700684, "logps/chosen": -0.048383478075265884, "logps/rejected": -6.995375633239746, "loss": 0.0316, "odds_ratio_loss": 0.0027454195078462362, "rewards/accuracies": 1.0, "rewards/chosen": -0.004838347434997559, "rewards/margins": 0.6946991682052612, "rewards/rejected": -0.6995375156402588, "sft_loss": 0.048383478075265884, "step": 4139 }, { "epoch": 5.9869848156182215, "grad_norm": 0.8871899630146395, "learning_rate": 4.3408991823312704e-11, "logits/chosen": -0.9535342454910278, "logits/rejected": -0.8384948968887329, "logps/chosen": -0.029851065948605537, "logps/rejected": -3.435246467590332, "loss": 0.0232, "odds_ratio_loss": 0.0023980343248695135, "rewards/accuracies": 1.0, "rewards/chosen": -0.002985106548294425, "rewards/margins": 0.3405395746231079, "rewards/rejected": -0.34352466464042664, "sft_loss": 0.029851065948605537, "step": 4140 }, { "epoch": 5.988430947216196, "grad_norm": 0.9226185349789879, "learning_rate": 3.014514987054184e-11, "logits/chosen": -0.9060051441192627, "logits/rejected": -0.8312381505966187, "logps/chosen": -0.0319242998957634, "logps/rejected": -4.604552745819092, "loss": 0.0285, "odds_ratio_loss": 0.0014555552043020725, "rewards/accuracies": 1.0, "rewards/chosen": -0.003192430129274726, "rewards/margins": 0.4572628140449524, "rewards/rejected": -0.4604552388191223, "sft_loss": 0.0319242998957634, "step": 4141 }, { "epoch": 5.989877078814172, "grad_norm": 0.9592362753561068, "learning_rate": 1.9292904640977324e-11, "logits/chosen": -1.004399061203003, "logits/rejected": -0.97425377368927, "logps/chosen": -0.029115300625562668, "logps/rejected": -4.420380115509033, "loss": 0.0256, "odds_ratio_loss": 0.002002717461436987, "rewards/accuracies": 1.0, "rewards/chosen": -0.0029115299694240093, "rewards/margins": 0.4391264319419861, "rewards/rejected": -0.4420379400253296, "sft_loss": 0.029115300625562668, "step": 4142 }, { "epoch": 5.991323210412148, "grad_norm": 1.1190789757311155, "learning_rate": 1.0852262677385482e-11, "logits/chosen": -0.7953178286552429, "logits/rejected": -0.6801662445068359, "logps/chosen": -0.028218841180205345, "logps/rejected": -4.215031623840332, "loss": 0.0411, "odds_ratio_loss": 0.0011911361943930387, "rewards/accuracies": 1.0, "rewards/chosen": -0.0028218841180205345, "rewards/margins": 0.4186813235282898, "rewards/rejected": -0.4215031862258911, "sft_loss": 0.028218841180205345, "step": 4143 }, { "epoch": 5.9927693420101225, "grad_norm": 0.9158155976864413, "learning_rate": 4.823229068140478e-12, "logits/chosen": -0.8988432884216309, "logits/rejected": -0.7506909966468811, "logps/chosen": -0.029472343623638153, "logps/rejected": -3.428910970687866, "loss": 0.0258, "odds_ratio_loss": 0.0006631941068917513, "rewards/accuracies": 1.0, "rewards/chosen": -0.002947234082967043, "rewards/margins": 0.33994388580322266, "rewards/rejected": -0.3428910970687866, "sft_loss": 0.029472343623638153, "step": 4144 }, { "epoch": 5.994215473608098, "grad_norm": 1.056239819329163, "learning_rate": 1.2058074490006732e-12, "logits/chosen": -0.9353763461112976, "logits/rejected": -0.7485519647598267, "logps/chosen": -0.03557208925485611, "logps/rejected": -5.1342082023620605, "loss": 0.0394, "odds_ratio_loss": 0.0014125681482255459, "rewards/accuracies": 1.0, "rewards/chosen": -0.0035572086926549673, "rewards/margins": 0.5098636150360107, "rewards/rejected": -0.513420820236206, "sft_loss": 0.03557208925485611, "step": 4145 }, { "epoch": 5.995661605206074, "grad_norm": 1.0557108508160415, "learning_rate": 0.0, "logits/chosen": -0.6718660593032837, "logits/rejected": -0.5680021643638611, "logps/chosen": -0.026484325528144836, "logps/rejected": -5.1328911781311035, "loss": 0.0381, "odds_ratio_loss": 0.004209108650684357, "rewards/accuracies": 1.0, "rewards/chosen": -0.0026484327390789986, "rewards/margins": 0.5106407403945923, "rewards/rejected": -0.5132891535758972, "sft_loss": 0.026484325528144836, "step": 4146 } ], "logging_steps": 1.0, "max_steps": 4146, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 500, "total_flos": 1411327439929344.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }