diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7158 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.988679245283019, + "eval_steps": 500, + "global_step": 396, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "grad_norm": 6.326096763058934, + "learning_rate": 1.25e-08, + "logps/chosen": -36.02279281616211, + "logps/rejected": -41.85474395751953, + "loss": 0.6931, + "losses/dpo": 0.6931471824645996, + "losses/sft": 1.3949329853057861, + "losses/total": 0.6931471824645996, + "ref_logps/chosen": -36.02279281616211, + "ref_logps/rejected": -41.85474395751953, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.02, + "grad_norm": 5.576784855809719, + "learning_rate": 2.5e-08, + "logps/chosen": -33.77919387817383, + "logps/rejected": -41.04405975341797, + "loss": 0.6931, + "losses/dpo": 0.6931471824645996, + "losses/sft": 1.3951497077941895, + "losses/total": 0.6931471824645996, + "ref_logps/chosen": -33.77919387817383, + "ref_logps/rejected": -41.04405975341797, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.02, + "grad_norm": 6.263050301271669, + "learning_rate": 3.75e-08, + "logps/chosen": -38.8697509765625, + "logps/rejected": -48.85557556152344, + "loss": 0.6931, + "losses/dpo": 0.6860073804855347, + "losses/sft": 1.6376307010650635, + "losses/total": 0.6860073804855347, + "ref_logps/chosen": -38.87074279785156, + "ref_logps/rejected": -48.853511810302734, + "rewards/accuracies": 0.515625, + "rewards/chosen": 9.899254655465484e-05, + "rewards/margins": 0.0003055855631828308, + "rewards/rejected": -0.0002065933949779719, + "step": 3 + }, + { + "epoch": 0.03, + "grad_norm": 5.738951829533344, + "learning_rate": 5e-08, + "logps/chosen": -36.64889144897461, + "logps/rejected": -42.698097229003906, + "loss": 0.6924, + "losses/dpo": 0.6935421228408813, + "losses/sft": 1.4897900819778442, + "losses/total": 0.6935421228408813, + "ref_logps/chosen": -36.668033599853516, + "ref_logps/rejected": -42.70002746582031, + "rewards/accuracies": 0.484375, + "rewards/chosen": 0.0019142806995660067, + "rewards/margins": 0.001721527660265565, + "rewards/rejected": 0.00019275324302725494, + "step": 4 + }, + { + "epoch": 0.04, + "grad_norm": 6.637504299884417, + "learning_rate": 6.25e-08, + "logps/chosen": -41.41233825683594, + "logps/rejected": -47.04777145385742, + "loss": 0.6939, + "losses/dpo": 0.6956198215484619, + "losses/sft": 1.1974728107452393, + "losses/total": 0.6956198215484619, + "ref_logps/chosen": -41.40231704711914, + "ref_logps/rejected": -47.051856994628906, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.0010022701462730765, + "rewards/margins": -0.0014111174969002604, + "rewards/rejected": 0.00040884732152335346, + "step": 5 + }, + { + "epoch": 0.05, + "grad_norm": 5.760428498194468, + "learning_rate": 7.5e-08, + "logps/chosen": -34.51856994628906, + "logps/rejected": -41.675804138183594, + "loss": 0.6946, + "losses/dpo": 0.6942625641822815, + "losses/sft": 1.3869932889938354, + "losses/total": 0.6942625641822815, + "ref_logps/chosen": -34.49778747558594, + "ref_logps/rejected": -41.68275451660156, + "rewards/accuracies": 0.421875, + "rewards/chosen": -0.0020779455080628395, + "rewards/margins": -0.002773313783109188, + "rewards/rejected": 0.0006953685078769922, + "step": 6 + }, + { + "epoch": 0.05, + "grad_norm": 5.773272972704967, + "learning_rate": 8.75e-08, + "logps/chosen": -36.6628303527832, + "logps/rejected": -42.856834411621094, + "loss": 0.6927, + "losses/dpo": 0.6944370269775391, + "losses/sft": 1.2695866823196411, + "losses/total": 0.6944370269775391, + "ref_logps/chosen": -36.669891357421875, + "ref_logps/rejected": -42.8528938293457, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0007062811637297273, + "rewards/margins": 0.0011000875383615494, + "rewards/rejected": -0.00039380654925480485, + "step": 7 + }, + { + "epoch": 0.06, + "grad_norm": 6.837371123256996, + "learning_rate": 1e-07, + "logps/chosen": -41.66258239746094, + "logps/rejected": -43.34931182861328, + "loss": 0.6943, + "losses/dpo": 0.6919102668762207, + "losses/sft": 1.317617654800415, + "losses/total": 0.6919102668762207, + "ref_logps/chosen": -41.65662384033203, + "ref_logps/rejected": -43.36621856689453, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0005958047113381326, + "rewards/margins": -0.0022863391786813736, + "rewards/rejected": 0.0016905345255509019, + "step": 8 + }, + { + "epoch": 0.07, + "grad_norm": 6.116952924505858, + "learning_rate": 1.125e-07, + "logps/chosen": -37.05712890625, + "logps/rejected": -46.517696380615234, + "loss": 0.6917, + "losses/dpo": 0.692311704158783, + "losses/sft": 1.112796664237976, + "losses/total": 0.692311704158783, + "ref_logps/chosen": -37.07765197753906, + "ref_logps/rejected": -46.507904052734375, + "rewards/accuracies": 0.6015625, + "rewards/chosen": 0.0020526223815977573, + "rewards/margins": 0.003031900618225336, + "rewards/rejected": -0.0009792782366275787, + "step": 9 + }, + { + "epoch": 0.08, + "grad_norm": 5.761856386512759, + "learning_rate": 1.25e-07, + "logps/chosen": -33.799774169921875, + "logps/rejected": -41.23558044433594, + "loss": 0.6924, + "losses/dpo": 0.6941465139389038, + "losses/sft": 1.1185486316680908, + "losses/total": 0.6941465139389038, + "ref_logps/chosen": -33.81248474121094, + "ref_logps/rejected": -41.23206329345703, + "rewards/accuracies": 0.5234375, + "rewards/chosen": 0.0012711097951978445, + "rewards/margins": 0.0016234376234933734, + "rewards/rejected": -0.0003523279447108507, + "step": 10 + }, + { + "epoch": 0.08, + "grad_norm": 5.743856364003174, + "learning_rate": 1.375e-07, + "logps/chosen": -36.227317810058594, + "logps/rejected": -40.51737976074219, + "loss": 0.6927, + "losses/dpo": 0.6916883587837219, + "losses/sft": 1.4357692003250122, + "losses/total": 0.6916883587837219, + "ref_logps/chosen": -36.23785400390625, + "ref_logps/rejected": -40.51884078979492, + "rewards/accuracies": 0.53125, + "rewards/chosen": 0.0010535644832998514, + "rewards/margins": 0.0009071138338185847, + "rewards/rejected": 0.0001464505330659449, + "step": 11 + }, + { + "epoch": 0.09, + "grad_norm": 5.9204140360711355, + "learning_rate": 1.5e-07, + "logps/chosen": -38.817134857177734, + "logps/rejected": -42.217681884765625, + "loss": 0.6925, + "losses/dpo": 0.6985405683517456, + "losses/sft": 1.4544084072113037, + "losses/total": 0.6985405683517456, + "ref_logps/chosen": -38.83327102661133, + "ref_logps/rejected": -42.220176696777344, + "rewards/accuracies": 0.4921875, + "rewards/chosen": 0.0016135365003719926, + "rewards/margins": 0.0013644276186823845, + "rewards/rejected": 0.0002491088816896081, + "step": 12 + }, + { + "epoch": 0.1, + "grad_norm": 6.190779724671626, + "learning_rate": 1.625e-07, + "logps/chosen": -37.33137512207031, + "logps/rejected": -46.71794128417969, + "loss": 0.6901, + "losses/dpo": 0.6943342685699463, + "losses/sft": 1.3390721082687378, + "losses/total": 0.6943342685699463, + "ref_logps/chosen": -37.34603500366211, + "ref_logps/rejected": -46.670997619628906, + "rewards/accuracies": 0.578125, + "rewards/chosen": 0.0014661503955721855, + "rewards/margins": 0.006160825490951538, + "rewards/rejected": -0.004694675095379353, + "step": 13 + }, + { + "epoch": 0.11, + "grad_norm": 5.535961166630566, + "learning_rate": 1.75e-07, + "logps/chosen": -34.35616683959961, + "logps/rejected": -40.568878173828125, + "loss": 0.6923, + "losses/dpo": 0.6914072036743164, + "losses/sft": 1.0790843963623047, + "losses/total": 0.6914072036743164, + "ref_logps/chosen": -34.35405731201172, + "ref_logps/rejected": -40.548362731933594, + "rewards/accuracies": 0.5390625, + "rewards/chosen": -0.00021077337441965938, + "rewards/margins": 0.001840681186877191, + "rewards/rejected": -0.002051454270258546, + "step": 14 + }, + { + "epoch": 0.11, + "grad_norm": 5.994324182587906, + "learning_rate": 1.875e-07, + "logps/chosen": -35.86518859863281, + "logps/rejected": -41.03656005859375, + "loss": 0.6932, + "losses/dpo": 0.690587043762207, + "losses/sft": 1.5208988189697266, + "losses/total": 0.690587043762207, + "ref_logps/chosen": -35.85986328125, + "ref_logps/rejected": -41.031028747558594, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.0005322899669408798, + "rewards/margins": 2.0701438188552856e-05, + "rewards/rejected": -0.00055299187079072, + "step": 15 + }, + { + "epoch": 0.12, + "grad_norm": 5.908538195984286, + "learning_rate": 2e-07, + "logps/chosen": -36.70806884765625, + "logps/rejected": -39.733882904052734, + "loss": 0.6926, + "losses/dpo": 0.6891317963600159, + "losses/sft": 1.1712957620620728, + "losses/total": 0.6891317963600159, + "ref_logps/chosen": -36.713016510009766, + "ref_logps/rejected": -39.726341247558594, + "rewards/accuracies": 0.515625, + "rewards/chosen": 0.0004950040020048618, + "rewards/margins": 0.0012491128873080015, + "rewards/rejected": -0.0007541090017184615, + "step": 16 + }, + { + "epoch": 0.13, + "grad_norm": 6.17892190890487, + "learning_rate": 2.1249999999999998e-07, + "logps/chosen": -38.60041046142578, + "logps/rejected": -43.30579376220703, + "loss": 0.694, + "losses/dpo": 0.6916015148162842, + "losses/sft": 1.3250274658203125, + "losses/total": 0.6916015148162842, + "ref_logps/chosen": -38.579288482666016, + "ref_logps/rejected": -43.300140380859375, + "rewards/accuracies": 0.4609375, + "rewards/chosen": -0.0021122824400663376, + "rewards/margins": -0.001547331572510302, + "rewards/rejected": -0.0005649511003866792, + "step": 17 + }, + { + "epoch": 0.14, + "grad_norm": 6.020104058580606, + "learning_rate": 2.25e-07, + "logps/chosen": -37.50771713256836, + "logps/rejected": -41.76362609863281, + "loss": 0.6919, + "losses/dpo": 0.6925865411758423, + "losses/sft": 1.3761729001998901, + "losses/total": 0.6925865411758423, + "ref_logps/chosen": -37.507423400878906, + "ref_logps/rejected": -41.736366271972656, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.9282993637025356e-05, + "rewards/margins": 0.0026969274040311575, + "rewards/rejected": -0.002726210281252861, + "step": 18 + }, + { + "epoch": 0.14, + "grad_norm": 5.765433605501229, + "learning_rate": 2.3749999999999998e-07, + "logps/chosen": -32.96650695800781, + "logps/rejected": -42.942771911621094, + "loss": 0.6917, + "losses/dpo": 0.6924209594726562, + "losses/sft": 1.1748046875, + "losses/total": 0.6924209594726562, + "ref_logps/chosen": -32.971160888671875, + "ref_logps/rejected": -42.91703796386719, + "rewards/accuracies": 0.5546875, + "rewards/chosen": 0.00046532286796718836, + "rewards/margins": 0.0030381008982658386, + "rewards/rejected": -0.002572778146713972, + "step": 19 + }, + { + "epoch": 0.15, + "grad_norm": 5.824641204085951, + "learning_rate": 2.5e-07, + "logps/chosen": -37.962188720703125, + "logps/rejected": -43.213279724121094, + "loss": 0.6932, + "losses/dpo": 0.694710373878479, + "losses/sft": 1.2030720710754395, + "losses/total": 0.694710373878479, + "ref_logps/chosen": -37.919189453125, + "ref_logps/rejected": -43.171043395996094, + "rewards/accuracies": 0.5078125, + "rewards/chosen": -0.0043003251776099205, + "rewards/margins": -7.65085278544575e-05, + "rewards/rejected": -0.004223817028105259, + "step": 20 + }, + { + "epoch": 0.16, + "grad_norm": 6.193915477797873, + "learning_rate": 2.625e-07, + "logps/chosen": -37.831146240234375, + "logps/rejected": -47.295005798339844, + "loss": 0.6954, + "losses/dpo": 0.6994068622589111, + "losses/sft": 1.2481111288070679, + "losses/total": 0.6994068622589111, + "ref_logps/chosen": -37.76181411743164, + "ref_logps/rejected": -47.269264221191406, + "rewards/accuracies": 0.4453125, + "rewards/chosen": -0.006933108903467655, + "rewards/margins": -0.004359052516520023, + "rewards/rejected": -0.002574056386947632, + "step": 21 + }, + { + "epoch": 0.17, + "grad_norm": 5.703247411276419, + "learning_rate": 2.75e-07, + "logps/chosen": -34.446189880371094, + "logps/rejected": -42.82508850097656, + "loss": 0.6939, + "losses/dpo": 0.6885063648223877, + "losses/sft": 1.2574893236160278, + "losses/total": 0.6885063648223877, + "ref_logps/chosen": -34.39026641845703, + "ref_logps/rejected": -42.782203674316406, + "rewards/accuracies": 0.4921875, + "rewards/chosen": -0.005592696368694305, + "rewards/margins": -0.0013045003870502114, + "rewards/rejected": -0.004288196098059416, + "step": 22 + }, + { + "epoch": 0.17, + "grad_norm": 5.586186575504996, + "learning_rate": 2.8749999999999995e-07, + "logps/chosen": -35.78218078613281, + "logps/rejected": -46.140350341796875, + "loss": 0.6929, + "losses/dpo": 0.6936126351356506, + "losses/sft": 1.4059488773345947, + "losses/total": 0.6936126351356506, + "ref_logps/chosen": -35.713783264160156, + "ref_logps/rejected": -46.065738677978516, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.00683995708823204, + "rewards/margins": 0.0006212468724697828, + "rewards/rejected": -0.00746120372787118, + "step": 23 + }, + { + "epoch": 0.18, + "grad_norm": 6.2807240194752, + "learning_rate": 3e-07, + "logps/chosen": -37.896644592285156, + "logps/rejected": -43.448909759521484, + "loss": 0.6914, + "losses/dpo": 0.6850650310516357, + "losses/sft": 1.4576250314712524, + "losses/total": 0.6850650310516357, + "ref_logps/chosen": -37.83003616333008, + "ref_logps/rejected": -43.34458923339844, + "rewards/accuracies": 0.5703125, + "rewards/chosen": -0.00666093360632658, + "rewards/margins": 0.00377137353643775, + "rewards/rejected": -0.010432307608425617, + "step": 24 + }, + { + "epoch": 0.19, + "grad_norm": 5.6714251513252485, + "learning_rate": 3.1249999999999997e-07, + "logps/chosen": -36.5435791015625, + "logps/rejected": -41.46415710449219, + "loss": 0.6923, + "losses/dpo": 0.6902315020561218, + "losses/sft": 1.3371169567108154, + "losses/total": 0.6902315020561218, + "ref_logps/chosen": -36.474647521972656, + "ref_logps/rejected": -41.37662887573242, + "rewards/accuracies": 0.5390625, + "rewards/chosen": -0.006893564946949482, + "rewards/margins": 0.001859544194303453, + "rewards/rejected": -0.00875310879200697, + "step": 25 + }, + { + "epoch": 0.2, + "grad_norm": 6.279222054280467, + "learning_rate": 3.25e-07, + "logps/chosen": -37.0484733581543, + "logps/rejected": -44.5318603515625, + "loss": 0.6919, + "losses/dpo": 0.6916804313659668, + "losses/sft": 1.2641081809997559, + "losses/total": 0.6916804313659668, + "ref_logps/chosen": -36.94280242919922, + "ref_logps/rejected": -44.40016174316406, + "rewards/accuracies": 0.5859375, + "rewards/chosen": -0.010566946119070053, + "rewards/margins": 0.0026026167906820774, + "rewards/rejected": -0.013169562444090843, + "step": 26 + }, + { + "epoch": 0.2, + "grad_norm": 5.569722658657549, + "learning_rate": 3.375e-07, + "logps/chosen": -33.24622344970703, + "logps/rejected": -39.62266540527344, + "loss": 0.6918, + "losses/dpo": 0.6929441094398499, + "losses/sft": 1.0789260864257812, + "losses/total": 0.6929441094398499, + "ref_logps/chosen": -33.152587890625, + "ref_logps/rejected": -39.500396728515625, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.009363781660795212, + "rewards/margins": 0.002862950786948204, + "rewards/rejected": -0.012226731516420841, + "step": 27 + }, + { + "epoch": 0.21, + "grad_norm": 6.178096448266021, + "learning_rate": 3.5e-07, + "logps/chosen": -40.909210205078125, + "logps/rejected": -43.54678726196289, + "loss": 0.6932, + "losses/dpo": 0.6962201595306396, + "losses/sft": 1.4941421747207642, + "losses/total": 0.6962201595306396, + "ref_logps/chosen": -40.74734878540039, + "ref_logps/rejected": -43.38502883911133, + "rewards/accuracies": 0.515625, + "rewards/chosen": -0.016186244785785675, + "rewards/margins": -1.0382413165643811e-05, + "rewards/rejected": -0.01617586426436901, + "step": 28 + }, + { + "epoch": 0.22, + "grad_norm": 6.351428202937179, + "learning_rate": 3.6249999999999997e-07, + "logps/chosen": -38.18675231933594, + "logps/rejected": -45.641944885253906, + "loss": 0.6918, + "losses/dpo": 0.6936982870101929, + "losses/sft": 1.5615143775939941, + "losses/total": 0.6936982870101929, + "ref_logps/chosen": -38.07172775268555, + "ref_logps/rejected": -45.49729919433594, + "rewards/accuracies": 0.546875, + "rewards/chosen": -0.011502932757139206, + "rewards/margins": 0.002961072139441967, + "rewards/rejected": -0.014464004896581173, + "step": 29 + }, + { + "epoch": 0.23, + "grad_norm": 6.197352455942284, + "learning_rate": 3.75e-07, + "logps/chosen": -38.10821533203125, + "logps/rejected": -44.8267707824707, + "loss": 0.6913, + "losses/dpo": 0.6886826753616333, + "losses/sft": 1.2551246881484985, + "losses/total": 0.6886826753616333, + "ref_logps/chosen": -37.95869827270508, + "ref_logps/rejected": -44.637386322021484, + "rewards/accuracies": 0.5546875, + "rewards/chosen": -0.014951780438423157, + "rewards/margins": 0.00398671068251133, + "rewards/rejected": -0.018938491120934486, + "step": 30 + }, + { + "epoch": 0.23, + "grad_norm": 6.109331160092928, + "learning_rate": 3.875e-07, + "logps/chosen": -37.73468017578125, + "logps/rejected": -45.03502655029297, + "loss": 0.6939, + "losses/dpo": 0.6942582130432129, + "losses/sft": 1.307703971862793, + "losses/total": 0.6942582130432129, + "ref_logps/chosen": -37.55635070800781, + "ref_logps/rejected": -44.870731353759766, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.01783285290002823, + "rewards/margins": -0.0014032268663868308, + "rewards/rejected": -0.01642962545156479, + "step": 31 + }, + { + "epoch": 0.24, + "grad_norm": 5.667604081077805, + "learning_rate": 4e-07, + "logps/chosen": -35.0442008972168, + "logps/rejected": -43.61913299560547, + "loss": 0.6936, + "losses/dpo": 0.7014378309249878, + "losses/sft": 1.3467621803283691, + "losses/total": 0.7014378309249878, + "ref_logps/chosen": -34.83974838256836, + "ref_logps/rejected": -43.42043685913086, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.02044512704014778, + "rewards/margins": -0.0005755843594670296, + "rewards/rejected": -0.019869543612003326, + "step": 32 + }, + { + "epoch": 0.25, + "grad_norm": 5.678502382891613, + "learning_rate": 4.1249999999999997e-07, + "logps/chosen": -32.68036651611328, + "logps/rejected": -37.5178337097168, + "loss": 0.6907, + "losses/dpo": 0.6907713413238525, + "losses/sft": 1.0900930166244507, + "losses/total": 0.6907713413238525, + "ref_logps/chosen": -32.45442199707031, + "ref_logps/rejected": -37.23982238769531, + "rewards/accuracies": 0.5234375, + "rewards/chosen": -0.0225942712277174, + "rewards/margins": 0.005206821020692587, + "rewards/rejected": -0.027801092714071274, + "step": 33 + }, + { + "epoch": 0.26, + "grad_norm": 5.921573916861138, + "learning_rate": 4.2499999999999995e-07, + "logps/chosen": -38.70556640625, + "logps/rejected": -44.084251403808594, + "loss": 0.6917, + "losses/dpo": 0.6954081058502197, + "losses/sft": 1.3240528106689453, + "losses/total": 0.6954081058502197, + "ref_logps/chosen": -38.448394775390625, + "ref_logps/rejected": -43.795318603515625, + "rewards/accuracies": 0.5390625, + "rewards/chosen": -0.025717251002788544, + "rewards/margins": 0.0031764586456120014, + "rewards/rejected": -0.02889370732009411, + "step": 34 + }, + { + "epoch": 0.26, + "grad_norm": 6.262461804418144, + "learning_rate": 4.375e-07, + "logps/chosen": -38.28644561767578, + "logps/rejected": -44.10706329345703, + "loss": 0.6906, + "losses/dpo": 0.6915950179100037, + "losses/sft": 1.5137853622436523, + "losses/total": 0.6915950179100037, + "ref_logps/chosen": -37.993186950683594, + "ref_logps/rejected": -43.75933837890625, + "rewards/accuracies": 0.5703125, + "rewards/chosen": -0.029325801879167557, + "rewards/margins": 0.005446841474622488, + "rewards/rejected": -0.03477264195680618, + "step": 35 + }, + { + "epoch": 0.27, + "grad_norm": 6.51719678701152, + "learning_rate": 4.5e-07, + "logps/chosen": -38.827396392822266, + "logps/rejected": -44.608299255371094, + "loss": 0.6898, + "losses/dpo": 0.6939886212348938, + "losses/sft": 1.1804178953170776, + "losses/total": 0.6939886212348938, + "ref_logps/chosen": -38.524925231933594, + "ref_logps/rejected": -44.23436737060547, + "rewards/accuracies": 0.515625, + "rewards/chosen": -0.03024711087346077, + "rewards/margins": 0.007146051619201899, + "rewards/rejected": -0.03739316016435623, + "step": 36 + }, + { + "epoch": 0.28, + "grad_norm": 6.054369450491099, + "learning_rate": 4.625e-07, + "logps/chosen": -37.75469970703125, + "logps/rejected": -40.86686706542969, + "loss": 0.69, + "losses/dpo": 0.6901004910469055, + "losses/sft": 1.2039740085601807, + "losses/total": 0.6901004910469055, + "ref_logps/chosen": -37.38399124145508, + "ref_logps/rejected": -40.42985534667969, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.03707098215818405, + "rewards/margins": 0.006630584131926298, + "rewards/rejected": -0.04370156675577164, + "step": 37 + }, + { + "epoch": 0.29, + "grad_norm": 5.811128045517565, + "learning_rate": 4.7499999999999995e-07, + "logps/chosen": -37.347511291503906, + "logps/rejected": -42.597320556640625, + "loss": 0.6932, + "losses/dpo": 0.6947627663612366, + "losses/sft": 1.5172080993652344, + "losses/total": 0.6947627663612366, + "ref_logps/chosen": -36.92784881591797, + "ref_logps/rejected": -42.17408752441406, + "rewards/accuracies": 0.5078125, + "rewards/chosen": -0.041966233402490616, + "rewards/margins": 0.00035727641079574823, + "rewards/rejected": -0.042323507368564606, + "step": 38 + }, + { + "epoch": 0.29, + "grad_norm": 6.232125527781941, + "learning_rate": 4.875e-07, + "logps/chosen": -35.76224899291992, + "logps/rejected": -40.480010986328125, + "loss": 0.6877, + "losses/dpo": 0.6863731741905212, + "losses/sft": 1.403287410736084, + "losses/total": 0.6863731741905212, + "ref_logps/chosen": -35.296600341796875, + "ref_logps/rejected": -39.899620056152344, + "rewards/accuracies": 0.5390625, + "rewards/chosen": -0.04656480997800827, + "rewards/margins": 0.011474234983325005, + "rewards/rejected": -0.058039046823978424, + "step": 39 + }, + { + "epoch": 0.3, + "grad_norm": 5.919014140290479, + "learning_rate": 5e-07, + "logps/chosen": -33.405452728271484, + "logps/rejected": -40.23749542236328, + "loss": 0.6917, + "losses/dpo": 0.7027544975280762, + "losses/sft": 1.5135366916656494, + "losses/total": 0.7027544975280762, + "ref_logps/chosen": -32.92824935913086, + "ref_logps/rejected": -39.72548294067383, + "rewards/accuracies": 0.5703125, + "rewards/chosen": -0.04772059991955757, + "rewards/margins": 0.0034805855248123407, + "rewards/rejected": -0.05120118334889412, + "step": 40 + }, + { + "epoch": 0.31, + "grad_norm": 6.156564024356789, + "learning_rate": 4.985955056179775e-07, + "logps/chosen": -33.48844528198242, + "logps/rejected": -40.55287551879883, + "loss": 0.6886, + "losses/dpo": 0.6869601011276245, + "losses/sft": 1.2104542255401611, + "losses/total": 0.6869601011276245, + "ref_logps/chosen": -32.960693359375, + "ref_logps/rejected": -39.92742919921875, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.052775099873542786, + "rewards/margins": 0.009769486263394356, + "rewards/rejected": -0.06254458427429199, + "step": 41 + }, + { + "epoch": 0.32, + "grad_norm": 5.9874098679402445, + "learning_rate": 4.97191011235955e-07, + "logps/chosen": -37.491756439208984, + "logps/rejected": -44.21824645996094, + "loss": 0.6903, + "losses/dpo": 0.6947405934333801, + "losses/sft": 1.5526431798934937, + "losses/total": 0.6947405934333801, + "ref_logps/chosen": -36.944496154785156, + "ref_logps/rejected": -43.608970642089844, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.05472607538104057, + "rewards/margins": 0.006201753858476877, + "rewards/rejected": -0.060927826911211014, + "step": 42 + }, + { + "epoch": 0.32, + "grad_norm": 5.890690107516997, + "learning_rate": 4.957865168539325e-07, + "logps/chosen": -37.96784210205078, + "logps/rejected": -44.18370056152344, + "loss": 0.6911, + "losses/dpo": 0.6906970143318176, + "losses/sft": 1.5630677938461304, + "losses/total": 0.6906970143318176, + "ref_logps/chosen": -37.31348419189453, + "ref_logps/rejected": -43.480613708496094, + "rewards/accuracies": 0.5078125, + "rewards/chosen": -0.06543563306331635, + "rewards/margins": 0.0048727355897426605, + "rewards/rejected": -0.07030836492776871, + "step": 43 + }, + { + "epoch": 0.33, + "grad_norm": 5.6620046922389475, + "learning_rate": 4.943820224719101e-07, + "logps/chosen": -33.37147903442383, + "logps/rejected": -40.975284576416016, + "loss": 0.6876, + "losses/dpo": 0.6959986090660095, + "losses/sft": 1.4914252758026123, + "losses/total": 0.6959986090660095, + "ref_logps/chosen": -32.77292251586914, + "ref_logps/rejected": -40.25656509399414, + "rewards/accuracies": 0.5859375, + "rewards/chosen": -0.05985547974705696, + "rewards/margins": 0.012016610242426395, + "rewards/rejected": -0.07187209278345108, + "step": 44 + }, + { + "epoch": 0.34, + "grad_norm": 5.975010174130114, + "learning_rate": 4.929775280898877e-07, + "logps/chosen": -36.01771545410156, + "logps/rejected": -40.14152145385742, + "loss": 0.686, + "losses/dpo": 0.6887790560722351, + "losses/sft": 1.0922722816467285, + "losses/total": 0.6887790560722351, + "ref_logps/chosen": -35.33580017089844, + "ref_logps/rejected": -39.30628204345703, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.06819140911102295, + "rewards/margins": 0.015332860872149467, + "rewards/rejected": -0.08352427184581757, + "step": 45 + }, + { + "epoch": 0.35, + "grad_norm": 5.941321729150391, + "learning_rate": 4.915730337078651e-07, + "logps/chosen": -37.23257064819336, + "logps/rejected": -43.88367462158203, + "loss": 0.6872, + "losses/dpo": 0.6943268775939941, + "losses/sft": 1.3947020769119263, + "losses/total": 0.6943268775939941, + "ref_logps/chosen": -36.47919845581055, + "ref_logps/rejected": -43.00157165527344, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.07533714175224304, + "rewards/margins": 0.012872692197561264, + "rewards/rejected": -0.08820983022451401, + "step": 46 + }, + { + "epoch": 0.35, + "grad_norm": 6.015115529321941, + "learning_rate": 4.901685393258427e-07, + "logps/chosen": -40.33525085449219, + "logps/rejected": -41.87712478637695, + "loss": 0.6904, + "losses/dpo": 0.6985194683074951, + "losses/sft": 1.4163267612457275, + "losses/total": 0.6985194683074951, + "ref_logps/chosen": -39.42625045776367, + "ref_logps/rejected": -40.897945404052734, + "rewards/accuracies": 0.5234375, + "rewards/chosen": -0.09090035408735275, + "rewards/margins": 0.007017695810645819, + "rewards/rejected": -0.09791804850101471, + "step": 47 + }, + { + "epoch": 0.36, + "grad_norm": 5.733756968847646, + "learning_rate": 4.887640449438202e-07, + "logps/chosen": -36.97081756591797, + "logps/rejected": -42.80936050415039, + "loss": 0.6882, + "losses/dpo": 0.696724534034729, + "losses/sft": 1.2510000467300415, + "losses/total": 0.696724534034729, + "ref_logps/chosen": -36.11339569091797, + "ref_logps/rejected": -41.84064483642578, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.08574248850345612, + "rewards/margins": 0.011129248887300491, + "rewards/rejected": -0.09687173366546631, + "step": 48 + }, + { + "epoch": 0.37, + "grad_norm": 5.879327522594857, + "learning_rate": 4.873595505617978e-07, + "logps/chosen": -33.350772857666016, + "logps/rejected": -41.509918212890625, + "loss": 0.6838, + "losses/dpo": 0.6769124269485474, + "losses/sft": 1.218217372894287, + "losses/total": 0.6769124269485474, + "ref_logps/chosen": -32.432064056396484, + "ref_logps/rejected": -40.38986587524414, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.09187072515487671, + "rewards/margins": 0.020134272053837776, + "rewards/rejected": -0.11200500279664993, + "step": 49 + }, + { + "epoch": 0.38, + "grad_norm": 6.158412418626052, + "learning_rate": 4.859550561797752e-07, + "logps/chosen": -38.05060577392578, + "logps/rejected": -45.274757385253906, + "loss": 0.6828, + "losses/dpo": 0.6935728788375854, + "losses/sft": 1.33810293674469, + "losses/total": 0.6935728788375854, + "ref_logps/chosen": -36.988319396972656, + "ref_logps/rejected": -43.98942565917969, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.10622845590114594, + "rewards/margins": 0.022304760292172432, + "rewards/rejected": -0.12853321433067322, + "step": 50 + }, + { + "epoch": 0.38, + "grad_norm": 6.285183993622672, + "learning_rate": 4.845505617977528e-07, + "logps/chosen": -38.64442443847656, + "logps/rejected": -42.03549575805664, + "loss": 0.6823, + "losses/dpo": 0.6838083267211914, + "losses/sft": 1.414647102355957, + "losses/total": 0.6838083267211914, + "ref_logps/chosen": -37.58879089355469, + "ref_logps/rejected": -40.74079132080078, + "rewards/accuracies": 0.5859375, + "rewards/chosen": -0.1055637076497078, + "rewards/margins": 0.023906776681542397, + "rewards/rejected": -0.12947048246860504, + "step": 51 + }, + { + "epoch": 0.39, + "grad_norm": 5.8676763170916155, + "learning_rate": 4.831460674157303e-07, + "logps/chosen": -35.28524398803711, + "logps/rejected": -43.29574966430664, + "loss": 0.6824, + "losses/dpo": 0.6761789321899414, + "losses/sft": 1.1140950918197632, + "losses/total": 0.6761789321899414, + "ref_logps/chosen": -34.05834197998047, + "ref_logps/rejected": -41.828880310058594, + "rewards/accuracies": 0.5234375, + "rewards/chosen": -0.12269000709056854, + "rewards/margins": 0.023997044190764427, + "rewards/rejected": -0.14668706059455872, + "step": 52 + }, + { + "epoch": 0.4, + "grad_norm": 6.153262147929733, + "learning_rate": 4.817415730337078e-07, + "logps/chosen": -34.02470016479492, + "logps/rejected": -38.51059341430664, + "loss": 0.6808, + "losses/dpo": 0.6883823871612549, + "losses/sft": 1.1792895793914795, + "losses/total": 0.6883823871612549, + "ref_logps/chosen": -32.95384216308594, + "ref_logps/rejected": -37.16828155517578, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -0.10708627104759216, + "rewards/margins": 0.02714475244283676, + "rewards/rejected": -0.13423103094100952, + "step": 53 + }, + { + "epoch": 0.41, + "grad_norm": 5.930275284806721, + "learning_rate": 4.803370786516854e-07, + "logps/chosen": -39.90019989013672, + "logps/rejected": -41.967960357666016, + "loss": 0.6817, + "losses/dpo": 0.6764520406723022, + "losses/sft": 1.4552464485168457, + "losses/total": 0.6764520406723022, + "ref_logps/chosen": -38.590492248535156, + "ref_logps/rejected": -40.406002044677734, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.1309707909822464, + "rewards/margins": 0.025225069373846054, + "rewards/rejected": -0.15619586408138275, + "step": 54 + }, + { + "epoch": 0.42, + "grad_norm": 5.991852629106404, + "learning_rate": 4.789325842696629e-07, + "logps/chosen": -37.67607116699219, + "logps/rejected": -42.05184555053711, + "loss": 0.6906, + "losses/dpo": 0.6933637261390686, + "losses/sft": 1.3182023763656616, + "losses/total": 0.6933637261390686, + "ref_logps/chosen": -36.22807312011719, + "ref_logps/rejected": -40.539833068847656, + "rewards/accuracies": 0.515625, + "rewards/chosen": -0.14479960501194, + "rewards/margins": 0.006401616148650646, + "rewards/rejected": -0.15120121836662292, + "step": 55 + }, + { + "epoch": 0.42, + "grad_norm": 5.887970211048207, + "learning_rate": 4.775280898876405e-07, + "logps/chosen": -36.0313835144043, + "logps/rejected": -41.34480285644531, + "loss": 0.6806, + "losses/dpo": 0.6776463985443115, + "losses/sft": 1.3762413263320923, + "losses/total": 0.6776463985443115, + "ref_logps/chosen": -34.71417999267578, + "ref_logps/rejected": -39.74456024169922, + "rewards/accuracies": 0.6015625, + "rewards/chosen": -0.131720170378685, + "rewards/margins": 0.02830405905842781, + "rewards/rejected": -0.1600242257118225, + "step": 56 + }, + { + "epoch": 0.43, + "grad_norm": 6.202121890129415, + "learning_rate": 4.7612359550561797e-07, + "logps/chosen": -39.138973236083984, + "logps/rejected": -44.62040710449219, + "loss": 0.6868, + "losses/dpo": 0.7154799699783325, + "losses/sft": 1.4311680793762207, + "losses/total": 0.7154799699783325, + "ref_logps/chosen": -37.48638153076172, + "ref_logps/rejected": -42.81147003173828, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.1652592122554779, + "rewards/margins": 0.01563437283039093, + "rewards/rejected": -0.18089357018470764, + "step": 57 + }, + { + "epoch": 0.44, + "grad_norm": 6.251681349620222, + "learning_rate": 4.747191011235955e-07, + "logps/chosen": -35.74232864379883, + "logps/rejected": -41.246910095214844, + "loss": 0.6762, + "losses/dpo": 0.6785226464271545, + "losses/sft": 1.2456488609313965, + "losses/total": 0.6785226464271545, + "ref_logps/chosen": -34.32046890258789, + "ref_logps/rejected": -39.45071792602539, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.14218537509441376, + "rewards/margins": 0.037433870136737823, + "rewards/rejected": -0.17961923778057098, + "step": 58 + }, + { + "epoch": 0.45, + "grad_norm": 6.186489597098538, + "learning_rate": 4.7331460674157303e-07, + "logps/chosen": -38.993804931640625, + "logps/rejected": -48.68840789794922, + "loss": 0.6799, + "losses/dpo": 0.6576354503631592, + "losses/sft": 1.2577842473983765, + "losses/total": 0.6576354503631592, + "ref_logps/chosen": -37.452022552490234, + "ref_logps/rejected": -46.83843994140625, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15417808294296265, + "rewards/margins": 0.03081856295466423, + "rewards/rejected": -0.18499664962291718, + "step": 59 + }, + { + "epoch": 0.45, + "grad_norm": 6.176384076659114, + "learning_rate": 4.7191011235955054e-07, + "logps/chosen": -36.94293975830078, + "logps/rejected": -43.75997543334961, + "loss": 0.6818, + "losses/dpo": 0.6777645349502563, + "losses/sft": 1.4646830558776855, + "losses/total": 0.6777645349502563, + "ref_logps/chosen": -35.410682678222656, + "ref_logps/rejected": -41.96450424194336, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.15322577953338623, + "rewards/margins": 0.026321690529584885, + "rewards/rejected": -0.17954745888710022, + "step": 60 + }, + { + "epoch": 0.46, + "grad_norm": 6.222192772165732, + "learning_rate": 4.705056179775281e-07, + "logps/chosen": -38.04816436767578, + "logps/rejected": -46.636329650878906, + "loss": 0.6813, + "losses/dpo": 0.6915292739868164, + "losses/sft": 1.5139144659042358, + "losses/total": 0.6915292739868164, + "ref_logps/chosen": -36.346763610839844, + "ref_logps/rejected": -44.65103530883789, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.170139878988266, + "rewards/margins": 0.028389303013682365, + "rewards/rejected": -0.1985291838645935, + "step": 61 + }, + { + "epoch": 0.47, + "grad_norm": 6.581489224854424, + "learning_rate": 4.691011235955056e-07, + "logps/chosen": -39.37269973754883, + "logps/rejected": -42.562713623046875, + "loss": 0.6729, + "losses/dpo": 0.6604301333427429, + "losses/sft": 1.2340155839920044, + "losses/total": 0.6604301333427429, + "ref_logps/chosen": -37.8352165222168, + "ref_logps/rejected": -40.57086181640625, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -0.15374788641929626, + "rewards/margins": 0.04543708637356758, + "rewards/rejected": -0.19918496906757355, + "step": 62 + }, + { + "epoch": 0.48, + "grad_norm": 6.270954655004902, + "learning_rate": 4.6769662921348315e-07, + "logps/chosen": -35.57749938964844, + "logps/rejected": -43.94036102294922, + "loss": 0.6719, + "losses/dpo": 0.6689096689224243, + "losses/sft": 1.3980541229248047, + "losses/total": 0.6689096689224243, + "ref_logps/chosen": -33.870826721191406, + "ref_logps/rejected": -41.761024475097656, + "rewards/accuracies": 0.6953125, + "rewards/chosen": -0.1706671416759491, + "rewards/margins": 0.0472659207880497, + "rewards/rejected": -0.2179330587387085, + "step": 63 + }, + { + "epoch": 0.48, + "grad_norm": 5.825145606750587, + "learning_rate": 4.662921348314606e-07, + "logps/chosen": -36.180145263671875, + "logps/rejected": -42.19972229003906, + "loss": 0.6867, + "losses/dpo": 0.6951602697372437, + "losses/sft": 1.4974910020828247, + "losses/total": 0.6951602697372437, + "ref_logps/chosen": -34.28754425048828, + "ref_logps/rejected": -40.13561248779297, + "rewards/accuracies": 0.5234375, + "rewards/chosen": -0.18926027417182922, + "rewards/margins": 0.017150741070508957, + "rewards/rejected": -0.20641100406646729, + "step": 64 + }, + { + "epoch": 0.49, + "grad_norm": 5.954970035091233, + "learning_rate": 4.6488764044943816e-07, + "logps/chosen": -41.472923278808594, + "logps/rejected": -45.73348617553711, + "loss": 0.6781, + "losses/dpo": 0.667303204536438, + "losses/sft": 1.494096040725708, + "losses/total": 0.667303204536438, + "ref_logps/chosen": -39.698944091796875, + "ref_logps/rejected": -43.6091423034668, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.177398219704628, + "rewards/margins": 0.03503631800413132, + "rewards/rejected": -0.2124345451593399, + "step": 65 + }, + { + "epoch": 0.5, + "grad_norm": 6.488401314246342, + "learning_rate": 4.634831460674157e-07, + "logps/chosen": -39.84260177612305, + "logps/rejected": -49.195159912109375, + "loss": 0.6716, + "losses/dpo": 0.6679590940475464, + "losses/sft": 1.3698948621749878, + "losses/total": 0.6679590940475464, + "ref_logps/chosen": -37.98674774169922, + "ref_logps/rejected": -46.86464309692383, + "rewards/accuracies": 0.6953125, + "rewards/chosen": -0.18558543920516968, + "rewards/margins": 0.047466084361076355, + "rewards/rejected": -0.23305150866508484, + "step": 66 + }, + { + "epoch": 0.51, + "grad_norm": 6.139969478930884, + "learning_rate": 4.620786516853932e-07, + "logps/chosen": -36.54951858520508, + "logps/rejected": -42.6442756652832, + "loss": 0.6689, + "losses/dpo": 0.650477409362793, + "losses/sft": 1.350743055343628, + "losses/total": 0.650477409362793, + "ref_logps/chosen": -34.77081298828125, + "ref_logps/rejected": -40.319297790527344, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.17787054181098938, + "rewards/margins": 0.05462724715471268, + "rewards/rejected": -0.23249778151512146, + "step": 67 + }, + { + "epoch": 0.51, + "grad_norm": 6.626542224506714, + "learning_rate": 4.606741573033708e-07, + "logps/chosen": -38.910194396972656, + "logps/rejected": -44.71943664550781, + "loss": 0.6588, + "losses/dpo": 0.6475476622581482, + "losses/sft": 1.0136208534240723, + "losses/total": 0.6475476622581482, + "ref_logps/chosen": -37.08655548095703, + "ref_logps/rejected": -42.12825393676758, + "rewards/accuracies": 0.7109375, + "rewards/chosen": -0.18236377835273743, + "rewards/margins": 0.0767548531293869, + "rewards/rejected": -0.2591186463832855, + "step": 68 + }, + { + "epoch": 0.52, + "grad_norm": 6.316526648907962, + "learning_rate": 4.592696629213483e-07, + "logps/chosen": -39.12900924682617, + "logps/rejected": -47.94546890258789, + "loss": 0.6741, + "losses/dpo": 0.6746849417686462, + "losses/sft": 1.3253227472305298, + "losses/total": 0.6746849417686462, + "ref_logps/chosen": -36.78126525878906, + "ref_logps/rejected": -45.148887634277344, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.23477408289909363, + "rewards/margins": 0.044883839786052704, + "rewards/rejected": -0.27965790033340454, + "step": 69 + }, + { + "epoch": 0.53, + "grad_norm": 6.029340644383451, + "learning_rate": 4.5786516853932584e-07, + "logps/chosen": -37.168025970458984, + "logps/rejected": -43.3531494140625, + "loss": 0.6776, + "losses/dpo": 0.708085298538208, + "losses/sft": 1.549338698387146, + "losses/total": 0.708085298538208, + "ref_logps/chosen": -34.82072067260742, + "ref_logps/rejected": -40.60224151611328, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.2347305417060852, + "rewards/margins": 0.04036000370979309, + "rewards/rejected": -0.2750905454158783, + "step": 70 + }, + { + "epoch": 0.54, + "grad_norm": 6.206457245959275, + "learning_rate": 4.5646067415730334e-07, + "logps/chosen": -37.381324768066406, + "logps/rejected": -44.06721878051758, + "loss": 0.667, + "losses/dpo": 0.6923149228096008, + "losses/sft": 1.499281883239746, + "losses/total": 0.6923149228096008, + "ref_logps/chosen": -35.004127502441406, + "ref_logps/rejected": -41.090354919433594, + "rewards/accuracies": 0.6328125, + "rewards/chosen": -0.23771986365318298, + "rewards/margins": 0.05996667221188545, + "rewards/rejected": -0.29768651723861694, + "step": 71 + }, + { + "epoch": 0.54, + "grad_norm": 6.238705763349497, + "learning_rate": 4.550561797752809e-07, + "logps/chosen": -38.5302734375, + "logps/rejected": -48.384620666503906, + "loss": 0.6669, + "losses/dpo": 0.6794298887252808, + "losses/sft": 1.3331537246704102, + "losses/total": 0.6794298887252808, + "ref_logps/chosen": -35.968894958496094, + "ref_logps/rejected": -45.22618103027344, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.25613832473754883, + "rewards/margins": 0.05970541387796402, + "rewards/rejected": -0.31584370136260986, + "step": 72 + }, + { + "epoch": 0.55, + "grad_norm": 6.369104792363388, + "learning_rate": 4.536516853932584e-07, + "logps/chosen": -38.55243682861328, + "logps/rejected": -46.81627655029297, + "loss": 0.6609, + "losses/dpo": 0.6863117218017578, + "losses/sft": 1.404316782951355, + "losses/total": 0.6863117218017578, + "ref_logps/chosen": -35.96749496459961, + "ref_logps/rejected": -43.47722625732422, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -0.25849413871765137, + "rewards/margins": 0.07541059702634811, + "rewards/rejected": -0.3339047431945801, + "step": 73 + }, + { + "epoch": 0.56, + "grad_norm": 6.15339242747321, + "learning_rate": 4.522471910112359e-07, + "logps/chosen": -39.58115005493164, + "logps/rejected": -44.1653938293457, + "loss": 0.6782, + "losses/dpo": 0.7283678650856018, + "losses/sft": 1.3683419227600098, + "losses/total": 0.7283678650856018, + "ref_logps/chosen": -37.06593704223633, + "ref_logps/rejected": -41.25994873046875, + "rewards/accuracies": 0.5859375, + "rewards/chosen": -0.25152140855789185, + "rewards/margins": 0.03902304172515869, + "rewards/rejected": -0.29054442048072815, + "step": 74 + }, + { + "epoch": 0.57, + "grad_norm": 6.3073523395391105, + "learning_rate": 4.5084269662921347e-07, + "logps/chosen": -39.416324615478516, + "logps/rejected": -45.20884323120117, + "loss": 0.6702, + "losses/dpo": 0.6884989738464355, + "losses/sft": 1.2989376783370972, + "losses/total": 0.6884989738464355, + "ref_logps/chosen": -36.72044372558594, + "ref_logps/rejected": -41.92823791503906, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.2695878744125366, + "rewards/margins": 0.058472514152526855, + "rewards/rejected": -0.3280603885650635, + "step": 75 + }, + { + "epoch": 0.57, + "grad_norm": 6.353149418167083, + "learning_rate": 4.4943820224719097e-07, + "logps/chosen": -39.451934814453125, + "logps/rejected": -45.58893585205078, + "loss": 0.6758, + "losses/dpo": 0.6700998544692993, + "losses/sft": 1.423154354095459, + "losses/total": 0.6700998544692993, + "ref_logps/chosen": -36.6813850402832, + "ref_logps/rejected": -42.35654830932617, + "rewards/accuracies": 0.5546875, + "rewards/chosen": -0.2770548164844513, + "rewards/margins": 0.0461842380464077, + "rewards/rejected": -0.3232390582561493, + "step": 76 + }, + { + "epoch": 0.58, + "grad_norm": 6.243464205666771, + "learning_rate": 4.4803370786516853e-07, + "logps/chosen": -38.50192642211914, + "logps/rejected": -44.94510269165039, + "loss": 0.6654, + "losses/dpo": 0.6559799909591675, + "losses/sft": 1.3645029067993164, + "losses/total": 0.6559799909591675, + "ref_logps/chosen": -35.83762741088867, + "ref_logps/rejected": -41.636741638183594, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2664298415184021, + "rewards/margins": 0.06440602242946625, + "rewards/rejected": -0.33083584904670715, + "step": 77 + }, + { + "epoch": 0.59, + "grad_norm": 6.529746915602167, + "learning_rate": 4.4662921348314603e-07, + "logps/chosen": -38.366634368896484, + "logps/rejected": -48.25501251220703, + "loss": 0.6701, + "losses/dpo": 0.6894055008888245, + "losses/sft": 1.4073951244354248, + "losses/total": 0.6894055008888245, + "ref_logps/chosen": -35.34041976928711, + "ref_logps/rejected": -44.6351318359375, + "rewards/accuracies": 0.5859375, + "rewards/chosen": -0.3026217818260193, + "rewards/margins": 0.05936632677912712, + "rewards/rejected": -0.3619880974292755, + "step": 78 + }, + { + "epoch": 0.6, + "grad_norm": 6.186041404774562, + "learning_rate": 4.452247191011236e-07, + "logps/chosen": -37.969024658203125, + "logps/rejected": -46.56663131713867, + "loss": 0.6582, + "losses/dpo": 0.6372844576835632, + "losses/sft": 1.1740036010742188, + "losses/total": 0.6372844576835632, + "ref_logps/chosen": -35.09920883178711, + "ref_logps/rejected": -42.874725341796875, + "rewards/accuracies": 0.6484375, + "rewards/chosen": -0.2869817614555359, + "rewards/margins": 0.08220900595188141, + "rewards/rejected": -0.3691907525062561, + "step": 79 + }, + { + "epoch": 0.6, + "grad_norm": 6.41463598459145, + "learning_rate": 4.438202247191011e-07, + "logps/chosen": -43.05072021484375, + "logps/rejected": -49.178314208984375, + "loss": 0.656, + "losses/dpo": 0.6740515232086182, + "losses/sft": 1.4272187948226929, + "losses/total": 0.6740515232086182, + "ref_logps/chosen": -39.619014739990234, + "ref_logps/rejected": -44.83480453491211, + "rewards/accuracies": 0.6484375, + "rewards/chosen": -0.3431706726551056, + "rewards/margins": 0.09118058532476425, + "rewards/rejected": -0.43435126543045044, + "step": 80 + }, + { + "epoch": 0.61, + "grad_norm": 6.840871211971457, + "learning_rate": 4.4241573033707865e-07, + "logps/chosen": -43.888370513916016, + "logps/rejected": -47.332916259765625, + "loss": 0.6729, + "losses/dpo": 0.666955828666687, + "losses/sft": 1.6874582767486572, + "losses/total": 0.666955828666687, + "ref_logps/chosen": -40.38330841064453, + "ref_logps/rejected": -43.3054084777832, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.350506067276001, + "rewards/margins": 0.05224461108446121, + "rewards/rejected": -0.4027506709098816, + "step": 81 + }, + { + "epoch": 0.62, + "grad_norm": 6.882624120223548, + "learning_rate": 4.410112359550562e-07, + "logps/chosen": -40.2237548828125, + "logps/rejected": -46.99496078491211, + "loss": 0.6607, + "losses/dpo": 0.6718687415122986, + "losses/sft": 1.5186784267425537, + "losses/total": 0.6718687415122986, + "ref_logps/chosen": -36.70365905761719, + "ref_logps/rejected": -42.63638687133789, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.3520098030567169, + "rewards/margins": 0.08384796977043152, + "rewards/rejected": -0.43585777282714844, + "step": 82 + }, + { + "epoch": 0.63, + "grad_norm": 6.3488191331703385, + "learning_rate": 4.3960674157303366e-07, + "logps/chosen": -40.38496780395508, + "logps/rejected": -46.7673454284668, + "loss": 0.6535, + "losses/dpo": 0.6566940546035767, + "losses/sft": 1.3071130514144897, + "losses/total": 0.6566940546035767, + "ref_logps/chosen": -37.20966339111328, + "ref_logps/rejected": -42.63634490966797, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.3175300061702728, + "rewards/margins": 0.09556981176137924, + "rewards/rejected": -0.4130997955799103, + "step": 83 + }, + { + "epoch": 0.63, + "grad_norm": 6.624302993852389, + "learning_rate": 4.382022471910112e-07, + "logps/chosen": -42.17374801635742, + "logps/rejected": -49.17514419555664, + "loss": 0.6571, + "losses/dpo": 0.6181658506393433, + "losses/sft": 1.3204035758972168, + "losses/total": 0.6181658506393433, + "ref_logps/chosen": -38.54387664794922, + "ref_logps/rejected": -44.659461975097656, + "rewards/accuracies": 0.6328125, + "rewards/chosen": -0.3629874587059021, + "rewards/margins": 0.08858054131269455, + "rewards/rejected": -0.45156803727149963, + "step": 84 + }, + { + "epoch": 0.64, + "grad_norm": 6.569663592940448, + "learning_rate": 4.367977528089887e-07, + "logps/chosen": -39.99671936035156, + "logps/rejected": -48.49413299560547, + "loss": 0.6562, + "losses/dpo": 0.6639370322227478, + "losses/sft": 1.6048388481140137, + "losses/total": 0.6639370322227478, + "ref_logps/chosen": -36.648887634277344, + "ref_logps/rejected": -44.24271774291992, + "rewards/accuracies": 0.6796875, + "rewards/chosen": -0.3347826600074768, + "rewards/margins": 0.09035841375589371, + "rewards/rejected": -0.4251410961151123, + "step": 85 + }, + { + "epoch": 0.65, + "grad_norm": 6.597580499931281, + "learning_rate": 4.353932584269663e-07, + "logps/chosen": -41.4986572265625, + "logps/rejected": -48.67082214355469, + "loss": 0.6519, + "losses/dpo": 0.6654509902000427, + "losses/sft": 1.462377905845642, + "losses/total": 0.6654509902000427, + "ref_logps/chosen": -37.87129211425781, + "ref_logps/rejected": -43.97351837158203, + "rewards/accuracies": 0.6171875, + "rewards/chosen": -0.3627370595932007, + "rewards/margins": 0.10699345916509628, + "rewards/rejected": -0.46973055601119995, + "step": 86 + }, + { + "epoch": 0.66, + "grad_norm": 6.479183906632411, + "learning_rate": 4.339887640449438e-07, + "logps/chosen": -41.78961944580078, + "logps/rejected": -47.387901306152344, + "loss": 0.6791, + "losses/dpo": 0.6730961799621582, + "losses/sft": 1.1305738687515259, + "losses/total": 0.6730961799621582, + "ref_logps/chosen": -37.842323303222656, + "ref_logps/rejected": -42.93647766113281, + "rewards/accuracies": 0.515625, + "rewards/chosen": -0.3947296738624573, + "rewards/margins": 0.05041254311800003, + "rewards/rejected": -0.4451422691345215, + "step": 87 + }, + { + "epoch": 0.66, + "grad_norm": 6.926719176011086, + "learning_rate": 4.3258426966292134e-07, + "logps/chosen": -43.21299743652344, + "logps/rejected": -47.084434509277344, + "loss": 0.6673, + "losses/dpo": 0.6536482572555542, + "losses/sft": 1.2500860691070557, + "losses/total": 0.6536482572555542, + "ref_logps/chosen": -39.38795471191406, + "ref_logps/rejected": -42.582969665527344, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.38250401616096497, + "rewards/margins": 0.06764230877161026, + "rewards/rejected": -0.45014631748199463, + "step": 88 + }, + { + "epoch": 0.67, + "grad_norm": 6.563435223333862, + "learning_rate": 4.311797752808989e-07, + "logps/chosen": -40.25920104980469, + "logps/rejected": -49.489097595214844, + "loss": 0.6508, + "losses/dpo": 0.6591900587081909, + "losses/sft": 1.3429124355316162, + "losses/total": 0.6591900587081909, + "ref_logps/chosen": -36.316349029541016, + "ref_logps/rejected": -44.49497985839844, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3942852020263672, + "rewards/margins": 0.1051262766122818, + "rewards/rejected": -0.4994114637374878, + "step": 89 + }, + { + "epoch": 0.68, + "grad_norm": 6.671599802331672, + "learning_rate": 4.297752808988764e-07, + "logps/chosen": -42.98493957519531, + "logps/rejected": -45.29029846191406, + "loss": 0.6775, + "losses/dpo": 0.6989056468009949, + "losses/sft": 1.7236398458480835, + "losses/total": 0.6989056468009949, + "ref_logps/chosen": -38.74510955810547, + "ref_logps/rejected": -40.55065155029297, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.4239833354949951, + "rewards/margins": 0.04998103156685829, + "rewards/rejected": -0.4739643633365631, + "step": 90 + }, + { + "epoch": 0.69, + "grad_norm": 6.921605107059482, + "learning_rate": 4.2837078651685396e-07, + "logps/chosen": -42.04779052734375, + "logps/rejected": -47.75447463989258, + "loss": 0.6577, + "losses/dpo": 0.6272084712982178, + "losses/sft": 1.5017703771591187, + "losses/total": 0.6272084712982178, + "ref_logps/chosen": -37.97248458862305, + "ref_logps/rejected": -42.7065315246582, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.40753045678138733, + "rewards/margins": 0.09726397693157196, + "rewards/rejected": -0.5047944784164429, + "step": 91 + }, + { + "epoch": 0.69, + "grad_norm": 6.759776662372792, + "learning_rate": 4.269662921348314e-07, + "logps/chosen": -44.38639831542969, + "logps/rejected": -53.21236038208008, + "loss": 0.6431, + "losses/dpo": 0.7165791988372803, + "losses/sft": 1.5609912872314453, + "losses/total": 0.7165791988372803, + "ref_logps/chosen": -39.95743942260742, + "ref_logps/rejected": -47.561588287353516, + "rewards/accuracies": 0.6171875, + "rewards/chosen": -0.4428956210613251, + "rewards/margins": 0.122181735932827, + "rewards/rejected": -0.5650773644447327, + "step": 92 + }, + { + "epoch": 0.7, + "grad_norm": 6.767540666000382, + "learning_rate": 4.2556179775280896e-07, + "logps/chosen": -39.6769905090332, + "logps/rejected": -45.86317443847656, + "loss": 0.6772, + "losses/dpo": 0.6330491900444031, + "losses/sft": 1.33146333694458, + "losses/total": 0.6330491900444031, + "ref_logps/chosen": -35.39988708496094, + "ref_logps/rejected": -41.038116455078125, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.42771056294441223, + "rewards/margins": 0.05479476973414421, + "rewards/rejected": -0.4825053811073303, + "step": 93 + }, + { + "epoch": 0.71, + "grad_norm": 6.521925535129618, + "learning_rate": 4.2415730337078647e-07, + "logps/chosen": -43.78227996826172, + "logps/rejected": -47.82459259033203, + "loss": 0.6607, + "losses/dpo": 0.6985595226287842, + "losses/sft": 1.530924677848816, + "losses/total": 0.6985595226287842, + "ref_logps/chosen": -39.28633499145508, + "ref_logps/rejected": -42.36700439453125, + "rewards/accuracies": 0.6015625, + "rewards/chosen": -0.4495944082736969, + "rewards/margins": 0.09616444259881973, + "rewards/rejected": -0.545758843421936, + "step": 94 + }, + { + "epoch": 0.72, + "grad_norm": 6.606025652021848, + "learning_rate": 4.22752808988764e-07, + "logps/chosen": -44.53917694091797, + "logps/rejected": -49.32555389404297, + "loss": 0.6783, + "losses/dpo": 0.6152039766311646, + "losses/sft": 1.5025076866149902, + "losses/total": 0.6152039766311646, + "ref_logps/chosen": -39.66822814941406, + "ref_logps/rejected": -43.90290451049805, + "rewards/accuracies": 0.6015625, + "rewards/chosen": -0.4870951175689697, + "rewards/margins": 0.05516959726810455, + "rewards/rejected": -0.5422646999359131, + "step": 95 + }, + { + "epoch": 0.72, + "grad_norm": 6.456362063653043, + "learning_rate": 4.2134831460674153e-07, + "logps/chosen": -40.66051483154297, + "logps/rejected": -50.35266876220703, + "loss": 0.6339, + "losses/dpo": 0.5940225124359131, + "losses/sft": 1.3329205513000488, + "losses/total": 0.5940225124359131, + "ref_logps/chosen": -36.583961486816406, + "ref_logps/rejected": -44.78839874267578, + "rewards/accuracies": 0.6796875, + "rewards/chosen": -0.40765535831451416, + "rewards/margins": 0.14877161383628845, + "rewards/rejected": -0.556427001953125, + "step": 96 + }, + { + "epoch": 0.73, + "grad_norm": 7.002732175851258, + "learning_rate": 4.199438202247191e-07, + "logps/chosen": -40.17961502075195, + "logps/rejected": -50.040138244628906, + "loss": 0.6253, + "losses/dpo": 0.597855806350708, + "losses/sft": 1.5503275394439697, + "losses/total": 0.597855806350708, + "ref_logps/chosen": -35.91657257080078, + "ref_logps/rejected": -44.05973815917969, + "rewards/accuracies": 0.7421875, + "rewards/chosen": -0.426303893327713, + "rewards/margins": 0.17173629999160767, + "rewards/rejected": -0.5980401635169983, + "step": 97 + }, + { + "epoch": 0.74, + "grad_norm": 6.7690281568226345, + "learning_rate": 4.1853932584269664e-07, + "logps/chosen": -43.26731872558594, + "logps/rejected": -48.155426025390625, + "loss": 0.6528, + "losses/dpo": 0.6972070932388306, + "losses/sft": 1.3802154064178467, + "losses/total": 0.6972070932388306, + "ref_logps/chosen": -38.819725036621094, + "ref_logps/rejected": -42.62653350830078, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.4447590410709381, + "rewards/margins": 0.10813023149967194, + "rewards/rejected": -0.5528892278671265, + "step": 98 + }, + { + "epoch": 0.75, + "grad_norm": 6.184786618255584, + "learning_rate": 4.1713483146067415e-07, + "logps/chosen": -39.052734375, + "logps/rejected": -45.65272521972656, + "loss": 0.6289, + "losses/dpo": 0.5835955142974854, + "losses/sft": 1.2479004859924316, + "losses/total": 0.5835955142974854, + "ref_logps/chosen": -35.05288314819336, + "ref_logps/rejected": -40.07136154174805, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.39998501539230347, + "rewards/margins": 0.15815110504627228, + "rewards/rejected": -0.5581361055374146, + "step": 99 + }, + { + "epoch": 0.75, + "grad_norm": 6.925410262368385, + "learning_rate": 4.157303370786517e-07, + "logps/chosen": -44.249752044677734, + "logps/rejected": -44.935245513916016, + "loss": 0.6711, + "losses/dpo": 0.5667402744293213, + "losses/sft": 1.424223780632019, + "losses/total": 0.5667402744293213, + "ref_logps/chosen": -38.972206115722656, + "ref_logps/rejected": -38.912513732910156, + "rewards/accuracies": 0.6484375, + "rewards/chosen": -0.5277543067932129, + "rewards/margins": 0.07451874017715454, + "rewards/rejected": -0.6022731065750122, + "step": 100 + }, + { + "epoch": 0.76, + "grad_norm": 6.586928303266985, + "learning_rate": 4.1432584269662915e-07, + "logps/chosen": -39.689693450927734, + "logps/rejected": -48.46234130859375, + "loss": 0.6509, + "losses/dpo": 0.6659662127494812, + "losses/sft": 1.3264880180358887, + "losses/total": 0.6659662127494812, + "ref_logps/chosen": -35.252235412597656, + "ref_logps/rejected": -42.87641525268555, + "rewards/accuracies": 0.6484375, + "rewards/chosen": -0.4437457323074341, + "rewards/margins": 0.11484652757644653, + "rewards/rejected": -0.5585922598838806, + "step": 101 + }, + { + "epoch": 0.77, + "grad_norm": 6.103700351208487, + "learning_rate": 4.129213483146067e-07, + "logps/chosen": -38.51823425292969, + "logps/rejected": -43.52346420288086, + "loss": 0.6509, + "losses/dpo": 0.6539372205734253, + "losses/sft": 1.3750677108764648, + "losses/total": 0.6539372205734253, + "ref_logps/chosen": -34.18145751953125, + "ref_logps/rejected": -38.04938507080078, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -0.4336775541305542, + "rewards/margins": 0.11373014003038406, + "rewards/rejected": -0.5474076867103577, + "step": 102 + }, + { + "epoch": 0.78, + "grad_norm": 6.228970412657457, + "learning_rate": 4.115168539325842e-07, + "logps/chosen": -42.1187629699707, + "logps/rejected": -47.93737030029297, + "loss": 0.6451, + "losses/dpo": 0.646047055721283, + "losses/sft": 1.497565507888794, + "losses/total": 0.646047055721283, + "ref_logps/chosen": -37.47208786010742, + "ref_logps/rejected": -42.03216552734375, + "rewards/accuracies": 0.6328125, + "rewards/chosen": -0.4646672010421753, + "rewards/margins": 0.12585340440273285, + "rewards/rejected": -0.5905206203460693, + "step": 103 + }, + { + "epoch": 0.78, + "grad_norm": 6.892504983818195, + "learning_rate": 4.1011235955056177e-07, + "logps/chosen": -42.86591339111328, + "logps/rejected": -48.31887435913086, + "loss": 0.6494, + "losses/dpo": 0.6817602515220642, + "losses/sft": 1.5651347637176514, + "losses/total": 0.6817602515220642, + "ref_logps/chosen": -37.89812088012695, + "ref_logps/rejected": -42.230018615722656, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -0.4967789351940155, + "rewards/margins": 0.11210669577121735, + "rewards/rejected": -0.608885645866394, + "step": 104 + }, + { + "epoch": 0.79, + "grad_norm": 6.344274461611194, + "learning_rate": 4.0870786516853933e-07, + "logps/chosen": -38.07393264770508, + "logps/rejected": -46.695167541503906, + "loss": 0.6485, + "losses/dpo": 0.6693782806396484, + "losses/sft": 1.4964573383331299, + "losses/total": 0.6693782806396484, + "ref_logps/chosen": -33.26963806152344, + "ref_logps/rejected": -40.731658935546875, + "rewards/accuracies": 0.6015625, + "rewards/chosen": -0.48042935132980347, + "rewards/margins": 0.11592163890600204, + "rewards/rejected": -0.5963510274887085, + "step": 105 + }, + { + "epoch": 0.8, + "grad_norm": 6.340224167086584, + "learning_rate": 4.0730337078651683e-07, + "logps/chosen": -34.64811706542969, + "logps/rejected": -44.656005859375, + "loss": 0.6748, + "losses/dpo": 0.7200191020965576, + "losses/sft": 1.2917957305908203, + "losses/total": 0.7200191020965576, + "ref_logps/chosen": -29.991100311279297, + "ref_logps/rejected": -39.274078369140625, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.4657020568847656, + "rewards/margins": 0.07249079644680023, + "rewards/rejected": -0.5381928086280823, + "step": 106 + }, + { + "epoch": 0.81, + "grad_norm": 6.810734673228144, + "learning_rate": 4.058988764044944e-07, + "logps/chosen": -43.92599868774414, + "logps/rejected": -51.205841064453125, + "loss": 0.6618, + "losses/dpo": 0.7375708818435669, + "losses/sft": 1.6257060766220093, + "losses/total": 0.7375708818435669, + "ref_logps/chosen": -38.43563461303711, + "ref_logps/rejected": -44.785430908203125, + "rewards/accuracies": 0.5859375, + "rewards/chosen": -0.5490366220474243, + "rewards/margins": 0.09300415217876434, + "rewards/rejected": -0.6420407295227051, + "step": 107 + }, + { + "epoch": 0.82, + "grad_norm": 6.646468325900178, + "learning_rate": 4.044943820224719e-07, + "logps/chosen": -41.907615661621094, + "logps/rejected": -47.17523956298828, + "loss": 0.6665, + "losses/dpo": 0.6535848379135132, + "losses/sft": 1.5487432479858398, + "losses/total": 0.6535848379135132, + "ref_logps/chosen": -36.636451721191406, + "ref_logps/rejected": -41.005767822265625, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5271163582801819, + "rewards/margins": 0.08983068913221359, + "rewards/rejected": -0.6169470548629761, + "step": 108 + }, + { + "epoch": 0.82, + "grad_norm": 6.796059919133426, + "learning_rate": 4.0308988764044945e-07, + "logps/chosen": -43.95292663574219, + "logps/rejected": -48.59518814086914, + "loss": 0.6619, + "losses/dpo": 0.6743461489677429, + "losses/sft": 1.5721744298934937, + "losses/total": 0.6743461489677429, + "ref_logps/chosen": -38.58289337158203, + "ref_logps/rejected": -42.209529876708984, + "rewards/accuracies": 0.5859375, + "rewards/chosen": -0.5370036363601685, + "rewards/margins": 0.10156210511922836, + "rewards/rejected": -0.6385657787322998, + "step": 109 + }, + { + "epoch": 0.83, + "grad_norm": 6.472585584476915, + "learning_rate": 4.0168539325842696e-07, + "logps/chosen": -40.216651916503906, + "logps/rejected": -45.985801696777344, + "loss": 0.6793, + "losses/dpo": 0.7015002965927124, + "losses/sft": 1.661520004272461, + "losses/total": 0.7015002965927124, + "ref_logps/chosen": -34.707008361816406, + "ref_logps/rejected": -39.90086364746094, + "rewards/accuracies": 0.5859375, + "rewards/chosen": -0.5509647130966187, + "rewards/margins": 0.05752916634082794, + "rewards/rejected": -0.6084938049316406, + "step": 110 + }, + { + "epoch": 0.84, + "grad_norm": 6.441943056630329, + "learning_rate": 4.0028089887640446e-07, + "logps/chosen": -40.84614944458008, + "logps/rejected": -49.78240966796875, + "loss": 0.6495, + "losses/dpo": 0.6826507449150085, + "losses/sft": 1.6292600631713867, + "losses/total": 0.6826507449150085, + "ref_logps/chosen": -35.30500411987305, + "ref_logps/rejected": -43.05198669433594, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.554114580154419, + "rewards/margins": 0.1189279854297638, + "rewards/rejected": -0.6730425953865051, + "step": 111 + }, + { + "epoch": 0.85, + "grad_norm": 6.739697461780844, + "learning_rate": 3.9887640449438196e-07, + "logps/chosen": -43.101287841796875, + "logps/rejected": -51.60324478149414, + "loss": 0.6365, + "losses/dpo": 0.617262601852417, + "losses/sft": 1.4229466915130615, + "losses/total": 0.617262601852417, + "ref_logps/chosen": -37.866512298583984, + "ref_logps/rejected": -44.80317687988281, + "rewards/accuracies": 0.6328125, + "rewards/chosen": -0.52347731590271, + "rewards/margins": 0.1565295159816742, + "rewards/rejected": -0.6800068020820618, + "step": 112 + }, + { + "epoch": 0.85, + "grad_norm": 7.023593025692052, + "learning_rate": 3.974719101123595e-07, + "logps/chosen": -41.23580551147461, + "logps/rejected": -52.932403564453125, + "loss": 0.621, + "losses/dpo": 0.6078984141349792, + "losses/sft": 1.4510893821716309, + "losses/total": 0.6078984141349792, + "ref_logps/chosen": -36.275489807128906, + "ref_logps/rejected": -46.033939361572266, + "rewards/accuracies": 0.6796875, + "rewards/chosen": -0.49603164196014404, + "rewards/margins": 0.19381484389305115, + "rewards/rejected": -0.6898465156555176, + "step": 113 + }, + { + "epoch": 0.86, + "grad_norm": 6.441040652132362, + "learning_rate": 3.960674157303371e-07, + "logps/chosen": -38.938751220703125, + "logps/rejected": -47.65938186645508, + "loss": 0.6436, + "losses/dpo": 0.6575403809547424, + "losses/sft": 1.4100581407546997, + "losses/total": 0.6575403809547424, + "ref_logps/chosen": -33.69282531738281, + "ref_logps/rejected": -41.06393814086914, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.5245928764343262, + "rewards/margins": 0.13495120406150818, + "rewards/rejected": -0.6595441102981567, + "step": 114 + }, + { + "epoch": 0.87, + "grad_norm": 6.831582112977574, + "learning_rate": 3.946629213483146e-07, + "logps/chosen": -41.802799224853516, + "logps/rejected": -49.96432876586914, + "loss": 0.6368, + "losses/dpo": 0.6237789392471313, + "losses/sft": 1.5177757740020752, + "losses/total": 0.6237789392471313, + "ref_logps/chosen": -36.240840911865234, + "ref_logps/rejected": -42.76991653442383, + "rewards/accuracies": 0.6484375, + "rewards/chosen": -0.5561960935592651, + "rewards/margins": 0.16324520111083984, + "rewards/rejected": -0.719441294670105, + "step": 115 + }, + { + "epoch": 0.88, + "grad_norm": 7.410431235906546, + "learning_rate": 3.9325842696629214e-07, + "logps/chosen": -45.350669860839844, + "logps/rejected": -48.64668655395508, + "loss": 0.6848, + "losses/dpo": 0.746525228023529, + "losses/sft": 1.8295865058898926, + "losses/total": 0.746525228023529, + "ref_logps/chosen": -39.40565490722656, + "ref_logps/rejected": -42.114837646484375, + "rewards/accuracies": 0.6015625, + "rewards/chosen": -0.5945014357566833, + "rewards/margins": 0.058684106916189194, + "rewards/rejected": -0.6531856060028076, + "step": 116 + }, + { + "epoch": 0.88, + "grad_norm": 6.7521875642568245, + "learning_rate": 3.9185393258426964e-07, + "logps/chosen": -42.558738708496094, + "logps/rejected": -48.05232238769531, + "loss": 0.6395, + "losses/dpo": 0.6771230697631836, + "losses/sft": 1.4978280067443848, + "losses/total": 0.6771230697631836, + "ref_logps/chosen": -37.11561584472656, + "ref_logps/rejected": -41.03240203857422, + "rewards/accuracies": 0.6171875, + "rewards/chosen": -0.5443119406700134, + "rewards/margins": 0.1576804369688034, + "rewards/rejected": -0.701992392539978, + "step": 117 + }, + { + "epoch": 0.89, + "grad_norm": 6.852926877450363, + "learning_rate": 3.904494382022472e-07, + "logps/chosen": -43.11158752441406, + "logps/rejected": -50.49040985107422, + "loss": 0.6294, + "losses/dpo": 0.6048296689987183, + "losses/sft": 1.4263670444488525, + "losses/total": 0.6048296689987183, + "ref_logps/chosen": -37.525428771972656, + "ref_logps/rejected": -43.2536735534668, + "rewards/accuracies": 0.6953125, + "rewards/chosen": -0.5586156845092773, + "rewards/margins": 0.16505761444568634, + "rewards/rejected": -0.7236733436584473, + "step": 118 + }, + { + "epoch": 0.9, + "grad_norm": 7.482804237101483, + "learning_rate": 3.890449438202247e-07, + "logps/chosen": -42.81207275390625, + "logps/rejected": -48.305213928222656, + "loss": 0.6723, + "losses/dpo": 0.6438789367675781, + "losses/sft": 1.3842287063598633, + "losses/total": 0.6438789367675781, + "ref_logps/chosen": -37.165802001953125, + "ref_logps/rejected": -41.8455696105957, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5646266937255859, + "rewards/margins": 0.08133774995803833, + "rewards/rejected": -0.6459644436836243, + "step": 119 + }, + { + "epoch": 0.91, + "grad_norm": 6.612121590764, + "learning_rate": 3.876404494382022e-07, + "logps/chosen": -40.42414093017578, + "logps/rejected": -49.36077880859375, + "loss": 0.6449, + "losses/dpo": 0.6813696622848511, + "losses/sft": 1.6905653476715088, + "losses/total": 0.6813696622848511, + "ref_logps/chosen": -34.49721145629883, + "ref_logps/rejected": -42.062259674072266, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.5926928520202637, + "rewards/margins": 0.13715943694114685, + "rewards/rejected": -0.7298523187637329, + "step": 120 + }, + { + "epoch": 0.91, + "grad_norm": 6.529474424321076, + "learning_rate": 3.8623595505617977e-07, + "logps/chosen": -43.11798095703125, + "logps/rejected": -51.801422119140625, + "loss": 0.6396, + "losses/dpo": 0.6661785840988159, + "losses/sft": 1.4052226543426514, + "losses/total": 0.6661785840988159, + "ref_logps/chosen": -37.71819305419922, + "ref_logps/rejected": -44.816795349121094, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.5399786233901978, + "rewards/margins": 0.15848389267921448, + "rewards/rejected": -0.6984626054763794, + "step": 121 + }, + { + "epoch": 0.92, + "grad_norm": 6.759925517450978, + "learning_rate": 3.8483146067415727e-07, + "logps/chosen": -44.635955810546875, + "logps/rejected": -53.689002990722656, + "loss": 0.6172, + "losses/dpo": 0.5884010195732117, + "losses/sft": 1.7550606727600098, + "losses/total": 0.5884010195732117, + "ref_logps/chosen": -38.62323760986328, + "ref_logps/rejected": -45.53942108154297, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6012718677520752, + "rewards/margins": 0.21368616819381714, + "rewards/rejected": -0.8149580359458923, + "step": 122 + }, + { + "epoch": 0.93, + "grad_norm": 6.5150883611431, + "learning_rate": 3.834269662921348e-07, + "logps/chosen": -41.34044647216797, + "logps/rejected": -50.817466735839844, + "loss": 0.6373, + "losses/dpo": 0.6241766214370728, + "losses/sft": 1.3161594867706299, + "losses/total": 0.6241766214370728, + "ref_logps/chosen": -35.719482421875, + "ref_logps/rejected": -43.593727111816406, + "rewards/accuracies": 0.6796875, + "rewards/chosen": -0.562096118927002, + "rewards/margins": 0.16027754545211792, + "rewards/rejected": -0.7223736047744751, + "step": 123 + }, + { + "epoch": 0.94, + "grad_norm": 7.117780641865591, + "learning_rate": 3.8202247191011233e-07, + "logps/chosen": -40.98164367675781, + "logps/rejected": -46.440032958984375, + "loss": 0.6259, + "losses/dpo": 0.7275031805038452, + "losses/sft": 1.3312557935714722, + "losses/total": 0.7275031805038452, + "ref_logps/chosen": -36.18989181518555, + "ref_logps/rejected": -39.75829315185547, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -0.4791754186153412, + "rewards/margins": 0.1889985203742981, + "rewards/rejected": -0.6681739091873169, + "step": 124 + }, + { + "epoch": 0.94, + "grad_norm": 6.810891199884393, + "learning_rate": 3.806179775280899e-07, + "logps/chosen": -45.05056381225586, + "logps/rejected": -51.01411819458008, + "loss": 0.6422, + "losses/dpo": 0.5984268188476562, + "losses/sft": 1.6079349517822266, + "losses/total": 0.5984268188476562, + "ref_logps/chosen": -39.050132751464844, + "ref_logps/rejected": -43.51178741455078, + "rewards/accuracies": 0.7109375, + "rewards/chosen": -0.6000430583953857, + "rewards/margins": 0.15018987655639648, + "rewards/rejected": -0.750232994556427, + "step": 125 + }, + { + "epoch": 0.95, + "grad_norm": 7.000300157233934, + "learning_rate": 3.792134831460674e-07, + "logps/chosen": -45.488685607910156, + "logps/rejected": -53.24082946777344, + "loss": 0.6293, + "losses/dpo": 0.6512585878372192, + "losses/sft": 1.70095694065094, + "losses/total": 0.6512585878372192, + "ref_logps/chosen": -39.463134765625, + "ref_logps/rejected": -45.5393180847168, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -0.602555513381958, + "rewards/margins": 0.16759565472602844, + "rewards/rejected": -0.7701511383056641, + "step": 126 + }, + { + "epoch": 0.96, + "grad_norm": 6.866553634275337, + "learning_rate": 3.7780898876404495e-07, + "logps/chosen": -46.352333068847656, + "logps/rejected": -50.944129943847656, + "loss": 0.6501, + "losses/dpo": 0.7261393070220947, + "losses/sft": 1.7794756889343262, + "losses/total": 0.7261393070220947, + "ref_logps/chosen": -40.64281463623047, + "ref_logps/rejected": -43.911109924316406, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -0.5709517002105713, + "rewards/margins": 0.13234999775886536, + "rewards/rejected": -0.7033016681671143, + "step": 127 + }, + { + "epoch": 0.97, + "grad_norm": 7.1746623953368385, + "learning_rate": 3.7640449438202245e-07, + "logps/chosen": -42.81266403198242, + "logps/rejected": -47.58647155761719, + "loss": 0.6484, + "losses/dpo": 0.6509548425674438, + "losses/sft": 1.4893585443496704, + "losses/total": 0.6509548425674438, + "ref_logps/chosen": -36.92261505126953, + "ref_logps/rejected": -40.24052429199219, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.5890049338340759, + "rewards/margins": 0.14558979868888855, + "rewards/rejected": -0.7345947623252869, + "step": 128 + }, + { + "epoch": 0.97, + "grad_norm": 6.639733312989805, + "learning_rate": 3.75e-07, + "logps/chosen": -42.699241638183594, + "logps/rejected": -49.38917922973633, + "loss": 0.6384, + "losses/dpo": 0.7398217916488647, + "losses/sft": 1.8203296661376953, + "losses/total": 0.7398217916488647, + "ref_logps/chosen": -36.63134002685547, + "ref_logps/rejected": -41.697608947753906, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -0.6067899465560913, + "rewards/margins": 0.16236720979213715, + "rewards/rejected": -0.7691571712493896, + "step": 129 + }, + { + "epoch": 0.98, + "grad_norm": 6.840872832592906, + "learning_rate": 3.735955056179775e-07, + "logps/chosen": -38.55268096923828, + "logps/rejected": -46.57276153564453, + "loss": 0.6455, + "losses/dpo": 0.6873192191123962, + "losses/sft": 1.3916716575622559, + "losses/total": 0.6873192191123962, + "ref_logps/chosen": -33.59415054321289, + "ref_logps/rejected": -40.15166091918945, + "rewards/accuracies": 0.6484375, + "rewards/chosen": -0.4958529472351074, + "rewards/margins": 0.14625714719295502, + "rewards/rejected": -0.6421101093292236, + "step": 130 + }, + { + "epoch": 0.99, + "grad_norm": 7.001016051342946, + "learning_rate": 3.72191011235955e-07, + "logps/chosen": -42.6187629699707, + "logps/rejected": -47.61820983886719, + "loss": 0.6507, + "losses/dpo": 0.555785059928894, + "losses/sft": 1.6289881467819214, + "losses/total": 0.555785059928894, + "ref_logps/chosen": -36.840423583984375, + "ref_logps/rejected": -40.43097686767578, + "rewards/accuracies": 0.6015625, + "rewards/chosen": -0.5778340101242065, + "rewards/margins": 0.14088886976242065, + "rewards/rejected": -0.7187228202819824, + "step": 131 + }, + { + "epoch": 1.0, + "grad_norm": 6.69674019742392, + "learning_rate": 3.707865168539326e-07, + "logps/chosen": -42.53615951538086, + "logps/rejected": -50.24591064453125, + "loss": 0.639, + "losses/dpo": 0.658089280128479, + "losses/sft": 1.6500622034072876, + "losses/total": 0.658089280128479, + "ref_logps/chosen": -36.53786087036133, + "ref_logps/rejected": -42.35614776611328, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.599829912185669, + "rewards/margins": 0.1891460418701172, + "rewards/rejected": -0.7889760136604309, + "step": 132 + }, + { + "epoch": 1.0, + "grad_norm": 6.641666012117393, + "learning_rate": 3.693820224719101e-07, + "logps/chosen": -40.87996292114258, + "logps/rejected": -51.007049560546875, + "loss": 0.6244, + "losses/dpo": 0.6069691181182861, + "losses/sft": 1.3164952993392944, + "losses/total": 0.6069691181182861, + "ref_logps/chosen": -35.29632568359375, + "ref_logps/rejected": -43.411521911621094, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -0.5583640933036804, + "rewards/margins": 0.20118848979473114, + "rewards/rejected": -0.7595525979995728, + "step": 133 + }, + { + "epoch": 1.01, + "grad_norm": 6.588946335066842, + "learning_rate": 3.6797752808988764e-07, + "logps/chosen": -42.399169921875, + "logps/rejected": -51.85491180419922, + "loss": 0.6191, + "losses/dpo": 0.5947903394699097, + "losses/sft": 1.4862775802612305, + "losses/total": 0.5947903394699097, + "ref_logps/chosen": -36.425140380859375, + "ref_logps/rejected": -43.687225341796875, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.5974029898643494, + "rewards/margins": 0.2193659394979477, + "rewards/rejected": -0.8167688846588135, + "step": 134 + }, + { + "epoch": 1.02, + "grad_norm": 6.690035072129239, + "learning_rate": 3.6657303370786514e-07, + "logps/chosen": -41.394432067871094, + "logps/rejected": -53.464168548583984, + "loss": 0.6, + "losses/dpo": 0.6503059267997742, + "losses/sft": 1.5431207418441772, + "losses/total": 0.6503059267997742, + "ref_logps/chosen": -35.42436218261719, + "ref_logps/rejected": -44.949180603027344, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5970069766044617, + "rewards/margins": 0.25449231266975403, + "rewards/rejected": -0.8514993190765381, + "step": 135 + }, + { + "epoch": 1.03, + "grad_norm": 7.159913988692313, + "learning_rate": 3.651685393258427e-07, + "logps/chosen": -47.11418533325195, + "logps/rejected": -49.349937438964844, + "loss": 0.6642, + "losses/dpo": 0.70440673828125, + "losses/sft": 1.6172586679458618, + "losses/total": 0.70440673828125, + "ref_logps/chosen": -40.66786193847656, + "ref_logps/rejected": -41.70207977294922, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.6446323394775391, + "rewards/margins": 0.12015305459499359, + "rewards/rejected": -0.7647854685783386, + "step": 136 + }, + { + "epoch": 1.03, + "grad_norm": 6.861296978626527, + "learning_rate": 3.637640449438202e-07, + "logps/chosen": -41.73448181152344, + "logps/rejected": -49.953067779541016, + "loss": 0.6114, + "losses/dpo": 0.6396130323410034, + "losses/sft": 1.41642427444458, + "losses/total": 0.6396130323410034, + "ref_logps/chosen": -36.14557647705078, + "ref_logps/rejected": -42.22886657714844, + "rewards/accuracies": 0.6796875, + "rewards/chosen": -0.5588902235031128, + "rewards/margins": 0.21353021264076233, + "rewards/rejected": -0.7724204063415527, + "step": 137 + }, + { + "epoch": 1.04, + "grad_norm": 6.8531162605456055, + "learning_rate": 3.6235955056179776e-07, + "logps/chosen": -40.27118682861328, + "logps/rejected": -51.58380889892578, + "loss": 0.6205, + "losses/dpo": 0.6927103400230408, + "losses/sft": 1.5446867942810059, + "losses/total": 0.6927103400230408, + "ref_logps/chosen": -34.374351501464844, + "ref_logps/rejected": -43.53186798095703, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5896837115287781, + "rewards/margins": 0.2155105173587799, + "rewards/rejected": -0.8051942586898804, + "step": 138 + }, + { + "epoch": 1.05, + "grad_norm": 6.581792409395151, + "learning_rate": 3.6095505617977526e-07, + "logps/chosen": -40.182411193847656, + "logps/rejected": -50.48395538330078, + "loss": 0.6351, + "losses/dpo": 0.5907813906669617, + "losses/sft": 1.5051367282867432, + "losses/total": 0.5907813906669617, + "ref_logps/chosen": -34.705772399902344, + "ref_logps/rejected": -43.20262908935547, + "rewards/accuracies": 0.6796875, + "rewards/chosen": -0.5476638078689575, + "rewards/margins": 0.18046864867210388, + "rewards/rejected": -0.728132426738739, + "step": 139 + }, + { + "epoch": 1.06, + "grad_norm": 7.099683859328159, + "learning_rate": 3.5955056179775277e-07, + "logps/chosen": -47.343101501464844, + "logps/rejected": -54.016380310058594, + "loss": 0.5962, + "losses/dpo": 0.5484156012535095, + "losses/sft": 1.3340450525283813, + "losses/total": 0.5484156012535095, + "ref_logps/chosen": -40.93060302734375, + "ref_logps/rejected": -45.07525634765625, + "rewards/accuracies": 0.7265625, + "rewards/chosen": -0.6412495374679565, + "rewards/margins": 0.2528632581233978, + "rewards/rejected": -0.8941128253936768, + "step": 140 + }, + { + "epoch": 1.06, + "grad_norm": 6.6246858434595435, + "learning_rate": 3.581460674157303e-07, + "logps/chosen": -42.795745849609375, + "logps/rejected": -50.99128341674805, + "loss": 0.6105, + "losses/dpo": 0.6333677768707275, + "losses/sft": 1.7310549020767212, + "losses/total": 0.6333677768707275, + "ref_logps/chosen": -36.29172897338867, + "ref_logps/rejected": -42.22053527832031, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.6504020690917969, + "rewards/margins": 0.2266732007265091, + "rewards/rejected": -0.8770751953125, + "step": 141 + }, + { + "epoch": 1.07, + "grad_norm": 6.585988374877859, + "learning_rate": 3.5674157303370783e-07, + "logps/chosen": -39.767051696777344, + "logps/rejected": -45.62493896484375, + "loss": 0.6184, + "losses/dpo": 0.6107473373413086, + "losses/sft": 1.3934905529022217, + "losses/total": 0.6107473373413086, + "ref_logps/chosen": -34.30314636230469, + "ref_logps/rejected": -38.085365295410156, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5463899374008179, + "rewards/margins": 0.20756718516349792, + "rewards/rejected": -0.7539570927619934, + "step": 142 + }, + { + "epoch": 1.08, + "grad_norm": 7.605347145021406, + "learning_rate": 3.553370786516854e-07, + "logps/chosen": -45.32689666748047, + "logps/rejected": -53.28538513183594, + "loss": 0.6396, + "losses/dpo": 0.6139785051345825, + "losses/sft": 1.4996390342712402, + "losses/total": 0.6139785051345825, + "ref_logps/chosen": -38.790870666503906, + "ref_logps/rejected": -44.86207580566406, + "rewards/accuracies": 0.6796875, + "rewards/chosen": -0.6536027789115906, + "rewards/margins": 0.18872803449630737, + "rewards/rejected": -0.842330813407898, + "step": 143 + }, + { + "epoch": 1.09, + "grad_norm": 6.375447848895096, + "learning_rate": 3.539325842696629e-07, + "logps/chosen": -39.950706481933594, + "logps/rejected": -52.67605972290039, + "loss": 0.6352, + "losses/dpo": 0.562360405921936, + "losses/sft": 1.5273569822311401, + "losses/total": 0.562360405921936, + "ref_logps/chosen": -33.4902458190918, + "ref_logps/rejected": -44.185813903808594, + "rewards/accuracies": 0.6953125, + "rewards/chosen": -0.6460464000701904, + "rewards/margins": 0.20297789573669434, + "rewards/rejected": -0.8490242958068848, + "step": 144 + }, + { + "epoch": 1.09, + "grad_norm": 6.570913856555609, + "learning_rate": 3.5252808988764045e-07, + "logps/chosen": -41.82993698120117, + "logps/rejected": -49.303138732910156, + "loss": 0.616, + "losses/dpo": 0.6345305442810059, + "losses/sft": 1.6690033674240112, + "losses/total": 0.6345305442810059, + "ref_logps/chosen": -36.14568328857422, + "ref_logps/rejected": -41.49669647216797, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -0.5684253573417664, + "rewards/margins": 0.2122190296649933, + "rewards/rejected": -0.7806443572044373, + "step": 145 + }, + { + "epoch": 1.1, + "grad_norm": 6.513567190172644, + "learning_rate": 3.51123595505618e-07, + "logps/chosen": -41.52465057373047, + "logps/rejected": -53.12078094482422, + "loss": 0.6021, + "losses/dpo": 0.6336867809295654, + "losses/sft": 1.6118431091308594, + "losses/total": 0.6336867809295654, + "ref_logps/chosen": -35.433265686035156, + "ref_logps/rejected": -44.33394241333008, + "rewards/accuracies": 0.7109375, + "rewards/chosen": -0.6091387271881104, + "rewards/margins": 0.2695454955101013, + "rewards/rejected": -0.8786842823028564, + "step": 146 + }, + { + "epoch": 1.11, + "grad_norm": 7.159126675968495, + "learning_rate": 3.497191011235955e-07, + "logps/chosen": -45.985076904296875, + "logps/rejected": -56.15589904785156, + "loss": 0.6053, + "losses/dpo": 0.6400711536407471, + "losses/sft": 1.618746042251587, + "losses/total": 0.6400711536407471, + "ref_logps/chosen": -39.94523239135742, + "ref_logps/rejected": -47.24679946899414, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6039848327636719, + "rewards/margins": 0.2869252562522888, + "rewards/rejected": -0.8909100294113159, + "step": 147 + }, + { + "epoch": 1.12, + "grad_norm": 6.31408614588678, + "learning_rate": 3.48314606741573e-07, + "logps/chosen": -36.47071075439453, + "logps/rejected": -44.49042510986328, + "loss": 0.6059, + "losses/dpo": 0.6014193892478943, + "losses/sft": 1.4461239576339722, + "losses/total": 0.6014193892478943, + "ref_logps/chosen": -31.124897003173828, + "ref_logps/rejected": -36.84246063232422, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -0.5345816016197205, + "rewards/margins": 0.23021462559700012, + "rewards/rejected": -0.7647961974143982, + "step": 148 + }, + { + "epoch": 1.12, + "grad_norm": 6.681653921230616, + "learning_rate": 3.469101123595505e-07, + "logps/chosen": -47.83122634887695, + "logps/rejected": -53.77499008178711, + "loss": 0.5731, + "losses/dpo": 0.5383450388908386, + "losses/sft": 1.5721720457077026, + "losses/total": 0.5383450388908386, + "ref_logps/chosen": -41.766536712646484, + "ref_logps/rejected": -44.522464752197266, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.6064690947532654, + "rewards/margins": 0.3187834620475769, + "rewards/rejected": -0.9252525568008423, + "step": 149 + }, + { + "epoch": 1.13, + "grad_norm": 6.904604135283266, + "learning_rate": 3.4550561797752807e-07, + "logps/chosen": -44.048912048339844, + "logps/rejected": -51.318206787109375, + "loss": 0.6352, + "losses/dpo": 0.7056742906570435, + "losses/sft": 1.706668496131897, + "losses/total": 0.7056742906570435, + "ref_logps/chosen": -37.56008529663086, + "ref_logps/rejected": -42.8933219909668, + "rewards/accuracies": 0.6328125, + "rewards/chosen": -0.6488831043243408, + "rewards/margins": 0.19360551238059998, + "rewards/rejected": -0.8424886465072632, + "step": 150 + }, + { + "epoch": 1.14, + "grad_norm": 6.64022622762387, + "learning_rate": 3.441011235955056e-07, + "logps/chosen": -43.570884704589844, + "logps/rejected": -52.240318298339844, + "loss": 0.593, + "losses/dpo": 0.6028671264648438, + "losses/sft": 1.4086356163024902, + "losses/total": 0.6028671264648438, + "ref_logps/chosen": -38.0629768371582, + "ref_logps/rejected": -43.7314453125, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.5507906079292297, + "rewards/margins": 0.3000965714454651, + "rewards/rejected": -0.8508871793746948, + "step": 151 + }, + { + "epoch": 1.15, + "grad_norm": 6.233323532005839, + "learning_rate": 3.4269662921348313e-07, + "logps/chosen": -37.21058654785156, + "logps/rejected": -48.0037841796875, + "loss": 0.594, + "losses/dpo": 0.614506721496582, + "losses/sft": 1.60889732837677, + "losses/total": 0.614506721496582, + "ref_logps/chosen": -31.845245361328125, + "ref_logps/rejected": -39.9993782043457, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.536533772945404, + "rewards/margins": 0.2639070749282837, + "rewards/rejected": -0.800440788269043, + "step": 152 + }, + { + "epoch": 1.15, + "grad_norm": 6.3687654295239335, + "learning_rate": 3.4129213483146064e-07, + "logps/chosen": -38.31336975097656, + "logps/rejected": -46.770484924316406, + "loss": 0.6353, + "losses/dpo": 0.67460036277771, + "losses/sft": 1.5722901821136475, + "losses/total": 0.67460036277771, + "ref_logps/chosen": -32.139488220214844, + "ref_logps/rejected": -38.692901611328125, + "rewards/accuracies": 0.6484375, + "rewards/chosen": -0.617388129234314, + "rewards/margins": 0.1903703510761261, + "rewards/rejected": -0.8077584505081177, + "step": 153 + }, + { + "epoch": 1.16, + "grad_norm": 6.739574073940404, + "learning_rate": 3.398876404494382e-07, + "logps/chosen": -40.92176055908203, + "logps/rejected": -55.60083770751953, + "loss": 0.5768, + "losses/dpo": 0.6374070644378662, + "losses/sft": 1.6940921545028687, + "losses/total": 0.6374070644378662, + "ref_logps/chosen": -34.817481994628906, + "ref_logps/rejected": -46.21774673461914, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.6104279160499573, + "rewards/margins": 0.3278810977935791, + "rewards/rejected": -0.9383090138435364, + "step": 154 + }, + { + "epoch": 1.17, + "grad_norm": 6.879053261576145, + "learning_rate": 3.3848314606741575e-07, + "logps/chosen": -42.9251823425293, + "logps/rejected": -51.702964782714844, + "loss": 0.6139, + "losses/dpo": 0.6116975545883179, + "losses/sft": 1.40887451171875, + "losses/total": 0.6116975545883179, + "ref_logps/chosen": -37.03919982910156, + "ref_logps/rejected": -43.293392181396484, + "rewards/accuracies": 0.6796875, + "rewards/chosen": -0.5885984301567078, + "rewards/margins": 0.25235864520072937, + "rewards/rejected": -0.8409571051597595, + "step": 155 + }, + { + "epoch": 1.18, + "grad_norm": 6.694865071581937, + "learning_rate": 3.3707865168539325e-07, + "logps/chosen": -42.003990173339844, + "logps/rejected": -50.280548095703125, + "loss": 0.6314, + "losses/dpo": 0.6589547991752625, + "losses/sft": 1.475001573562622, + "losses/total": 0.6589547991752625, + "ref_logps/chosen": -35.952449798583984, + "ref_logps/rejected": -42.320465087890625, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.6051540374755859, + "rewards/margins": 0.1908540576696396, + "rewards/rejected": -0.7960080504417419, + "step": 156 + }, + { + "epoch": 1.18, + "grad_norm": 6.680369593277332, + "learning_rate": 3.356741573033708e-07, + "logps/chosen": -38.10074996948242, + "logps/rejected": -47.0694694519043, + "loss": 0.6165, + "losses/dpo": 0.5133580565452576, + "losses/sft": 1.357291340827942, + "losses/total": 0.5133580565452576, + "ref_logps/chosen": -32.30625915527344, + "ref_logps/rejected": -38.88528060913086, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -0.5794489979743958, + "rewards/margins": 0.23896978795528412, + "rewards/rejected": -0.8184187412261963, + "step": 157 + }, + { + "epoch": 1.19, + "grad_norm": 7.163827933877828, + "learning_rate": 3.3426966292134826e-07, + "logps/chosen": -43.57667541503906, + "logps/rejected": -53.30217742919922, + "loss": 0.5908, + "losses/dpo": 0.574418306350708, + "losses/sft": 1.7125813961029053, + "losses/total": 0.574418306350708, + "ref_logps/chosen": -37.39282989501953, + "ref_logps/rejected": -44.2191162109375, + "rewards/accuracies": 0.7421875, + "rewards/chosen": -0.6183844804763794, + "rewards/margins": 0.2899210453033447, + "rewards/rejected": -0.9083055257797241, + "step": 158 + }, + { + "epoch": 1.2, + "grad_norm": 7.534492030316663, + "learning_rate": 3.328651685393258e-07, + "logps/chosen": -43.87846755981445, + "logps/rejected": -50.815635681152344, + "loss": 0.6314, + "losses/dpo": 0.6066723465919495, + "losses/sft": 1.625878930091858, + "losses/total": 0.6066723465919495, + "ref_logps/chosen": -37.40098190307617, + "ref_logps/rejected": -42.1273193359375, + "rewards/accuracies": 0.6796875, + "rewards/chosen": -0.6477489471435547, + "rewards/margins": 0.2210829257965088, + "rewards/rejected": -0.8688318729400635, + "step": 159 + }, + { + "epoch": 1.21, + "grad_norm": 6.461969936591063, + "learning_rate": 3.314606741573033e-07, + "logps/chosen": -41.52513122558594, + "logps/rejected": -50.716064453125, + "loss": 0.5689, + "losses/dpo": 0.5582201480865479, + "losses/sft": 1.5073952674865723, + "losses/total": 0.5582201480865479, + "ref_logps/chosen": -35.848846435546875, + "ref_logps/rejected": -41.70889663696289, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.567628026008606, + "rewards/margins": 0.333088219165802, + "rewards/rejected": -0.9007163047790527, + "step": 160 + }, + { + "epoch": 1.22, + "grad_norm": 6.942966081986398, + "learning_rate": 3.300561797752809e-07, + "logps/chosen": -45.35464859008789, + "logps/rejected": -48.982810974121094, + "loss": 0.6365, + "losses/dpo": 0.5614318251609802, + "losses/sft": 1.7859472036361694, + "losses/total": 0.5614318251609802, + "ref_logps/chosen": -39.001136779785156, + "ref_logps/rejected": -40.674232482910156, + "rewards/accuracies": 0.6953125, + "rewards/chosen": -0.6353514790534973, + "rewards/margins": 0.19550636410713196, + "rewards/rejected": -0.8308578729629517, + "step": 161 + }, + { + "epoch": 1.22, + "grad_norm": 7.180092377812206, + "learning_rate": 3.2865168539325844e-07, + "logps/chosen": -45.20994186401367, + "logps/rejected": -52.836875915527344, + "loss": 0.6224, + "losses/dpo": 0.5493739247322083, + "losses/sft": 1.547910213470459, + "losses/total": 0.5493739247322083, + "ref_logps/chosen": -38.36485290527344, + "ref_logps/rejected": -43.65592575073242, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.684508740901947, + "rewards/margins": 0.2335864156484604, + "rewards/rejected": -0.9180951714515686, + "step": 162 + }, + { + "epoch": 1.23, + "grad_norm": 6.839328465422381, + "learning_rate": 3.2724719101123594e-07, + "logps/chosen": -44.02252197265625, + "logps/rejected": -48.20290756225586, + "loss": 0.6261, + "losses/dpo": 0.5617036819458008, + "losses/sft": 1.5726804733276367, + "losses/total": 0.5617036819458008, + "ref_logps/chosen": -37.56252670288086, + "ref_logps/rejected": -39.5811882019043, + "rewards/accuracies": 0.6328125, + "rewards/chosen": -0.6459991931915283, + "rewards/margins": 0.21617242693901062, + "rewards/rejected": -0.8621717095375061, + "step": 163 + }, + { + "epoch": 1.24, + "grad_norm": 7.080895585848076, + "learning_rate": 3.258426966292135e-07, + "logps/chosen": -44.87802505493164, + "logps/rejected": -53.4232292175293, + "loss": 0.6444, + "losses/dpo": 0.5438011884689331, + "losses/sft": 1.6607606410980225, + "losses/total": 0.5438011884689331, + "ref_logps/chosen": -37.73242950439453, + "ref_logps/rejected": -44.338134765625, + "rewards/accuracies": 0.6328125, + "rewards/chosen": -0.7145596146583557, + "rewards/margins": 0.1939493715763092, + "rewards/rejected": -0.9085089564323425, + "step": 164 + }, + { + "epoch": 1.25, + "grad_norm": 6.817784306171125, + "learning_rate": 3.24438202247191e-07, + "logps/chosen": -41.74745559692383, + "logps/rejected": -54.77409362792969, + "loss": 0.5757, + "losses/dpo": 0.5074477195739746, + "losses/sft": 1.4384926557540894, + "losses/total": 0.5074477195739746, + "ref_logps/chosen": -35.96852493286133, + "ref_logps/rejected": -45.66744613647461, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.5778931379318237, + "rewards/margins": 0.3327715992927551, + "rewards/rejected": -0.9106647372245789, + "step": 165 + }, + { + "epoch": 1.25, + "grad_norm": 6.73402019836184, + "learning_rate": 3.2303370786516856e-07, + "logps/chosen": -42.45885467529297, + "logps/rejected": -55.83628845214844, + "loss": 0.5582, + "losses/dpo": 0.5949134230613708, + "losses/sft": 1.7396336793899536, + "losses/total": 0.5949134230613708, + "ref_logps/chosen": -35.91239547729492, + "ref_logps/rejected": -45.41798400878906, + "rewards/accuracies": 0.7578125, + "rewards/chosen": -0.6546458601951599, + "rewards/margins": 0.387184739112854, + "rewards/rejected": -1.0418306589126587, + "step": 166 + }, + { + "epoch": 1.26, + "grad_norm": 6.484969223325532, + "learning_rate": 3.21629213483146e-07, + "logps/chosen": -39.55500030517578, + "logps/rejected": -53.75917053222656, + "loss": 0.5553, + "losses/dpo": 0.5379188060760498, + "losses/sft": 1.6754546165466309, + "losses/total": 0.5379188060760498, + "ref_logps/chosen": -33.612388610839844, + "ref_logps/rejected": -44.048587799072266, + "rewards/accuracies": 0.8046875, + "rewards/chosen": -0.5942604541778564, + "rewards/margins": 0.3767976760864258, + "rewards/rejected": -0.971058189868927, + "step": 167 + }, + { + "epoch": 1.27, + "grad_norm": 7.553429925940112, + "learning_rate": 3.2022471910112357e-07, + "logps/chosen": -42.48908615112305, + "logps/rejected": -53.703208923339844, + "loss": 0.6156, + "losses/dpo": 0.5968649983406067, + "losses/sft": 1.4021852016448975, + "losses/total": 0.5968649983406067, + "ref_logps/chosen": -36.02775573730469, + "ref_logps/rejected": -44.81937026977539, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.6461330056190491, + "rewards/margins": 0.24225082993507385, + "rewards/rejected": -0.8883838057518005, + "step": 168 + }, + { + "epoch": 1.28, + "grad_norm": 6.938904519141351, + "learning_rate": 3.1882022471910107e-07, + "logps/chosen": -41.907588958740234, + "logps/rejected": -51.34623718261719, + "loss": 0.606, + "losses/dpo": 0.5469992756843567, + "losses/sft": 1.526263952255249, + "losses/total": 0.5469992756843567, + "ref_logps/chosen": -35.054046630859375, + "ref_logps/rejected": -41.81439971923828, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -0.6853541135787964, + "rewards/margins": 0.26782965660095215, + "rewards/rejected": -0.9531837701797485, + "step": 169 + }, + { + "epoch": 1.28, + "grad_norm": 6.455751734226468, + "learning_rate": 3.1741573033707863e-07, + "logps/chosen": -43.997074127197266, + "logps/rejected": -52.1095085144043, + "loss": 0.5652, + "losses/dpo": 0.5796064138412476, + "losses/sft": 1.7088840007781982, + "losses/total": 0.5796064138412476, + "ref_logps/chosen": -37.47534942626953, + "ref_logps/rejected": -42.05298614501953, + "rewards/accuracies": 0.7265625, + "rewards/chosen": -0.6521726846694946, + "rewards/margins": 0.35347938537597656, + "rewards/rejected": -1.0056521892547607, + "step": 170 + }, + { + "epoch": 1.29, + "grad_norm": 6.925720662101029, + "learning_rate": 3.160112359550562e-07, + "logps/chosen": -42.9494743347168, + "logps/rejected": -51.66035461425781, + "loss": 0.5733, + "losses/dpo": 0.5771209001541138, + "losses/sft": 1.3783965110778809, + "losses/total": 0.5771209001541138, + "ref_logps/chosen": -36.78116989135742, + "ref_logps/rejected": -41.843605041503906, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.6168303489685059, + "rewards/margins": 0.36484503746032715, + "rewards/rejected": -0.981675386428833, + "step": 171 + }, + { + "epoch": 1.3, + "grad_norm": 7.01889024226675, + "learning_rate": 3.146067415730337e-07, + "logps/chosen": -45.94253158569336, + "logps/rejected": -52.16916275024414, + "loss": 0.6015, + "losses/dpo": 0.5898208618164062, + "losses/sft": 1.6218047142028809, + "losses/total": 0.5898208618164062, + "ref_logps/chosen": -39.25119400024414, + "ref_logps/rejected": -42.79926300048828, + "rewards/accuracies": 0.7109375, + "rewards/chosen": -0.669133722782135, + "rewards/margins": 0.2678561806678772, + "rewards/rejected": -0.936989963054657, + "step": 172 + }, + { + "epoch": 1.31, + "grad_norm": 7.495610381062607, + "learning_rate": 3.1320224719101125e-07, + "logps/chosen": -44.17959976196289, + "logps/rejected": -53.05359649658203, + "loss": 0.6002, + "losses/dpo": 0.630514919757843, + "losses/sft": 1.7863552570343018, + "losses/total": 0.630514919757843, + "ref_logps/chosen": -37.75782012939453, + "ref_logps/rejected": -43.63001251220703, + "rewards/accuracies": 0.6796875, + "rewards/chosen": -0.642177939414978, + "rewards/margins": 0.30018070340156555, + "rewards/rejected": -0.9423586130142212, + "step": 173 + }, + { + "epoch": 1.31, + "grad_norm": 7.4434324895050015, + "learning_rate": 3.1179775280898875e-07, + "logps/chosen": -47.16429901123047, + "logps/rejected": -53.266780853271484, + "loss": 0.6233, + "losses/dpo": 0.5390438437461853, + "losses/sft": 1.480837345123291, + "losses/total": 0.5390438437461853, + "ref_logps/chosen": -40.324798583984375, + "ref_logps/rejected": -44.025787353515625, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6839500069618225, + "rewards/margins": 0.24014970660209656, + "rewards/rejected": -0.9240997433662415, + "step": 174 + }, + { + "epoch": 1.32, + "grad_norm": 7.0617610747390644, + "learning_rate": 3.103932584269663e-07, + "logps/chosen": -45.060882568359375, + "logps/rejected": -54.979156494140625, + "loss": 0.5891, + "losses/dpo": 0.5568109750747681, + "losses/sft": 1.6295528411865234, + "losses/total": 0.5568109750747681, + "ref_logps/chosen": -37.929840087890625, + "ref_logps/rejected": -44.73991394042969, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7131036520004272, + "rewards/margins": 0.3108205795288086, + "rewards/rejected": -1.0239241123199463, + "step": 175 + }, + { + "epoch": 1.33, + "grad_norm": 6.663174112724157, + "learning_rate": 3.0898876404494376e-07, + "logps/chosen": -42.657920837402344, + "logps/rejected": -47.67673110961914, + "loss": 0.629, + "losses/dpo": 0.7223004102706909, + "losses/sft": 1.3237264156341553, + "losses/total": 0.7223004102706909, + "ref_logps/chosen": -36.358001708984375, + "ref_logps/rejected": -38.938507080078125, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -0.6299920082092285, + "rewards/margins": 0.24383032321929932, + "rewards/rejected": -0.8738222122192383, + "step": 176 + }, + { + "epoch": 1.34, + "grad_norm": 7.940936921413792, + "learning_rate": 3.075842696629213e-07, + "logps/chosen": -48.067203521728516, + "logps/rejected": -52.38322067260742, + "loss": 0.6277, + "losses/dpo": 0.6528229117393494, + "losses/sft": 1.5663461685180664, + "losses/total": 0.6528229117393494, + "ref_logps/chosen": -40.93121337890625, + "ref_logps/rejected": -42.879188537597656, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.7135992050170898, + "rewards/margins": 0.23680387437343597, + "rewards/rejected": -0.950403094291687, + "step": 177 + }, + { + "epoch": 1.34, + "grad_norm": 6.9909555671219366, + "learning_rate": 3.0617977528089887e-07, + "logps/chosen": -43.74810791015625, + "logps/rejected": -51.602622985839844, + "loss": 0.6134, + "losses/dpo": 0.6351133584976196, + "losses/sft": 1.548452615737915, + "losses/total": 0.6351133584976196, + "ref_logps/chosen": -36.539649963378906, + "ref_logps/rejected": -41.724647521972656, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -0.7208462953567505, + "rewards/margins": 0.26695096492767334, + "rewards/rejected": -0.9877973794937134, + "step": 178 + }, + { + "epoch": 1.35, + "grad_norm": 7.516487264439432, + "learning_rate": 3.047752808988764e-07, + "logps/chosen": -45.41664123535156, + "logps/rejected": -52.19443130493164, + "loss": 0.6325, + "losses/dpo": 0.6936246752738953, + "losses/sft": 1.418731451034546, + "losses/total": 0.6936246752738953, + "ref_logps/chosen": -37.89662170410156, + "ref_logps/rejected": -42.46479797363281, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.7520017623901367, + "rewards/margins": 0.22096163034439087, + "rewards/rejected": -0.9729634523391724, + "step": 179 + }, + { + "epoch": 1.36, + "grad_norm": 7.386682971240318, + "learning_rate": 3.0337078651685393e-07, + "logps/chosen": -44.32128143310547, + "logps/rejected": -56.52943420410156, + "loss": 0.589, + "losses/dpo": 0.50272536277771, + "losses/sft": 1.4175364971160889, + "losses/total": 0.50272536277771, + "ref_logps/chosen": -36.94242477416992, + "ref_logps/rejected": -45.8451042175293, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -0.7378860712051392, + "rewards/margins": 0.3305472731590271, + "rewards/rejected": -1.0684332847595215, + "step": 180 + }, + { + "epoch": 1.37, + "grad_norm": 7.070520625555024, + "learning_rate": 3.0196629213483144e-07, + "logps/chosen": -42.875099182128906, + "logps/rejected": -52.050384521484375, + "loss": 0.5815, + "losses/dpo": 0.6107900738716125, + "losses/sft": 1.7456879615783691, + "losses/total": 0.6107900738716125, + "ref_logps/chosen": -36.82639694213867, + "ref_logps/rejected": -42.541603088378906, + "rewards/accuracies": 0.6953125, + "rewards/chosen": -0.604870617389679, + "rewards/margins": 0.34600716829299927, + "rewards/rejected": -0.9508777856826782, + "step": 181 + }, + { + "epoch": 1.37, + "grad_norm": 6.880606354027993, + "learning_rate": 3.00561797752809e-07, + "logps/chosen": -44.50994110107422, + "logps/rejected": -50.33254623413086, + "loss": 0.5976, + "losses/dpo": 0.5528784990310669, + "losses/sft": 1.7669578790664673, + "losses/total": 0.5528784990310669, + "ref_logps/chosen": -37.44862365722656, + "ref_logps/rejected": -40.247039794921875, + "rewards/accuracies": 0.7109375, + "rewards/chosen": -0.7061322331428528, + "rewards/margins": 0.30241847038269043, + "rewards/rejected": -1.0085506439208984, + "step": 182 + }, + { + "epoch": 1.38, + "grad_norm": 7.02791538850361, + "learning_rate": 2.991573033707865e-07, + "logps/chosen": -43.87753677368164, + "logps/rejected": -49.34949493408203, + "loss": 0.6184, + "losses/dpo": 0.5282893180847168, + "losses/sft": 1.5792714357376099, + "losses/total": 0.5282893180847168, + "ref_logps/chosen": -36.59724044799805, + "ref_logps/rejected": -39.5660285949707, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.7280292510986328, + "rewards/margins": 0.25031745433807373, + "rewards/rejected": -0.9783467054367065, + "step": 183 + }, + { + "epoch": 1.39, + "grad_norm": 7.145778710825677, + "learning_rate": 2.9775280898876406e-07, + "logps/chosen": -44.12153625488281, + "logps/rejected": -50.03323745727539, + "loss": 0.5994, + "losses/dpo": 0.5445400476455688, + "losses/sft": 1.3715200424194336, + "losses/total": 0.5445400476455688, + "ref_logps/chosen": -36.779212951660156, + "ref_logps/rejected": -39.96929168701172, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.7342325448989868, + "rewards/margins": 0.2721615731716156, + "rewards/rejected": -1.0063941478729248, + "step": 184 + }, + { + "epoch": 1.4, + "grad_norm": 7.033244427636761, + "learning_rate": 2.9634831460674156e-07, + "logps/chosen": -44.52336120605469, + "logps/rejected": -52.357810974121094, + "loss": 0.5954, + "losses/dpo": 0.7007085084915161, + "losses/sft": 1.6124120950698853, + "losses/total": 0.7007085084915161, + "ref_logps/chosen": -37.63973617553711, + "ref_logps/rejected": -42.18433380126953, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6883625984191895, + "rewards/margins": 0.32898518443107605, + "rewards/rejected": -1.017347812652588, + "step": 185 + }, + { + "epoch": 1.4, + "grad_norm": 6.657232520968286, + "learning_rate": 2.9494382022471906e-07, + "logps/chosen": -41.616920471191406, + "logps/rejected": -52.240867614746094, + "loss": 0.5708, + "losses/dpo": 0.6222548484802246, + "losses/sft": 1.4851452112197876, + "losses/total": 0.6222548484802246, + "ref_logps/chosen": -35.22339630126953, + "ref_logps/rejected": -42.063621520996094, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.6393523216247559, + "rewards/margins": 0.37837234139442444, + "rewards/rejected": -1.0177247524261475, + "step": 186 + }, + { + "epoch": 1.41, + "grad_norm": 7.293022584639844, + "learning_rate": 2.935393258426966e-07, + "logps/chosen": -44.5795783996582, + "logps/rejected": -51.8583869934082, + "loss": 0.613, + "losses/dpo": 0.5626444816589355, + "losses/sft": 1.5801838636398315, + "losses/total": 0.5626444816589355, + "ref_logps/chosen": -37.548030853271484, + "ref_logps/rejected": -41.92090606689453, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7031550407409668, + "rewards/margins": 0.2905934154987335, + "rewards/rejected": -0.9937484264373779, + "step": 187 + }, + { + "epoch": 1.42, + "grad_norm": 6.7646034429768385, + "learning_rate": 2.921348314606741e-07, + "logps/chosen": -38.493064880371094, + "logps/rejected": -52.74738693237305, + "loss": 0.5673, + "losses/dpo": 0.5454456210136414, + "losses/sft": 1.4984629154205322, + "losses/total": 0.5454456210136414, + "ref_logps/chosen": -31.663148880004883, + "ref_logps/rejected": -42.071197509765625, + "rewards/accuracies": 0.6953125, + "rewards/chosen": -0.6829913854598999, + "rewards/margins": 0.38462772965431213, + "rewards/rejected": -1.0676190853118896, + "step": 188 + }, + { + "epoch": 1.43, + "grad_norm": 6.461217065826113, + "learning_rate": 2.907303370786517e-07, + "logps/chosen": -41.286556243896484, + "logps/rejected": -50.088348388671875, + "loss": 0.5702, + "losses/dpo": 0.5214348435401917, + "losses/sft": 1.4683022499084473, + "losses/total": 0.5214348435401917, + "ref_logps/chosen": -35.07402038574219, + "ref_logps/rejected": -40.096168518066406, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6212539672851562, + "rewards/margins": 0.377963662147522, + "rewards/rejected": -0.9992176294326782, + "step": 189 + }, + { + "epoch": 1.43, + "grad_norm": 7.005927567034465, + "learning_rate": 2.893258426966292e-07, + "logps/chosen": -40.85224533081055, + "logps/rejected": -47.50454330444336, + "loss": 0.6468, + "losses/dpo": 0.5624043941497803, + "losses/sft": 1.3726718425750732, + "losses/total": 0.5624043941497803, + "ref_logps/chosen": -34.436492919921875, + "ref_logps/rejected": -39.054630279541016, + "rewards/accuracies": 0.6328125, + "rewards/chosen": -0.6415754556655884, + "rewards/margins": 0.2034158706665039, + "rewards/rejected": -0.8449913263320923, + "step": 190 + }, + { + "epoch": 1.44, + "grad_norm": 7.483874035437063, + "learning_rate": 2.8792134831460674e-07, + "logps/chosen": -42.50736999511719, + "logps/rejected": -58.21019744873047, + "loss": 0.5523, + "losses/dpo": 0.6698145270347595, + "losses/sft": 1.5408368110656738, + "losses/total": 0.6698145270347595, + "ref_logps/chosen": -36.81276321411133, + "ref_logps/rejected": -48.2528076171875, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.5694608688354492, + "rewards/margins": 0.4262778162956238, + "rewards/rejected": -0.995738685131073, + "step": 191 + }, + { + "epoch": 1.45, + "grad_norm": 6.9825083587199, + "learning_rate": 2.8651685393258425e-07, + "logps/chosen": -45.57709503173828, + "logps/rejected": -53.14373016357422, + "loss": 0.5855, + "losses/dpo": 0.557357132434845, + "losses/sft": 1.6354026794433594, + "losses/total": 0.557357132434845, + "ref_logps/chosen": -39.346527099609375, + "ref_logps/rejected": -43.652854919433594, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.6230565309524536, + "rewards/margins": 0.32603132724761963, + "rewards/rejected": -0.9490878582000732, + "step": 192 + }, + { + "epoch": 1.46, + "grad_norm": 6.9248793596215314, + "learning_rate": 2.851123595505618e-07, + "logps/chosen": -41.4918098449707, + "logps/rejected": -51.642330169677734, + "loss": 0.614, + "losses/dpo": 0.5539823770523071, + "losses/sft": 1.4280143976211548, + "losses/total": 0.5539823770523071, + "ref_logps/chosen": -34.640289306640625, + "ref_logps/rejected": -42.01956558227539, + "rewards/accuracies": 0.6484375, + "rewards/chosen": -0.6851522326469421, + "rewards/margins": 0.2771243751049042, + "rewards/rejected": -0.9622765779495239, + "step": 193 + }, + { + "epoch": 1.46, + "grad_norm": 7.819580369685109, + "learning_rate": 2.8370786516853936e-07, + "logps/chosen": -45.17947769165039, + "logps/rejected": -54.26673126220703, + "loss": 0.5983, + "losses/dpo": 0.5532131195068359, + "losses/sft": 1.5786592960357666, + "losses/total": 0.5532131195068359, + "ref_logps/chosen": -38.547950744628906, + "ref_logps/rejected": -44.515296936035156, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6631526350975037, + "rewards/margins": 0.3119913339614868, + "rewards/rejected": -0.9751439094543457, + "step": 194 + }, + { + "epoch": 1.47, + "grad_norm": 6.908089828532824, + "learning_rate": 2.823033707865168e-07, + "logps/chosen": -39.231468200683594, + "logps/rejected": -55.21925735473633, + "loss": 0.5648, + "losses/dpo": 0.5973429083824158, + "losses/sft": 1.6660652160644531, + "losses/total": 0.5973429083824158, + "ref_logps/chosen": -32.866737365722656, + "ref_logps/rejected": -44.704856872558594, + "rewards/accuracies": 0.7421875, + "rewards/chosen": -0.6364729404449463, + "rewards/margins": 0.4149664640426636, + "rewards/rejected": -1.0514394044876099, + "step": 195 + }, + { + "epoch": 1.48, + "grad_norm": 6.726944591334497, + "learning_rate": 2.8089887640449437e-07, + "logps/chosen": -40.06050109863281, + "logps/rejected": -53.288673400878906, + "loss": 0.5791, + "losses/dpo": 0.5540711879730225, + "losses/sft": 1.7805967330932617, + "losses/total": 0.5540711879730225, + "ref_logps/chosen": -33.56330490112305, + "ref_logps/rejected": -42.997859954833984, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.649719774723053, + "rewards/margins": 0.37936151027679443, + "rewards/rejected": -1.0290813446044922, + "step": 196 + }, + { + "epoch": 1.49, + "grad_norm": 7.150851904029176, + "learning_rate": 2.794943820224719e-07, + "logps/chosen": -47.1893424987793, + "logps/rejected": -61.44281005859375, + "loss": 0.5702, + "losses/dpo": 0.7234626412391663, + "losses/sft": 1.6843864917755127, + "losses/total": 0.7234626412391663, + "ref_logps/chosen": -39.28502655029297, + "ref_logps/rejected": -49.627784729003906, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.7904319763183594, + "rewards/margins": 0.39107024669647217, + "rewards/rejected": -1.1815022230148315, + "step": 197 + }, + { + "epoch": 1.49, + "grad_norm": 7.427853846385361, + "learning_rate": 2.7808988764044943e-07, + "logps/chosen": -43.90837097167969, + "logps/rejected": -49.889678955078125, + "loss": 0.6097, + "losses/dpo": 0.5974606275558472, + "losses/sft": 1.7023361921310425, + "losses/total": 0.5974606275558472, + "ref_logps/chosen": -36.68721008300781, + "ref_logps/rejected": -39.80302047729492, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.722116231918335, + "rewards/margins": 0.2865493595600128, + "rewards/rejected": -1.0086655616760254, + "step": 198 + }, + { + "epoch": 1.5, + "grad_norm": 6.720052652716852, + "learning_rate": 2.7668539325842694e-07, + "logps/chosen": -40.47029495239258, + "logps/rejected": -52.58824157714844, + "loss": 0.5673, + "losses/dpo": 0.5275993347167969, + "losses/sft": 1.4116981029510498, + "losses/total": 0.5275993347167969, + "ref_logps/chosen": -33.68723678588867, + "ref_logps/rejected": -42.008827209472656, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6783058643341064, + "rewards/margins": 0.3796355426311493, + "rewards/rejected": -1.0579413175582886, + "step": 199 + }, + { + "epoch": 1.51, + "grad_norm": 7.464608685292226, + "learning_rate": 2.752808988764045e-07, + "logps/chosen": -46.24801254272461, + "logps/rejected": -54.933780670166016, + "loss": 0.61, + "losses/dpo": 0.6066948771476746, + "losses/sft": 1.6309008598327637, + "losses/total": 0.6066948771476746, + "ref_logps/chosen": -38.81559753417969, + "ref_logps/rejected": -44.58855438232422, + "rewards/accuracies": 0.6953125, + "rewards/chosen": -0.743241548538208, + "rewards/margins": 0.2912812829017639, + "rewards/rejected": -1.0345228910446167, + "step": 200 + }, + { + "epoch": 1.52, + "grad_norm": 7.360337757301619, + "learning_rate": 2.73876404494382e-07, + "logps/chosen": -42.876792907714844, + "logps/rejected": -50.461334228515625, + "loss": 0.6213, + "losses/dpo": 0.6310982704162598, + "losses/sft": 1.441427230834961, + "losses/total": 0.6310982704162598, + "ref_logps/chosen": -36.28274917602539, + "ref_logps/rejected": -41.36058044433594, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6594043970108032, + "rewards/margins": 0.2506704330444336, + "rewards/rejected": -0.9100748300552368, + "step": 201 + }, + { + "epoch": 1.52, + "grad_norm": 7.43302729298079, + "learning_rate": 2.7247191011235955e-07, + "logps/chosen": -43.45911407470703, + "logps/rejected": -50.20298385620117, + "loss": 0.5552, + "losses/dpo": 0.5599596500396729, + "losses/sft": 1.4739470481872559, + "losses/total": 0.5599596500396729, + "ref_logps/chosen": -37.075191497802734, + "ref_logps/rejected": -39.420005798339844, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.6383919715881348, + "rewards/margins": 0.4399053752422333, + "rewards/rejected": -1.0782973766326904, + "step": 202 + }, + { + "epoch": 1.53, + "grad_norm": 7.05544065339559, + "learning_rate": 2.710674157303371e-07, + "logps/chosen": -48.13520050048828, + "logps/rejected": -55.488975524902344, + "loss": 0.5683, + "losses/dpo": 0.5512528419494629, + "losses/sft": 1.421828269958496, + "losses/total": 0.5512528419494629, + "ref_logps/chosen": -40.806800842285156, + "ref_logps/rejected": -44.204917907714844, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7328400611877441, + "rewards/margins": 0.39556559920310974, + "rewards/rejected": -1.1284055709838867, + "step": 203 + }, + { + "epoch": 1.54, + "grad_norm": 7.072397811821575, + "learning_rate": 2.6966292134831456e-07, + "logps/chosen": -45.89094161987305, + "logps/rejected": -56.12247085571289, + "loss": 0.5687, + "losses/dpo": 0.5942108035087585, + "losses/sft": 1.671670913696289, + "losses/total": 0.5942108035087585, + "ref_logps/chosen": -38.66583251953125, + "ref_logps/rejected": -45.235042572021484, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.7225111722946167, + "rewards/margins": 0.3662317395210266, + "rewards/rejected": -1.088742971420288, + "step": 204 + }, + { + "epoch": 1.55, + "grad_norm": 7.135519070946246, + "learning_rate": 2.682584269662921e-07, + "logps/chosen": -44.51463317871094, + "logps/rejected": -53.46598434448242, + "loss": 0.5668, + "losses/dpo": 0.5319070816040039, + "losses/sft": 1.5628294944763184, + "losses/total": 0.5319070816040039, + "ref_logps/chosen": -37.279029846191406, + "ref_logps/rejected": -42.57787322998047, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.7235599756240845, + "rewards/margins": 0.36525097489356995, + "rewards/rejected": -1.0888110399246216, + "step": 205 + }, + { + "epoch": 1.55, + "grad_norm": 7.107636740157782, + "learning_rate": 2.668539325842696e-07, + "logps/chosen": -43.40117645263672, + "logps/rejected": -54.69598388671875, + "loss": 0.5524, + "losses/dpo": 0.5264509320259094, + "losses/sft": 1.5363452434539795, + "losses/total": 0.5264509320259094, + "ref_logps/chosen": -36.371551513671875, + "ref_logps/rejected": -43.2625732421875, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.7029624581336975, + "rewards/margins": 0.44037845730781555, + "rewards/rejected": -1.1433409452438354, + "step": 206 + }, + { + "epoch": 1.56, + "grad_norm": 7.289414925759057, + "learning_rate": 2.654494382022472e-07, + "logps/chosen": -42.475379943847656, + "logps/rejected": -49.646728515625, + "loss": 0.6046, + "losses/dpo": 0.6572248935699463, + "losses/sft": 1.6387099027633667, + "losses/total": 0.6572248935699463, + "ref_logps/chosen": -35.308807373046875, + "ref_logps/rejected": -39.25323486328125, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.7166574597358704, + "rewards/margins": 0.32269221544265747, + "rewards/rejected": -1.0393496751785278, + "step": 207 + }, + { + "epoch": 1.57, + "grad_norm": 7.273876435078599, + "learning_rate": 2.640449438202247e-07, + "logps/chosen": -42.280967712402344, + "logps/rejected": -48.80766296386719, + "loss": 0.6213, + "losses/dpo": 0.5971169471740723, + "losses/sft": 1.7042605876922607, + "losses/total": 0.5971169471740723, + "ref_logps/chosen": -35.188377380371094, + "ref_logps/rejected": -39.048980712890625, + "rewards/accuracies": 0.6015625, + "rewards/chosen": -0.7092592716217041, + "rewards/margins": 0.2666093707084656, + "rewards/rejected": -0.9758686423301697, + "step": 208 + }, + { + "epoch": 1.58, + "grad_norm": 7.389043621661051, + "learning_rate": 2.6264044943820224e-07, + "logps/chosen": -43.01720428466797, + "logps/rejected": -52.86360549926758, + "loss": 0.6004, + "losses/dpo": 0.5311284065246582, + "losses/sft": 1.673902988433838, + "losses/total": 0.5311284065246582, + "ref_logps/chosen": -35.34561538696289, + "ref_logps/rejected": -41.90196228027344, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.7671589851379395, + "rewards/margins": 0.3290054500102997, + "rewards/rejected": -1.096164345741272, + "step": 209 + }, + { + "epoch": 1.58, + "grad_norm": 7.051856361062949, + "learning_rate": 2.612359550561798e-07, + "logps/chosen": -43.410194396972656, + "logps/rejected": -56.95100784301758, + "loss": 0.5527, + "losses/dpo": 0.494179904460907, + "losses/sft": 1.3610440492630005, + "losses/total": 0.494179904460907, + "ref_logps/chosen": -36.352073669433594, + "ref_logps/rejected": -45.52418518066406, + "rewards/accuracies": 0.7109375, + "rewards/chosen": -0.7058122754096985, + "rewards/margins": 0.4368700683116913, + "rewards/rejected": -1.1426823139190674, + "step": 210 + }, + { + "epoch": 1.59, + "grad_norm": 7.075204680484654, + "learning_rate": 2.598314606741573e-07, + "logps/chosen": -44.7838249206543, + "logps/rejected": -52.02484130859375, + "loss": 0.6038, + "losses/dpo": 0.5901740193367004, + "losses/sft": 1.7182517051696777, + "losses/total": 0.5901740193367004, + "ref_logps/chosen": -37.16931915283203, + "ref_logps/rejected": -41.17018127441406, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.7614503502845764, + "rewards/margins": 0.32401591539382935, + "rewards/rejected": -1.0854662656784058, + "step": 211 + }, + { + "epoch": 1.6, + "grad_norm": 7.444039226905721, + "learning_rate": 2.5842696629213486e-07, + "logps/chosen": -41.19989776611328, + "logps/rejected": -49.64472961425781, + "loss": 0.5961, + "losses/dpo": 0.5703378319740295, + "losses/sft": 1.288915753364563, + "losses/total": 0.5703378319740295, + "ref_logps/chosen": -34.397457122802734, + "ref_logps/rejected": -39.43400573730469, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.6802438497543335, + "rewards/margins": 0.34082797169685364, + "rewards/rejected": -1.0210717916488647, + "step": 212 + }, + { + "epoch": 1.61, + "grad_norm": 7.3272234640712455, + "learning_rate": 2.5702247191011236e-07, + "logps/chosen": -49.161766052246094, + "logps/rejected": -55.671295166015625, + "loss": 0.5862, + "losses/dpo": 0.7499480843544006, + "losses/sft": 1.8793140649795532, + "losses/total": 0.7499480843544006, + "ref_logps/chosen": -41.297489166259766, + "ref_logps/rejected": -44.11161422729492, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7864278554916382, + "rewards/margins": 0.3695400655269623, + "rewards/rejected": -1.1559679508209229, + "step": 213 + }, + { + "epoch": 1.62, + "grad_norm": 7.817780320058273, + "learning_rate": 2.5561797752808987e-07, + "logps/chosen": -46.763206481933594, + "logps/rejected": -53.703033447265625, + "loss": 0.6266, + "losses/dpo": 0.48047423362731934, + "losses/sft": 1.5680122375488281, + "losses/total": 0.48047423362731934, + "ref_logps/chosen": -39.49818801879883, + "ref_logps/rejected": -43.53962326049805, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -0.7265015840530396, + "rewards/margins": 0.28983935713768005, + "rewards/rejected": -1.016340970993042, + "step": 214 + }, + { + "epoch": 1.62, + "grad_norm": 7.508782666466954, + "learning_rate": 2.5421348314606737e-07, + "logps/chosen": -47.78954315185547, + "logps/rejected": -56.90927505493164, + "loss": 0.5628, + "losses/dpo": 0.537736177444458, + "losses/sft": 1.6823458671569824, + "losses/total": 0.537736177444458, + "ref_logps/chosen": -40.28362274169922, + "ref_logps/rejected": -45.37610626220703, + "rewards/accuracies": 0.7265625, + "rewards/chosen": -0.7505923509597778, + "rewards/margins": 0.4027244448661804, + "rewards/rejected": -1.1533167362213135, + "step": 215 + }, + { + "epoch": 1.63, + "grad_norm": 7.806242305852612, + "learning_rate": 2.5280898876404493e-07, + "logps/chosen": -47.2044677734375, + "logps/rejected": -58.619651794433594, + "loss": 0.5899, + "losses/dpo": 0.6399192214012146, + "losses/sft": 1.363295316696167, + "losses/total": 0.6399192214012146, + "ref_logps/chosen": -39.20256423950195, + "ref_logps/rejected": -46.85260772705078, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.8001901507377625, + "rewards/margins": 0.3765140473842621, + "rewards/rejected": -1.1767041683197021, + "step": 216 + }, + { + "epoch": 1.64, + "grad_norm": 6.621098009181271, + "learning_rate": 2.5140449438202243e-07, + "logps/chosen": -36.010169982910156, + "logps/rejected": -48.608699798583984, + "loss": 0.5501, + "losses/dpo": 0.5600734949111938, + "losses/sft": 1.3302438259124756, + "losses/total": 0.5600734949111938, + "ref_logps/chosen": -29.427637100219727, + "ref_logps/rejected": -37.72674560546875, + "rewards/accuracies": 0.7421875, + "rewards/chosen": -0.6582531929016113, + "rewards/margins": 0.4299423098564148, + "rewards/rejected": -1.088195562362671, + "step": 217 + }, + { + "epoch": 1.65, + "grad_norm": 7.0657521689003735, + "learning_rate": 2.5e-07, + "logps/chosen": -42.20947265625, + "logps/rejected": -53.40728759765625, + "loss": 0.5808, + "losses/dpo": 0.5706441402435303, + "losses/sft": 1.390072226524353, + "losses/total": 0.5706441402435303, + "ref_logps/chosen": -34.44993591308594, + "ref_logps/rejected": -41.737003326416016, + "rewards/accuracies": 0.6796875, + "rewards/chosen": -0.7759537696838379, + "rewards/margins": 0.3910742402076721, + "rewards/rejected": -1.1670279502868652, + "step": 218 + }, + { + "epoch": 1.65, + "grad_norm": 7.920819614767415, + "learning_rate": 2.485955056179775e-07, + "logps/chosen": -46.45621109008789, + "logps/rejected": -53.37653350830078, + "loss": 0.6258, + "losses/dpo": 0.5177885293960571, + "losses/sft": 1.4505321979522705, + "losses/total": 0.5177885293960571, + "ref_logps/chosen": -38.40946960449219, + "ref_logps/rejected": -42.48009490966797, + "rewards/accuracies": 0.6328125, + "rewards/chosen": -0.8046744465827942, + "rewards/margins": 0.2849688231945038, + "rewards/rejected": -1.0896432399749756, + "step": 219 + }, + { + "epoch": 1.66, + "grad_norm": 6.775480921328623, + "learning_rate": 2.4719101123595505e-07, + "logps/chosen": -43.19866943359375, + "logps/rejected": -51.142852783203125, + "loss": 0.5708, + "losses/dpo": 0.631821870803833, + "losses/sft": 1.687159776687622, + "losses/total": 0.631821870803833, + "ref_logps/chosen": -35.93583297729492, + "ref_logps/rejected": -40.11430358886719, + "rewards/accuracies": 0.6953125, + "rewards/chosen": -0.7262836694717407, + "rewards/margins": 0.3765709102153778, + "rewards/rejected": -1.102854609489441, + "step": 220 + }, + { + "epoch": 1.67, + "grad_norm": 7.00534024427554, + "learning_rate": 2.4578651685393255e-07, + "logps/chosen": -43.32523727416992, + "logps/rejected": -52.18841552734375, + "loss": 0.56, + "losses/dpo": 0.5959673523902893, + "losses/sft": 1.5886725187301636, + "losses/total": 0.5959673523902893, + "ref_logps/chosen": -35.884517669677734, + "ref_logps/rejected": -40.54563522338867, + "rewards/accuracies": 0.6953125, + "rewards/chosen": -0.7440718412399292, + "rewards/margins": 0.42020630836486816, + "rewards/rejected": -1.1642781496047974, + "step": 221 + }, + { + "epoch": 1.68, + "grad_norm": 7.150852349968827, + "learning_rate": 2.443820224719101e-07, + "logps/chosen": -42.47400665283203, + "logps/rejected": -53.8497314453125, + "loss": 0.5456, + "losses/dpo": 0.5008928775787354, + "losses/sft": 1.4967145919799805, + "losses/total": 0.5008928775787354, + "ref_logps/chosen": -35.4063720703125, + "ref_logps/rejected": -41.911190032958984, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.7067632675170898, + "rewards/margins": 0.48709067702293396, + "rewards/rejected": -1.1938539743423462, + "step": 222 + }, + { + "epoch": 1.68, + "grad_norm": 7.009242529601585, + "learning_rate": 2.429775280898876e-07, + "logps/chosen": -42.480735778808594, + "logps/rejected": -56.96538543701172, + "loss": 0.5687, + "losses/dpo": 0.5494006872177124, + "losses/sft": 1.660073161125183, + "losses/total": 0.5494006872177124, + "ref_logps/chosen": -34.94923400878906, + "ref_logps/rejected": -45.03327178955078, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.753150224685669, + "rewards/margins": 0.44006073474884033, + "rewards/rejected": -1.1932109594345093, + "step": 223 + }, + { + "epoch": 1.69, + "grad_norm": 7.145198782494123, + "learning_rate": 2.4157303370786517e-07, + "logps/chosen": -46.52253341674805, + "logps/rejected": -56.88560485839844, + "loss": 0.5578, + "losses/dpo": 0.6753450632095337, + "losses/sft": 1.733784556388855, + "losses/total": 0.6753450632095337, + "ref_logps/chosen": -38.87651062011719, + "ref_logps/rejected": -44.95452117919922, + "rewards/accuracies": 0.6953125, + "rewards/chosen": -0.7646023035049438, + "rewards/margins": 0.428506076335907, + "rewards/rejected": -1.193108320236206, + "step": 224 + }, + { + "epoch": 1.7, + "grad_norm": 7.2140738897995895, + "learning_rate": 2.401685393258427e-07, + "logps/chosen": -44.606842041015625, + "logps/rejected": -51.53977966308594, + "loss": 0.5755, + "losses/dpo": 0.6050464510917664, + "losses/sft": 1.4844509363174438, + "losses/total": 0.6050464510917664, + "ref_logps/chosen": -37.366485595703125, + "ref_logps/rejected": -40.17390441894531, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.7240351438522339, + "rewards/margins": 0.41255253553390503, + "rewards/rejected": -1.1365876197814941, + "step": 225 + }, + { + "epoch": 1.71, + "grad_norm": 7.316056461598082, + "learning_rate": 2.3876404494382023e-07, + "logps/chosen": -43.40976333618164, + "logps/rejected": -51.119468688964844, + "loss": 0.5856, + "losses/dpo": 0.6437182426452637, + "losses/sft": 1.6879228353500366, + "losses/total": 0.6437182426452637, + "ref_logps/chosen": -35.9763298034668, + "ref_logps/rejected": -39.941932678222656, + "rewards/accuracies": 0.7734375, + "rewards/chosen": -0.7433432936668396, + "rewards/margins": 0.37441009283065796, + "rewards/rejected": -1.1177533864974976, + "step": 226 + }, + { + "epoch": 1.71, + "grad_norm": 7.078331857989057, + "learning_rate": 2.3735955056179774e-07, + "logps/chosen": -45.81120681762695, + "logps/rejected": -50.85576629638672, + "loss": 0.6076, + "losses/dpo": 0.7625922560691833, + "losses/sft": 1.5723658800125122, + "losses/total": 0.7625922560691833, + "ref_logps/chosen": -38.21784973144531, + "ref_logps/rejected": -40.007240295410156, + "rewards/accuracies": 0.6953125, + "rewards/chosen": -0.7593356966972351, + "rewards/margins": 0.32551684975624084, + "rewards/rejected": -1.0848525762557983, + "step": 227 + }, + { + "epoch": 1.72, + "grad_norm": 7.206138039626543, + "learning_rate": 2.3595505617977527e-07, + "logps/chosen": -43.7403450012207, + "logps/rejected": -52.108604431152344, + "loss": 0.5922, + "losses/dpo": 0.5139514803886414, + "losses/sft": 1.6670148372650146, + "losses/total": 0.5139514803886414, + "ref_logps/chosen": -35.650115966796875, + "ref_logps/rejected": -40.582130432128906, + "rewards/accuracies": 0.6796875, + "rewards/chosen": -0.8090231418609619, + "rewards/margins": 0.34362420439720154, + "rewards/rejected": -1.1526473760604858, + "step": 228 + }, + { + "epoch": 1.73, + "grad_norm": 7.61900579513634, + "learning_rate": 2.345505617977528e-07, + "logps/chosen": -42.43614959716797, + "logps/rejected": -52.779483795166016, + "loss": 0.5781, + "losses/dpo": 0.4573014974594116, + "losses/sft": 1.5003488063812256, + "losses/total": 0.4573014974594116, + "ref_logps/chosen": -34.90065002441406, + "ref_logps/rejected": -41.4276237487793, + "rewards/accuracies": 0.7109375, + "rewards/chosen": -0.7535501718521118, + "rewards/margins": 0.38163578510284424, + "rewards/rejected": -1.135185956954956, + "step": 229 + }, + { + "epoch": 1.74, + "grad_norm": 7.829509763007773, + "learning_rate": 2.331460674157303e-07, + "logps/chosen": -47.7276496887207, + "logps/rejected": -56.36402893066406, + "loss": 0.5302, + "losses/dpo": 0.529563307762146, + "losses/sft": 1.6256301403045654, + "losses/total": 0.529563307762146, + "ref_logps/chosen": -40.326351165771484, + "ref_logps/rejected": -43.704612731933594, + "rewards/accuracies": 0.796875, + "rewards/chosen": -0.740129828453064, + "rewards/margins": 0.5258119702339172, + "rewards/rejected": -1.2659417390823364, + "step": 230 + }, + { + "epoch": 1.74, + "grad_norm": 8.06687120109026, + "learning_rate": 2.3174157303370786e-07, + "logps/chosen": -44.74425506591797, + "logps/rejected": -55.536312103271484, + "loss": 0.561, + "losses/dpo": 0.4758527874946594, + "losses/sft": 1.3779159784317017, + "losses/total": 0.4758527874946594, + "ref_logps/chosen": -37.057281494140625, + "ref_logps/rejected": -43.34575653076172, + "rewards/accuracies": 0.6953125, + "rewards/chosen": -0.7686972618103027, + "rewards/margins": 0.45035821199417114, + "rewards/rejected": -1.2190555334091187, + "step": 231 + }, + { + "epoch": 1.75, + "grad_norm": 7.291686332994008, + "learning_rate": 2.303370786516854e-07, + "logps/chosen": -43.00548553466797, + "logps/rejected": -54.49897003173828, + "loss": 0.5834, + "losses/dpo": 0.5421339273452759, + "losses/sft": 1.4051011800765991, + "losses/total": 0.5421339273452759, + "ref_logps/chosen": -35.81233215332031, + "ref_logps/rejected": -43.77638244628906, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.7193150520324707, + "rewards/margins": 0.35294392704963684, + "rewards/rejected": -1.0722589492797852, + "step": 232 + }, + { + "epoch": 1.76, + "grad_norm": 6.669119014424567, + "learning_rate": 2.2893258426966292e-07, + "logps/chosen": -42.595909118652344, + "logps/rejected": -50.517574310302734, + "loss": 0.5942, + "losses/dpo": 0.5890272855758667, + "losses/sft": 1.3421604633331299, + "losses/total": 0.5890272855758667, + "ref_logps/chosen": -35.23419189453125, + "ref_logps/rejected": -39.80196762084961, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.7361720204353333, + "rewards/margins": 0.33538877964019775, + "rewards/rejected": -1.0715608596801758, + "step": 233 + }, + { + "epoch": 1.77, + "grad_norm": 7.634998140383259, + "learning_rate": 2.2752808988764045e-07, + "logps/chosen": -48.97822189331055, + "logps/rejected": -55.01988983154297, + "loss": 0.6041, + "losses/dpo": 0.4961914122104645, + "losses/sft": 1.6347143650054932, + "losses/total": 0.4961914122104645, + "ref_logps/chosen": -40.457027435302734, + "ref_logps/rejected": -43.25347900390625, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.8521193265914917, + "rewards/margins": 0.3245222866535187, + "rewards/rejected": -1.1766417026519775, + "step": 234 + }, + { + "epoch": 1.77, + "grad_norm": 7.773471295460603, + "learning_rate": 2.2612359550561795e-07, + "logps/chosen": -46.672576904296875, + "logps/rejected": -54.91902542114258, + "loss": 0.5883, + "losses/dpo": 0.5112382173538208, + "losses/sft": 1.6069546937942505, + "losses/total": 0.5112382173538208, + "ref_logps/chosen": -38.114097595214844, + "ref_logps/rejected": -42.68096160888672, + "rewards/accuracies": 0.7421875, + "rewards/chosen": -0.8558481931686401, + "rewards/margins": 0.36795809864997864, + "rewards/rejected": -1.2238062620162964, + "step": 235 + }, + { + "epoch": 1.78, + "grad_norm": 6.9224359595021925, + "learning_rate": 2.2471910112359549e-07, + "logps/chosen": -43.26789855957031, + "logps/rejected": -49.846065521240234, + "loss": 0.5976, + "losses/dpo": 0.530718207359314, + "losses/sft": 1.4825395345687866, + "losses/total": 0.530718207359314, + "ref_logps/chosen": -35.522216796875, + "ref_logps/rejected": -38.95735168457031, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.7745683193206787, + "rewards/margins": 0.3143025040626526, + "rewards/rejected": -1.088870882987976, + "step": 236 + }, + { + "epoch": 1.79, + "grad_norm": 7.5835387702946075, + "learning_rate": 2.2331460674157302e-07, + "logps/chosen": -44.358116149902344, + "logps/rejected": -57.51253890991211, + "loss": 0.5667, + "losses/dpo": 0.4763038754463196, + "losses/sft": 1.4994385242462158, + "losses/total": 0.4763038754463196, + "ref_logps/chosen": -36.64021682739258, + "ref_logps/rejected": -45.455902099609375, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.7717897295951843, + "rewards/margins": 0.4338740408420563, + "rewards/rejected": -1.205663800239563, + "step": 237 + }, + { + "epoch": 1.8, + "grad_norm": 6.860345248290717, + "learning_rate": 2.2191011235955055e-07, + "logps/chosen": -43.448211669921875, + "logps/rejected": -52.67967224121094, + "loss": 0.5841, + "losses/dpo": 0.5856455564498901, + "losses/sft": 1.5493735074996948, + "losses/total": 0.5856455564498901, + "ref_logps/chosen": -35.36868667602539, + "ref_logps/rejected": -40.977325439453125, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.8079524040222168, + "rewards/margins": 0.36228203773498535, + "rewards/rejected": -1.1702344417572021, + "step": 238 + }, + { + "epoch": 1.8, + "grad_norm": 6.877362645097382, + "learning_rate": 2.205056179775281e-07, + "logps/chosen": -43.96727752685547, + "logps/rejected": -54.08544921875, + "loss": 0.561, + "losses/dpo": 0.7524189352989197, + "losses/sft": 1.4943475723266602, + "losses/total": 0.7524189352989197, + "ref_logps/chosen": -36.482398986816406, + "ref_logps/rejected": -42.11739730834961, + "rewards/accuracies": 0.7109375, + "rewards/chosen": -0.7484874725341797, + "rewards/margins": 0.44831788539886475, + "rewards/rejected": -1.1968053579330444, + "step": 239 + }, + { + "epoch": 1.81, + "grad_norm": 8.176287850888796, + "learning_rate": 2.191011235955056e-07, + "logps/chosen": -44.187679290771484, + "logps/rejected": -52.56245422363281, + "loss": 0.6215, + "losses/dpo": 0.5882298946380615, + "losses/sft": 1.509756326675415, + "losses/total": 0.5882298946380615, + "ref_logps/chosen": -35.26359558105469, + "ref_logps/rejected": -40.418216705322266, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -0.8924084901809692, + "rewards/margins": 0.32201528549194336, + "rewards/rejected": -1.2144238948822021, + "step": 240 + }, + { + "epoch": 1.82, + "grad_norm": 6.748177391589757, + "learning_rate": 2.1769662921348314e-07, + "logps/chosen": -44.592193603515625, + "logps/rejected": -54.5892219543457, + "loss": 0.5454, + "losses/dpo": 0.5010501742362976, + "losses/sft": 1.692970871925354, + "losses/total": 0.5010501742362976, + "ref_logps/chosen": -36.254737854003906, + "ref_logps/rejected": -41.49382019042969, + "rewards/accuracies": 0.7421875, + "rewards/chosen": -0.8337457180023193, + "rewards/margins": 0.47579440474510193, + "rewards/rejected": -1.3095402717590332, + "step": 241 + }, + { + "epoch": 1.83, + "grad_norm": 7.9524609339083385, + "learning_rate": 2.1629213483146067e-07, + "logps/chosen": -49.09219741821289, + "logps/rejected": -55.76482391357422, + "loss": 0.6033, + "losses/dpo": 0.5440762042999268, + "losses/sft": 1.7360166311264038, + "losses/total": 0.5440762042999268, + "ref_logps/chosen": -39.933494567871094, + "ref_logps/rejected": -43.29194259643555, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.915870189666748, + "rewards/margins": 0.33141782879829407, + "rewards/rejected": -1.2472879886627197, + "step": 242 + }, + { + "epoch": 1.83, + "grad_norm": 7.715847211838437, + "learning_rate": 2.148876404494382e-07, + "logps/chosen": -44.42055130004883, + "logps/rejected": -50.31025314331055, + "loss": 0.6419, + "losses/dpo": 0.6423018574714661, + "losses/sft": 1.8698339462280273, + "losses/total": 0.6423018574714661, + "ref_logps/chosen": -36.14445114135742, + "ref_logps/rejected": -39.49055480957031, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.8276099562644958, + "rewards/margins": 0.2543600797653198, + "rewards/rejected": -1.081969976425171, + "step": 243 + }, + { + "epoch": 1.84, + "grad_norm": 7.717852694753547, + "learning_rate": 2.134831460674157e-07, + "logps/chosen": -45.71333694458008, + "logps/rejected": -56.663360595703125, + "loss": 0.5667, + "losses/dpo": 0.578036904335022, + "losses/sft": 1.548266053199768, + "losses/total": 0.578036904335022, + "ref_logps/chosen": -37.3514404296875, + "ref_logps/rejected": -44.125831604003906, + "rewards/accuracies": 0.7109375, + "rewards/chosen": -0.8361901044845581, + "rewards/margins": 0.4175630807876587, + "rewards/rejected": -1.2537531852722168, + "step": 244 + }, + { + "epoch": 1.85, + "grad_norm": 7.217778469739938, + "learning_rate": 2.1207865168539323e-07, + "logps/chosen": -47.56121826171875, + "logps/rejected": -55.15635299682617, + "loss": 0.584, + "losses/dpo": 0.6499341726303101, + "losses/sft": 1.8146308660507202, + "losses/total": 0.6499341726303101, + "ref_logps/chosen": -39.167484283447266, + "ref_logps/rejected": -43.06206130981445, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.8393731117248535, + "rewards/margins": 0.37005579471588135, + "rewards/rejected": -1.2094289064407349, + "step": 245 + }, + { + "epoch": 1.86, + "grad_norm": 7.706531034729977, + "learning_rate": 2.1067415730337076e-07, + "logps/chosen": -45.61647415161133, + "logps/rejected": -55.11760330200195, + "loss": 0.616, + "losses/dpo": 0.45806318521499634, + "losses/sft": 1.4561158418655396, + "losses/total": 0.45806318521499634, + "ref_logps/chosen": -36.999488830566406, + "ref_logps/rejected": -43.26171112060547, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8616988062858582, + "rewards/margins": 0.3238902986049652, + "rewards/rejected": -1.185589075088501, + "step": 246 + }, + { + "epoch": 1.86, + "grad_norm": 7.805010195142929, + "learning_rate": 2.0926966292134832e-07, + "logps/chosen": -43.88758087158203, + "logps/rejected": -54.20425033569336, + "loss": 0.5977, + "losses/dpo": 0.6152101755142212, + "losses/sft": 1.5027949810028076, + "losses/total": 0.6152101755142212, + "ref_logps/chosen": -35.95214080810547, + "ref_logps/rejected": -42.84498977661133, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7935442924499512, + "rewards/margins": 0.3423812687397003, + "rewards/rejected": -1.135925531387329, + "step": 247 + }, + { + "epoch": 1.87, + "grad_norm": 7.406497027707841, + "learning_rate": 2.0786516853932585e-07, + "logps/chosen": -45.788818359375, + "logps/rejected": -53.047203063964844, + "loss": 0.5831, + "losses/dpo": 0.4654901325702667, + "losses/sft": 1.479446291923523, + "losses/total": 0.4654901325702667, + "ref_logps/chosen": -37.57151794433594, + "ref_logps/rejected": -41.39836883544922, + "rewards/accuracies": 0.6484375, + "rewards/chosen": -0.8217304348945618, + "rewards/margins": 0.34315240383148193, + "rewards/rejected": -1.1648828983306885, + "step": 248 + }, + { + "epoch": 1.88, + "grad_norm": 7.682816189246604, + "learning_rate": 2.0646067415730336e-07, + "logps/chosen": -45.08941650390625, + "logps/rejected": -56.03681182861328, + "loss": 0.6254, + "losses/dpo": 0.6420303583145142, + "losses/sft": 1.767283320426941, + "losses/total": 0.6420303583145142, + "ref_logps/chosen": -35.47539138793945, + "ref_logps/rejected": -43.610809326171875, + "rewards/accuracies": 0.7109375, + "rewards/chosen": -0.9614025950431824, + "rewards/margins": 0.281198114156723, + "rewards/rejected": -1.242600679397583, + "step": 249 + }, + { + "epoch": 1.89, + "grad_norm": 7.381909400967013, + "learning_rate": 2.0505617977528089e-07, + "logps/chosen": -44.02425765991211, + "logps/rejected": -57.3465461730957, + "loss": 0.552, + "losses/dpo": 0.5316831469535828, + "losses/sft": 1.4193787574768066, + "losses/total": 0.5316831469535828, + "ref_logps/chosen": -36.48744201660156, + "ref_logps/rejected": -45.4715576171875, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7536818385124207, + "rewards/margins": 0.43381738662719727, + "rewards/rejected": -1.1874991655349731, + "step": 250 + }, + { + "epoch": 1.89, + "grad_norm": 7.457988261963576, + "learning_rate": 2.0365168539325842e-07, + "logps/chosen": -44.05774688720703, + "logps/rejected": -54.26824951171875, + "loss": 0.5622, + "losses/dpo": 0.6149340867996216, + "losses/sft": 1.7144936323165894, + "losses/total": 0.6149340867996216, + "ref_logps/chosen": -35.81959533691406, + "ref_logps/rejected": -41.808128356933594, + "rewards/accuracies": 0.7421875, + "rewards/chosen": -0.8238149881362915, + "rewards/margins": 0.4221975803375244, + "rewards/rejected": -1.246012568473816, + "step": 251 + }, + { + "epoch": 1.9, + "grad_norm": 6.886174599694686, + "learning_rate": 2.0224719101123595e-07, + "logps/chosen": -42.96266174316406, + "logps/rejected": -57.41224670410156, + "loss": 0.5338, + "losses/dpo": 0.5937738418579102, + "losses/sft": 1.7894150018692017, + "losses/total": 0.5937738418579102, + "ref_logps/chosen": -35.350067138671875, + "ref_logps/rejected": -44.520416259765625, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.7612596750259399, + "rewards/margins": 0.5279234051704407, + "rewards/rejected": -1.2891831398010254, + "step": 252 + }, + { + "epoch": 1.91, + "grad_norm": 7.229890008822798, + "learning_rate": 2.0084269662921348e-07, + "logps/chosen": -40.15806198120117, + "logps/rejected": -51.43259811401367, + "loss": 0.5704, + "losses/dpo": 0.6577882170677185, + "losses/sft": 1.8345617055892944, + "losses/total": 0.6577882170677185, + "ref_logps/chosen": -32.859107971191406, + "ref_logps/rejected": -40.261077880859375, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7298952341079712, + "rewards/margins": 0.3872564733028412, + "rewards/rejected": -1.1171517372131348, + "step": 253 + }, + { + "epoch": 1.92, + "grad_norm": 7.4822334369379995, + "learning_rate": 1.9943820224719098e-07, + "logps/chosen": -47.686946868896484, + "logps/rejected": -57.150779724121094, + "loss": 0.5379, + "losses/dpo": 0.5903155207633972, + "losses/sft": 1.7529627084732056, + "losses/total": 0.5903155207633972, + "ref_logps/chosen": -39.64442443847656, + "ref_logps/rejected": -44.537864685058594, + "rewards/accuracies": 0.7734375, + "rewards/chosen": -0.8042521476745605, + "rewards/margins": 0.45703911781311035, + "rewards/rejected": -1.2612911462783813, + "step": 254 + }, + { + "epoch": 1.92, + "grad_norm": 8.026318217758316, + "learning_rate": 1.9803370786516854e-07, + "logps/chosen": -48.09050750732422, + "logps/rejected": -55.54762268066406, + "loss": 0.6168, + "losses/dpo": 0.6026707887649536, + "losses/sft": 1.538877248764038, + "losses/total": 0.6026707887649536, + "ref_logps/chosen": -39.93636703491211, + "ref_logps/rejected": -44.63798522949219, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -0.8154144287109375, + "rewards/margins": 0.2755492627620697, + "rewards/rejected": -1.0909637212753296, + "step": 255 + }, + { + "epoch": 1.93, + "grad_norm": 7.789382763460605, + "learning_rate": 1.9662921348314607e-07, + "logps/chosen": -42.904396057128906, + "logps/rejected": -52.95304489135742, + "loss": 0.6004, + "losses/dpo": 0.6533941626548767, + "losses/sft": 1.7555681467056274, + "losses/total": 0.6533941626548767, + "ref_logps/chosen": -35.12152862548828, + "ref_logps/rejected": -41.70171356201172, + "rewards/accuracies": 0.6484375, + "rewards/chosen": -0.7782862186431885, + "rewards/margins": 0.34684672951698303, + "rewards/rejected": -1.1251329183578491, + "step": 256 + }, + { + "epoch": 1.94, + "grad_norm": 7.650755358628509, + "learning_rate": 1.952247191011236e-07, + "logps/chosen": -47.547119140625, + "logps/rejected": -55.00044250488281, + "loss": 0.5857, + "losses/dpo": 0.533769965171814, + "losses/sft": 1.518601655960083, + "losses/total": 0.533769965171814, + "ref_logps/chosen": -39.58103942871094, + "ref_logps/rejected": -43.044471740722656, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.7966080904006958, + "rewards/margins": 0.3989890217781067, + "rewards/rejected": -1.1955971717834473, + "step": 257 + }, + { + "epoch": 1.95, + "grad_norm": 7.218761250045399, + "learning_rate": 1.938202247191011e-07, + "logps/chosen": -45.877933502197266, + "logps/rejected": -55.09804916381836, + "loss": 0.5628, + "losses/dpo": 0.5916406512260437, + "losses/sft": 1.787639856338501, + "losses/total": 0.5916406512260437, + "ref_logps/chosen": -37.8803596496582, + "ref_logps/rejected": -42.772945404052734, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.7997570633888245, + "rewards/margins": 0.43275338411331177, + "rewards/rejected": -1.2325104475021362, + "step": 258 + }, + { + "epoch": 1.95, + "grad_norm": 6.746342603050737, + "learning_rate": 1.9241573033707863e-07, + "logps/chosen": -44.5426139831543, + "logps/rejected": -52.97711944580078, + "loss": 0.5404, + "losses/dpo": 0.6297707557678223, + "losses/sft": 1.9339282512664795, + "losses/total": 0.6297707557678223, + "ref_logps/chosen": -36.8856315612793, + "ref_logps/rejected": -40.58320617675781, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7656983137130737, + "rewards/margins": 0.47369277477264404, + "rewards/rejected": -1.2393909692764282, + "step": 259 + }, + { + "epoch": 1.96, + "grad_norm": 7.487018325482117, + "learning_rate": 1.9101123595505617e-07, + "logps/chosen": -42.5137825012207, + "logps/rejected": -53.166908264160156, + "loss": 0.5707, + "losses/dpo": 0.7028491497039795, + "losses/sft": 1.704848289489746, + "losses/total": 0.7028491497039795, + "ref_logps/chosen": -34.8709716796875, + "ref_logps/rejected": -41.41071701049805, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.7642812728881836, + "rewards/margins": 0.4113379120826721, + "rewards/rejected": -1.1756192445755005, + "step": 260 + }, + { + "epoch": 1.97, + "grad_norm": 6.897909731781275, + "learning_rate": 1.896067415730337e-07, + "logps/chosen": -42.64958190917969, + "logps/rejected": -54.01194763183594, + "loss": 0.5508, + "losses/dpo": 0.6007636785507202, + "losses/sft": 1.6722173690795898, + "losses/total": 0.6007636785507202, + "ref_logps/chosen": -34.81106948852539, + "ref_logps/rejected": -41.885292053222656, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7838513851165771, + "rewards/margins": 0.428814560174942, + "rewards/rejected": -1.2126659154891968, + "step": 261 + }, + { + "epoch": 1.98, + "grad_norm": 6.802922485274152, + "learning_rate": 1.8820224719101123e-07, + "logps/chosen": -40.00798034667969, + "logps/rejected": -54.30394744873047, + "loss": 0.5499, + "losses/dpo": 0.39324456453323364, + "losses/sft": 1.4311751127243042, + "losses/total": 0.39324456453323364, + "ref_logps/chosen": -32.748207092285156, + "ref_logps/rejected": -42.49259948730469, + "rewards/accuracies": 0.8046875, + "rewards/chosen": -0.7259770035743713, + "rewards/margins": 0.4551584720611572, + "rewards/rejected": -1.1811354160308838, + "step": 262 + }, + { + "epoch": 1.98, + "grad_norm": 6.783836899709174, + "learning_rate": 1.8679775280898876e-07, + "logps/chosen": -39.83095932006836, + "logps/rejected": -54.880165100097656, + "loss": 0.5218, + "losses/dpo": 0.5562885999679565, + "losses/sft": 1.581786036491394, + "losses/total": 0.5562885999679565, + "ref_logps/chosen": -32.503440856933594, + "ref_logps/rejected": -42.00752258300781, + "rewards/accuracies": 0.796875, + "rewards/chosen": -0.7327523231506348, + "rewards/margins": 0.5545117259025574, + "rewards/rejected": -1.287264108657837, + "step": 263 + }, + { + "epoch": 1.99, + "grad_norm": 7.863916442503097, + "learning_rate": 1.853932584269663e-07, + "logps/chosen": -50.77809524536133, + "logps/rejected": -57.57705307006836, + "loss": 0.5746, + "losses/dpo": 0.5502392053604126, + "losses/sft": 1.671476125717163, + "losses/total": 0.5502392053604126, + "ref_logps/chosen": -42.0257568359375, + "ref_logps/rejected": -44.61543273925781, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.8752338886260986, + "rewards/margins": 0.42092812061309814, + "rewards/rejected": -1.2961618900299072, + "step": 264 + }, + { + "epoch": 2.0, + "grad_norm": 7.397057411594154, + "learning_rate": 1.8398876404494382e-07, + "logps/chosen": -45.31150817871094, + "logps/rejected": -54.171669006347656, + "loss": 0.5905, + "losses/dpo": 0.5974111557006836, + "losses/sft": 1.7264142036437988, + "losses/total": 0.5974111557006836, + "ref_logps/chosen": -36.78529357910156, + "ref_logps/rejected": -41.80047607421875, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.8526214361190796, + "rewards/margins": 0.3844982385635376, + "rewards/rejected": -1.2371195554733276, + "step": 265 + }, + { + "epoch": 2.01, + "grad_norm": 7.017591256124865, + "learning_rate": 1.8258426966292135e-07, + "logps/chosen": -43.99406433105469, + "logps/rejected": -53.245262145996094, + "loss": 0.5349, + "losses/dpo": 0.5464926362037659, + "losses/sft": 1.5807067155838013, + "losses/total": 0.5464926362037659, + "ref_logps/chosen": -36.38019561767578, + "ref_logps/rejected": -40.543190002441406, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.761387288570404, + "rewards/margins": 0.5088198184967041, + "rewards/rejected": -1.2702070474624634, + "step": 266 + }, + { + "epoch": 2.02, + "grad_norm": 6.985219378495969, + "learning_rate": 1.8117977528089888e-07, + "logps/chosen": -44.4912109375, + "logps/rejected": -53.347251892089844, + "loss": 0.5743, + "losses/dpo": 0.5905557870864868, + "losses/sft": 1.7700715065002441, + "losses/total": 0.5905557870864868, + "ref_logps/chosen": -36.70296859741211, + "ref_logps/rejected": -41.48924255371094, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.7788243889808655, + "rewards/margins": 0.40697669982910156, + "rewards/rejected": -1.1858012676239014, + "step": 267 + }, + { + "epoch": 2.02, + "grad_norm": 7.021871568299538, + "learning_rate": 1.7977528089887638e-07, + "logps/chosen": -41.247291564941406, + "logps/rejected": -50.476539611816406, + "loss": 0.5583, + "losses/dpo": 0.5536283850669861, + "losses/sft": 1.3929085731506348, + "losses/total": 0.5536283850669861, + "ref_logps/chosen": -34.45301055908203, + "ref_logps/rejected": -39.05378723144531, + "rewards/accuracies": 0.7421875, + "rewards/chosen": -0.6794286370277405, + "rewards/margins": 0.46284645795822144, + "rewards/rejected": -1.142275094985962, + "step": 268 + }, + { + "epoch": 2.03, + "grad_norm": 7.290879700745406, + "learning_rate": 1.7837078651685391e-07, + "logps/chosen": -44.69060516357422, + "logps/rejected": -52.723419189453125, + "loss": 0.5862, + "losses/dpo": 0.5445826053619385, + "losses/sft": 1.8489296436309814, + "losses/total": 0.5445826053619385, + "ref_logps/chosen": -36.05701446533203, + "ref_logps/rejected": -40.36290740966797, + "rewards/accuracies": 0.6796875, + "rewards/chosen": -0.863358736038208, + "rewards/margins": 0.37269291281700134, + "rewards/rejected": -1.2360515594482422, + "step": 269 + }, + { + "epoch": 2.04, + "grad_norm": 7.278112500918291, + "learning_rate": 1.7696629213483144e-07, + "logps/chosen": -47.17387771606445, + "logps/rejected": -55.31304168701172, + "loss": 0.5451, + "losses/dpo": 0.4929129481315613, + "losses/sft": 1.2738251686096191, + "losses/total": 0.4929129481315613, + "ref_logps/chosen": -38.97121810913086, + "ref_logps/rejected": -42.48224639892578, + "rewards/accuracies": 0.796875, + "rewards/chosen": -0.8202658891677856, + "rewards/margins": 0.4628136157989502, + "rewards/rejected": -1.2830795049667358, + "step": 270 + }, + { + "epoch": 2.05, + "grad_norm": 7.2790694246, + "learning_rate": 1.75561797752809e-07, + "logps/chosen": -39.39592742919922, + "logps/rejected": -49.23228073120117, + "loss": 0.5839, + "losses/dpo": 0.5500213503837585, + "losses/sft": 1.5326621532440186, + "losses/total": 0.5500213503837585, + "ref_logps/chosen": -31.87863540649414, + "ref_logps/rejected": -37.98380661010742, + "rewards/accuracies": 0.6953125, + "rewards/chosen": -0.7517290115356445, + "rewards/margins": 0.37311792373657227, + "rewards/rejected": -1.1248469352722168, + "step": 271 + }, + { + "epoch": 2.05, + "grad_norm": 6.952262857114201, + "learning_rate": 1.741573033707865e-07, + "logps/chosen": -41.979820251464844, + "logps/rejected": -51.27606964111328, + "loss": 0.5477, + "losses/dpo": 0.5781035423278809, + "losses/sft": 1.6893967390060425, + "losses/total": 0.5781035423278809, + "ref_logps/chosen": -34.47309875488281, + "ref_logps/rejected": -38.76087951660156, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7506722211837769, + "rewards/margins": 0.5008465051651001, + "rewards/rejected": -1.251518726348877, + "step": 272 + }, + { + "epoch": 2.06, + "grad_norm": 7.323213695486467, + "learning_rate": 1.7275280898876404e-07, + "logps/chosen": -46.38153839111328, + "logps/rejected": -57.915809631347656, + "loss": 0.5002, + "losses/dpo": 0.5204892754554749, + "losses/sft": 1.5103009939193726, + "losses/total": 0.5204892754554749, + "ref_logps/chosen": -38.351890563964844, + "ref_logps/rejected": -43.90599060058594, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.8029646873474121, + "rewards/margins": 0.5980167388916016, + "rewards/rejected": -1.4009814262390137, + "step": 273 + }, + { + "epoch": 2.07, + "grad_norm": 7.764155916683402, + "learning_rate": 1.7134831460674157e-07, + "logps/chosen": -45.19919967651367, + "logps/rejected": -51.13863754272461, + "loss": 0.6283, + "losses/dpo": 0.616185188293457, + "losses/sft": 1.6811277866363525, + "losses/total": 0.616185188293457, + "ref_logps/chosen": -36.72953796386719, + "ref_logps/rejected": -39.85737228393555, + "rewards/accuracies": 0.6171875, + "rewards/chosen": -0.8469663858413696, + "rewards/margins": 0.2811599373817444, + "rewards/rejected": -1.1281262636184692, + "step": 274 + }, + { + "epoch": 2.08, + "grad_norm": 7.020433782892144, + "learning_rate": 1.699438202247191e-07, + "logps/chosen": -43.26371765136719, + "logps/rejected": -52.273712158203125, + "loss": 0.5707, + "losses/dpo": 0.584823727607727, + "losses/sft": 1.7780404090881348, + "losses/total": 0.584823727607727, + "ref_logps/chosen": -34.843936920166016, + "ref_logps/rejected": -39.69860076904297, + "rewards/accuracies": 0.7421875, + "rewards/chosen": -0.8419777154922485, + "rewards/margins": 0.4155334234237671, + "rewards/rejected": -1.2575111389160156, + "step": 275 + }, + { + "epoch": 2.08, + "grad_norm": 7.92264626489854, + "learning_rate": 1.6853932584269663e-07, + "logps/chosen": -47.98881912231445, + "logps/rejected": -56.07038116455078, + "loss": 0.5876, + "losses/dpo": 0.4782869219779968, + "losses/sft": 1.5796866416931152, + "losses/total": 0.4782869219779968, + "ref_logps/chosen": -39.32164764404297, + "ref_logps/rejected": -43.3940315246582, + "rewards/accuracies": 0.6796875, + "rewards/chosen": -0.8667174577713013, + "rewards/margins": 0.40091750025749207, + "rewards/rejected": -1.2676348686218262, + "step": 276 + }, + { + "epoch": 2.09, + "grad_norm": 6.857885259771192, + "learning_rate": 1.6713483146067413e-07, + "logps/chosen": -42.90391159057617, + "logps/rejected": -53.66696548461914, + "loss": 0.5666, + "losses/dpo": 0.6092857122421265, + "losses/sft": 1.6311126947402954, + "losses/total": 0.6092857122421265, + "ref_logps/chosen": -35.07619857788086, + "ref_logps/rejected": -41.65775680541992, + "rewards/accuracies": 0.6796875, + "rewards/chosen": -0.7827714085578918, + "rewards/margins": 0.4181497395038605, + "rewards/rejected": -1.2009210586547852, + "step": 277 + }, + { + "epoch": 2.1, + "grad_norm": 7.0081480343548215, + "learning_rate": 1.6573033707865166e-07, + "logps/chosen": -43.48851013183594, + "logps/rejected": -56.560142517089844, + "loss": 0.5552, + "losses/dpo": 0.47724148631095886, + "losses/sft": 1.4892723560333252, + "losses/total": 0.47724148631095886, + "ref_logps/chosen": -35.48023223876953, + "ref_logps/rejected": -43.66110610961914, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -0.8008283376693726, + "rewards/margins": 0.4890754222869873, + "rewards/rejected": -1.2899038791656494, + "step": 278 + }, + { + "epoch": 2.11, + "grad_norm": 7.160682409155752, + "learning_rate": 1.6432584269662922e-07, + "logps/chosen": -44.21363830566406, + "logps/rejected": -58.09941864013672, + "loss": 0.5245, + "losses/dpo": 0.4419279396533966, + "losses/sft": 1.6377503871917725, + "losses/total": 0.4419279396533966, + "ref_logps/chosen": -36.17689514160156, + "ref_logps/rejected": -44.57625198364258, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8036742210388184, + "rewards/margins": 0.5486425757408142, + "rewards/rejected": -1.3523168563842773, + "step": 279 + }, + { + "epoch": 2.11, + "grad_norm": 7.212324015356224, + "learning_rate": 1.6292134831460675e-07, + "logps/chosen": -44.50836181640625, + "logps/rejected": -51.84413528442383, + "loss": 0.5731, + "losses/dpo": 0.4934471547603607, + "losses/sft": 1.4699177742004395, + "losses/total": 0.4934471547603607, + "ref_logps/chosen": -36.38746643066406, + "ref_logps/rejected": -39.50672149658203, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.8120898604393005, + "rewards/margins": 0.4216514825820923, + "rewards/rejected": -1.233741283416748, + "step": 280 + }, + { + "epoch": 2.12, + "grad_norm": 6.532989304583127, + "learning_rate": 1.6151685393258428e-07, + "logps/chosen": -40.479827880859375, + "logps/rejected": -52.349693298339844, + "loss": 0.5111, + "losses/dpo": 0.5451053380966187, + "losses/sft": 1.5731171369552612, + "losses/total": 0.5451053380966187, + "ref_logps/chosen": -33.727699279785156, + "ref_logps/rejected": -40.0911865234375, + "rewards/accuracies": 0.796875, + "rewards/chosen": -0.6752126216888428, + "rewards/margins": 0.5506378412246704, + "rewards/rejected": -1.2258504629135132, + "step": 281 + }, + { + "epoch": 2.13, + "grad_norm": 6.65644378559116, + "learning_rate": 1.6011235955056178e-07, + "logps/chosen": -42.37626266479492, + "logps/rejected": -53.92717742919922, + "loss": 0.5305, + "losses/dpo": 0.523646354675293, + "losses/sft": 1.5671043395996094, + "losses/total": 0.523646354675293, + "ref_logps/chosen": -35.16848373413086, + "ref_logps/rejected": -41.62503433227539, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.7207781076431274, + "rewards/margins": 0.509436309337616, + "rewards/rejected": -1.2302143573760986, + "step": 282 + }, + { + "epoch": 2.14, + "grad_norm": 7.652354022803428, + "learning_rate": 1.5870786516853931e-07, + "logps/chosen": -45.83367919921875, + "logps/rejected": -57.50337219238281, + "loss": 0.5511, + "losses/dpo": 0.5657609105110168, + "losses/sft": 1.511309266090393, + "losses/total": 0.5657609105110168, + "ref_logps/chosen": -37.572113037109375, + "ref_logps/rejected": -44.38706970214844, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.8261568546295166, + "rewards/margins": 0.4854734539985657, + "rewards/rejected": -1.3116302490234375, + "step": 283 + }, + { + "epoch": 2.14, + "grad_norm": 6.7860253446718, + "learning_rate": 1.5730337078651685e-07, + "logps/chosen": -41.25431823730469, + "logps/rejected": -54.90302658081055, + "loss": 0.5188, + "losses/dpo": 0.5078562498092651, + "losses/sft": 1.5500166416168213, + "losses/total": 0.5078562498092651, + "ref_logps/chosen": -33.7675666809082, + "ref_logps/rejected": -41.983367919921875, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.7486748695373535, + "rewards/margins": 0.5432910919189453, + "rewards/rejected": -1.2919659614562988, + "step": 284 + }, + { + "epoch": 2.15, + "grad_norm": 7.11903493041396, + "learning_rate": 1.5589887640449438e-07, + "logps/chosen": -43.894989013671875, + "logps/rejected": -58.60367202758789, + "loss": 0.4914, + "losses/dpo": 0.5028943419456482, + "losses/sft": 1.594357967376709, + "losses/total": 0.5028943419456482, + "ref_logps/chosen": -36.30883026123047, + "ref_logps/rejected": -45.05226516723633, + "rewards/accuracies": 0.8359375, + "rewards/chosen": -0.7586159706115723, + "rewards/margins": 0.5965246558189392, + "rewards/rejected": -1.3551405668258667, + "step": 285 + }, + { + "epoch": 2.16, + "grad_norm": 7.693954508671863, + "learning_rate": 1.5449438202247188e-07, + "logps/chosen": -48.08583068847656, + "logps/rejected": -53.51144027709961, + "loss": 0.5882, + "losses/dpo": 0.8339239954948425, + "losses/sft": 1.617476224899292, + "losses/total": 0.8339239954948425, + "ref_logps/chosen": -40.02094268798828, + "ref_logps/rejected": -41.421348571777344, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.8064886331558228, + "rewards/margins": 0.4025205969810486, + "rewards/rejected": -1.2090092897415161, + "step": 286 + }, + { + "epoch": 2.17, + "grad_norm": 6.936991994103028, + "learning_rate": 1.5308988764044944e-07, + "logps/chosen": -42.915550231933594, + "logps/rejected": -54.570682525634766, + "loss": 0.5427, + "losses/dpo": 0.570111095905304, + "losses/sft": 1.7627439498901367, + "losses/total": 0.570111095905304, + "ref_logps/chosen": -34.83842468261719, + "ref_logps/rejected": -41.10365295410156, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.8077125549316406, + "rewards/margins": 0.5389906167984009, + "rewards/rejected": -1.346703052520752, + "step": 287 + }, + { + "epoch": 2.17, + "grad_norm": 7.22270723761247, + "learning_rate": 1.5168539325842697e-07, + "logps/chosen": -42.23722457885742, + "logps/rejected": -57.404205322265625, + "loss": 0.529, + "losses/dpo": 0.5674354434013367, + "losses/sft": 1.5719692707061768, + "losses/total": 0.5674354434013367, + "ref_logps/chosen": -34.721920013427734, + "ref_logps/rejected": -44.73695373535156, + "rewards/accuracies": 0.796875, + "rewards/chosen": -0.7515305280685425, + "rewards/margins": 0.5151941180229187, + "rewards/rejected": -1.266724705696106, + "step": 288 + }, + { + "epoch": 2.18, + "grad_norm": 8.53784031336331, + "learning_rate": 1.502808988764045e-07, + "logps/chosen": -48.22527313232422, + "logps/rejected": -58.22871398925781, + "loss": 0.5799, + "losses/dpo": 0.5890235900878906, + "losses/sft": 1.6156002283096313, + "losses/total": 0.5890235900878906, + "ref_logps/chosen": -39.35710525512695, + "ref_logps/rejected": -45.079322814941406, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -0.8868170976638794, + "rewards/margins": 0.4281224012374878, + "rewards/rejected": -1.3149394989013672, + "step": 289 + }, + { + "epoch": 2.19, + "grad_norm": 6.885607484250047, + "learning_rate": 1.4887640449438203e-07, + "logps/chosen": -42.00331115722656, + "logps/rejected": -51.58038330078125, + "loss": 0.5568, + "losses/dpo": 0.602211058139801, + "losses/sft": 1.4960790872573853, + "losses/total": 0.602211058139801, + "ref_logps/chosen": -33.715057373046875, + "ref_logps/rejected": -39.065757751464844, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.8288247585296631, + "rewards/margins": 0.422637403011322, + "rewards/rejected": -1.2514622211456299, + "step": 290 + }, + { + "epoch": 2.2, + "grad_norm": 6.966987508866502, + "learning_rate": 1.4747191011235953e-07, + "logps/chosen": -43.61931228637695, + "logps/rejected": -58.451629638671875, + "loss": 0.5594, + "losses/dpo": 0.5309076309204102, + "losses/sft": 1.636415958404541, + "losses/total": 0.5309076309204102, + "ref_logps/chosen": -34.87947463989258, + "ref_logps/rejected": -44.885459899902344, + "rewards/accuracies": 0.6953125, + "rewards/chosen": -0.87398362159729, + "rewards/margins": 0.48263317346572876, + "rewards/rejected": -1.356616735458374, + "step": 291 + }, + { + "epoch": 2.2, + "grad_norm": 6.8192638080381744, + "learning_rate": 1.4606741573033706e-07, + "logps/chosen": -43.92414855957031, + "logps/rejected": -53.39807891845703, + "loss": 0.5452, + "losses/dpo": 0.6017537713050842, + "losses/sft": 1.7611263990402222, + "losses/total": 0.6017537713050842, + "ref_logps/chosen": -35.702823638916016, + "ref_logps/rejected": -40.34605407714844, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.8221321702003479, + "rewards/margins": 0.4830705225467682, + "rewards/rejected": -1.3052027225494385, + "step": 292 + }, + { + "epoch": 2.21, + "grad_norm": 8.710229032299473, + "learning_rate": 1.446629213483146e-07, + "logps/chosen": -51.8635368347168, + "logps/rejected": -58.34959030151367, + "loss": 0.601, + "losses/dpo": 0.5715082883834839, + "losses/sft": 1.490638017654419, + "losses/total": 0.5715082883834839, + "ref_logps/chosen": -42.414031982421875, + "ref_logps/rejected": -45.42848587036133, + "rewards/accuracies": 0.7265625, + "rewards/chosen": -0.9449502229690552, + "rewards/margins": 0.3471601605415344, + "rewards/rejected": -1.2921103239059448, + "step": 293 + }, + { + "epoch": 2.22, + "grad_norm": 7.6355542087700226, + "learning_rate": 1.4325842696629212e-07, + "logps/chosen": -43.83769607543945, + "logps/rejected": -58.36852264404297, + "loss": 0.546, + "losses/dpo": 0.4579807221889496, + "losses/sft": 1.5301527976989746, + "losses/total": 0.4579807221889496, + "ref_logps/chosen": -35.81403350830078, + "ref_logps/rejected": -44.5776252746582, + "rewards/accuracies": 0.7265625, + "rewards/chosen": -0.802366316318512, + "rewards/margins": 0.5767236948013306, + "rewards/rejected": -1.3790900707244873, + "step": 294 + }, + { + "epoch": 2.23, + "grad_norm": 7.4022075570091195, + "learning_rate": 1.4185393258426968e-07, + "logps/chosen": -44.79059600830078, + "logps/rejected": -59.63528060913086, + "loss": 0.5251, + "losses/dpo": 0.5625388622283936, + "losses/sft": 1.5417966842651367, + "losses/total": 0.5625388622283936, + "ref_logps/chosen": -36.72273254394531, + "ref_logps/rejected": -46.061744689941406, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.806786060333252, + "rewards/margins": 0.5505677461624146, + "rewards/rejected": -1.357353925704956, + "step": 295 + }, + { + "epoch": 2.23, + "grad_norm": 7.092958234931924, + "learning_rate": 1.4044943820224718e-07, + "logps/chosen": -42.923343658447266, + "logps/rejected": -52.593894958496094, + "loss": 0.5582, + "losses/dpo": 0.46777036786079407, + "losses/sft": 1.5354235172271729, + "losses/total": 0.46777036786079407, + "ref_logps/chosen": -35.31480026245117, + "ref_logps/rejected": -40.524574279785156, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.7608542442321777, + "rewards/margins": 0.446077823638916, + "rewards/rejected": -1.2069320678710938, + "step": 296 + }, + { + "epoch": 2.24, + "grad_norm": 7.500648089064134, + "learning_rate": 1.3904494382022472e-07, + "logps/chosen": -43.400211334228516, + "logps/rejected": -54.485557556152344, + "loss": 0.5719, + "losses/dpo": 0.43427377939224243, + "losses/sft": 1.5346068143844604, + "losses/total": 0.43427377939224243, + "ref_logps/chosen": -35.220516204833984, + "ref_logps/rejected": -41.69098663330078, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.8179699182510376, + "rewards/margins": 0.4614875316619873, + "rewards/rejected": -1.2794575691223145, + "step": 297 + }, + { + "epoch": 2.25, + "grad_norm": 6.861133660639989, + "learning_rate": 1.3764044943820225e-07, + "logps/chosen": -40.74993896484375, + "logps/rejected": -55.73876190185547, + "loss": 0.5064, + "losses/dpo": 0.5779513716697693, + "losses/sft": 1.53359055519104, + "losses/total": 0.5779513716697693, + "ref_logps/chosen": -33.70279312133789, + "ref_logps/rejected": -42.673423767089844, + "rewards/accuracies": 0.8046875, + "rewards/chosen": -0.7047147154808044, + "rewards/margins": 0.6018195152282715, + "rewards/rejected": -1.3065342903137207, + "step": 298 + }, + { + "epoch": 2.26, + "grad_norm": 6.77015674340588, + "learning_rate": 1.3623595505617978e-07, + "logps/chosen": -41.57499694824219, + "logps/rejected": -55.820674896240234, + "loss": 0.5056, + "losses/dpo": 0.5236800909042358, + "losses/sft": 1.7500333786010742, + "losses/total": 0.5236800909042358, + "ref_logps/chosen": -34.20399475097656, + "ref_logps/rejected": -42.717681884765625, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.7371004819869995, + "rewards/margins": 0.5731986165046692, + "rewards/rejected": -1.3102991580963135, + "step": 299 + }, + { + "epoch": 2.26, + "grad_norm": 6.520455497747794, + "learning_rate": 1.3483146067415728e-07, + "logps/chosen": -40.784889221191406, + "logps/rejected": -53.35670471191406, + "loss": 0.5158, + "losses/dpo": 0.39004355669021606, + "losses/sft": 1.4663935899734497, + "losses/total": 0.39004355669021606, + "ref_logps/chosen": -33.147525787353516, + "ref_logps/rejected": -39.950157165527344, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.7637366056442261, + "rewards/margins": 0.5769186019897461, + "rewards/rejected": -1.3406550884246826, + "step": 300 + }, + { + "epoch": 2.27, + "grad_norm": 7.96122739673963, + "learning_rate": 1.334269662921348e-07, + "logps/chosen": -46.782169342041016, + "logps/rejected": -52.76530456542969, + "loss": 0.5923, + "losses/dpo": 0.6052607297897339, + "losses/sft": 1.6094651222229004, + "losses/total": 0.6052607297897339, + "ref_logps/chosen": -39.2327880859375, + "ref_logps/rejected": -41.223548889160156, + "rewards/accuracies": 0.7109375, + "rewards/chosen": -0.754938006401062, + "rewards/margins": 0.3992377817630768, + "rewards/rejected": -1.1541757583618164, + "step": 301 + }, + { + "epoch": 2.28, + "grad_norm": 7.751653637126333, + "learning_rate": 1.3202247191011234e-07, + "logps/chosen": -49.38646697998047, + "logps/rejected": -61.543209075927734, + "loss": 0.5327, + "losses/dpo": 0.556348443031311, + "losses/sft": 1.8087131977081299, + "losses/total": 0.556348443031311, + "ref_logps/chosen": -40.26612091064453, + "ref_logps/rejected": -47.002288818359375, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.9120345115661621, + "rewards/margins": 0.5420576930046082, + "rewards/rejected": -1.454092264175415, + "step": 302 + }, + { + "epoch": 2.29, + "grad_norm": 7.798526029135885, + "learning_rate": 1.306179775280899e-07, + "logps/chosen": -43.65242004394531, + "logps/rejected": -57.450496673583984, + "loss": 0.5818, + "losses/dpo": 0.6229327321052551, + "losses/sft": 1.691450834274292, + "losses/total": 0.6229327321052551, + "ref_logps/chosen": -35.82228088378906, + "ref_logps/rejected": -45.80632781982422, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.7830138802528381, + "rewards/margins": 0.3814033269882202, + "rewards/rejected": -1.1644171476364136, + "step": 303 + }, + { + "epoch": 2.29, + "grad_norm": 7.543077771323995, + "learning_rate": 1.2921348314606743e-07, + "logps/chosen": -44.790557861328125, + "logps/rejected": -61.33608627319336, + "loss": 0.5259, + "losses/dpo": 0.648471474647522, + "losses/sft": 1.673068881034851, + "losses/total": 0.648471474647522, + "ref_logps/chosen": -36.91156768798828, + "ref_logps/rejected": -48.083534240722656, + "rewards/accuracies": 0.7734375, + "rewards/chosen": -0.7878991961479187, + "rewards/margins": 0.5373560190200806, + "rewards/rejected": -1.325255274772644, + "step": 304 + }, + { + "epoch": 2.3, + "grad_norm": 7.321395556756757, + "learning_rate": 1.2780898876404493e-07, + "logps/chosen": -45.843082427978516, + "logps/rejected": -57.27900695800781, + "loss": 0.5602, + "losses/dpo": 0.5154864192008972, + "losses/sft": 1.5874884128570557, + "losses/total": 0.5154864192008972, + "ref_logps/chosen": -37.167877197265625, + "ref_logps/rejected": -43.86834716796875, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8675205707550049, + "rewards/margins": 0.4735449552536011, + "rewards/rejected": -1.341065526008606, + "step": 305 + }, + { + "epoch": 2.31, + "grad_norm": 7.180779900116491, + "learning_rate": 1.2640449438202246e-07, + "logps/chosen": -45.17388153076172, + "logps/rejected": -55.727230072021484, + "loss": 0.5217, + "losses/dpo": 0.4565548598766327, + "losses/sft": 1.4454078674316406, + "losses/total": 0.4565548598766327, + "ref_logps/chosen": -37.53833770751953, + "ref_logps/rejected": -42.80052185058594, + "rewards/accuracies": 0.7890625, + "rewards/chosen": -0.7635539770126343, + "rewards/margins": 0.529117226600647, + "rewards/rejected": -1.2926712036132812, + "step": 306 + }, + { + "epoch": 2.32, + "grad_norm": 7.748539788981976, + "learning_rate": 1.25e-07, + "logps/chosen": -45.61334991455078, + "logps/rejected": -49.54269790649414, + "loss": 0.5743, + "losses/dpo": 0.4475148916244507, + "losses/sft": 1.3761274814605713, + "losses/total": 0.4475148916244507, + "ref_logps/chosen": -37.51769256591797, + "ref_logps/rejected": -37.59453582763672, + "rewards/accuracies": 0.6953125, + "rewards/chosen": -0.8095651865005493, + "rewards/margins": 0.38525110483169556, + "rewards/rejected": -1.1948162317276, + "step": 307 + }, + { + "epoch": 2.32, + "grad_norm": 7.42646045779241, + "learning_rate": 1.2359550561797752e-07, + "logps/chosen": -42.217491149902344, + "logps/rejected": -57.62702941894531, + "loss": 0.5103, + "losses/dpo": 0.5864957571029663, + "losses/sft": 1.526570439338684, + "losses/total": 0.5864957571029663, + "ref_logps/chosen": -34.16813659667969, + "ref_logps/rejected": -43.83499526977539, + "rewards/accuracies": 0.7734375, + "rewards/chosen": -0.8049358129501343, + "rewards/margins": 0.5742676258087158, + "rewards/rejected": -1.37920343875885, + "step": 308 + }, + { + "epoch": 2.33, + "grad_norm": 7.402951195988575, + "learning_rate": 1.2219101123595506e-07, + "logps/chosen": -43.753623962402344, + "logps/rejected": -55.725196838378906, + "loss": 0.5457, + "losses/dpo": 0.4583805501461029, + "losses/sft": 1.4125399589538574, + "losses/total": 0.4583805501461029, + "ref_logps/chosen": -35.22587585449219, + "ref_logps/rejected": -42.14164733886719, + "rewards/accuracies": 0.7109375, + "rewards/chosen": -0.8527748584747314, + "rewards/margins": 0.5055804252624512, + "rewards/rejected": -1.3583552837371826, + "step": 309 + }, + { + "epoch": 2.34, + "grad_norm": 8.606023903012021, + "learning_rate": 1.2078651685393259e-07, + "logps/chosen": -52.9200439453125, + "logps/rejected": -61.587310791015625, + "loss": 0.5909, + "losses/dpo": 0.5809124708175659, + "losses/sft": 1.585126280784607, + "losses/total": 0.5809124708175659, + "ref_logps/chosen": -43.860687255859375, + "ref_logps/rejected": -48.097747802734375, + "rewards/accuracies": 0.7109375, + "rewards/chosen": -0.9059357047080994, + "rewards/margins": 0.443020224571228, + "rewards/rejected": -1.3489558696746826, + "step": 310 + }, + { + "epoch": 2.35, + "grad_norm": 7.302220410372284, + "learning_rate": 1.1938202247191012e-07, + "logps/chosen": -43.49970245361328, + "logps/rejected": -58.033485412597656, + "loss": 0.5186, + "losses/dpo": 0.48940473794937134, + "losses/sft": 1.4836596250534058, + "losses/total": 0.48940473794937134, + "ref_logps/chosen": -36.129432678222656, + "ref_logps/rejected": -44.55863952636719, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.7370268702507019, + "rewards/margins": 0.610457181930542, + "rewards/rejected": -1.3474839925765991, + "step": 311 + }, + { + "epoch": 2.35, + "grad_norm": 7.715132554841409, + "learning_rate": 1.1797752808988763e-07, + "logps/chosen": -45.6818733215332, + "logps/rejected": -57.750892639160156, + "loss": 0.5446, + "losses/dpo": 0.6576637625694275, + "losses/sft": 1.6136798858642578, + "losses/total": 0.6576637625694275, + "ref_logps/chosen": -37.14127731323242, + "ref_logps/rejected": -43.89785385131836, + "rewards/accuracies": 0.6953125, + "rewards/chosen": -0.8540595769882202, + "rewards/margins": 0.5312445759773254, + "rewards/rejected": -1.3853040933609009, + "step": 312 + }, + { + "epoch": 2.36, + "grad_norm": 7.103504431576494, + "learning_rate": 1.1657303370786515e-07, + "logps/chosen": -43.971473693847656, + "logps/rejected": -57.290443420410156, + "loss": 0.5204, + "losses/dpo": 0.5861748456954956, + "losses/sft": 1.7284009456634521, + "losses/total": 0.5861748456954956, + "ref_logps/chosen": -35.535255432128906, + "ref_logps/rejected": -43.220550537109375, + "rewards/accuracies": 0.7890625, + "rewards/chosen": -0.8436219096183777, + "rewards/margins": 0.5633664727210999, + "rewards/rejected": -1.4069883823394775, + "step": 313 + }, + { + "epoch": 2.37, + "grad_norm": 7.74539036906925, + "learning_rate": 1.151685393258427e-07, + "logps/chosen": -45.654815673828125, + "logps/rejected": -55.697998046875, + "loss": 0.5716, + "losses/dpo": 0.6496266722679138, + "losses/sft": 1.7458603382110596, + "losses/total": 0.6496266722679138, + "ref_logps/chosen": -37.42692947387695, + "ref_logps/rejected": -43.09944534301758, + "rewards/accuracies": 0.7109375, + "rewards/chosen": -0.8227887749671936, + "rewards/margins": 0.43706685304641724, + "rewards/rejected": -1.2598556280136108, + "step": 314 + }, + { + "epoch": 2.38, + "grad_norm": 7.3016087270783965, + "learning_rate": 1.1376404494382023e-07, + "logps/chosen": -44.80577087402344, + "logps/rejected": -58.83177185058594, + "loss": 0.5616, + "losses/dpo": 0.5281144380569458, + "losses/sft": 1.5373191833496094, + "losses/total": 0.5281144380569458, + "ref_logps/chosen": -35.78607940673828, + "ref_logps/rejected": -44.85388946533203, + "rewards/accuracies": 0.7421875, + "rewards/chosen": -0.9019690155982971, + "rewards/margins": 0.49581989645957947, + "rewards/rejected": -1.3977890014648438, + "step": 315 + }, + { + "epoch": 2.38, + "grad_norm": 7.369973670185862, + "learning_rate": 1.1235955056179774e-07, + "logps/chosen": -44.48060607910156, + "logps/rejected": -57.20940399169922, + "loss": 0.5253, + "losses/dpo": 0.5681818723678589, + "losses/sft": 1.7861613035202026, + "losses/total": 0.5681818723678589, + "ref_logps/chosen": -35.98798751831055, + "ref_logps/rejected": -42.8939094543457, + "rewards/accuracies": 0.7734375, + "rewards/chosen": -0.8492615818977356, + "rewards/margins": 0.5822880268096924, + "rewards/rejected": -1.4315495491027832, + "step": 316 + }, + { + "epoch": 2.39, + "grad_norm": 7.531569588872603, + "learning_rate": 1.1095505617977527e-07, + "logps/chosen": -43.765594482421875, + "logps/rejected": -55.92811965942383, + "loss": 0.5324, + "losses/dpo": 0.5917935371398926, + "losses/sft": 1.6781896352767944, + "losses/total": 0.5917935371398926, + "ref_logps/chosen": -35.10169219970703, + "ref_logps/rejected": -42.32771301269531, + "rewards/accuracies": 0.7890625, + "rewards/chosen": -0.8663901090621948, + "rewards/margins": 0.4936509132385254, + "rewards/rejected": -1.3600411415100098, + "step": 317 + }, + { + "epoch": 2.4, + "grad_norm": 7.00021473521157, + "learning_rate": 1.095505617977528e-07, + "logps/chosen": -43.7101936340332, + "logps/rejected": -55.512020111083984, + "loss": 0.5587, + "losses/dpo": 0.3991687297821045, + "losses/sft": 1.6147840023040771, + "losses/total": 0.3991687297821045, + "ref_logps/chosen": -34.230926513671875, + "ref_logps/rejected": -40.64677810668945, + "rewards/accuracies": 0.7265625, + "rewards/chosen": -0.9479266405105591, + "rewards/margins": 0.5385974049568176, + "rewards/rejected": -1.4865241050720215, + "step": 318 + }, + { + "epoch": 2.41, + "grad_norm": 7.422365442434664, + "learning_rate": 1.0814606741573033e-07, + "logps/chosen": -44.69245910644531, + "logps/rejected": -51.95305252075195, + "loss": 0.5704, + "losses/dpo": 0.5418112277984619, + "losses/sft": 1.3795506954193115, + "losses/total": 0.5418112277984619, + "ref_logps/chosen": -35.97312545776367, + "ref_logps/rejected": -39.2027702331543, + "rewards/accuracies": 0.7265625, + "rewards/chosen": -0.8719329833984375, + "rewards/margins": 0.40309497714042664, + "rewards/rejected": -1.2750279903411865, + "step": 319 + }, + { + "epoch": 2.42, + "grad_norm": 7.787952340155071, + "learning_rate": 1.0674157303370785e-07, + "logps/chosen": -46.54815673828125, + "logps/rejected": -55.3624153137207, + "loss": 0.5672, + "losses/dpo": 0.5258245468139648, + "losses/sft": 1.7972207069396973, + "losses/total": 0.5258245468139648, + "ref_logps/chosen": -38.29623031616211, + "ref_logps/rejected": -42.49595642089844, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.8251928687095642, + "rewards/margins": 0.4614531695842743, + "rewards/rejected": -1.2866460084915161, + "step": 320 + }, + { + "epoch": 2.42, + "grad_norm": 7.716635082250954, + "learning_rate": 1.0533707865168538e-07, + "logps/chosen": -45.037723541259766, + "logps/rejected": -53.17112350463867, + "loss": 0.5781, + "losses/dpo": 0.7091802358627319, + "losses/sft": 1.6653159856796265, + "losses/total": 0.7091802358627319, + "ref_logps/chosen": -35.86992263793945, + "ref_logps/rejected": -39.99578857421875, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.9167801141738892, + "rewards/margins": 0.4007537364959717, + "rewards/rejected": -1.3175339698791504, + "step": 321 + }, + { + "epoch": 2.43, + "grad_norm": 6.913928315105185, + "learning_rate": 1.0393258426966293e-07, + "logps/chosen": -46.575584411621094, + "logps/rejected": -59.92189407348633, + "loss": 0.4903, + "losses/dpo": 0.4884983003139496, + "losses/sft": 1.5409932136535645, + "losses/total": 0.4884983003139496, + "ref_logps/chosen": -39.07024383544922, + "ref_logps/rejected": -45.98902893066406, + "rewards/accuracies": 0.8046875, + "rewards/chosen": -0.750534176826477, + "rewards/margins": 0.6427518725395203, + "rewards/rejected": -1.3932859897613525, + "step": 322 + }, + { + "epoch": 2.44, + "grad_norm": 7.359357410660962, + "learning_rate": 1.0252808988764044e-07, + "logps/chosen": -43.37224578857422, + "logps/rejected": -57.634010314941406, + "loss": 0.5156, + "losses/dpo": 0.4852214455604553, + "losses/sft": 1.7198714017868042, + "losses/total": 0.4852214455604553, + "ref_logps/chosen": -34.993377685546875, + "ref_logps/rejected": -43.477684020996094, + "rewards/accuracies": 0.796875, + "rewards/chosen": -0.8378866910934448, + "rewards/margins": 0.5777460336685181, + "rewards/rejected": -1.415632724761963, + "step": 323 + }, + { + "epoch": 2.45, + "grad_norm": 6.246986321807027, + "learning_rate": 1.0112359550561797e-07, + "logps/chosen": -39.65964889526367, + "logps/rejected": -53.03920364379883, + "loss": 0.4975, + "losses/dpo": 0.4038864076137543, + "losses/sft": 1.4372718334197998, + "losses/total": 0.4038864076137543, + "ref_logps/chosen": -32.10685348510742, + "ref_logps/rejected": -39.401954650878906, + "rewards/accuracies": 0.8046875, + "rewards/chosen": -0.7552794218063354, + "rewards/margins": 0.6084451675415039, + "rewards/rejected": -1.3637245893478394, + "step": 324 + }, + { + "epoch": 2.45, + "grad_norm": 7.331178402164894, + "learning_rate": 9.971910112359549e-08, + "logps/chosen": -44.56993865966797, + "logps/rejected": -59.496734619140625, + "loss": 0.5273, + "losses/dpo": 0.4201672077178955, + "losses/sft": 1.5359259843826294, + "losses/total": 0.4201672077178955, + "ref_logps/chosen": -36.124656677246094, + "ref_logps/rejected": -45.465816497802734, + "rewards/accuracies": 0.7734375, + "rewards/chosen": -0.8445284962654114, + "rewards/margins": 0.558563232421875, + "rewards/rejected": -1.4030916690826416, + "step": 325 + }, + { + "epoch": 2.46, + "grad_norm": 8.06984724769854, + "learning_rate": 9.831460674157303e-08, + "logps/chosen": -48.639495849609375, + "logps/rejected": -56.76270294189453, + "loss": 0.5197, + "losses/dpo": 0.4748002588748932, + "losses/sft": 1.489260196685791, + "losses/total": 0.4748002588748932, + "ref_logps/chosen": -40.237266540527344, + "ref_logps/rejected": -42.580772399902344, + "rewards/accuracies": 0.7734375, + "rewards/chosen": -0.8402228355407715, + "rewards/margins": 0.5779698491096497, + "rewards/rejected": -1.418192744255066, + "step": 326 + }, + { + "epoch": 2.47, + "grad_norm": 7.991421566802235, + "learning_rate": 9.691011235955055e-08, + "logps/chosen": -46.860626220703125, + "logps/rejected": -58.88548278808594, + "loss": 0.5235, + "losses/dpo": 0.6238963603973389, + "losses/sft": 1.7782843112945557, + "losses/total": 0.6238963603973389, + "ref_logps/chosen": -38.90311050415039, + "ref_logps/rejected": -45.0700569152832, + "rewards/accuracies": 0.7734375, + "rewards/chosen": -0.7957516312599182, + "rewards/margins": 0.5857904553413391, + "rewards/rejected": -1.3815419673919678, + "step": 327 + }, + { + "epoch": 2.48, + "grad_norm": 7.266548128858226, + "learning_rate": 9.550561797752808e-08, + "logps/chosen": -42.59172821044922, + "logps/rejected": -52.10871505737305, + "loss": 0.5667, + "losses/dpo": 0.6280735731124878, + "losses/sft": 1.5307084321975708, + "losses/total": 0.6280735731124878, + "ref_logps/chosen": -34.418861389160156, + "ref_logps/rejected": -39.33207702636719, + "rewards/accuracies": 0.6796875, + "rewards/chosen": -0.8172866106033325, + "rewards/margins": 0.46037718653678894, + "rewards/rejected": -1.2776637077331543, + "step": 328 + }, + { + "epoch": 2.48, + "grad_norm": 7.642236003437132, + "learning_rate": 9.410112359550561e-08, + "logps/chosen": -45.72086715698242, + "logps/rejected": -52.531341552734375, + "loss": 0.5663, + "losses/dpo": 0.5168911814689636, + "losses/sft": 1.7978273630142212, + "losses/total": 0.5168911814689636, + "ref_logps/chosen": -37.207481384277344, + "ref_logps/rejected": -39.487640380859375, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -0.851338267326355, + "rewards/margins": 0.4530315101146698, + "rewards/rejected": -1.3043696880340576, + "step": 329 + }, + { + "epoch": 2.49, + "grad_norm": 7.854817727198668, + "learning_rate": 9.269662921348314e-08, + "logps/chosen": -46.91447448730469, + "logps/rejected": -57.34621810913086, + "loss": 0.5504, + "losses/dpo": 0.5696989297866821, + "losses/sft": 1.708069086074829, + "losses/total": 0.5696989297866821, + "ref_logps/chosen": -37.96686553955078, + "ref_logps/rejected": -43.223960876464844, + "rewards/accuracies": 0.7265625, + "rewards/chosen": -0.894761323928833, + "rewards/margins": 0.5174643993377686, + "rewards/rejected": -1.4122257232666016, + "step": 330 + }, + { + "epoch": 2.5, + "grad_norm": 7.286496155272333, + "learning_rate": 9.129213483146067e-08, + "logps/chosen": -44.248069763183594, + "logps/rejected": -60.32553482055664, + "loss": 0.5098, + "losses/dpo": 0.6059004664421082, + "losses/sft": 1.61500883102417, + "losses/total": 0.6059004664421082, + "ref_logps/chosen": -35.856903076171875, + "ref_logps/rejected": -46.063690185546875, + "rewards/accuracies": 0.7734375, + "rewards/chosen": -0.839116632938385, + "rewards/margins": 0.5870682001113892, + "rewards/rejected": -1.426184892654419, + "step": 331 + }, + { + "epoch": 2.51, + "grad_norm": 7.873630188834188, + "learning_rate": 8.988764044943819e-08, + "logps/chosen": -47.52843475341797, + "logps/rejected": -58.742042541503906, + "loss": 0.5465, + "losses/dpo": 0.44472765922546387, + "losses/sft": 1.8056182861328125, + "losses/total": 0.44472765922546387, + "ref_logps/chosen": -37.702247619628906, + "ref_logps/rejected": -43.76355743408203, + "rewards/accuracies": 0.6953125, + "rewards/chosen": -0.982619047164917, + "rewards/margins": 0.5152289867401123, + "rewards/rejected": -1.4978480339050293, + "step": 332 + }, + { + "epoch": 2.51, + "grad_norm": 7.654569968967968, + "learning_rate": 8.848314606741572e-08, + "logps/chosen": -45.35044860839844, + "logps/rejected": -54.683128356933594, + "loss": 0.5477, + "losses/dpo": 0.6351585388183594, + "losses/sft": 1.465951681137085, + "losses/total": 0.6351585388183594, + "ref_logps/chosen": -37.354312896728516, + "ref_logps/rejected": -41.636627197265625, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.7996135354042053, + "rewards/margins": 0.505035936832428, + "rewards/rejected": -1.3046493530273438, + "step": 333 + }, + { + "epoch": 2.52, + "grad_norm": 7.093489997148873, + "learning_rate": 8.707865168539325e-08, + "logps/chosen": -44.989524841308594, + "logps/rejected": -56.54049301147461, + "loss": 0.5235, + "losses/dpo": 0.6136016845703125, + "losses/sft": 1.876564860343933, + "losses/total": 0.6136016845703125, + "ref_logps/chosen": -36.00385284423828, + "ref_logps/rejected": -41.88980484008789, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.8985673785209656, + "rewards/margins": 0.566501259803772, + "rewards/rejected": -1.4650685787200928, + "step": 334 + }, + { + "epoch": 2.53, + "grad_norm": 7.6289229336667, + "learning_rate": 8.567415730337078e-08, + "logps/chosen": -45.615413665771484, + "logps/rejected": -56.45619201660156, + "loss": 0.5586, + "losses/dpo": 0.5084734559059143, + "losses/sft": 1.6048380136489868, + "losses/total": 0.5084734559059143, + "ref_logps/chosen": -36.61585998535156, + "ref_logps/rejected": -42.79827880859375, + "rewards/accuracies": 0.6953125, + "rewards/chosen": -0.8999553322792053, + "rewards/margins": 0.4658358097076416, + "rewards/rejected": -1.3657910823822021, + "step": 335 + }, + { + "epoch": 2.54, + "grad_norm": 8.275598752517682, + "learning_rate": 8.426966292134831e-08, + "logps/chosen": -47.839508056640625, + "logps/rejected": -61.7794303894043, + "loss": 0.5368, + "losses/dpo": 0.5232934355735779, + "losses/sft": 1.4998161792755127, + "losses/total": 0.5232934355735779, + "ref_logps/chosen": -38.512969970703125, + "ref_logps/rejected": -46.98280334472656, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.9326539039611816, + "rewards/margins": 0.5470089316368103, + "rewards/rejected": -1.4796628952026367, + "step": 336 + }, + { + "epoch": 2.54, + "grad_norm": 6.837098147362294, + "learning_rate": 8.286516853932583e-08, + "logps/chosen": -42.03435516357422, + "logps/rejected": -58.38957214355469, + "loss": 0.4918, + "losses/dpo": 0.34719789028167725, + "losses/sft": 1.4158234596252441, + "losses/total": 0.34719789028167725, + "ref_logps/chosen": -34.547210693359375, + "ref_logps/rejected": -44.05992889404297, + "rewards/accuracies": 0.7265625, + "rewards/chosen": -0.7487142086029053, + "rewards/margins": 0.6842500567436218, + "rewards/rejected": -1.4329640865325928, + "step": 337 + }, + { + "epoch": 2.55, + "grad_norm": 8.253144412756335, + "learning_rate": 8.146067415730337e-08, + "logps/chosen": -45.23094940185547, + "logps/rejected": -53.472965240478516, + "loss": 0.595, + "losses/dpo": 0.5666919350624084, + "losses/sft": 1.5198816061019897, + "losses/total": 0.5666919350624084, + "ref_logps/chosen": -36.06470489501953, + "ref_logps/rejected": -40.16011047363281, + "rewards/accuracies": 0.6796875, + "rewards/chosen": -0.9166238903999329, + "rewards/margins": 0.4146617650985718, + "rewards/rejected": -1.3312857151031494, + "step": 338 + }, + { + "epoch": 2.56, + "grad_norm": 8.149761017487126, + "learning_rate": 8.005617977528089e-08, + "logps/chosen": -45.32318115234375, + "logps/rejected": -51.304725646972656, + "loss": 0.6056, + "losses/dpo": 0.466902494430542, + "losses/sft": 1.4729348421096802, + "losses/total": 0.466902494430542, + "ref_logps/chosen": -35.85576248168945, + "ref_logps/rejected": -37.858970642089844, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.9467417597770691, + "rewards/margins": 0.39783352613449097, + "rewards/rejected": -1.34457528591156, + "step": 339 + }, + { + "epoch": 2.57, + "grad_norm": 7.054066859896987, + "learning_rate": 7.865168539325842e-08, + "logps/chosen": -45.38795471191406, + "logps/rejected": -57.93950653076172, + "loss": 0.5182, + "losses/dpo": 0.4842032194137573, + "losses/sft": 1.6942293643951416, + "losses/total": 0.4842032194137573, + "ref_logps/chosen": -36.484046936035156, + "ref_logps/rejected": -43.67852783203125, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.8903906345367432, + "rewards/margins": 0.5357075333595276, + "rewards/rejected": -1.426098108291626, + "step": 340 + }, + { + "epoch": 2.57, + "grad_norm": 7.258837050647915, + "learning_rate": 7.724719101123594e-08, + "logps/chosen": -46.00672149658203, + "logps/rejected": -59.08924865722656, + "loss": 0.5373, + "losses/dpo": 0.5424889326095581, + "losses/sft": 1.6475489139556885, + "losses/total": 0.5424889326095581, + "ref_logps/chosen": -37.73750305175781, + "ref_logps/rejected": -45.23866271972656, + "rewards/accuracies": 0.7421875, + "rewards/chosen": -0.826921820640564, + "rewards/margins": 0.5581368207931519, + "rewards/rejected": -1.3850586414337158, + "step": 341 + }, + { + "epoch": 2.58, + "grad_norm": 7.988126049073018, + "learning_rate": 7.584269662921348e-08, + "logps/chosen": -46.84196472167969, + "logps/rejected": -55.757198333740234, + "loss": 0.5662, + "losses/dpo": 0.353384792804718, + "losses/sft": 1.717570424079895, + "losses/total": 0.353384792804718, + "ref_logps/chosen": -37.82433319091797, + "ref_logps/rejected": -42.26597213745117, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9017627835273743, + "rewards/margins": 0.44735997915267944, + "rewards/rejected": -1.3491227626800537, + "step": 342 + }, + { + "epoch": 2.59, + "grad_norm": 7.618340962447428, + "learning_rate": 7.443820224719101e-08, + "logps/chosen": -43.753684997558594, + "logps/rejected": -55.105316162109375, + "loss": 0.546, + "losses/dpo": 0.6419227123260498, + "losses/sft": 1.6892149448394775, + "losses/total": 0.6419227123260498, + "ref_logps/chosen": -35.38850402832031, + "ref_logps/rejected": -41.928646087646484, + "rewards/accuracies": 0.7578125, + "rewards/chosen": -0.8365182876586914, + "rewards/margins": 0.4811485707759857, + "rewards/rejected": -1.3176668882369995, + "step": 343 + }, + { + "epoch": 2.6, + "grad_norm": 7.5691903171304915, + "learning_rate": 7.303370786516853e-08, + "logps/chosen": -44.20778274536133, + "logps/rejected": -55.97998046875, + "loss": 0.5407, + "losses/dpo": 0.5625724196434021, + "losses/sft": 1.5753792524337769, + "losses/total": 0.5625724196434021, + "ref_logps/chosen": -35.950294494628906, + "ref_logps/rejected": -42.38732147216797, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.8257489204406738, + "rewards/margins": 0.5335172414779663, + "rewards/rejected": -1.3592660427093506, + "step": 344 + }, + { + "epoch": 2.6, + "grad_norm": 7.013411401019271, + "learning_rate": 7.162921348314606e-08, + "logps/chosen": -48.60981750488281, + "logps/rejected": -61.60570526123047, + "loss": 0.4779, + "losses/dpo": 0.5409685373306274, + "losses/sft": 1.6795134544372559, + "losses/total": 0.5409685373306274, + "ref_logps/chosen": -39.66438293457031, + "ref_logps/rejected": -45.88689422607422, + "rewards/accuracies": 0.8046875, + "rewards/chosen": -0.8945437073707581, + "rewards/margins": 0.677337646484375, + "rewards/rejected": -1.5718812942504883, + "step": 345 + }, + { + "epoch": 2.61, + "grad_norm": 7.235335525204532, + "learning_rate": 7.022471910112359e-08, + "logps/chosen": -40.0158805847168, + "logps/rejected": -53.02748107910156, + "loss": 0.5321, + "losses/dpo": 0.5608981847763062, + "losses/sft": 1.2928898334503174, + "losses/total": 0.5608981847763062, + "ref_logps/chosen": -32.48070526123047, + "ref_logps/rejected": -40.139122009277344, + "rewards/accuracies": 0.7109375, + "rewards/chosen": -0.7535171508789062, + "rewards/margins": 0.5353185534477234, + "rewards/rejected": -1.2888355255126953, + "step": 346 + }, + { + "epoch": 2.62, + "grad_norm": 7.561289554463479, + "learning_rate": 6.882022471910112e-08, + "logps/chosen": -45.8831787109375, + "logps/rejected": -52.74605178833008, + "loss": 0.5634, + "losses/dpo": 0.5234625935554504, + "losses/sft": 1.5698529481887817, + "losses/total": 0.5234625935554504, + "ref_logps/chosen": -36.858985900878906, + "ref_logps/rejected": -39.08251953125, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.9024193286895752, + "rewards/margins": 0.4639340043067932, + "rewards/rejected": -1.3663533926010132, + "step": 347 + }, + { + "epoch": 2.63, + "grad_norm": 7.714313510104845, + "learning_rate": 6.741573033707864e-08, + "logps/chosen": -47.23927307128906, + "logps/rejected": -56.10950469970703, + "loss": 0.5513, + "losses/dpo": 0.5834592580795288, + "losses/sft": 1.8191860914230347, + "losses/total": 0.5834592580795288, + "ref_logps/chosen": -38.12981414794922, + "ref_logps/rejected": -41.886940002441406, + "rewards/accuracies": 0.6953125, + "rewards/chosen": -0.9109456539154053, + "rewards/margins": 0.5113106966018677, + "rewards/rejected": -1.422256350517273, + "step": 348 + }, + { + "epoch": 2.63, + "grad_norm": 7.158492594820948, + "learning_rate": 6.601123595505617e-08, + "logps/chosen": -48.13493347167969, + "logps/rejected": -60.77044677734375, + "loss": 0.4977, + "losses/dpo": 0.36096107959747314, + "losses/sft": 1.417677640914917, + "losses/total": 0.36096107959747314, + "ref_logps/chosen": -39.71112823486328, + "ref_logps/rejected": -45.55910110473633, + "rewards/accuracies": 0.8203125, + "rewards/chosen": -0.8423808813095093, + "rewards/margins": 0.6787533164024353, + "rewards/rejected": -1.5211341381072998, + "step": 349 + }, + { + "epoch": 2.64, + "grad_norm": 7.665270223156107, + "learning_rate": 6.460674157303371e-08, + "logps/chosen": -45.67338562011719, + "logps/rejected": -52.81538391113281, + "loss": 0.5593, + "losses/dpo": 0.5008495450019836, + "losses/sft": 1.4033509492874146, + "losses/total": 0.5008495450019836, + "ref_logps/chosen": -37.30530548095703, + "ref_logps/rejected": -39.90568542480469, + "rewards/accuracies": 0.7421875, + "rewards/chosen": -0.8368085026741028, + "rewards/margins": 0.45416122674942017, + "rewards/rejected": -1.290969729423523, + "step": 350 + }, + { + "epoch": 2.65, + "grad_norm": 7.472808082581494, + "learning_rate": 6.320224719101123e-08, + "logps/chosen": -43.985435485839844, + "logps/rejected": -57.295692443847656, + "loss": 0.5187, + "losses/dpo": 0.5680770874023438, + "losses/sft": 1.4148482084274292, + "losses/total": 0.5680770874023438, + "ref_logps/chosen": -36.4133186340332, + "ref_logps/rejected": -43.849090576171875, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.757211446762085, + "rewards/margins": 0.5874490737915039, + "rewards/rejected": -1.3446605205535889, + "step": 351 + }, + { + "epoch": 2.66, + "grad_norm": 8.189112257010201, + "learning_rate": 6.179775280898876e-08, + "logps/chosen": -47.502281188964844, + "logps/rejected": -54.84540939331055, + "loss": 0.583, + "losses/dpo": 0.5579338073730469, + "losses/sft": 1.615804672241211, + "losses/total": 0.5579338073730469, + "ref_logps/chosen": -38.33686065673828, + "ref_logps/rejected": -41.62626647949219, + "rewards/accuracies": 0.6953125, + "rewards/chosen": -0.9165424108505249, + "rewards/margins": 0.40537166595458984, + "rewards/rejected": -1.3219139575958252, + "step": 352 + }, + { + "epoch": 2.66, + "grad_norm": 7.805410708655585, + "learning_rate": 6.039325842696629e-08, + "logps/chosen": -44.361324310302734, + "logps/rejected": -59.17631149291992, + "loss": 0.5472, + "losses/dpo": 0.6015689373016357, + "losses/sft": 1.6676236391067505, + "losses/total": 0.6015689373016357, + "ref_logps/chosen": -35.77091979980469, + "ref_logps/rejected": -45.55202865600586, + "rewards/accuracies": 0.7421875, + "rewards/chosen": -0.8590403199195862, + "rewards/margins": 0.5033884048461914, + "rewards/rejected": -1.362428903579712, + "step": 353 + }, + { + "epoch": 2.67, + "grad_norm": 8.028184259224918, + "learning_rate": 5.898876404494382e-08, + "logps/chosen": -46.5517463684082, + "logps/rejected": -56.04482650756836, + "loss": 0.5542, + "losses/dpo": 0.6546050310134888, + "losses/sft": 1.504585862159729, + "losses/total": 0.6546050310134888, + "ref_logps/chosen": -37.785274505615234, + "ref_logps/rejected": -42.51100540161133, + "rewards/accuracies": 0.7265625, + "rewards/chosen": -0.8766471147537231, + "rewards/margins": 0.47673481702804565, + "rewards/rejected": -1.3533821105957031, + "step": 354 + }, + { + "epoch": 2.68, + "grad_norm": 7.260620206463691, + "learning_rate": 5.758426966292135e-08, + "logps/chosen": -48.80982971191406, + "logps/rejected": -56.19672393798828, + "loss": 0.5515, + "losses/dpo": 0.46307122707366943, + "losses/sft": 1.685928463935852, + "losses/total": 0.46307122707366943, + "ref_logps/chosen": -40.1851806640625, + "ref_logps/rejected": -43.02751159667969, + "rewards/accuracies": 0.7734375, + "rewards/chosen": -0.8624651432037354, + "rewards/margins": 0.4544559121131897, + "rewards/rejected": -1.3169212341308594, + "step": 355 + }, + { + "epoch": 2.69, + "grad_norm": 7.136349320311039, + "learning_rate": 5.617977528089887e-08, + "logps/chosen": -41.40632629394531, + "logps/rejected": -54.12514114379883, + "loss": 0.5343, + "losses/dpo": 0.45047110319137573, + "losses/sft": 1.3219261169433594, + "losses/total": 0.45047110319137573, + "ref_logps/chosen": -33.34068298339844, + "ref_logps/rejected": -40.50140380859375, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.8065648078918457, + "rewards/margins": 0.555808961391449, + "rewards/rejected": -1.3623738288879395, + "step": 356 + }, + { + "epoch": 2.69, + "grad_norm": 7.456298216594317, + "learning_rate": 5.47752808988764e-08, + "logps/chosen": -44.206722259521484, + "logps/rejected": -55.71735382080078, + "loss": 0.5494, + "losses/dpo": 0.4734205901622772, + "losses/sft": 1.4844837188720703, + "losses/total": 0.4734205901622772, + "ref_logps/chosen": -35.47336959838867, + "ref_logps/rejected": -41.92726516723633, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.8733350038528442, + "rewards/margins": 0.5056736469268799, + "rewards/rejected": -1.3790085315704346, + "step": 357 + }, + { + "epoch": 2.7, + "grad_norm": 7.41987426694341, + "learning_rate": 5.3370786516853926e-08, + "logps/chosen": -46.22618865966797, + "logps/rejected": -56.47550964355469, + "loss": 0.5003, + "losses/dpo": 0.562317430973053, + "losses/sft": 1.491492509841919, + "losses/total": 0.562317430973053, + "ref_logps/chosen": -37.904022216796875, + "ref_logps/rejected": -41.78309631347656, + "rewards/accuracies": 0.8046875, + "rewards/chosen": -0.8322172164916992, + "rewards/margins": 0.6370242834091187, + "rewards/rejected": -1.4692414999008179, + "step": 358 + }, + { + "epoch": 2.71, + "grad_norm": 6.765690296642083, + "learning_rate": 5.196629213483146e-08, + "logps/chosen": -41.32649612426758, + "logps/rejected": -55.117488861083984, + "loss": 0.4906, + "losses/dpo": 0.45937132835388184, + "losses/sft": 1.3386218547821045, + "losses/total": 0.45937132835388184, + "ref_logps/chosen": -33.87388610839844, + "ref_logps/rejected": -41.34483337402344, + "rewards/accuracies": 0.7734375, + "rewards/chosen": -0.7452608942985535, + "rewards/margins": 0.6320046782493591, + "rewards/rejected": -1.377265453338623, + "step": 359 + }, + { + "epoch": 2.72, + "grad_norm": 7.95832621655637, + "learning_rate": 5.056179775280899e-08, + "logps/chosen": -44.83673858642578, + "logps/rejected": -54.593666076660156, + "loss": 0.5529, + "losses/dpo": 0.5646368861198425, + "losses/sft": 1.3903212547302246, + "losses/total": 0.5646368861198425, + "ref_logps/chosen": -36.184173583984375, + "ref_logps/rejected": -41.40753173828125, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.865256667137146, + "rewards/margins": 0.45335638523101807, + "rewards/rejected": -1.318613052368164, + "step": 360 + }, + { + "epoch": 2.72, + "grad_norm": 8.359120516602266, + "learning_rate": 4.915730337078652e-08, + "logps/chosen": -48.39961624145508, + "logps/rejected": -54.2540397644043, + "loss": 0.5967, + "losses/dpo": 0.7173389196395874, + "losses/sft": 1.989745020866394, + "losses/total": 0.7173389196395874, + "ref_logps/chosen": -39.236839294433594, + "ref_logps/rejected": -40.71720504760742, + "rewards/accuracies": 0.6796875, + "rewards/chosen": -0.9162774682044983, + "rewards/margins": 0.4374057650566101, + "rewards/rejected": -1.3536832332611084, + "step": 361 + }, + { + "epoch": 2.73, + "grad_norm": 7.8949116422203645, + "learning_rate": 4.775280898876404e-08, + "logps/chosen": -45.1904296875, + "logps/rejected": -55.9586181640625, + "loss": 0.5313, + "losses/dpo": 0.6307837963104248, + "losses/sft": 1.725508213043213, + "losses/total": 0.6307837963104248, + "ref_logps/chosen": -36.340023040771484, + "ref_logps/rejected": -41.507598876953125, + "rewards/accuracies": 0.7578125, + "rewards/chosen": -0.8850406408309937, + "rewards/margins": 0.5600608587265015, + "rewards/rejected": -1.4451014995574951, + "step": 362 + }, + { + "epoch": 2.74, + "grad_norm": 7.439921856307659, + "learning_rate": 4.634831460674157e-08, + "logps/chosen": -47.467689514160156, + "logps/rejected": -55.250770568847656, + "loss": 0.5418, + "losses/dpo": 0.6079765558242798, + "losses/sft": 1.8188178539276123, + "losses/total": 0.6079765558242798, + "ref_logps/chosen": -38.65827560424805, + "ref_logps/rejected": -41.087459564208984, + "rewards/accuracies": 0.6796875, + "rewards/chosen": -0.8809411525726318, + "rewards/margins": 0.53538978099823, + "rewards/rejected": -1.4163308143615723, + "step": 363 + }, + { + "epoch": 2.75, + "grad_norm": 7.250730071839225, + "learning_rate": 4.4943820224719096e-08, + "logps/chosen": -42.77532196044922, + "logps/rejected": -59.660240173339844, + "loss": 0.4573, + "losses/dpo": 0.4384981393814087, + "losses/sft": 1.4787318706512451, + "losses/total": 0.4384981393814087, + "ref_logps/chosen": -35.051734924316406, + "ref_logps/rejected": -44.746734619140625, + "rewards/accuracies": 0.8671875, + "rewards/chosen": -0.7723584175109863, + "rewards/margins": 0.7189919948577881, + "rewards/rejected": -1.4913504123687744, + "step": 364 + }, + { + "epoch": 2.75, + "grad_norm": 8.23044878029811, + "learning_rate": 4.3539325842696626e-08, + "logps/chosen": -49.66007995605469, + "logps/rejected": -60.10342025756836, + "loss": 0.5469, + "losses/dpo": 0.5084363222122192, + "losses/sft": 1.8791687488555908, + "losses/total": 0.5084363222122192, + "ref_logps/chosen": -40.346317291259766, + "ref_logps/rejected": -45.41026306152344, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.9313763380050659, + "rewards/margins": 0.5379395484924316, + "rewards/rejected": -1.469315767288208, + "step": 365 + }, + { + "epoch": 2.76, + "grad_norm": 7.298326331639276, + "learning_rate": 4.213483146067416e-08, + "logps/chosen": -48.57288360595703, + "logps/rejected": -57.29835510253906, + "loss": 0.5395, + "losses/dpo": 0.44340649247169495, + "losses/sft": 1.5243843793869019, + "losses/total": 0.44340649247169495, + "ref_logps/chosen": -39.81139373779297, + "ref_logps/rejected": -43.033912658691406, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.8761484622955322, + "rewards/margins": 0.5502957701683044, + "rewards/rejected": -1.4264442920684814, + "step": 366 + }, + { + "epoch": 2.77, + "grad_norm": 8.51804253270616, + "learning_rate": 4.073033707865169e-08, + "logps/chosen": -44.41961669921875, + "logps/rejected": -53.89155578613281, + "loss": 0.5723, + "losses/dpo": 0.5301268100738525, + "losses/sft": 1.8131489753723145, + "losses/total": 0.5301268100738525, + "ref_logps/chosen": -35.50189971923828, + "ref_logps/rejected": -40.51585388183594, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8917717933654785, + "rewards/margins": 0.44579851627349854, + "rewards/rejected": -1.337570309638977, + "step": 367 + }, + { + "epoch": 2.78, + "grad_norm": 7.54105725557247, + "learning_rate": 3.932584269662921e-08, + "logps/chosen": -41.12848663330078, + "logps/rejected": -55.290313720703125, + "loss": 0.54, + "losses/dpo": 0.49226510524749756, + "losses/sft": 1.37047278881073, + "losses/total": 0.49226510524749756, + "ref_logps/chosen": -32.751502990722656, + "ref_logps/rejected": -41.476318359375, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8376982808113098, + "rewards/margins": 0.5437013506889343, + "rewards/rejected": -1.3813996315002441, + "step": 368 + }, + { + "epoch": 2.78, + "grad_norm": 7.706900297271427, + "learning_rate": 3.792134831460674e-08, + "logps/chosen": -45.84465408325195, + "logps/rejected": -56.17218780517578, + "loss": 0.5373, + "losses/dpo": 0.5797220468521118, + "losses/sft": 1.6374412775039673, + "losses/total": 0.5797220468521118, + "ref_logps/chosen": -36.70783233642578, + "ref_logps/rejected": -41.77714538574219, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.913682222366333, + "rewards/margins": 0.5258220434188843, + "rewards/rejected": -1.4395041465759277, + "step": 369 + }, + { + "epoch": 2.79, + "grad_norm": 7.7547810769646555, + "learning_rate": 3.6516853932584266e-08, + "logps/chosen": -42.759098052978516, + "logps/rejected": -52.87897491455078, + "loss": 0.5836, + "losses/dpo": 0.7289267778396606, + "losses/sft": 1.7013481855392456, + "losses/total": 0.7289267778396606, + "ref_logps/chosen": -34.204769134521484, + "ref_logps/rejected": -40.38142776489258, + "rewards/accuracies": 0.7109375, + "rewards/chosen": -0.85543292760849, + "rewards/margins": 0.3943214416503906, + "rewards/rejected": -1.2497543096542358, + "step": 370 + }, + { + "epoch": 2.8, + "grad_norm": 7.07250671481464, + "learning_rate": 3.5112359550561796e-08, + "logps/chosen": -45.09293746948242, + "logps/rejected": -56.41200256347656, + "loss": 0.5025, + "losses/dpo": 0.42966747283935547, + "losses/sft": 1.5621216297149658, + "losses/total": 0.42966747283935547, + "ref_logps/chosen": -37.37934494018555, + "ref_logps/rejected": -42.70643615722656, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.7713593244552612, + "rewards/margins": 0.599197506904602, + "rewards/rejected": -1.3705568313598633, + "step": 371 + }, + { + "epoch": 2.81, + "grad_norm": 8.204140371424504, + "learning_rate": 3.370786516853932e-08, + "logps/chosen": -48.024269104003906, + "logps/rejected": -57.5866584777832, + "loss": 0.5389, + "losses/dpo": 0.5919984579086304, + "losses/sft": 1.4933536052703857, + "losses/total": 0.5919984579086304, + "ref_logps/chosen": -39.3967399597168, + "ref_logps/rejected": -43.82619857788086, + "rewards/accuracies": 0.7734375, + "rewards/chosen": -0.8627532124519348, + "rewards/margins": 0.5132932662963867, + "rewards/rejected": -1.3760464191436768, + "step": 372 + }, + { + "epoch": 2.82, + "grad_norm": 8.113226187403898, + "learning_rate": 3.230337078651686e-08, + "logps/chosen": -46.628257751464844, + "logps/rejected": -62.4483642578125, + "loss": 0.4997, + "losses/dpo": 0.4237878918647766, + "losses/sft": 1.5488381385803223, + "losses/total": 0.4237878918647766, + "ref_logps/chosen": -38.190887451171875, + "ref_logps/rejected": -47.450843811035156, + "rewards/accuracies": 0.7890625, + "rewards/chosen": -0.8437370657920837, + "rewards/margins": 0.6560153961181641, + "rewards/rejected": -1.4997525215148926, + "step": 373 + }, + { + "epoch": 2.82, + "grad_norm": 7.437009897432824, + "learning_rate": 3.089887640449438e-08, + "logps/chosen": -44.36549377441406, + "logps/rejected": -59.504005432128906, + "loss": 0.4967, + "losses/dpo": 0.42525550723075867, + "losses/sft": 1.5591559410095215, + "losses/total": 0.42525550723075867, + "ref_logps/chosen": -35.93299102783203, + "ref_logps/rejected": -44.88318634033203, + "rewards/accuracies": 0.7890625, + "rewards/chosen": -0.8432497978210449, + "rewards/margins": 0.6188317537307739, + "rewards/rejected": -1.4620814323425293, + "step": 374 + }, + { + "epoch": 2.83, + "grad_norm": 6.665463460188975, + "learning_rate": 2.949438202247191e-08, + "logps/chosen": -42.69816970825195, + "logps/rejected": -59.00178909301758, + "loss": 0.4858, + "losses/dpo": 0.40338996052742004, + "losses/sft": 1.7176090478897095, + "losses/total": 0.40338996052742004, + "ref_logps/chosen": -34.84870147705078, + "ref_logps/rejected": -44.43999481201172, + "rewards/accuracies": 0.8203125, + "rewards/chosen": -0.7849469184875488, + "rewards/margins": 0.6712321639060974, + "rewards/rejected": -1.456179141998291, + "step": 375 + }, + { + "epoch": 2.84, + "grad_norm": 7.288923139558504, + "learning_rate": 2.8089887640449436e-08, + "logps/chosen": -47.2071533203125, + "logps/rejected": -58.098148345947266, + "loss": 0.5076, + "losses/dpo": 0.5357474088668823, + "losses/sft": 1.654085636138916, + "losses/total": 0.5357474088668823, + "ref_logps/chosen": -38.56999588012695, + "ref_logps/rejected": -43.37090301513672, + "rewards/accuracies": 0.7734375, + "rewards/chosen": -0.8637155890464783, + "rewards/margins": 0.6090089678764343, + "rewards/rejected": -1.4727245569229126, + "step": 376 + }, + { + "epoch": 2.85, + "grad_norm": 7.551625004814956, + "learning_rate": 2.6685393258426963e-08, + "logps/chosen": -45.72412109375, + "logps/rejected": -56.43421173095703, + "loss": 0.5474, + "losses/dpo": 0.544715404510498, + "losses/sft": 1.5618551969528198, + "losses/total": 0.544715404510498, + "ref_logps/chosen": -37.27866744995117, + "ref_logps/rejected": -42.84328079223633, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8445456027984619, + "rewards/margins": 0.5145478248596191, + "rewards/rejected": -1.359093427658081, + "step": 377 + }, + { + "epoch": 2.85, + "grad_norm": 8.025864212794117, + "learning_rate": 2.5280898876404493e-08, + "logps/chosen": -45.621158599853516, + "logps/rejected": -60.68471145629883, + "loss": 0.5285, + "losses/dpo": 0.5538164377212524, + "losses/sft": 1.5718330144882202, + "losses/total": 0.5538164377212524, + "ref_logps/chosen": -36.82067108154297, + "ref_logps/rejected": -46.61594009399414, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8800492286682129, + "rewards/margins": 0.5268282890319824, + "rewards/rejected": -1.4068775177001953, + "step": 378 + }, + { + "epoch": 2.86, + "grad_norm": 7.047129440366168, + "learning_rate": 2.387640449438202e-08, + "logps/chosen": -44.4951057434082, + "logps/rejected": -50.4869499206543, + "loss": 0.549, + "losses/dpo": 0.490747332572937, + "losses/sft": 1.6444151401519775, + "losses/total": 0.490747332572937, + "ref_logps/chosen": -36.696903228759766, + "ref_logps/rejected": -37.55984878540039, + "rewards/accuracies": 0.671875, + "rewards/chosen": -0.7798205614089966, + "rewards/margins": 0.5128894448280334, + "rewards/rejected": -1.2927099466323853, + "step": 379 + }, + { + "epoch": 2.87, + "grad_norm": 7.471899506757266, + "learning_rate": 2.2471910112359548e-08, + "logps/chosen": -47.25148391723633, + "logps/rejected": -59.717864990234375, + "loss": 0.5358, + "losses/dpo": 0.6100134253501892, + "losses/sft": 1.9196665287017822, + "losses/total": 0.6100134253501892, + "ref_logps/chosen": -37.96227264404297, + "ref_logps/rejected": -44.814449310302734, + "rewards/accuracies": 0.7109375, + "rewards/chosen": -0.9289212226867676, + "rewards/margins": 0.5614204406738281, + "rewards/rejected": -1.4903416633605957, + "step": 380 + }, + { + "epoch": 2.88, + "grad_norm": 8.004493640627455, + "learning_rate": 2.106741573033708e-08, + "logps/chosen": -44.516780853271484, + "logps/rejected": -51.935089111328125, + "loss": 0.6119, + "losses/dpo": 0.556452751159668, + "losses/sft": 1.4079639911651611, + "losses/total": 0.556452751159668, + "ref_logps/chosen": -35.908897399902344, + "ref_logps/rejected": -39.84672546386719, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -0.8607881665229797, + "rewards/margins": 0.34804895520210266, + "rewards/rejected": -1.2088370323181152, + "step": 381 + }, + { + "epoch": 2.88, + "grad_norm": 7.393197706656567, + "learning_rate": 1.9662921348314606e-08, + "logps/chosen": -44.82762908935547, + "logps/rejected": -59.140480041503906, + "loss": 0.509, + "losses/dpo": 0.420447438955307, + "losses/sft": 1.7410156726837158, + "losses/total": 0.420447438955307, + "ref_logps/chosen": -36.08792495727539, + "ref_logps/rejected": -43.82966613769531, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.8739705085754395, + "rewards/margins": 0.6571108102798462, + "rewards/rejected": -1.5310813188552856, + "step": 382 + }, + { + "epoch": 2.89, + "grad_norm": 7.992863219139361, + "learning_rate": 1.8258426966292133e-08, + "logps/chosen": -45.79706573486328, + "logps/rejected": -53.040687561035156, + "loss": 0.5962, + "losses/dpo": 0.6750953197479248, + "losses/sft": 1.7228975296020508, + "losses/total": 0.6750953197479248, + "ref_logps/chosen": -37.67970657348633, + "ref_logps/rejected": -40.953521728515625, + "rewards/accuracies": 0.7109375, + "rewards/chosen": -0.8117363452911377, + "rewards/margins": 0.3969798684120178, + "rewards/rejected": -1.2087161540985107, + "step": 383 + }, + { + "epoch": 2.9, + "grad_norm": 7.487738517511007, + "learning_rate": 1.685393258426966e-08, + "logps/chosen": -45.35090637207031, + "logps/rejected": -57.14335632324219, + "loss": 0.5354, + "losses/dpo": 0.5379496812820435, + "losses/sft": 1.6705958843231201, + "losses/total": 0.5379496812820435, + "ref_logps/chosen": -37.31737518310547, + "ref_logps/rejected": -43.805870056152344, + "rewards/accuracies": 0.7265625, + "rewards/chosen": -0.8033530712127686, + "rewards/margins": 0.5303957462310791, + "rewards/rejected": -1.3337488174438477, + "step": 384 + }, + { + "epoch": 2.91, + "grad_norm": 8.22102430010328, + "learning_rate": 1.544943820224719e-08, + "logps/chosen": -47.1776237487793, + "logps/rejected": -54.27086639404297, + "loss": 0.5733, + "losses/dpo": 0.5613248348236084, + "losses/sft": 1.773917317390442, + "losses/total": 0.5613248348236084, + "ref_logps/chosen": -38.05524444580078, + "ref_logps/rejected": -40.46519470214844, + "rewards/accuracies": 0.6640625, + "rewards/chosen": -0.9122380614280701, + "rewards/margins": 0.4683291018009186, + "rewards/rejected": -1.3805670738220215, + "step": 385 + }, + { + "epoch": 2.91, + "grad_norm": 7.908569868218082, + "learning_rate": 1.4044943820224718e-08, + "logps/chosen": -44.001075744628906, + "logps/rejected": -60.508758544921875, + "loss": 0.5285, + "losses/dpo": 0.5084520578384399, + "losses/sft": 1.5907535552978516, + "losses/total": 0.5084520578384399, + "ref_logps/chosen": -34.90777587890625, + "ref_logps/rejected": -45.287864685058594, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.9093303084373474, + "rewards/margins": 0.6127593517303467, + "rewards/rejected": -1.5220897197723389, + "step": 386 + }, + { + "epoch": 2.92, + "grad_norm": 7.329445132361356, + "learning_rate": 1.2640449438202247e-08, + "logps/chosen": -46.82018280029297, + "logps/rejected": -53.613643646240234, + "loss": 0.521, + "losses/dpo": 0.5018836259841919, + "losses/sft": 1.6243071556091309, + "losses/total": 0.5018836259841919, + "ref_logps/chosen": -38.469993591308594, + "ref_logps/rejected": -39.96007537841797, + "rewards/accuracies": 0.7890625, + "rewards/chosen": -0.8350194692611694, + "rewards/margins": 0.5303376913070679, + "rewards/rejected": -1.3653571605682373, + "step": 387 + }, + { + "epoch": 2.93, + "grad_norm": 7.4822766342943225, + "learning_rate": 1.1235955056179774e-08, + "logps/chosen": -46.926666259765625, + "logps/rejected": -55.5013427734375, + "loss": 0.5439, + "losses/dpo": 0.575495183467865, + "losses/sft": 1.3514134883880615, + "losses/total": 0.575495183467865, + "ref_logps/chosen": -38.2935905456543, + "ref_logps/rejected": -42.219791412353516, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.8633076548576355, + "rewards/margins": 0.46484747529029846, + "rewards/rejected": -1.3281550407409668, + "step": 388 + }, + { + "epoch": 2.94, + "grad_norm": 7.481721913520452, + "learning_rate": 9.831460674157303e-09, + "logps/chosen": -46.69519805908203, + "logps/rejected": -55.18059158325195, + "loss": 0.5312, + "losses/dpo": 0.5392994284629822, + "losses/sft": 2.022167682647705, + "losses/total": 0.5392994284629822, + "ref_logps/chosen": -38.2025146484375, + "ref_logps/rejected": -41.62324905395508, + "rewards/accuracies": 0.7578125, + "rewards/chosen": -0.8492681384086609, + "rewards/margins": 0.506466269493103, + "rewards/rejected": -1.3557343482971191, + "step": 389 + }, + { + "epoch": 2.94, + "grad_norm": 8.860083156452712, + "learning_rate": 8.42696629213483e-09, + "logps/chosen": -47.0518798828125, + "logps/rejected": -56.05253601074219, + "loss": 0.6151, + "losses/dpo": 0.8160465955734253, + "losses/sft": 1.661864161491394, + "losses/total": 0.8160465955734253, + "ref_logps/chosen": -37.67930603027344, + "ref_logps/rejected": -42.802921295166016, + "rewards/accuracies": 0.6484375, + "rewards/chosen": -0.9372565746307373, + "rewards/margins": 0.3877047896385193, + "rewards/rejected": -1.3249614238739014, + "step": 390 + }, + { + "epoch": 2.95, + "grad_norm": 7.697893962559924, + "learning_rate": 7.022471910112359e-09, + "logps/chosen": -46.420570373535156, + "logps/rejected": -56.345977783203125, + "loss": 0.5154, + "losses/dpo": 0.5586492419242859, + "losses/sft": 1.621840476989746, + "losses/total": 0.5586492419242859, + "ref_logps/chosen": -38.219852447509766, + "ref_logps/rejected": -42.38871765136719, + "rewards/accuracies": 0.7890625, + "rewards/chosen": -0.8200712203979492, + "rewards/margins": 0.5756551027297974, + "rewards/rejected": -1.395726203918457, + "step": 391 + }, + { + "epoch": 2.96, + "grad_norm": 7.398815606595402, + "learning_rate": 5.617977528089887e-09, + "logps/chosen": -45.353294372558594, + "logps/rejected": -56.47963333129883, + "loss": 0.5242, + "losses/dpo": 0.5106035470962524, + "losses/sft": 1.4234966039657593, + "losses/total": 0.5106035470962524, + "ref_logps/chosen": -36.788330078125, + "ref_logps/rejected": -42.60810852050781, + "rewards/accuracies": 0.7421875, + "rewards/chosen": -0.8564971089363098, + "rewards/margins": 0.5306553244590759, + "rewards/rejected": -1.3871524333953857, + "step": 392 + }, + { + "epoch": 2.97, + "grad_norm": 7.0449225391612895, + "learning_rate": 4.213483146067415e-09, + "logps/chosen": -44.40395736694336, + "logps/rejected": -53.697776794433594, + "loss": 0.5379, + "losses/dpo": 0.5200778841972351, + "losses/sft": 1.9024913311004639, + "losses/total": 0.5200778841972351, + "ref_logps/chosen": -36.2678337097168, + "ref_logps/rejected": -40.272247314453125, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.8136123418807983, + "rewards/margins": 0.5289404392242432, + "rewards/rejected": -1.3425527811050415, + "step": 393 + }, + { + "epoch": 2.97, + "grad_norm": 7.8920504953670525, + "learning_rate": 2.8089887640449435e-09, + "logps/chosen": -45.78767776489258, + "logps/rejected": -58.19701385498047, + "loss": 0.5882, + "losses/dpo": 0.5196930170059204, + "losses/sft": 1.4936178922653198, + "losses/total": 0.5196930170059204, + "ref_logps/chosen": -36.467750549316406, + "ref_logps/rejected": -44.57653045654297, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.9319925904273987, + "rewards/margins": 0.43005576729774475, + "rewards/rejected": -1.3620483875274658, + "step": 394 + }, + { + "epoch": 2.98, + "grad_norm": 6.749020955821219, + "learning_rate": 1.4044943820224717e-09, + "logps/chosen": -43.89699935913086, + "logps/rejected": -53.18260955810547, + "loss": 0.5308, + "losses/dpo": 0.7003037333488464, + "losses/sft": 1.696626901626587, + "losses/total": 0.7003037333488464, + "ref_logps/chosen": -35.854217529296875, + "ref_logps/rejected": -39.928733825683594, + "rewards/accuracies": 0.765625, + "rewards/chosen": -0.8042781949043274, + "rewards/margins": 0.5211097002029419, + "rewards/rejected": -1.3253878355026245, + "step": 395 + }, + { + "epoch": 2.99, + "grad_norm": 8.026771636400738, + "learning_rate": 0.0, + "logps/chosen": -48.76679229736328, + "logps/rejected": -59.84498596191406, + "loss": 0.5292, + "losses/dpo": 0.46332383155822754, + "losses/sft": 1.643686056137085, + "losses/total": 0.46332383155822754, + "ref_logps/chosen": -39.41192626953125, + "ref_logps/rejected": -44.815162658691406, + "rewards/accuracies": 0.734375, + "rewards/chosen": -0.935486912727356, + "rewards/margins": 0.567494809627533, + "rewards/rejected": -1.5029817819595337, + "step": 396 + }, + { + "epoch": 2.99, + "step": 396, + "total_flos": 0.0, + "train_loss": 0.6025580500412469, + "train_runtime": 11600.2001, + "train_samples_per_second": 4.386, + "train_steps_per_second": 0.034 + } + ], + "logging_steps": 1.0, + "max_steps": 396, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 70, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}